Merge tag 'for-4.20/block-20181021' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 22 Oct 2018 16:46:08 +0000 (17:46 +0100)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 22 Oct 2018 16:46:08 +0000 (17:46 +0100)
Pull block layer updates from Jens Axboe:
 "This is the main pull request for block changes for 4.20. This
  contains:

   - Series enabling runtime PM for blk-mq (Bart).

   - Two pull requests from Christoph for NVMe, with items such as:
      - Better AEN tracking
      - Multipath improvements
      - RDMA fixes
      - Rework of FC for target removal
      - Fixes for issues identified by static checkers
      - Fabric cleanups, as prep for TCP transport
      - Various cleanups and bug fixes

   - Block merging cleanups (Christoph)

   - Conversion of drivers to generic DMA mapping API (Christoph)

   - Series fixing ref count issues with blkcg (Dennis)

   - Series improving BFQ heuristics (Paolo, et al)

   - Series improving heuristics for the Kyber IO scheduler (Omar)

   - Removal of dangerous bio_rewind_iter() API (Ming)

   - Apply single queue IPI redirection logic to blk-mq (Ming)

   - Set of fixes and improvements for bcache (Coly et al)

   - Series closing a hotplug race with sysfs group attributes (Hannes)

   - Set of patches for lightnvm:
      - pblk trace support (Hans)
      - SPDX license header update (Javier)
      - Tons of refactoring patches to cleanly abstract the 1.2 and 2.0
        specs behind a common core interface. (Javier, Matias)
      - Enable pblk to use a common interface to retrieve chunk metadata
        (Matias)
      - Bug fixes (Various)

   - Set of fixes and updates to the blk IO latency target (Josef)

   - Fixes for blk-mq queue number updates (Jianchao)

   - Convert a bunch of drivers from the old legacy IO interface to
     blk-mq. This will conclude with the removal of the legacy IO
     interface itself in 4.21, with the rest of the drivers (me, Omar)

   - Removal of the DAC960 driver. The SCSI tree will introduce two
     replacement drivers for this (Hannes)"
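
As background for the legacy-to-blk-mq conversions mentioned above, here is a
minimal sketch of the pattern the converted drivers adopt (compare the
ubd_kern.c hunks further down). The driver name, struct, and dispatch body are
hypothetical placeholders, not code from this pull:

  #include <linux/blk-mq.h>

  /* Hypothetical per-device state for the sketch. */
  struct mydrv {
          struct blk_mq_tag_set tag_set;
          struct request_queue *queue;
  };

  static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
                                     const struct blk_mq_queue_data *bd)
  {
          struct request *rq = bd->rq;

          blk_mq_start_request(rq);
          /* ... hand rq to the hardware; complete it asynchronously ... */
          return BLK_STS_OK;
  }

  static const struct blk_mq_ops mydrv_mq_ops = {
          .queue_rq = mydrv_queue_rq,
  };

  /* Setup replaces blk_init_queue(): allocate a tag set, then a queue. */
  static int mydrv_init_queue(struct mydrv *dev)
  {
          int err;

          dev->tag_set.ops = &mydrv_mq_ops;
          dev->tag_set.nr_hw_queues = 1;
          dev->tag_set.queue_depth = 64;
          dev->tag_set.numa_node = NUMA_NO_NODE;
          dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;

          err = blk_mq_alloc_tag_set(&dev->tag_set);
          if (err)
                  return err;

          dev->queue = blk_mq_init_queue(&dev->tag_set);
          if (IS_ERR(dev->queue)) {
                  blk_mq_free_tag_set(&dev->tag_set);
                  return PTR_ERR(dev->queue);
          }
          dev->queue->queuedata = dev;
          return 0;
  }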

* tag 'for-4.20/block-20181021' of git://git.kernel.dk/linux-block: (204 commits)
  block: setup bounce bio_sets properly
  blkcg: reassociate bios when make_request() is called recursively
  blkcg: fix edge case for blk_get_rl() under memory pressure
  nvme-fabrics: move controller options matching to fabrics
  nvme-rdma: always have a valid trsvcid
  mtip32xx: fully switch to the generic DMA API
  rsxx: switch to the generic DMA API
  umem: switch to the generic DMA API
  sx8: switch to the generic DMA API
  sx8: remove dead IF_64BIT_DMA_IS_POSSIBLE code
  skd: switch to the generic DMA API
  ubd: remove use of blk_rq_map_sg
  nvme-pci: remove duplicate check
  drivers/block: Remove DAC960 driver
  nvme-pci: fix hot removal during error handling
  nvmet-fcloop: suppress a compiler warning
  nvme-core: make implicit seed truncation explicit
  nvmet-fc: fix kernel-doc headers
  nvme-fc: rework the request initialization code
  nvme-fc: introduce struct nvme_fcp_op_w_sgl
  ...

181 files changed:
Documentation/admin-guide/cgroup-v2.rst
Documentation/blockdev/README.DAC960 [deleted file]
Documentation/blockdev/zram.txt
Documentation/device-mapper/log-writes.txt
arch/arm/include/asm/io.h
arch/arm64/include/asm/io.h
arch/m68k/emu/nfblock.c
arch/m68k/include/asm/atafd.h [deleted file]
arch/m68k/include/asm/atafdreg.h [deleted file]
arch/um/drivers/ubd_kern.c
arch/x86/include/asm/io.h
arch/x86/include/asm/xen/events.h
arch/x86/xen/enlighten.c
arch/x86/xen/enlighten_pvh.c
arch/x86/xen/platform-pci-unplug.c
arch/x86/xen/pmu.c
block/Kconfig
block/Kconfig.iosched
block/Makefile
block/bfq-cgroup.c
block/bfq-iosched.c
block/bfq-iosched.h
block/bfq-wf2q.c
block/bio-integrity.c
block/bio.c
block/blk-cgroup.c
block/blk-core.c
block/blk-flush.c
block/blk-integrity.c
block/blk-iolatency.c
block/blk-merge.c
block/blk-mq-debugfs.c
block/blk-mq-sched.h
block/blk-mq-tag.c
block/blk-mq.c
block/blk-pm.c [new file with mode: 0644]
block/blk-pm.h [new file with mode: 0644]
block/blk-softirq.c
block/blk-stat.c
block/blk-throttle.c
block/blk.h
block/bounce.c
block/cfq-iosched.c
block/elevator.c
block/genhd.c
block/kyber-iosched.c
drivers/block/DAC960.c [deleted file]
drivers/block/DAC960.h [deleted file]
drivers/block/Kconfig
drivers/block/Makefile
drivers/block/amiflop.c
drivers/block/aoe/aoe.h
drivers/block/aoe/aoeblk.c
drivers/block/aoe/aoecmd.c
drivers/block/aoe/aoedev.c
drivers/block/ataflop.c
drivers/block/drbd/Kconfig
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_protocol.h
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
drivers/block/drbd/drbd_worker.c
drivers/block/floppy.c
drivers/block/loop.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/null_blk_main.c
drivers/block/paride/pcd.c
drivers/block/paride/pd.c
drivers/block/paride/pf.c
drivers/block/pktcdvd.c
drivers/block/ps3disk.c
drivers/block/ps3vram.c
drivers/block/rsxx/core.c
drivers/block/rsxx/cregs.c
drivers/block/rsxx/dev.c
drivers/block/rsxx/dma.c
drivers/block/skd_main.c
drivers/block/sunvdc.c
drivers/block/swim.c
drivers/block/swim3.c
drivers/block/sx8.c
drivers/block/umem.c
drivers/block/virtio_blk.c
drivers/block/xen-blkfront.c
drivers/block/xsysace.c
drivers/block/z2ram.c
drivers/block/zram/Kconfig
drivers/block/zram/zram_drv.c
drivers/cdrom/cdrom.c
drivers/cdrom/gdrom.c
drivers/ide/ide-cd.c
drivers/ide/ide-gd.c
drivers/lightnvm/Kconfig
drivers/lightnvm/core.c
drivers/lightnvm/pblk-cache.c
drivers/lightnvm/pblk-core.c
drivers/lightnvm/pblk-gc.c
drivers/lightnvm/pblk-init.c
drivers/lightnvm/pblk-map.c
drivers/lightnvm/pblk-rb.c
drivers/lightnvm/pblk-read.c
drivers/lightnvm/pblk-recovery.c
drivers/lightnvm/pblk-rl.c
drivers/lightnvm/pblk-sysfs.c
drivers/lightnvm/pblk-trace.h [new file with mode: 0644]
drivers/lightnvm/pblk-write.c
drivers/lightnvm/pblk.h
drivers/md/bcache/alloc.c
drivers/md/bcache/bcache.h
drivers/md/bcache/btree.c
drivers/md/bcache/closure.h
drivers/md/bcache/debug.c
drivers/md/bcache/extents.c
drivers/md/bcache/request.c
drivers/md/bcache/request.h
drivers/md/bcache/super.c
drivers/md/bcache/sysfs.c
drivers/md/raid0.c
drivers/memstick/core/ms_block.c
drivers/memstick/core/mspro_block.c
drivers/mmc/core/block.c
drivers/mtd/mtd_blkdevs.c
drivers/nvdimm/blk.c
drivers/nvdimm/btt.c
drivers/nvdimm/pmem.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fabrics.h
drivers/nvme/host/fc.c
drivers/nvme/host/lightnvm.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/host/trace.h
drivers/nvme/target/admin-cmd.c
drivers/nvme/target/core.c
drivers/nvme/target/discovery.c
drivers/nvme/target/fc.c
drivers/nvme/target/fcloop.c
drivers/nvme/target/io-cmd-bdev.c
drivers/nvme/target/io-cmd-file.c
drivers/nvme/target/nvmet.h
drivers/nvme/target/rdma.c
drivers/s390/block/dasd_genhd.c
drivers/s390/block/dcssblk.c
drivers/s390/block/scm_blk.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_pm.c
drivers/scsi/sd.c
drivers/scsi/sr.c
drivers/target/target_core_spc.c
drivers/xen/biomerge.c
drivers/xen/xen-acpi-pad.c
fs/buffer.c
fs/ext4/page-io.c
include/linux/amifd.h [deleted file]
include/linux/amifdreg.h [deleted file]
include/linux/bio.h
include/linux/blk-cgroup.h
include/linux/blk-mq.h
include/linux/blk-pm.h [new file with mode: 0644]
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/bvec.h
include/linux/cgroup.h
include/linux/elevator.h
include/linux/genhd.h
include/linux/lightnvm.h
include/linux/mtd/blktrans.h
include/linux/nvme.h
include/linux/percpu-refcount.h
include/linux/writeback.h
include/trace/events/kyber.h [new file with mode: 0644]
include/xen/xen.h
kernel/cgroup/cgroup.c
kernel/trace/blktrace.c
lib/percpu-refcount.c
mm/page_io.c

index 184193bcb262ac908f1f5a7a7c2c662dec0ea4b8..caf36105a1c7b50b3b76d53afcb2d75d094fee77 100644 (file)
@@ -1857,8 +1857,10 @@ following two functions.
 
   wbc_init_bio(@wbc, @bio)
        Should be called for each bio carrying writeback data and
-       associates the bio with the inode's owner cgroup.  Can be
-       called anytime between bio allocation and submission.
+       associates the bio with the inode's owner cgroup and the
+       corresponding request queue.  This must be called after
+       a queue (device) has been associated with the bio and
+       before submission.
 
   wbc_account_io(@wbc, @page, @bytes)
        Should be called for each data segment being written out.
@@ -1877,7 +1879,7 @@ the configuration, the bio may be executed at a lower priority and if
 the writeback session is holding shared resources, e.g. a journal
 entry, may lead to priority inversion.  There is no one easy solution
 for the problem.  Filesystems can try to work around specific problem
-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+cases by skipping wbc_init_bio() or using bio_associate_create_blkg()
 directly.
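
A minimal sketch of the ordering the updated text requires follows; the helper
and its context are assumptions for illustration, while bio_set_dev(),
wbc_init_bio(), wbc_account_io(), and submit_bio() are the real interfaces:

  /* Sketch: associate the queue (device) before wbc_init_bio(). */
  static void sketch_submit_wb_page(struct writeback_control *wbc,
                                    struct block_device *bdev,
                                    struct page *page, sector_t sector)
  {
          struct bio *bio = bio_alloc(GFP_NOFS, 1);

          bio_set_dev(bio, bdev);          /* 1. attach the device/queue */
          wbc_init_bio(wbc, bio);          /* 2. then the owner cgroup   */
          bio->bi_iter.bi_sector = sector;
          bio->bi_opf = REQ_OP_WRITE;
          bio_add_page(bio, page, PAGE_SIZE, 0);
          wbc_account_io(wbc, page, PAGE_SIZE);
          submit_bio(bio);                 /* 3. submission comes last   */
  }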
 
 
diff --git a/Documentation/blockdev/README.DAC960 b/Documentation/blockdev/README.DAC960
deleted file mode 100644 (file)
index bd85fb9..0000000
+++ /dev/null
@@ -1,756 +0,0 @@
-   Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers
-
-                       Version 2.2.11 for Linux 2.2.19
-                       Version 2.4.11 for Linux 2.4.12
-
-                             PRODUCTION RELEASE
-
-                               11 October 2001
-
-                              Leonard N. Zubkoff
-                              Dandelion Digital
-                              lnz@dandelion.com
-
-        Copyright 1998-2001 by Leonard N. Zubkoff <lnz@dandelion.com>
-
-
-                                INTRODUCTION
-
-Mylex, Inc. designs and manufactures a variety of high performance PCI RAID
-controllers.  Mylex Corporation is located at 34551 Ardenwood Blvd., Fremont,
-California 94555, USA and can be reached at 510.796.6100 or on the World Wide
-Web at http://www.mylex.com.  Mylex Technical Support can be reached by
-electronic mail at mylexsup@us.ibm.com, by voice at 510.608.2400, or by FAX at
-510.745.7715.  Contact information for offices in Europe and Japan is available
-on their Web site.
-
-The latest information on Linux support for DAC960 PCI RAID Controllers, as
-well as the most recent release of this driver, will always be available from
-my Linux Home Page at URL "http://www.dandelion.com/Linux/".  The Linux DAC960
-driver supports all current Mylex PCI RAID controllers including the new
-eXtremeRAID 2000/3000 and AcceleRAID 352/170/160 models which have an entirely
-new firmware interface from the older eXtremeRAID 1100, AcceleRAID 150/200/250,
-and DAC960PJ/PG/PU/PD/PL.  See below for a complete controller list as well as
-minimum firmware version requirements.  For simplicity, in most places this
-documentation refers to DAC960 generically rather than explicitly listing all
-the supported models.
-
-Driver bug reports should be sent via electronic mail to "lnz@dandelion.com".
-Please include with the bug report the complete configuration messages reported
-by the driver at startup, along with any subsequent system messages relevant to
-the controller's operation, and a detailed description of your system's
-hardware configuration.  Driver bugs are actually quite rare; if you encounter
-problems with disks being marked offline, for example, please contact Mylex
-Technical Support as the problem is related to the hardware configuration
-rather than the Linux driver.
-
-Please consult the RAID controller documentation for detailed information
-regarding installation and configuration of the controllers.  This document
-primarily provides information specific to the Linux support.
-
-
-                               DRIVER FEATURES
-
-The DAC960 RAID controllers are supported solely as high performance RAID
-controllers, not as interfaces to arbitrary SCSI devices.  The Linux DAC960
-driver operates at the block device level, the same level as the SCSI and IDE
-drivers.  Unlike other RAID controllers currently supported on Linux, the
-DAC960 driver is not dependent on the SCSI subsystem, and hence avoids all the
-complexity and unnecessary code that would be associated with an implementation
-as a SCSI driver.  The DAC960 driver is designed for as high a performance as
-possible with no compromises or extra code for compatibility with lower
-performance devices.  The DAC960 driver includes extensive error logging and
-online configuration management capabilities.  Except for initial configuration
-of the controller and adding new disk drives, almost everything can be handled
-from Linux while the system is operational.
-
-The DAC960 driver is architected to support up to 8 controllers per system.
-Each DAC960 parallel SCSI controller can support up to 15 disk drives per
-channel, for a maximum of 60 drives on a four channel controller; the fibre
-channel eXtremeRAID 3000 controller supports up to 125 disk drives per loop for
-a total of 250 drives.  The drives installed on a controller are divided into
-one or more "Drive Groups", and then each Drive Group is subdivided further
-into 1 to 32 "Logical Drives".  Each Logical Drive has a specific RAID Level
-and caching policy associated with it, and it appears to Linux as a single
-block device.  Logical Drives are further subdivided into up to 7 partitions
-through the normal Linux and PC disk partitioning schemes.  Logical Drives are
-also known as "System Drives", and Drive Groups are also called "Packs".  Both
-terms are in use in the Mylex documentation; I have chosen to standardize on
-the more generic "Logical Drive" and "Drive Group".
-
-DAC960 RAID disk devices are named in the style of the obsolete Device File
-System (DEVFS).  The device corresponding to Logical Drive D on Controller C
-is referred to as /dev/rd/cCdD, and the partitions are called /dev/rd/cCdDp1
-through /dev/rd/cCdDp7.  For example, partition 3 of Logical Drive 5 on
-Controller 2 is referred to as /dev/rd/c2d5p3.  Note that unlike with SCSI
-disks the device names will not change in the event of a disk drive failure.
-The DAC960 driver is assigned major numbers 48 - 55 with one major number per
-controller.  The 8 bits of minor number are divided into 5 bits for the Logical
-Drive and 3 bits for the partition.
-
-
-         SUPPORTED DAC960/AcceleRAID/eXtremeRAID PCI RAID CONTROLLERS
-
-The following list comprises the supported DAC960, AcceleRAID, and eXtremeRAID
-PCI RAID Controllers as of the date of this document.  It is recommended that
-anyone purchasing a Mylex PCI RAID Controller not in the following table
-contact the author beforehand to verify that it is or will be supported.
-
-eXtremeRAID 3000
-           1 Wide Ultra-2/LVD SCSI channel
-           2 External Fibre FC-AL channels
-           233MHz StrongARM SA 110 Processor
-           64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
-           32MB/64MB ECC SDRAM Memory
-
-eXtremeRAID 2000
-           4 Wide Ultra-160 LVD SCSI channels
-           233MHz StrongARM SA 110 Processor
-           64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
-           32MB/64MB ECC SDRAM Memory
-
-AcceleRAID 352
-           2 Wide Ultra-160 LVD SCSI channels
-           100MHz Intel i960RN RISC Processor
-           64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
-           32MB/64MB ECC SDRAM Memory
-
-AcceleRAID 170
-           1 Wide Ultra-160 LVD SCSI channel
-           100MHz Intel i960RM RISC Processor
-           16MB/32MB/64MB ECC SDRAM Memory
-
-AcceleRAID 160 (AcceleRAID 170LP)
-           1 Wide Ultra-160 LVD SCSI channel
-           100MHz Intel i960RS RISC Processor
-           Built in 16M ECC SDRAM Memory
-           PCI Low Profile Form Factor - fit for 2U height
-
-eXtremeRAID 1100 (DAC1164P)
-           3 Wide Ultra-2/LVD SCSI channels
-           233MHz StrongARM SA 110 Processor
-           64 Bit 33MHz PCI (backward compatible with 32 Bit PCI slots)
-           16MB/32MB/64MB Parity SDRAM Memory with Battery Backup
-
-AcceleRAID 250 (DAC960PTL1)
-           Uses onboard Symbios SCSI chips on certain motherboards
-           Also includes one onboard Wide Ultra-2/LVD SCSI Channel
-           66MHz Intel i960RD RISC Processor
-           4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
-
-AcceleRAID 200 (DAC960PTL0)
-           Uses onboard Symbios SCSI chips on certain motherboards
-           Includes no onboard SCSI Channels
-           66MHz Intel i960RD RISC Processor
-           4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
-
-AcceleRAID 150 (DAC960PRL)
-           Uses onboard Symbios SCSI chips on certain motherboards
-           Also includes one onboard Wide Ultra-2/LVD SCSI Channel
-           33MHz Intel i960RP RISC Processor
-           4MB Parity EDO Memory
-
-DAC960PJ    1/2/3 Wide Ultra SCSI-3 Channels
-           66MHz Intel i960RD RISC Processor
-           4MB/8MB/16MB/32MB/64MB/128MB ECC EDO Memory
-
-DAC960PG    1/2/3 Wide Ultra SCSI-3 Channels
-           33MHz Intel i960RP RISC Processor
-           4MB/8MB ECC EDO Memory
-
-DAC960PU    1/2/3 Wide Ultra SCSI-3 Channels
-           Intel i960CF RISC Processor
-           4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory
-
-DAC960PD    1/2/3 Wide Fast SCSI-2 Channels
-           Intel i960CF RISC Processor
-           4MB/8MB EDRAM or 2MB/4MB/8MB/16MB/32MB DRAM Memory
-
-DAC960PL    1/2/3 Wide Fast SCSI-2 Channels
-           Intel i960 RISC Processor
-           2MB/4MB/8MB/16MB/32MB DRAM Memory
-
-DAC960P            1/2/3 Wide Fast SCSI-2 Channels
-           Intel i960 RISC Processor
-           2MB/4MB/8MB/16MB/32MB DRAM Memory
-
-For the eXtremeRAID 2000/3000 and AcceleRAID 352/170/160, firmware version
-6.00-01 or above is required.
-
-For the eXtremeRAID 1100, firmware version 5.06-0-52 or above is required.
-
-For the AcceleRAID 250, 200, and 150, firmware version 4.06-0-57 or above is
-required.
-
-For the DAC960PJ and DAC960PG, firmware version 4.06-0-00 or above is required.
-
-For the DAC960PU, DAC960PD, DAC960PL, and DAC960P, either firmware version
-3.51-0-04 or above is required (for dual Flash ROM controllers), or firmware
-version 2.73-0-00 or above is required (for single Flash ROM controllers)
-
-Please note that not all SCSI disk drives are suitable for use with DAC960
-controllers, and only particular firmware versions of any given model may
-actually function correctly.  Similarly, not all motherboards have a BIOS that
-properly initializes the AcceleRAID 250, AcceleRAID 200, AcceleRAID 150,
-DAC960PJ, and DAC960PG because the Intel i960RD/RP is a multi-function device.
-If in doubt, contact Mylex RAID Technical Support (mylexsup@us.ibm.com) to
-verify compatibility.  Mylex makes available a hard disk compatibility list at
-http://www.mylex.com/support/hdcomp/hd-lists.html.
-
-
-                             DRIVER INSTALLATION
-
-This distribution was prepared for Linux kernel version 2.2.19 or 2.4.12.
-
-To install the DAC960 RAID driver, you may use the following commands,
-replacing "/usr/src" with wherever you keep your Linux kernel source tree:
-
-  cd /usr/src
-  tar -xvzf DAC960-2.2.11.tar.gz (or DAC960-2.4.11.tar.gz)
-  mv README.DAC960 linux/Documentation
-  mv DAC960.[ch] linux/drivers/block
-  patch -p0 < DAC960.patch (if DAC960.patch is included)
-  cd linux
-  make config
-  make bzImage (or zImage)
-
-Then install "arch/x86/boot/bzImage" or "arch/x86/boot/zImage" as your
-standard kernel, run lilo if appropriate, and reboot.
-
-To create the necessary devices in /dev, the "make_rd" script included in
-"DAC960-Utilities.tar.gz" from http://www.dandelion.com/Linux/ may be used.
-LILO 21 and FDISK v2.9 include DAC960 support; also included in this archive
-are patches to LILO 20 and FDISK v2.8 that add DAC960 support, along with
-statically linked executables of LILO and FDISK.  This modified version of LILO
-will allow booting from a DAC960 controller and/or mounting the root file
-system from a DAC960.
-
-Red Hat Linux 6.0 and SuSE Linux 6.1 include support for Mylex PCI RAID
-controllers.  Installing directly onto a DAC960 may be problematic from other
-Linux distributions until their installation utilities are updated.
-
-
-                             INSTALLATION NOTES
-
-Before installing Linux or adding DAC960 logical drives to an existing Linux
-system, the controller must first be configured to provide one or more logical
-drives using the BIOS Configuration Utility or DACCF.  Please note that since
-there are only at most 6 usable partitions on each logical drive, systems
-requiring more partitions should subdivide a drive group into multiple logical
-drives, each of which can have up to 6 usable partitions.  Also, note that with
-large disk arrays it is advisable to enable the 8GB BIOS Geometry (255/63)
-rather than accepting the default 2GB BIOS Geometry (128/32); failing to do so
-will cause the logical drive geometry to have more than 65535 cylinders which
-will make it impossible for FDISK to be used properly.  The 8GB BIOS Geometry
-can be enabled by configuring the DAC960 BIOS, which is accessible via Alt-M
-during the BIOS initialization sequence.
-
-For maximum performance and the most efficient E2FSCK performance, it is
-recommended that EXT2 file systems be built with a 4KB block size and 16 block
-stride to match the DAC960 controller's 64KB default stripe size.  The command
-"mke2fs -b 4096 -R stride=16 <device>" is appropriate.  Unless there will be a
-large number of small files on the file systems, it is also beneficial to add
-the "-i 16384" option to increase the bytes per inode parameter thereby
-reducing the file system metadata.  Finally, on systems that will only be run
-with Linux 2.2 or later kernels it is beneficial to enable sparse superblocks
-with the "-s 1" option.
-
-
-                     DAC960 ANNOUNCEMENTS MAILING LIST
-
-The DAC960 Announcements Mailing List provides a forum for informing Linux
-users of new driver releases and other announcements regarding Linux support
-for DAC960 PCI RAID Controllers.  To join the mailing list, send a message to
-"dac960-announce-request@dandelion.com" with the line "subscribe" in the
-message body.
-
-
-               CONTROLLER CONFIGURATION AND STATUS MONITORING
-
-The DAC960 RAID controllers running firmware 4.06 or above include a Background
-Initialization facility so that system downtime is minimized both for initial
-installation and subsequent configuration of additional storage.  The BIOS
-Configuration Utility (accessible via Alt-R during the BIOS initialization
-sequence) is used to quickly configure the controller, and then the logical
-drives that have been created are available for immediate use even while they
-are still being initialized by the controller.  The primary need for online
-configuration and status monitoring is then to avoid system downtime when disk
-drives fail and must be replaced.  Mylex's online monitoring and configuration
-utilities are being ported to Linux and will become available at some point in
-the future.  Note that with a SAF-TE (SCSI Accessed Fault-Tolerant Enclosure)
-enclosure, the controller is able to rebuild failed drives automatically as
-soon as a drive replacement is made available.
-
-The primary interfaces for controller configuration and status monitoring are
-special files created in the /proc/rd/... hierarchy along with the normal
-system console logging mechanism.  Whenever the system is operating, the DAC960
-driver queries each controller for status information every 10 seconds, and
-checks for additional conditions every 60 seconds.  The initial status of each
-controller is always available for controller N in /proc/rd/cN/initial_status,
-and the current status as of the last status monitoring query is available in
-/proc/rd/cN/current_status.  In addition, status changes are also logged by the
-driver to the system console and will appear in the log files maintained by
-syslog.  The progress of asynchronous rebuild or consistency check operations
-is also available in /proc/rd/cN/current_status, and progress messages are
-logged to the system console at most every 60 seconds.
-
-Starting with the 2.2.3/2.0.3 versions of the driver, the status information
-available in /proc/rd/cN/initial_status and /proc/rd/cN/current_status has been
-augmented to include the vendor, model, revision, and serial number (if
-available) for each physical device found connected to the controller:
-
-***** DAC960 RAID Driver Version 2.2.3 of 19 August 1999 *****
-Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
-Configuring Mylex DAC960PRL PCI RAID Controller
-  Firmware Version: 4.07-0-07, Channels: 1, Memory Size: 16MB
-  PCI Bus: 1, Device: 4, Function: 1, I/O Address: Unassigned
-  PCI Address: 0xFE300000 mapped at 0xA0800000, IRQ Channel: 21
-  Controller Queue Depth: 128, Maximum Blocks per Command: 128
-  Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
-  Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
-  SAF-TE Enclosure Management Enabled
-  Physical Devices:
-    0:0  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       68016775HA
-         Disk Status: Online, 17928192 blocks
-    0:1  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       68004E53HA
-         Disk Status: Online, 17928192 blocks
-    0:2  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       13013935HA
-         Disk Status: Online, 17928192 blocks
-    0:3  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       13016897HA
-         Disk Status: Online, 17928192 blocks
-    0:4  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       68019905HA
-         Disk Status: Online, 17928192 blocks
-    0:5  Vendor: IBM       Model: DRVS09D           Revision: 0270
-         Serial Number:       68012753HA
-         Disk Status: Online, 17928192 blocks
-    0:6  Vendor: ESG-SHV   Model: SCA HSBP M6       Revision: 0.61
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 89640960 blocks, Write Thru
-  No Rebuild or Consistency Check in Progress
-
-To simplify the monitoring process for custom software, the special file
-/proc/rd/status returns "OK" when all DAC960 controllers in the system are
-operating normally and no failures have occurred, or "ALERT" if any logical
-drives are offline or critical or any non-standby physical drives are dead.
-
-Configuration commands for controller N are available via the special file
-/proc/rd/cN/user_command.  A human readable command can be written to this
-special file to initiate a configuration operation, and the results of the
-operation can then be read back from the special file in addition to being
-logged to the system console.  The shell command sequence
-
-  echo "<configuration-command>" > /proc/rd/c0/user_command
-  cat /proc/rd/c0/user_command
-
-is typically used to execute configuration commands.  The configuration
-commands are:
-
-  flush-cache
-
-    The "flush-cache" command flushes the controller's cache.  The system
-    automatically flushes the cache at shutdown or if the driver module is
-    unloaded, so this command is only needed to be certain a write back cache
-    is flushed to disk before the system is powered off by a command to a UPS.
-    Note that the flush-cache command also stops an asynchronous rebuild or
-    consistency check, so it should not be used except when the system is being
-    halted.
-
-  kill <channel>:<target-id>
-
-    The "kill" command marks the physical drive <channel>:<target-id> as DEAD.
-    This command is provided primarily for testing, and should not be used
-    during normal system operation.
-
-  make-online <channel>:<target-id>
-
-    The "make-online" command changes the physical drive <channel>:<target-id>
-    from status DEAD to status ONLINE.  In cases where multiple physical drives
-    have been killed simultaneously, this command may be used to bring all but
-    one of them back online, after which a rebuild to the final drive is
-    necessary.
-
-    Warning: make-online should only be used on a dead physical drive that is
-    an active part of a drive group, never on a standby drive.  The command
-    should never be used on a dead drive that is part of a critical logical
-    drive; rebuild should be used if only a single drive is dead.
-
-  make-standby <channel>:<target-id>
-
-    The "make-standby" command changes physical drive <channel>:<target-id>
-    from status DEAD to status STANDBY.  It should only be used in cases where
-    a dead drive was replaced after an automatic rebuild was performed onto a
-    standby drive.  It cannot be used to add a standby drive to the controller
-    configuration if one was not created initially; the BIOS Configuration
-    Utility must be used for that currently.
-
-  rebuild <channel>:<target-id>
-
-    The "rebuild" command initiates an asynchronous rebuild onto physical drive
-    <channel>:<target-id>.  It should only be used when a dead drive has been
-    replaced.
-
-  check-consistency <logical-drive-number>
-
-    The "check-consistency" command initiates an asynchronous consistency check
-    of <logical-drive-number> with automatic restoration.  It can be used
-    whenever it is desired to verify the consistency of the redundancy
-    information.
-
-  cancel-rebuild
-  cancel-consistency-check
-
-    The "cancel-rebuild" and "cancel-consistency-check" commands cancel any
-    rebuild or consistency check operations previously initiated.
-
-
-              EXAMPLE I - DRIVE FAILURE WITHOUT A STANDBY DRIVE
-
-The following annotated logs demonstrate the controller configuration and
-online status monitoring capabilities of the Linux DAC960 Driver.  The test
-configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a
-DAC960PJ controller.  The physical drives are configured into a single drive
-group without a standby drive, and the drive group has been configured into two
-logical drives, one RAID-5 and one RAID-6.  Note that these logs are from an
-earlier version of the driver and the messages have changed somewhat with newer
-releases, but the functionality remains similar.  First, here is the current
-status of the RAID configuration:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
-Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
-Configuring Mylex DAC960PJ PCI RAID Controller
-  Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
-  PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
-  PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
-  Controller Queue Depth: 128, Maximum Blocks per Command: 128
-  Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
-  Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru
-  No Rebuild or Consistency Check in Progress
-
-gwynedd:/u/lnz# cat /proc/rd/status
-OK
-
-The above messages indicate that everything is healthy, and /proc/rd/status
-returns "OK" indicating that there are no problems with any DAC960 controller
-in the system.  For demonstration purposes, while I/O is active Physical Drive
-1:1 is now disconnected, simulating a drive failure.  The failure is noted by
-the driver within 10 seconds of the controller's having detected it, and the
-driver logs the following console status messages indicating that Logical
-Drives 0 and 1 are now CRITICAL as a result of Physical Drive 1:1 being DEAD:
-
-DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
-DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
-DAC960#0: Physical Drive 1:1 killed because of timeout on SCSI command
-DAC960#0: Physical Drive 1:1 is now DEAD
-DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL
-DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL
-
-The Sense Keys logged here are just Check Condition / Unit Attention conditions
-arising from a SCSI bus reset that is forced by the controller during its error
-recovery procedures.  Concurrently with the above, the driver status available
-from /proc/rd also reflects the drive failure.  The status message in
-/proc/rd/status has changed from "OK" to "ALERT":
-
-gwynedd:/u/lnz# cat /proc/rd/status
-ALERT
-
-and /proc/rd/c0/current_status has been updated:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Dead, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
-  No Rebuild or Consistency Check in Progress
-
-Since there are no standby drives configured, the system can continue to access
-the logical drives in a performance degraded mode until the failed drive is
-replaced and a rebuild operation completed to restore the redundancy of the
-logical drives.  Once Physical Drive 1:1 is replaced with a properly
-functioning drive, or if the physical drive was killed without having failed
-(e.g., due to electrical problems on the SCSI bus), the user can instruct the
-controller to initiate a rebuild operation onto the newly replaced drive:
-
-gwynedd:/u/lnz# echo "rebuild 1:1" > /proc/rd/c0/user_command
-gwynedd:/u/lnz# cat /proc/rd/c0/user_command
-Rebuild of Physical Drive 1:1 Initiated
-
-The echo command instructs the controller to initiate an asynchronous rebuild
-operation onto Physical Drive 1:1, and the status message that results from the
-operation is then available for reading from /proc/rd/c0/user_command, as well
-as being logged to the console by the driver.
-
-Within 10 seconds of this command the driver logs the initiation of the
-asynchronous rebuild operation:
-
-DAC960#0: Rebuild of Physical Drive 1:1 Initiated
-DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01
-DAC960#0: Physical Drive 1:1 is now WRITE-ONLY
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 1% completed
-
-and /proc/rd/c0/current_status is updated:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Write-Only, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
-  Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 6% completed
-
-As the rebuild progresses, the current status in /proc/rd/c0/current_status is
-updated every 10 seconds:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Write-Only, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Critical, 5498880 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Critical, 3305472 blocks, Write Thru
-  Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 15% completed
-
-and every minute a progress message is logged to the console by the driver:
-
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 32% completed
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 63% completed
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 94% completed
-DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 94% completed
-
-Finally, the rebuild completes successfully.  The driver logs the status of the 
-logical and physical drives and the rebuild completion:
-
-DAC960#0: Rebuild Completed Successfully
-DAC960#0: Physical Drive 1:1 is now ONLINE
-DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE
-DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE
-
-/proc/rd/c0/current_status is updated:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 5498880 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Online, 3305472 blocks, Write Thru
-  Rebuild Completed Successfully
-
-and /proc/rd/status indicates that everything is healthy once again:
-
-gwynedd:/u/lnz# cat /proc/rd/status
-OK
-
-
-               EXAMPLE II - DRIVE FAILURE WITH A STANDBY DRIVE
-
-The following annotated logs demonstrate the controller configuration and
-online status monitoring capabilities of the Linux DAC960 Driver.  The test
-configuration comprises 6 1GB Quantum Atlas I disk drives on two channels of a
-DAC960PJ controller.  The physical drives are configured into a single drive
-group with a standby drive, and the drive group has been configured into two
-logical drives, one RAID-5 and one RAID-6.  Note that these logs are from an
-earlier version of the driver and the messages have changed somewhat with newer
-releases, but the functionality remains similar.  First, here is the current
-status of the RAID configuration:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
-Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
-Configuring Mylex DAC960PJ PCI RAID Controller
-  Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
-  PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
-  PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
-  Controller Queue Depth: 128, Maximum Blocks per Command: 128
-  Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
-  Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Online, 2201600 blocks
-    1:3 - Disk: Standby, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
-  No Rebuild or Consistency Check in Progress
-
-gwynedd:/u/lnz# cat /proc/rd/status
-OK
-
-The above messages indicate that everything is healthy, and /proc/rd/status
-returns "OK" indicating that there are no problems with any DAC960 controller
-in the system.  For demonstration purposes, while I/O is active Physical Drive
-1:2 is now disconnected, simulating a drive failure.  The failure is noted by
-the driver within 10 seconds of the controller's having detected it, and the
-driver logs the following console status messages:
-
-DAC960#0: Physical Drive 1:1 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
-DAC960#0: Physical Drive 1:3 Error Log: Sense Key = 6, ASC = 29, ASCQ = 02
-DAC960#0: Physical Drive 1:2 killed because of timeout on SCSI command
-DAC960#0: Physical Drive 1:2 is now DEAD
-DAC960#0: Physical Drive 1:2 killed because it was removed
-DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now CRITICAL
-DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now CRITICAL
-
-Since a standby drive is configured, the controller automatically begins
-rebuilding onto the standby drive:
-
-DAC960#0: Physical Drive 1:3 is now WRITE-ONLY
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed
-
-Concurrently with the above, the driver status available from /proc/rd also
-reflects the drive failure and automatic rebuild.  The status message in
-/proc/rd/status has changed from "OK" to "ALERT":
-
-gwynedd:/u/lnz# cat /proc/rd/status
-ALERT
-
-and /proc/rd/c0/current_status has been updated:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Dead, 2201600 blocks
-    1:3 - Disk: Write-Only, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru
-  Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 4% completed
-
-As the rebuild progresses, the current status in /proc/rd/c0/current_status is
-updated every 10 seconds:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Dead, 2201600 blocks
-    1:3 - Disk: Write-Only, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Critical, 4399104 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Critical, 2754560 blocks, Write Thru
-  Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed
-
-and every minute a progress message is logged on the console by the driver:
-
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 40% completed
-DAC960#0: Rebuild in Progress: Logical Drive 0 (/dev/rd/c0d0) 76% completed
-DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 66% completed
-DAC960#0: Rebuild in Progress: Logical Drive 1 (/dev/rd/c0d1) 84% completed
-
-Finally, the rebuild completes successfully.  The driver logs the status of the 
-logical and physical drives and the rebuild completion:
-
-DAC960#0: Rebuild Completed Successfully
-DAC960#0: Physical Drive 1:3 is now ONLINE
-DAC960#0: Logical Drive 0 (/dev/rd/c0d0) is now ONLINE
-DAC960#0: Logical Drive 1 (/dev/rd/c0d1) is now ONLINE
-
-/proc/rd/c0/current_status is updated:
-
-***** DAC960 RAID Driver Version 2.0.0 of 23 March 1999 *****
-Copyright 1998-1999 by Leonard N. Zubkoff <lnz@dandelion.com>
-Configuring Mylex DAC960PJ PCI RAID Controller
-  Firmware Version: 4.06-0-08, Channels: 3, Memory Size: 8MB
-  PCI Bus: 0, Device: 19, Function: 1, I/O Address: Unassigned
-  PCI Address: 0xFD4FC000 mapped at 0x8807000, IRQ Channel: 9
-  Controller Queue Depth: 128, Maximum Blocks per Command: 128
-  Driver Queue Depth: 127, Maximum Scatter/Gather Segments: 33
-  Stripe Size: 64KB, Segment Size: 8KB, BIOS Geometry: 255/63
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Dead, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
-  Rebuild Completed Successfully
-
-and /proc/rd/status indicates that everything is healthy once again:
-
-gwynedd:/u/lnz# cat /proc/rd/status
-OK
-
-Note that the absence of a viable standby drive does not create an "ALERT"
-status.  Once dead Physical Drive 1:2 has been replaced, the controller must be
-told that this has occurred and that the newly replaced drive should become the
-new standby drive:
-
-gwynedd:/u/lnz# echo "make-standby 1:2" > /proc/rd/c0/user_command
-gwynedd:/u/lnz# cat /proc/rd/c0/user_command
-Make Standby of Physical Drive 1:2 Succeeded
-
-The echo command instructs the controller to make Physical Drive 1:2 into a
-standby drive, and the status message that results from the operation is then
-available for reading from /proc/rd/c0/user_command, as well as being logged to
-the console by the driver.  Within 60 seconds of this command the driver logs:
-
-DAC960#0: Physical Drive 1:2 Error Log: Sense Key = 6, ASC = 29, ASCQ = 01
-DAC960#0: Physical Drive 1:2 is now STANDBY
-DAC960#0: Make Standby of Physical Drive 1:2 Succeeded
-
-and /proc/rd/c0/current_status is updated:
-
-gwynedd:/u/lnz# cat /proc/rd/c0/current_status
-  ...
-  Physical Devices:
-    0:1 - Disk: Online, 2201600 blocks
-    0:2 - Disk: Online, 2201600 blocks
-    0:3 - Disk: Online, 2201600 blocks
-    1:1 - Disk: Online, 2201600 blocks
-    1:2 - Disk: Standby, 2201600 blocks
-    1:3 - Disk: Online, 2201600 blocks
-  Logical Drives:
-    /dev/rd/c0d0: RAID-5, Online, 4399104 blocks, Write Thru
-    /dev/rd/c0d1: RAID-6, Online, 2754560 blocks, Write Thru
-  Rebuild Completed Successfully
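
For reference, the device-numbering scheme described in the removed README
(majors 48-55, one per controller; minor = 5 bits of Logical Drive plus 3 bits
of partition) works out as in this small illustrative userspace program, which
is hypothetical and was never part of the driver:

  #include <stdio.h>

  /* Compute the node name and numbers for /dev/rd/cCdDpP. */
  static void dac960_node(int controller, int logical_drive, int partition)
  {
          int major = 48 + controller;
          int minor = (logical_drive << 3) | partition;

          printf("/dev/rd/c%dd%dp%d -> major %d, minor %d\n",
                 controller, logical_drive, partition, major, minor);
  }

  int main(void)
  {
          dac960_node(2, 5, 3);   /* the README's example: major 50, minor 43 */
          return 0;
  }
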
index 875b2b56b87fc88131324bb10bbecc8594e02109..3c1b5ab54bc07a487517c1441adc408b170cf5fa 100644 (file)
@@ -190,7 +190,7 @@ whitespace:
  notify_free      Depending on device usage scenario it may account
                   a) the number of pages freed because of swap slot free
                   notifications or b) the number of pages freed because of
-                  REQ_DISCARD requests sent by bio. The former ones are
+                  REQ_OP_DISCARD requests sent by bio. The former ones are
                   sent to a swap block device when a swap slot is freed,
                   which implies that this disk is being used as a swap disk.
                   The latter ones are sent by filesystem mounted with
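
The REQ_DISCARD -> REQ_OP_DISCARD rename here (and in the log-writes document
below) reflects that discard is now a bio operation value rather than a flag
bit. A hedged sketch of the distinction, with a hypothetical helper name:

  /* Operations are values read via bio_op(); modifiers such as REQ_FUA
   * remain flag bits in bio->bi_opf. */
  static void sketch_classify_bio(struct bio *bio)
  {
          if (bio_op(bio) == REQ_OP_DISCARD)
                  pr_debug("discard\n");      /* counted in notify_free (b) */
          if (bio->bi_opf & REQ_FUA)
                  pr_debug("FUA write\n");    /* bypasses any write-back cache */
  }
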
index f4ebcbaf50f308313770e79a37d180ac5acd6542..b638d124be6ac7e934c8a5f0ea8b65b3e83daeeb 100644 (file)
@@ -38,7 +38,7 @@ inconsistent file system.
 Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as
 they complete as those requests will obviously bypass the device cache.
 
-Any REQ_DISCARD requests are treated like WRITE requests.  Otherwise we would
+Any REQ_OP_DISCARD requests are treated like WRITE requests.  Otherwise we would
 have all the DISCARD requests, and then the WRITE requests and then the FLUSH
 request.  Consider the following example:
 
index 2cfbc531f63b61154fc7c18340b8a2c24f9732a8..6b51826ab3d10752c1d5e98b0ea3d5450b5359b1 100644 (file)
@@ -28,7 +28,6 @@
 #include <asm/byteorder.h>
 #include <asm/memory.h>
 #include <asm-generic/pci_iomap.h>
-#include <xen/xen.h>
 
 /*
  * ISA I/O bus memory addresses are 1:1 with the physical address.
@@ -459,20 +458,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr);
 
 #include <asm-generic/io.h>
 
-/*
- * can the hardware map this into one segment or not, given no other
- * constraints.
- */
-#define BIOVEC_MERGEABLE(vec1, vec2)   \
-       ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
-
-struct bio_vec;
-extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
-                                     const struct bio_vec *vec2);
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
-       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
-        (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-
 #ifdef CONFIG_MMU
 #define ARCH_HAS_VALID_PHYS_ADDR_RANGE
 extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
index 35b2e50f17fbfedc220f50c5162d42cec73160b5..9f8b915af3a718976ad0ddaca547c4deef21a4d2 100644 (file)
@@ -31,8 +31,6 @@
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 
-#include <xen/xen.h>
-
 /*
  * Generic IO read/write.  These perform native-endian accesses.
  */
@@ -205,12 +203,5 @@ extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
 
 extern int devmem_is_allowed(unsigned long pfn);
 
-struct bio_vec;
-extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
-                                     const struct bio_vec *vec2);
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
-       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
-        (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_IO_H */
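
The per-architecture BIOVEC_PHYS_MERGEABLE macros removed above do not simply
disappear; this series centralizes the check in the block core. The following
sketch is reconstructed from the deleted macros, and the consolidated helper's
real name and home in block/ are assumptions:

  /* Two bvecs merge into one segment when the second starts exactly where
   * the first ends, subject to the Xen-specific constraint. */
  static inline bool sketch_biovec_phys_mergeable(const struct bio_vec *v1,
                                                  const struct bio_vec *v2)
  {
          phys_addr_t end1 = page_to_phys(v1->bv_page) + v1->bv_offset +
                             v1->bv_len;
          phys_addr_t start2 = page_to_phys(v2->bv_page) + v2->bv_offset;

          if (end1 != start2)
                  return false;
          if (xen_domain() && !xen_biovec_phys_mergeable(v1, v2))
                  return false;
          return true;
  }
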
index e9110b9b8bcdec50ec26ef974557447ebadb213a..38049357d6d3279ff7b95ff801a01e2cb88d9a4b 100644 (file)
@@ -73,7 +73,7 @@ static blk_qc_t nfhd_make_request(struct request_queue *queue, struct bio *bio)
                len = bvec.bv_len;
                len >>= 9;
                nfhd_read_write(dev->id, 0, dir, sec >> shift, len >> shift,
-                               bvec_to_phys(&bvec));
+                               page_to_phys(bvec.bv_page) + bvec.bv_offset);
                sec += len;
        }
        bio_endio(bio);
diff --git a/arch/m68k/include/asm/atafd.h b/arch/m68k/include/asm/atafd.h
deleted file mode 100644 (file)
index ad7014c..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_M68K_FD_H
-#define _ASM_M68K_FD_H
-
-/* Definitions for the Atari Floppy driver */
-
-struct atari_format_descr {
-    int track;                 /* to be formatted */
-    int head;                  /*   ""     ""     */
-    int sect_offset;           /* offset of first sector */
-};
-
-#endif
diff --git a/arch/m68k/include/asm/atafdreg.h b/arch/m68k/include/asm/atafdreg.h
deleted file mode 100644 (file)
index c31b491..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_FDREG_H
-#define _LINUX_FDREG_H
-
-/*
-** WD1772 stuff
- */
-
-/* register codes */
-
-#define FDCSELREG_STP   (0x80)   /* command/status register */
-#define FDCSELREG_TRA   (0x82)   /* track register */
-#define FDCSELREG_SEC   (0x84)   /* sector register */
-#define FDCSELREG_DTA   (0x86)   /* data register */
-
-/* register names for FDC_READ/WRITE macros */
-
-#define FDCREG_CMD             0
-#define FDCREG_STATUS  0
-#define FDCREG_TRACK   2
-#define FDCREG_SECTOR  4
-#define FDCREG_DATA            6
-
-/* command opcodes */
-
-#define FDCCMD_RESTORE  (0x00)   /*  -                   */
-#define FDCCMD_SEEK     (0x10)   /*   |                  */
-#define FDCCMD_STEP     (0x20)   /*   |  TYP 1 Commands  */
-#define FDCCMD_STIN     (0x40)   /*   |                  */
-#define FDCCMD_STOT     (0x60)   /*  -                   */
-#define FDCCMD_RDSEC    (0x80)   /*  -   TYP 2 Commands  */
-#define FDCCMD_WRSEC    (0xa0)   /*  -          "        */
-#define FDCCMD_RDADR    (0xc0)   /*  -                   */
-#define FDCCMD_RDTRA    (0xe0)   /*   |  TYP 3 Commands  */
-#define FDCCMD_WRTRA    (0xf0)   /*  -                   */
-#define FDCCMD_FORCI    (0xd0)   /*  -   TYP 4 Command   */
-
-/* command modifier bits */
-
-#define FDCCMDADD_SR6   (0x00)   /* step rate settings */
-#define FDCCMDADD_SR12  (0x01)
-#define FDCCMDADD_SR2   (0x02)
-#define FDCCMDADD_SR3   (0x03)
-#define FDCCMDADD_V     (0x04)   /* verify */
-#define FDCCMDADD_H     (0x08)   /* wait for spin-up */
-#define FDCCMDADD_U     (0x10)   /* update track register */
-#define FDCCMDADD_M     (0x10)   /* multiple sector access */
-#define FDCCMDADD_E     (0x04)   /* head settling flag */
-#define FDCCMDADD_P     (0x02)   /* precompensation off */
-#define FDCCMDADD_A0    (0x01)   /* DAM flag */
-
-/* status register bits */
-
-#define        FDCSTAT_MOTORON (0x80)   /* motor on */
-#define        FDCSTAT_WPROT   (0x40)   /* write protected (FDCCMD_WR*) */
-#define        FDCSTAT_SPINUP  (0x20)   /* motor speed stable (Type I) */
-#define        FDCSTAT_DELDAM  (0x20)   /* sector has deleted DAM (Type II+III) */
-#define        FDCSTAT_RECNF   (0x10)   /* record not found */
-#define        FDCSTAT_CRC             (0x08)   /* CRC error */
-#define        FDCSTAT_TR00    (0x04)   /* Track 00 flag (Type I) */
-#define        FDCSTAT_LOST    (0x04)   /* Lost Data (Type II+III) */
-#define        FDCSTAT_IDX             (0x02)   /* Index status (Type I) */
-#define        FDCSTAT_DRQ             (0x02)   /* DRQ status (Type II+III) */
-#define        FDCSTAT_BUSY    (0x01)   /* FDC is busy */
-
-
-/* PSG Port A Bit Nr 0 .. Side Sel .. 0 -> Side 1  1 -> Side 2 */
-#define DSKSIDE     (0x01)
-
-#define DSKDRVNONE  (0x06)
-#define DSKDRV0     (0x02)
-#define DSKDRV1     (0x04)
-
-/* step rates */
-#define        FDCSTEP_6       0x00
-#define        FDCSTEP_12      0x01
-#define        FDCSTEP_2       0x02
-#define        FDCSTEP_3       0x03
-
-#endif
index 83c470364dfb34dce51320322e9b119c10157f3e..74c002ddc0ce74868286b77f43dfa6885e6c3e70 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/ata.h>
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
@@ -142,7 +143,6 @@ struct cow {
 #define MAX_SG 64
 
 struct ubd {
-       struct list_head restart;
        /* name (and fd, below) of the file opened for writing, either the
         * backing or the cow file. */
        char *file;
@@ -156,11 +156,8 @@ struct ubd {
        struct cow cow;
        struct platform_device pdev;
        struct request_queue *queue;
+       struct blk_mq_tag_set tag_set;
        spinlock_t lock;
-       struct scatterlist sg[MAX_SG];
-       struct request *request;
-       int start_sg, end_sg;
-       sector_t rq_pos;
 };
 
 #define DEFAULT_COW { \
@@ -182,10 +179,6 @@ struct ubd {
        .shared =               0, \
        .cow =                  DEFAULT_COW, \
        .lock =                 __SPIN_LOCK_UNLOCKED(ubd_devs.lock), \
-       .request =              NULL, \
-       .start_sg =             0, \
-       .end_sg =               0, \
-       .rq_pos =               0, \
 }
 
 /* Protected by ubd_lock */
@@ -196,6 +189,9 @@ static int fake_ide = 0;
 static struct proc_dir_entry *proc_ide_root = NULL;
 static struct proc_dir_entry *proc_ide = NULL;
 
+static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+                                const struct blk_mq_queue_data *bd);
+
 static void make_proc_ide(void)
 {
        proc_ide_root = proc_mkdir("ide", NULL);
@@ -436,11 +432,8 @@ __uml_help(udb_setup,
 "    in the boot output.\n\n"
 );
 
-static void do_ubd_request(struct request_queue * q);
-
 /* Only changed by ubd_init, which is an initcall. */
 static int thread_fd = -1;
-static LIST_HEAD(restart);
 
 /* Function to read several request pointers at a time
 * handling fractional reads if (and as) needed
@@ -498,9 +491,6 @@ static int bulk_req_safe_read(
 /* Called without dev->lock held, and only in interrupt context. */
 static void ubd_handler(void)
 {
-       struct ubd *ubd;
-       struct list_head *list, *next_ele;
-       unsigned long flags;
        int n;
        int count;
 
@@ -520,23 +510,17 @@ static void ubd_handler(void)
                        return;
                }
                for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
-                       blk_end_request(
-                               (*irq_req_buffer)[count]->req,
-                               BLK_STS_OK,
-                               (*irq_req_buffer)[count]->length
-                       );
-                       kfree((*irq_req_buffer)[count]);
+                       struct io_thread_req *io_req = (*irq_req_buffer)[count];
+                       int err = io_req->error ? BLK_STS_IOERR : BLK_STS_OK;
+
+                       if (!blk_update_request(io_req->req, err, io_req->length))
+                               __blk_mq_end_request(io_req->req, err);
+
+                       kfree(io_req);
                }
        }
-       reactivate_fd(thread_fd, UBD_IRQ);
 
-       list_for_each_safe(list, next_ele, &restart){
-               ubd = container_of(list, struct ubd, restart);
-               list_del_init(&ubd->restart);
-               spin_lock_irqsave(&ubd->lock, flags);
-               do_ubd_request(ubd->queue);
-               spin_unlock_irqrestore(&ubd->lock, flags);
-       }
+       reactivate_fd(thread_fd, UBD_IRQ);
 }
 
 static irqreturn_t ubd_intr(int irq, void *dev)
@@ -857,6 +841,7 @@ static void ubd_device_release(struct device *dev)
        struct ubd *ubd_dev = dev_get_drvdata(dev);
 
        blk_cleanup_queue(ubd_dev->queue);
+       blk_mq_free_tag_set(&ubd_dev->tag_set);
        *ubd_dev = ((struct ubd) DEFAULT_UBD);
 }
 
@@ -891,7 +876,7 @@ static int ubd_disk_register(int major, u64 size, int unit,
 
        disk->private_data = &ubd_devs[unit];
        disk->queue = ubd_devs[unit].queue;
-       device_add_disk(parent, disk);
+       device_add_disk(parent, disk, NULL);
 
        *disk_out = disk;
        return 0;
@@ -899,6 +884,10 @@ static int ubd_disk_register(int major, u64 size, int unit,
 
 #define ROUND_BLOCK(n) ((n + ((1 << 9) - 1)) & (-1 << 9))
 
+static const struct blk_mq_ops ubd_mq_ops = {
+       .queue_rq = ubd_queue_rq,
+};
+
 static int ubd_add(int n, char **error_out)
 {
        struct ubd *ubd_dev = &ubd_devs[n];
@@ -915,15 +904,23 @@ static int ubd_add(int n, char **error_out)
 
        ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
 
-       INIT_LIST_HEAD(&ubd_dev->restart);
-       sg_init_table(ubd_dev->sg, MAX_SG);
+       ubd_dev->tag_set.ops = &ubd_mq_ops;
+       ubd_dev->tag_set.queue_depth = 64;
+       ubd_dev->tag_set.numa_node = NUMA_NO_NODE;
+       ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+       ubd_dev->tag_set.driver_data = ubd_dev;
+       ubd_dev->tag_set.nr_hw_queues = 1;
 
-       err = -ENOMEM;
-       ubd_dev->queue = blk_init_queue(do_ubd_request, &ubd_dev->lock);
-       if (ubd_dev->queue == NULL) {
-               *error_out = "Failed to initialize device queue";
+       err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
+       if (err)
                goto out;
+
+       ubd_dev->queue = blk_mq_init_queue(&ubd_dev->tag_set);
+       if (IS_ERR(ubd_dev->queue)) {
+               err = PTR_ERR(ubd_dev->queue);
+               goto out_cleanup;
        }
+
        ubd_dev->queue->queuedata = ubd_dev;
        blk_queue_write_cache(ubd_dev->queue, true, false);
 
@@ -931,7 +928,7 @@ static int ubd_add(int n, char **error_out)
        err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]);
        if(err){
                *error_out = "Failed to register device";
-               goto out_cleanup;
+               goto out_cleanup_tags;
        }
 
        if (fake_major != UBD_MAJOR)
@@ -949,6 +946,8 @@ static int ubd_add(int n, char **error_out)
 out:
        return err;
 
+out_cleanup_tags:
+       blk_mq_free_tag_set(&ubd_dev->tag_set);
 out_cleanup:
        blk_cleanup_queue(ubd_dev->queue);
        goto out;
@@ -1290,123 +1289,82 @@ static void cowify_req(struct io_thread_req *req, unsigned long *bitmap,
                           req->bitmap_words, bitmap_len);
 }
 
-/* Called with dev->lock held */
-static void prepare_request(struct request *req, struct io_thread_req *io_req,
-                           unsigned long long offset, int page_offset,
-                           int len, struct page *page)
+static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req,
+               u64 off, struct bio_vec *bvec)
 {
-       struct gendisk *disk = req->rq_disk;
-       struct ubd *ubd_dev = disk->private_data;
-
-       io_req->req = req;
-       io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-               ubd_dev->fd;
-       io_req->fds[1] = ubd_dev->fd;
-       io_req->cow_offset = -1;
-       io_req->offset = offset;
-       io_req->length = len;
-       io_req->error = 0;
-       io_req->sector_mask = 0;
-
-       io_req->op = (rq_data_dir(req) == READ) ? UBD_READ : UBD_WRITE;
-       io_req->offsets[0] = 0;
-       io_req->offsets[1] = ubd_dev->cow.data_offset;
-       io_req->buffer = page_address(page) + page_offset;
-       io_req->sectorsize = 1 << 9;
-
-       if(ubd_dev->cow.file != NULL)
-               cowify_req(io_req, ubd_dev->cow.bitmap,
-                          ubd_dev->cow.bitmap_offset, ubd_dev->cow.bitmap_len);
-
-}
+       struct ubd *dev = hctx->queue->queuedata;
+       struct io_thread_req *io_req;
+       int ret;
 
-/* Called with dev->lock held */
-static void prepare_flush_request(struct request *req,
-                                 struct io_thread_req *io_req)
-{
-       struct gendisk *disk = req->rq_disk;
-       struct ubd *ubd_dev = disk->private_data;
+       io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC);
+       if (!io_req)
+               return -ENOMEM;
 
        io_req->req = req;
-       io_req->fds[0] = (ubd_dev->cow.file != NULL) ? ubd_dev->cow.fd :
-               ubd_dev->fd;
-       io_req->op = UBD_FLUSH;
-}
+       if (dev->cow.file)
+               io_req->fds[0] = dev->cow.fd;
+       else
+               io_req->fds[0] = dev->fd;
 
-static bool submit_request(struct io_thread_req *io_req, struct ubd *dev)
-{
-       int n = os_write_file(thread_fd, &io_req,
-                            sizeof(io_req));
-       if (n != sizeof(io_req)) {
-               if (n != -EAGAIN)
-                       printk("write to io thread failed, "
-                              "errno = %d\n", -n);
-               else if (list_empty(&dev->restart))
-                       list_add(&dev->restart, &restart);
+       if (req_op(req) == REQ_OP_FLUSH) {
+               io_req->op = UBD_FLUSH;
+       } else {
+               io_req->fds[1] = dev->fd;
+               io_req->cow_offset = -1;
+               io_req->offset = off;
+               io_req->length = bvec->bv_len;
+               io_req->error = 0;
+               io_req->sector_mask = 0;
+
+               io_req->op = rq_data_dir(req) == READ ? UBD_READ : UBD_WRITE;
+               io_req->offsets[0] = 0;
+               io_req->offsets[1] = dev->cow.data_offset;
+               io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset;
+               io_req->sectorsize = 1 << 9;
+
+               if (dev->cow.file) {
+                       cowify_req(io_req, dev->cow.bitmap,
+                                  dev->cow.bitmap_offset, dev->cow.bitmap_len);
+               }
+       }
 
+       ret = os_write_file(thread_fd, &io_req, sizeof(io_req));
+       if (ret != sizeof(io_req)) {
+               if (ret != -EAGAIN)
+                       pr_err("write to io thread failed: %d\n", -ret);
                kfree(io_req);
-               return false;
        }
-       return true;
+
+       return ret;
 }
 
-/* Called with dev->lock held */
-static void do_ubd_request(struct request_queue *q)
+static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx,
+                                const struct blk_mq_queue_data *bd)
 {
-       struct io_thread_req *io_req;
-       struct request *req;
-
-       while(1){
-               struct ubd *dev = q->queuedata;
-               if(dev->request == NULL){
-                       struct request *req = blk_fetch_request(q);
-                       if(req == NULL)
-                               return;
-
-                       dev->request = req;
-                       dev->rq_pos = blk_rq_pos(req);
-                       dev->start_sg = 0;
-                       dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
-               }
-
-               req = dev->request;
+       struct request *req = bd->rq;
+       int ret = 0;
 
-               if (req_op(req) == REQ_OP_FLUSH) {
-                       io_req = kmalloc(sizeof(struct io_thread_req),
-                                        GFP_ATOMIC);
-                       if (io_req == NULL) {
-                               if (list_empty(&dev->restart))
-                                       list_add(&dev->restart, &restart);
-                               return;
-                       }
-                       prepare_flush_request(req, io_req);
-                       if (submit_request(io_req, dev) == false)
-                               return;
-               }
+       blk_mq_start_request(req);
 
-               while(dev->start_sg < dev->end_sg){
-                       struct scatterlist *sg = &dev->sg[dev->start_sg];
-
-                       io_req = kmalloc(sizeof(struct io_thread_req),
-                                        GFP_ATOMIC);
-                       if(io_req == NULL){
-                               if(list_empty(&dev->restart))
-                                       list_add(&dev->restart, &restart);
-                               return;
-                       }
-                       prepare_request(req, io_req,
-                                       (unsigned long long)dev->rq_pos << 9,
-                                       sg->offset, sg->length, sg_page(sg));
-
-                       if (submit_request(io_req, dev) == false)
-                               return;
-
-                       dev->rq_pos += sg->length >> 9;
-                       dev->start_sg++;
+       if (req_op(req) == REQ_OP_FLUSH) {
+               ret = ubd_queue_one_vec(hctx, req, 0, NULL);
+       } else {
+               struct req_iterator iter;
+               struct bio_vec bvec;
+               u64 off = (u64)blk_rq_pos(req) << 9;
+
+               rq_for_each_segment(bvec, req, iter) {
+                       ret = ubd_queue_one_vec(hctx, req, off, &bvec);
+                       if (ret < 0)
+                               goto out;
+                       off += bvec.bv_len;
                }
-               dev->end_sg = 0;
-               dev->request = NULL;
        }
+out:
+       if (ret < 0) {
+               blk_mq_requeue_request(req, true);
+       }
+       return BLK_STS_OK;
 }
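
The registration side of the conversion follows the usual blk-mq recipe:
populate a blk_mq_tag_set, allocate it, then derive the queue from it.  A
hedged sketch of that recipe (kernel C, condensed from ubd_add() above); one
detail worth making explicit is that when blk_mq_init_queue() fails, the
queue pointer holds an ERR_PTR value, so only the tag set needs freeing at
that point:

#include <linux/blk-mq.h>

static int sketch_mq_setup(struct ubd *dev, const struct blk_mq_ops *ops)
{
        int err;

        dev->tag_set.ops = ops;
        dev->tag_set.nr_hw_queues = 1;  /* a single queue feeds the I/O thread */
        dev->tag_set.queue_depth = 64;
        dev->tag_set.numa_node = NUMA_NO_NODE;
        dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        dev->tag_set.driver_data = dev;

        err = blk_mq_alloc_tag_set(&dev->tag_set);
        if (err)
                return err;

        dev->queue = blk_mq_init_queue(&dev->tag_set);
        if (IS_ERR(dev->queue)) {
                /* the queue was never created: free only the tag set */
                blk_mq_free_tag_set(&dev->tag_set);
                return PTR_ERR(dev->queue);
        }

        dev->queue->queuedata = dev;
        return 0;
}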
 
 static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
index 6de64840dd22ede96c410243dcfad3b5a685365c..9a92a3ac2ac5eebde4368dcdd395ad432e77d191 100644
@@ -369,18 +369,6 @@ extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
 
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
-#ifdef CONFIG_XEN
-#include <xen/xen.h>
-struct bio_vec;
-
-extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
-                                     const struct bio_vec *vec2);
-
-#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)                              \
-       (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&                         \
-        (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
-#endif /* CONFIG_XEN */
-
 #define IO_SPACE_LIMIT 0xffff
 
 #include <asm-generic/io.h>
index d383140e1dc88b802d51fd68dddfeb77b002408e..068d9b067c83cc2f6d606d0d790a6dec76c3e326 100644
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_XEN_EVENTS_H
 #define _ASM_X86_XEN_EVENTS_H
 
+#include <xen/xen.h>
+
 enum ipi_vector {
        XEN_RESCHEDULE_VECTOR,
        XEN_CALL_FUNCTION_VECTOR,
index 2eeddd81465330f43e269d4e1d2449f82772bfd1..0ca46e03b8309c23e27e2ea1c1bff9106bd6a926 100644
@@ -5,6 +5,7 @@
 #include <linux/kexec.h>
 #include <linux/slab.h>
 
+#include <xen/xen.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/interface/memory.h>
index c85d1a88f47693232369411588cfc19084086b25..2a9025343534e4db679bdd6f860744f6b5be107b 100644
@@ -11,6 +11,7 @@
 #include <asm/xen/interface.h>
 #include <asm/xen/hypercall.h>
 
+#include <xen/xen.h>
 #include <xen/interface/memory.h>
 #include <xen/interface/hvm/start_info.h>
 
index 33a783c77d969ecb5c76b841e13aa2ac19ad0da5..b99585034dd2d370cef04e5b580499ca27701f88 100644
@@ -23,6 +23,7 @@
 #include <linux/io.h>
 #include <linux/export.h>
 
+#include <xen/xen.h>
 #include <xen/platform_pci.h>
 #include "xen-ops.h"
 
index 95997e6c06960073c75b713cb2be76ec74c0886c..0972184f3f199272d534c1f5d2cad46093fdce54 100644
@@ -3,6 +3,7 @@
 #include <linux/interrupt.h>
 
 #include <asm/xen/hypercall.h>
+#include <xen/xen.h>
 #include <xen/page.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/vcpu.h>
index 1f2469a0123ceb1f36103c0db9eb71d140556193..f7045aa47edbabebf8ff1c627956f61eebdab812 100644
@@ -74,7 +74,6 @@ config BLK_DEV_BSG
 
 config BLK_DEV_BSGLIB
        bool "Block layer SG support v4 helper lib"
-       default n
        select BLK_DEV_BSG
        select BLK_SCSI_REQUEST
        help
@@ -107,7 +106,6 @@ config BLK_DEV_ZONED
 config BLK_DEV_THROTTLING
        bool "Block layer bio throttling support"
        depends on BLK_CGROUP=y
-       default n
        ---help---
        Block layer bio throttling support. It can be used to limit
        the IO rate to a device. IO rate policies are per cgroup and
@@ -119,7 +117,6 @@ config BLK_DEV_THROTTLING
 config BLK_DEV_THROTTLING_LOW
        bool "Block throttling .low limit interface support (EXPERIMENTAL)"
        depends on BLK_DEV_THROTTLING
-       default n
        ---help---
        Add .low limit interface for block throttling. The low limit is a best
        effort limit to prioritize cgroups. Depending on the setting, the limit
@@ -130,7 +127,6 @@ config BLK_DEV_THROTTLING_LOW
 
 config BLK_CMDLINE_PARSER
        bool "Block device command line partition parser"
-       default n
        ---help---
        Enabling this option allows you to specify the partition layout from
        the kernel boot args.  This is typically of use for embedded devices
@@ -141,7 +137,6 @@ config BLK_CMDLINE_PARSER
 
 config BLK_WBT
        bool "Enable support for block device writeback throttling"
-       default n
        ---help---
        Enabling this option enables the block layer to throttle buffered
        background writeback from the VM, making it more smooth and having
@@ -152,7 +147,6 @@ config BLK_WBT
 config BLK_CGROUP_IOLATENCY
        bool "Enable support for latency based cgroup IO protection"
        depends on BLK_CGROUP=y
-       default n
        ---help---
        Enabling this option enables the .latency interface for IO throttling.
        The IO controller will attempt to maintain average IO latencies below
@@ -163,7 +157,6 @@ config BLK_CGROUP_IOLATENCY
 
 config BLK_WBT_SQ
        bool "Single queue writeback throttling"
-       default n
        depends on BLK_WBT
        ---help---
        Enable writeback throttling by default on legacy single queue devices
@@ -228,4 +221,7 @@ config BLK_MQ_RDMA
        depends on BLOCK && INFINIBAND
        default y
 
+config BLK_PM
+       def_bool BLOCK && PM
+
 source block/Kconfig.iosched
index a4a8914bf7a408ddb0ace6bab8ba4e5894e59d94..f95a48b0d7b23df67435395e2e18f40d678090d1 100644
@@ -36,7 +36,6 @@ config IOSCHED_CFQ
 config CFQ_GROUP_IOSCHED
        bool "CFQ Group Scheduling support"
        depends on IOSCHED_CFQ && BLK_CGROUP
-       default n
        ---help---
          Enable group IO scheduling in CFQ.
 
@@ -82,7 +81,6 @@ config MQ_IOSCHED_KYBER
 
 config IOSCHED_BFQ
        tristate "BFQ I/O scheduler"
-       default n
        ---help---
        BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
        the device among all processes according to their weights,
@@ -94,7 +92,6 @@ config IOSCHED_BFQ
 config BFQ_GROUP_IOSCHED
        bool "BFQ hierarchical scheduling support"
        depends on IOSCHED_BFQ && BLK_CGROUP
-       default n
        ---help---
 
        Enable hierarchical scheduling in BFQ, using the blkio
index 572b33f32c07cf7056fb1121abba753a9ba8a0ac..27eac600474f026b89c8caa885a2cb0ba3a07729 100644
@@ -37,3 +37,4 @@ obj-$(CONFIG_BLK_WBT)         += blk-wbt.o
 obj-$(CONFIG_BLK_DEBUG_FS)     += blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)     += sed-opal.o
+obj-$(CONFIG_BLK_PM)           += blk-pm.o
index 9fe5952d117d553f12f32055fde8683c554b06a8..d9a7916ff0ab6474a6f4abac2873a6685ad4d467 100644
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
        uint64_t serial_nr;
 
        rcu_read_lock();
-       serial_nr = bio_blkcg(bio)->css.serial_nr;
+       serial_nr = __bio_blkcg(bio)->css.serial_nr;
 
        /*
         * Check whether blkcg has changed.  The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
        if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
                goto out;
 
-       bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+       bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
        /*
         * Update blkg_path for bfq_log_* functions. We cache this
         * path, and update it here, for the following
index 653100fb719eb80e1bb11e9ef7761f14f960623f..6075100f03a50a73da838b19891b923d0ad422a7 100644
@@ -624,12 +624,13 @@ void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 }
 
 /*
- * Tell whether there are active queues or groups with differentiated weights.
+ * Tell whether there are active queues with different weights or
+ * active groups.
  */
-static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
 {
        /*
-        * For weights to differ, at least one of the trees must contain
+        * For queue weights to differ, queue_weights_tree must contain
         * at least two nodes.
         */
        return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
@@ -637,9 +638,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
                 bfqd->queue_weights_tree.rb_node->rb_right)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
               ) ||
-              (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
-               (bfqd->group_weights_tree.rb_node->rb_left ||
-                bfqd->group_weights_tree.rb_node->rb_right)
+               (bfqd->num_active_groups > 0
 #endif
               );
 }
@@ -657,26 +656,25 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  * 3) all active groups at the same level in the groups tree have the same
  *    number of children.
  *
- * Unfortunately, keeping the necessary state for evaluating exactly the
- * above symmetry conditions would be quite complex and time-consuming.
- * Therefore this function evaluates, instead, the following stronger
- * sub-conditions, for which it is much easier to maintain the needed
- * state:
+ * Unfortunately, keeping the necessary state for evaluating exactly
+ * the last two symmetry sub-conditions above would be quite complex
+ * and time-consuming.  Therefore this function evaluates, instead,
+ * only the following stronger two sub-conditions, for which it is
+ * much easier to maintain the needed state:
  * 1) all active queues have the same weight,
- * 2) all active groups have the same weight,
- * 3) all active groups have at most one active child each.
- * In particular, the last two conditions are always true if hierarchical
- * support and the cgroups interface are not enabled, thus no state needs
- * to be maintained in this case.
+ * 2) there are no active groups.
+ * In particular, the last condition is always true if hierarchical
+ * support or the cgroups interface are not enabled, thus no state
+ * needs to be maintained in this case.
  */
 static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 {
-       return !bfq_differentiated_weights(bfqd);
+       return !bfq_varied_queue_weights_or_active_groups(bfqd);
 }
 
 /*
  * If the weight-counter tree passed as input contains no counter for
- * the weight of the input entity, then add that counter; otherwise just
+ * the weight of the input queue, then add that counter; otherwise just
  * increment the existing counter.
  *
  * Note that weight-counter trees contain few nodes in mostly symmetric
@@ -687,25 +685,25 @@ static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
  * In most scenarios, the rate at which nodes are created/destroyed
  * should be low too.
  */
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                          struct rb_root *root)
 {
+       struct bfq_entity *entity = &bfqq->entity;
        struct rb_node **new = &(root->rb_node), *parent = NULL;
 
        /*
-        * Do not insert if the entity is already associated with a
+        * Do not insert if the queue is already associated with a
         * counter, which happens if:
-        *   1) the entity is associated with a queue,
-        *   2) a request arrival has caused the queue to become both
+        *   1) a request arrival has caused the queue to become both
         *      non-weight-raised, and hence change its weight, and
         *      backlogged; in this respect, each of the two events
         *      causes an invocation of this function,
-        *   3) this is the invocation of this function caused by the
+        *   2) this is the invocation of this function caused by the
         *      second event. This second invocation is actually useless,
         *      and we handle this fact by exiting immediately. More
         *      efficient or clearer solutions might possibly be adopted.
         */
-       if (entity->weight_counter)
+       if (bfqq->weight_counter)
                return;
 
        while (*new) {
@@ -715,7 +713,7 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                parent = *new;
 
                if (entity->weight == __counter->weight) {
-                       entity->weight_counter = __counter;
+                       bfqq->weight_counter = __counter;
                        goto inc_counter;
                }
                if (entity->weight < __counter->weight)
@@ -724,66 +722,67 @@ void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
                        new = &((*new)->rb_right);
        }
 
-       entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
-                                        GFP_ATOMIC);
+       bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+                                      GFP_ATOMIC);
 
        /*
         * In the unlucky event of an allocation failure, we just
-        * exit. This will cause the weight of entity to not be
-        * considered in bfq_differentiated_weights, which, in its
-        * turn, causes the scenario to be deemed wrongly symmetric in
-        * case entity's weight would have been the only weight making
-        * the scenario asymmetric. On the bright side, no unbalance
-        * will however occur when entity becomes inactive again (the
-        * invocation of this function is triggered by an activation
-        * of entity). In fact, bfq_weights_tree_remove does nothing
-        * if !entity->weight_counter.
+        * exit. This will cause the weight of queue to not be
+        * considered in bfq_varied_queue_weights_or_active_groups,
+        * which, in its turn, causes the scenario to be deemed
+        * wrongly symmetric in case bfqq's weight would have been
+        * the only weight making the scenario asymmetric.  On the
+        * bright side, no unbalance will however occur when bfqq
+        * becomes inactive again (the invocation of this function
+        * is triggered by an activation of queue).  In fact,
+        * bfq_weights_tree_remove does nothing if
+        * !bfqq->weight_counter.
         */
-       if (unlikely(!entity->weight_counter))
+       if (unlikely(!bfqq->weight_counter))
                return;
 
-       entity->weight_counter->weight = entity->weight;
-       rb_link_node(&entity->weight_counter->weights_node, parent, new);
-       rb_insert_color(&entity->weight_counter->weights_node, root);
+       bfqq->weight_counter->weight = entity->weight;
+       rb_link_node(&bfqq->weight_counter->weights_node, parent, new);
+       rb_insert_color(&bfqq->weight_counter->weights_node, root);
 
 inc_counter:
-       entity->weight_counter->num_active++;
+       bfqq->weight_counter->num_active++;
 }
 
 /*
- * Decrement the weight counter associated with the entity, and, if the
+ * Decrement the weight counter associated with the queue, and, if the
  * counter reaches 0, remove the counter from the tree.
  * See the comments to the function bfq_weights_tree_add() for considerations
  * about overhead.
  */
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-                              struct bfq_entity *entity,
+                              struct bfq_queue *bfqq,
                               struct rb_root *root)
 {
-       if (!entity->weight_counter)
+       if (!bfqq->weight_counter)
                return;
 
-       entity->weight_counter->num_active--;
-       if (entity->weight_counter->num_active > 0)
+       bfqq->weight_counter->num_active--;
+       if (bfqq->weight_counter->num_active > 0)
                goto reset_entity_pointer;
 
-       rb_erase(&entity->weight_counter->weights_node, root);
-       kfree(entity->weight_counter);
+       rb_erase(&bfqq->weight_counter->weights_node, root);
+       kfree(bfqq->weight_counter);
 
 reset_entity_pointer:
-       entity->weight_counter = NULL;
+       bfqq->weight_counter = NULL;
 }
 
 /*
- * Invoke __bfq_weights_tree_remove on bfqq and all its inactive
- * parent entities.
+ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
+ * of active groups for each queue's inactive parent entity.
  */
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
                             struct bfq_queue *bfqq)
 {
        struct bfq_entity *entity = bfqq->entity.parent;
 
-       __bfq_weights_tree_remove(bfqd, &bfqq->entity,
+       __bfq_weights_tree_remove(bfqd, bfqq,
                                  &bfqd->queue_weights_tree);
 
        for_each_entity(entity) {
@@ -797,17 +796,13 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                         * next_in_service for details on why
                         * in_service_entity must be checked too).
                         *
-                        * As a consequence, the weight of entity is
-                        * not to be removed. In addition, if entity
-                        * is active, then its parent entities are
-                        * active as well, and thus their weights are
-                        * not to be removed either. In the end, this
-                        * loop must stop here.
+                        * As a consequence, its parent entities are
+                        * active as well, and thus this loop must
+                        * stop here.
                         */
                        break;
                }
-               __bfq_weights_tree_remove(bfqd, entity,
-                                         &bfqd->group_weights_tree);
+               bfqd->num_active_groups--;
        }
 }
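
Net effect of the counter rework: the symmetry test becomes a constant-time
check, one weight among active queues and zero active groups.  A hedged
condensation (kernel C, illustrative name):

#include <linux/rbtree.h>

/*
 * Symmetric iff the queue-weights rbtree holds at most one node (all
 * active queues share one weight) and no group is counted as active.
 */
static bool sketch_symmetric(const struct rb_root *queue_weights,
                             unsigned int num_active_groups)
{
        const struct rb_node *n = queue_weights->rb_node;

        return (!n || (!n->rb_left && !n->rb_right)) &&
                num_active_groups == 0;
}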
 
@@ -3182,6 +3177,13 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
                    jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
 }
 
+static bool bfq_bfqq_injectable(struct bfq_queue *bfqq)
+{
+       return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+               blk_queue_nonrot(bfqq->bfqd->queue) &&
+               bfqq->bfqd->hw_tag;
+}
+
 /**
  * bfq_bfqq_expire - expire a queue.
  * @bfqd: device owning the queue.
@@ -3291,6 +3293,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
        if (ref == 1) /* bfqq is gone, no more actions on it */
                return;
 
+       bfqq->injected_service = 0;
+
        /* mark bfqq as waiting a request only if a bic still points to it */
        if (!bfq_bfqq_busy(bfqq) &&
            reason != BFQQE_BUDGET_TIMEOUT &&
@@ -3497,9 +3501,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * symmetric scenario where:
         * (i)  each of these processes must get the same throughput as
         *      the others;
-        * (ii) all these processes have the same I/O pattern
-               (either sequential or random).
-        * In fact, in such a scenario, the drive will tend to treat
+        * (ii) the I/O of each process has the same properties, in
+        *      terms of locality (sequential or random), direction
+        *      (reads or writes), request sizes, greediness
+        *      (from I/O-bound to sporadic), and so on.
+        * In fact, in such a scenario, the drive tends to treat
         * the requests of each of these processes in about the same
         * way as the requests of the others, and thus to provide
         * each of these processes with about the same throughput
@@ -3508,18 +3514,50 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * certainly needed to guarantee that bfqq receives its
         * assigned fraction of the device throughput (see [1] for
         * details).
+        * The problem is that idling may significantly reduce
+        * throughput with certain combinations of types of I/O and
+        * devices. An important example is sync random I/O, on flash
+        * storage with command queueing. So, unless bfqq falls in the
+        * above cases where idling also boosts throughput, it would
+        * be important to check conditions (i) and (ii) accurately,
+        * so as to avoid idling when not strictly needed for service
+        * guarantees.
+        *
+        * Unfortunately, it is extremely difficult to thoroughly
+        * check condition (ii). And, in case there are active groups,
+        * it becomes very difficult to check condition (i) too. In
+        * fact, if there are active groups, then, for condition (i)
+        * to become false, it is enough that an active group contains
+        * more active processes or sub-groups than some other active
+        * group. We address this issue with the following bi-modal
+        * behavior, implemented in the function
+        * bfq_symmetric_scenario().
         *
-        * We address this issue by controlling, actually, only the
-        * symmetry sub-condition (i), i.e., provided that
-        * sub-condition (i) holds, idling is not performed,
-        * regardless of whether sub-condition (ii) holds. In other
-        * words, only if sub-condition (i) holds, then idling is
+        * If there are active groups, then the scenario is tagged as
+        * asymmetric, conservatively, without checking any of the
+        * conditions (i) and (ii). So the device is idled for bfqq.
+        * This behavior matches also the fact that groups are created
+        * exactly if controlling I/O (to preserve bandwidth and
+        * latency guarantees) is a primary concern.
+        *
+        * On the opposite end, if there are no active groups, then
+        * only condition (i) is actually controlled, i.e., provided
+        * that condition (i) holds, idling is not performed,
+        * regardless of whether condition (ii) holds. In other words,
+        * only if condition (i) does not hold, then idling is
         * allowed, and the device tends to be prevented from queueing
-        * many requests, possibly of several processes. The reason
-        * for not controlling also sub-condition (ii) is that we
-        * exploit preemption to preserve guarantees in case of
-        * symmetric scenarios, even if (ii) does not hold, as
-        * explained in the next two paragraphs.
+        * many requests, possibly of several processes. Since there
+        * are no active groups, then, to control condition (i) it is
+        * enough to check whether all active queues have the same
+        * weight.
+        *
+        * Not checking condition (ii) evidently exposes bfqq to the
+        * risk of getting less throughput than its fair share.
+        * However, for queues with the same weight, a further
+        * mechanism, preemption, mitigates or even eliminates this
+        * problem. And it does so without consequences on overall
+        * throughput. This mechanism and its benefits are explained
+        * in the next three paragraphs.
         *
         * Even if a queue, say Q, is expired when it remains idle, Q
         * can still preempt the new in-service queue if the next
@@ -3533,11 +3571,7 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * idling allows the internal queues of the device to contain
         * many requests, and thus to reorder requests, we can rather
         * safely assume that the internal scheduler still preserves a
-        * minimum of mid-term fairness. The motivation for using
-        * preemption instead of idling is that, by not idling,
-        * service guarantees are preserved without minimally
-        * sacrificing throughput. In other words, both a high
-        * throughput and its desired distribution are obtained.
+        * minimum of mid-term fairness.
         *
         * More precisely, this preemption-based, idleless approach
         * provides fairness in terms of IOPS, and not sectors per
@@ -3556,22 +3590,27 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * 1024/8 times as high as the service received by the other
         * queue.
         *
-        * On the other hand, device idling is performed, and thus
-        * pure sector-domain guarantees are provided, for the
-        * following queues, which are likely to need stronger
-        * throughput guarantees: weight-raised queues, and queues
-        * with a higher weight than other queues. When such queues
-        * are active, sub-condition (i) is false, which triggers
-        * device idling.
+        * The motivation for using preemption instead of idling (for
+        * queues with the same weight) is that, by not idling,
+        * service guarantees are preserved (completely or at least in
+        * part) without sacrificing throughput even minimally. And, if
+        * there is no active group, then the primary expectation for
+        * this device is probably a high throughput.
         *
-        * According to the above considerations, the next variable is
-        * true (only) if sub-condition (i) holds. To compute the
-        * value of this variable, we not only use the return value of
-        * the function bfq_symmetric_scenario(), but also check
-        * whether bfqq is being weight-raised, because
-        * bfq_symmetric_scenario() does not take into account also
-        * weight-raised queues (see comments on
-        * bfq_weights_tree_add()).
+        * We are now left only with explaining the additional
+        * compound condition that is checked below for deciding
+        * whether the scenario is asymmetric. To explain this
+        * compound condition, we need to add that the function
+        * bfq_symmetric_scenario checks the weights of only
+        * non-weight-raised queues, for efficiency reasons (see
+        * comments on bfq_weights_tree_add()). Then the fact that
+        * bfqq is weight-raised is checked explicitly here. More
+        * precisely, the compound condition below takes into account
+        * also the fact that, even if bfqq is being weight-raised,
+        * the scenario is still symmetric if all active queues happen
+        * to be weight-raised. Actually, we should be even more
+        * precise here, and differentiate between interactive weight
+        * raising and soft real-time weight raising.
         *
         * As a side note, it is worth considering that the above
         * device-idling countermeasures may however fail in the
@@ -3583,7 +3622,8 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * to let requests be served in the desired order until all
         * the requests already queued in the device have been served.
         */
-       asymmetric_scenario = bfqq->wr_coeff > 1 ||
+       asymmetric_scenario = (bfqq->wr_coeff > 1 &&
+                              bfqd->wr_busy_queues < bfqd->busy_queues) ||
                !bfq_symmetric_scenario(bfqd);
 
        /*
@@ -3629,6 +3669,30 @@ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
        return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq);
 }
 
+static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
+{
+       struct bfq_queue *bfqq;
+
+       /*
+        * A linear search; but, with a high probability, very few
+        * steps are needed to find a candidate queue, i.e., a queue
+        * with enough budget left for its next request. In fact:
+        * - BFQ dynamically updates the budget of every queue so as
+        *   to accommodate the expected backlog of the queue;
+        * - if a queue gets all its requests dispatched as injected
+        *   service, then the queue is removed from the active list
+        *   (and re-added only if it gets new requests, but with
+        *   enough budget for its new backlog).
+        */
+       list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+               if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+                   bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+                   bfq_bfqq_budget_left(bfqq))
+                       return bfqq;
+
+       return NULL;
+}
+
 /*
  * Select a queue for service.  If we have a current queue in service,
  * check whether to continue servicing it, or retrieve and set a new one.
@@ -3710,10 +3774,19 @@ check_queue:
         * No requests pending. However, if the in-service queue is idling
         * for a new request, or has requests waiting for a completion and
         * may idle after their completion, then keep it anyway.
+        *
+        * Yet, to boost throughput, inject service from other queues if
+        * possible.
         */
        if (bfq_bfqq_wait_request(bfqq) ||
            (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
-               bfqq = NULL;
+               if (bfq_bfqq_injectable(bfqq) &&
+                   bfqq->injected_service * bfqq->inject_coeff <
+                   bfqq->entity.service * 10)
+                       bfqq = bfq_choose_bfqq_for_injection(bfqd);
+               else
+                       bfqq = NULL;
+
                goto keep_queue;
        }
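
The injection gate above bounds the service injected during bfqq's slot at
entity.service * 10 / inject_coeff.  Pulled out as a predicate (hypothetical
helper):

/*
 * With inject_coeff == 10, injection may reach 50% of the service
 * dispatched in the slot; with the aggressive inject_coeff == 1 set at
 * queue init below, roughly 90% (10/11).
 */
static bool sketch_may_inject(const struct bfq_queue *bfqq)
{
        return bfqq->injected_service * bfqq->inject_coeff <
               bfqq->entity.service * 10;
}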
 
@@ -3803,6 +3876,14 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 
        bfq_dispatch_remove(bfqd->queue, rq);
 
+       if (bfqq != bfqd->in_service_queue) {
+               if (likely(bfqd->in_service_queue))
+                       bfqd->in_service_queue->injected_service +=
+                               bfq_serv_to_charge(rq, bfqq);
+
+               goto return_rq;
+       }
+
        /*
         * If weight raising has to terminate for bfqq, then next
         * function causes an immediate update of bfqq's weight,
@@ -3821,13 +3902,12 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
         * belongs to CLASS_IDLE and other queues are waiting for
         * service.
         */
-       if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
-               goto expire;
-
-       return rq;
+       if (!(bfqd->busy_queues > 1 && bfq_class_idle(bfqq)))
+               goto return_rq;
 
-expire:
        bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+
+return_rq:
        return rq;
 }
 
@@ -4232,6 +4312,13 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                        bfq_mark_bfqq_has_short_ttime(bfqq);
                bfq_mark_bfqq_sync(bfqq);
                bfq_mark_bfqq_just_created(bfqq);
+               /*
+                * Aggressively inject a lot of service: up to 90%.
+                * This coefficient remains constant during bfqq life,
+                * but this behavior might be changed, after enough
+                * testing and tuning.
+                */
+               bfqq->inject_coeff = 1;
        } else
                bfq_clear_bfqq_sync(bfqq);
 
@@ -4297,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
        rcu_read_lock();
 
-       bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+       bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
        if (!bfqg) {
                bfqq = &bfqd->oom_bfqq;
                goto out;
@@ -5330,7 +5417,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
        bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
        bfqd->queue_weights_tree = RB_ROOT;
-       bfqd->group_weights_tree = RB_ROOT;
+       bfqd->num_active_groups = 0;
 
        INIT_LIST_HEAD(&bfqd->active_list);
        INIT_LIST_HEAD(&bfqd->idle_list);
index a8a2e5aca4d48f328dbb1c14bff88f1aaa485a2c..77651d817ecd36fe59827f2aa55f9c4ec5ffb979 100644
@@ -108,15 +108,14 @@ struct bfq_sched_data {
 };
 
 /**
- * struct bfq_weight_counter - counter of the number of all active entities
+ * struct bfq_weight_counter - counter of the number of all active queues
  *                             with a given weight.
  */
 struct bfq_weight_counter {
-       unsigned int weight; /* weight of the entities this counter refers to */
-       unsigned int num_active; /* nr of active entities with this weight */
+       unsigned int weight; /* weight of the queues this counter refers to */
+       unsigned int num_active; /* nr of active queues with this weight */
        /*
-        * Weights tree member (see bfq_data's @queue_weights_tree and
-        * @group_weights_tree)
+        * Weights tree member (see bfq_data's @queue_weights_tree)
         */
        struct rb_node weights_node;
 };
@@ -151,8 +150,6 @@ struct bfq_weight_counter {
 struct bfq_entity {
        /* service_tree member */
        struct rb_node rb_node;
-       /* pointer to the weight counter associated with this entity */
-       struct bfq_weight_counter *weight_counter;
 
        /*
         * Flag, true if the entity is on a tree (either the active or
@@ -266,6 +263,9 @@ struct bfq_queue {
        /* entity representing this queue in the scheduler */
        struct bfq_entity entity;
 
+       /* pointer to the weight counter associated with this queue */
+       struct bfq_weight_counter *weight_counter;
+
        /* maximum budget allowed from the feedback mechanism */
        int max_budget;
        /* budget expiration (in jiffies) */
@@ -351,6 +351,32 @@ struct bfq_queue {
        unsigned long split_time; /* time of last split */
 
        unsigned long first_IO_time; /* time of first I/O for this queue */
+
+       /* max service rate measured so far */
+       u32 max_service_rate;
+       /*
+        * Ratio between the service received by bfqq while it is in
+        * service, and the cumulative service (of requests of other
+        * queues) that may be injected while bfqq is empty but still
+        * in service. To increase precision, the coefficient is
+        * measured in tenths of a unit. Here are some examples of (1)
+        * ratios, (2) resulting percentages of service injected
+        * w.r.t. the total service dispatched while bfqq is in
+        * service, and (3) corresponding values of the coefficient:
+        * 1 (50%) -> 10
+        * 2 (33%) -> 20
+        * 10 (9%) -> 100
+        * 9.9 (9%) -> 99
+        * 1.5 (40%) -> 15
+        * 0.5 (66%) -> 5
+        * 0.1 (90%) -> 1
+        *
+        * So, if the coefficient is lower than 10, then
+        * injected service is more than bfqq service.
+        */
+       unsigned int inject_coeff;
+       /* amount of service injected in current service slot */
+       unsigned int injected_service;
 };
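
The table above follows from one formula: the injected share of the total
service is 10 / (10 + inject_coeff).  A throwaway userspace check (plain C,
not kernel code) reproduces the listed percentages:

#include <stdio.h>

int main(void)
{
        /* the coefficients listed in the comment, in the same order */
        const unsigned int coeffs[] = { 10, 20, 100, 99, 15, 5, 1 };

        for (unsigned int i = 0; i < sizeof(coeffs) / sizeof(coeffs[0]); i++)
                printf("inject_coeff %3u -> injected share %.0f%%\n",
                       coeffs[i], 100.0 * 10.0 / (10.0 + coeffs[i]));
        return 0;
}

This prints 50, 33, 9, 9, 40, 67 and 91 percent, matching the comment's
figures up to rounding (66% and 90% there).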
 
 /**
@@ -423,14 +449,9 @@ struct bfq_data {
         */
        struct rb_root queue_weights_tree;
        /*
-        * rbtree of non-queue @bfq_entity weight counters, sorted by
-        * weight. Used to keep track of whether all @bfq_groups have
-        * the same weight. The tree contains one counter for each
-        * distinct weight associated to some active @bfq_group (see
-        * the comments to the functions bfq_weights_tree_[add|remove]
-        * for further details).
+        * number of groups with requests still waiting for completion
         */
-       struct rb_root group_weights_tree;
+       unsigned int num_active_groups;
 
        /*
         * Number of bfq_queues containing requests (including the
@@ -825,10 +846,10 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
                          struct rb_root *root);
 void __bfq_weights_tree_remove(struct bfq_data *bfqd,
-                              struct bfq_entity *entity,
+                              struct bfq_queue *bfqq,
                               struct rb_root *root);
 void bfq_weights_tree_remove(struct bfq_data *bfqd,
                             struct bfq_queue *bfqq);
index ae52bff43ce4ff1697fdfdbc0c9c10fb4148c69e..476b5a90a5a48852d0849bc9d72bcd1a4e91f148 100644
@@ -788,25 +788,29 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
                new_weight = entity->orig_weight *
                             (bfqq ? bfqq->wr_coeff : 1);
                /*
-                * If the weight of the entity changes, remove the entity
-                * from its old weight counter (if there is a counter
-                * associated with the entity), and add it to the counter
-                * associated with its new weight.
+                * If the weight of the entity changes, and the entity is a
+                * queue, remove the entity from its old weight counter (if
+                * there is a counter associated with the entity).
                 */
                if (prev_weight != new_weight) {
-                       root = bfqq ? &bfqd->queue_weights_tree :
-                                     &bfqd->group_weights_tree;
-                       __bfq_weights_tree_remove(bfqd, entity, root);
+                       if (bfqq) {
+                               root = &bfqd->queue_weights_tree;
+                               __bfq_weights_tree_remove(bfqd, bfqq, root);
+                       } else
+                               bfqd->num_active_groups--;
                }
                entity->weight = new_weight;
                /*
-                * Add the entity to its weights tree only if it is
-                * not associated with a weight-raised queue.
+                * Add the entity, if it is not a weight-raised queue,
+                * to the counter associated with its new weight.
                 */
-               if (prev_weight != new_weight &&
-                   (bfqq ? bfqq->wr_coeff == 1 : 1))
-                       /* If we get here, root has been initialized. */
-                       bfq_weights_tree_add(bfqd, entity, root);
+               if (prev_weight != new_weight) {
+                       if (bfqq && bfqq->wr_coeff == 1) {
+                               /* If we get here, root has been initialized. */
+                               bfq_weights_tree_add(bfqd, bfqq, root);
+                       } else
+                               bfqd->num_active_groups++;
+               }
 
                new_st->wsum += entity->weight;
 
@@ -1012,9 +1016,9 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
        if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
                struct bfq_group *bfqg =
                        container_of(entity, struct bfq_group, entity);
+               struct bfq_data *bfqd = bfqg->bfqd;
 
-               bfq_weights_tree_add(bfqg->bfqd, entity,
-                                    &bfqd->group_weights_tree);
+               bfqd->num_active_groups++;
        }
 #endif
 
@@ -1181,10 +1185,17 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
        st = bfq_entity_service_tree(entity);
        is_in_service = entity == sd->in_service_entity;
 
-       if (is_in_service) {
-               bfq_calc_finish(entity, entity->service);
+       bfq_calc_finish(entity, entity->service);
+
+       if (is_in_service)
                sd->in_service_entity = NULL;
-       }
+       else
+               /*
+                * Non in-service entity: nobody will take care of
+                * resetting its service counter on expiration. Do it
+                * now.
+                */
+               entity->service = 0;
 
        if (entity->tree == &st->active)
                bfq_active_extract(st, entity);
@@ -1685,7 +1696,7 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
        if (!bfqq->dispatched)
                if (bfqq->wr_coeff == 1)
-                       bfq_weights_tree_add(bfqd, &bfqq->entity,
+                       bfq_weights_tree_add(bfqd, bfqq,
                                             &bfqd->queue_weights_tree);
 
        if (bfqq->wr_coeff > 1)
index 67b5fb861a5100c5294e572668881a187e478a6b..290af497997be49f8ab0b543cf65a8b1cf2347f3 100644
@@ -306,6 +306,8 @@ bool bio_integrity_prep(struct bio *bio)
        if (bio_data_dir(bio) == WRITE) {
                bio_integrity_process(bio, &bio->bi_iter,
                                      bi->profile->generate_fn);
+       } else {
+               bip->bio_iter = bio->bi_iter;
        }
        return true;
 
@@ -331,20 +333,14 @@ static void bio_integrity_verify_fn(struct work_struct *work)
                container_of(work, struct bio_integrity_payload, bip_work);
        struct bio *bio = bip->bip_bio;
        struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
-       struct bvec_iter iter = bio->bi_iter;
 
        /*
         * At the moment verify is called, the bio's iterator has been
         * advanced by splits and completion, so process the iterator
         * copy saved at prep time, which still holds the original
         * position.
         */
-       if (bio_rewind_iter(bio, &iter, iter.bi_done)) {
-               bio->bi_status = bio_integrity_process(bio, &iter,
-                                                      bi->profile->verify_fn);
-       } else {
-               bio->bi_status = BLK_STS_IOERR;
-       }
-
+       bio->bi_status = bio_integrity_process(bio, &bip->bio_iter,
+                                               bi->profile->verify_fn);
        bio_integrity_free(bio);
        bio_endio(bio);
 }
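
The pattern that replaces bio_rewind_iter() is snapshot-at-prep: copy the
iterator while it still points at the start of the data, then verify against
the copy once the live iterator has been advanced.  Schematically (kernel C;
the context struct is illustrative, the real storage is the bip->bio_iter
field added above):

#include <linux/bio.h>

struct verify_ctx {
        struct bvec_iter iter;  /* snapshot, taken before any advance */
};

/* At prep time, while bio->bi_iter is still at its starting position. */
static void sketch_snapshot(struct verify_ctx *ctx, const struct bio *bio)
{
        ctx->iter = bio->bi_iter;
}

/*
 * At completion time bio->bi_iter has moved forward (splits, partial
 * completions); the snapshot has not, so verification can walk the data
 * exactly as it was originally submitted.
 */
static const struct bvec_iter *sketch_verify_iter(const struct verify_ctx *ctx)
{
        return &ctx->iter;
}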
index 0093bed81c0e85882066499dcd92c5e94bd8a35d..bbfeb4ee2892fcbd9d51de450c41fab7dc466ce5 100644
@@ -609,7 +609,9 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
        bio->bi_iter = bio_src->bi_iter;
        bio->bi_io_vec = bio_src->bi_io_vec;
 
-       bio_clone_blkcg_association(bio, bio_src);
+       bio_clone_blkg_association(bio, bio_src);
+
+       blkcg_bio_issue_init(bio);
 }
 EXPORT_SYMBOL(__bio_clone_fast);
 
@@ -729,7 +731,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page
        }
 
        /* If we may be able to merge these biovecs, force a recount */
-       if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+       if (bio->bi_vcnt > 1 && biovec_phys_mergeable(q, bvec - 1, bvec))
                bio_clear_flag(bio, BIO_SEG_VALID);
 
  done:
@@ -827,6 +829,8 @@ int bio_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+#define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
+
 /**
  * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
  * @bio: bio to add pages to
@@ -839,38 +843,35 @@ EXPORT_SYMBOL(bio_add_page);
  */
 static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
-       unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt, idx;
+       unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+       unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
        struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
        struct page **pages = (struct page **)bv;
+       ssize_t size, left;
+       unsigned len, i;
        size_t offset;
-       ssize_t size;
+
+       /*
+        * Move page array up in the allocated memory for the bio vecs as far as
+        * possible so that we can start filling biovecs from the beginning
+        * without overwriting the temporary page array.
+        */
+       BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+       pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
        size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
        if (unlikely(size <= 0))
                return size ? size : -EFAULT;
-       idx = nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
 
-       /*
-        * Deep magic below:  We need to walk the pinned pages backwards
-        * because we are abusing the space allocated for the bio_vecs
-        * for the page array.  Because the bio_vecs are larger than the
-        * page pointers by definition this will always work.  But it also
-        * means we can't use bio_add_page, so any changes to it's semantics
-        * need to be reflected here as well.
-        */
-       bio->bi_iter.bi_size += size;
-       bio->bi_vcnt += nr_pages;
+       for (left = size, i = 0; left > 0; left -= len, i++) {
+               struct page *page = pages[i];
 
-       while (idx--) {
-               bv[idx].bv_page = pages[idx];
-               bv[idx].bv_len = PAGE_SIZE;
-               bv[idx].bv_offset = 0;
+               len = min_t(size_t, PAGE_SIZE - offset, left);
+               if (WARN_ON_ONCE(bio_add_page(bio, page, len, offset) != len))
+                       return -EINVAL;
+               offset = 0;
        }
 
-       bv[0].bv_offset += offset;
-       bv[0].bv_len -= offset;
-       bv[nr_pages - 1].bv_len -= nr_pages * PAGE_SIZE - offset - size;
-
        iov_iter_advance(iter, size);
        return 0;
 }
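
The subtlety in the rewrite above is pure pointer arithmetic: the free tail
of the bvec array doubles as scratch space for page pointers.  A standalone
check of the sizes involved (plain C, mirroring a 64-bit layout; the struct
shape is illustrative):

#include <stdio.h>

struct page;
struct bio_vec {
        struct page     *bv_page;
        unsigned int    bv_len;
        unsigned int    bv_offset;
};

#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))

int main(void)
{
        unsigned short entries_left = 4;        /* free bvec slots, say */

        /*
         * On 64-bit: 16-byte bvecs and 8-byte pointers give 2 pointers
         * per bvec, so the page array is parked 4 pointers in, and bvec
         * slots 0..3 can be filled without overtaking the unread pages
         * (each page is read before the corresponding bvec is written).
         */
        printf("ptrs per bvec: %zu, page array offset: %zu ptrs\n",
               PAGE_PTRS_PER_BVEC,
               entries_left * (PAGE_PTRS_PER_BVEC - 1));
        return 0;
}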
@@ -1807,7 +1808,6 @@ struct bio *bio_split(struct bio *bio, int sectors,
                bio_integrity_trim(split);
 
        bio_advance(bio, split->bi_iter.bi_size);
-       bio->bi_iter.bi_done = 0;
 
        if (bio_flagged(bio, BIO_TRACE_COMPLETION))
                bio_set_flag(split, BIO_TRACE_COMPLETION);
@@ -1956,69 +1956,151 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ * @blkg: the blkg to associate
+ *
+ * This tries to associate @bio with the specified blkg.  Association failure
+ * is handled by walking up the blkg tree.  Therefore, the blkg associated can
+ * be anything between @blkg and the root_blkg.  This situation only happens
+ * when a cgroup is dying; the remaining bios then spill to the closest
+ * alive blkg.
+ *
+ * A reference will be taken on the @blkg and will be released when @bio is
+ * freed.
+ */
+int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+{
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       bio->bi_blkg = blkg_tryget_closest(blkg);
+       return 0;
+}
+
+/**
+ * __bio_associate_blkg_from_css - internal blkg association function
+ *
+ * This is the core association function that all association paths rely on.
+ * A blkg reference is taken which is released upon freeing of the bio.
+ */
+static int __bio_associate_blkg_from_css(struct bio *bio,
+                                        struct cgroup_subsys_state *css)
+{
+       struct request_queue *q = bio->bi_disk->queue;
+       struct blkcg_gq *blkg;
+       int ret;
+
+       rcu_read_lock();
+
+       if (!css || !css->parent)
+               blkg = q->root_blkg;
+       else
+               blkg = blkg_lookup_create(css_to_blkcg(css), q);
+
+       ret = bio_associate_blkg(bio, blkg);
+
+       rcu_read_unlock();
+       return ret;
+}
+
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio.  This falls back to the queue's root_blkg if
+ * the association fails with the css.
+ */
+int bio_associate_blkg_from_css(struct bio *bio,
+                               struct cgroup_subsys_state *css)
+{
+       if (unlikely(bio->bi_blkg))
+               return -EBUSY;
+       return __bio_associate_blkg_from_css(bio, css);
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
 #ifdef CONFIG_MEMCG
 /**
- * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * bio_associate_blkg_from_page - associate a bio with the page's blkg
  * @bio: target bio
  * @page: the page to lookup the blkcg from
  *
- * Associate @bio with the blkcg from @page's owning memcg.  This works like
- * every other associate function wrt references.
+ * Associate @bio with the blkg from @page's owning memcg and the respective
+ * request_queue.  If cgroup_e_css returns NULL, fall back to the queue's
+ * root_blkg.
+ *
+ * Note: this must be called after bio has an associated device.
  */
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+int bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 {
-       struct cgroup_subsys_state *blkcg_css;
+       struct cgroup_subsys_state *css;
+       int ret;
 
-       if (unlikely(bio->bi_css))
+       if (unlikely(bio->bi_blkg))
                return -EBUSY;
        if (!page->mem_cgroup)
                return 0;
-       blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
-                                    &io_cgrp_subsys);
-       bio->bi_css = blkcg_css;
-       return 0;
+
+       rcu_read_lock();
+
+       css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+
+       ret = __bio_associate_blkg_from_css(bio, css);
+
+       rcu_read_unlock();
+       return ret;
 }
 #endif /* CONFIG_MEMCG */
 
 /**
- * bio_associate_blkcg - associate a bio with the specified blkcg
+ * bio_associate_create_blkg - associate a bio with a blkg from q
+ * @q: request_queue where bio is going
  * @bio: target bio
- * @blkcg_css: css of the blkcg to associate
- *
- * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
- * treat @bio as if it were issued by a task which belongs to the blkcg.
  *
- * This function takes an extra reference of @blkcg_css which will be put
- * when @bio is released.  The caller must own @bio and is responsible for
- * synchronizing calls to this function.
+ * Associate @bio with the blkg found from the bio's css and the request_queue.
+ * If one is not found, blkg_lookup_create() creates the blkg.  This falls back
+ * to the queue's root_blkg if the association fails.
  */
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+int bio_associate_create_blkg(struct request_queue *q, struct bio *bio)
 {
-       if (unlikely(bio->bi_css))
-               return -EBUSY;
-       css_get(blkcg_css);
-       bio->bi_css = blkcg_css;
-       return 0;
+       struct cgroup_subsys_state *css;
+       int ret = 0;
+
+       /* someone has already associated this bio with a blkg */
+       if (bio->bi_blkg)
+               return ret;
+
+       rcu_read_lock();
+
+       css = blkcg_css();
+
+       ret = __bio_associate_blkg_from_css(bio, css);
+
+       rcu_read_unlock();
+       return ret;
 }
-EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_blkg - associate a bio with the specified blkg
+ * bio_reassociate_blkg - reassociate a bio with a blkg from q
+ * @q: request_queue where bio is going
  * @bio: target bio
- * @blkg: the blkg to associate
  *
- * Associate @bio with the blkg specified by @blkg.  This is the queue specific
- * blkcg information associated with the @bio, a reference will be taken on the
- * @blkg and will be freed when the bio is freed.
+ * When submitting a bio, multiple recursive calls to make_request() may occur.
+ * This can leave the initial association done in blkcg_bio_issue_check()
+ * incorrect, referencing the prior request_queue.  This function performs the
+ * reassociation when that happens.
  */
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+int bio_reassociate_blkg(struct request_queue *q, struct bio *bio)
 {
-       if (unlikely(bio->bi_blkg))
-               return -EBUSY;
-       if (!blkg_try_get(blkg))
-               return -ENODEV;
-       bio->bi_blkg = blkg;
-       return 0;
+       if (bio->bi_blkg) {
+               blkg_put(bio->bi_blkg);
+               bio->bi_blkg = NULL;
+       }
+
+       return bio_associate_create_blkg(q, bio);
 }
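
The call site for this helper appears in the generic_make_request() hunk
further below, immediately after the target queue is re-resolved for a
recursively submitted bio.
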
 
 /**
@@ -2031,10 +2113,6 @@ void bio_disassociate_task(struct bio *bio)
                put_io_context(bio->bi_ioc);
                bio->bi_ioc = NULL;
        }
-       if (bio->bi_css) {
-               css_put(bio->bi_css);
-               bio->bi_css = NULL;
-       }
        if (bio->bi_blkg) {
                blkg_put(bio->bi_blkg);
                bio->bi_blkg = NULL;
@@ -2042,16 +2120,16 @@ void bio_disassociate_task(struct bio *bio)
 }
 
 /**
- * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
  * @dst: destination bio
  * @src: source bio
  */
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
 {
-       if (src->bi_css)
-               WARN_ON(bio_associate_blkcg(dst, src->bi_css));
+       if (src->bi_blkg)
+               bio_associate_blkg(dst, src->bi_blkg);
 }
-EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
 #endif /* CONFIG_BLK_CGROUP */
 
 static void __init biovec_init_slabs(void)
index c630e02836a80d7d406778208c659aebda8fcf06..992da5592c6ed14208116b794975c75c2b3986a1 100644 (file)
@@ -84,6 +84,37 @@ static void blkg_free(struct blkcg_gq *blkg)
        kfree(blkg);
 }
 
+static void __blkg_release(struct rcu_head *rcu)
+{
+       struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+
+       percpu_ref_exit(&blkg->refcnt);
+
+       /* release the blkcg and parent blkg refs this blkg has been holding */
+       css_put(&blkg->blkcg->css);
+       if (blkg->parent)
+               blkg_put(blkg->parent);
+
+       wb_congested_put(blkg->wb_congested);
+
+       blkg_free(blkg);
+}
+
+/*
+ * A group is RCU protected, but holding the rcu lock alone does not mean
+ * that every field of a blkg can be accessed safely.  For example, don't
+ * try to follow throtl_data and request queue links.
+ *
+ * Holding a reference to a blkg under rcu only allows access to values
+ * local to the group, such as group stats and group rate limits.
+ */
+static void blkg_release(struct percpu_ref *ref)
+{
+       struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
+
+       call_rcu(&blkg->rcu_head, __blkg_release);
+}
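
This is the standard percpu_ref lifecycle; a minimal generic sketch of the
pattern with a toy object (illustrative names only):

	struct obj {
		struct percpu_ref ref;
		struct rcu_head rcu;
	};

	static void obj_free_rcu(struct rcu_head *rcu)
	{
		struct obj *obj = container_of(rcu, struct obj, rcu);

		percpu_ref_exit(&obj->ref);	/* undo percpu_ref_init() */
		kfree(obj);
	}

	static void obj_release(struct percpu_ref *ref)	/* count hit zero */
	{
		struct obj *obj = container_of(ref, struct obj, ref);

		call_rcu(&obj->rcu, obj_free_rcu);	/* defer the free */
	}

	/* creation: percpu_ref_init(&obj->ref, obj_release, 0, GFP_KERNEL);
	 * teardown: percpu_ref_kill(&obj->ref) drops the initial reference
	 * and switches to atomic mode; obj_release() runs once it hits zero.
	 */
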
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -110,7 +141,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
-       atomic_set(&blkg->refcnt, 1);
 
        /* root blkg uses @q->root_rl, init rl only for !root blkgs */
        if (blkcg != &blkcg_root) {
@@ -217,6 +247,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                blkg_get(blkg->parent);
        }
 
+       ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
+                             GFP_NOWAIT | __GFP_NOWARN);
+       if (ret)
+               goto err_cancel_ref;
+
        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
@@ -249,6 +284,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        blkg_put(blkg);
        return ERR_PTR(ret);
 
+err_cancel_ref:
+       percpu_ref_exit(&blkg->refcnt);
 err_put_congested:
        wb_congested_put(wb_congested);
 err_put_css:
@@ -259,7 +296,7 @@ err_free_blkg:
 }
 
 /**
- * blkg_lookup_create - lookup blkg, try to create one if not there
+ * __blkg_lookup_create - lookup blkg, try to create one if not there
  * @blkcg: blkcg of interest
  * @q: request_queue of interest
  *
@@ -268,12 +305,11 @@ err_free_blkg:
  * that all non-root blkg's have access to the parent blkg.  This function
  * should be called under RCU read lock and @q->queue_lock.
  *
- * Returns pointer to the looked up or created blkg on success, ERR_PTR()
- * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
- * dead and bypassing, returns ERR_PTR(-EBUSY).
+ * Returns the blkg, or the closest ancestor blkg if blkg_create() fails
+ * while walking down from the root.
  */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-                                   struct request_queue *q)
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+                                     struct request_queue *q)
 {
        struct blkcg_gq *blkg;
 
@@ -285,7 +321,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+               return q->root_blkg;
 
        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
@@ -293,23 +329,58 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 
        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
-        * non-root blkgs have access to their parents.
+        * non-root blkgs have access to their parents.  Returns the closest
+        * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
-
-               while (parent && !__blkg_lookup(parent, q, false)) {
+               struct blkcg_gq *ret_blkg = q->root_blkg;
+
+               while (parent) {
+                       blkg = __blkg_lookup(parent, q, false);
+                       if (blkg) {
+                               /* remember closest blkg */
+                               ret_blkg = blkg;
+                               break;
+                       }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }
 
                blkg = blkg_create(pos, q, NULL);
-               if (pos == blkcg || IS_ERR(blkg))
+               if (IS_ERR(blkg))
+                       return ret_blkg;
+               if (pos == blkcg)
                        return blkg;
        }
 }
 
+/**
+ * blkg_lookup_create - find or create a blkg
+ * @blkcg: target block cgroup
+ * @q: target request_queue
+ *
+ * This looks up or creates the blkg representing the unique pair
+ * of the blkcg and the request_queue.
+ */
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+                                   struct request_queue *q)
+{
+       struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
+       unsigned long flags;
+
+       if (unlikely(!blkg)) {
+               spin_lock_irqsave(q->queue_lock, flags);
+
+               blkg = __blkg_lookup_create(blkcg, q);
+
+               spin_unlock_irqrestore(q->queue_lock, flags);
+       }
+
+       return blkg;
+}
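
Callers no longer need to handle NULL or ERR_PTR() returns; a sketch,
mirroring the rcu usage in __bio_associate_blkg_from_css() above:

	rcu_read_lock();
	blkg = blkg_lookup_create(css_to_blkcg(css), q);
	/* always usable: failure falls back toward q->root_blkg */
	rcu_read_unlock();
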
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
        struct blkcg *blkcg = blkg->blkcg;
@@ -353,7 +424,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
-       blkg_put(blkg);
+       percpu_ref_kill(&blkg->refcnt);
 }
 
 /**
@@ -380,29 +451,6 @@ static void blkg_destroy_all(struct request_queue *q)
        q->root_rl.blkg = NULL;
 }
 
-/*
- * A group is RCU protected, but having an rcu lock does not mean that one
- * can access all the fields of blkg and assume these are valid.  For
- * example, don't try to follow throtl_data and request queue links.
- *
- * Having a reference to blkg under an rcu allows accesses to only values
- * local to groups like group stats and group rate limits.
- */
-void __blkg_release_rcu(struct rcu_head *rcu_head)
-{
-       struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-
-       /* release the blkcg and parent blkg refs this blkg has been holding */
-       css_put(&blkg->blkcg->css);
-       if (blkg->parent)
-               blkg_put(blkg->parent);
-
-       wb_congested_put(blkg->wb_congested);
-
-       blkg_free(blkg);
-}
-EXPORT_SYMBOL_GPL(__blkg_release_rcu);
-
 /*
  * The next function used by blk_queue_for_each_rl().  It's a bit tricky
  * because the root blkg uses @q->root_rl instead of its own rl.
@@ -1748,8 +1796,7 @@ void blkcg_maybe_throttle_current(void)
        blkg = blkg_lookup(blkcg, q);
        if (!blkg)
                goto out;
-       blkg = blkg_try_get(blkg);
-       if (!blkg)
+       if (!blkg_tryget(blkg))
                goto out;
        rcu_read_unlock();
 
index cff0a60ee20066c2fc2d7c4fb2da0bc9ea7c50da..3ed60723e2429d4902fb386b66cca59561151458 100644 (file)
@@ -42,6 +42,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-pm.h"
 #include "blk-rq-qos.h"
 
 #ifdef CONFIG_DEBUG_FS
@@ -421,24 +422,25 @@ void blk_sync_queue(struct request_queue *q)
 EXPORT_SYMBOL(blk_sync_queue);
 
 /**
- * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
+ * blk_set_pm_only - increment pm_only counter
  * @q: request queue pointer
- *
- * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
- * set and 1 if the flag was already set.
  */
-int blk_set_preempt_only(struct request_queue *q)
+void blk_set_pm_only(struct request_queue *q)
 {
-       return blk_queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+       atomic_inc(&q->pm_only);
 }
-EXPORT_SYMBOL_GPL(blk_set_preempt_only);
+EXPORT_SYMBOL_GPL(blk_set_pm_only);
 
-void blk_clear_preempt_only(struct request_queue *q)
+void blk_clear_pm_only(struct request_queue *q)
 {
-       blk_queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
-       wake_up_all(&q->mq_freeze_wq);
+       int pm_only;
+
+       pm_only = atomic_dec_return(&q->pm_only);
+       WARN_ON_ONCE(pm_only < 0);
+       if (pm_only == 0)
+               wake_up_all(&q->mq_freeze_wq);
 }
-EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
+EXPORT_SYMBOL_GPL(blk_clear_pm_only);
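
The counter is meant to bracket runtime-PM transitions; a sketch of the
expected pairing (the real logic lives in the new blk-pm.c, which is not part
of this hunk, and suspend_the_device() is a hypothetical driver callback):

	blk_set_pm_only(q);		/* gate out normal requests */
	ret = suspend_the_device();
	if (ret)
		blk_clear_pm_only(q);	/* wakes waiters in blk_queue_enter() */
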
 
 /**
  * __blk_run_queue_uncond - run a queue whether or not it has been stopped
@@ -917,7 +919,7 @@ EXPORT_SYMBOL(blk_alloc_queue);
  */
 int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 {
-       const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+       const bool pm = flags & BLK_MQ_REQ_PREEMPT;
 
        while (true) {
                bool success = false;
@@ -925,11 +927,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
                rcu_read_lock();
                if (percpu_ref_tryget_live(&q->q_usage_counter)) {
                        /*
-                        * The code that sets the PREEMPT_ONLY flag is
-                        * responsible for ensuring that that flag is globally
-                        * visible before the queue is unfrozen.
+                        * The code that increments the pm_only counter is
+                        * responsible for ensuring that the counter is
+                        * globally visible before the queue is unfrozen.
                         */
-                       if (preempt || !blk_queue_preempt_only(q)) {
+                       if (pm || !blk_queue_pm_only(q)) {
                                success = true;
                        } else {
                                percpu_ref_put(&q->q_usage_counter);
@@ -954,7 +956,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
 
                wait_event(q->mq_freeze_wq,
                           (atomic_read(&q->mq_freeze_depth) == 0 &&
-                           (preempt || !blk_queue_preempt_only(q))) ||
+                           (pm || (blk_pm_request_resume(q),
+                                   !blk_queue_pm_only(q)))) ||
                           blk_queue_dying(q));
                if (blk_queue_dying(q))
                        return -ENODEV;
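
Requests issued on behalf of power management bypass the pm_only gate by
passing BLK_MQ_REQ_PREEMPT; a sketch of the allocation pattern (the opcode and
the RQF_PM marking are illustrative, following the SCSI usage):

	rq = blk_get_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->rq_flags |= RQF_PM;	/* allowed through while suspended/resuming */
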
@@ -1051,8 +1054,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
        mutex_init(&q->sysfs_lock);
        spin_lock_init(&q->__queue_lock);
 
-       if (!q->mq_ops)
-               q->queue_lock = lock ? : &q->__queue_lock;
+       q->queue_lock = lock ? : &q->__queue_lock;
 
        /*
         * A queue starts its life with bypass turned on to avoid
@@ -1160,7 +1162,7 @@ int blk_init_allocated_queue(struct request_queue *q)
 {
        WARN_ON_ONCE(q->mq_ops);
 
-       q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
+       q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
        if (!q->fq)
                return -ENOMEM;
 
@@ -1726,16 +1728,6 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
 }
 EXPORT_SYMBOL_GPL(part_round_stats);
 
-#ifdef CONFIG_PM
-static void blk_pm_put_request(struct request *rq)
-{
-       if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending)
-               pm_runtime_mark_last_busy(rq->q->dev);
-}
-#else
-static inline void blk_pm_put_request(struct request *rq) {}
-#endif
-
 void __blk_put_request(struct request_queue *q, struct request *req)
 {
        req_flags_t rq_flags = req->rq_flags;
@@ -1752,6 +1744,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 
        blk_req_zone_write_unlock(req);
        blk_pm_put_request(req);
+       blk_pm_mark_last_busy(req);
 
        elv_completed_request(q, req);
 
@@ -2440,6 +2433,7 @@ blk_qc_t generic_make_request(struct bio *bio)
                        if (q)
                                blk_queue_exit(q);
                        q = bio->bi_disk->queue;
+                       bio_reassociate_blkg(q, bio);
                        flags = 0;
                        if (bio->bi_opf & REQ_NOWAIT)
                                flags = BLK_MQ_REQ_NOWAIT;
@@ -2750,30 +2744,6 @@ void blk_account_io_done(struct request *req, u64 now)
        }
 }
 
-#ifdef CONFIG_PM
-/*
- * Don't process normal requests when queue is suspended
- * or in the process of suspending/resuming
- */
-static bool blk_pm_allow_request(struct request *rq)
-{
-       switch (rq->q->rpm_status) {
-       case RPM_RESUMING:
-       case RPM_SUSPENDING:
-               return rq->rq_flags & RQF_PM;
-       case RPM_SUSPENDED:
-               return false;
-       default:
-               return true;
-       }
-}
-#else
-static bool blk_pm_allow_request(struct request *rq)
-{
-       return true;
-}
-#endif
-
 void blk_account_io_start(struct request *rq, bool new_io)
 {
        struct hd_struct *part;
@@ -2819,11 +2789,14 @@ static struct request *elv_next_request(struct request_queue *q)
 
        while (1) {
                list_for_each_entry(rq, &q->queue_head, queuelist) {
-                       if (blk_pm_allow_request(rq))
-                               return rq;
-
-                       if (rq->rq_flags & RQF_SOFTBARRIER)
-                               break;
+#ifdef CONFIG_PM
+                       /*
+                        * If a request gets queued in state RPM_SUSPENDED
+                        * then that's a kernel bug.
+                        */
+                       WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
+#endif
+                       return rq;
                }
 
                /*
@@ -3755,191 +3728,6 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
-#ifdef CONFIG_PM
-/**
- * blk_pm_runtime_init - Block layer runtime PM initialization routine
- * @q: the queue of the device
- * @dev: the device the queue belongs to
- *
- * Description:
- *    Initialize runtime-PM-related fields for @q and start auto suspend for
- *    @dev. Drivers that want to take advantage of request-based runtime PM
- *    should call this function after @dev has been initialized, and its
- *    request queue @q has been allocated, and runtime PM for it can not happen
- *    yet(either due to disabled/forbidden or its usage_count > 0). In most
- *    cases, driver should call this function before any I/O has taken place.
- *
- *    This function takes care of setting up using auto suspend for the device,
- *    the autosuspend delay is set to -1 to make runtime suspend impossible
- *    until an updated value is either set by user or by driver. Drivers do
- *    not need to touch other autosuspend settings.
- *
- *    The block layer runtime PM is request based, so only works for drivers
- *    that use request as their IO unit instead of those directly use bio's.
- */
-void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
-{
-       /* Don't enable runtime PM for blk-mq until it is ready */
-       if (q->mq_ops) {
-               pm_runtime_disable(dev);
-               return;
-       }
-
-       q->dev = dev;
-       q->rpm_status = RPM_ACTIVE;
-       pm_runtime_set_autosuspend_delay(q->dev, -1);
-       pm_runtime_use_autosuspend(q->dev);
-}
-EXPORT_SYMBOL(blk_pm_runtime_init);
-
-/**
- * blk_pre_runtime_suspend - Pre runtime suspend check
- * @q: the queue of the device
- *
- * Description:
- *    This function will check if runtime suspend is allowed for the device
- *    by examining if there are any requests pending in the queue. If there
- *    are requests pending, the device can not be runtime suspended; otherwise,
- *    the queue's status will be updated to SUSPENDING and the driver can
- *    proceed to suspend the device.
- *
- *    For the not allowed case, we mark last busy for the device so that
- *    runtime PM core will try to autosuspend it some time later.
- *
- *    This function should be called near the start of the device's
- *    runtime_suspend callback.
- *
- * Return:
- *    0                - OK to runtime suspend the device
- *    -EBUSY   - Device should not be runtime suspended
- */
-int blk_pre_runtime_suspend(struct request_queue *q)
-{
-       int ret = 0;
-
-       if (!q->dev)
-               return ret;
-
-       spin_lock_irq(q->queue_lock);
-       if (q->nr_pending) {
-               ret = -EBUSY;
-               pm_runtime_mark_last_busy(q->dev);
-       } else {
-               q->rpm_status = RPM_SUSPENDING;
-       }
-       spin_unlock_irq(q->queue_lock);
-       return ret;
-}
-EXPORT_SYMBOL(blk_pre_runtime_suspend);
-
-/**
- * blk_post_runtime_suspend - Post runtime suspend processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_suspend function
- *
- * Description:
- *    Update the queue's runtime status according to the return value of the
- *    device's runtime suspend function and mark last busy for the device so
- *    that PM core will try to auto suspend the device at a later time.
- *
- *    This function should be called near the end of the device's
- *    runtime_suspend callback.
- */
-void blk_post_runtime_suspend(struct request_queue *q, int err)
-{
-       if (!q->dev)
-               return;
-
-       spin_lock_irq(q->queue_lock);
-       if (!err) {
-               q->rpm_status = RPM_SUSPENDED;
-       } else {
-               q->rpm_status = RPM_ACTIVE;
-               pm_runtime_mark_last_busy(q->dev);
-       }
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_suspend);
-
-/**
- * blk_pre_runtime_resume - Pre runtime resume processing
- * @q: the queue of the device
- *
- * Description:
- *    Update the queue's runtime status to RESUMING in preparation for the
- *    runtime resume of the device.
- *
- *    This function should be called near the start of the device's
- *    runtime_resume callback.
- */
-void blk_pre_runtime_resume(struct request_queue *q)
-{
-       if (!q->dev)
-               return;
-
-       spin_lock_irq(q->queue_lock);
-       q->rpm_status = RPM_RESUMING;
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_pre_runtime_resume);
-
-/**
- * blk_post_runtime_resume - Post runtime resume processing
- * @q: the queue of the device
- * @err: return value of the device's runtime_resume function
- *
- * Description:
- *    Update the queue's runtime status according to the return value of the
- *    device's runtime_resume function. If it is successfully resumed, process
- *    the requests that are queued into the device's queue when it is resuming
- *    and then mark last busy and initiate autosuspend for it.
- *
- *    This function should be called near the end of the device's
- *    runtime_resume callback.
- */
-void blk_post_runtime_resume(struct request_queue *q, int err)
-{
-       if (!q->dev)
-               return;
-
-       spin_lock_irq(q->queue_lock);
-       if (!err) {
-               q->rpm_status = RPM_ACTIVE;
-               __blk_run_queue(q);
-               pm_runtime_mark_last_busy(q->dev);
-               pm_request_autosuspend(q->dev);
-       } else {
-               q->rpm_status = RPM_SUSPENDED;
-       }
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
-{
-       spin_lock_irq(q->queue_lock);
-       q->rpm_status = RPM_ACTIVE;
-       pm_runtime_mark_last_busy(q->dev);
-       pm_request_autosuspend(q->dev);
-       spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL(blk_set_runtime_active);
-#endif
-
 int __init blk_dev_init(void)
 {
        BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
index ce41f666de3e1d068e78698349c1629d7e118bc8..8b44b86779daa83e96cf9408532933f75d23ea51 100644 (file)
@@ -566,12 +566,12 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 EXPORT_SYMBOL(blkdev_issue_flush);
 
 struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
-               int node, int cmd_size)
+               int node, int cmd_size, gfp_t flags)
 {
        struct blk_flush_queue *fq;
        int rq_sz = sizeof(struct request);
 
-       fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
+       fq = kzalloc_node(sizeof(*fq), flags, node);
        if (!fq)
                goto fail;
 
@@ -579,7 +579,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
                spin_lock_init(&fq->mq_flush_lock);
 
        rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
-       fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
+       fq->flush_rq = kzalloc_node(rq_sz, flags, node);
        if (!fq->flush_rq)
                goto fail_rq;
 
index 6121611e1316420372ce510300321532e9c76ef0..d1ab089e09191ba1841e37211c95a68ef21b4bc9 100644 (file)
@@ -49,12 +49,8 @@ int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
        bio_for_each_integrity_vec(iv, bio, iter) {
 
                if (prev) {
-                       if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+                       if (!biovec_phys_mergeable(q, &ivprv, &iv))
                                goto new_segment;
-
-                       if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
-                               goto new_segment;
-
                        if (seg_size + iv.bv_len > queue_max_segment_size(q))
                                goto new_segment;
 
@@ -95,12 +91,8 @@ int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
        bio_for_each_integrity_vec(iv, bio, iter) {
 
                if (prev) {
-                       if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
+                       if (!biovec_phys_mergeable(q, &ivprv, &iv))
                                goto new_segment;
-
-                       if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
-                               goto new_segment;
-
                        if (sg->length + iv.bv_len > queue_max_segment_size(q))
                                goto new_segment;
 
index 19923f8a029ddf199e01a7571914e8217f70b308..35c48d7b8f78e1c9594533d113ea72aafa80ff14 100644 (file)
@@ -115,9 +115,22 @@ struct child_latency_info {
        atomic_t scale_cookie;
 };
 
+struct percentile_stats {
+       u64 total;
+       u64 missed;
+};
+
+struct latency_stat {
+       union {
+               struct percentile_stats ps;
+               struct blk_rq_stat rqs;
+       };
+};
+
 struct iolatency_grp {
        struct blkg_policy_data pd;
-       struct blk_rq_stat __percpu *stats;
+       struct latency_stat __percpu *stats;
+       struct latency_stat cur_stat;
        struct blk_iolatency *blkiolat;
        struct rq_depth rq_depth;
        struct rq_wait rq_wait;
@@ -132,6 +145,7 @@ struct iolatency_grp {
        /* Our current number of IO's for the last summation. */
        u64 nr_samples;
 
+       bool ssd;
        struct child_latency_info child_lat;
 };
 
@@ -172,6 +186,80 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
        return pd_to_blkg(&iolat->pd);
 }
 
+static inline void latency_stat_init(struct iolatency_grp *iolat,
+                                    struct latency_stat *stat)
+{
+       if (iolat->ssd) {
+               stat->ps.total = 0;
+               stat->ps.missed = 0;
+       } else
+               blk_rq_stat_init(&stat->rqs);
+}
+
+static inline void latency_stat_sum(struct iolatency_grp *iolat,
+                                   struct latency_stat *sum,
+                                   struct latency_stat *stat)
+{
+       if (iolat->ssd) {
+               sum->ps.total += stat->ps.total;
+               sum->ps.missed += stat->ps.missed;
+       } else
+               blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_record_time(struct iolatency_grp *iolat,
+                                           u64 req_time)
+{
+       struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+
+       if (iolat->ssd) {
+               if (req_time >= iolat->min_lat_nsec)
+                       stat->ps.missed++;
+               stat->ps.total++;
+       } else
+               blk_rq_stat_add(&stat->rqs, req_time);
+       put_cpu_ptr(stat);
+}
+
+static inline bool latency_sum_ok(struct iolatency_grp *iolat,
+                                 struct latency_stat *stat)
+{
+       if (iolat->ssd) {
+               u64 thresh = div64_u64(stat->ps.total, 10);
+               thresh = max(thresh, 1ULL);
+               return stat->ps.missed < thresh;
+       }
+       return stat->rqs.mean <= iolat->min_lat_nsec;
+}
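
Worked example for the SSD branch: with stat->ps.total == 1000 in a window,
thresh becomes 100, so the group is considered to be meeting its target as
long as fewer than 100 requests (under 10%) exceeded min_lat_nsec.  The max()
with 1 keeps very small windows (total < 10) from producing a zero threshold
that no window could ever satisfy.
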
+
+static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
+                                      struct latency_stat *stat)
+{
+       if (iolat->ssd)
+               return stat->ps.total;
+       return stat->rqs.nr_samples;
+}
+
+static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
+                                             struct latency_stat *stat)
+{
+       int exp_idx;
+
+       if (iolat->ssd)
+               return;
+
+       /*
+        * CALC_LOAD takes in a number stored in fixed point representation.
+        * Because we are using this for IO time in ns, the values stored
+        * are significantly larger than the FIXED_1 denominator (2048).
+        * Therefore, rounding errors in the calculation are negligible and
+        * can be ignored.
+        */
+       exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+                       div64_u64(iolat->cur_win_nsec,
+                                 BLKIOLATENCY_EXP_BUCKET_SIZE));
+       CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean);
+}
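
For reference, CALC_LOAD(load, exp, n) is the scheduler's fixed-point
moving-average step, equivalent to load = (load * exp + n * (FIXED_1 - exp))
>> FSHIFT with FIXED_1 = 2048; exp_idx selects a decay factor appropriate to
the length of the current window.
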
+
 static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
                                       wait_queue_entry_t *wait,
                                       bool first_block)
@@ -255,7 +343,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
                                struct child_latency_info *lat_info,
                                bool up)
 {
-       unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+       unsigned long qd = blkiolat->rqos.q->nr_requests;
        unsigned long scale = scale_amount(qd, up);
        unsigned long old = atomic_read(&lat_info->scale_cookie);
        unsigned long max_scale = qd << 1;
@@ -295,10 +383,9 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
  */
 static void scale_change(struct iolatency_grp *iolat, bool up)
 {
-       unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+       unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
        unsigned long scale = scale_amount(qd, up);
        unsigned long old = iolat->rq_depth.max_depth;
-       bool changed = false;
 
        if (old > qd)
                old = qd;
@@ -308,15 +395,13 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
                        return;
 
                if (old < qd) {
-                       changed = true;
                        old += scale;
                        old = min(old, qd);
                        iolat->rq_depth.max_depth = old;
                        wake_up_all(&iolat->rq_wait.wait);
                }
-       } else if (old > 1) {
+       } else {
                old >>= 1;
-               changed = true;
                iolat->rq_depth.max_depth = max(old, 1UL);
        }
 }
@@ -369,7 +454,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
                 * scale down event.
                 */
                samples_thresh = lat_info->nr_samples * 5;
-               samples_thresh = div64_u64(samples_thresh, 100);
+               samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
                if (iolat->nr_samples <= samples_thresh)
                        return;
        }
@@ -395,34 +480,12 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
                                     spinlock_t *lock)
 {
        struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-       struct blkcg *blkcg;
-       struct blkcg_gq *blkg;
-       struct request_queue *q = rqos->q;
+       struct blkcg_gq *blkg = bio->bi_blkg;
        bool issue_as_root = bio_issue_as_root_blkg(bio);
 
        if (!blk_iolatency_enabled(blkiolat))
                return;
 
-       rcu_read_lock();
-       blkcg = bio_blkcg(bio);
-       bio_associate_blkcg(bio, &blkcg->css);
-       blkg = blkg_lookup(blkcg, q);
-       if (unlikely(!blkg)) {
-               if (!lock)
-                       spin_lock_irq(q->queue_lock);
-               blkg = blkg_lookup_create(blkcg, q);
-               if (IS_ERR(blkg))
-                       blkg = NULL;
-               if (!lock)
-                       spin_unlock_irq(q->queue_lock);
-       }
-       if (!blkg)
-               goto out;
-
-       bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-       bio_associate_blkg(bio, blkg);
-out:
-       rcu_read_unlock();
        while (blkg && blkg->parent) {
                struct iolatency_grp *iolat = blkg_to_lat(blkg);
                if (!iolat) {
@@ -443,7 +506,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
                                  struct bio_issue *issue, u64 now,
                                  bool issue_as_root)
 {
-       struct blk_rq_stat *rq_stat;
        u64 start = bio_issue_time(issue);
        u64 req_time;
 
@@ -469,9 +531,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
                return;
        }
 
-       rq_stat = get_cpu_ptr(iolat->stats);
-       blk_rq_stat_add(rq_stat, req_time);
-       put_cpu_ptr(rq_stat);
+       latency_stat_record_time(iolat, req_time);
 }
 
 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
@@ -482,17 +542,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
        struct blkcg_gq *blkg = lat_to_blkg(iolat);
        struct iolatency_grp *parent;
        struct child_latency_info *lat_info;
-       struct blk_rq_stat stat;
+       struct latency_stat stat;
        unsigned long flags;
-       int cpu, exp_idx;
+       int cpu;
 
-       blk_rq_stat_init(&stat);
+       latency_stat_init(iolat, &stat);
        preempt_disable();
        for_each_online_cpu(cpu) {
-               struct blk_rq_stat *s;
+               struct latency_stat *s;
                s = per_cpu_ptr(iolat->stats, cpu);
-               blk_rq_stat_sum(&stat, s);
-               blk_rq_stat_init(s);
+               latency_stat_sum(iolat, &stat, s);
+               latency_stat_init(iolat, s);
        }
        preempt_enable();
 
@@ -502,41 +562,36 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 
        lat_info = &parent->child_lat;
 
-       /*
-        * CALC_LOAD takes in a number stored in fixed point representation.
-        * Because we are using this for IO time in ns, the values stored
-        * are significantly larger than the FIXED_1 denominator (2048).
-        * Therefore, rounding errors in the calculation are negligible and
-        * can be ignored.
-        */
-       exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
-                       div64_u64(iolat->cur_win_nsec,
-                                 BLKIOLATENCY_EXP_BUCKET_SIZE));
-       CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+       iolat_update_total_lat_avg(iolat, &stat);
 
        /* Everything is ok and we don't need to adjust the scale. */
-       if (stat.mean <= iolat->min_lat_nsec &&
+       if (latency_sum_ok(iolat, &stat) &&
            atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
                return;
 
        /* Somebody beat us to the punch, just bail. */
        spin_lock_irqsave(&lat_info->lock, flags);
+
+       latency_stat_sum(iolat, &iolat->cur_stat, &stat);
        lat_info->nr_samples -= iolat->nr_samples;
-       lat_info->nr_samples += stat.nr_samples;
-       iolat->nr_samples = stat.nr_samples;
+       lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
+       iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
 
        if ((lat_info->last_scale_event >= now ||
-           now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
-           lat_info->scale_lat <= iolat->min_lat_nsec)
+           now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
                goto out;
 
-       if (stat.mean <= iolat->min_lat_nsec &&
-           stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+       if (latency_sum_ok(iolat, &iolat->cur_stat) &&
+           latency_sum_ok(iolat, &stat)) {
+               if (latency_stat_samples(iolat, &iolat->cur_stat) <
+                   BLKIOLATENCY_MIN_GOOD_SAMPLES)
+                       goto out;
                if (lat_info->scale_grp == iolat) {
                        lat_info->last_scale_event = now;
                        scale_cookie_change(iolat->blkiolat, lat_info, true);
                }
-       } else if (stat.mean > iolat->min_lat_nsec) {
+       } else if (lat_info->scale_lat == 0 ||
+                  lat_info->scale_lat >= iolat->min_lat_nsec) {
                lat_info->last_scale_event = now;
                if (!lat_info->scale_grp ||
                    lat_info->scale_lat > iolat->min_lat_nsec) {
@@ -545,6 +600,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
                }
                scale_cookie_change(iolat->blkiolat, lat_info, false);
        }
+       latency_stat_init(iolat, &iolat->cur_stat);
 out:
        spin_unlock_irqrestore(&lat_info->lock, flags);
 }
@@ -650,7 +706,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
                 * We could be exiting, don't access the pd unless we have a
                 * ref on the blkg.
                 */
-               if (!blkg_try_get(blkg))
+               if (!blkg_tryget(blkg))
                        continue;
 
                iolat = blkg_to_lat(blkg);
@@ -761,7 +817,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 {
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkcg_gq *blkg;
-       struct blk_iolatency *blkiolat;
        struct blkg_conf_ctx ctx;
        struct iolatency_grp *iolat;
        char *p, *tok;
@@ -774,7 +829,6 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
                return ret;
 
        iolat = blkg_to_lat(ctx.blkg);
-       blkiolat = iolat->blkiolat;
        p = ctx.body;
 
        ret = -EINVAL;
@@ -835,13 +889,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
        return 0;
 }
 
+static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
+                                size_t size)
+{
+       struct latency_stat stat;
+       int cpu;
+
+       latency_stat_init(iolat, &stat);
+       preempt_disable();
+       for_each_online_cpu(cpu) {
+               struct latency_stat *s;
+               s = per_cpu_ptr(iolat->stats, cpu);
+               latency_stat_sum(iolat, &stat, s);
+       }
+       preempt_enable();
+
+       if (iolat->rq_depth.max_depth == UINT_MAX)
+               return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
+                                (unsigned long long)stat.ps.missed,
+                                (unsigned long long)stat.ps.total);
+       return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
+                        (unsigned long long)stat.ps.missed,
+                        (unsigned long long)stat.ps.total,
+                        iolat->rq_depth.max_depth);
+}
+
 static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
                                size_t size)
 {
        struct iolatency_grp *iolat = pd_to_lat(pd);
-       unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
-       unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+       unsigned long long avg_lat;
+       unsigned long long cur_win;
+
+       if (iolat->ssd)
+               return iolatency_ssd_stat(iolat, buf, size);
 
+       avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+       cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
        if (iolat->rq_depth.max_depth == UINT_MAX)
                return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
                                 avg_lat, cur_win);
@@ -858,8 +942,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
        iolat = kzalloc_node(sizeof(*iolat), gfp, node);
        if (!iolat)
                return NULL;
-       iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
-                                      __alignof__(struct blk_rq_stat), gfp);
+       iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
+                                      __alignof__(struct latency_stat), gfp);
        if (!iolat->stats) {
                kfree(iolat);
                return NULL;
@@ -876,15 +960,21 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
 
+       if (blk_queue_nonrot(blkg->q))
+               iolat->ssd = true;
+       else
+               iolat->ssd = false;
+
        for_each_possible_cpu(cpu) {
-               struct blk_rq_stat *stat;
+               struct latency_stat *stat;
                stat = per_cpu_ptr(iolat->stats, cpu);
-               blk_rq_stat_init(stat);
+               latency_stat_init(iolat, stat);
        }
 
+       latency_stat_init(iolat, &iolat->cur_stat);
        rq_wait_init(&iolat->rq_wait);
        spin_lock_init(&iolat->child_lat.lock);
-       iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+       iolat->rq_depth.queue_depth = blkg->q->nr_requests;
        iolat->rq_depth.max_depth = UINT_MAX;
        iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
        iolat->blkiolat = blkiolat;
index aaec38cc37b86489cdfed9ff8f4202f72516ede0..42a46744c11b45e4970bbe8a918fcf8b29d895d8 100644 (file)
 
 #include "blk.h"
 
+/*
+ * Check if the two bvecs from two bios can be merged into one segment.  If
+ * yes, there is no need to check the gap between the two bios since the last
+ * bvec of the 1st bio and the 1st bvec of the 2nd bio can be handled in one
+ * segment.
+ */
+static inline bool bios_segs_mergeable(struct request_queue *q,
+               struct bio *prev, struct bio_vec *prev_last_bv,
+               struct bio_vec *next_first_bv)
+{
+       if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv))
+               return false;
+       if (prev->bi_seg_back_size + next_first_bv->bv_len >
+                       queue_max_segment_size(q))
+               return false;
+       return true;
+}
+
+static inline bool bio_will_gap(struct request_queue *q,
+               struct request *prev_rq, struct bio *prev, struct bio *next)
+{
+       struct bio_vec pb, nb;
+
+       if (!bio_has_data(prev) || !queue_virt_boundary(q))
+               return false;
+
+       /*
+        * Don't merge if the 1st bio starts with a non-zero offset, otherwise
+        * it is quite difficult to respect the sg gap limit.  Merging matters
+        * here because workloads such as mkfs submit huge numbers of small
+        * single-bvec bios.
+        */
+       if (prev_rq)
+               bio_get_first_bvec(prev_rq->bio, &pb);
+       else
+               bio_get_first_bvec(prev, &pb);
+       if (pb.bv_offset)
+               return true;
+
+       /*
+        * We don't need to worry about the merged segment ending at an
+        * unaligned virt boundary:
+        *
+        * - if 'pb' ends aligned, the merged segment ends aligned
+        * - if 'pb' ends unaligned, the next bio must include
+        *   one single bvec of 'nb', otherwise 'nb' can't
+        *   merge with 'pb'
+        */
+       bio_get_last_bvec(prev, &pb);
+       bio_get_first_bvec(next, &nb);
+       if (bios_segs_mergeable(q, prev, &pb, &nb))
+               return false;
+       return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+}
+
+static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
+{
+       return bio_will_gap(req->q, req, req->biotail, bio);
+}
+
+static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
+{
+       return bio_will_gap(req->q, NULL, bio, req->bio);
+}
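
Both helpers bottom out in __bvec_gap_to_prev(); its definition is not part of
this diff, but it is roughly:

	static inline bool __bvec_gap_to_prev(struct request_queue *q,
			struct bio_vec *bprv, unsigned int offset)
	{
		/* gap unless the new data starts at offset 0 and the previous
		 * bvec ends aligned to the device's virt boundary mask */
		return offset ||
			((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
	}
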
+
 static struct bio *blk_bio_discard_split(struct request_queue *q,
                                         struct bio *bio,
                                         struct bio_set *bs,
@@ -134,9 +197,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
                if (bvprvp && blk_queue_cluster(q)) {
                        if (seg_size + bv.bv_len > queue_max_segment_size(q))
                                goto new_segment;
-                       if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv))
-                               goto new_segment;
-                       if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv))
+                       if (!biovec_phys_mergeable(q, bvprvp, &bv))
                                goto new_segment;
 
                        seg_size += bv.bv_len;
@@ -267,9 +328,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
                                if (seg_size + bv.bv_len
                                    > queue_max_segment_size(q))
                                        goto new_segment;
-                               if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
-                                       goto new_segment;
-                               if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
+                               if (!biovec_phys_mergeable(q, &bvprv, &bv))
                                        goto new_segment;
 
                                seg_size += bv.bv_len;
@@ -349,17 +408,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
        bio_get_last_bvec(bio, &end_bv);
        bio_get_first_bvec(nxt, &nxt_bv);
 
-       if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
-               return 0;
-
-       /*
-        * bio and nxt are contiguous in memory; check if the queue allows
-        * these two to be merged into one
-        */
-       if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
-               return 1;
-
-       return 0;
+       return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
 }
 
 static inline void
@@ -373,10 +422,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
        if (*sg && *cluster) {
                if ((*sg)->length + nbytes > queue_max_segment_size(q))
                        goto new_segment;
-
-               if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-                       goto new_segment;
-               if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
+               if (!biovec_phys_mergeable(q, bvprv, bvec))
                        goto new_segment;
 
                (*sg)->length += nbytes;
index cb1e6cf7ac48f4e187e915896376cdfec79c9d2a..41b86f50d1262fff5d8b1939a9eafcec289390b7 100644 (file)
@@ -102,6 +102,14 @@ static int blk_flags_show(struct seq_file *m, const unsigned long flags,
        return 0;
 }
 
+static int queue_pm_only_show(void *data, struct seq_file *m)
+{
+       struct request_queue *q = data;
+
+       seq_printf(m, "%d\n", atomic_read(&q->pm_only));
+       return 0;
+}
+
 #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
 static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(QUEUED),
@@ -132,7 +140,6 @@ static const char *const blk_queue_flag_name[] = {
        QUEUE_FLAG_NAME(REGISTERED),
        QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
        QUEUE_FLAG_NAME(QUIESCED),
-       QUEUE_FLAG_NAME(PREEMPT_ONLY),
 };
 #undef QUEUE_FLAG_NAME
 
@@ -209,6 +216,7 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf,
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
        { "poll_stat", 0400, queue_poll_stat_show },
        { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
+       { "pm_only", 0600, queue_pm_only_show, NULL },
        { "state", 0600, queue_state_show, queue_state_write },
        { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
        { "zone_wlock", 0400, queue_zone_wlock_show, NULL },
@@ -423,8 +431,7 @@ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
 {
        const struct show_busy_params *params = data;
 
-       if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx &&
-           blk_mq_rq_state(rq) != MQ_RQ_IDLE)
+       if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx)
                __blk_mq_debugfs_rq_show(params->m,
                                         list_entry_rq(&rq->queuelist));
 }
index 4e028ee4243014ff8eed44627f8e0cc5068217ca..8a9544203173fac13a718f22e746128246dde4c7 100644 (file)
@@ -49,12 +49,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
        return true;
 }
 
-static inline void blk_mq_sched_completed_request(struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
 {
        struct elevator_queue *e = rq->q->elevator;
 
        if (e && e->type->ops.mq.completed_request)
-               e->type->ops.mq.completed_request(rq);
+               e->type->ops.mq.completed_request(rq, now);
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
index 41317c50a44628e9ef4930e9f17ad6d8297c9190..cfda95b85d3475570dfeb9468732b6d587584610 100644 (file)
@@ -232,13 +232,26 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 
        /*
         * We can hit rq == NULL here, because the tagging functions
-        * test and set the bit before assining ->rqs[].
+        * test and set the bit before assigning ->rqs[].
         */
        if (rq && rq->q == hctx->queue)
                iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
 }
 
+/**
+ * bt_for_each - iterate over the requests associated with a hardware queue
+ * @hctx:      Hardware queue to examine.
+ * @bt:                sbitmap to examine. This is either the breserved_tags member
+ *             or the bitmap_tags member of struct blk_mq_tags.
+ * @fn:                Pointer to the function that will be called for each request
+ *             associated with @hctx that has been assigned a driver tag.
+ *             @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
+ *             where rq is a pointer to a request.
+ * @data:      Will be passed as third argument to @fn.
+ * @reserved:  Indicates whether @bt is the breserved_tags member or the
+ *             bitmap_tags member of struct blk_mq_tags.
+ */
 static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
 {
@@ -280,6 +293,18 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
        return true;
 }
 
+/**
+ * bt_tags_for_each - iterate over the requests in a tag map
+ * @tags:      Tag map to iterate over.
+ * @bt:                sbitmap to examine. This is either the breserved_tags member
+ *             or the bitmap_tags member of struct blk_mq_tags.
+ * @fn:                Pointer to the function that will be called for each started
+ *             request. @fn will be called as follows: @fn(rq, @data,
+ *             @reserved) where rq is a pointer to a request.
+ * @data:      Will be passed as second argument to @fn.
+ * @reserved:  Indicates whether @bt is the breserved_tags member or the
+ *             bitmap_tags member of struct blk_mq_tags.
+ */
 static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, bool reserved)
 {
@@ -294,6 +319,15 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
 }
 
+/**
+ * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
+ * @tags:      Tag map to iterate over.
+ * @fn:                Pointer to the function that will be called for each started
+ *             request. @fn will be called as follows: @fn(rq, @priv,
+ *             reserved) where rq is a pointer to a request. 'reserved'
+ *             indicates whether or not @rq is a reserved request.
+ * @priv:      Will be passed as second argument to @fn.
+ */
 static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
                busy_tag_iter_fn *fn, void *priv)
 {
@@ -302,6 +336,15 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
        bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
 }
 
+/**
+ * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
+ * @tagset:    Tag set to iterate over.
+ * @fn:                Pointer to the function that will be called for each started
+ *             request. @fn will be called as follows: @fn(rq, @priv,
+ *             reserved) where rq is a pointer to a request. 'reserved'
+ *             indicates whether or not @rq is a reserved request.
+ * @priv:      Will be passed as second argument to @fn.
+ */
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
 {
@@ -314,6 +357,20 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
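
A representative user is a driver error handler cancelling all in-flight
commands; nvme-pci, for instance, does roughly:

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
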
 
+/**
+ * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
+ * @q:         Request queue to examine.
+ * @fn:                Pointer to the function that will be called for each request
+ *             on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
+ *             reserved) where rq is a pointer to a request and hctx points
+ *             to the hardware queue associated with the request. 'reserved'
+ *             indicates whether or not @rq is a reserved request.
+ * @priv:      Will be passed as third argument to @fn.
+ *
+ * Note: if @q->tag_set is shared with other request queues then @fn will be
+ * called for all requests on all queues that share that tag set and not only
+ * for requests associated with @q.
+ */
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv)
 {
@@ -321,9 +378,11 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
        int i;
 
        /*
-        * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-        * queue_hw_ctx after freeze the queue, so we use q_usage_counter
-        * to avoid race with it.
+        * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+        * while the queue is frozen. So we can use q_usage_counter to avoid
+        * racing with it. __blk_mq_update_nr_hw_queues() uses
+        * synchronize_rcu() to ensure this function left the critical section
+        * below.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;
@@ -332,7 +391,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                struct blk_mq_tags *tags = hctx->tags;
 
                /*
-                * If not software queues are currently mapped to this
+                * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check
                 */
                if (!blk_mq_hw_queue_mapped(hctx))
index e3c39ea8e17b04b0787e53959cd4f68cb1a43f3d..dcf10e39995a668d9f9af77ebb0edaf700aef2bb 100644 (file)
@@ -33,6 +33,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-pm.h"
 #include "blk-stat.h"
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
@@ -198,7 +199,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
        freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
        WARN_ON_ONCE(freeze_depth < 0);
        if (!freeze_depth) {
-               percpu_ref_reinit(&q->q_usage_counter);
+               percpu_ref_resurrect(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
 }
@@ -475,6 +476,7 @@ static void __blk_mq_free_request(struct request *rq)
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        const int sched_tag = rq->internal_tag;
 
+       blk_pm_mark_last_busy(rq);
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
@@ -526,6 +528,9 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
                blk_stat_add(rq, now);
        }
 
+       if (rq->internal_tag != -1)
+               blk_mq_sched_completed_request(rq, now);
+
        blk_account_io_done(rq, now);
 
        if (rq->end_io) {
@@ -562,8 +567,20 @@ static void __blk_mq_complete_request(struct request *rq)
 
        if (!blk_mq_mark_complete(rq))
                return;
-       if (rq->internal_tag != -1)
-               blk_mq_sched_completed_request(rq);
+
+       /*
+        * For most single queue controllers, there is only one irq vector
+        * for handling I/O completion, and that vector's affinity is set to
+        * all possible CPUs. On most architectures, this means the irq is
+        * handled on one specific CPU.
+        *
+        * So in the single queue case, complete the request in softirq
+        * context to avoid degrading I/O performance through irqs-off latency.
+        */
+       if (rq->q->nr_hw_queues == 1) {
+               __blk_complete_request(rq);
+               return;
+       }
 
        if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
                rq->q->softirq_done_fn(rq);
@@ -2137,8 +2154,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 {
-       blk_mq_debugfs_unregister_hctx(hctx);
-
        if (blk_mq_hw_queue_mapped(hctx))
                blk_mq_tag_idle(hctx);
 
@@ -2165,6 +2180,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
        queue_for_each_hw_ctx(q, hctx, i) {
                if (i == nr_queue)
                        break;
+               blk_mq_debugfs_unregister_hctx(hctx);
                blk_mq_exit_hctx(q, set, hctx, i);
        }
 }
@@ -2194,12 +2210,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
-                                       GFP_KERNEL, node);
+                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
        if (!hctx->ctxs)
                goto unregister_cpu_notifier;
 
-       if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
-                             node))
+       if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
                goto free_ctxs;
 
        hctx->nr_ctx = 0;
@@ -2212,7 +2228,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto free_bitmap;
 
-       hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
+       hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
+                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
        if (!hctx->fq)
                goto exit_hctx;
 
@@ -2222,8 +2239,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
        if (hctx->flags & BLK_MQ_F_BLOCKING)
                init_srcu_struct(hctx->srcu);
 
-       blk_mq_debugfs_register_hctx(q, hctx);
-
        return 0;
 
  free_fq:
@@ -2492,6 +2507,39 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
+/*
+ * Helper for setting up a single hardware queue, given the mq ops, the
+ * queue depth, and the tag set flags.
+ */
+struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
+                                          const struct blk_mq_ops *ops,
+                                          unsigned int queue_depth,
+                                          unsigned int set_flags)
+{
+       struct request_queue *q;
+       int ret;
+
+       memset(set, 0, sizeof(*set));
+       set->ops = ops;
+       set->nr_hw_queues = 1;
+       set->queue_depth = queue_depth;
+       set->numa_node = NUMA_NO_NODE;
+       set->flags = set_flags;
+
+       ret = blk_mq_alloc_tag_set(set);
+       if (ret)
+               return ERR_PTR(ret);
+
+       q = blk_mq_init_queue(set);
+       if (IS_ERR(q)) {
+               blk_mq_free_tag_set(set);
+               return q;
+       }
+
+       return q;
+}
+EXPORT_SYMBOL(blk_mq_init_sq_queue);
+
 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
 {
        int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
@@ -2506,48 +2554,90 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
        return hw_ctx_size;
 }
 
+static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
+               struct blk_mq_tag_set *set, struct request_queue *q,
+               int hctx_idx, int node)
+{
+       struct blk_mq_hw_ctx *hctx;
+
+       hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
+                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                       node);
+       if (!hctx)
+               return NULL;
+
+       if (!zalloc_cpumask_var_node(&hctx->cpumask,
+                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
+                               node)) {
+               kfree(hctx);
+               return NULL;
+       }
+
+       atomic_set(&hctx->nr_active, 0);
+       hctx->numa_node = node;
+       hctx->queue_num = hctx_idx;
+
+       if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
+               free_cpumask_var(hctx->cpumask);
+               kfree(hctx);
+               return NULL;
+       }
+       blk_mq_hctx_kobj_init(hctx);
+
+       return hctx;
+}
+
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                                struct request_queue *q)
 {
-       int i, j;
+       int i, j, end;
        struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
 
-       blk_mq_sysfs_unregister(q);
-
        /* protect against switching io scheduler  */
        mutex_lock(&q->sysfs_lock);
        for (i = 0; i < set->nr_hw_queues; i++) {
                int node;
-
-               if (hctxs[i])
-                       continue;
+               struct blk_mq_hw_ctx *hctx;
 
                node = blk_mq_hw_queue_to_node(q->mq_map, i);
-               hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
-                                       GFP_KERNEL, node);
-               if (!hctxs[i])
-                       break;
-
-               if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
-                                               node)) {
-                       kfree(hctxs[i]);
-                       hctxs[i] = NULL;
-                       break;
-               }
-
-               atomic_set(&hctxs[i]->nr_active, 0);
-               hctxs[i]->numa_node = node;
-               hctxs[i]->queue_num = i;
+               /*
+                * If the hw queue has been mapped to another NUMA node,
+                * we need to realloc the hctx. If allocation fails, fall
+                * back to the previous one.
+                */
+               if (hctxs[i] && (hctxs[i]->numa_node == node))
+                       continue;
 
-               if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
-                       free_cpumask_var(hctxs[i]->cpumask);
-                       kfree(hctxs[i]);
-                       hctxs[i] = NULL;
-                       break;
+               hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
+               if (hctx) {
+                       if (hctxs[i]) {
+                               blk_mq_exit_hctx(q, set, hctxs[i], i);
+                               kobject_put(&hctxs[i]->kobj);
+                       }
+                       hctxs[i] = hctx;
+               } else {
+                       if (hctxs[i])
+                               pr_warn("Allocating new hctx on node %d failed, falling back to previous one on node %d\n",
+                                       node, hctxs[i]->numa_node);
+                       else
+                               break;
                }
-               blk_mq_hctx_kobj_init(hctxs[i]);
        }
-       for (j = i; j < q->nr_hw_queues; j++) {
+       /*
+        * If increasing nr_hw_queues failed, free the newly allocated
+        * hctxs and keep the previous q->nr_hw_queues.
+        */
+       if (i != set->nr_hw_queues) {
+               j = q->nr_hw_queues;
+               end = i;
+       } else {
+               j = i;
+               end = q->nr_hw_queues;
+               q->nr_hw_queues = set->nr_hw_queues;
+       }
+
+       for (; j < end; j++) {
                struct blk_mq_hw_ctx *hctx = hctxs[j];
 
                if (hctx) {
@@ -2559,9 +2649,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
                }
        }
-       q->nr_hw_queues = i;
        mutex_unlock(&q->sysfs_lock);
-       blk_mq_sysfs_register(q);
 }
 
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
@@ -2659,25 +2747,6 @@ void blk_mq_free_queue(struct request_queue *q)
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 }
 
-/* Basically redo blk_mq_init_queue with queue frozen */
-static void blk_mq_queue_reinit(struct request_queue *q)
-{
-       WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
-
-       blk_mq_debugfs_unregister_hctxs(q);
-       blk_mq_sysfs_unregister(q);
-
-       /*
-        * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
-        * we should change hctx numa_node according to the new topology (this
-        * involves freeing and re-allocating memory, worth doing?)
-        */
-       blk_mq_map_swqueue(q);
-
-       blk_mq_sysfs_register(q);
-       blk_mq_debugfs_register_hctxs(q);
-}
-
 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
        int i;
@@ -2964,6 +3033,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 {
        struct request_queue *q;
        LIST_HEAD(head);
+       int prev_nr_hw_queues;
 
        lockdep_assert_held(&set->tag_list_lock);
 
@@ -2987,11 +3057,30 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                if (!blk_mq_elv_switch_none(&head, q))
                        goto switch_back;
 
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_debugfs_unregister_hctxs(q);
+               blk_mq_sysfs_unregister(q);
+       }
+
+       prev_nr_hw_queues = set->nr_hw_queues;
        set->nr_hw_queues = nr_hw_queues;
        blk_mq_update_queue_map(set);
+fallback:
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q);
-               blk_mq_queue_reinit(q);
+               if (q->nr_hw_queues != set->nr_hw_queues) {
+                       pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
+                                       nr_hw_queues, prev_nr_hw_queues);
+                       set->nr_hw_queues = prev_nr_hw_queues;
+                       blk_mq_map_queues(set);
+                       goto fallback;
+               }
+               blk_mq_map_swqueue(q);
+       }
+
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               blk_mq_sysfs_register(q);
+               blk_mq_debugfs_register_hctxs(q);
        }
 
 switch_back:
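
A minimal sketch of a driver using the new blk_mq_init_sq_queue() helper
above (the mydrv_* names are hypothetical; error handling is trimmed):

        #include <linux/blk-mq.h>
        #include <linux/err.h>

        struct mydrv_device {
                struct request_queue *queue;
                struct blk_mq_tag_set tag_set;
        };

        static blk_status_t mydrv_queue_rq(struct blk_mq_hw_ctx *hctx,
                                           const struct blk_mq_queue_data *bd)
        {
                blk_mq_start_request(bd->rq);
                /* ... hand bd->rq to the hardware ... */
                return BLK_STS_OK;
        }

        static const struct blk_mq_ops mydrv_mq_ops = {
                .queue_rq       = mydrv_queue_rq,
        };

        static int mydrv_init_queue(struct mydrv_device *md)
        {
                md->queue = blk_mq_init_sq_queue(&md->tag_set, &mydrv_mq_ops,
                                                 16, BLK_MQ_F_SHOULD_MERGE);
                if (IS_ERR(md->queue))
                        return PTR_ERR(md->queue);
                md->queue->queuedata = md;
                return 0;
        }

The helper zeroes and fills the embedded tag set, so callers no longer have
to open-code blk_mq_alloc_tag_set() plus blk_mq_init_queue() for the common
single-queue case.
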
diff --git a/block/blk-pm.c b/block/blk-pm.c
new file mode 100644 (file)
index 0000000..f8fdae0
--- /dev/null
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blk-mq.h>
+#include <linux/blk-pm.h>
+#include <linux/blkdev.h>
+#include <linux/pm_runtime.h>
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+/**
+ * blk_pm_runtime_init - Block layer runtime PM initialization routine
+ * @q: the queue of the device
+ * @dev: the device the queue belongs to
+ *
+ * Description:
+ *    Initialize the runtime-PM-related fields of @q and start autosuspend
+ *    for @dev. Drivers that want to take advantage of request-based runtime
+ *    PM should call this function after @dev has been initialized and its
+ *    request queue @q has been allocated, but while runtime PM cannot happen
+ *    yet (either because it is disabled/forbidden or because usage_count is
+ *    still above zero). In most cases, drivers should call this function
+ *    before any I/O has taken place.
+ *
+ *    This function sets up autosuspend for the device; the autosuspend delay
+ *    is set to -1 to make runtime suspend impossible until an updated value
+ *    is set by either the user or the driver. Drivers do not need to touch
+ *    any other autosuspend settings.
+ *
+ *    Block layer runtime PM is request based, so it only works for drivers
+ *    that use requests as their I/O unit rather than using bios directly.
+ */
+void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+{
+       q->dev = dev;
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_set_autosuspend_delay(q->dev, -1);
+       pm_runtime_use_autosuspend(q->dev);
+}
+EXPORT_SYMBOL(blk_pm_runtime_init);
+
+/**
+ * blk_pre_runtime_suspend - Pre runtime suspend check
+ * @q: the queue of the device
+ *
+ * Description:
+ *    This function checks whether runtime suspend is allowed for the device
+ *    by examining whether any requests are pending in the queue. If requests
+ *    are pending, the device cannot be runtime suspended; otherwise, the
+ *    queue's status is updated to SUSPENDING and the driver can proceed to
+ *    suspend the device.
+ *
+ *    If suspend is not allowed, we mark the device as last busy so that the
+ *    runtime PM core will try to autosuspend it again later.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_suspend callback.
+ *
+ * Return:
+ *    0      - OK to runtime suspend the device
+ *    -EBUSY - Device should not be runtime suspended
+ */
+int blk_pre_runtime_suspend(struct request_queue *q)
+{
+       int ret = 0;
+
+       if (!q->dev)
+               return ret;
+
+       WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE);
+
+       /*
+        * Increase the pm_only counter before checking whether any
+        * non-PM blk_queue_enter() calls are in progress, to prevent any
+        * new non-PM blk_queue_enter() calls from succeeding before the
+        * pm_only counter is decreased again.
+        */
+       blk_set_pm_only(q);
+       ret = -EBUSY;
+       /* Switch q_usage_counter from per-cpu to atomic mode. */
+       blk_freeze_queue_start(q);
+       /*
+        * Wait until atomic mode has been reached. Since that
+        * involves calling call_rcu(), it is guaranteed that later
+        * blk_queue_enter() calls see the pm-only state. See also
+        * http://lwn.net/Articles/573497/.
+        */
+       percpu_ref_switch_to_atomic_sync(&q->q_usage_counter);
+       if (percpu_ref_is_zero(&q->q_usage_counter))
+               ret = 0;
+       /* Switch q_usage_counter back to per-cpu mode. */
+       blk_mq_unfreeze_queue(q);
+
+       spin_lock_irq(q->queue_lock);
+       if (ret < 0)
+               pm_runtime_mark_last_busy(q->dev);
+       else
+               q->rpm_status = RPM_SUSPENDING;
+       spin_unlock_irq(q->queue_lock);
+
+       if (ret)
+               blk_clear_pm_only(q);
+
+       return ret;
+}
+EXPORT_SYMBOL(blk_pre_runtime_suspend);
+
+/**
+ * blk_post_runtime_suspend - Post runtime suspend processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_suspend function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime_suspend function. On failure, mark the device as last
+ *    busy so that the PM core will try to autosuspend it again later.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_suspend callback.
+ */
+void blk_post_runtime_suspend(struct request_queue *q, int err)
+{
+       if (!q->dev)
+               return;
+
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_SUSPENDED;
+       } else {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+       }
+       spin_unlock_irq(q->queue_lock);
+
+       if (err)
+               blk_clear_pm_only(q);
+}
+EXPORT_SYMBOL(blk_post_runtime_suspend);
+
+/**
+ * blk_pre_runtime_resume - Pre runtime resume processing
+ * @q: the queue of the device
+ *
+ * Description:
+ *    Update the queue's runtime status to RESUMING in preparation for the
+ *    runtime resume of the device.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_resume callback.
+ */
+void blk_pre_runtime_resume(struct request_queue *q)
+{
+       if (!q->dev)
+               return;
+
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_RESUMING;
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_pre_runtime_resume);
+
+/**
+ * blk_post_runtime_resume - Post runtime resume processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_resume function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime_resume function. If the device was successfully
+ *    resumed, process the requests that were queued while it was resuming,
+ *    then mark the device as last busy and request autosuspend for it.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_resume callback.
+ */
+void blk_post_runtime_resume(struct request_queue *q, int err)
+{
+       if (!q->dev)
+               return;
+
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+               pm_request_autosuspend(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDED;
+       }
+       spin_unlock_irq(q->queue_lock);
+
+       if (!err)
+               blk_clear_pm_only(q);
+}
+EXPORT_SYMBOL(blk_post_runtime_resume);
+
+/**
+ * blk_set_runtime_active - Force runtime status of the queue to be active
+ * @q: the queue of the device
+ *
+ * If the device is left runtime suspended during system suspend, the resume
+ * hook typically resumes the device and corrects its runtime PM status
+ * accordingly. However, that does not affect the queue's runtime PM status,
+ * which is still "suspended". This prevents requests from being processed
+ * from the queue.
+ *
+ * This function can be used in a driver's resume hook to correct the
+ * queue's runtime PM status and re-enable peeking requests from the queue.
+ * It should be called before the first request is added to the queue.
+ */
+void blk_set_runtime_active(struct request_queue *q)
+{
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_mark_last_busy(q->dev);
+       pm_request_autosuspend(q->dev);
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_set_runtime_active);
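
Taken together, the new blk-pm interface is meant to be wired up roughly as
follows. This is a sketch only: the mydrv_* device, its power hooks, and the
five-second autosuspend delay are all made up for illustration.

        #include <linux/blk-pm.h>
        #include <linux/pm_runtime.h>

        struct mydrv_device {
                struct request_queue *queue;
                /* ... */
        };

        static int mydrv_power_off(struct mydrv_device *md);  /* hypothetical */
        static int mydrv_power_on(struct mydrv_device *md);   /* hypothetical */

        static void mydrv_setup_runtime_pm(struct mydrv_device *md,
                                           struct device *dev)
        {
                blk_pm_runtime_init(md->queue, dev);
                /* Opt in to autosuspend after five seconds of idle time. */
                pm_runtime_set_autosuspend_delay(dev, 5000);
                pm_runtime_allow(dev);
        }

        static int mydrv_runtime_suspend(struct device *dev)
        {
                struct mydrv_device *md = dev_get_drvdata(dev);
                int ret;

                ret = blk_pre_runtime_suspend(md->queue);
                if (ret)        /* -EBUSY: requests pending, stay active */
                        return ret;
                ret = mydrv_power_off(md);
                blk_post_runtime_suspend(md->queue, ret);
                return ret;
        }

        static int mydrv_runtime_resume(struct device *dev)
        {
                struct mydrv_device *md = dev_get_drvdata(dev);
                int ret;

                blk_pre_runtime_resume(md->queue);
                ret = mydrv_power_on(md);
                blk_post_runtime_resume(md->queue, ret);
                return ret;
        }

        /* System resume for a device left runtime suspended across suspend. */
        static int mydrv_resume(struct device *dev)
        {
                struct mydrv_device *md = dev_get_drvdata(dev);

                if (pm_runtime_suspended(dev)) {
                        mydrv_power_on(md);
                        pm_runtime_disable(dev);
                        pm_runtime_set_active(dev);
                        pm_runtime_enable(dev);
                        blk_set_runtime_active(md->queue);
                }
                return 0;
        }
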
diff --git a/block/blk-pm.h b/block/blk-pm.h
new file mode 100644 (file)
index 0000000..a8564ea
--- /dev/null
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BLOCK_BLK_PM_H_
+#define _BLOCK_BLK_PM_H_
+
+#include <linux/pm_runtime.h>
+
+#ifdef CONFIG_PM
+static inline void blk_pm_request_resume(struct request_queue *q)
+{
+       if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+                      q->rpm_status == RPM_SUSPENDING))
+               pm_request_resume(q->dev);
+}
+
+static inline void blk_pm_mark_last_busy(struct request *rq)
+{
+       if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+               pm_runtime_mark_last_busy(rq->q->dev);
+}
+
+static inline void blk_pm_requeue_request(struct request *rq)
+{
+       lockdep_assert_held(rq->q->queue_lock);
+
+       if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+               rq->q->nr_pending--;
+}
+
+static inline void blk_pm_add_request(struct request_queue *q,
+                                     struct request *rq)
+{
+       lockdep_assert_held(q->queue_lock);
+
+       if (q->dev && !(rq->rq_flags & RQF_PM))
+               q->nr_pending++;
+}
+
+static inline void blk_pm_put_request(struct request *rq)
+{
+       lockdep_assert_held(rq->q->queue_lock);
+
+       if (rq->q->dev && !(rq->rq_flags & RQF_PM))
+               --rq->q->nr_pending;
+}
+#else
+static inline void blk_pm_request_resume(struct request_queue *q)
+{
+}
+
+static inline void blk_pm_mark_last_busy(struct request *rq)
+{
+}
+
+static inline void blk_pm_requeue_request(struct request *rq)
+{
+}
+
+static inline void blk_pm_add_request(struct request_queue *q,
+                                     struct request *rq)
+{
+}
+
+static inline void blk_pm_put_request(struct request *rq)
+{
+}
+#endif
+
+#endif /* _BLOCK_BLK_PM_H_ */
index 15c1f5e12eb89460bc42eb7f5807eaa03254e51d..e47a2f751884ddb7821fededef643693db752393 100644 (file)
@@ -97,8 +97,8 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
 
 void __blk_complete_request(struct request *req)
 {
-       int ccpu, cpu;
        struct request_queue *q = req->q;
+       int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu;
        unsigned long flags;
        bool shared = false;
 
@@ -110,8 +110,7 @@ void __blk_complete_request(struct request *req)
        /*
         * Select completion CPU
         */
-       if (req->cpu != -1) {
-               ccpu = req->cpu;
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
                if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
                        shared = cpus_share_cache(cpu, ccpu);
        } else
index 7587b1c3caaf5299613686e005ac211346234928..90561af85a6236750509be628d21077f03e3599d 100644 (file)
@@ -190,6 +190,7 @@ void blk_stat_enable_accounting(struct request_queue *q)
        blk_queue_flag_set(QUEUE_FLAG_STATS, q);
        spin_unlock(&q->stats->lock);
 }
+EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
 
 struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
index 01d0620a4e4a5e829c9de5c2dec0ac5e0b2b3ab3..4bda70e8db48a9150880dc04a8a1f3fcb8844ac6 100644 (file)
@@ -84,8 +84,7 @@ struct throtl_service_queue {
         * RB tree of active children throtl_grp's, which are sorted by
         * their ->disptime.
         */
-       struct rb_root          pending_tree;   /* RB tree of active tgs */
-       struct rb_node          *first_pending; /* first node in the tree */
+       struct rb_root_cached   pending_tree;   /* RB tree of active tgs */
        unsigned int            nr_pending;     /* # queued in the tree */
        unsigned long           first_pending_disptime; /* disptime of the first tg */
        struct timer_list       pending_timer;  /* fires on first_pending_disptime */
@@ -475,7 +474,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
        INIT_LIST_HEAD(&sq->queued[0]);
        INIT_LIST_HEAD(&sq->queued[1]);
-       sq->pending_tree = RB_ROOT;
+       sq->pending_tree = RB_ROOT_CACHED;
        timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }
 
@@ -616,31 +615,23 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
 static struct throtl_grp *
 throtl_rb_first(struct throtl_service_queue *parent_sq)
 {
+       struct rb_node *n;
        /* Service tree is empty */
        if (!parent_sq->nr_pending)
                return NULL;
 
-       if (!parent_sq->first_pending)
-               parent_sq->first_pending = rb_first(&parent_sq->pending_tree);
-
-       if (parent_sq->first_pending)
-               return rb_entry_tg(parent_sq->first_pending);
-
-       return NULL;
-}
-
-static void rb_erase_init(struct rb_node *n, struct rb_root *root)
-{
-       rb_erase(n, root);
-       RB_CLEAR_NODE(n);
+       n = rb_first_cached(&parent_sq->pending_tree);
+       WARN_ON_ONCE(!n);
+       if (!n)
+               return NULL;
+       return rb_entry_tg(n);
 }
 
 static void throtl_rb_erase(struct rb_node *n,
                            struct throtl_service_queue *parent_sq)
 {
-       if (parent_sq->first_pending == n)
-               parent_sq->first_pending = NULL;
-       rb_erase_init(n, &parent_sq->pending_tree);
+       rb_erase_cached(n, &parent_sq->pending_tree);
+       RB_CLEAR_NODE(n);
        --parent_sq->nr_pending;
 }
 
@@ -658,11 +649,11 @@ static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
 static void tg_service_queue_add(struct throtl_grp *tg)
 {
        struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
-       struct rb_node **node = &parent_sq->pending_tree.rb_node;
+       struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
        struct rb_node *parent = NULL;
        struct throtl_grp *__tg;
        unsigned long key = tg->disptime;
-       int left = 1;
+       bool leftmost = true;
 
        while (*node != NULL) {
                parent = *node;
@@ -672,15 +663,13 @@ static void tg_service_queue_add(struct throtl_grp *tg)
                        node = &parent->rb_left;
                else {
                        node = &parent->rb_right;
-                       left = 0;
+                       leftmost = false;
                }
        }
 
-       if (left)
-               parent_sq->first_pending = &tg->rb_node;
-
        rb_link_node(&tg->rb_node, parent, node);
-       rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
+       rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
+                              leftmost);
 }
 
 static void __throtl_enqueue_tg(struct throtl_grp *tg)
@@ -2126,21 +2115,11 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif
 
-static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
-{
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-       /* fallback to root_blkg if we fail to get a blkg ref */
-       if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
-               bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
-       bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-#endif
-}
-
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                    struct bio *bio)
 {
        struct throtl_qnode *qn = NULL;
-       struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+       struct throtl_grp *tg = blkg_to_tg(blkg);
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
        bool throttled = false;
@@ -2159,7 +2138,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
        if (unlikely(blk_queue_bypass(q)))
                goto out_unlock;
 
-       blk_throtl_assoc_bio(tg, bio);
        blk_throtl_update_idletime(tg);
 
        sq = &tg->service_queue;
index 9db4e389582c8da7848d458a9d4f12d64a1aecae..3d2aecba96a46c7b28dd1207a0874ab2e27520ee 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <linux/idr.h>
 #include <linux/blk-mq.h>
+#include <xen/xen.h>
 #include "blk-mq.h"
 
 /* Amount of time in which a process may batch requests */
@@ -124,7 +125,7 @@ static inline void __blk_get_queue(struct request_queue *q)
 }
 
 struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
-               int node, int cmd_size);
+               int node, int cmd_size, gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
@@ -149,6 +150,41 @@ static inline void blk_queue_enter_live(struct request_queue *q)
        percpu_ref_get(&q->q_usage_counter);
 }
 
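+/*
+ * Two bio_vecs can be merged into one physical segment if they are
+ * physically contiguous, Xen does not forbid merging them, and the merged
+ * segment does not cross the queue's segment boundary mask.
+ */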
+static inline bool biovec_phys_mergeable(struct request_queue *q,
+               struct bio_vec *vec1, struct bio_vec *vec2)
+{
+       unsigned long mask = queue_segment_boundary(q);
+       phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
+       phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
+
+       if (addr1 + vec1->bv_len != addr2)
+               return false;
+       if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2))
+               return false;
+       if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask))
+               return false;
+       return true;
+}
+
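+/*
+ * There is a gap if the new bio_vec does not start at offset 0 or if the
+ * previous bio_vec does not end on the queue's virtual boundary.
+ */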
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+               struct bio_vec *bprv, unsigned int offset)
+{
+       return offset ||
+               ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
+/*
+ * Check if adding a bio_vec after bprv with offset would create a gap in
+ * the SG list. Most drivers don't care about this, but some do.
+ */
+static inline bool bvec_gap_to_prev(struct request_queue *q,
+               struct bio_vec *bprv, unsigned int offset)
+{
+       if (!queue_virt_boundary(q))
+               return false;
+       return __bvec_gap_to_prev(q, bprv, offset);
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 void blk_flush_integrity(void);
 bool __bio_integrity_endio(struct bio *);
@@ -158,7 +194,38 @@ static inline bool bio_integrity_endio(struct bio *bio)
                return __bio_integrity_endio(bio);
        return true;
 }
-#else
+
+static inline bool integrity_req_gap_back_merge(struct request *req,
+               struct bio *next)
+{
+       struct bio_integrity_payload *bip = bio_integrity(req->bio);
+       struct bio_integrity_payload *bip_next = bio_integrity(next);
+
+       return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+                               bip_next->bip_vec[0].bv_offset);
+}
+
+static inline bool integrity_req_gap_front_merge(struct request *req,
+               struct bio *bio)
+{
+       struct bio_integrity_payload *bip = bio_integrity(bio);
+       struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
+
+       return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+                               bip_next->bip_vec[0].bv_offset);
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline bool integrity_req_gap_back_merge(struct request *req,
+               struct bio *next)
+{
+       return false;
+}
+static inline bool integrity_req_gap_front_merge(struct request *req,
+               struct bio *bio)
+{
+       return false;
+}
+
 static inline void blk_flush_integrity(void)
 {
 }
@@ -166,7 +233,7 @@ static inline bool bio_integrity_endio(struct bio *bio)
 {
        return true;
 }
-#endif
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 void blk_timeout_work(struct work_struct *work);
 unsigned long blk_rq_timeout(unsigned long timeout);
index bc63b3a2d18cad59b0061c4fc1d4ff7b042ac5f4..ec0d99995f5f0581ebe32928cc8e82acd8773e04 100644 (file)
 static struct bio_set bounce_bio_set, bounce_bio_split;
 static mempool_t page_pool, isa_page_pool;
 
+static void init_bounce_bioset(void)
+{
+       static bool bounce_bs_setup;
+       int ret;
+
+       if (bounce_bs_setup)
+               return;
+
+       ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
+       BUG_ON(ret);
+       if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
+               BUG_ON(1);
+
+       ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
+       BUG_ON(ret);
+       bounce_bs_setup = true;
+}
+
 #if defined(CONFIG_HIGHMEM)
 static __init int init_emergency_pool(void)
 {
@@ -44,14 +62,7 @@ static __init int init_emergency_pool(void)
        BUG_ON(ret);
        pr_info("pool size: %d pages\n", POOL_SIZE);
 
-       ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
-       BUG_ON(ret);
-       if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE))
-               BUG_ON(1);
-
-       ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0);
-       BUG_ON(ret);
-
+       init_bounce_bioset();
        return 0;
 }
 
@@ -86,6 +97,8 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
        return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
 }
 
+static DEFINE_MUTEX(isa_mutex);
+
 /*
  * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
  * as the max address, so check if the pool has already been created.
@@ -94,14 +107,20 @@ int init_emergency_isa_pool(void)
 {
        int ret;
 
-       if (mempool_initialized(&isa_page_pool))
+       mutex_lock(&isa_mutex);
+
+       if (mempool_initialized(&isa_page_pool)) {
+               mutex_unlock(&isa_mutex);
                return 0;
+       }
 
        ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
                           mempool_free_pages, (void *) 0);
        BUG_ON(ret);
 
        pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
+       init_bounce_bioset();
+       mutex_unlock(&isa_mutex);
        return 0;
 }
 
@@ -257,7 +276,9 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
                }
        }
 
-       bio_clone_blkcg_association(bio, bio_src);
+       bio_clone_blkg_association(bio, bio_src);
+
+       blkcg_bio_issue_init(bio);
 
        return bio;
 }
index 2eb87444b157271fe4c3e9d57f4cffc303f0c0a5..6a3d87dd3c1ac42abf04223ea14eb673e7c5ec5c 100644 (file)
@@ -1644,14 +1644,20 @@ static void cfq_pd_offline(struct blkg_policy_data *pd)
        int i;
 
        for (i = 0; i < IOPRIO_BE_NR; i++) {
-               if (cfqg->async_cfqq[0][i])
+               if (cfqg->async_cfqq[0][i]) {
                        cfq_put_queue(cfqg->async_cfqq[0][i]);
-               if (cfqg->async_cfqq[1][i])
+                       cfqg->async_cfqq[0][i] = NULL;
+               }
+               if (cfqg->async_cfqq[1][i]) {
                        cfq_put_queue(cfqg->async_cfqq[1][i]);
+                       cfqg->async_cfqq[1][i] = NULL;
+               }
        }
 
-       if (cfqg->async_idle_cfqq)
+       if (cfqg->async_idle_cfqq) {
                cfq_put_queue(cfqg->async_idle_cfqq);
+               cfqg->async_idle_cfqq = NULL;
+       }
 
        /*
         * @blkg is going offline and will be ignored by
@@ -3753,7 +3759,7 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
        uint64_t serial_nr;
 
        rcu_read_lock();
-       serial_nr = bio_blkcg(bio)->css.serial_nr;
+       serial_nr = __bio_blkcg(bio)->css.serial_nr;
        rcu_read_unlock();
 
        /*
@@ -3818,7 +3824,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
        struct cfq_group *cfqg;
 
        rcu_read_lock();
-       cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
+       cfqg = cfq_lookup_cfqg(cfqd, __bio_blkcg(bio));
        if (!cfqg) {
                cfqq = &cfqd->oom_cfqq;
                goto out;
index fae58b2f906fc5e0352c3f3194780abe13369784..8fdcd64ae12e19a5c6734780fa838f1c5a533eee 100644 (file)
@@ -41,6 +41,7 @@
 
 #include "blk.h"
 #include "blk-mq-sched.h"
+#include "blk-pm.h"
 #include "blk-wbt.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
@@ -557,27 +558,6 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
                e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
 
-#ifdef CONFIG_PM
-static void blk_pm_requeue_request(struct request *rq)
-{
-       if (rq->q->dev && !(rq->rq_flags & RQF_PM))
-               rq->q->nr_pending--;
-}
-
-static void blk_pm_add_request(struct request_queue *q, struct request *rq)
-{
-       if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 &&
-           (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
-               pm_request_resume(q->dev);
-}
-#else
-static inline void blk_pm_requeue_request(struct request *rq) {}
-static inline void blk_pm_add_request(struct request_queue *q,
-                                     struct request *rq)
-{
-}
-#endif
-
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
        /*
index be5bab20b2abf278fd7d7370c1a082de0928b1ed..cff6bdf27226bb597066bc377af3760b38a8250a 100644 (file)
@@ -567,7 +567,8 @@ static int exact_lock(dev_t devt, void *data)
        return 0;
 }
 
-static void register_disk(struct device *parent, struct gendisk *disk)
+static void register_disk(struct device *parent, struct gendisk *disk,
+                         const struct attribute_group **groups)
 {
        struct device *ddev = disk_to_dev(disk);
        struct block_device *bdev;
@@ -582,6 +583,10 @@ static void register_disk(struct device *parent, struct gendisk *disk)
        /* delay uevents, until we scanned partition table */
        dev_set_uevent_suppress(ddev, 1);
 
+       if (groups) {
+               WARN_ON(ddev->groups);
+               ddev->groups = groups;
+       }
        if (device_add(ddev))
                return;
        if (!sysfs_deprecated) {
@@ -647,6 +652,7 @@ exit:
  * __device_add_disk - add disk information to kernel list
  * @parent: parent device for the disk
  * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
  * @register_queue: register the queue if set to true
  *
  * This function registers the partitioning information in @disk
@@ -655,6 +661,7 @@ exit:
  * FIXME: error handling
  */
 static void __device_add_disk(struct device *parent, struct gendisk *disk,
+                             const struct attribute_group **groups,
                              bool register_queue)
 {
        dev_t devt;
@@ -698,7 +705,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
                blk_register_region(disk_devt(disk), disk->minors, NULL,
                                    exact_match, exact_lock, disk);
        }
-       register_disk(parent, disk);
+       register_disk(parent, disk, groups);
        if (register_queue)
                blk_register_queue(disk);
 
@@ -712,15 +719,17 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
        blk_integrity_add(disk);
 }
 
-void device_add_disk(struct device *parent, struct gendisk *disk)
+void device_add_disk(struct device *parent, struct gendisk *disk,
+                    const struct attribute_group **groups)
+
 {
-       __device_add_disk(parent, disk, true);
+       __device_add_disk(parent, disk, groups, true);
 }
 EXPORT_SYMBOL(device_add_disk);
 
 void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
 {
-       __device_add_disk(parent, disk, false);
+       __device_add_disk(parent, disk, NULL, false);
 }
 EXPORT_SYMBOL(device_add_disk_no_queue_reg);
 
index a1660bafc912452a37730ac0ed2bf02f6c8a99bc..eccac01a10b6552b693de1f1c45077bfb245def6 100644 (file)
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
 #include "blk-mq-tag.h"
-#include "blk-stat.h"
 
-/* Scheduling domains. */
+#define CREATE_TRACE_POINTS
+#include <trace/events/kyber.h>
+
+/*
+ * Scheduling domains: the device is divided into multiple domains based on the
+ * request type.
+ */
 enum {
        KYBER_READ,
-       KYBER_SYNC_WRITE,
-       KYBER_OTHER, /* Async writes, discard, etc. */
+       KYBER_WRITE,
+       KYBER_DISCARD,
+       KYBER_OTHER,
        KYBER_NUM_DOMAINS,
 };
 
-enum {
-       KYBER_MIN_DEPTH = 256,
+static const char *kyber_domain_names[] = {
+       [KYBER_READ] = "READ",
+       [KYBER_WRITE] = "WRITE",
+       [KYBER_DISCARD] = "DISCARD",
+       [KYBER_OTHER] = "OTHER",
+};
 
+enum {
        /*
         * In order to prevent starvation of synchronous requests by a flood of
         * asynchronous requests, we reserve 25% of requests for synchronous
@@ -51,25 +62,87 @@ enum {
 };
 
 /*
- * Initial device-wide depths for each scheduling domain.
+ * Maximum device-wide depth for each scheduling domain.
  *
- * Even for fast devices with lots of tags like NVMe, you can saturate
- * the device with only a fraction of the maximum possible queue depth.
- * So, we cap these to a reasonable value.
+ * Even for fast devices with lots of tags like NVMe, you can saturate the
+ * device with only a fraction of the maximum possible queue depth. So, we cap
+ * these to a reasonable value.
  */
 static const unsigned int kyber_depth[] = {
        [KYBER_READ] = 256,
-       [KYBER_SYNC_WRITE] = 128,
-       [KYBER_OTHER] = 64,
+       [KYBER_WRITE] = 128,
+       [KYBER_DISCARD] = 64,
+       [KYBER_OTHER] = 16,
 };
 
 /*
- * Scheduling domain batch sizes. We favor reads.
+ * Default latency targets for each scheduling domain.
+ */
+static const u64 kyber_latency_targets[] = {
+       [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
+       [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
+       [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
+};
+
+/*
+ * Batch size (number of requests we'll dispatch in a row) for each scheduling
+ * domain.
  */
 static const unsigned int kyber_batch_size[] = {
        [KYBER_READ] = 16,
-       [KYBER_SYNC_WRITE] = 8,
-       [KYBER_OTHER] = 8,
+       [KYBER_WRITE] = 8,
+       [KYBER_DISCARD] = 1,
+       [KYBER_OTHER] = 1,
+};
+
+/*
+ * Requests latencies are recorded in a histogram with buckets defined relative
+ * to the target latency:
+ *
+ * <= 1/4 * target latency
+ * <= 1/2 * target latency
+ * <= 3/4 * target latency
+ * <= target latency
+ * <= 1 1/4 * target latency
+ * <= 1 1/2 * target latency
+ * <= 1 3/4 * target latency
+ * > 1 3/4 * target latency
+ */
+enum {
+       /*
+        * The width of the latency histogram buckets is
+        * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
+        */
+       KYBER_LATENCY_SHIFT = 2,
+       /*
+        * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
+        * thus, "good".
+        */
+       KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
+       /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
+       KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
+};
+
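+/*
+ * For example, with the default 2 ms read target and KYBER_LATENCY_SHIFT = 2,
+ * the bucket width is 2 ms / 4 = 0.5 ms, so the eight buckets cover
+ * <= 0.5 ms, <= 1 ms, <= 1.5 ms, <= 2 ms ("good"), then <= 2.5 ms, <= 3 ms,
+ * <= 3.5 ms, and > 3.5 ms ("bad").
+ */
+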
+/*
+ * We measure both the total latency and the I/O latency (i.e., latency after
+ * submitting to the device).
+ */
+enum {
+       KYBER_TOTAL_LATENCY,
+       KYBER_IO_LATENCY,
+};
+
+static const char *kyber_latency_type_names[] = {
+       [KYBER_TOTAL_LATENCY] = "total",
+       [KYBER_IO_LATENCY] = "I/O",
+};
+
+/*
+ * Per-cpu latency histograms: total latency and I/O latency for each scheduling
+ * domain except for KYBER_OTHER.
+ */
+struct kyber_cpu_latency {
+       atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
 };
 
 /*
@@ -88,12 +161,9 @@ struct kyber_ctx_queue {
 struct kyber_queue_data {
        struct request_queue *q;
 
-       struct blk_stat_callback *cb;
-
        /*
-        * The device is divided into multiple scheduling domains based on the
-        * request type. Each domain has a fixed number of in-flight requests of
-        * that type device-wide, limited by these tokens.
+        * Each scheduling domain has a limited number of in-flight requests
+        * device-wide, limited by these tokens.
         */
        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
 
@@ -103,8 +173,19 @@ struct kyber_queue_data {
         */
        unsigned int async_depth;
 
+       struct kyber_cpu_latency __percpu *cpu_latency;
+
+       /* Timer for stats aggregation and adjusting domain tokens. */
+       struct timer_list timer;
+
+       unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
+
+       unsigned long latency_timeout[KYBER_OTHER];
+
+       int domain_p99[KYBER_OTHER];
+
        /* Target latencies in nanoseconds. */
-       u64 read_lat_nsec, write_lat_nsec;
+       u64 latency_targets[KYBER_OTHER];
 };
 
 struct kyber_hctx_data {
@@ -124,233 +205,219 @@ static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
 
 static unsigned int kyber_sched_domain(unsigned int op)
 {
-       if ((op & REQ_OP_MASK) == REQ_OP_READ)
+       switch (op & REQ_OP_MASK) {
+       case REQ_OP_READ:
                return KYBER_READ;
-       else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
-               return KYBER_SYNC_WRITE;
-       else
+       case REQ_OP_WRITE:
+               return KYBER_WRITE;
+       case REQ_OP_DISCARD:
+               return KYBER_DISCARD;
+       default:
                return KYBER_OTHER;
+       }
 }
 
-enum {
-       NONE = 0,
-       GOOD = 1,
-       GREAT = 2,
-       BAD = -1,
-       AWFUL = -2,
-};
-
-#define IS_GOOD(status) ((status) > 0)
-#define IS_BAD(status) ((status) < 0)
-
-static int kyber_lat_status(struct blk_stat_callback *cb,
-                           unsigned int sched_domain, u64 target)
+static void flush_latency_buckets(struct kyber_queue_data *kqd,
+                                 struct kyber_cpu_latency *cpu_latency,
+                                 unsigned int sched_domain, unsigned int type)
 {
-       u64 latency;
-
-       if (!cb->stat[sched_domain].nr_samples)
-               return NONE;
+       unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+       atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
+       unsigned int bucket;
 
-       latency = cb->stat[sched_domain].mean;
-       if (latency >= 2 * target)
-               return AWFUL;
-       else if (latency > target)
-               return BAD;
-       else if (latency <= target / 2)
-               return GREAT;
-       else /* (latency <= target) */
-               return GOOD;
+       for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+               buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
 }
 
 /*
- * Adjust the read or synchronous write depth given the status of reads and
- * writes. The goal is that the latencies of the two domains are fair (i.e., if
- * one is good, then the other is good).
+ * Calculate the histogram bucket with the given percentile rank, or -1 if there
+ * aren't enough samples yet.
  */
-static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
-                                 unsigned int sched_domain, int this_status,
-                                 int other_status)
+static int calculate_percentile(struct kyber_queue_data *kqd,
+                               unsigned int sched_domain, unsigned int type,
+                               unsigned int percentile)
 {
-       unsigned int orig_depth, depth;
+       unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
+       unsigned int bucket, samples = 0, percentile_samples;
+
+       for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
+               samples += buckets[bucket];
+
+       if (!samples)
+               return -1;
 
        /*
-        * If this domain had no samples, or reads and writes are both good or
-        * both bad, don't adjust the depth.
+        * We do the calculation once we have 500 samples or one second passes
+        * since the first sample was recorded, whichever comes first.
         */
-       if (this_status == NONE ||
-           (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
-           (IS_BAD(this_status) && IS_BAD(other_status)))
-               return;
-
-       orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
+       if (!kqd->latency_timeout[sched_domain])
+               kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
+       if (samples < 500 &&
+           time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
+               return -1;
+       }
+       kqd->latency_timeout[sched_domain] = 0;
 
-       if (other_status == NONE) {
-               depth++;
-       } else {
-               switch (this_status) {
-               case GOOD:
-                       if (other_status == AWFUL)
-                               depth -= max(depth / 4, 1U);
-                       else
-                               depth -= max(depth / 8, 1U);
-                       break;
-               case GREAT:
-                       if (other_status == AWFUL)
-                               depth /= 2;
-                       else
-                               depth -= max(depth / 4, 1U);
+       percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
+       for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
+               if (buckets[bucket] >= percentile_samples)
                        break;
-               case BAD:
-                       depth++;
-                       break;
-               case AWFUL:
-                       if (other_status == GREAT)
-                               depth += 2;
-                       else
-                               depth++;
-                       break;
-               }
+               percentile_samples -= buckets[bucket];
        }
+       memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
 
-       depth = clamp(depth, 1U, kyber_depth[sched_domain]);
-       if (depth != orig_depth)
-               sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+       trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
+                           kyber_latency_type_names[type], percentile,
+                           bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
+
+       return bucket;
 }
 
-/*
- * Adjust the depth of other requests given the status of reads and synchronous
- * writes. As long as either domain is doing fine, we don't throttle, but if
- * both domains are doing badly, we throttle heavily.
- */
-static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
-                                    int read_status, int write_status,
-                                    bool have_samples)
-{
-       unsigned int orig_depth, depth;
-       int status;
-
-       orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;
-
-       if (read_status == NONE && write_status == NONE) {
-               depth += 2;
-       } else if (have_samples) {
-               if (read_status == NONE)
-                       status = write_status;
-               else if (write_status == NONE)
-                       status = read_status;
-               else
-                       status = max(read_status, write_status);
-               switch (status) {
-               case GREAT:
-                       depth += 2;
-                       break;
-               case GOOD:
-                       depth++;
-                       break;
-               case BAD:
-                       depth -= max(depth / 4, 1U);
-                       break;
-               case AWFUL:
-                       depth /= 2;
-                       break;
-               }
+static void kyber_resize_domain(struct kyber_queue_data *kqd,
+                               unsigned int sched_domain, unsigned int depth)
+{
+       depth = clamp(depth, 1U, kyber_depth[sched_domain]);
+       if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
+               sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
+               trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
+                                  depth);
        }
-
-       depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
-       if (depth != orig_depth)
-               sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
 }
 
-/*
- * Apply heuristics for limiting queue depths based on gathered latency
- * statistics.
- */
-static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
+static void kyber_timer_fn(struct timer_list *t)
 {
-       struct kyber_queue_data *kqd = cb->data;
-       int read_status, write_status;
+       struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
+       unsigned int sched_domain;
+       int cpu;
+       bool bad = false;
+
+       /* Sum all of the per-cpu latency histograms. */
+       for_each_online_cpu(cpu) {
+               struct kyber_cpu_latency *cpu_latency;
+
+               cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
+               for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+                       flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                             KYBER_TOTAL_LATENCY);
+                       flush_latency_buckets(kqd, cpu_latency, sched_domain,
+                                             KYBER_IO_LATENCY);
+               }
+       }
 
-       read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
-       write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);
+       /*
+        * Check if any domains have a high I/O latency, which might indicate
+        * congestion in the device. Note that we use the p90; we don't want to
+        * be too sensitive to outliers here.
+        */
+       for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+               int p90;
 
-       kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
-       kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
-       kyber_adjust_other_depth(kqd, read_status, write_status,
-                                cb->stat[KYBER_OTHER].nr_samples != 0);
+               p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
+                                          90);
+               if (p90 >= KYBER_GOOD_BUCKETS)
+                       bad = true;
+       }
 
        /*
-        * Continue monitoring latencies if we aren't hitting the targets or
-        * we're still throttling other requests.
+        * Adjust the scheduling domain depths. If we determined that there was
+        * congestion, we throttle all domains with good latencies. Either way,
+        * we ease up on throttling domains with bad latencies.
         */
-       if (!blk_stat_is_active(kqd->cb) &&
-           ((IS_BAD(read_status) || IS_BAD(write_status) ||
-             kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
-               blk_stat_activate_msecs(kqd->cb, 100);
+       for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
+               unsigned int orig_depth, depth;
+               int p99;
+
+               p99 = calculate_percentile(kqd, sched_domain,
+                                          KYBER_TOTAL_LATENCY, 99);
+               /*
+                * This is kind of subtle: different domains will not
+                * necessarily have enough samples to calculate the latency
+                * percentiles during the same window, so we have to remember
+                * the p99 for the next time we observe congestion; once we do,
+                * we don't want to throttle again until we get more data, so we
+                * reset it to -1.
+                */
+               if (bad) {
+                       if (p99 < 0)
+                               p99 = kqd->domain_p99[sched_domain];
+                       kqd->domain_p99[sched_domain] = -1;
+               } else if (p99 >= 0) {
+                       kqd->domain_p99[sched_domain] = p99;
+               }
+               if (p99 < 0)
+                       continue;
+
+               /*
+                * If this domain has bad latency, throttle less. Otherwise,
+                * throttle more iff we determined that there is congestion.
+                *
+                * The new depth is scaled linearly with the p99 latency vs the
+                * latency target. E.g., if the p99 is 3/4 of the target, then
+                * we throttle down to 3/4 of the current depth, and if the p99
+                * is 2x the target, then we double the depth.
+                */
+               if (bad || p99 >= KYBER_GOOD_BUCKETS) {
+                       orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
+                       depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
+                       kyber_resize_domain(kqd, sched_domain, depth);
+               }
+       }
 }
 
-static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+static unsigned int kyber_sched_tags_shift(struct request_queue *q)
 {
        /*
         * All of the hardware queues have the same depth, so we can just grab
         * the shift of the first one.
         */
-       return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
-static int kyber_bucket_fn(const struct request *rq)
-{
-       return kyber_sched_domain(rq->cmd_flags);
+       return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
 }
 
 static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
 {
        struct kyber_queue_data *kqd;
-       unsigned int max_tokens;
        unsigned int shift;
        int ret = -ENOMEM;
        int i;
 
-       kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+       kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
        if (!kqd)
                goto err;
+
        kqd->q = q;
 
-       kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
-                                         KYBER_NUM_DOMAINS, kqd);
-       if (!kqd->cb)
+       kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
+                                           GFP_KERNEL | __GFP_ZERO);
+       if (!kqd->cpu_latency)
                goto err_kqd;
 
-       /*
-        * The maximum number of tokens for any scheduling domain is at least
-        * the queue depth of a single hardware queue. If the hardware doesn't
-        * have many tags, still provide a reasonable number.
-        */
-       max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
-                          KYBER_MIN_DEPTH);
+       timer_setup(&kqd->timer, kyber_timer_fn, 0);
+
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                WARN_ON(!kyber_depth[i]);
                WARN_ON(!kyber_batch_size[i]);
                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
-                                             max_tokens, -1, false, GFP_KERNEL,
-                                             q->node);
+                                             kyber_depth[i], -1, false,
+                                             GFP_KERNEL, q->node);
                if (ret) {
                        while (--i >= 0)
                                sbitmap_queue_free(&kqd->domain_tokens[i]);
-                       goto err_cb;
+                       goto err_buckets;
                }
-               sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
        }
 
-       shift = kyber_sched_tags_shift(kqd);
-       kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+       for (i = 0; i < KYBER_OTHER; i++) {
+               kqd->domain_p99[i] = -1;
+               kqd->latency_targets[i] = kyber_latency_targets[i];
+       }
 
-       kqd->read_lat_nsec = 2000000ULL;
-       kqd->write_lat_nsec = 10000000ULL;
+       shift = kyber_sched_tags_shift(q);
+       kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
 
        return kqd;
 
-err_cb:
-       blk_stat_free_callback(kqd->cb);
+err_buckets:
+       free_percpu(kqd->cpu_latency);
 err_kqd:
        kfree(kqd);
 err:
@@ -372,25 +439,24 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
                return PTR_ERR(kqd);
        }
 
+       blk_stat_enable_accounting(q);
+
        eq->elevator_data = kqd;
        q->elevator = eq;
 
-       blk_stat_add_callback(q, kqd->cb);
-
        return 0;
 }
 
 static void kyber_exit_sched(struct elevator_queue *e)
 {
        struct kyber_queue_data *kqd = e->elevator_data;
-       struct request_queue *q = kqd->q;
        int i;
 
-       blk_stat_remove_callback(q, kqd->cb);
+       del_timer_sync(&kqd->timer);
 
        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_queue_free(&kqd->domain_tokens[i]);
-       blk_stat_free_callback(kqd->cb);
+       free_percpu(kqd->cpu_latency);
        kfree(kqd);
 }
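
With the blk-stat callback gone, the scheduler's lifecycle around the timer follows the standard kernel idiom: timer_setup() at allocation time, timer_reduce() from the completion path (which only ever moves the expiry earlier), and del_timer_sync() on exit before the per-CPU buckets are freed. A minimal sketch of that idiom, with hypothetical names:

        /* Minimal sketch of the timer lifecycle used above; names hypothetical. */
        #include <linux/timer.h>

        struct my_stats {
                struct timer_list timer;
        };

        static void my_timer_fn(struct timer_list *t)
        {
                struct my_stats *s = from_timer(s, t, timer);

                /* aggregate per-CPU buckets, resize domains, etc. */
        }

        /*
         * init:     timer_setup(&s->timer, my_timer_fn, 0);
         * arm:      timer_reduce(&s->timer, jiffies + HZ / 10);
         * teardown: del_timer_sync(&s->timer);
         */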
 
@@ -558,41 +624,44 @@ static void kyber_finish_request(struct request *rq)
        rq_clear_domain_token(kqd, rq);
 }
 
-static void kyber_completed_request(struct request *rq)
+static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
+                              unsigned int sched_domain, unsigned int type,
+                              u64 target, u64 latency)
 {
-       struct request_queue *q = rq->q;
-       struct kyber_queue_data *kqd = q->elevator->elevator_data;
-       unsigned int sched_domain;
-       u64 now, latency, target;
+       unsigned int bucket;
+       u64 divisor;
 
-       /*
-        * Check if this request met our latency goal. If not, quickly gather
-        * some statistics and start throttling.
-        */
-       sched_domain = kyber_sched_domain(rq->cmd_flags);
-       switch (sched_domain) {
-       case KYBER_READ:
-               target = kqd->read_lat_nsec;
-               break;
-       case KYBER_SYNC_WRITE:
-               target = kqd->write_lat_nsec;
-               break;
-       default:
-               return;
+       if (latency > 0) {
+               divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
+               bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
+                              KYBER_LATENCY_BUCKETS - 1);
+       } else {
+               bucket = 0;
        }
 
-       /* If we are already monitoring latencies, don't check again. */
-       if (blk_stat_is_active(kqd->cb))
-               return;
+       atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
+}
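
For a concrete instance of the bucketing above (values hypothetical, and again assuming KYBER_LATENCY_SHIFT = 2):

        /*
         * Worked example, target = 2,000,000 ns:
         *   divisor = max(target >> KYBER_LATENCY_SHIFT, 1) = 500,000 ns
         *   latency = 1,600,000 ns
         *   bucket  = min((1,600,000 - 1) / 500,000, 7)     = 3
         * Bucket 3 is still at or below the target, so this sample counts
         * as "good" when the percentiles are computed.
         */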
 
-       now = ktime_get_ns();
-       if (now < rq->io_start_time_ns)
+static void kyber_completed_request(struct request *rq, u64 now)
+{
+       struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+       struct kyber_cpu_latency *cpu_latency;
+       unsigned int sched_domain;
+       u64 target;
+
+       sched_domain = kyber_sched_domain(rq->cmd_flags);
+       if (sched_domain == KYBER_OTHER)
                return;
 
-       latency = now - rq->io_start_time_ns;
+       cpu_latency = get_cpu_ptr(kqd->cpu_latency);
+       target = kqd->latency_targets[sched_domain];
+       add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
+                          target, now - rq->start_time_ns);
+       add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
+                          now - rq->io_start_time_ns);
+       put_cpu_ptr(kqd->cpu_latency);
 
-       if (latency > target)
-               blk_stat_activate_msecs(kqd->cb, 10);
+       timer_reduce(&kqd->timer, jiffies + HZ / 10);
 }
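
The completion path is deliberately cheap: it touches only a per-CPU bucket (get_cpu_ptr() disables preemption around the increment) and conditionally pulls the timer in, with no locking. Stripped to the idiom, with hypothetical names:

        /* Sketch of the lock-free per-CPU accounting idiom above. */
        #include <linux/atomic.h>
        #include <linux/percpu.h>

        struct my_cpu_latency {
                atomic_t buckets[8];
        };

        static void record_sample(struct my_cpu_latency __percpu *lat, int bucket)
        {
                struct my_cpu_latency *cpu_lat = get_cpu_ptr(lat);

                atomic_inc(&cpu_lat->buckets[bucket]);
                put_cpu_ptr(lat);
        }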
 
 struct flush_kcq_data {
@@ -713,6 +782,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
+               } else {
+                       trace_kyber_throttled(kqd->q,
+                                             kyber_domain_names[khd->cur_domain]);
                }
        } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
@@ -723,6 +795,9 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
+               } else {
+                       trace_kyber_throttled(kqd->q,
+                                             kyber_domain_names[khd->cur_domain]);
                }
        }
 
@@ -790,17 +865,17 @@ static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
        return false;
 }
 
-#define KYBER_LAT_SHOW_STORE(op)                                       \
-static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,         \
-                                    char *page)                        \
+#define KYBER_LAT_SHOW_STORE(domain, name)                             \
+static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,       \
+                                      char *page)                      \
 {                                                                      \
        struct kyber_queue_data *kqd = e->elevator_data;                \
                                                                        \
-       return sprintf(page, "%llu\n", kqd->op##_lat_nsec);             \
+       return sprintf(page, "%llu\n", kqd->latency_targets[domain]);   \
 }                                                                      \
                                                                        \
-static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,                \
-                                     const char *page, size_t count)   \
+static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,      \
+                                       const char *page, size_t count) \
 {                                                                      \
        struct kyber_queue_data *kqd = e->elevator_data;                \
        unsigned long long nsec;                                        \
@@ -810,12 +885,12 @@ static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,           \
        if (ret)                                                        \
                return ret;                                             \
                                                                        \
-       kqd->op##_lat_nsec = nsec;                                      \
+       kqd->latency_targets[domain] = nsec;                            \
                                                                        \
        return count;                                                   \
 }
-KYBER_LAT_SHOW_STORE(read);
-KYBER_LAT_SHOW_STORE(write);
+KYBER_LAT_SHOW_STORE(KYBER_READ, read);
+KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
 #undef KYBER_LAT_SHOW_STORE
 
 #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
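
These attributes surface as read_lat_nsec and write_lat_nsec in the scheduler's sysfs directory, so the per-domain targets can be tuned at runtime, e.g. by writing a nanosecond value to /sys/block/<dev>/queue/iosched/write_lat_nsec.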
@@ -882,7 +957,8 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m)      \
        return 0;                                                       \
 }
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
-KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
+KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
 KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
 #undef KYBER_DEBUGFS_DOMAIN_ATTRS
 
@@ -900,20 +976,7 @@ static int kyber_cur_domain_show(void *data, struct seq_file *m)
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;
 
-       switch (khd->cur_domain) {
-       case KYBER_READ:
-               seq_puts(m, "READ\n");
-               break;
-       case KYBER_SYNC_WRITE:
-               seq_puts(m, "SYNC_WRITE\n");
-               break;
-       case KYBER_OTHER:
-               seq_puts(m, "OTHER\n");
-               break;
-       default:
-               seq_printf(m, "%u\n", khd->cur_domain);
-               break;
-       }
+       seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
        return 0;
 }
 
@@ -930,7 +993,8 @@ static int kyber_batching_show(void *data, struct seq_file *m)
        {#name "_tokens", 0400, kyber_##name##_tokens_show}
 static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
        KYBER_QUEUE_DOMAIN_ATTRS(read),
-       KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
+       KYBER_QUEUE_DOMAIN_ATTRS(write),
+       KYBER_QUEUE_DOMAIN_ATTRS(discard),
        KYBER_QUEUE_DOMAIN_ATTRS(other),
        {"async_depth", 0400, kyber_async_depth_show},
        {},
@@ -942,7 +1006,8 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
        {#name "_waiting", 0400, kyber_##name##_waiting_show}
 static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
        KYBER_HCTX_DOMAIN_ATTRS(read),
-       KYBER_HCTX_DOMAIN_ATTRS(sync_write),
+       KYBER_HCTX_DOMAIN_ATTRS(write),
+       KYBER_HCTX_DOMAIN_ATTRS(discard),
        KYBER_HCTX_DOMAIN_ATTRS(other),
        {"cur_domain", 0400, kyber_cur_domain_show},
        {"batching", 0400, kyber_batching_show},
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
deleted file mode 100644 (file)
index 581312a..0000000
+++ /dev/null
@@ -1,7229 +0,0 @@
-/*
-
-  Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers
-
-  Copyright 1998-2001 by Leonard N. Zubkoff <lnz@dandelion.com>
-  Portions Copyright 2002 by Mylex (An IBM Business Unit)
-
-  This program is free software; you may redistribute and/or modify it under
-  the terms of the GNU General Public License Version 2 as published by the
-  Free Software Foundation.
-
-  This program is distributed in the hope that it will be useful, but
-  WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY
-  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-  for complete details.
-
-*/
-
-
-#define DAC960_DriverVersion                   "2.5.49"
-#define DAC960_DriverDate                      "21 Aug 2007"
-
-
-#include <linux/compiler.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/miscdevice.h>
-#include <linux/blkdev.h>
-#include <linux/bio.h>
-#include <linux/completion.h>
-#include <linux/delay.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/blkpg.h>
-#include <linux/dma-mapping.h>
-#include <linux/interrupt.h>
-#include <linux/ioport.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/reboot.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/jiffies.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <asm/io.h>
-#include <linux/uaccess.h>
-#include "DAC960.h"
-
-#define DAC960_GAM_MINOR       252
-
-
-static DEFINE_MUTEX(DAC960_mutex);
-static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
-static int DAC960_ControllerCount;
-static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
-
-static long disk_size(DAC960_Controller_T *p, int drive_nr)
-{
-       if (p->FirmwareType == DAC960_V1_Controller) {
-               if (drive_nr >= p->LogicalDriveCount)
-                       return 0;
-               return p->V1.LogicalDriveInformation[drive_nr].
-                       LogicalDriveSize;
-       } else {
-               DAC960_V2_LogicalDeviceInfo_T *i =
-                       p->V2.LogicalDeviceInformation[drive_nr];
-               if (i == NULL)
-                       return 0;
-               return i->ConfigurableDeviceSize;
-       }
-}
-
-static int DAC960_open(struct block_device *bdev, fmode_t mode)
-{
-       struct gendisk *disk = bdev->bd_disk;
-       DAC960_Controller_T *p = disk->queue->queuedata;
-       int drive_nr = (long)disk->private_data;
-       int ret = -ENXIO;
-
-       mutex_lock(&DAC960_mutex);
-       if (p->FirmwareType == DAC960_V1_Controller) {
-               if (p->V1.LogicalDriveInformation[drive_nr].
-                   LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
-                       goto out;
-       } else {
-               DAC960_V2_LogicalDeviceInfo_T *i =
-                       p->V2.LogicalDeviceInformation[drive_nr];
-               if (!i || i->LogicalDeviceState == DAC960_V2_LogicalDevice_Offline)
-                       goto out;
-       }
-
-       check_disk_change(bdev);
-
-       if (!get_capacity(p->disks[drive_nr]))
-               goto out;
-       ret = 0;
-out:
-       mutex_unlock(&DAC960_mutex);
-       return ret;
-}
-
-static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-       struct gendisk *disk = bdev->bd_disk;
-       DAC960_Controller_T *p = disk->queue->queuedata;
-       int drive_nr = (long)disk->private_data;
-
-       if (p->FirmwareType == DAC960_V1_Controller) {
-               geo->heads = p->V1.GeometryTranslationHeads;
-               geo->sectors = p->V1.GeometryTranslationSectors;
-               geo->cylinders = p->V1.LogicalDriveInformation[drive_nr].
-                       LogicalDriveSize / (geo->heads * geo->sectors);
-       } else {
-               DAC960_V2_LogicalDeviceInfo_T *i =
-                       p->V2.LogicalDeviceInformation[drive_nr];
-               switch (i->DriveGeometry) {
-               case DAC960_V2_Geometry_128_32:
-                       geo->heads = 128;
-                       geo->sectors = 32;
-                       break;
-               case DAC960_V2_Geometry_255_63:
-                       geo->heads = 255;
-                       geo->sectors = 63;
-                       break;
-               default:
-                       DAC960_Error("Illegal Logical Device Geometry %d\n",
-                                       p, i->DriveGeometry);
-                       return -EINVAL;
-               }
-
-               geo->cylinders = i->ConfigurableDeviceSize /
-                       (geo->heads * geo->sectors);
-       }
-       
-       return 0;
-}
-
-static unsigned int DAC960_check_events(struct gendisk *disk,
-                                       unsigned int clearing)
-{
-       DAC960_Controller_T *p = disk->queue->queuedata;
-       int drive_nr = (long)disk->private_data;
-
-       if (!p->LogicalDriveInitiallyAccessible[drive_nr])
-               return DISK_EVENT_MEDIA_CHANGE;
-       return 0;
-}
-
-static int DAC960_revalidate_disk(struct gendisk *disk)
-{
-       DAC960_Controller_T *p = disk->queue->queuedata;
-       int unit = (long)disk->private_data;
-
-       set_capacity(disk, disk_size(p, unit));
-       return 0;
-}
-
-static const struct block_device_operations DAC960_BlockDeviceOperations = {
-       .owner                  = THIS_MODULE,
-       .open                   = DAC960_open,
-       .getgeo                 = DAC960_getgeo,
-       .check_events           = DAC960_check_events,
-       .revalidate_disk        = DAC960_revalidate_disk,
-};
-
-
-/*
-  DAC960_AnnounceDriver announces the Driver Version and Date, Author's Name,
-  Copyright Notice, and Electronic Mail Address.
-*/
-
-static void DAC960_AnnounceDriver(DAC960_Controller_T *Controller)
-{
-  DAC960_Announce("***** DAC960 RAID Driver Version "
-                 DAC960_DriverVersion " of "
-                 DAC960_DriverDate " *****\n", Controller);
-  DAC960_Announce("Copyright 1998-2001 by Leonard N. Zubkoff "
-                 "<lnz@dandelion.com>\n", Controller);
-}
-
-
-/*
-  DAC960_Failure prints a standardized error message, and then returns false.
-*/
-
-static bool DAC960_Failure(DAC960_Controller_T *Controller,
-                             unsigned char *ErrorMessage)
-{
-  DAC960_Error("While configuring DAC960 PCI RAID Controller at\n",
-              Controller);
-  if (Controller->IO_Address == 0)
-    DAC960_Error("PCI Bus %d Device %d Function %d I/O Address N/A "
-                "PCI Address 0x%X\n", Controller,
-                Controller->Bus, Controller->Device,
-                Controller->Function, Controller->PCI_Address);
-  else DAC960_Error("PCI Bus %d Device %d Function %d I/O Address "
-                   "0x%X PCI Address 0x%X\n", Controller,
-                   Controller->Bus, Controller->Device,
-                   Controller->Function, Controller->IO_Address,
-                   Controller->PCI_Address);
-  DAC960_Error("%s FAILED - DETACHING\n", Controller, ErrorMessage);
-  return false;
-}
-
-/*
-  init_dma_loaf() and slice_dma_loaf() are helper functions for
-  aggregating the dma-mapped memory for a well-known collection of
-  data structures that are of different lengths.
-
-  These routines don't guarantee any alignment.  The caller must
-  include any space needed for alignment in the sizes of the structures
-  that are passed in.
- */
-
-static bool init_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf,
-                                                                size_t len)
-{
-       void *cpu_addr;
-       dma_addr_t dma_handle;
-
-       cpu_addr = pci_alloc_consistent(dev, len, &dma_handle);
-       if (cpu_addr == NULL)
-               return false;
-       
-       loaf->cpu_free = loaf->cpu_base = cpu_addr;
-       loaf->dma_free = loaf->dma_base = dma_handle;
-       loaf->length = len;
-       memset(cpu_addr, 0, len);
-       return true;
-}
-
-static void *slice_dma_loaf(struct dma_loaf *loaf, size_t len,
-                                       dma_addr_t *dma_handle)
-{
-       void *cpu_end = loaf->cpu_free + len;
-       void *cpu_addr = loaf->cpu_free;
-
-       BUG_ON(cpu_end > loaf->cpu_base + loaf->length);
-       *dma_handle = loaf->dma_free;
-       loaf->cpu_free = cpu_end;
-       loaf->dma_free += len;
-       return cpu_addr;
-}
-
-static void free_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf_handle)
-{
-       if (loaf_handle->cpu_base != NULL)
-               pci_free_consistent(dev, loaf_handle->length,
-                       loaf_handle->cpu_base, loaf_handle->dma_base);
-}
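
The loaf helpers amount to a small bump allocator carved out of a single coherent DMA allocation: a caller slices it up front and frees the whole loaf at teardown. Hypothetical usage (sizes and the mbox/status names are invented for illustration):

        /* Hypothetical usage of the loaf helpers above. */
        struct dma_loaf loaf;
        dma_addr_t mbox_dma, status_dma;
        size_t mbox_len, status_len;    /* sized by the caller */
        void *mbox, *status;

        if (!init_dma_loaf(PCI_Device, &loaf, mbox_len + status_len))
                return DAC960_Failure(Controller, "DMA ALLOCATION");
        mbox   = slice_dma_loaf(&loaf, mbox_len, &mbox_dma);
        status = slice_dma_loaf(&loaf, status_len, &status_dma);
        /* ... at teardown: */
        free_dma_loaf(PCI_Device, &loaf);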
-
-
-/*
-  DAC960_CreateAuxiliaryStructures allocates and initializes the auxiliary
-  data structures for Controller.  It returns true on success and false on
-  failure.
-*/
-
-static bool DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller)
-{
-  int CommandAllocationLength, CommandAllocationGroupSize;
-  int CommandsRemaining = 0, CommandIdentifier, CommandGroupByteCount;
-  void *AllocationPointer = NULL;
-  void *ScatterGatherCPU = NULL;
-  dma_addr_t ScatterGatherDMA;
-  struct dma_pool *ScatterGatherPool;
-  void *RequestSenseCPU = NULL;
-  dma_addr_t RequestSenseDMA;
-  struct dma_pool *RequestSensePool = NULL;
-
-  if (Controller->FirmwareType == DAC960_V1_Controller)
-    {
-      CommandAllocationLength = offsetof(DAC960_Command_T, V1.EndMarker);
-      CommandAllocationGroupSize = DAC960_V1_CommandAllocationGroupSize;
-      ScatterGatherPool = dma_pool_create("DAC960_V1_ScatterGather",
-               &Controller->PCIDevice->dev,
-       DAC960_V1_ScatterGatherLimit * sizeof(DAC960_V1_ScatterGatherSegment_T),
-       sizeof(DAC960_V1_ScatterGatherSegment_T), 0);
-      if (ScatterGatherPool == NULL)
-           return DAC960_Failure(Controller,
-                       "AUXILIARY STRUCTURE CREATION (SG)");
-      Controller->ScatterGatherPool = ScatterGatherPool;
-    }
-  else
-    {
-      CommandAllocationLength = offsetof(DAC960_Command_T, V2.EndMarker);
-      CommandAllocationGroupSize = DAC960_V2_CommandAllocationGroupSize;
-      ScatterGatherPool = dma_pool_create("DAC960_V2_ScatterGather",
-               &Controller->PCIDevice->dev,
-       DAC960_V2_ScatterGatherLimit * sizeof(DAC960_V2_ScatterGatherSegment_T),
-       sizeof(DAC960_V2_ScatterGatherSegment_T), 0);
-      if (ScatterGatherPool == NULL)
-           return DAC960_Failure(Controller,
-                       "AUXILIARY STRUCTURE CREATION (SG)");
-      RequestSensePool = dma_pool_create("DAC960_V2_RequestSense",
-               &Controller->PCIDevice->dev, sizeof(DAC960_SCSI_RequestSense_T),
-               sizeof(int), 0);
-      if (RequestSensePool == NULL) {
-           dma_pool_destroy(ScatterGatherPool);
-           return DAC960_Failure(Controller,
-                       "AUXILIARY STRUCTURE CREATION (SG)");
-      }
-      Controller->ScatterGatherPool = ScatterGatherPool;
-      Controller->V2.RequestSensePool = RequestSensePool;
-    }
-  Controller->CommandAllocationGroupSize = CommandAllocationGroupSize;
-  Controller->FreeCommands = NULL;
-  for (CommandIdentifier = 1;
-       CommandIdentifier <= Controller->DriverQueueDepth;
-       CommandIdentifier++)
-    {
-      DAC960_Command_T *Command;
-      if (--CommandsRemaining <= 0)
-       {
-         CommandsRemaining =
-               Controller->DriverQueueDepth - CommandIdentifier + 1;
-         if (CommandsRemaining > CommandAllocationGroupSize)
-               CommandsRemaining = CommandAllocationGroupSize;
-         CommandGroupByteCount =
-               CommandsRemaining * CommandAllocationLength;
-         AllocationPointer = kzalloc(CommandGroupByteCount, GFP_ATOMIC);
-         if (AllocationPointer == NULL)
-               return DAC960_Failure(Controller,
-                                       "AUXILIARY STRUCTURE CREATION");
-        }
-      Command = (DAC960_Command_T *) AllocationPointer;
-      AllocationPointer += CommandAllocationLength;
-      Command->CommandIdentifier = CommandIdentifier;
-      Command->Controller = Controller;
-      Command->Next = Controller->FreeCommands;
-      Controller->FreeCommands = Command;
-      Controller->Commands[CommandIdentifier-1] = Command;
-      ScatterGatherCPU = dma_pool_alloc(ScatterGatherPool, GFP_ATOMIC,
-                                                       &ScatterGatherDMA);
-      if (ScatterGatherCPU == NULL)
-         return DAC960_Failure(Controller, "AUXILIARY STRUCTURE CREATION");
-
-      if (RequestSensePool != NULL) {
-         RequestSenseCPU = dma_pool_alloc(RequestSensePool, GFP_ATOMIC,
-                                               &RequestSenseDMA);
-         if (RequestSenseCPU == NULL) {
-                dma_pool_free(ScatterGatherPool, ScatterGatherCPU,
-                                ScatterGatherDMA);
-               return DAC960_Failure(Controller,
-                                       "AUXILIARY STRUCTURE CREATION");
-         }
-        }
-     if (Controller->FirmwareType == DAC960_V1_Controller) {
-        Command->cmd_sglist = Command->V1.ScatterList;
-       Command->V1.ScatterGatherList =
-               (DAC960_V1_ScatterGatherSegment_T *)ScatterGatherCPU;
-       Command->V1.ScatterGatherListDMA = ScatterGatherDMA;
-       sg_init_table(Command->cmd_sglist, DAC960_V1_ScatterGatherLimit);
-      } else {
-        Command->cmd_sglist = Command->V2.ScatterList;
-       Command->V2.ScatterGatherList =
-               (DAC960_V2_ScatterGatherSegment_T *)ScatterGatherCPU;
-       Command->V2.ScatterGatherListDMA = ScatterGatherDMA;
-       Command->V2.RequestSense =
-                               (DAC960_SCSI_RequestSense_T *)RequestSenseCPU;
-       Command->V2.RequestSenseDMA = RequestSenseDMA;
-       sg_init_table(Command->cmd_sglist, DAC960_V2_ScatterGatherLimit);
-      }
-    }
-  return true;
-}
-
-
-/*
-  DAC960_DestroyAuxiliaryStructures deallocates the auxiliary data
-  structures for Controller.
-*/
-
-static void DAC960_DestroyAuxiliaryStructures(DAC960_Controller_T *Controller)
-{
-  int i;
-  struct dma_pool *ScatterGatherPool = Controller->ScatterGatherPool;
-  struct dma_pool *RequestSensePool = NULL;
-  void *ScatterGatherCPU;
-  dma_addr_t ScatterGatherDMA;
-  void *RequestSenseCPU;
-  dma_addr_t RequestSenseDMA;
-  DAC960_Command_T *CommandGroup = NULL;
-  
-
-  if (Controller->FirmwareType == DAC960_V2_Controller)
-        RequestSensePool = Controller->V2.RequestSensePool;
-
-  Controller->FreeCommands = NULL;
-  for (i = 0; i < Controller->DriverQueueDepth; i++)
-    {
-      DAC960_Command_T *Command = Controller->Commands[i];
-
-      if (Command == NULL)
-         continue;
-
-      if (Controller->FirmwareType == DAC960_V1_Controller) {
-         ScatterGatherCPU = (void *)Command->V1.ScatterGatherList;
-         ScatterGatherDMA = Command->V1.ScatterGatherListDMA;
-         RequestSenseCPU = NULL;
-         RequestSenseDMA = (dma_addr_t)0;
-      } else {
-          ScatterGatherCPU = (void *)Command->V2.ScatterGatherList;
-         ScatterGatherDMA = Command->V2.ScatterGatherListDMA;
-         RequestSenseCPU = (void *)Command->V2.RequestSense;
-         RequestSenseDMA = Command->V2.RequestSenseDMA;
-      }
-      if (ScatterGatherCPU != NULL)
-          dma_pool_free(ScatterGatherPool, ScatterGatherCPU, ScatterGatherDMA);
-      if (RequestSenseCPU != NULL)
-          dma_pool_free(RequestSensePool, RequestSenseCPU, RequestSenseDMA);
-
-      if ((Command->CommandIdentifier
-          % Controller->CommandAllocationGroupSize) == 1) {
-          /*
-           * We can't free the group of commands until all of the
-           * request sense and scatter gather dma structures are free.
-            * Remember the beginning of the group, but don't free it
-           * until we've reached the beginning of the next group.
-           */
-          kfree(CommandGroup);
-          CommandGroup = Command;
-      }
-      Controller->Commands[i] = NULL;
-    }
-  kfree(CommandGroup);
-
-  if (Controller->CombinedStatusBuffer != NULL)
-    {
-      kfree(Controller->CombinedStatusBuffer);
-      Controller->CombinedStatusBuffer = NULL;
-      Controller->CurrentStatusBuffer = NULL;
-    }
-
-  dma_pool_destroy(ScatterGatherPool);
-  if (Controller->FirmwareType == DAC960_V1_Controller)
-       return;
-
-  dma_pool_destroy(RequestSensePool);
-
-  for (i = 0; i < DAC960_MaxLogicalDrives; i++) {
-       kfree(Controller->V2.LogicalDeviceInformation[i]);
-       Controller->V2.LogicalDeviceInformation[i] = NULL;
-  }
-
-  for (i = 0; i < DAC960_V2_MaxPhysicalDevices; i++)
-    {
-      kfree(Controller->V2.PhysicalDeviceInformation[i]);
-      Controller->V2.PhysicalDeviceInformation[i] = NULL;
-      kfree(Controller->V2.InquiryUnitSerialNumber[i]);
-      Controller->V2.InquiryUnitSerialNumber[i] = NULL;
-    }
-}
-
-
-/*
-  DAC960_V1_ClearCommand clears critical fields of Command for DAC960 V1
-  Firmware Controllers.
-*/
-
-static inline void DAC960_V1_ClearCommand(DAC960_Command_T *Command)
-{
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  memset(CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
-  Command->V1.CommandStatus = 0;
-}
-
-
-/*
-  DAC960_V2_ClearCommand clears critical fields of Command for DAC960 V2
-  Firmware Controllers.
-*/
-
-static inline void DAC960_V2_ClearCommand(DAC960_Command_T *Command)
-{
-  DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  memset(CommandMailbox, 0, sizeof(DAC960_V2_CommandMailbox_T));
-  Command->V2.CommandStatus = 0;
-}
-
-
-/*
-  DAC960_AllocateCommand allocates a Command structure from Controller's
-  free list.  During driver initialization, a special initialization command
-  has been placed on the free list to guarantee that command allocation can
-  never fail.
-*/
-
-static inline DAC960_Command_T *DAC960_AllocateCommand(DAC960_Controller_T
-                                                      *Controller)
-{
-  DAC960_Command_T *Command = Controller->FreeCommands;
-  if (Command == NULL) return NULL;
-  Controller->FreeCommands = Command->Next;
-  Command->Next = NULL;
-  return Command;
-}
-
-
-/*
-  DAC960_DeallocateCommand deallocates Command, returning it to Controller's
-  free list.
-*/
-
-static inline void DAC960_DeallocateCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-
-  Command->Request = NULL;
-  Command->Next = Controller->FreeCommands;
-  Controller->FreeCommands = Command;
-}
-
-
-/*
-  DAC960_WaitForCommand waits for a wake_up on Controller's Command Wait Queue.
-*/
-
-static void DAC960_WaitForCommand(DAC960_Controller_T *Controller)
-{
-  spin_unlock_irq(&Controller->queue_lock);
-  __wait_event(Controller->CommandWaitQueue, Controller->FreeCommands);
-  spin_lock_irq(&Controller->queue_lock);
-}
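
Callers that must not fail pair the allocator with this wait helper: under queue_lock they retry the allocation, and DAC960_WaitForCommand() drops and retakes the lock while sleeping until a command returns to the free list. Roughly:

        /* Sketch of the allocate-or-wait pattern built from the helpers above. */
        DAC960_Command_T *Command;

        spin_lock_irq(&Controller->queue_lock);
        while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
                DAC960_WaitForCommand(Controller); /* drops, then retakes, the lock */
        /* ... fill in and queue Command ... */
        spin_unlock_irq(&Controller->queue_lock);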
-
-/*
-  DAC960_GEM_QueueCommand queues Command for DAC960 GEM Series Controllers.
-*/
-
-static void DAC960_GEM_QueueCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  DAC960_V2_CommandMailbox_T *NextCommandMailbox =
-      Controller->V2.NextCommandMailbox;
-
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_GEM_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-
-  if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
-      DAC960_GEM_MemoryMailboxNewCommand(ControllerBaseAddress);
-
-  Controller->V2.PreviousCommandMailbox2 =
-      Controller->V2.PreviousCommandMailbox1;
-  Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
-
-  if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
-      NextCommandMailbox = Controller->V2.FirstCommandMailbox;
-
-  Controller->V2.NextCommandMailbox = NextCommandMailbox;
-}
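
The BA, LP, LA, and PG variants below repeat the same ring idiom as the GEM version above: write the command into the next memory mailbox, ring the hardware doorbell only if either of the two previously used slots has been consumed (its first word reads back as zero), then advance the wrapping next pointer. Distilled, with types elided and helper names hypothetical:

        /* Distilled form of the mailbox ring idiom (sketch). */
        write_command_mailbox(next, mbox);
        if (prev1->Words[0] == 0 || prev2->Words[0] == 0)
                ring_doorbell(base);    /* hardware is idle or has caught up */
        prev2 = prev1;
        prev1 = next;
        if (++next > last)
                next = first;           /* wrap around the mailbox ring */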
-
-/*
-  DAC960_BA_QueueCommand queues Command for DAC960 BA Series Controllers.
-*/
-
-static void DAC960_BA_QueueCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  DAC960_V2_CommandMailbox_T *NextCommandMailbox =
-    Controller->V2.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_BA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_BA_MemoryMailboxNewCommand(ControllerBaseAddress);
-  Controller->V2.PreviousCommandMailbox2 =
-    Controller->V2.PreviousCommandMailbox1;
-  Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
-    NextCommandMailbox = Controller->V2.FirstCommandMailbox;
-  Controller->V2.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_LP_QueueCommand queues Command for DAC960 LP Series Controllers.
-*/
-
-static void DAC960_LP_QueueCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
-  DAC960_V2_CommandMailbox_T *NextCommandMailbox =
-    Controller->V2.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_LP_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_LP_MemoryMailboxNewCommand(ControllerBaseAddress);
-  Controller->V2.PreviousCommandMailbox2 =
-    Controller->V2.PreviousCommandMailbox1;
-  Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
-    NextCommandMailbox = Controller->V2.FirstCommandMailbox;
-  Controller->V2.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_LA_QueueCommandDualMode queues Command for DAC960 LA Series
-  Controllers with Dual Mode Firmware.
-*/
-
-static void DAC960_LA_QueueCommandDualMode(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  DAC960_V1_CommandMailbox_T *NextCommandMailbox =
-    Controller->V1.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_LA_MemoryMailboxNewCommand(ControllerBaseAddress);
-  Controller->V1.PreviousCommandMailbox2 =
-    Controller->V1.PreviousCommandMailbox1;
-  Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
-    NextCommandMailbox = Controller->V1.FirstCommandMailbox;
-  Controller->V1.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_LA_QueueCommandSingleMode queues Command for DAC960 LA Series
-  Controllers with Single Mode Firmware.
-*/
-
-static void DAC960_LA_QueueCommandSingleMode(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  DAC960_V1_CommandMailbox_T *NextCommandMailbox =
-    Controller->V1.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_LA_HardwareMailboxNewCommand(ControllerBaseAddress);
-  Controller->V1.PreviousCommandMailbox2 =
-    Controller->V1.PreviousCommandMailbox1;
-  Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
-    NextCommandMailbox = Controller->V1.FirstCommandMailbox;
-  Controller->V1.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_PG_QueueCommandDualMode queues Command for DAC960 PG Series
-  Controllers with Dual Mode Firmware.
-*/
-
-static void DAC960_PG_QueueCommandDualMode(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  DAC960_V1_CommandMailbox_T *NextCommandMailbox =
-    Controller->V1.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_PG_MemoryMailboxNewCommand(ControllerBaseAddress);
-  Controller->V1.PreviousCommandMailbox2 =
-    Controller->V1.PreviousCommandMailbox1;
-  Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
-    NextCommandMailbox = Controller->V1.FirstCommandMailbox;
-  Controller->V1.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_PG_QueueCommandSingleMode queues Command for DAC960 PG Series
-  Controllers with Single Mode Firmware.
-*/
-
-static void DAC960_PG_QueueCommandSingleMode(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  DAC960_V1_CommandMailbox_T *NextCommandMailbox =
-    Controller->V1.NextCommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
-  if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
-      Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
-    DAC960_PG_HardwareMailboxNewCommand(ControllerBaseAddress);
-  Controller->V1.PreviousCommandMailbox2 =
-    Controller->V1.PreviousCommandMailbox1;
-  Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
-  if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
-    NextCommandMailbox = Controller->V1.FirstCommandMailbox;
-  Controller->V1.NextCommandMailbox = NextCommandMailbox;
-}
-
-
-/*
-  DAC960_PD_QueueCommand queues Command for DAC960 PD Series Controllers.
-*/
-
-static void DAC960_PD_QueueCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  while (DAC960_PD_MailboxFullP(ControllerBaseAddress))
-    udelay(1);
-  DAC960_PD_WriteCommandMailbox(ControllerBaseAddress, CommandMailbox);
-  DAC960_PD_NewCommand(ControllerBaseAddress);
-}
-
-
-/*
-  DAC960_P_QueueCommand queues Command for DAC960 P Series Controllers.
-*/
-
-static void DAC960_P_QueueCommand(DAC960_Command_T *Command)
-{
-  DAC960_Controller_T *Controller = Command->Controller;
-  void __iomem *ControllerBaseAddress = Controller->BaseAddress;
-  DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
-  CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
-  switch (CommandMailbox->Common.CommandOpcode)
-    {
-    case DAC960_V1_Enquiry:
-      CommandMailbox->Common.CommandOpcode = DAC960_V1_Enquiry_Old;
-      break;
-    case DAC960_V1_GetDeviceState:
-      CommandMailbox->Common.CommandOpcode = DAC960_V1_GetDeviceState_Old;
-      break;
-    case DAC960_V1_Read:
-      CommandMailbox->Common.CommandOpcode = DAC960_V1_Read_Old;
-      DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
-      break;
-    case DAC960_V1_Write:
-      CommandMailbox->Common.CommandOpcode = DAC960_V1_Write_Old;
-      DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
-      break;
-    case DAC960_V1_ReadWithScatterGather:
-      CommandMailbox->Common.CommandOpcode =
-       DAC960_V1_ReadWithScatterGather_Old;
-      DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
-      break;
-    case DAC960_V1_WriteWithScatterGather:
-      CommandMailbox->Common.CommandOpcode =