Fixed MTP to work with TWRP

Commit f6dfaef42e by awab228, 2018-06-19 23:16:04 +02:00
50820 changed files with 20846062 additions and 0 deletions

block/Kconfig (new file, 127 lines)
@@ -0,0 +1,127 @@
#
# Block layer core configuration
#
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
help
Provide block layer support for the kernel.
Disable this option to remove the block layer support from the
kernel. This may be useful for embedded devices.
If this option is disabled:
- block device files will become unusable
- some filesystems (such as ext3) will become unavailable.
Also, SCSI character devices and USB storage will be disabled since
they make use of various block layer definitions and facilities.
Say Y here unless you know you really don't want to mount disks and
suchlike.
if BLOCK
config LBDAF
bool "Support for large (2TB+) block devices and files"
depends on !64BIT
default y
help
Enable block devices or files of size 2TB and larger.
This option is required to support the full capacity of large
(2TB+) block devices, including RAID, disk, Network Block Device,
Logical Volume Manager (LVM) and loopback.
This option also enables support for single files larger than
2TB.
The ext4 filesystem requires that this feature be enabled in
order to support filesystems that have the huge_file feature
enabled. Otherwise, it will refuse to mount in the read-write
mode any filesystems that use the huge_file feature, which is
enabled by default by mke2fs.ext4.
The GFS2 filesystem also requires this feature.
If unsure, say Y.
config BLK_DEV_BSG
bool "Block layer SG support v4"
default y
help
Saying Y here will enable generic SG (SCSI generic) v4 support
for any block device.
Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4
can handle complicated SCSI commands: tagged variable length cdbs
with bidirectional data transfers and generic request/response
protocols (e.g. Task Management Functions and SMP in Serial
Attached SCSI).
This option is required by recent UDEV versions to properly
access device serial numbers, etc.
If unsure, say Y.
config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
default n
select BLK_DEV_BSG
help
Subsystems will normally enable this if needed. Users will not
normally need to manually enable this.
If unsure, say N.
config BLK_DEV_INTEGRITY
bool "Block layer data integrity support"
select CRC_T10DIF if BLK_DEV_INTEGRITY
---help---
Some storage devices allow extra information to be
stored/retrieved to help protect the data. The block layer
data integrity option provides hooks which can be used by
filesystems to ensure better data integrity.
Say yes here if you have a storage device that provides the
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
default n
---help---
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
one needs to mount and use blkio cgroup controller for creating
cgroups and specifying per device IO rate policies.
See Documentation/cgroups/blkio-controller.txt for more information.
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
---help---
Enabling this option allows you to specify the partition layout from
the kernel boot args. This is typically of use for embedded devices
which don't otherwise have any standardized method for listing the
partitions on a block device.
See Documentation/block/cmdline-partition.txt for more information.
menu "Partition Types"
source "block/partitions/Kconfig"
endmenu
endif # BLOCK
config BLOCK_COMPAT
bool
depends on BLOCK && COMPAT
default y
source block/Kconfig.iosched

block/Kconfig.iosched (new file, 68 lines)
@@ -0,0 +1,68 @@
if BLOCK
menu "IO Schedulers"
config IOSCHED_NOOP
bool
default y
---help---
The no-op I/O scheduler is a minimal scheduler that does basic merging
and sorting. Its main uses include non-disk based block devices like
memory devices, and specialised software or hardware environments
that do their own scheduling and require only minimal assistance from
the kernel.
config IOSCHED_DEADLINE
tristate "Deadline I/O scheduler"
default y
---help---
The deadline I/O scheduler is simple and compact. It will provide
CSCAN service with FIFO expiration of requests, switching to
a new point in the service tree and doing a batch of IO from there
in case of expiry.
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
among all processes in the system. It should provide a fair
and low latency working environment, suitable for both desktop
and server systems.
This is the default I/O scheduler.
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
help
Select the I/O scheduler which will be used by default for all
block devices.
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
config DEFAULT_NOOP
bool "No-op"
endchoice
config DEFAULT_IOSCHED
string
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
endmenu
endif
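
The choice above only fixes the built-in default; the elevator can still be switched per device at runtime through sysfs. A minimal user-space sketch, not part of this commit (the device name sda is an assumption):

/* Illustrative only: select the deadline elevator for /dev/sda at runtime. */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/sda/queue/scheduler", O_WRONLY);

	if (fd < 0)
		return 1;
	/* the kernel matches the written string against registered elevators */
	if (write(fd, "deadline", 8) != 8) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}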

block/Makefile (new file, 25 lines)
@@ -0,0 +1,25 @@
#
# Makefile for the kernel block layer
#
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o

block/bio-integrity.c (new file, 517 lines)
@@ -0,0 +1,517 @@
/*
* bio-integrity.c - bio data integrity extensions
*
* Copyright (C) 2007, 2008, 2009 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#define BIP_INLINE_VECS 4
static struct kmem_cache *bip_slab;
static struct workqueue_struct *kintegrityd_wq;
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
* @gfp_mask: Memory allocation mask
* @nr_vecs: Number of integrity metadata scatter-gather elements
*
* Description: This function prepares a bio for attaching integrity
* metadata. nr_vecs specifies the maximum number of pages containing
* integrity metadata that can be attached.
*/
struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
gfp_t gfp_mask,
unsigned int nr_vecs)
{
struct bio_integrity_payload *bip;
struct bio_set *bs = bio->bi_pool;
unsigned long idx = BIO_POOL_NONE;
unsigned inline_vecs;
if (!bs) {
bip = kmalloc(sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * nr_vecs, gfp_mask);
inline_vecs = nr_vecs;
} else {
bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
inline_vecs = BIP_INLINE_VECS;
}
if (unlikely(!bip))
return NULL;
memset(bip, 0, sizeof(*bip));
if (nr_vecs > inline_vecs) {
bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
} else {
bip->bip_vec = bip->bip_inline_vecs;
bip->bip_max_vcnt = inline_vecs;
}
bip->bip_slab = idx;
bip->bip_bio = bio;
bio->bi_integrity = bip;
bio->bi_rw |= REQ_INTEGRITY;
return bip;
err:
mempool_free(bip, bs->bio_integrity_pool);
return NULL;
}
EXPORT_SYMBOL(bio_integrity_alloc);
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
*
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
kfree(page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset);
if (bs) {
if (bip->bip_slab != BIO_POOL_NONE)
bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
bip->bip_slab);
mempool_free(bip, bs->bio_integrity_pool);
} else {
kfree(bip);
}
bio->bi_integrity = NULL;
}
EXPORT_SYMBOL(bio_integrity_free);
/**
* bio_integrity_add_page - Attach integrity metadata
* @bio: bio to update
* @page: page containing integrity metadata
* @len: number of bytes of integrity metadata in page
* @offset: start offset within page
*
* Description: Attach a page containing integrity metadata to bio.
*/
int bio_integrity_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_vec *iv;
if (bip->bip_vcnt >= bip->bip_max_vcnt) {
printk(KERN_ERR "%s: bip_vec full\n", __func__);
return 0;
}
iv = bip->bip_vec + bip->bip_vcnt;
iv->bv_page = page;
iv->bv_len = len;
iv->bv_offset = offset;
bip->bip_vcnt++;
return len;
}
EXPORT_SYMBOL(bio_integrity_add_page);
/**
* bio_integrity_enabled - Check whether integrity can be passed
* @bio: bio to check
*
* Description: Determines whether bio_integrity_prep() can be called
* on this bio or not. bio data direction and target device must be
* set prior to calling. The function honors the write_generate and
* read_verify flags in sysfs.
*/
bool bio_integrity_enabled(struct bio *bio)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
if (!bio_is_rw(bio))
return false;
/* Already protected? */
if (bio_integrity(bio))
return false;
if (bi == NULL)
return false;
if (bio_data_dir(bio) == READ && bi->verify_fn != NULL &&
(bi->flags & BLK_INTEGRITY_VERIFY))
return true;
if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL &&
(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
return false;
}
EXPORT_SYMBOL(bio_integrity_enabled);
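/*
 * Typical call pattern in the submission path (illustrative sketch only,
 * not part of this file): probe with bio_integrity_enabled() before
 * paying the cost of bio_integrity_prep(), and fail the bio if the
 * protection buffer cannot be attached:
 *
 *	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 *		bio_endio(bio, -EIO);
 *		return;
 *	}
 */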
/**
* bio_integrity_intervals - Return number of integrity intervals for a bio
* @bi: blk_integrity profile for device
* @sectors: Size of the bio in 512-byte sectors
*
* Description: The block layer calculates everything in 512 byte
* sectors but integrity metadata is done in terms of the data integrity
* interval size of the storage device. Convert the block layer sectors
* to the appropriate number of integrity intervals.
*/
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
unsigned int sectors)
{
return sectors >> (ilog2(bi->interval) - 9);
}
static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
unsigned int sectors)
{
return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
}
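/*
 * Worked example (illustrative): with a 4096-byte integrity interval and an
 * 8-byte tuple, a 32-sector (16 KiB) bio spans 32 >> (ilog2(4096) - 9) = 4
 * intervals, i.e. 4 * 8 = 32 bytes of protection information.
 */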
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
* @proc_fn: Pointer to the relevant processing function
*/
static int bio_integrity_process(struct bio *bio,
integrity_processing_fn *proc_fn)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
struct bio_integrity_payload *bip = bio_integrity(bio);
unsigned int ret = 0;
void *prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = bi->interval;
iter.seed = bip_get_seed(bip);
iter.prot_buf = prot_buf;
bio_for_each_segment(bv, bio, bviter) {
void *kaddr = kmap_atomic(bv.bv_page);
iter.data_buf = kaddr + bv.bv_offset;
iter.data_size = bv.bv_len;
ret = proc_fn(&iter);
if (ret) {
kunmap_atomic(kaddr);
return ret;
}
kunmap_atomic(kaddr);
}
return ret;
}
/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
*
* Description: Allocates a buffer for integrity metadata, maps the
* pages and attaches them to a bio. The bio must have data
* direction, target device and start sector set prior to calling. In
* the WRITE case, integrity metadata will be generated using the
* block device's integrity function. In the READ case, the buffer
* will be prepared for DMA and a suitable end_io handler set up.
*/
int bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct blk_integrity *bi;
struct request_queue *q;
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
unsigned int intervals;
bi = bdev_get_integrity(bio->bi_bdev);
q = bdev_get_queue(bio->bi_bdev);
BUG_ON(bi == NULL);
BUG_ON(bio_integrity(bio));
intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
len = intervals * bi->tuple_size;
buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
return -ENOMEM;
}
end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ((unsigned long) buf) >> PAGE_SHIFT;
nr_pages = end - start;
/* Allocate bio integrity payload and integrity vectors */
bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
if (unlikely(bip == NULL)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
return -EIO;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
bip->bip_iter.bi_size = len;
bip_set_seed(bip, bio->bi_iter.bi_sector);
if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
bip->bip_flags |= BIP_IP_CHECKSUM;
/* Map it */
offset = offset_in_page(buf);
for (i = 0 ; i < nr_pages ; i++) {
int ret;
bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);
if (ret == 0)
return 0;
if (ret < bytes)
break;
buf += bytes;
len -= bytes;
offset = 0;
}
/* Install custom I/O completion handler if read verify is enabled */
if (bio_data_dir(bio) == READ) {
bip->bip_end_io = bio->bi_end_io;
bio->bi_end_io = bio_integrity_endio;
}
/* Auto-generate integrity metadata if this is a write */
if (bio_data_dir(bio) == WRITE)
bio_integrity_process(bio, bi->generate_fn);
return 0;
}
EXPORT_SYMBOL(bio_integrity_prep);
/**
* bio_integrity_verify_fn - Integrity I/O completion worker
* @work: Work struct stored in bio to be verified
*
* Description: This workqueue function is called to complete a READ
* request. The function verifies the transferred integrity metadata
* and then calls the original bio end_io function.
*/
static void bio_integrity_verify_fn(struct work_struct *work)
{
struct bio_integrity_payload *bip =
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
int error;
error = bio_integrity_process(bio, bi->verify_fn);
/* Restore original bio completion handler */
bio->bi_end_io = bip->bip_end_io;
bio_endio_nodec(bio, error);
}
/**
* bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
* Normally I/O completion is done in interrupt context. However,
* verifying I/O integrity is a time-consuming task which must be run
* in process context. This function postpones completion
* accordingly.
*/
void bio_integrity_endio(struct bio *bio, int error)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
BUG_ON(bip->bip_bio != bio);
/* In case of an I/O error there is no point in verifying the
* integrity metadata. Restore original bio end_io handler
* and run it.
*/
if (error) {
bio->bi_end_io = bip->bip_end_io;
bio_endio_nodec(bio, error);
return;
}
INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bip->bip_work);
}
EXPORT_SYMBOL(bio_integrity_endio);
/**
* bio_integrity_advance - Advance integrity vector
* @bio: bio whose integrity vector to update
* @bytes_done: number of data bytes that have been completed
*
* Description: This function calculates how many integrity bytes the
* number of completed data bytes correspond to and advances the
* integrity vector accordingly.
*/
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
EXPORT_SYMBOL(bio_integrity_advance);
/**
* bio_integrity_trim - Trim integrity vector
* @bio: bio whose integrity vector to update
* @offset: offset to first data sector
* @sectors: number of data sectors
*
* Description: Used to trim the integrity vector in a cloned bio.
* The ivec will be advanced corresponding to 'offset' data sectors
* and the length will be truncated corresponding to 'len' data
* sectors.
*/
void bio_integrity_trim(struct bio *bio, unsigned int offset,
unsigned int sectors)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
bio_integrity_advance(bio, offset << 9);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
}
EXPORT_SYMBOL(bio_integrity_trim);
/**
* bio_integrity_clone - Callback for cloning bios with integrity metadata
* @bio: New bio
* @bio_src: Original bio
* @gfp_mask: Memory allocation mask
*
* Description: Called to allocate a bip when cloning a bio
*/
int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
gfp_t gfp_mask)
{
struct bio_integrity_payload *bip_src = bio_integrity(bio_src);
struct bio_integrity_payload *bip;
BUG_ON(bip_src == NULL);
bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
if (bip == NULL)
return -EIO;
memcpy(bip->bip_vec, bip_src->bip_vec,
bip_src->bip_vcnt * sizeof(struct bio_vec));
bip->bip_vcnt = bip_src->bip_vcnt;
bip->bip_iter = bip_src->bip_iter;
return 0;
}
EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
if (bs->bio_integrity_pool)
return 0;
bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
if (!bs->bio_integrity_pool)
return -1;
bs->bvec_integrity_pool = biovec_create_pool(pool_size);
if (!bs->bvec_integrity_pool) {
mempool_destroy(bs->bio_integrity_pool);
return -1;
}
return 0;
}
EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
if (bs->bio_integrity_pool)
mempool_destroy(bs->bio_integrity_pool);
if (bs->bvec_integrity_pool)
mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
void __init bio_integrity_init(void)
{
/*
* kintegrityd won't block much but may burn a lot of CPU cycles.
* Make it highpri CPU intensive wq with max concurrency of 1.
*/
kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
if (!kintegrityd_wq)
panic("Failed to create kintegrityd\n");
bip_slab = kmem_cache_create("bio_integrity_payload",
sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * BIP_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
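
Callers that build their own protection buffer instead of relying on bio_integrity_prep() use the exported helpers above roughly as follows. This is a minimal sketch, not part of the commit; the function name attach_pi and the pi_page/pi_len/pi_offset parameters are hypothetical, and the bio is assumed to already have its data pages and target device set:

#include <linux/bio.h>
#include <linux/gfp.h>

/* Illustrative sketch: attach one page of caller-generated protection data. */
static int attach_pi(struct bio *bio, struct page *pi_page,
		     unsigned int pi_len, unsigned int pi_offset)
{
	struct bio_integrity_payload *bip;

	/* one integrity vector is enough for a single page of PI */
	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
	if (!bip)
		return -ENOMEM;

	bip->bip_iter.bi_size = pi_len;
	bip_set_seed(bip, bio->bi_iter.bi_sector);

	if (bio_integrity_add_page(bio, pi_page, pi_len, pi_offset) < pi_len)
		return -EIO;	/* bip_vec full; caller tears down the bio */

	return 0;
}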

block/bio.c (new file, 2080 lines): diff suppressed because it is too large

block/blk-cgroup.c (new file, 1160 lines): diff suppressed because it is too large

block/blk-cgroup.h (new file, 603 lines)
@@ -0,0 +1,603 @@
#ifndef _BLK_CGROUP_H
#define _BLK_CGROUP_H
/*
* Common Block IO controller cgroup interface
*
* Based on ideas and code from CFQ, CFS and BFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
*/
#include <linux/cgroup.h>
#include <linux/u64_stats_sync.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
#include <linux/atomic.h>
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX
/* CFQ specific, out here for blkcg->cfq_weight */
#define CFQ_WEIGHT_MIN 10
#define CFQ_WEIGHT_MAX 1000
#define CFQ_WEIGHT_DEFAULT 500
#ifdef CONFIG_BLK_CGROUP
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
BLKG_RWSTAT_WRITE,
BLKG_RWSTAT_SYNC,
BLKG_RWSTAT_ASYNC,
BLKG_RWSTAT_NR,
BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
};
struct blkcg_gq;
struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
struct radix_tree_root blkg_tree;
struct blkcg_gq *blkg_hint;
struct hlist_head blkg_list;
/* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight;
};
struct blkg_stat {
struct u64_stats_sync syncp;
uint64_t cnt;
};
struct blkg_rwstat {
struct u64_stats_sync syncp;
uint64_t cnt[BLKG_RWSTAT_NR];
};
/*
* A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
* request_queue (q). This is used by blkcg policies which need to track
* information per blkcg - q pair.
*
* There can be multiple active blkcg policies and each has its private
* data on each blkg, the size of which is determined by
* blkcg_policy->pd_size. blkcg core allocates and frees such areas
* together with blkg and invokes pd_init/exit_fn() methods.
*
* Such private data must embed struct blkg_policy_data (pd) at the
* beginning and pd_size can't be smaller than pd.
*/
struct blkg_policy_data {
/* the blkg and policy id this per-policy data belongs to */
struct blkcg_gq *blkg;
int plid;
/* used during policy activation */
struct list_head alloc_node;
};
/* association between a blk cgroup and a request queue */
struct blkcg_gq {
/* Pointer to the associated request_queue */
struct request_queue *q;
struct list_head q_node;
struct hlist_node blkcg_node;
struct blkcg *blkcg;
/* all non-root blkcg_gq's are guaranteed to have access to parent */
struct blkcg_gq *parent;
/* request allocation list for this blkcg-q pair */
struct request_list rl;
/* reference count */
atomic_t refcnt;
/* is this blkg online? protected by both blkcg and q locks */
bool online;
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
struct rcu_head rcu_head;
};
typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
struct blkcg_policy {
int plid;
/* policy specific private data size */
size_t pd_size;
/* cgroup files for the policy */
struct cftype *cftypes;
/* operations */
blkcg_pol_init_pd_fn *pd_init_fn;
blkcg_pol_online_pd_fn *pd_online_fn;
blkcg_pol_offline_pd_fn *pd_offline_fn;
blkcg_pol_exit_pd_fn *pd_exit_fn;
blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
};
extern struct blkcg blkcg_root;
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q);
int blkcg_init_queue(struct request_queue *q);
void blkcg_drain_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);
/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
struct blkg_policy_data *, int),
const struct blkcg_policy *pol, int data,
bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
const struct blkg_rwstat *rwstat);
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
int off);
u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
int off);
struct blkg_conf_ctx {
struct gendisk *disk;
struct blkcg_gq *blkg;
u64 v;
};
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
const char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct blkcg, css) : NULL;
}
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
{
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
return task_blkcg(current);
}
/**
* blkcg_parent - get the parent of a blkcg
* @blkcg: blkcg of interest
*
* Return the parent blkcg of @blkcg. Can be called anytime.
*/
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
return css_to_blkcg(blkcg->css.parent);
}
/**
* blkg_to_pd - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
*
* Return pointer to private data associated with the @blkg-@pol pair.
*/
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol)
{
return blkg ? blkg->pd[pol->plid] : NULL;
}
/**
* pd_to_blkg - get blkg associated with policy private data
* @pd: policy private data of interest
*
* @pd is policy private data. Determine the blkg it's associated with.
*/
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
return pd ? pd->blkg : NULL;
}
/**
* blkg_path - format cgroup path of blkg
* @blkg: blkg of interest
* @buf: target buffer
* @buflen: target buffer length
*
* Format the path of the cgroup of @blkg into @buf.
*/
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
char *p;
p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
if (!p) {
strncpy(buf, "<unavailable>", buflen);
return -ENAMETOOLONG;
}
memmove(buf, p, buf + buflen - p);
return 0;
}
/**
* blkg_get - get a blkg reference
* @blkg: blkg to get
*
* The caller should be holding an existing reference.
*/
static inline void blkg_get(struct blkcg_gq *blkg)
{
WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
atomic_inc(&blkg->refcnt);
}
void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
* @blkg: blkg to put
*/
static inline void blkg_put(struct blkcg_gq *blkg)
{
WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
if (atomic_dec_and_test(&blkg->refcnt))
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_css by calling css_rightmost_descendant() to skip subtree.
* @p_blkg is included in the iteration and the first node to be visited.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
/**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Similar to blkg_for_each_descendant_pre() but performs post-order
* traversal instead. Synchronization rules are the same. @p_blkg is
* included in the iteration and the last node to be visited.
*/
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
* @bio: bio which will be attached to the allocated request (may be %NULL)
*
* The caller wants to allocate a request from @q to use for @bio. Find
* the request_list to use and obtain a reference on it. Should be called
* under queue_lock. This function is guaranteed to return non-%NULL
* request_list.
*/
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio)
{
struct blkcg *blkcg;
struct blkcg_gq *blkg;
rcu_read_lock();
blkcg = bio_blkcg(bio);
/* bypass blkg lookup and use @q->root_rl directly for root */
if (blkcg == &blkcg_root)
goto root_rl;
/*
* Try to use blkg->rl. blkg lookup may fail under memory pressure
* or if either the blkcg or queue is going away. Fall back to
* root_rl in such cases.
*/
blkg = blkg_lookup_create(blkcg, q);
if (unlikely(IS_ERR(blkg)))
goto root_rl;
blkg_get(blkg);
rcu_read_unlock();
return &blkg->rl;
root_rl:
rcu_read_unlock();
return &q->root_rl;
}
/**
* blk_put_rl - put request_list
* @rl: request_list to put
*
* Put the reference acquired by blk_get_rl(). Should be called under
* queue_lock.
*/
static inline void blk_put_rl(struct request_list *rl)
{
/* root_rl may not have blkg set */
if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
blkg_put(rl->blkg);
}
/**
* blk_rq_set_rl - associate a request with a request_list
* @rq: request of interest
* @rl: target request_list
*
* Associate @rq with @rl so that accounting and freeing can know the
* request_list @rq came from.
*/
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
{
rq->rl = rl;
}
/**
* blk_rq_rl - return the request_list a request came from
* @rq: request of interest
*
* Return the request_list @rq is allocated from.
*/
static inline struct request_list *blk_rq_rl(struct request *rq)
{
return rq->rl;
}
struct request_list *__blk_queue_next_rl(struct request_list *rl,
struct request_queue *q);
/**
* blk_queue_for_each_rl - iterate through all request_lists of a request_queue
*
* Should be used under queue_lock.
*/
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
static inline void blkg_stat_init(struct blkg_stat *stat)
{
u64_stats_init(&stat->syncp);
}
/**
* blkg_stat_add - add a value to a blkg_stat
* @stat: target blkg_stat
* @val: value to add
*
* Add @val to @stat. The caller is responsible for synchronizing calls to
* this function.
*/
static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
{
u64_stats_update_begin(&stat->syncp);
stat->cnt += val;
u64_stats_update_end(&stat->syncp);
}
/**
* blkg_stat_read - read the current value of a blkg_stat
* @stat: blkg_stat to read
*
* Read the current value of @stat. This function can be called without
* synchronization and takes care of u64 atomicity.
*/
static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
{
unsigned int start;
uint64_t v;
do {
start = u64_stats_fetch_begin_irq(&stat->syncp);
v = stat->cnt;
} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
return v;
}
/**
* blkg_stat_reset - reset a blkg_stat
* @stat: blkg_stat to reset
*/
static inline void blkg_stat_reset(struct blkg_stat *stat)
{
stat->cnt = 0;
}
/**
* blkg_stat_merge - merge a blkg_stat into another
* @to: the destination blkg_stat
* @from: the source
*
* Add @from's count to @to.
*/
static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
{
blkg_stat_add(to, blkg_stat_read(from));
}
static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
{
u64_stats_init(&rwstat->syncp);
}
/**
* blkg_rwstat_add - add a value to a blkg_rwstat
* @rwstat: target blkg_rwstat
* @rw: mask of REQ_{WRITE|SYNC}
* @val: value to add
*
* Add @val to @rwstat. The counters are chosen according to @rw. The
* caller is responsible for synchronizing calls to this function.
*/
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
int rw, uint64_t val)
{
u64_stats_update_begin(&rwstat->syncp);
if (rw & REQ_WRITE)
rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
else
rwstat->cnt[BLKG_RWSTAT_READ] += val;
if (rw & REQ_SYNC)
rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
else
rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
u64_stats_update_end(&rwstat->syncp);
}
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
* Read and return the current snapshot of @rwstat. This function can be
* called without synchronization and takes care of u64 atomicity.
*/
static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
{
unsigned int start;
struct blkg_rwstat tmp;
do {
start = u64_stats_fetch_begin_irq(&rwstat->syncp);
tmp = *rwstat;
} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
return tmp;
}
/**
* blkg_rwstat_total - read the total count of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
* Return the total count of @rwstat regardless of the IO direction. This
* function can be called without synchronization and takes care of u64
* atomicity.
*/
static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
{
struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
}
/**
* blkg_rwstat_reset - reset a blkg_rwstat
* @rwstat: blkg_rwstat to reset
*/
static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
{
memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
}
/**
* blkg_rwstat_merge - merge a blkg_rwstat into another
* @to: the destination blkg_rwstat
* @from: the source
*
* Add @from's counts to @to.
*/
static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
struct blkg_rwstat *from)
{
struct blkg_rwstat v = blkg_rwstat_read(from);
int i;
u64_stats_update_begin(&to->syncp);
for (i = 0; i < BLKG_RWSTAT_NR; i++)
to->cnt[i] += v.cnt[i];
u64_stats_update_end(&to->syncp);
}
#else /* CONFIG_BLK_CGROUP */
struct cgroup;
struct blkcg;
struct blkg_policy_data {
};
struct blkcg_gq {
};
struct blkcg_policy {
};
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
static inline void blkcg_exit_queue(struct request_queue *q) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio) { return &q->root_rl; }
static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
#endif /* CONFIG_BLK_CGROUP */
#endif /* _BLK_CGROUP_H */
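
A blkcg policy built on the interface above registers itself roughly as shown below. This is a minimal, hypothetical sketch (the demo_* names are invented; cgroup files, stats and error handling are omitted):

#include <linux/module.h>
#include "blk-cgroup.h"

/* Per-blkg private data; struct blkg_policy_data must come first. */
struct demo_pd {
	struct blkg_policy_data pd;
	u64 ios;
};

static struct blkcg_policy demo_policy;

static void demo_pd_init(struct blkcg_gq *blkg)
{
	struct demo_pd *dpd = container_of(blkg_to_pd(blkg, &demo_policy),
					   struct demo_pd, pd);

	dpd->ios = 0;
}

static struct blkcg_policy demo_policy = {
	.pd_size	= sizeof(struct demo_pd),
	.pd_init_fn	= demo_pd_init,
};

static int __init demo_init(void)
{
	/* blkcg_policy_register() assigns ->plid on success */
	return blkcg_policy_register(&demo_policy);
}

static void __exit demo_exit(void)
{
	blkcg_policy_unregister(&demo_policy);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");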

block/blk-core.c (new file, 3326 lines): diff suppressed because it is too large

block/blk-exec.c (new file, 143 lines)
@@ -0,0 +1,143 @@
/*
* Functions for executing prepared requests and waiting for their completion
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
/*
* for max sense size
*/
#include <scsi/scsi_cmnd.h>
/**
* blk_end_sync_rq - executes a completion event on a request
* @rq: request to complete
* @error: end I/O status of the request
*/
static void blk_end_sync_rq(struct request *rq, int error)
{
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
/*
* complete last; if this is a stacked request, the process (and thus
* the rq pointer) could be invalid right after this complete()
*/
complete(waiting);
}
/**
* blk_execute_rq_nowait - insert a request into queue for execution
* @q: queue to insert the request in
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
* @done: I/O completion handler
*
* Description:
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution. Don't wait for completion.
*
* Note:
* This function will invoke @done directly if the queue is dead.
*/
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head,
rq_end_io_fn *done)
{
int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
bool is_pm_resume;
WARN_ON(irqs_disabled());
WARN_ON(rq->cmd_type == REQ_TYPE_FS);
rq->rq_disk = bd_disk;
rq->end_io = done;
/*
* don't check dying flag for MQ because the request won't
* be reused after the dying flag is set
*/
if (q->mq_ops) {
blk_mq_insert_request(rq, at_head, true, false);
return;
}
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
*/
is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
rq->cmd_flags |= REQ_QUIET;
rq->errors = -ENXIO;
__blk_end_request_all(rq, rq->errors);
spin_unlock_irq(q->queue_lock);
return;
}
__elv_add_request(q, rq, where);
__blk_run_queue(q);
/* the queue is stopped so it won't be run */
if (is_pm_resume)
__blk_run_queue_uncond(q);
spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
/**
* blk_execute_rq - insert a request into queue for execution
* @q: queue to insert the request in
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
*
* Description:
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
char sense[SCSI_SENSE_BUFFERSIZE];
int err = 0;
unsigned long hang_check;
if (!rq->sense) {
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
}
rq->end_io_data = &wait;
blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
else
wait_for_completion_io(&wait);
if (rq->errors)
err = -EIO;
if (rq->sense == sense) {
rq->sense = NULL;
rq->sense_len = 0;
}
return err;
}
EXPORT_SYMBOL(blk_execute_rq);
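
A typical caller allocates a request, fills it in, and lets blk_execute_rq() sleep until it completes. A hedged sketch, not from this tree (the demo_send_cmd name is invented; it assumes the ~3.18-era API in which blk_get_request() returns an ERR_PTR on failure, and that cmd_len fits in the request's inline CDB):

#include <linux/blkdev.h>
#include <linux/string.h>

/* Illustrative: issue a prepared passthrough command and wait for it. */
static int demo_send_cmd(struct request_queue *q, struct gendisk *disk,
			 const unsigned char *cmd, unsigned int cmd_len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	memcpy(rq->cmd, cmd, cmd_len);	/* caller ensures cmd_len <= BLK_MAX_CDB */
	rq->cmd_len = cmd_len;
	rq->timeout = 60 * HZ;

	err = blk_execute_rq(q, disk, rq, 0);	/* sleeps until completion */

	blk_put_request(rq);
	return err;
}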

block/blk-flush.c (new file, 529 lines)
@@ -0,0 +1,529 @@
/*
* Functions to sequence FLUSH and FUA writes.
*
* Copyright (C) 2011 Max Planck Institute for Gravitational Physics
* Copyright (C) 2011 Tejun Heo <tj@kernel.org>
*
* This file is released under the GPLv2.
*
* REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
* optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
* properties and hardware capability.
*
* If a request doesn't have data, only REQ_FLUSH makes sense, which
* indicates a simple flush request. If there is data, REQ_FLUSH indicates
* that the device cache should be flushed before the data is executed, and
* REQ_FUA means that the data must be on non-volatile media on request
* completion.
*
* If the device doesn't have writeback cache, FLUSH and FUA don't make any
* difference. The requests are either completed immediately if there's no
* data or executed as normal requests otherwise.
*
* If the device has writeback cache and supports FUA, REQ_FLUSH is
* translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
*
* If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
* translated to PREFLUSH and REQ_FUA to POSTFLUSH.
*
* The actual execution of flush is double buffered. Whenever a request
* needs to execute PRE or POSTFLUSH, it queues at
* fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a
* flush is issued and the pending_idx is toggled. When the flush
* completes, all the requests which were pending are proceeded to the next
* step. This allows arbitrary merging of different types of FLUSH/FUA
* requests.
*
* Currently, the following conditions are used to determine when to issue
* flush.
*
* C1. At any given time, only one flush shall be in progress. This makes
* double buffering sufficient.
*
* C2. Flush is deferred if any request is executing DATA of its sequence.
* This avoids issuing separate POSTFLUSHes for requests which shared
* PREFLUSH.
*
* C3. The second condition is ignored if there is a request which has
* waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
* starvation in the unlikely case where there are continuous stream of
* FUA (without FLUSH) requests.
*
* For devices which support FUA, it isn't clear whether C2 (and thus C3)
* is beneficial.
*
* Note that a sequenced FLUSH/FUA request with DATA is completed twice.
* Once while executing DATA and again after the whole sequence is
* complete. The first completion updates the contained bio but doesn't
* finish it so that the bio submitter is notified only after the whole
* sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
* req_bio_endio().
*
* The above peculiarity requires that each FLUSH/FUA request has only one
* bio attached to it, which is guaranteed as they aren't allowed to be
* merged in the usual way.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
REQ_FSEQ_DONE = (1 << 3),
REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
REQ_FSEQ_POSTFLUSH,
/*
* If flush has been pending longer than the following timeout,
* it's issued even if flush_data requests are still in flight.
*/
FLUSH_PENDING_TIMEOUT = 5 * HZ,
};
static bool blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq);
static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
unsigned int policy = 0;
if (blk_rq_sectors(rq))
policy |= REQ_FSEQ_DATA;
if (fflags & REQ_FLUSH) {
if (rq->cmd_flags & REQ_FLUSH)
policy |= REQ_FSEQ_PREFLUSH;
if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
policy |= REQ_FSEQ_POSTFLUSH;
}
return policy;
}
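/*
 * Worked example (illustrative): on a queue advertising REQ_FLUSH | REQ_FUA,
 * a data-carrying write with REQ_FLUSH | REQ_FUA set yields
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA (FUA is passed through, so no POSTFLUSH).
 * On a queue advertising only REQ_FLUSH, the same request yields
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH.
 */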
static unsigned int blk_flush_cur_seq(struct request *rq)
{
return 1 << ffz(rq->flush.seq);
}
static void blk_flush_restore_request(struct request *rq)
{
/*
* After flush data completion, @rq->bio is %NULL but we need to
* complete the bio again. @rq->biotail is guaranteed to equal the
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
}
static bool blk_flush_queue_rq(struct request *rq, bool add_front)
{
if (rq->q->mq_ops) {
struct request_queue *q = rq->q;
blk_mq_add_to_requeue_list(rq, add_front);
blk_mq_kick_requeue_list(q);
return false;
} else {
if (add_front)
list_add(&rq->queuelist, &rq->q->queue_head);
else
list_add_tail(&rq->queuelist, &rq->q->queue_head);
return true;
}
}
/**
* blk_flush_complete_seq - complete flush sequence
* @rq: FLUSH/FUA request being sequenced
* @fq: flush queue
* @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
* @error: whether an error occurred
*
* @rq just completed @seq part of its flush sequence, record the
* completion and trigger the next step.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
*/
static bool blk_flush_complete_seq(struct request *rq,
struct blk_flush_queue *fq,
unsigned int seq, int error)
{
struct request_queue *q = rq->q;
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
if (likely(!error))
seq = blk_flush_cur_seq(rq);
else
seq = REQ_FSEQ_DONE;
switch (seq) {
case REQ_FSEQ_PREFLUSH:
case REQ_FSEQ_POSTFLUSH:
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies;
list_move_tail(&rq->flush.list, pending);
break;
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
queued = blk_flush_queue_rq(rq, true);
break;
case REQ_FSEQ_DONE:
/*
* @rq was previously adjusted by blk_insert_flush() for
* flush sequencing and may already have gone through the
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
if (q->mq_ops)
blk_mq_end_request(rq, error);
else
__blk_end_request_all(rq, error);
break;
default:
BUG();
}
kicked = blk_kick_flush(q, fq);
return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
bool queued = false;
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
if (q->mq_ops) {
spin_lock_irqsave(&fq->mq_flush_lock, flags);
flush_rq->tag = -1;
}
running = &fq->flush_queue[fq->flush_running_idx];
BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
/* account completion of the flush request */
fq->flush_running_idx ^= 1;
if (!q->mq_ops)
elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
queued |= blk_flush_complete_seq(rq, fq, seq, error);
}
/*
* Kick the queue to avoid stall for two cases:
* 1. Moving a request silently to empty queue_head may stall the
* queue.
* 2. When flush request is running in non-queueable queue, the
* queue is held. Restart the queue after the flush request is finished
* to avoid stall.
* This function is called from request completion path and calling
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
if (queued || fq->flush_queue_delayed) {
WARN_ON(q->mq_ops);
blk_run_queue_async(q);
}
fq->flush_queue_delayed = 0;
if (q->mq_ops)
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
/**
* blk_kick_flush - consider issuing flush request
* @q: request_queue being kicked
* @fq: flush queue
*
* Flush related states of @q have changed, consider issuing flush request.
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
*/
static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
list_first_entry(pending, struct request, flush.list);
struct request *flush_rq = fq->flush_rq;
/* C1 described at the top of this file */
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return false;
/* C2 and C3 */
if (!list_empty(&fq->flush_data_in_flight) &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return false;
/*
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
fq->flush_pending_idx ^= 1;
blk_rq_init(q, flush_rq);
/*
* Borrow tag from the first request since they can't
* be in flight at the same time.
*/
if (q->mq_ops) {
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
}
flush_rq->cmd_type = REQ_TYPE_FS;
flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
return blk_flush_queue_rq(flush_rq, false);
}
static void flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
blk_run_queue_async(q);
}
static void mq_flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
blk_mq_run_hw_queue(hctx, true);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions,
* or from __blk_mq_run_hw_queue() when dispatching a request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned int fflags = q->flush_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
/*
* @policy now records what operations need to be done. Adjust
* REQ_FLUSH and FUA for the driver.
*/
rq->cmd_flags &= ~REQ_FLUSH;
if (!(fflags & REQ_FUA))
rq->cmd_flags &= ~REQ_FUA;
/*
* An empty flush handed down from a stacking driver may
* translate into nothing if the underlying device does not
* advertise a write-back cache. In this case, simply
* complete the request.
*/
if (!policy) {
if (q->mq_ops)
blk_mq_end_request(rq, 0);
else
__blk_end_bidi_request(rq, 0, 0, 0);
return;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
/*
* If there's data but flush is not necessary, the request can be
* processed directly without going through flush machinery. Queue
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops) {
blk_mq_insert_request(rq, false, false, true);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
/*
* @rq should go through flush machinery. Mark it part of flush
* sequence and submit for further processing.
*/
memset(&rq->flush, 0, sizeof(rq->flush));
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
if (q->mq_ops) {
rq->end_io = mq_flush_data_end_io;
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
return;
}
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
* @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
* wish to. If the WAIT flag is not passed, the caller may only check
* whether the request was pushed onto some internal queue for later handling.
*/
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
{
struct request_queue *q;
struct bio *bio;
int ret = 0;
if (bdev->bd_disk == NULL)
return -ENXIO;
q = bdev_get_queue(bdev);
if (!q)
return -ENXIO;
/*
* some block devices may not have their queue correctly set up here
* (e.g. loop device without a backing file) and so issuing a flush
* here will panic. Ensure there is a request function before issuing
* the flush.
*/
if (!q->make_request_fn)
return -ENXIO;
bio = bio_alloc(gfp_mask, 0);
bio->bi_bdev = bdev;
ret = submit_bio_wait(WRITE_FLUSH, bio);
/*
* The driver must store the error location in ->bi_sector, if
* it supports it. For non-stacked drivers, this should be
* copied from blk_rq_pos(rq).
*/
if (error_sector)
*error_sector = bio->bi_iter.bi_sector;
bio_put(bio);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
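/*
 * Typical use (illustrative, not from this file): a filesystem flushing its
 * backing device's write-back cache, e.g. from its ->sync_fs() path:
 *
 *	err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
 */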
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
int node, int cmd_size)
{
struct blk_flush_queue *fq;
int rq_sz = sizeof(struct request);
fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
if (!fq)
goto fail;
if (q->mq_ops) {
spin_lock_init(&fq->mq_flush_lock);
rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
}
fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
if (!fq->flush_rq)
goto fail_rq;
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
return fq;
fail_rq:
kfree(fq);
fail:
return NULL;
}
void blk_free_flush_queue(struct blk_flush_queue *fq)
{
/* bio-based request queues have no flush queue */
if (!fq)
return;
kfree(fq->flush_rq);
kfree(fq);
}

block/blk-integrity.c (new file, 483 lines)
@@ -0,0 +1,483 @@
/*
* blk-integrity.c - Block layer data integrity extensions
*
* Copyright (C) 2007, 2008 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
#include <linux/export.h>
#include <linux/slab.h>
#include "blk.h"
static struct kmem_cache *integrity_cachep;
static const char *bi_unsupported_name = "unsupported";
/**
* blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
* @q: request queue
* @bio: bio with integrity metadata attached
*
* Description: Returns the number of elements required in a
* scatterlist corresponding to the integrity metadata in a bio.
*/
int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
{
struct bio_vec iv, ivprv = { NULL };
unsigned int segments = 0;
unsigned int seg_size = 0;
struct bvec_iter iter;
int prev = 0;
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
goto new_segment;
if (seg_size + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
seg_size += iv.bv_len;
} else {
new_segment:
segments++;
seg_size = iv.bv_len;
}
prev = 1;
ivprv = iv;
}
return segments;
}
EXPORT_SYMBOL(blk_rq_count_integrity_sg);
/**
* blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
* @q: request queue
* @bio: bio with integrity metadata attached
* @sglist: target scatterlist
*
* Description: Map the integrity vectors in request into a
* scatterlist. The scatterlist must be big enough to hold all
* elements. I.e. sized using blk_rq_count_integrity_sg().
*/
int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist)
{
struct bio_vec iv, ivprv = { NULL };
struct scatterlist *sg = NULL;
unsigned int segments = 0;
struct bvec_iter iter;
int prev = 0;
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
goto new_segment;
if (sg->length + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
sg->length += iv.bv_len;
} else {
new_segment:
if (!sg)
sg = sglist;
else {
sg_unmark_end(sg);
sg = sg_next(sg);
}
sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);
segments++;
}
prev = 1;
ivprv = iv;
}
if (sg)
sg_mark_end(sg);
return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
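/*
 * Usage sketch (illustrative, not from the original file): the two helpers
 * above are normally used as a pair - count the integrity segments first,
 * then fill a scatterlist of that size. example_map_prot() is hypothetical.
 */
static int example_map_prot(struct request_queue *q, struct request *rq,
			    struct scatterlist *prot_sglist, int max_ents)
{
	int nr;

	nr = blk_rq_count_integrity_sg(q, rq->bio);
	if (nr > max_ents)
		return -EIO;	/* does not fit the preallocated table */

	sg_init_table(prot_sglist, nr);
	return blk_rq_map_integrity_sg(q, rq->bio, prot_sglist);
}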
/**
* blk_integrity_compare - Compare integrity profile of two disks
* @gd1: Disk to compare
* @gd2: Disk to compare
*
* Description: Meta-devices like DM and MD need to verify that all
* sub-devices use the same integrity format before advertising to
* upper layers that they can send/receive integrity metadata. This
* function can be used to check whether two gendisk devices have
* compatible integrity formats.
*/
int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
{
struct blk_integrity *b1 = gd1->integrity;
struct blk_integrity *b2 = gd2->integrity;
if (!b1 && !b2)
return 0;
if (!b1 || !b2)
return -1;
if (b1->interval != b2->interval) {
pr_err("%s: %s/%s protection interval %u != %u\n",
__func__, gd1->disk_name, gd2->disk_name,
b1->interval, b2->interval);
return -1;
}
if (b1->tuple_size != b2->tuple_size) {
printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->tuple_size, b2->tuple_size);
return -1;
}
if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->tag_size, b2->tag_size);
return -1;
}
if (strcmp(b1->name, b2->name)) {
printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->name, b2->name);
return -1;
}
return 0;
}
EXPORT_SYMBOL(blk_integrity_compare);
bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
struct request *next)
{
if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
return true;
if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
return false;
if (bio_integrity(req->bio)->bip_flags !=
bio_integrity(next->bio)->bip_flags)
return false;
if (req->nr_integrity_segments + next->nr_integrity_segments >
q->limits.max_integrity_segments)
return false;
return true;
}
EXPORT_SYMBOL(blk_integrity_merge_rq);
bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
struct bio *bio)
{
int nr_integrity_segs;
struct bio *next = bio->bi_next;
if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL)
return true;
if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL)
return false;
if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
return false;
bio->bi_next = NULL;
nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
bio->bi_next = next;
if (req->nr_integrity_segments + nr_integrity_segs >
q->limits.max_integrity_segments)
return false;
req->nr_integrity_segments += nr_integrity_segs;
return true;
}
EXPORT_SYMBOL(blk_integrity_merge_bio);
struct integrity_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_integrity *, char *);
ssize_t (*store)(struct blk_integrity *, const char *, size_t);
};
static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
struct integrity_sysfs_entry *entry =
container_of(attr, struct integrity_sysfs_entry, attr);
return entry->show(bi, page);
}
static ssize_t integrity_attr_store(struct kobject *kobj,
struct attribute *attr, const char *page,
size_t count)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
struct integrity_sysfs_entry *entry =
container_of(attr, struct integrity_sysfs_entry, attr);
ssize_t ret = 0;
if (entry->store)
ret = entry->store(bi, page, count);
return ret;
}
static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
{
if (bi != NULL && bi->name != NULL)
return sprintf(page, "%s\n", bi->name);
else
return sprintf(page, "none\n");
}
static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
{
if (bi != NULL)
return sprintf(page, "%u\n", bi->tag_size);
else
return sprintf(page, "0\n");
}
static ssize_t integrity_verify_store(struct blk_integrity *bi,
const char *page, size_t count)
{
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
if (val)
bi->flags |= BLK_INTEGRITY_VERIFY;
else
bi->flags &= ~BLK_INTEGRITY_VERIFY;
return count;
}
static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0);
}
static ssize_t integrity_generate_store(struct blk_integrity *bi,
const char *page, size_t count)
{
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
if (val)
bi->flags |= BLK_INTEGRITY_GENERATE;
else
bi->flags &= ~BLK_INTEGRITY_GENERATE;
return count;
}
static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0);
}
static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%u\n",
(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0);
}
static struct integrity_sysfs_entry integrity_format_entry = {
.attr = { .name = "format", .mode = S_IRUGO },
.show = integrity_format_show,
};
static struct integrity_sysfs_entry integrity_tag_size_entry = {
.attr = { .name = "tag_size", .mode = S_IRUGO },
.show = integrity_tag_size_show,
};
static struct integrity_sysfs_entry integrity_verify_entry = {
.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
.show = integrity_verify_show,
.store = integrity_verify_store,
};
static struct integrity_sysfs_entry integrity_generate_entry = {
.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
.show = integrity_generate_show,
.store = integrity_generate_store,
};
static struct integrity_sysfs_entry integrity_device_entry = {
.attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
.show = integrity_device_show,
};
static struct attribute *integrity_attrs[] = {
&integrity_format_entry.attr,
&integrity_tag_size_entry.attr,
&integrity_verify_entry.attr,
&integrity_generate_entry.attr,
&integrity_device_entry.attr,
NULL,
};
static const struct sysfs_ops integrity_ops = {
.show = &integrity_attr_show,
.store = &integrity_attr_store,
};
static int __init blk_dev_integrity_init(void)
{
integrity_cachep = kmem_cache_create("blkdev_integrity",
sizeof(struct blk_integrity),
0, SLAB_PANIC, NULL);
return 0;
}
subsys_initcall(blk_dev_integrity_init);
static void blk_integrity_release(struct kobject *kobj)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
kmem_cache_free(integrity_cachep, bi);
}
static struct kobj_type integrity_ktype = {
.default_attrs = integrity_attrs,
.sysfs_ops = &integrity_ops,
.release = blk_integrity_release,
};
bool blk_integrity_is_initialized(struct gendisk *disk)
{
struct blk_integrity *bi = blk_get_integrity(disk);
return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
}
EXPORT_SYMBOL(blk_integrity_is_initialized);
/**
* blk_integrity_register - Register a gendisk as being integrity-capable
* @disk: struct gendisk pointer to make integrity-aware
* @template: optional integrity profile to register
*
* Description: When a device needs to advertise itself as being able
* to send/receive integrity metadata it must use this function to
* register the capability with the block layer. The template is a
* blk_integrity struct with values appropriate for the underlying
* hardware. If template is NULL the new profile is allocated but
* not filled out. See Documentation/block/data-integrity.txt.
*/
int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
{
struct blk_integrity *bi;
BUG_ON(disk == NULL);
if (disk->integrity == NULL) {
bi = kmem_cache_alloc(integrity_cachep,
GFP_KERNEL | __GFP_ZERO);
if (!bi)
return -1;
if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
&disk_to_dev(disk)->kobj,
"%s", "integrity")) {
kmem_cache_free(integrity_cachep, bi);
return -1;
}
kobject_uevent(&bi->kobj, KOBJ_ADD);
bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE;
bi->interval = queue_logical_block_size(disk->queue);
disk->integrity = bi;
} else
bi = disk->integrity;
/* Use the provided profile as template */
if (template != NULL) {
bi->name = template->name;
bi->generate_fn = template->generate_fn;
bi->verify_fn = template->verify_fn;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
bi->flags |= template->flags;
} else
bi->name = bi_unsupported_name;
disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
return 0;
}
EXPORT_SYMBOL(blk_integrity_register);
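/*
 * Usage sketch (illustrative, not from the original file): a driver whose
 * hardware can generate/verify T10 protection information would typically
 * register a filled-in template at probe time. All example_* names are
 * hypothetical; the callbacks are left NULL purely to keep the sketch
 * self-contained.
 */
static struct blk_integrity example_integrity_template = {
	.name		= "EXAMPLE-DIF-TYPE1-CRC",
	.tuple_size	= 8,	/* guard + app + ref tag per interval */
	.tag_size	= 0,
	/* .generate_fn / .verify_fn would point at the driver's PI helpers */
};

static void example_register_integrity(struct gendisk *disk)
{
	if (blk_integrity_register(disk, &example_integrity_template))
		pr_warn("%s: failed to register integrity profile\n",
			disk->disk_name);
}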
/**
* blk_integrity_unregister - Remove block integrity profile
* @disk: disk whose integrity profile to deallocate
*
* Description: This function frees all memory used by the block
* integrity profile. To be called at device teardown.
*/
void blk_integrity_unregister(struct gendisk *disk)
{
struct blk_integrity *bi;
if (!disk || !disk->integrity)
return;
disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES;
bi = disk->integrity;
kobject_uevent(&bi->kobj, KOBJ_REMOVE);
kobject_del(&bi->kobj);
kobject_put(&bi->kobj);
disk->integrity = NULL;
}
EXPORT_SYMBOL(blk_integrity_unregister);

407
block/blk-ioc.c Normal file
View file

@ -0,0 +1,407 @@
/*
* Functions related to io context handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
}
EXPORT_SYMBOL(get_io_context);
static void icq_free_icq_rcu(struct rcu_head *head)
{
struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
kmem_cache_free(icq->__rcu_icq_cache, icq);
}
/* Exit an icq. Called with both ioc and q locked. */
static void ioc_exit_icq(struct io_cq *icq)
{
struct elevator_type *et = icq->q->elevator->type;
if (icq->flags & ICQ_EXITED)
return;
if (et->ops.elevator_exit_icq_fn)
et->ops.elevator_exit_icq_fn(icq);
icq->flags |= ICQ_EXITED;
}
/* Release an icq. Called with both ioc and q locked. */
static void ioc_destroy_icq(struct io_cq *icq)
{
struct io_context *ioc = icq->ioc;
struct request_queue *q = icq->q;
struct elevator_type *et = q->elevator->type;
lockdep_assert_held(&ioc->lock);
lockdep_assert_held(q->queue_lock);
radix_tree_delete(&ioc->icq_tree, icq->q->id);
hlist_del_init(&icq->ioc_node);
list_del_init(&icq->q_node);
/*
* Both setting lookup hint to and clearing it from @icq are done
* under queue_lock. If it's not pointing to @icq now, it never
* will. Hint assignment itself can race safely.
*/
if (rcu_access_pointer(ioc->icq_hint) == icq)
rcu_assign_pointer(ioc->icq_hint, NULL);
ioc_exit_icq(icq);
/*
* @icq->q might have gone away by the time RCU callback runs
* making it impossible to determine icq_cache. Record it in @icq.
*/
icq->__rcu_icq_cache = et->icq_cache;
call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}
/*
* Slow path for ioc release in put_io_context(). Performs double-lock
* dancing to unlink all icq's and then frees ioc.
*/
static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
unsigned long flags;
/*
* Exiting icq may call into put_io_context() through elevator
* which will trigger lockdep warning. The ioc's are guaranteed to
* be different, use a different locking subclass here. Use
* irqsave variant as there's no spin_lock_irq_nested().
*/
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
struct io_cq, ioc_node);
struct request_queue *q = icq->q;
if (spin_trylock(q->queue_lock)) {
ioc_destroy_icq(icq);
spin_unlock(q->queue_lock);
} else {
spin_unlock_irqrestore(&ioc->lock, flags);
cpu_relax();
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
}
}
spin_unlock_irqrestore(&ioc->lock, flags);
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL(put_io_context);
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
unsigned long flags;
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
/*
* Need ioc lock to walk icq_list and q lock to exit icq. Perform
* reverse double locking. Read comment in ioc_release_fn() for
* explanation on the nested locking annotation.
*/
retry:
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
if (spin_trylock(icq->q->queue_lock)) {
ioc_exit_icq(icq);
spin_unlock(icq->q->queue_lock);
} else {
spin_unlock_irqrestore(&ioc->lock, flags);
cpu_relax();
goto retry;
}
}
spin_unlock_irqrestore(&ioc->lock, flags);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
/**
* ioc_clear_queue - break any ioc association with the specified queue
* @q: request_queue being cleared
*
* Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
*/
void ioc_clear_queue(struct request_queue *q)
{
lockdep_assert_held(q->queue_lock);
while (!list_empty(&q->icq_list)) {
struct io_cq *icq = list_entry(q->icq_list.next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock(&ioc->lock);
ioc_destroy_icq(icq);
spin_unlock(&ioc->lock);
}
}
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
{
struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
return -ENOMEM;
/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
/*
* Try to install. ioc shouldn't be installed if someone else
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
*/
task_lock(task);
if (!task->io_context &&
(task == current || !(task->flags & PF_EXITING)))
task->io_context = ioc;
else
kmem_cache_free(iocontext_cachep, ioc);
ret = task->io_context ? 0 : -EBUSY;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfp_flags & __GFP_WAIT);
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
}
EXPORT_SYMBOL(get_task_io_context);
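/*
 * Usage sketch (illustrative, not from the original file): an I/O
 * scheduler or driver that wants per-task context looks it up (creating
 * it on first use) and must drop the reference with put_io_context().
 * example_touch_ioc() is hypothetical.
 */
static void example_touch_ioc(struct task_struct *tsk)
{
	struct io_context *ioc;

	ioc = get_task_io_context(tsk, GFP_NOIO, NUMA_NO_NODE);
	if (!ioc)
		return;		/* allocation failed */

	/* ... inspect or update per-task I/O state here ... */

	put_io_context(ioc);
}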
/**
* ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
struct io_cq *icq;
lockdep_assert_held(q->queue_lock);
/*
* icq's are indexed from @ioc using radix tree and hint pointer,
* both of which are protected with RCU. All removals are done
* holding both q and ioc locks, and we're holding q lock - if we
 * find an icq which points to us, it's guaranteed to be valid.
*/
rcu_read_lock();
icq = rcu_dereference(ioc->icq_hint);
if (icq && icq->q == q)
goto out;
icq = radix_tree_lookup(&ioc->icq_tree, q->id);
if (icq && icq->q == q)
rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
else
icq = NULL;
out:
rcu_read_unlock();
return icq;
}
EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest
* @gfp_mask: allocation mask
*
 * Make sure an io_cq linking @ioc and @q exists. If it doesn't exist, it
 * will be created using @gfp_mask.
*
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask)
{
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
icq->ioc = ioc;
icq->q = q;
INIT_LIST_HEAD(&icq->q_node);
INIT_HLIST_NODE(&icq->ioc_node);
/* lock both q and ioc and try to link @icq */
spin_lock_irq(q->queue_lock);
spin_lock(&ioc->lock);
if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
hlist_add_head(&icq->ioc_node, &ioc->icq_list);
list_add(&icq->q_node, &q->icq_list);
if (et->ops.elevator_init_icq_fn)
et->ops.elevator_init_icq_fn(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
spin_unlock(&ioc->lock);
spin_unlock_irq(q->queue_lock);
radix_tree_preload_end();
return icq;
}
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
sizeof(struct io_context), 0, SLAB_PANIC, NULL);
return 0;
}
subsys_initcall(blk_ioc_init);

224
block/blk-iopoll.c Normal file
View file

@ -0,0 +1,224 @@
/*
* Functions related to interrupt-poll handling in the block layer. This
* is similar to NAPI for network devices.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/blk-iopoll.h>
#include <linux/delay.h>
#include "blk.h"
static unsigned int blk_iopoll_budget __read_mostly = 256;
static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
/**
* blk_iopoll_sched - Schedule a run of the iopoll handler
* @iop: The parent iopoll structure
*
* Description:
* Add this blk_iopoll structure to the pending poll list and trigger the
* raise of the blk iopoll softirq. The driver must already have gotten a
* successful return from blk_iopoll_sched_prep() before calling this.
**/
void blk_iopoll_sched(struct blk_iopoll *iop)
{
unsigned long flags;
local_irq_save(flags);
list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_iopoll_sched);
/**
* __blk_iopoll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
* See blk_iopoll_complete(). This function must be called with interrupts
* disabled.
**/
void __blk_iopoll_complete(struct blk_iopoll *iop)
{
list_del(&iop->list);
smp_mb__before_atomic();
clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(__blk_iopoll_complete);
/**
* blk_iopoll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
* If a driver consumes less than the assigned budget in its run of the
* iopoll handler, it'll end the polled mode by calling this function. The
* iopoll handler will not be invoked again before blk_iopoll_sched_prep()
* is called.
**/
void blk_iopoll_complete(struct blk_iopoll *iop)
{
unsigned long flags;
local_irq_save(flags);
__blk_iopoll_complete(iop);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_iopoll_complete);
static void blk_iopoll_softirq(struct softirq_action *h)
{
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = blk_iopoll_budget;
unsigned long start_time = jiffies;
local_irq_disable();
while (!list_empty(list)) {
struct blk_iopoll *iop;
int work, weight;
/*
* If softirq window is exhausted then punt.
*/
if (budget <= 0 || time_after(jiffies, start_time)) {
rearm = 1;
break;
}
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
iop = list_entry(list->next, struct blk_iopoll, list);
weight = iop->weight;
work = 0;
if (test_bit(IOPOLL_F_SCHED, &iop->state))
work = iop->poll(iop, weight);
budget -= work;
local_irq_disable();
/*
 * Drivers must not modify the iopoll state if they
 * consume their assigned weight (or more - some drivers
 * can't easily just stop processing, they have to
 * complete an entire mask of commands). In such cases
 * this code still "owns" the iopoll instance and
 * therefore can move the instance around on the list
 * at will.
if (work >= weight) {
if (blk_iopoll_disable_pending(iop))
__blk_iopoll_complete(iop);
else
list_move_tail(&iop->list, list);
}
}
if (rearm)
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
/**
* blk_iopoll_disable - Disable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Disable io polling and wait for any pending callbacks to have completed.
**/
void blk_iopoll_disable(struct blk_iopoll *iop)
{
set_bit(IOPOLL_F_DISABLE, &iop->state);
while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
msleep(1);
clear_bit(IOPOLL_F_DISABLE, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_disable);
/**
* blk_iopoll_enable - Enable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Enable iopoll on this @iop. Note that the handler run will not be
* scheduled, it will only mark it as active.
**/
void blk_iopoll_enable(struct blk_iopoll *iop)
{
BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
smp_mb__before_atomic();
clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_enable);
/**
* blk_iopoll_init - Initialize this @iop
* @iop: The parent iopoll structure
* @weight: The default weight (or command completion budget)
* @poll_fn: The handler to invoke
*
* Description:
* Initialize this blk_iopoll structure. Before being actively used, the
* driver must call blk_iopoll_enable().
**/
void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
{
memset(iop, 0, sizeof(*iop));
INIT_LIST_HEAD(&iop->list);
iop->weight = weight;
iop->poll = poll_fn;
set_bit(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_init);
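/*
 * Usage sketch (illustrative, not from the original file): the typical
 * driver pattern is init + enable at setup time, switching to polled mode
 * from the interrupt handler, and completing back to IRQ mode from the
 * poll callback. All example_* names are hypothetical;
 * blk_iopoll_sched_prep() is the inline helper from <linux/blk-iopoll.h>
 * referred to in the comments above.
 */
static int example_poll(struct blk_iopoll *iop, int budget)
{
	int done = 0;

	/* ... reap up to @budget completed commands here ... */

	if (done < budget)
		blk_iopoll_complete(iop);	/* hand back to IRQ mode */
	return done;
}

static irqreturn_t example_irq(int irq, void *data)
{
	struct blk_iopoll *iop = data;

	if (blk_iopoll_sched_prep(iop))
		blk_iopoll_sched(iop);		/* defer work to the softirq */
	return IRQ_HANDLED;
}

static void example_setup(struct blk_iopoll *iop)
{
	blk_iopoll_init(iop, 32, example_poll);	/* weight of 32 is arbitrary */
	blk_iopoll_enable(iop);
}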
static int blk_iopoll_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
return NOTIFY_OK;
}
static struct notifier_block blk_iopoll_cpu_notifier = {
.notifier_call = blk_iopoll_cpu_notify,
};
static __init int blk_iopoll_setup(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
return 0;
}
subsys_initcall(blk_iopoll_setup);

307
block/blk-lib.c Normal file
View file

@ -0,0 +1,307 @@
/*
 * Functions related to generic block layer helpers
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include "blk.h"
struct bio_batch {
atomic_t done;
unsigned long flags;
struct completion *wait;
};
static void bio_batch_end_io(struct bio *bio, int err)
{
struct bio_batch *bb = bio->bi_private;
if (err && (err != -EOPNOTSUPP))
clear_bit(BIO_UPTODATE, &bb->flags);
if (atomic_dec_and_test(&bb->done))
complete(bb->wait);
bio_put(bio);
}
/**
* blkdev_issue_discard - queue a discard
* @bdev: blockdev to issue discard for
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
* @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
*/
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD | REQ_PRIO;
unsigned int max_discard_sectors, granularity;
int alignment;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
struct blk_plug plug;
if (!q)
return -ENXIO;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
/*
* Ensure that max_discard_sectors is of the proper
* granularity, so that requests stay aligned after a split.
*/
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* Avoid infinite loop below. Being cautious never hurts. */
return -EOPNOTSUPP;
}
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
blk_start_plug(&plug);
while (nr_sects) {
unsigned int req_sects;
sector_t end_sect, tmp;
bio = bio_alloc(gfp_mask, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
/*
* If splitting a request, and the next starting sector would be
* misaligned, stop the discard at the previous aligned sector.
*/
end_sect = sector + req_sects;
tmp = end_sect;
if (req_sects < nr_sects &&
sector_div(tmp, granularity) != alignment) {
end_sect = end_sect - alignment;
sector_div(end_sect, granularity);
end_sect = end_sect * granularity + alignment;
req_sects = end_sect - sector;
}
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
bio->bi_private = &bb;
bio->bi_iter.bi_size = req_sects << 9;
nr_sects -= req_sects;
sector = end_sect;
atomic_inc(&bb.done);
submit_bio(type, bio);
/*
* We can loop for a long time in here, if someone does
* full device discards (like mkfs). Be nice and allow
* us to schedule out to avoid softlocking if preempt
* is disabled.
*/
cond_resched();
}
blk_finish_plug(&plug);
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
ret = -EIO;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_discard);
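/*
 * Usage sketch (illustrative, not from the original file): how a
 * filesystem might discard a freed extent. Sectors are 512-byte units;
 * example_discard_extent() is hypothetical.
 */
static int example_discard_extent(struct block_device *bdev,
				  sector_t start, sector_t nr_sects)
{
	int ret;

	ret = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* device cannot discard - safe to ignore */
	return ret;
}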
/**
* blkdev_issue_write_same - queue a write same operation
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data to write
*
* Description:
* Issue a write same request for the sectors in question.
*/
int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask,
struct page *page)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_write_same_sectors;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
if (!q)
return -ENXIO;
max_write_same_sectors = q->limits.max_write_same_sectors;
if (max_write_same_sectors == 0)
return -EOPNOTSUPP;
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
while (nr_sects) {
bio = bio_alloc(gfp_mask, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
bio->bi_private = &bb;
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
if (nr_sects > max_write_same_sectors) {
bio->bi_iter.bi_size = max_write_same_sectors << 9;
nr_sects -= max_write_same_sectors;
sector += max_write_same_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
atomic_inc(&bb.done);
submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio);
}
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
ret = -ENOTSUPP;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
 * __blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
 * Generate and issue a number of bios with zero-filled pages.
*/
static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
int ret;
struct bio *bio;
struct bio_batch bb;
unsigned int sz;
DECLARE_COMPLETION_ONSTACK(wait);
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
ret = 0;
while (nr_sects != 0) {
bio = bio_alloc(gfp_mask,
min(nr_sects, (sector_t)BIO_MAX_PAGES));
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio->bi_end_io = bio_batch_end_io;
bio->bi_private = &bb;
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9, nr_sects);
ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
nr_sects -= ret >> 9;
sector += ret >> 9;
if (ret < (sz << 9))
break;
}
ret = 0;
atomic_inc(&bb.done);
submit_bio(WRITE, bio);
}
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
/* One of bios in the batch was completed with error.*/
ret = -EIO;
return ret;
}
/**
* blkdev_issue_zeroout - zero-fill a block range
* @bdev: blockdev to write
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
 * Generate and issue a number of bios with zero-filled pages.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
if (bdev_write_same(bdev)) {
unsigned char bdn[BDEVNAME_SIZE];
if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
ZERO_PAGE(0)))
return 0;
bdevname(bdev, bdn);
pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
}
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
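/*
 * Usage sketch (illustrative, not from the original file): zeroing a
 * range before exposing it, e.g. when initializing on-disk metadata. The
 * helper prefers WRITE SAME and silently falls back to plain zero-filled
 * writes, so the caller does not care which path was taken.
 * example_zero_range() is hypothetical.
 */
static int example_zero_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS);
}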

328
block/blk-map.c Normal file
View file

@ -0,0 +1,328 @@
/*
* Functions related to mapping data to requests
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include "blk.h"
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (!rq->bio)
blk_rq_bio_prep(q, rq, bio);
else if (!ll_back_merge_fn(q, rq, bio))
return -EINVAL;
else {
rq->biotail->bi_next = bio;
rq->biotail = bio;
rq->__data_len += bio->bi_iter.bi_size;
}
return 0;
}
static int __blk_rq_unmap_user(struct bio *bio)
{
int ret = 0;
if (bio) {
if (bio_flagged(bio, BIO_USER_MAPPED))
bio_unmap_user(bio);
else
ret = bio_uncopy_user(bio);
}
return ret;
}
static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned int len, gfp_t gfp_mask)
{
unsigned long uaddr;
struct bio *bio, *orig_bio;
int reading, ret;
reading = rq_data_dir(rq) == READ;
/*
* if alignment requirement is satisfied, map in user pages for
* direct dma. else, set up kernel bounce buffers
*/
uaddr = (unsigned long) ubuf;
if (blk_rq_aligned(q, uaddr, len) && !map_data)
bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
else
bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (map_data && map_data->null_mapped)
bio->bi_flags |= (1 << BIO_NULL_MAPPED);
orig_bio = bio;
blk_queue_bounce(q, &bio);
/*
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
bio_get(bio);
ret = blk_rq_append_bio(q, rq, bio);
if (!ret)
return bio->bi_iter.bi_size;
/* if it was bounced we must call the end io function */
bio_endio(bio, 0);
__blk_rq_unmap_user(orig_bio);
bio_put(bio);
return ret;
}
/**
* blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request structure to fill
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @ubuf: the user buffer
* @len: length of user data
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* before being submitted to the device, as pages mapped may be out of
 * reach. It's the caller's responsibility to make sure this happens. The
* original bio must be passed back in to blk_rq_unmap_user() for proper
* unmapping.
*/
int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
unsigned long bytes_read = 0;
struct bio *bio = NULL;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
return -EINVAL;
if (!len)
return -EINVAL;
if (!ubuf && (!map_data || !map_data->null_mapped))
return -EINVAL;
while (bytes_read != len) {
unsigned long map_len, end, start;
map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
>> PAGE_SHIFT;
start = (unsigned long)ubuf >> PAGE_SHIFT;
/*
* A bad offset could cause us to require BIO_MAX_PAGES + 1
* pages. If this happens we just lower the requested
* mapping len by a page so that we can fit
*/
if (end - start > BIO_MAX_PAGES)
map_len -= PAGE_SIZE;
ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
gfp_mask);
if (ret < 0)
goto unmap_rq;
if (!bio)
bio = rq->bio;
bytes_read += ret;
ubuf += ret;
if (map_data)
map_data->offset += ret;
}
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
return 0;
unmap_rq:
blk_rq_unmap_user(bio);
rq->bio = NULL;
return ret;
}
EXPORT_SYMBOL(blk_rq_map_user);
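/*
 * Usage sketch (illustrative, not from the original file): mapping a user
 * buffer into a passthrough (REQ_TYPE_BLOCK_PC) request, executing it and
 * unmapping afterwards. example_user_pc_io() is hypothetical;
 * blk_get_request()/blk_execute_rq() are the usual companions, and in
 * this kernel generation blk_get_request() is assumed to return an
 * ERR_PTR on failure.
 */
static int example_user_pc_io(struct request_queue *q, struct gendisk *disk,
			      void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int ret;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* rq->cmd / rq->cmd_len would be filled in with the actual CDB here */

	ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
	if (ret)
		goto out;
	bio = rq->bio;	/* keep the original bio for blk_rq_unmap_user() */

	ret = blk_execute_rq(q, disk, rq, 0);

	if (blk_rq_unmap_user(bio) && !ret)
		ret = -EFAULT;
out:
	blk_put_request(rq);
	return ret;
}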
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to map data to
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iov: pointer to the iovec
* @iov_count: number of elements in the iovec
* @len: I/O byte count
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* before being submitted to the device, as pages mapped may be out of
 * reach. It's the caller's responsibility to make sure this happens. The
* original bio must be passed back in to blk_rq_unmap_user() for proper
* unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, const struct sg_iovec *iov,
int iov_count, unsigned int len, gfp_t gfp_mask)
{
struct bio *bio;
int i, read = rq_data_dir(rq) == READ;
int unaligned = 0;
if (!iov || iov_count <= 0)
return -EINVAL;
for (i = 0; i < iov_count; i++) {
unsigned long uaddr = (unsigned long)iov[i].iov_base;
if (!iov[i].iov_len)
return -EINVAL;
/*
* Keep going so we check length of all segments
*/
if (uaddr & queue_dma_alignment(q))
unaligned = 1;
}
if (unaligned || (q->dma_pad_mask & len) || map_data)
bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
gfp_mask);
else
bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (bio->bi_iter.bi_size != len) {
/*
* Grab an extra reference to this bio, as bio_unmap_user()
* expects to be able to drop it twice as it happens on the
* normal IO completion path
*/
bio_get(bio);
bio_endio(bio, 0);
__blk_rq_unmap_user(bio);
return -EINVAL;
}
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
blk_queue_bounce(q, &bio);
bio_get(bio);
blk_rq_bio_prep(q, rq, bio);
return 0;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
/**
* blk_rq_unmap_user - unmap a request with user data
* @bio: start of bio list
*
* Description:
* Unmap a rq previously mapped by blk_rq_map_user(). The caller must
* supply the original rq->bio from the blk_rq_map_user() return, since
* the I/O completion may have changed rq->bio.
*/
int blk_rq_unmap_user(struct bio *bio)
{
struct bio *mapped_bio;
int ret = 0, ret2;
while (bio) {
mapped_bio = bio;
if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
mapped_bio = bio->bi_private;
ret2 = __blk_rq_unmap_user(mapped_bio);
if (ret2 && !ret)
ret = ret2;
mapped_bio = bio;
bio = bio->bi_next;
bio_put(mapped_bio);
}
return ret;
}
EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly if possible. Otherwise a bounce
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
unsigned int len, gfp_t gfp_mask)
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
struct bio *bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
if (do_copy)
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (!reading)
bio->bi_rw |= REQ_WRITE;
if (do_copy)
rq->cmd_flags |= REQ_COPY_USER;
ret = blk_rq_append_bio(q, rq, bio);
if (unlikely(ret)) {
/* request is too big */
bio_put(bio);
return ret;
}
blk_queue_bounce(q, &rq->bio);
return 0;
}
EXPORT_SYMBOL(blk_rq_map_kern);
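/*
 * Usage sketch (illustrative, not from the original file): sending a
 * kernel buffer with a passthrough request. Stack buffers are handled
 * transparently - blk_rq_map_kern() notices them and copies through a
 * bounce buffer. example_kern_pc_io() is hypothetical, and
 * blk_get_request() is again assumed to return an ERR_PTR on failure.
 */
static int example_kern_pc_io(struct request_queue *q, struct gendisk *disk,
			      void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	rq = blk_get_request(q, WRITE, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* rq->cmd / rq->cmd_len would carry the actual command here */

	ret = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!ret)
		ret = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return ret;
}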

627
block/blk-merge.c Normal file
View file

@ -0,0 +1,627 @@
/*
* Functions related to segment and merge handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include "blk.h"
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio,
bool no_sg_merge)
{
struct bio_vec bv, bvprv = { NULL };
int cluster, high, highprv = 1;
unsigned int seg_size, nr_phys_segs;
struct bio *fbio, *bbio;
struct bvec_iter iter;
if (!bio)
return 0;
/*
* This should probably be returning 0, but blk_add_request_payload()
* (Christoph!!!!)
*/
if (bio->bi_rw & REQ_DISCARD)
return 1;
if (bio->bi_rw & REQ_WRITE_SAME)
return 1;
fbio = bio;
cluster = blk_queue_cluster(q);
seg_size = 0;
nr_phys_segs = 0;
high = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) {
/*
* If SG merging is disabled, each bio vector is
* a segment
*/
if (no_sg_merge)
goto new_segment;
/*
* the trick here is making sure that a high page is
* never considered part of another segment, since
* that might change with the bounce page.
*/
high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
if (!high && !highprv && cluster) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
goto new_segment;
seg_size += bv.bv_len;
bvprv = bv;
continue;
}
new_segment:
if (nr_phys_segs == 1 && seg_size >
fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
nr_phys_segs++;
bvprv = bv;
seg_size = bv.bv_len;
highprv = high;
}
bbio = bio;
}
if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size;
return nr_phys_segs;
}
void blk_recalc_rq_segments(struct request *rq)
{
bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
&rq->q->queue_flags);
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
no_sg_merge);
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
unsigned short seg_cnt;
/* estimate segment number by bi_vcnt for non-cloned bio */
if (bio_flagged(bio, BIO_CLONED))
seg_cnt = bio_segments(bio);
else
seg_cnt = bio->bi_vcnt;
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
bio->bi_phys_segments = seg_cnt;
else {
struct bio *nxt = bio->bi_next;
bio->bi_next = NULL;
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
bio->bi_next = nxt;
}
bio->bi_flags |= (1 << BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);
static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
struct bio *nxt)
{
struct bio_vec end_bv = { NULL }, nxt_bv;
struct bvec_iter iter;
if (!blk_queue_cluster(q))
return 0;
if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
queue_max_segment_size(q))
return 0;
if (!bio_has_data(bio))
return 1;
bio_for_each_segment(end_bv, bio, iter)
if (end_bv.bv_len == iter.bi_size)
break;
nxt_bv = bio_iovec(nxt);
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
return 0;
/*
* bio and nxt are contiguous in memory; check if the queue allows
* these two to be merged into one
*/
if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
return 1;
return 0;
}
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
struct scatterlist **sg, int *nsegs, int *cluster)
{
int nbytes = bvec->bv_len;
if (*sg && *cluster) {
if ((*sg)->length + nbytes > queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
goto new_segment;
(*sg)->length += nbytes;
} else {
new_segment:
if (!*sg)
*sg = sglist;
else {
/*
* If the driver previously mapped a shorter
* list, we could see a termination bit
* prematurely unless it fully inits the sg
* table on each mapping. We KNOW that there
* must be more entries here or the driver
* would be buggy, so force clear the
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
}
*bvprv = *bvec;
}
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
int nsegs, cluster;
nsegs = 0;
cluster = blk_queue_cluster(q);
if (bio->bi_rw & REQ_DISCARD) {
/*
* This is a hack - drivers should be neither modifying the
* biovec, nor relying on bi_vcnt - but because of
* blk_add_request_payload(), a discard bio may or may not have
* a payload we need to set up here (thank you Christoph) and
* bi_vcnt is really the only way of telling if we need to.
*/
if (bio->bi_vcnt)
goto single_segment;
return 0;
}
if (bio->bi_rw & REQ_WRITE_SAME) {
single_segment:
*sg = sglist;
bvec = bio_iovec(bio);
sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
return 1;
}
for_each_bio(bio)
bio_for_each_segment(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs, &cluster);
return nsegs;
}
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
int blk_rq_map_sg(struct request_queue *q, struct request *rq,
struct scatterlist *sglist)
{
struct scatterlist *sg = NULL;
int nsegs = 0;
if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
(blk_rq_bytes(rq) & q->dma_pad_mask)) {
unsigned int pad_len =
(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
sg->length += pad_len;
rq->extra_len += pad_len;
}
if (q->dma_drain_size && q->dma_drain_needed(rq)) {
if (rq->cmd_flags & REQ_WRITE)
memset(q->dma_drain_buffer, 0, q->dma_drain_size);
sg->page_link &= ~0x02;
sg = sg_next(sg);
sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
q->dma_drain_size,
((unsigned long)q->dma_drain_buffer) &
(PAGE_SIZE - 1));
nsegs++;
rq->extra_len += q->dma_drain_size;
}
if (sg)
sg_mark_end(sg);
return nsegs;
}
EXPORT_SYMBOL(blk_rq_map_sg);
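/*
 * Usage sketch (illustrative, not from the original file): a driver
 * preparing a request for DMA sizes its scatterlist by
 * rq->nr_phys_segments and hands the count returned here to the DMA
 * mapping API. example_prep_sglist() is hypothetical.
 */
static int example_prep_sglist(struct request_queue *q, struct request *rq,
			       struct scatterlist *sglist, int max_ents)
{
	int nents;

	if (rq->nr_phys_segments > max_ents)
		return -EIO;	/* table too small for this request */

	sg_init_table(sglist, rq->nr_phys_segments);
	nents = blk_rq_map_sg(q, rq, sglist);

	/* nents <= nr_phys_segments: adjacent bvecs may have been merged */
	return nents;
}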
/**
* blk_bio_map_sg - map a bio to a scatterlist
* @q: request_queue in question
* @bio: bio being mapped
* @sglist: scatterlist being mapped
*
* Note:
* Caller must make sure sg can hold bio->bi_phys_segments entries
*
* Will return the number of sg entries setup
*/
int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist)
{
struct scatterlist *sg = NULL;
int nsegs;
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
bio->bi_next = next;
if (sg)
sg_mark_end(sg);
BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
return nsegs;
}
EXPORT_SYMBOL(blk_bio_map_sg);
static inline int ll_new_hw_segment(struct request_queue *q,
struct request *req,
struct bio *bio)
{
int nr_phys_segs = bio_phys_segments(q, bio);
if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
goto no_merge;
if (blk_integrity_merge_bio(q, req, bio) == false)
goto no_merge;
/*
* This will form the start of a new hw segment. Bump both
* counters.
*/
req->nr_phys_segments += nr_phys_segs;
return 1;
no_merge:
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
if (!bio_flagged(req->biotail, BIO_SEG_VALID))
blk_recount_segments(q, req->biotail);
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
return ll_new_hw_segment(q, req, bio);
}
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
if (!bio_flagged(req->bio, BIO_SEG_VALID))
blk_recount_segments(q, req->bio);
return ll_new_hw_segment(q, req, bio);
}
/*
 * blk-mq uses req->special to carry normal driver per-request payload; it
* does not indicate a prepared command that we cannot merge with.
*/
static bool req_no_special_merge(struct request *req)
{
struct request_queue *q = req->q;
return !q->mq_ops && req->special;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
int total_phys_segments;
unsigned int seg_size =
req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
/*
	 * First check whether either of the requests is a re-queued
	 * request. We can't merge them if they are.
*/
if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
/*
* Will it become too large?
*/
if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
blk_rq_get_max_sectors(req))
return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
if (req->nr_phys_segments == 1)
req->bio->bi_seg_front_size = seg_size;
if (next->nr_phys_segments == 1)
next->biotail->bi_seg_back_size = seg_size;
total_phys_segments--;
}
if (total_phys_segments > queue_max_segments(q))
return 0;
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
}
/**
* blk_rq_set_mixed_merge - mark a request as mixed merge
* @rq: request to mark as mixed merge
*
* Description:
* @rq is about to be mixed merged. Make sure the attributes
* which can be mixed are set in each bio and mark @rq as mixed
* merged.
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->cmd_flags & REQ_MIXED_MERGE)
return;
/*
* @rq will no longer represent mixable attributes for all the
* contained bios. It will just track those of the first one.
	 * Distribute the attributes to each bio.
*/
for (bio = rq->bio; bio; bio = bio->bi_next) {
WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
(bio->bi_rw & REQ_FAILFAST_MASK) != ff);
bio->bi_rw |= ff;
}
rq->cmd_flags |= REQ_MIXED_MERGE;
}
static void blk_account_io_merge(struct request *req)
{
if (blk_do_io_stat(req)) {
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
}
}
/*
* Has to be called with the request spinlock acquired
*/
static int attempt_merge(struct request_queue *q, struct request *req,
struct request *next)
{
if (!rq_mergeable(req) || !rq_mergeable(next))
return 0;
if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags))
return 0;
/*
* not contiguous
*/
if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
return 0;
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
|| req_no_special_merge(next))
return 0;
if (req->cmd_flags & REQ_WRITE_SAME &&
!blk_write_same_mergeable(req->bio, next->bio))
return 0;
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
* counts here.
*/
if (!ll_merge_requests_fn(q, req, next))
return 0;
/*
* If failfast settings disagree or any of the two is already
* a mixed merge, mark both as mixed before proceeding. This
* makes sure that all involved bios have mixable attributes
* set properly.
*/
if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
(req->cmd_flags & REQ_FAILFAST_MASK) !=
(next->cmd_flags & REQ_FAILFAST_MASK)) {
blk_rq_set_mixed_merge(req);
blk_rq_set_mixed_merge(next);
}
/*
* At this point we have either done a back merge
* or front merge. We need the smaller start_time of
* the merged requests to be the current request
* for accounting purposes.
*/
if (time_after(req->start_time, next->start_time))
req->start_time = next->start_time;
req->biotail->bi_next = next->bio;
req->biotail = next->biotail;
req->__data_len += blk_rq_bytes(next);
elv_merge_requests(q, req, next);
/*
* 'next' is going away, so update stats accordingly
*/
blk_account_io_merge(next);
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
if (blk_rq_cpu_valid(next))
req->cpu = next->cpu;
/* ownership of bio passed from next to req */
next->bio = NULL;
__blk_put_request(q, next);
return 1;
}
int attempt_back_merge(struct request_queue *q, struct request *rq)
{
struct request *next = elv_latter_request(q, rq);
if (next)
return attempt_merge(q, rq, next);
return 0;
}
int attempt_front_merge(struct request_queue *q, struct request *rq)
{
struct request *prev = elv_former_request(q, rq);
if (prev)
return attempt_merge(q, prev, rq);
return 0;
}
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next)
{
return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
return false;
/* different data direction or already started, don't merge */
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
/* must be same device and not a special request */
if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
return false;
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
/* must be using the same buffer */
if (rq->cmd_flags & REQ_WRITE_SAME &&
!blk_write_same_mergeable(rq->bio, bio))
return false;
if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
struct bio_vec *bprev;
bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
return false;
}
return true;
}
int blk_try_merge(struct request *rq, struct bio *bio)
{
if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}

67
block/blk-mq-cpu.c Normal file
View file

@ -0,0 +1,67 @@
/*
* CPU notifier helper code for blk-mq
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
static LIST_HEAD(blk_mq_cpu_notify_list);
static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
static int blk_mq_main_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long) hcpu;
struct blk_mq_cpu_notifier *notify;
int ret = NOTIFY_OK;
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
ret = notify->notify(notify->data, action, cpu);
if (ret != NOTIFY_OK)
break;
}
raw_spin_unlock(&blk_mq_cpu_notify_lock);
return ret;
}
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
BUG_ON(!notifier->notify);
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
raw_spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_del(&notifier->list);
raw_spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
int (*fn)(void *, unsigned long, unsigned int),
void *data)
{
notifier->notify = fn;
notifier->data = data;
}
void __init blk_mq_cpu_init(void)
{
hotcpu_notifier(blk_mq_main_cpu_notify, 0);
}

119
block/blk-mq-cpumap.c Normal file
View file

@ -0,0 +1,119 @@
/*
* CPU <-> hardware queue mapping helpers
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
const int cpu)
{
return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}
static int get_first_sibling(unsigned int cpu)
{
unsigned int ret;
ret = cpumask_first(topology_thread_cpumask(cpu));
if (ret < nr_cpu_ids)
return ret;
return cpu;
}
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
return 1;
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
for_each_online_cpu(i) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
nr_uniq_cpus++;
cpumask_set_cpu(i, cpus);
}
queue = 0;
for_each_possible_cpu(i) {
if (!cpu_online(i)) {
map[i] = 0;
continue;
}
/*
* Easy case - we have equal or more hardware queues. Or
* there are no thread siblings to take into account. Do
* 1:1 if enough, or sequential mapping if less.
*/
if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
queue++;
continue;
}
/*
* Fewer than nr_cpus queues, and we have some number of
* threads per core. Map sibling threads to the same
* queue.
*/
first_sibling = get_first_sibling(i);
if (first_sibling == i) {
map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
queue);
queue++;
} else
map[i] = map[first_sibling];
}
free_cpumask_var(cpus);
return 0;
}
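/*
* Worked example (illustrative): 8 online CPUs with 2 hyperthreads per
* core (4 unique cores) and 2 hardware queues. Neither the 1:1 nor the
* uniform case applies, so siblings are collapsed first:
* cpu_to_queue_index(4, 2, q) == q / 2, so the first two cores map to
* queue 0, the last two cores to queue 1, and each sibling thread
* inherits its core's queue.
*/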
unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
{
unsigned int *map;
/* If cpus are offline, map them to first hctx */
map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
set->numa_node);
if (!map)
return NULL;
if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
return map;
kfree(map);
return NULL;
}
/*
* We have no quick way of doing reverse lookups. This is only used at
* queue init time, so runtime isn't important.
*/
int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
{
int i;
for_each_possible_cpu(i) {
if (index == mq_map[i])
return cpu_to_node(i);
}
return NUMA_NO_NODE;
}

461
block/blk-mq-sysfs.c Normal file

@@ -0,0 +1,461 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
#include "blk-mq-tag.h"
static void blk_mq_sysfs_release(struct kobject *kobj)
{
}
struct blk_mq_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_ctx *, char *);
ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
};
struct blk_mq_hw_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
};
static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(ctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(ctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(hctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
struct attribute *attr, const char *page,
size_t length)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(hctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
ctx->rq_dispatched[0]);
}
static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu\n", ctx->rq_merged);
}
static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
ctx->rq_completed[0]);
}
static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
{
char *start_page = page;
struct request *rq;
page += sprintf(page, "%s:\n", msg);
list_for_each_entry(rq, list, queuelist)
page += sprintf(page, "\t%p\n", rq);
return page - start_page;
}
static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
{
ssize_t ret;
spin_lock(&ctx->lock);
ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
spin_unlock(&ctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
return sprintf(page, "%lu\n", hctx->queued);
}
static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%lu\n", hctx->run);
}
static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
char *start_page = page;
int i;
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
unsigned long d = 1U << (i - 1);
page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
}
return page - start_page;
}
static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
ssize_t ret;
spin_lock(&hctx->lock);
ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
spin_unlock(&hctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return blk_mq_tag_sysfs_show(hctx->tags, page);
}
static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
}
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
unsigned int i, first = 1;
ssize_t ret = 0;
blk_mq_disable_hotplug();
for_each_cpu(i, hctx->cpumask) {
if (first)
ret += sprintf(ret + page, "%u", i);
else
ret += sprintf(ret + page, ", %u", i);
first = 0;
}
blk_mq_enable_hotplug();
ret += sprintf(ret + page, "\n");
return ret;
}
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
.attr = {.name = "merged", .mode = S_IRUGO },
.show = blk_mq_sysfs_merged_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
.attr = {.name = "completed", .mode = S_IRUGO },
.show = blk_mq_sysfs_completed_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
.attr = {.name = "rq_list", .mode = S_IRUGO },
.show = blk_mq_sysfs_rq_list_show,
};
static struct attribute *default_ctx_attrs[] = {
&blk_mq_sysfs_dispatched.attr,
&blk_mq_sysfs_merged.attr,
&blk_mq_sysfs_completed.attr,
&blk_mq_sysfs_rq_list.attr,
NULL,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
.attr = {.name = "queued", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_queued_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
.attr = {.name = "run", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_run_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_dispatched_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
.attr = {.name = "active", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_active_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
.attr = {.name = "pending", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_rq_list_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
.attr = {.name = "tags", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
&blk_mq_hw_sysfs_run.attr,
&blk_mq_hw_sysfs_dispatched.attr,
&blk_mq_hw_sysfs_pending.attr,
&blk_mq_hw_sysfs_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
NULL,
};
static const struct sysfs_ops blk_mq_sysfs_ops = {
.show = blk_mq_sysfs_show,
.store = blk_mq_sysfs_store,
};
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
.show = blk_mq_hw_sysfs_show,
.store = blk_mq_hw_sysfs_store,
};
static struct kobj_type blk_mq_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_ctx_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.default_attrs = default_ctx_attrs,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_hw_ktype = {
.sysfs_ops = &blk_mq_hw_sysfs_ops,
.default_attrs = default_hw_ctx_attrs,
.release = blk_mq_sysfs_release,
};
static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_ctx *ctx;
int i;
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
return;
hctx_for_each_ctx(hctx, ctx, i)
kobject_del(&ctx->kobj);
kobject_del(&hctx->kobj);
}
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
int i, ret;
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
return 0;
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
if (ret)
return ret;
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
break;
}
return ret;
}
void blk_mq_unregister_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i, j;
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_unregister_hctx(hctx);
hctx_for_each_ctx(hctx, ctx, j)
kobject_put(&ctx->kobj);
kobject_put(&hctx->kobj);
}
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
kobject_put(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
}
static void blk_mq_sysfs_init(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
queue_for_each_hw_ctx(q, hctx, i)
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
queue_for_each_ctx(q, ctx, i)
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
}
/* see blk_register_queue() */
void blk_mq_finish_init(struct request_queue *q)
{
percpu_ref_switch_to_percpu(&q->mq_usage_counter);
}
int blk_mq_register_disk(struct gendisk *disk)
{
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
int ret, i;
blk_mq_sysfs_init(q);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
return ret;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
hctx->flags |= BLK_MQ_F_SYSFS_UP;
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
if (ret) {
blk_mq_unregister_disk(disk);
return ret;
}
return 0;
}
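/*
* Resulting layout (illustrative, for a hypothetical disk "nullb0" with
* two hardware queues): the kobjects registered above appear as
*
*	/sys/block/nullb0/mq/0/cpu0/
*	/sys/block/nullb0/mq/0/cpu1/
*	/sys/block/nullb0/mq/1/cpu2/
*	...
*
* with the per-hctx attributes (queued, run, dispatched, pending, tags,
* cpu_list, active) under mq/<n>/ and the per-ctx attributes
* (dispatched, merged, completed, rq_list) under mq/<n>/cpu<m>/.
*/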
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
}
int blk_mq_sysfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
return ret;
}

609
block/blk-mq-tag.c Normal file

@@ -0,0 +1,609 @@
/*
* Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
* over multiple cachelines to avoid ping-pong between multiple submitters
* or submitter and completer. Uses rolling wakeups to avoid falling off
* the scaling cliff when we run out of tags and have to start putting
* submitters to sleep.
*
* Uses active queue tracking to support fairer distribution of tags
* between multiple submitters when a shared tag map is used.
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
{
int i;
for (i = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
int ret;
ret = find_first_zero_bit(&bm->word, bm->depth);
if (ret < bm->depth)
return true;
}
return false;
}
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
if (!tags)
return true;
return bt_has_free_tags(&tags->bitmap_tags);
}
static inline int bt_index_inc(int index)
{
return (index + 1) & (BT_WAIT_QUEUES - 1);
}
static inline void bt_index_atomic_inc(atomic_t *index)
{
int old = atomic_read(index);
int new = bt_index_inc(old);
atomic_cmpxchg(index, old, new);
}
/*
* If a previously inactive queue goes active, bump the active user count.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
return true;
}
/*
* Wake up all waiters potentially sleeping on normal (non-reserved) tags
*/
static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
{
struct blk_mq_bitmap_tags *bt;
int i, wake_index;
bt = &tags->bitmap_tags;
wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait))
wake_up(&bs->wait);
wake_index = bt_index_inc(wake_index);
}
}
/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
blk_mq_tag_wakeup_all(tags);
}
/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt)
{
unsigned int depth, users;
if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
return true;
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
/*
* Don't try dividing an ant
*/
if (bt->depth == 1)
return true;
users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;
/*
* Allow at least some tags
*/
depth = max((bt->depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}
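/*
* Worked example (illustrative): with a shared tag map of depth 128 and
* three active queues, each hctx may have up to
* max((128 + 3 - 1) / 3, 4U) == 43 requests in flight before it has to
* wait, so no single queue can monopolise the map.
*/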
static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{
int tag, org_last_tag, end;
bool wrap = last_tag != 0;
org_last_tag = last_tag;
end = bm->depth;
do {
restart:
tag = find_next_zero_bit(&bm->word, end, last_tag);
if (unlikely(tag >= end)) {
/*
* We started with an offset, start from 0 to
* exhaust the map.
*/
if (wrap) {
wrap = false;
end = org_last_tag;
last_tag = 0;
goto restart;
}
return -1;
}
last_tag = tag + 1;
} while (test_and_set_bit(tag, &bm->word));
return tag;
}
/*
* Straightforward bitmap tag implementation, where each bit is a tag
* (cleared == free, and set == busy). The small twist is using per-cpu
* last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
* contexts. This enables us to drastically limit the space searched,
* without dirtying an extra shared cacheline like we would if we stored
* the cache value inside the shared blk_mq_bitmap_tags structure. On top
* of that, each word of tags is in a separate cacheline. This means that
* multiple users will tend to stick to different cachelines, at least
* until the map is exhausted.
*/
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
unsigned int *tag_cache)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;
if (!hctx_may_queue(hctx, bt))
return -1;
last_tag = org_last_tag = *tag_cache;
index = TAG_TO_INDEX(bt, last_tag);
for (i = 0; i < bt->map_nr; i++) {
tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
if (tag != -1) {
tag += (index << bt->bits_per_word);
goto done;
}
last_tag = 0;
if (++index >= bt->map_nr)
index = 0;
}
*tag_cache = 0;
return -1;
/*
* Only update the cache from the allocation path, if we ended
* up using the specific cached tag.
*/
done:
if (tag == org_last_tag) {
last_tag = tag + 1;
if (last_tag >= bt->depth - 1)
last_tag = 0;
*tag_cache = last_tag;
}
return tag;
}
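/*
* Worked example (illustrative): with the default bits_per_word of 6
* (64 tags per blk_align_bitmap word), a cached last_tag of 130 starts
* the search at word TAG_TO_INDEX(bt, 130) == 130 >> 6 == 2 and bit
* TAG_TO_BIT(bt, 130) == 130 & 63 == 2, moving on to the other words
* only if that one is exhausted.
*/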
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
struct bt_wait_state *bs;
int wait_index;
if (!hctx)
return &bt->bs[0];
wait_index = atomic_read(&hctx->wait_index);
bs = &bt->bs[wait_index];
bt_index_atomic_inc(&hctx->wait_index);
return bs;
}
static int bt_get(struct blk_mq_alloc_data *data,
struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx,
unsigned int *last_tag)
{
struct bt_wait_state *bs;
DEFINE_WAIT(wait);
int tag;
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
return tag;
if (!(data->gfp & __GFP_WAIT))
return -1;
bs = bt_wait_ptr(bt, hctx);
do {
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
blk_mq_put_ctx(data->ctx);
io_schedule();
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = data->q->mq_ops->map_queue(data->q,
data->ctx->cpu);
if (data->reserved) {
bt = &data->hctx->tags->breserved_tags;
} else {
last_tag = &data->ctx->last_tag;
hctx = data->hctx;
bt = &hctx->tags->bitmap_tags;
}
finish_wait(&bs->wait, &wait);
bs = bt_wait_ptr(bt, hctx);
} while (1);
finish_wait(&bs->wait, &wait);
return tag;
}
static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
int tag;
tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
&data->ctx->last_tag);
if (tag >= 0)
return tag + data->hctx->tags->nr_reserved_tags;
return BLK_MQ_TAG_FAIL;
}
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
int tag, zero = 0;
if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
return tag;
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
if (!data->reserved)
return __blk_mq_get_tag(data);
return __blk_mq_get_reserved_tag(data);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
{
int i, wake_index;
wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait)) {
int o = atomic_read(&bt->wake_index);
if (wake_index != o)
atomic_cmpxchg(&bt->wake_index, o, wake_index);
return bs;
}
wake_index = bt_index_inc(wake_index);
}
return NULL;
}
static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
{
const int index = TAG_TO_INDEX(bt, tag);
struct bt_wait_state *bs;
int wait_cnt;
clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
/* Ensure that the wait list checks occur after clear_bit(). */
smp_mb();
bs = bt_wake_ptr(bt);
if (!bs)
return;
wait_cnt = atomic_dec_return(&bs->wait_cnt);
if (unlikely(wait_cnt < 0))
wait_cnt = atomic_inc_return(&bs->wait_cnt);
if (wait_cnt == 0) {
atomic_add(bt->wake_cnt, &bs->wait_cnt);
bt_index_atomic_inc(&bt->wake_index);
wake_up(&bs->wait);
}
}
static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
BUG_ON(tag >= tags->nr_tags);
bt_clear_tag(&tags->bitmap_tags, tag);
}
static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
unsigned int tag)
{
BUG_ON(tag >= tags->nr_reserved_tags);
bt_clear_tag(&tags->breserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag)
{
struct blk_mq_tags *tags = hctx->tags;
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
__blk_mq_put_tag(tags, real_tag);
*last_tag = real_tag;
} else
__blk_mq_put_reserved_tag(tags, tag);
}
static void bt_for_each(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt, unsigned int off,
busy_iter_fn *fn, void *data, bool reserved)
{
struct request *rq;
int bit, i;
for (i = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
for (bit = find_first_bit(&bm->word, bm->depth);
bit < bm->depth;
bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
if (rq->q == hctx->queue)
fn(hctx, rq, data, reserved);
}
off += (1 << bt->bits_per_word);
}
}
void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
void *priv)
{
struct blk_mq_tags *tags = hctx->tags;
if (tags->nr_reserved_tags)
bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
false);
}
EXPORT_SYMBOL(blk_mq_tag_busy_iter);
static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
unsigned int i, used;
for (i = 0, used = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
used += bitmap_weight(&bm->word, bm->depth);
}
return bt->depth - used;
}
static void bt_update_count(struct blk_mq_bitmap_tags *bt,
unsigned int depth)
{
unsigned int tags_per_word = 1U << bt->bits_per_word;
unsigned int map_depth = depth;
if (depth) {
int i;
for (i = 0; i < bt->map_nr; i++) {
bt->map[i].depth = min(map_depth, tags_per_word);
map_depth -= bt->map[i].depth;
}
}
bt->wake_cnt = BT_WAIT_BATCH;
if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
bt->depth = depth;
}
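/*
* Worked example (illustrative): for depth == 256, depth / BT_WAIT_QUEUES
* is 32, which is larger than BT_WAIT_BATCH, so wake_cnt stays at 8.
* For depth == 32 it is clamped to max(1U, 32 / 8) == 4, so shallow maps
* wake waiters in smaller batches.
*/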
static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
int node, bool reserved)
{
int i;
bt->bits_per_word = ilog2(BITS_PER_LONG);
/*
* Depth can be zero for reserved tags, that's not a failure
* condition.
*/
if (depth) {
unsigned int nr, tags_per_word;
tags_per_word = (1 << bt->bits_per_word);
/*
* If the tag space is small, shrink the number of tags
* per word so we spread over a few cachelines, at least.
* If less than 4 tags, just forget about it, it's not
* going to work optimally anyway.
*/
if (depth >= 4) {
while (tags_per_word * 4 > depth) {
bt->bits_per_word--;
tags_per_word = (1 << bt->bits_per_word);
}
}
nr = ALIGN(depth, tags_per_word) / tags_per_word;
bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
GFP_KERNEL, node);
if (!bt->map)
return -ENOMEM;
bt->map_nr = nr;
}
bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
if (!bt->bs) {
kfree(bt->map);
bt->map = NULL;
return -ENOMEM;
}
bt_update_count(bt, depth);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
init_waitqueue_head(&bt->bs[i].wait);
atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
}
return 0;
}
static void bt_free(struct blk_mq_bitmap_tags *bt)
{
kfree(bt->map);
kfree(bt->bs);
}
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
if (bt_alloc(&tags->bitmap_tags, depth, node, false))
goto enomem;
if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
goto enomem;
return tags;
enomem:
bt_free(&tags->bitmap_tags);
kfree(tags);
return NULL;
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, int node)
{
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
pr_err("blk-mq: tag depth too large\n");
return NULL;
}
tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
if (!tags)
return NULL;
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
return blk_mq_init_bitmap_tags(tags, node);
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
bt_free(&tags->bitmap_tags);
bt_free(&tags->breserved_tags);
kfree(tags);
}
void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
*tag = prandom_u32() % depth;
}
int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
tdepth -= tags->nr_reserved_tags;
if (tdepth > tags->nr_tags)
return -EINVAL;
/*
* Don't need (or can't) update reserved tags here, they remain
* static and should never need resizing.
*/
bt_update_count(&tags->bitmap_tags, tdepth);
blk_mq_tag_wakeup_all(tags);
return 0;
}
ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
{
char *orig_page = page;
unsigned int free, res;
if (!tags)
return 0;
page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
"bits_per_word=%u\n",
tags->nr_tags, tags->nr_reserved_tags,
tags->bitmap_tags.bits_per_word);
free = bt_unused_tags(&tags->bitmap_tags);
res = bt_unused_tags(&tags->breserved_tags);
page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
return page - orig_page;
}

88
block/blk-mq-tag.h Normal file

@@ -0,0 +1,88 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
#include "blk-mq.h"
enum {
BT_WAIT_QUEUES = 8,
BT_WAIT_BATCH = 8,
};
struct bt_wait_state {
atomic_t wait_cnt;
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
struct blk_mq_bitmap_tags {
unsigned int depth;
unsigned int wake_cnt;
unsigned int bits_per_word;
unsigned int map_nr;
struct blk_align_bitmap *map;
atomic_t wake_index;
struct bt_wait_state *bs;
};
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct blk_mq_bitmap_tags bitmap_tags;
struct blk_mq_bitmap_tags breserved_tags;
struct request **rqs;
struct list_head page_list;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
BLK_MQ_TAG_CACHE_MAX = 64,
};
enum {
BLK_MQ_TAG_FAIL = -1U,
BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return false;
return __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return;
__blk_mq_tag_idle(hctx);
}
#endif

2142
block/blk-mq.c Normal file

File diff suppressed because it is too large

118
block/blk-mq.h Normal file

@@ -0,0 +1,118 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
struct blk_mq_tag_set;
struct blk_mq_ctx {
struct {
spinlock_t lock;
struct list_head rq_list;
} ____cacheline_aligned_in_smp;
unsigned int cpu;
unsigned int index_hw;
unsigned int last_tag ____cacheline_aligned_in_smp;
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void __blk_mq_complete_request(struct request *rq);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
void blk_mq_clone_flush_request(struct request *flush_rq,
struct request *orig_rq);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
/*
* CPU hotplug helpers
*/
struct blk_mq_cpu_notifier;
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
int (*fn)(void *, unsigned long, unsigned int),
void *data);
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_cpu_init(void);
void blk_mq_enable_hotplug(void);
void blk_mq_disable_hotplug(void);
/*
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*
* sysfs helpers
*/
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
/*
* Basic implementation of a sparser bitmap, allowing the user to spread
* the bits over more cachelines.
*/
struct blk_align_bitmap {
unsigned long word;
unsigned long depth;
} ____cacheline_aligned_in_smp;
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
return per_cpu_ptr(q->queue_ctx, cpu);
}
/*
* This assumes per-cpu software queueing queues. They could be per-node
* as well, for instance. For now this is hardcoded as-is. Note that we don't
* care about preemption, since we know the ctx's are persistent. This does
* mean that we can't rely on ctx always matching the currently running CPU.
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
return __blk_mq_get_ctx(q, get_cpu());
}
static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
put_cpu();
}
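/*
* Usage sketch (illustrative): callers pair the two so the CPU is only
* pinned while the per-cpu ctx is in use.
*
*	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
*	... work on ctx, e.g. add a request to ctx->rq_list ...
*	blk_mq_put_ctx(ctx);
*/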
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
gfp_t gfp;
bool reserved;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
struct request_queue *q, gfp_t gfp, bool reserved,
struct blk_mq_ctx *ctx,
struct blk_mq_hw_ctx *hctx)
{
data->q = q;
data->gfp = gfp;
data->reserved = reserved;
data->ctx = ctx;
data->hctx = hctx;
}
#endif

863
block/blk-settings.c Normal file

@@ -0,0 +1,863 @@
/*
* Functions related to setting various queue properties from drivers
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/gcd.h>
#include <linux/lcm.h>
#include <linux/jiffies.h>
#include <linux/gfp.h>
#include "blk.h"
unsigned long blk_max_low_pfn;
EXPORT_SYMBOL(blk_max_low_pfn);
unsigned long blk_max_pfn;
/**
* blk_queue_prep_rq - set a prepare_request function for queue
* @q: queue
* @pfn: prepare_request function
*
* It's possible for a queue to register a prepare_request callback which
* is invoked before the request is handed to the request_fn. The goal of
* the function is to prepare a request for I/O, it can be used to build a
* cdb from the request data for instance.
*
*/
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
{
q->prep_rq_fn = pfn;
}
EXPORT_SYMBOL(blk_queue_prep_rq);
/**
* blk_queue_unprep_rq - set an unprepare_request function for queue
* @q: queue
* @ufn: unprepare_request function
*
* It's possible for a queue to register an unprepare_request callback
* which is invoked before the request is finally completed. The goal
* of the function is to deallocate any data that was allocated in the
* prepare_request callback.
*
*/
void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
{
q->unprep_rq_fn = ufn;
}
EXPORT_SYMBOL(blk_queue_unprep_rq);
/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
* @mbfn: merge_bvec_fn
*
* Usually queues have static limitations on the max sectors or segments that
* we can put in a request. Stacking drivers may have some settings that
* are dynamic, and thus we have to query the queue whether it is ok to
* add a new bio_vec to a bio at a given offset or not. If the block device
* has such limitations, it needs to register a merge_bvec_fn to control
* the size of bios sent to it. Note that a block device *must* allow a
* single page to be added to an empty bio. The block device driver may want
* to use the bio_split() function to deal with these bios. By default
* no merge_bvec_fn is defined for a queue, and only the fixed limits are
* honored.
*/
void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
{
q->merge_bvec_fn = mbfn;
}
EXPORT_SYMBOL(blk_queue_merge_bvec);
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
q->softirq_done_fn = fn;
}
EXPORT_SYMBOL(blk_queue_softirq_done);
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
q->rq_timeout = timeout;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
{
q->rq_timed_out_fn = fn;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
{
q->lld_busy_fn = fn;
}
EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->discard_zeroes_data = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
lim->cluster = 1;
}
EXPORT_SYMBOL(blk_set_default_limits);
/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state. Should be used
* by stacking drivers like DM that have no internal limits.
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
blk_set_default_limits(lim);
/* Inherit limits from component devices */
lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* @mfn: the alternate make_request function
*
* Description:
* The normal way for &struct bios to be passed to a device
* driver is for them to be collected into requests on a request
* queue, and then to allow the device driver to select requests
* off that queue when it is ready. This works well for many block
* devices. However some block devices (typically virtual devices
* such as md or lvm) do not benefit from the processing on the
* request queue, and are served best by having the requests passed
* directly to them. This can be achieved by providing a function
* to blk_queue_make_request().
*
* Caveat:
* The driver that does this *must* be able to deal appropriately
* with buffers in "highmemory". This can be accomplished by either calling
* __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
* blk_queue_bounce() to create a buffer in normal memory.
**/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
/*
* set defaults
*/
q->nr_requests = BLKDEV_MAX_RQ;
q->make_request_fn = mfn;
blk_queue_dma_alignment(q, 511);
blk_queue_congestion_threshold(q);
q->nr_batching = BLK_BATCH_REQ;
blk_set_default_limits(&q->limits);
/*
* by default assume old behaviour and bounce for any highmem page
*/
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
EXPORT_SYMBOL(blk_queue_make_request);
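/*
* Usage sketch (hypothetical bio-based driver): my_make_request() is a
* placeholder for the driver's own make_request_fn, and q would come
* from blk_alloc_queue().
*
*	static void my_make_request(struct request_queue *q, struct bio *bio);
*
*	blk_queue_make_request(q, my_make_request);
*/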
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
* @max_addr: the maximum address the device can handle
*
* Description:
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
* buffers for doing I/O to pages residing above @max_addr.
**/
void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
{
unsigned long b_pfn = max_addr >> PAGE_SHIFT;
int dma = 0;
q->bounce_gfp = GFP_NOIO;
#if BITS_PER_LONG == 64
/*
* Assume anything <= 4GB can be handled by IOMMU. Actually
* some IOMMUs can handle everything, but I don't know of a
* way to test this here.
*/
if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
dma = 1;
q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
#else
if (b_pfn < blk_max_low_pfn)
dma = 1;
q->limits.bounce_pfn = b_pfn;
#endif
if (dma) {
init_emergency_isa_pool();
q->bounce_gfp = GFP_NOIO | GFP_DMA;
q->limits.bounce_pfn = b_pfn;
}
}
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
* blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
* @limits: the queue limits
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
* Enables a low level driver to set a hard upper limit,
* max_hw_sectors, on the size of requests. max_hw_sectors is set by
* the device driver based upon the combined capabilities of I/O
* controller and storage device.
*
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
* per-device basis in /sys/block/<device>/queue/max_sectors_kb.
* The soft limit can not exceed max_hw_sectors.
**/
void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
{
if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_hw_sectors);
}
limits->max_hw_sectors = max_hw_sectors;
limits->max_sectors = min_t(unsigned int, max_hw_sectors,
BLK_DEF_MAX_SECTORS);
}
EXPORT_SYMBOL(blk_limits_max_hw_sectors);
/**
* blk_queue_max_hw_sectors - set max sectors for a request for this queue
* @q: the request queue for the device
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
* See description for blk_limits_max_hw_sectors().
**/
void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
/**
* blk_queue_chunk_sectors - set size of the chunk for this queue
* @q: the request queue for the device
* @chunk_sectors: chunk sectors in the usual 512b unit
*
* Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set
* this limit and prevent merging across chunks. Note that the chunk size
* must currently be a power-of-2 in sectors. Also note that the block
* layer must accept a page worth of data at any offset. So if the
* crossing of chunks is a hard limitation in the driver, it must still be
* prepared to split single page bios.
**/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{
BUG_ON(!is_power_of_2(chunk_sectors));
q->limits.chunk_sectors = chunk_sectors;
}
EXPORT_SYMBOL(blk_queue_chunk_sectors);
/**
* blk_queue_max_discard_sectors - set max sectors for a single discard
* @q: the request queue for the device
* @max_discard_sectors: maximum number of sectors to discard
**/
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
q->limits.max_discard_sectors = max_discard_sectors;
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
* blk_queue_max_write_same_sectors - set max sectors for a single write same
* @q: the request queue for the device
* @max_write_same_sectors: maximum number of sectors to write per command
**/
void blk_queue_max_write_same_sectors(struct request_queue *q,
unsigned int max_write_same_sectors)
{
q->limits.max_write_same_sectors = max_write_same_sectors;
}
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
*
* Description:
* Enables a low level driver to set an upper limit on the number of
* hw data segments in a request.
**/
void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
{
if (!max_segments) {
max_segments = 1;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_segments);
}
q->limits.max_segments = max_segments;
}
EXPORT_SYMBOL(blk_queue_max_segments);
/**
* blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
* @q: the request queue for the device
* @max_size: max size of segment in bytes
*
* Description:
* Enables a low level driver to set an upper limit on the size of a
* coalesced segment
**/
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_CACHE_SIZE) {
max_size = PAGE_CACHE_SIZE;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_size);
}
q->limits.max_segment_size = max_size;
}
EXPORT_SYMBOL(blk_queue_max_segment_size);
/**
* blk_queue_logical_block_size - set logical block size for the queue
* @q: the request queue for the device
* @size: the logical block size, in bytes
*
* Description:
* This should be set to the lowest possible block size that the
* storage device can address. The default of 512 covers most
* hardware.
**/
void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
{
q->limits.logical_block_size = size;
if (q->limits.physical_block_size < size)
q->limits.physical_block_size = size;
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_logical_block_size);
/**
* blk_queue_physical_block_size - set physical block size for the queue
* @q: the request queue for the device
* @size: the physical block size, in bytes
*
* Description:
* This should be set to the lowest possible sector size that the
* hardware can operate on without reverting to read-modify-write
* operations.
*/
void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
{
q->limits.physical_block_size = size;
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size;
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_physical_block_size);
/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
*
* Description:
* Some devices are naturally misaligned to compensate for things like
* the legacy DOS partition table 63-sector offset. Low-level drivers
* should call this function for devices whose first sector is not
* naturally aligned.
*/
void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
{
q->limits.alignment_offset =
offset & (q->limits.physical_block_size - 1);
q->limits.misaligned = 0;
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
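/*
* Worked example (illustrative): a drive with 4096-byte physical blocks
* whose data starts at the legacy 63-sector offset is 63 * 512 == 32256
* bytes into the media; 32256 & (4096 - 1) == 3584, so the queue reports
* an alignment_offset of 3584 bytes.
*/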
/**
* blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits
* @min: smallest I/O size in bytes
*
* Description:
* Some devices have an internal block size bigger than the reported
* hardware sector size. This function can be used to signal the
* smallest I/O the device can perform without incurring a performance
* penalty.
*/
void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
{
limits->io_min = min;
if (limits->io_min < limits->logical_block_size)
limits->io_min = limits->logical_block_size;
if (limits->io_min < limits->physical_block_size)
limits->io_min = limits->physical_block_size;
}
EXPORT_SYMBOL(blk_limits_io_min);
/**
* blk_queue_io_min - set minimum request size for the queue
* @q: the request queue for the device
* @min: smallest I/O size in bytes
*
* Description:
* Storage devices may report a granularity or preferred minimum I/O
* size which is the smallest request the device can perform without
* incurring a performance penalty. For disk drives this is often the
* physical block size. For RAID arrays it is often the stripe chunk
* size. A properly aligned multiple of minimum_io_size is the
* preferred request size for workloads where a high number of I/O
* operations is desired.
*/
void blk_queue_io_min(struct request_queue *q, unsigned int min)
{
blk_limits_io_min(&q->limits, min);
}
EXPORT_SYMBOL(blk_queue_io_min);
/**
* blk_limits_io_opt - set optimal request size for a device
* @limits: the queue limits
* @opt: optimal request size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
{
limits->io_opt = opt;
}
EXPORT_SYMBOL(blk_limits_io_opt);
/**
* blk_queue_io_opt - set optimal request size for the queue
* @q: the request queue for the device
* @opt: optimal request size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
}
EXPORT_SYMBOL(blk_queue_io_opt);
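/*
* Worked example (illustrative): a RAID5 array with a 64 KiB chunk and
* four data disks might advertise
*
*	blk_queue_io_min(q, 64 * 1024);		(the chunk size)
*	blk_queue_io_opt(q, 4 * 64 * 1024);	(the full stripe width)
*
* so that well-behaved users issue 256 KiB, stripe-aligned requests.
*/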
/**
* blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
* @t: the stacking driver (top)
* @b: the underlying device (bottom)
**/
void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
{
blk_stack_limits(&t->limits, &b->limits, 0);
}
EXPORT_SYMBOL(blk_queue_stack_limits);
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @b: the underlying queue limits (bottom, component device)
* @start: first data sector within component device
*
* Description:
* This function is used by stacking drivers like MD and DM to ensure
* that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*
* Returns 0 if the top and bottom queue_limits are compatible. The
* top device's block sizes and alignment offsets may be adjusted to
* ensure alignment with the bottom device. If no compatible sizes
* and alignments exist, -1 is returned and the resulting top
* queue_limits will have the misaligned flag set to indicate that
* the alignment_offset is undefined.
*/
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t start)
{
unsigned int top, bottom, alignment, ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
b->max_integrity_segments);
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
t->misaligned |= b->misaligned;
alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
* compatible with the current top alignment.
*/
if (t->alignment_offset != alignment) {
top = max(t->physical_block_size, t->io_min)
+ t->alignment_offset;
bottom = max(b->physical_block_size, b->io_min) + alignment;
/* Verify that top and bottom intervals line up */
if (max(top, bottom) % min(top, bottom)) {
t->misaligned = 1;
ret = -1;
}
}
t->logical_block_size = max(t->logical_block_size,
b->logical_block_size);
t->physical_block_size = max(t->physical_block_size,
b->physical_block_size);
t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm(t->io_opt, b->io_opt);
t->cluster &= b->cluster;
t->discard_zeroes_data &= b->discard_zeroes_data;
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
t->physical_block_size = t->logical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Minimum I/O a multiple of the physical block size? */
if (t->io_min & (t->physical_block_size - 1)) {
t->io_min = t->physical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Optimal I/O a multiple of the physical block size? */
if (t->io_opt & (t->physical_block_size - 1)) {
t->io_opt = 0;
t->misaligned = 1;
ret = -1;
}
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
/* Find lowest common alignment_offset */
t->alignment_offset = lcm(t->alignment_offset, alignment)
% max(t->physical_block_size, t->io_min);
/* Verify that new alignment_offset is on a logical block boundary */
if (t->alignment_offset & (t->logical_block_size - 1)) {
t->misaligned = 1;
ret = -1;
}
/* Discard alignment and granularity */
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
if (t->discard_granularity != 0 &&
t->discard_alignment != alignment) {
top = t->discard_granularity + t->discard_alignment;
bottom = b->discard_granularity + alignment;
/* Verify that top and bottom intervals line up */
if ((max(top, bottom) % min(top, bottom)) != 0)
t->discard_misaligned = 1;
}
t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
b->max_discard_sectors);
t->discard_granularity = max(t->discard_granularity,
b->discard_granularity);
t->discard_alignment = lcm(t->discard_alignment, alignment) %
t->discard_granularity;
}
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
* bdev_stack_limits - adjust queue limits for stacked drivers
* @t: the stacking driver limits (top device)
* @bdev: the component block_device (bottom)
* @start: first data sector within component device
*
* Description:
* Merges queue limits for a top device and a block_device. Returns
* 0 if alignment didn't change. Returns -1 if adding the bottom
* device caused misalignment.
*/
int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
sector_t start)
{
struct request_queue *bq = bdev_get_queue(bdev);
start += get_start_sect(bdev);
return blk_stack_limits(t, &bq->limits, start);
}
EXPORT_SYMBOL(bdev_stack_limits);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
{
struct request_queue *t = disk->queue;
if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
disk_name(disk, 0, top);
bdevname(bdev, bottom);
printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
top, bottom);
}
}
EXPORT_SYMBOL(disk_stack_limits);
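/*
* Usage sketch (illustrative): a stacking driver starts from permissive
* defaults and folds in every component device; the names below are
* placeholders for the driver's own state.
*
*	blk_set_stacking_limits(&top_limits);
*	for each component bdev:
*		bdev_stack_limits(&top_limits, bdev, data_offset >> 9);
*
* disk_stack_limits() does the same for a gendisk-based top device and
* additionally warns if the result ends up misaligned.
*/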
/**
* blk_queue_dma_pad - set pad mask
* @q: the request queue for the device
* @mask: pad mask
*
* Set dma pad mask.
*
* Appending pad buffer to a request modifies the last entry of a
* scatter list such that it includes the pad buffer.
**/
void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
{
q->dma_pad_mask = mask;
}
EXPORT_SYMBOL(blk_queue_dma_pad);
/**
* blk_queue_update_dma_pad - update pad mask
* @q: the request queue for the device
* @mask: pad mask
*
* Update dma pad mask.
*
* Appending pad buffer to a request modifies the last entry of a
* scatter list such that it includes the pad buffer.
**/
void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
{
if (mask > q->dma_pad_mask)
q->dma_pad_mask = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
* blk_queue_dma_drain - Set up a drain buffer for excess dma.
* @q: the request queue for the device
* @dma_drain_needed: fn which returns non-zero if drain is necessary
* @buf: physically contiguous buffer
* @size: size of the buffer in bytes
*
* Some devices have excess DMA problems and can't simply discard (or
* zero fill) the unwanted piece of the transfer. They have to have a
* real area of memory to transfer it into. The use case for this is
* ATAPI devices in DMA mode. If the packet command causes a transfer
* bigger than the transfer size some HBAs will lock up if there
* aren't DMA elements to contain the excess transfer. What this API
* does is adjust the queue so that the buf is always appended
* silently to the scatterlist.
*
* Note: This routine adjusts max_hw_segments to make room for appending
* the drain buffer. If you call blk_queue_max_segments() after calling
* this routine, you must set the limit to one fewer than your device
* can support otherwise there won't be room for the drain buffer.
*/
int blk_queue_dma_drain(struct request_queue *q,
dma_drain_needed_fn *dma_drain_needed,
void *buf, unsigned int size)
{
if (queue_max_segments(q) < 2)
return -EINVAL;
/* make room for appending the drain */
blk_queue_max_segments(q, queue_max_segments(q) - 1);
q->dma_drain_needed = dma_drain_needed;
q->dma_drain_buffer = buf;
q->dma_drain_size = size;
return 0;
}
EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
**/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_CACHE_SIZE - 1) {
mask = PAGE_CACHE_SIZE - 1;
printk(KERN_INFO "%s: set to minimum %lx\n",
__func__, mask);
}
q->limits.seg_boundary_mask = mask;
}
EXPORT_SYMBOL(blk_queue_segment_boundary);
/**
* blk_queue_dma_alignment - set dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* set required memory and length alignment for direct dma transactions.
* this is used when building direct io requests for the queue.
*
**/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
{
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_dma_alignment);
/**
* blk_queue_update_dma_alignment - update dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* update required memory and length alignment for direct dma transactions.
* If the requested alignment is larger than the current alignment, then
* the current queue alignment is updated to the new value, otherwise it
* is left alone. The design of this is to allow multiple objects
* (driver, device, transport etc) to set their respective
* alignments without having them interfere.
*
**/
void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
{
BUG_ON(mask > PAGE_SIZE);
if (mask > q->dma_alignment)
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
/**
* blk_queue_flush - configure queue's cache flush capability
* @q: the request queue for the device
* @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
*
 * Tell the block layer the cache flush capability of @q. If it supports
* flushing, REQ_FLUSH should be set. If it supports bypassing
* write cache for individual writes, REQ_FUA should be set.
*/
void blk_queue_flush(struct request_queue *q, unsigned int flush)
{
WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
flush &= ~REQ_FUA;
q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
}
EXPORT_SYMBOL_GPL(blk_queue_flush);
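/*
* Example (illustrative sketch, not part of the original file): a driver
* for a device with a volatile write cache advertises REQ_FLUSH, and
* additionally REQ_FUA if the device honours forced-unit-access writes;
* a write-through device passes 0.
*/
static void example_advertise_cache(struct request_queue *q, bool wc, bool fua)
{
if (!wc)
blk_queue_flush(q, 0);
else
blk_queue_flush(q, fua ? REQ_FLUSH | REQ_FUA : REQ_FLUSH);
}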
void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
{
q->flush_not_queueable = !queueable;
}
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
blk_max_pfn = max_pfn - 1;
return 0;
}
subsys_initcall(blk_settings_init);

186
block/blk-softirq.c Normal file
View file

@ -0,0 +1,186 @@
/*
* Functions related to softirq rq completions
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include "blk.h"
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
/*
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
static void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = this_cpu_ptr(&blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, ipi_list);
list_del_init(&rq->ipi_list);
rq->q->softirq_done_fn(rq);
}
}
#ifdef CONFIG_SMP
static void trigger_softirq(void *data)
{
struct request *rq = data;
unsigned long flags;
struct list_head *list;
local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_restore(flags);
}
/*
* Setup and invoke a run of 'trigger_softirq' on the given cpu.
*/
static int raise_blk_irq(int cpu, struct request *rq)
{
if (cpu_online(cpu)) {
struct call_single_data *data = &rq->csd;
data->func = trigger_softirq;
data->info = rq;
data->flags = 0;
smp_call_function_single_async(cpu, data);
return 0;
}
return 1;
}
#else /* CONFIG_SMP */
static int raise_blk_irq(int cpu, struct request *rq)
{
return 1;
}
#endif
static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
this_cpu_ptr(&blk_cpu_done));
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
}
return NOTIFY_OK;
}
static struct notifier_block blk_cpu_notifier = {
.notifier_call = blk_cpu_notify,
};
void __blk_complete_request(struct request *req)
{
int ccpu, cpu;
struct request_queue *q = req->q;
unsigned long flags;
bool shared = false;
BUG_ON(!q->softirq_done_fn);
local_irq_save(flags);
cpu = smp_processor_id();
/*
* Select completion CPU
*/
if (req->cpu != -1) {
ccpu = req->cpu;
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
shared = cpus_share_cache(cpu, ccpu);
} else
ccpu = cpu;
/*
* If the current CPU and the requested CPU share a cache, run the
* softirq on the current CPU. This might look like it defeats
* QUEUE_FLAG_SAME_FORCE, but it doesn't: blk_complete_request() runs
* from the interrupt handler, and since the I/O controller doesn't
* support multiple interrupts, the current CPU is effectively unique
* here. Handling the completion locally avoids sending an IPI from
* the current CPU to the first CPU of the group.
*/
if (ccpu == cpu || shared) {
struct list_head *list;
do_local:
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&req->ipi_list, list);
/*
* if the list only contains our just added request,
* signal a raise of the softirq. If there are already
* entries there, someone already raised the irq but it
* hasn't run yet.
*/
if (list->next == &req->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
} else if (raise_blk_irq(ccpu, req))
goto do_local;
local_irq_restore(flags);
}
/**
* blk_complete_request - end I/O on a request
* @req: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions,
* unless the driver actually implements this in its completion callback
* through requeueing. The actual completion happens out-of-order,
* through a softirq handler. The user must have registered a completion
* callback through blk_queue_softirq_done().
**/
void blk_complete_request(struct request *req)
{
if (unlikely(blk_should_fake_timeout(req->q)))
return;
if (!blk_mark_rq_complete(req))
__blk_complete_request(req);
}
EXPORT_SYMBOL(blk_complete_request);
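/*
* Example (illustrative sketch, not part of the original file): the
* expected pairing.  The driver registers a completion callback with
* blk_queue_softirq_done() at init time and calls blk_complete_request()
* from its hard-IRQ path; the heavier end-of-request work then runs in
* BLOCK_SOFTIRQ context via the callback.  The helper names are made up.
*/
static void example_softirq_done(struct request *rq)
{
/* runs from blk_done_softirq(), not from the hard IRQ */
blk_end_request_all(rq, 0);
}
static void example_init_completions(struct request_queue *q)
{
blk_queue_softirq_done(q, example_softirq_done);
}
static void example_irq_saw_completion(struct request *rq)
{
/* cheap part of completion; defers the rest to the softirq */
blk_complete_request(rq);
}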
static __init int blk_softirq_init(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
register_hotcpu_notifier(&blk_cpu_notifier);
return 0;
}
subsys_initcall(blk_softirq_init);

612
block/blk-sysfs.c Normal file
View file

@ -0,0 +1,612 @@
/*
* Functions related to sysfs handling
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-mq.h"
struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct request_queue *, char *);
ssize_t (*store)(struct request_queue *, const char *, size_t);
};
static ssize_t
queue_var_show(unsigned long var, char *page)
{
return sprintf(page, "%lu\n", var);
}
static ssize_t
queue_var_store(unsigned long *var, const char *page, size_t count)
{
int err;
unsigned long v;
err = kstrtoul(page, 10, &v);
if (err || v > UINT_MAX)
return -EINVAL;
*var = v;
return count;
}
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
return queue_var_show(q->nr_requests, (page));
}
static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
if (!q->request_fn && !q->mq_ops)
return -EINVAL;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
if (q->request_fn)
err = blk_update_nr_requests(q, nr);
else
err = blk_mq_update_nr_requests(q, nr);
if (err)
return err;
return ret;
}
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
unsigned long ra_kb = q->backing_dev_info.ra_pages <<
(PAGE_CACHE_SHIFT - 10);
return queue_var_show(ra_kb, (page));
}
static ssize_t
queue_ra_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
return ret;
}
static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
{
int max_sectors_kb = queue_max_sectors(q) >> 1;
return queue_var_show(max_sectors_kb, (page));
}
static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_max_segments(q), (page));
}
static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.max_integrity_segments, (page));
}
static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
{
if (blk_queue_cluster(q))
return queue_var_show(queue_max_segment_size(q), (page));
return queue_var_show(PAGE_CACHE_SIZE, (page));
}
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_logical_block_size(q), page);
}
static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_physical_block_size(q), page);
}
static ssize_t queue_io_min_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_io_min(q), page);
}
static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_io_opt(q), page);
}
static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.discard_granularity, page);
}
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)q->limits.max_discard_sectors << 9);
}
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_discard_zeroes_data(q), page);
}
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)q->limits.max_write_same_sectors << 9);
}
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
return -EINVAL;
spin_lock_irq(q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
spin_unlock_irq(q->queue_lock);
return ret;
}
static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
{
int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
return queue_var_show(max_hw_sectors_kb, (page));
}
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
queue_show_##name(struct request_queue *q, char *page) \
{ \
int bit; \
bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
return queue_var_show(neg ? !bit : bit, page); \
} \
static ssize_t \
queue_store_##name(struct request_queue *q, const char *page, size_t count) \
{ \
unsigned long val; \
ssize_t ret; \
ret = queue_var_store(&val, page, count); \
if (ret < 0) \
return ret; \
if (neg) \
val = !val; \
\
spin_lock_irq(q->queue_lock); \
if (val) \
queue_flag_set(QUEUE_FLAG_##flag, q); \
else \
queue_flag_clear(QUEUE_FLAG_##flag, q); \
spin_unlock_irq(q->queue_lock); \
return ret; \
}
QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
return queue_var_show((blk_queue_nomerges(q) << 1) |
blk_queue_noxmerges(q), page);
}
static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
size_t count)
{
unsigned long nm;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
spin_unlock_irq(q->queue_lock);
return ret;
}
static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
return queue_var_show(set << force, page);
}
static ssize_t
queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
{
ssize_t ret = -EINVAL;
#ifdef CONFIG_SMP
unsigned long val;
ret = queue_var_store(&val, page, count);
if (ret < 0)
return ret;
spin_lock_irq(q->queue_lock);
if (val == 2) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
} else if (val == 1) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
} else if (val == 0) {
queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
spin_unlock_irq(q->queue_lock);
#endif
return ret;
}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
.store = queue_requests_store,
};
static struct queue_sysfs_entry queue_ra_entry = {
.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
.show = queue_ra_show,
.store = queue_ra_store,
};
static struct queue_sysfs_entry queue_max_sectors_entry = {
.attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
.show = queue_max_sectors_show,
.store = queue_max_sectors_store,
};
static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
.attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
.show = queue_max_hw_sectors_show,
};
static struct queue_sysfs_entry queue_max_segments_entry = {
.attr = {.name = "max_segments", .mode = S_IRUGO },
.show = queue_max_segments_show,
};
static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
.show = queue_max_integrity_segments_show,
};
static struct queue_sysfs_entry queue_max_segment_size_entry = {
.attr = {.name = "max_segment_size", .mode = S_IRUGO },
.show = queue_max_segment_size_show,
};
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
.store = elv_iosched_store,
};
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_logical_block_size_entry = {
.attr = {.name = "logical_block_size", .mode = S_IRUGO },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_physical_block_size_entry = {
.attr = {.name = "physical_block_size", .mode = S_IRUGO },
.show = queue_physical_block_size_show,
};
static struct queue_sysfs_entry queue_io_min_entry = {
.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
.show = queue_io_min_show,
};
static struct queue_sysfs_entry queue_io_opt_entry = {
.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
.show = queue_io_opt_show,
};
static struct queue_sysfs_entry queue_discard_granularity_entry = {
.attr = {.name = "discard_granularity", .mode = S_IRUGO },
.show = queue_discard_granularity_show,
};
static struct queue_sysfs_entry queue_discard_max_entry = {
.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
.show = queue_discard_max_show,
};
static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
.show = queue_discard_zeroes_data_show,
};
static struct queue_sysfs_entry queue_write_same_max_entry = {
.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
.show = queue_write_same_max_show,
};
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
.store = queue_store_nonrot,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
.show = queue_nomerges_show,
.store = queue_nomerges_store,
};
static struct queue_sysfs_entry queue_rq_affinity_entry = {
.attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
.show = queue_rq_affinity_show,
.store = queue_rq_affinity_store,
};
static struct queue_sysfs_entry queue_iostats_entry = {
.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_iostats,
.store = queue_store_iostats,
};
static struct queue_sysfs_entry queue_random_entry = {
.attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_random,
.store = queue_store_random,
};
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
&queue_io_min_entry.attr,
&queue_io_opt_entry.attr,
&queue_discard_granularity_entry.attr,
&queue_discard_max_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
&queue_random_entry.attr,
NULL,
};
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
ssize_t res;
if (!entry->show)
return -EIO;
mutex_lock(&q->sysfs_lock);
if (blk_queue_dying(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
res = entry->show(q, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t
queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q;
ssize_t res;
if (!entry->store)
return -EIO;
q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
if (blk_queue_dying(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
kmem_cache_free(blk_requestq_cachep, q);
}
/**
* blk_release_queue: - release a &struct request_queue when it is no longer needed
* @kobj: the kobj belonging to the request queue to be released
*
* Description:
* blk_release_queue is the pair to blk_init_queue() or
* blk_queue_make_request(). It should be called when a request queue is
* being released; typically when a block device is being de-registered.
 * Currently, its primary task is to free all the &struct request
* structures that were allocated to the queue and the queue itself.
*
* Note:
* The low level driver must have finished any outstanding requests first
* via blk_cleanup_queue().
**/
static void blk_release_queue(struct kobject *kobj)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
blkcg_exit_queue(q);
if (q->elevator) {
spin_lock_irq(q->queue_lock);
ioc_clear_queue(q);
spin_unlock_irq(q->queue_lock);
elevator_exit(q->elevator);
}
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
__blk_queue_free_tags(q);
if (!q->mq_ops)
blk_free_flush_queue(q->fq);
blk_trace_shutdown(q);
bdi_destroy(&q->backing_dev_info);
ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
};
struct kobj_type blk_queue_ktype = {
.sysfs_ops = &queue_sysfs_ops,
.default_attrs = default_attrs,
.release = blk_release_queue,
};
int blk_register_queue(struct gendisk *disk)
{
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
if (WARN_ON(!q))
return -ENXIO;
/*
* SCSI probing may synchronously create and destroy a lot of
* request_queues for non-existent devices. Shutting down a fully
 * functional queue takes measurable wallclock time as RCU grace
* periods are involved. To avoid excessive latency in these
* cases, a request_queue starts out in a degraded mode which is
* faster to shut down and is made fully functional here as
* request_queues for non-existent devices never get registered.
*/
if (!blk_queue_init_done(q)) {
queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
blk_queue_bypass_end(q);
if (q->mq_ops)
blk_mq_finish_init(q);
}
ret = blk_trace_init_sysfs(dev);
if (ret)
return ret;
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
blk_trace_remove_sysfs(dev);
return ret;
}
kobject_uevent(&q->kobj, KOBJ_ADD);
if (q->mq_ops)
blk_mq_register_disk(disk);
if (!q->request_fn)
return 0;
ret = elv_register_queue(q);
if (ret) {
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
return 0;
}
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
if (WARN_ON(!q))
return;
if (q->mq_ops)
blk_mq_unregister_disk(disk);
if (q->request_fn)
elv_unregister_queue(q);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
kobject_put(&disk_to_dev(disk)->kobj);
}

385
block/blk-tag.c Normal file
View file

@ -0,0 +1,385 @@
/*
* Functions related to tagged command queuing
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk.h"
/**
* blk_queue_find_tag - find a request by its tag and queue
* @q: The request queue for the device
* @tag: The tag of the request
*
* Notes:
* Should be used when a device returns a tag and you want to match
* it with a request.
*
* no locks need be held.
**/
struct request *blk_queue_find_tag(struct request_queue *q, int tag)
{
return blk_map_queue_find_tag(q->queue_tags, tag);
}
EXPORT_SYMBOL(blk_queue_find_tag);
/**
* blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
* Drop the reference count on @bqt and frees it when the last reference
* is dropped.
*/
void blk_free_tags(struct blk_queue_tag *bqt)
{
if (atomic_dec_and_test(&bqt->refcnt)) {
BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
bqt->max_depth);
kfree(bqt->tag_index);
bqt->tag_index = NULL;
kfree(bqt->tag_map);
bqt->tag_map = NULL;
kfree(bqt);
}
}
EXPORT_SYMBOL(blk_free_tags);
/**
* __blk_queue_free_tags - release tag maintenance info
* @q: the request queue for the device
*
* Notes:
* blk_cleanup_queue() will take care of calling this function, if tagging
* has been used. So there's no need to call this directly.
**/
void __blk_queue_free_tags(struct request_queue *q)
{
struct blk_queue_tag *bqt = q->queue_tags;
if (!bqt)
return;
blk_free_tags(bqt);
q->queue_tags = NULL;
queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
}
/**
* blk_queue_free_tags - release tag maintenance info
* @q: the request queue for the device
*
* Notes:
 *  This is used to disable tagged queuing on a device, while leaving
 *  the queue itself functional.
**/
void blk_queue_free_tags(struct request_queue *q)
{
queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
}
EXPORT_SYMBOL(blk_queue_free_tags);
static int
init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
{
struct request **tag_index;
unsigned long *tag_map;
int nr_ulongs;
if (q && depth > q->nr_requests * 2) {
depth = q->nr_requests * 2;
printk(KERN_ERR "%s: adjusted depth to %d\n",
__func__, depth);
}
tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
if (!tag_index)
goto fail;
nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
if (!tag_map)
goto fail;
tags->real_max_depth = depth;
tags->max_depth = depth;
tags->tag_index = tag_index;
tags->tag_map = tag_map;
return 0;
fail:
kfree(tag_index);
return -ENOMEM;
}
static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
int depth)
{
struct blk_queue_tag *tags;
tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
if (!tags)
goto fail;
if (init_tag_map(q, tags, depth))
goto fail;
atomic_set(&tags->refcnt, 1);
return tags;
fail:
kfree(tags);
return NULL;
}
/**
* blk_init_tags - initialize the tag info for an external tag map
* @depth: the maximum queue depth supported
**/
struct blk_queue_tag *blk_init_tags(int depth)
{
return __blk_queue_init_tags(NULL, depth);
}
EXPORT_SYMBOL(blk_init_tags);
/**
* blk_queue_init_tags - initialize the queue tag info
* @q: the request queue for the device
* @depth: the maximum queue depth supported
* @tags: the tag to use
*
* Queue lock must be held here if the function is called to resize an
* existing map.
**/
int blk_queue_init_tags(struct request_queue *q, int depth,
struct blk_queue_tag *tags)
{
int rc;
BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
if (!tags && !q->queue_tags) {
tags = __blk_queue_init_tags(q, depth);
if (!tags)
return -ENOMEM;
} else if (q->queue_tags) {
rc = blk_queue_resize_tags(q, depth);
if (rc)
return rc;
queue_flag_set(QUEUE_FLAG_QUEUED, q);
return 0;
} else
atomic_inc(&tags->refcnt);
/*
* assign it, all done
*/
q->queue_tags = tags;
queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
INIT_LIST_HEAD(&q->tag_busy_list);
return 0;
}
EXPORT_SYMBOL(blk_queue_init_tags);
/**
* blk_queue_resize_tags - change the queueing depth
* @q: the request queue for the device
* @new_depth: the new max command queueing depth
*
* Notes:
* Must be called with the queue lock held.
**/
int blk_queue_resize_tags(struct request_queue *q, int new_depth)
{
struct blk_queue_tag *bqt = q->queue_tags;
struct request **tag_index;
unsigned long *tag_map;
int max_depth, nr_ulongs;
if (!bqt)
return -ENXIO;
/*
* If real_max_depth is already large enough, just adjust
* max_depth. *NOTE* requests with tag values between
* new_depth and real_max_depth can still be in flight, so the
* tag map cannot be shrunk blindly here.
*/
if (new_depth <= bqt->real_max_depth) {
bqt->max_depth = new_depth;
return 0;
}
/*
* Currently cannot replace a shared tag map with a new
* one, so error out if this is the case
*/
if (atomic_read(&bqt->refcnt) != 1)
return -EBUSY;
/*
* save the old state info, so we can copy it back
*/
tag_index = bqt->tag_index;
tag_map = bqt->tag_map;
max_depth = bqt->real_max_depth;
if (init_tag_map(q, bqt, new_depth))
return -ENOMEM;
memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
kfree(tag_index);
kfree(tag_map);
return 0;
}
EXPORT_SYMBOL(blk_queue_resize_tags);
/**
* blk_queue_end_tag - end tag operations for a request
* @q: the request queue for the device
* @rq: the request that has completed
*
* Description:
* Typically called when end_that_request_first() returns %0, meaning
* all transfers have been done for a request. It's important to call
* this function before end_that_request_last(), as that will put the
* request back on the free list thus corrupting the internal tag list.
*
* Notes:
* queue lock must be held.
**/
void blk_queue_end_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
unsigned tag = rq->tag; /* negative tags invalid */
BUG_ON(tag >= bqt->real_max_depth);
list_del_init(&rq->queuelist);
rq->cmd_flags &= ~REQ_QUEUED;
rq->tag = -1;
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
__func__, tag);
bqt->tag_index[tag] = NULL;
if (unlikely(!test_bit(tag, bqt->tag_map))) {
printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
__func__, tag);
return;
}
/*
* The tag_map bit acts as a lock for tag_index[bit], so we need
* unlock memory barrier semantics.
*/
clear_bit_unlock(tag, bqt->tag_map);
}
EXPORT_SYMBOL(blk_queue_end_tag);
/**
* blk_queue_start_tag - find a free tag and assign it
* @q: the request queue for the device
* @rq: the block request that needs tagging
*
* Description:
* This can either be used as a stand-alone helper, or possibly be
* assigned as the queue &prep_rq_fn (in which case &struct request
* automagically gets a tag assigned). Note that this function
 *  assumes that any type of request can be queued! If this is not
 *  true for your device, you must check the request type before
 *  calling this function.  The request will also be removed from
 *  the request queue, so it's the driver's responsibility to re-add
 *  it if it needs to be restarted for some reason.
*
* Notes:
* queue lock must be held.
**/
int blk_queue_start_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
unsigned max_depth;
int tag;
if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
printk(KERN_ERR
"%s: request %p for device [%s] already tagged %d",
__func__, rq,
rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
BUG();
}
/*
* Protect against shared tag maps, as we may not have exclusive
* access to the tag map.
*
* We reserve a few tags just for sync IO, since we don't want
* to starve sync IO on behalf of flooding async IO.
*/
max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) {
switch (max_depth) {
case 2:
max_depth = 1;
break;
case 3:
max_depth = 2;
break;
default:
max_depth -= 2;
}
if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1;
}
do {
tag = find_first_zero_bit(bqt->tag_map, max_depth);
if (tag >= max_depth)
return 1;
} while (test_and_set_bit_lock(tag, bqt->tag_map));
/*
* We need lock ordering semantics given by test_and_set_bit_lock.
* See blk_queue_end_tag for details.
*/
rq->cmd_flags |= REQ_QUEUED;
rq->tag = tag;
bqt->tag_index[tag] = rq;
blk_start_request(rq);
list_add(&rq->queuelist, &q->tag_busy_list);
return 0;
}
EXPORT_SYMBOL(blk_queue_start_tag);
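/*
* Example (illustrative sketch, not part of the original file): the shape
* of a request_fn for a tag-aware driver whose queue was set up with
* blk_queue_init_tags().  blk_queue_start_tag() dequeues and starts the
* request itself, so the driver only peeks; on completion the tag is
* released before the request is ended.  Both paths run under the queue
* lock, as the tag helpers expect.
*/
static void example_request_fn(struct request_queue *q)
{
struct request *rq;
while ((rq = blk_peek_request(q)) != NULL) {
if (blk_queue_start_tag(q, rq))
break;	/* out of tags, retry on the next run */
/* hand rq (now carrying rq->tag) to the hardware here */
}
}
static void example_complete_tagged(struct request_queue *q, struct request *rq)
{
blk_queue_end_tag(q, rq);
__blk_end_request_all(rq, 0);
}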
/**
* blk_queue_invalidate_tags - invalidate all pending tags
* @q: the request queue for the device
*
* Description:
* Hardware conditions may dictate a need to stop all pending requests.
* In this case, we will safely clear the block side of the tag queue and
 *   re-add all requests to the request queue in the right order.
*
* Notes:
* queue lock must be held.
**/
void blk_queue_invalidate_tags(struct request_queue *q)
{
struct list_head *tmp, *n;
list_for_each_safe(tmp, n, &q->tag_busy_list)
blk_requeue_request(q, list_entry_rq(tmp));
}
EXPORT_SYMBOL(blk_queue_invalidate_tags);

1699
block/blk-throttle.c Normal file

File diff suppressed because it is too large

232
block/blk-timeout.c Normal file
View file

@ -0,0 +1,232 @@
/*
* Functions related to generic timeout handling of requests.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/fault-inject.h>
#include "blk.h"
#include "blk-mq.h"
#ifdef CONFIG_FAIL_IO_TIMEOUT
static DECLARE_FAULT_ATTR(fail_io_timeout);
static int __init setup_fail_io_timeout(char *str)
{
return setup_fault_attr(&fail_io_timeout, str);
}
__setup("fail_io_timeout=", setup_fail_io_timeout);
int blk_should_fake_timeout(struct request_queue *q)
{
if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
return 0;
return should_fail(&fail_io_timeout, 1);
}
static int __init fail_io_timeout_debugfs(void)
{
struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
NULL, &fail_io_timeout);
return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_io_timeout_debugfs);
ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
return sprintf(buf, "%d\n", set != 0);
}
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
int val;
if (count) {
struct request_queue *q = disk->queue;
char *p = (char *) buf;
val = simple_strtoul(p, &p, 10);
spin_lock_irq(q->queue_lock);
if (val)
queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
else
queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
spin_unlock_irq(q->queue_lock);
}
return count;
}
#endif /* CONFIG_FAIL_IO_TIMEOUT */
/*
* blk_delete_timer - Delete/cancel timer for a given function.
* @req: request that we are canceling timer for
*
*/
void blk_delete_timer(struct request *req)
{
list_del_init(&req->timeout_list);
}
static void blk_rq_timed_out(struct request *req)
{
struct request_queue *q = req->q;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
if (q->rq_timed_out_fn)
ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
/* Can we use req->errors here? */
__blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
blk_add_timer(req);
blk_clear_rq_complete(req);
break;
case BLK_EH_NOT_HANDLED:
/*
* LLD handles this for now but in the future
* we can send a request msg to abort the command
* and we can move more of the generic scsi eh code to
* the blk layer.
*/
break;
default:
printk(KERN_ERR "block: bad eh return: %d\n", ret);
break;
}
}
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set)
{
if (time_after_eq(jiffies, rq->deadline)) {
list_del_init(&rq->timeout_list);
/*
* Check if we raced with end io completion
*/
if (!blk_mark_rq_complete(rq))
blk_rq_timed_out(rq);
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
*next_timeout = rq->deadline;
*next_set = 1;
}
}
void blk_rq_timed_out_timer(unsigned long data)
{
struct request_queue *q = (struct request_queue *) data;
unsigned long flags, next = 0;
struct request *rq, *tmp;
int next_set = 0;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
blk_rq_check_expired(rq, &next, &next_set);
if (next_set)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
}
/**
* blk_abort_request -- Request request recovery for the specified command
* @req: pointer to the request of interest
*
* This function requests that the block layer start recovery for the
* request by deleting the timer and calling the q's timeout function.
* LLDDs who implement their own error recovery MAY ignore the timeout
 * event if they generated blk_abort_request. Must hold queue lock.
*/
void blk_abort_request(struct request *req)
{
if (blk_mark_rq_complete(req))
return;
blk_delete_timer(req);
if (req->q->mq_ops)
blk_mq_rq_timed_out(req, false);
else
blk_rq_timed_out(req);
}
EXPORT_SYMBOL_GPL(blk_abort_request);
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
return timeout;
}
/**
* blk_add_timer - Start timeout timer for a single request
* @req: request that is about to start running.
*
* Notes:
* Each request has its own timer, and as it is added to the queue, we
* set up the timer. When the request completes, we cancel the timer.
*/
void blk_add_timer(struct request *req)
{
struct request_queue *q = req->q;
unsigned long expiry;
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
if (!q->mq_ops && !q->rq_timed_out_fn)
return;
BUG_ON(!list_empty(&req->timeout_list));
/*
* Some LLDs, like scsi, peek at the timeout to prevent a
* command from being retried forever.
*/
if (!req->timeout)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
if (!q->mq_ops)
list_add_tail(&req->timeout_list, &req->q->timeout_list);
/*
* If the timer isn't already pending or this timeout is earlier
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
unsigned long diff = q->timeout.expires - expiry;
/*
* Due to added timer slack to group timers, the timer
* will often be a little in front of what we asked for.
* So apply some tolerance here too, otherwise we keep
* modifying the timer because expires for value X
* will be X + something.
*/
if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
mod_timer(&q->timeout, expiry);
}
}
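/*
* Example (illustrative sketch, not part of the original file): a driver
* opts into timeout handling by installing a handler and a default
* timeout at queue-setup time; blk_add_timer() is then armed by the
* block layer as each request is started.  blk_queue_rq_timed_out() and
* blk_queue_rq_timeout() are assumed to have their usual prototypes from
* <linux/blkdev.h>; the 30 second value is arbitrary.
*/
static enum blk_eh_timer_return example_timed_out(struct request *rq)
{
/* give a stuck command one more interval before escalating */
return BLK_EH_RESET_TIMER;
}
static void example_setup_timeouts(struct request_queue *q)
{
blk_queue_rq_timed_out(q, example_timed_out);
blk_queue_rq_timeout(q, 30 * HZ);
}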

284
block/blk.h Normal file
View file

@ -0,0 +1,284 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME (HZ/50UL)
/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ 32
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
struct blk_flush_queue {
unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
struct request *flush_rq;
spinlock_t mq_flush_lock;
};
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *blk_get_flush_queue(
struct request_queue *q, struct blk_mq_ctx *ctx)
{
struct blk_mq_hw_ctx *hctx;
if (!q->mq_ops)
return q->fq;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
return hctx->fq;
}
static inline void __blk_get_queue(struct request_queue *q)
{
kobject_get(&q->kobj);
}
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
int node, int cmd_size);
void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_list *rl);
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_rq_timed_out_timer(unsigned long data);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count);
void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req);
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
};
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
* sure that only one of them succeeds
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
/*
* Internal elevator interface
*/
#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
void blk_insert_flush(struct request *rq);
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
while (1) {
if (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
return rq;
}
/*
* A flush request is in flight and flush requests aren't queueable
* in the drive, so we can hold the queue until the flush request
* finishes. Even if we didn't do this, the driver couldn't dispatch
* further requests and would just requeue them; holding the queue can
* also improve throughput. For example, with requests flush1, write1,
* flush2: flush1 is dispatched, the queue is held and write1 isn't
* inserted. After flush1 finishes, flush2 is dispatched; since the
* disk cache is already clean, flush2 completes very quickly, so it
* effectively gets folded into flush1.
* While the queue is held, a flag is set to indicate that it should
* be restarted later. See flush_end_io() for details.
*/
if (fq->flush_pending_idx != fq->flush_running_idx &&
!queue_flush_queueable(q)) {
fq->flush_queue_delayed = 1;
return NULL;
}
if (unlikely(blk_queue_bypass(q)) ||
!q->elevator->type->ops.elevator_dispatch_fn(q, 0))
return NULL;
}
}
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->type->ops.elevator_activate_req_fn)
e->type->ops.elevator_activate_req_fn(q, rq);
}
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->type->ops.elevator_deactivate_req_fn)
e->type->ops.elevator_deactivate_req_fn(q, rq);
}
#ifdef CONFIG_FAIL_IO_TIMEOUT
int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
#else
static inline int blk_should_fake_timeout(struct request_queue *q)
{
return 0;
}
#endif
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
int blk_try_merge(struct request *rq, struct bio *bio);
void blk_queue_congestion_threshold(struct request_queue *q);
void __blk_run_queue_uncond(struct request_queue *q);
int blk_dev_init(void);
/*
* Return the threshold (number of used requests) at which the queue is
 * considered to be congested.  It includes a little hysteresis to keep the
* context switch rate down.
*/
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
return q->nr_congestion_on;
}
/*
* The threshold at which a queue is considered to be uncongested
*/
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
return q->nr_congestion_off;
}
extern int blk_update_nr_requests(struct request_queue *, unsigned int);
/*
* Contribute to IO statistics IFF:
*
* a) it's attached to a gendisk, and
* b) the queue had IO stats enabled when this request was started, and
* c) it's a file system request
*/
static inline int blk_do_io_stat(struct request *rq)
{
return rq->rq_disk &&
(rq->cmd_flags & REQ_IO_STAT) &&
(rq->cmd_type == REQ_TYPE_FS);
}
/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask);
void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/**
* create_io_context - try to create task->io_context
* @gfp_mask: allocation mask
* @node: allocation node
*
* If %current->io_context is %NULL, allocate a new io_context and install
* it. Returns the current %current->io_context which may be %NULL if
* allocation failed.
*
* Note that this function can't be called with IRQ disabled because
* task_lock which protects %current->io_context is IRQ-unsafe.
*/
static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
{
WARN_ON_ONCE(irqs_disabled());
if (unlikely(!current->io_context))
create_task_io_context(current, gfp_mask, node);
return current->io_context;
}
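/*
* Example (illustrative sketch, not part of the original header): the
* typical call-site pattern in a request allocation path.  The io_context
* is created opportunistically, with IRQs enabled, before the caller may
* go to sleep; callers must still cope with current->io_context being
* NULL afterwards.
*/
static inline void example_prepare_ioc(struct request_queue *q, gfp_t gfp_mask)
{
create_io_context(gfp_mask, q->node);
}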
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
return false;
}
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#endif /* BLK_INTERNAL_H */

290
block/bounce.c Normal file
View file

@ -0,0 +1,290 @@
/* bounce buffer handling for block devices
*
* - Split from highmem.c
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static mempool_t *page_pool, *isa_page_pool;
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
static __init int init_emergency_pool(void)
{
#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
pr_info("pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
#endif
#ifdef CONFIG_HIGHMEM
/*
* highmem version, map in to vec
*/
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
unsigned long flags;
unsigned char *vto;
local_irq_save(flags);
vto = kmap_atomic(to->bv_page);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto);
local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */
#define bounce_copy_vec(to, vfrom) \
memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
#endif /* CONFIG_HIGHMEM */
/*
* allocate pages in the DMA region for the ISA pool
*/
static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}
/*
 * Gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA
* as the max address, so check if the pool has already been created.
*/
int init_emergency_isa_pool(void)
{
if (isa_page_pool)
return 0;
isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
mempool_free_pages, (void *) 0);
BUG_ON(!isa_page_pool);
pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
return 0;
}
/*
* Simple bounce buffer support for highmem pages. Depending on the
* queue gfp mask set, *to may or may not be a highmem page. kmap it
* always, it will do the Right Thing
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
struct bio_vec tovec, *fromvec = from->bi_io_vec;
struct bvec_iter iter;
bio_for_each_segment(tovec, to, iter) {
if (tovec.bv_page != fromvec->bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
 * copy; bounce_copy_vec already uses tovec->bv_len
*/
vfrom = page_address(fromvec->bv_page) +
tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page);
}
fromvec++;
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec;
int i;
if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
org_vec = bio_orig->bi_io_vec + i;
if (bvec->bv_page == org_vec->bv_page)
continue;
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
}
bio_endio(bio_orig, err);
bio_put(bio);
}
static void bounce_end_io_write(struct bio *bio, int err)
{
bounce_end_io(bio, page_pool, err);
}
static void bounce_end_io_write_isa(struct bio *bio, int err)
{
bounce_end_io(bio, isa_page_pool, err);
}
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
copy_to_high_bio_irq(bio_orig, bio);
bounce_end_io(bio, pool, err);
}
static void bounce_end_io_read(struct bio *bio, int err)
{
__bounce_end_io_read(bio, page_pool, err);
}
static void bounce_end_io_read_isa(struct bio *bio, int err)
{
__bounce_end_io_read(bio, isa_page_pool, err);
}
#ifdef CONFIG_NEED_BOUNCE_POOL
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
{
if (bio_data_dir(bio) != WRITE)
return 0;
if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
return 0;
return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
}
#else
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
{
return 0;
}
#endif /* CONFIG_NEED_BOUNCE_POOL */
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
mempool_t *pool, int force)
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
unsigned i;
if (force)
goto bounce;
bio_for_each_segment(from, *bio_orig, iter)
if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
goto bounce;
return;
bounce:
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
continue;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
to->bv_page = mempool_alloc(pool, q->bounce_gfp);
if (rw == WRITE) {
char *vto, *vfrom;
flush_dcache_page(page);
vto = page_address(to->bv_page) + to->bv_offset;
vfrom = kmap_atomic(page) + to->bv_offset;
memcpy(vto, vfrom, to->bv_len);
kunmap_atomic(vfrom);
}
}
trace_block_bio_bounce(q, *bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
if (pool == page_pool) {
bio->bi_end_io = bounce_end_io_write;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read;
} else {
bio->bi_end_io = bounce_end_io_write_isa;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read_isa;
}
bio->bi_private = *bio_orig;
*bio_orig = bio;
}
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
{
int must_bounce;
mempool_t *pool;
/*
* Data-less bio, nothing to bounce
*/
if (!bio_has_data(*bio_orig))
return;
must_bounce = must_snapshot_stable_pages(q, *bio_orig);
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
return;
pool = page_pool;
} else {
BUG_ON(!isa_page_pool);
pool = isa_page_pool;
}
/*
* slow path
*/
__blk_queue_bounce(q, bio_orig, pool, must_bounce);
}
EXPORT_SYMBOL(blk_queue_bounce);
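/*
* Example (illustrative sketch, not part of the original file): a driver
* whose hardware cannot DMA to highmem sets a bounce limit when creating
* its queue, and a bio-based driver that builds requests itself calls
* blk_queue_bounce() before mapping the bio for DMA.  Function names are
* made up; blk_queue_bounce_limit() and BLK_BOUNCE_HIGH are assumed from
* <linux/blkdev.h>.
*/
static void example_limit_to_lowmem(struct request_queue *q)
{
/* pages above the limit will be bounced through the page_pool */
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
static void example_make_request(struct request_queue *q, struct bio *bio)
{
blk_queue_bounce(q, &bio);
/* from here on, bio only references pages the device can reach */
}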

232
block/bsg-lib.c Normal file
View file

@ -0,0 +1,232 @@
/*
* BSG helper library
*
* Copyright (C) 2008 James Smart, Emulex Corporation
* Copyright (C) 2011 Red Hat, Inc. All rights reserved.
* Copyright (C) 2011 Mike Christie
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/bsg-lib.h>
#include <linux/export.h>
#include <scsi/scsi_cmnd.h>
/**
* bsg_destroy_job - routine to teardown/delete a bsg job
* @job: bsg_job that is to be torn down
*/
static void bsg_destroy_job(struct bsg_job *job)
{
put_device(job->dev); /* release reference for the request */
kfree(job->request_payload.sg_list);
kfree(job->reply_payload.sg_list);
kfree(job);
}
/**
* bsg_job_done - completion routine for bsg requests
* @job: bsg_job that is complete
* @result: job reply result
* @reply_payload_rcv_len: length of payload recvd
*
* The LLD should call this when the bsg job has completed.
*/
void bsg_job_done(struct bsg_job *job, int result,
unsigned int reply_payload_rcv_len)
{
struct request *req = job->req;
struct request *rsp = req->next_rq;
int err;
err = job->req->errors = result;
if (err < 0)
/* we're only returning the result field in the reply */
job->req->sense_len = sizeof(u32);
else
job->req->sense_len = job->reply_len;
/* we assume all request payload was transferred, residual == 0 */
req->resid_len = 0;
if (rsp) {
WARN_ON(reply_payload_rcv_len > rsp->resid_len);
/* set reply (bidi) residual */
rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
}
blk_complete_request(req);
}
EXPORT_SYMBOL_GPL(bsg_job_done);
/**
* bsg_softirq_done - softirq done routine for destroying the bsg requests
* @rq: BSG request that holds the job to be destroyed
*/
static void bsg_softirq_done(struct request *rq)
{
struct bsg_job *job = rq->special;
blk_end_request_all(rq, rq->errors);
bsg_destroy_job(job);
}
static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
{
size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
BUG_ON(!req->nr_phys_segments);
buf->sg_list = kzalloc(sz, GFP_KERNEL);
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
buf->payload_len = blk_rq_bytes(req);
return 0;
}
/**
* bsg_create_job - create the bsg_job structure for the bsg request
* @dev: device that is being sent the bsg request
* @req: BSG request that needs a job structure
*/
static int bsg_create_job(struct device *dev, struct request *req)
{
struct request *rsp = req->next_rq;
struct request_queue *q = req->q;
struct bsg_job *job;
int ret;
BUG_ON(req->special);
job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
if (!job)
return -ENOMEM;
req->special = job;
job->req = req;
if (q->bsg_job_size)
job->dd_data = (void *)&job[1];
job->request = req->cmd;
job->request_len = req->cmd_len;
job->reply = req->sense;
job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer
* allocated */
if (req->bio) {
ret = bsg_map_buffer(&job->request_payload, req);
if (ret)
goto failjob_rls_job;
}
if (rsp && rsp->bio) {
ret = bsg_map_buffer(&job->reply_payload, rsp);
if (ret)
goto failjob_rls_rqst_payload;
}
job->dev = dev;
/* take a reference for the request */
get_device(job->dev);
return 0;
failjob_rls_rqst_payload:
kfree(job->request_payload.sg_list);
failjob_rls_job:
kfree(job);
return -ENOMEM;
}
/**
* bsg_request_fn - generic handler for bsg requests
* @q: request queue to manage
*
 * On error, the bsg_create_job() function returns a -Exyz error value
 * that will be stored in req->errors.
*
* Drivers/subsys should pass this to the queue init function.
*/
void bsg_request_fn(struct request_queue *q)
{
struct device *dev = q->queuedata;
struct request *req;
struct bsg_job *job;
int ret;
if (!get_device(dev))
return;
while (1) {
req = blk_fetch_request(q);
if (!req)
break;
spin_unlock_irq(q->queue_lock);
ret = bsg_create_job(dev, req);
if (ret) {
req->errors = ret;
blk_end_request_all(req, ret);
spin_lock_irq(q->queue_lock);
continue;
}
job = req->special;
ret = q->bsg_job_fn(job);
spin_lock_irq(q->queue_lock);
if (ret)
break;
}
spin_unlock_irq(q->queue_lock);
put_device(dev);
spin_lock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(bsg_request_fn);
/**
* bsg_setup_queue - Create and add the bsg hooks so we can receive requests
* @dev: device to attach bsg device to
* @q: request queue setup by caller
* @name: device to give bsg device
* @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job
*
 * The caller should have set up the request queue with bsg_request_fn
* as the request_fn.
*/
int bsg_setup_queue(struct device *dev, struct request_queue *q,
char *name, bsg_job_fn *job_fn, int dd_job_size)
{
int ret;
q->queuedata = dev;
q->bsg_job_size = dd_job_size;
q->bsg_job_fn = job_fn;
queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
blk_queue_softirq_done(q, bsg_softirq_done);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
ret = bsg_register_queue(q, dev, name, NULL);
if (ret) {
printk(KERN_ERR "%s: bsg interface failed to "
"initialize - register queue\n", dev->kobj.name);
return ret;
}
return 0;
}
EXPORT_SYMBOL_GPL(bsg_setup_queue);
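/*
* Example (illustrative sketch, not part of the original file): the
* minimal wiring an LLD does to receive bsg jobs.  The handler and the
* attach helper are hypothetical; a real handler would start the
* transport command described by job->request and call bsg_job_done()
* from its completion path rather than synchronously as shown here.
*/
static int example_bsg_job_fn(struct bsg_job *job)
{
bsg_job_done(job, 0, 0);
return 0;
}
static int example_attach_bsg(struct device *dev, char *name, spinlock_t *lock)
{
struct request_queue *q = blk_init_queue(bsg_request_fn, lock);
if (!q)
return -ENOMEM;
return bsg_setup_queue(dev, q, name, example_bsg_job_fn, 0);
}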

1119
block/bsg.c Normal file

File diff suppressed because it is too large

4675
block/cfq-iosched.c Normal file

File diff suppressed because it is too large

254
block/cmdline-parser.c Normal file
View file

@ -0,0 +1,254 @@
/*
* Parse command line, get partition information
*
* Written by Cai Zhiyong <caizhiyong@huawei.com>
*
*/
#include <linux/export.h>
#include <linux/cmdline-parser.h>
static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
{
int ret = 0;
struct cmdline_subpart *new_subpart;
*subpart = NULL;
new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
if (!new_subpart)
return -ENOMEM;
if (*partdef == '-') {
new_subpart->size = (sector_t)(~0ULL);
partdef++;
} else {
new_subpart->size = (sector_t)memparse(partdef, &partdef);
if (new_subpart->size < (sector_t)PAGE_SIZE) {
pr_warn("cmdline partition size is invalid.");
ret = -EINVAL;
goto fail;
}
}
if (*partdef == '@') {
partdef++;
new_subpart->from = (sector_t)memparse(partdef, &partdef);
} else {
new_subpart->from = (sector_t)(~0ULL);
}
if (*partdef == '(') {
int length;
char *next = strchr(++partdef, ')');
if (!next) {
pr_warn("cmdline partition format is invalid.");
ret = -EINVAL;
goto fail;
}
length = min_t(int, next - partdef,
sizeof(new_subpart->name) - 1);
strncpy(new_subpart->name, partdef, length);
new_subpart->name[length] = '\0';
partdef = ++next;
} else
new_subpart->name[0] = '\0';
new_subpart->flags = 0;
if (!strncmp(partdef, "ro", 2)) {
new_subpart->flags |= PF_RDONLY;
partdef += 2;
}
if (!strncmp(partdef, "lk", 2)) {
new_subpart->flags |= PF_POWERUP_LOCK;
partdef += 2;
}
*subpart = new_subpart;
return 0;
fail:
kfree(new_subpart);
return ret;
}
static void free_subpart(struct cmdline_parts *parts)
{
struct cmdline_subpart *subpart;
while (parts->subpart) {
subpart = parts->subpart;
parts->subpart = subpart->next_subpart;
kfree(subpart);
}
}
static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
{
int ret = -EINVAL;
char *next;
int length;
struct cmdline_subpart **next_subpart;
struct cmdline_parts *newparts;
char buf[BDEVNAME_SIZE + 32 + 4];
*parts = NULL;
newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
if (!newparts)
return -ENOMEM;
next = strchr(bdevdef, ':');
if (!next) {
pr_warn("cmdline partition has no block device.");
goto fail;
}
length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
strncpy(newparts->name, bdevdef, length);
newparts->name[length] = '\0';
newparts->nr_subparts = 0;
next_subpart = &newparts->subpart;
while (next && *(++next)) {
bdevdef = next;
next = strchr(bdevdef, ',');
length = (!next) ? (sizeof(buf) - 1) :
min_t(int, next - bdevdef, sizeof(buf) - 1);
strncpy(buf, bdevdef, length);
buf[length] = '\0';
ret = parse_subpart(next_subpart, buf);
if (ret)
goto fail;
newparts->nr_subparts++;
next_subpart = &(*next_subpart)->next_subpart;
}
if (!newparts->subpart) {
pr_warn("cmdline partition has no valid partition.");
ret = -EINVAL;
goto fail;
}
*parts = newparts;
return 0;
fail:
free_subpart(newparts);
kfree(newparts);
return ret;
}
void cmdline_parts_free(struct cmdline_parts **parts)
{
struct cmdline_parts *next_parts;
while (*parts) {
next_parts = (*parts)->next_parts;
free_subpart(*parts);
kfree(*parts);
*parts = next_parts;
}
}
EXPORT_SYMBOL(cmdline_parts_free);
int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
{
int ret;
char *buf;
char *pbuf;
char *next;
struct cmdline_parts **next_parts;
*parts = NULL;
next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
if (!buf)
return -ENOMEM;
next_parts = parts;
while (next && *pbuf) {
next = strchr(pbuf, ';');
if (next)
*next = '\0';
ret = parse_parts(next_parts, pbuf);
if (ret)
goto fail;
if (next)
pbuf = ++next;
next_parts = &(*next_parts)->next_parts;
}
if (!*parts) {
pr_warn("cmdline partition has no valid partition.");
ret = -EINVAL;
goto fail;
}
ret = 0;
done:
kfree(buf);
return ret;
fail:
cmdline_parts_free(parts);
goto done;
}
EXPORT_SYMBOL(cmdline_parts_parse);
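/*
 * Illustrative only -- the parser above accepts a boot argument of the form
 *
 *   <bdev>:<size>[@<offset>][(<name>)][ro][lk][,<size>...][;<bdev>:...]
 *
 * where <size> and <offset> are memparse() strings and "-" means "the rest
 * of the device".  A hedged caller sketch; the example_* name and the sample
 * string below are assumptions, not part of this file.
 */
static int example_parse_bootarg(void)
{
	struct cmdline_parts *parts;
	int ret;

	ret = cmdline_parts_parse(&parts,
			"mmcblk0:1G(boot),512M@2G(cache)ro,-(data)");
	if (ret)
		return ret;
	/* ... look up the disk with cmdline_parts_find() and apply it ... */
	cmdline_parts_free(&parts);
	return 0;
}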
struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
const char *bdev)
{
while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
parts = parts->next_parts;
return parts;
}
EXPORT_SYMBOL(cmdline_parts_find);
/*
* add_part()
* 0 success.
* 1 can not add so many partitions.
*/
int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
int slot,
int (*add_part)(int, struct cmdline_subpart *, void *),
void *param)
{
sector_t from = 0;
struct cmdline_subpart *subpart;
for (subpart = parts->subpart; subpart;
subpart = subpart->next_subpart, slot++) {
if (subpart->from == (sector_t)(~0ULL))
subpart->from = from;
else
from = subpart->from;
if (from >= disk_size)
break;
if (subpart->size > (disk_size - from))
subpart->size = disk_size - from;
from += subpart->size;
if (add_part(slot, subpart, param))
break;
}
return slot;
}
EXPORT_SYMBOL(cmdline_parts_set);
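/*
 * Illustrative only -- a hedged sketch of the add_part() callback expected
 * by cmdline_parts_set() above.  EXAMPLE_MAX_PARTS and the example_* name
 * are assumptions, not part of this file.
 */
#define EXAMPLE_MAX_PARTS	16

static int example_add_part(int slot, struct cmdline_subpart *subpart,
			    void *param)
{
	if (slot >= EXAMPLE_MAX_PARTS)
		return 1;	/* can not add so many partitions */

	pr_info("slot %d: \"%s\" %llu sectors @ %llu%s\n", slot, subpart->name,
		(unsigned long long)subpart->size,
		(unsigned long long)subpart->from,
		(subpart->flags & PF_RDONLY) ? " (ro)" : "");
	return 0;
}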

756
block/compat_ioctl.c Normal file

@ -0,0 +1,756 @@
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/blktrace_api.h>
#include <linux/cdrom.h>
#include <linux/compat.h>
#include <linux/elevator.h>
#include <linux/fd.h>
#include <linux/hdreg.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/types.h>
#include <linux/uaccess.h>
static int compat_put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)compat_ptr(arg));
}
static int compat_put_int(unsigned long arg, int val)
{
return put_user(val, (compat_int_t __user *)compat_ptr(arg));
}
static int compat_put_uint(unsigned long arg, unsigned int val)
{
return put_user(val, (compat_uint_t __user *)compat_ptr(arg));
}
static int compat_put_long(unsigned long arg, long val)
{
return put_user(val, (compat_long_t __user *)compat_ptr(arg));
}
static int compat_put_ulong(unsigned long arg, compat_ulong_t val)
{
return put_user(val, (compat_ulong_t __user *)compat_ptr(arg));
}
static int compat_put_u64(unsigned long arg, u64 val)
{
return put_user(val, (compat_u64 __user *)compat_ptr(arg));
}
struct compat_hd_geometry {
unsigned char heads;
unsigned char sectors;
unsigned short cylinders;
u32 start;
};
static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
struct compat_hd_geometry __user *ugeo)
{
struct hd_geometry geo;
int ret;
if (!ugeo)
return -EINVAL;
if (!disk->fops->getgeo)
return -ENOTTY;
memset(&geo, 0, sizeof(geo));
/*
* We need to set the startsect first, the driver may
* want to override it.
*/
geo.start = get_start_sect(bdev);
ret = disk->fops->getgeo(bdev, &geo);
if (ret)
return ret;
ret = copy_to_user(ugeo, &geo, 4);
ret |= put_user(geo.start, &ugeo->start);
if (ret)
ret = -EFAULT;
return ret;
}
static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
mm_segment_t old_fs = get_fs();
unsigned long kval;
unsigned int __user *uvp;
int error;
set_fs(KERNEL_DS);
error = __blkdev_driver_ioctl(bdev, mode,
cmd, (unsigned long)(&kval));
set_fs(old_fs);
if (error == 0) {
uvp = compat_ptr(arg);
if (put_user(kval, uvp))
error = -EFAULT;
}
return error;
}
struct compat_cdrom_read_audio {
union cdrom_addr addr;
u8 addr_format;
compat_int_t nframes;
compat_caddr_t buf;
};
struct compat_cdrom_generic_command {
unsigned char cmd[CDROM_PACKET_SIZE];
compat_caddr_t buffer;
compat_uint_t buflen;
compat_int_t stat;
compat_caddr_t sense;
unsigned char data_direction;
compat_int_t quiet;
compat_int_t timeout;
compat_caddr_t reserved[1];
};
static int compat_cdrom_read_audio(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cdrom_read_audio __user *cdread_audio;
struct compat_cdrom_read_audio __user *cdread_audio32;
__u32 data;
void __user *datap;
cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio));
cdread_audio32 = compat_ptr(arg);
if (copy_in_user(&cdread_audio->addr,
&cdread_audio32->addr,
(sizeof(*cdread_audio32) -
sizeof(compat_caddr_t))))
return -EFAULT;
if (get_user(data, &cdread_audio32->buf))
return -EFAULT;
datap = compat_ptr(data);
if (put_user(datap, &cdread_audio->buf))
return -EFAULT;
return __blkdev_driver_ioctl(bdev, mode, cmd,
(unsigned long)cdread_audio);
}
static int compat_cdrom_generic_command(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cdrom_generic_command __user *cgc;
struct compat_cdrom_generic_command __user *cgc32;
u32 data;
unsigned char dir;
int itmp;
cgc = compat_alloc_user_space(sizeof(*cgc));
cgc32 = compat_ptr(arg);
if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) ||
get_user(data, &cgc32->buffer) ||
put_user(compat_ptr(data), &cgc->buffer) ||
copy_in_user(&cgc->buflen, &cgc32->buflen,
(sizeof(unsigned int) + sizeof(int))) ||
get_user(data, &cgc32->sense) ||
put_user(compat_ptr(data), &cgc->sense) ||
get_user(dir, &cgc32->data_direction) ||
put_user(dir, &cgc->data_direction) ||
get_user(itmp, &cgc32->quiet) ||
put_user(itmp, &cgc->quiet) ||
get_user(itmp, &cgc32->timeout) ||
put_user(itmp, &cgc->timeout) ||
get_user(data, &cgc32->reserved[0]) ||
put_user(compat_ptr(data), &cgc->reserved[0]))
return -EFAULT;
return __blkdev_driver_ioctl(bdev, mode, cmd, (unsigned long)cgc);
}
struct compat_blkpg_ioctl_arg {
compat_int_t op;
compat_int_t flags;
compat_int_t datalen;
compat_caddr_t data;
};
static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, struct compat_blkpg_ioctl_arg __user *ua32)
{
struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
compat_caddr_t udata;
compat_int_t n;
int err;
err = get_user(n, &ua32->op);
err |= put_user(n, &a->op);
err |= get_user(n, &ua32->flags);
err |= put_user(n, &a->flags);
err |= get_user(n, &ua32->datalen);
err |= put_user(n, &a->datalen);
err |= get_user(udata, &ua32->data);
err |= put_user(compat_ptr(udata), &a->data);
if (err)
return err;
return blkdev_ioctl(bdev, mode, cmd, (unsigned long)a);
}
#define BLKBSZGET_32 _IOR(0x12, 112, int)
#define BLKBSZSET_32 _IOW(0x12, 113, int)
#define BLKGETSIZE64_32 _IOR(0x12, 114, int)
struct compat_floppy_drive_params {
char cmos;
compat_ulong_t max_dtr;
compat_ulong_t hlt;
compat_ulong_t hut;
compat_ulong_t srt;
compat_ulong_t spinup;
compat_ulong_t spindown;
unsigned char spindown_offset;
unsigned char select_delay;
unsigned char rps;
unsigned char tracks;
compat_ulong_t timeout;
unsigned char interleave_sect;
struct floppy_max_errors max_errors;
char flags;
char read_track;
short autodetect[8];
compat_int_t checkfreq;
compat_int_t native_format;
};
struct compat_floppy_drive_struct {
signed char flags;
compat_ulong_t spinup_date;
compat_ulong_t select_date;
compat_ulong_t first_read_date;
short probed_format;
short track;
short maxblock;
short maxtrack;
compat_int_t generation;
compat_int_t keep_data;
compat_int_t fd_ref;
compat_int_t fd_device;
compat_int_t last_checked;
compat_caddr_t dmabuf;
compat_int_t bufblocks;
};
struct compat_floppy_fdc_state {
compat_int_t spec1;
compat_int_t spec2;
compat_int_t dtr;
unsigned char version;
unsigned char dor;
compat_ulong_t address;
unsigned int rawcmd:2;
unsigned int reset:1;
unsigned int need_configure:1;
unsigned int perp_mode:2;
unsigned int has_fifo:1;
unsigned int driver_version;
unsigned char track[4];
};
struct compat_floppy_write_errors {
unsigned int write_errors;
compat_ulong_t first_error_sector;
compat_int_t first_error_generation;
compat_ulong_t last_error_sector;
compat_int_t last_error_generation;
compat_uint_t badness;
};
#define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct)
#define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct)
#define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params)
#define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params)
#define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)
#define FDPOLLDRVSTAT32 _IOR(2, 0x13, struct compat_floppy_drive_struct)
#define FDGETFDCSTAT32 _IOR(2, 0x15, struct compat_floppy_fdc_state)
#define FDWERRORGET32 _IOR(2, 0x17, struct compat_floppy_write_errors)
static struct {
unsigned int cmd32;
unsigned int cmd;
} fd_ioctl_trans_table[] = {
{ FDSETPRM32, FDSETPRM },
{ FDDEFPRM32, FDDEFPRM },
{ FDGETPRM32, FDGETPRM },
{ FDSETDRVPRM32, FDSETDRVPRM },
{ FDGETDRVPRM32, FDGETDRVPRM },
{ FDGETDRVSTAT32, FDGETDRVSTAT },
{ FDPOLLDRVSTAT32, FDPOLLDRVSTAT },
{ FDGETFDCSTAT32, FDGETFDCSTAT },
{ FDWERRORGET32, FDWERRORGET }
};
#define NR_FD_IOCTL_TRANS ARRAY_SIZE(fd_ioctl_trans_table)
static int compat_fd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
mm_segment_t old_fs = get_fs();
void *karg = NULL;
unsigned int kcmd = 0;
int i, err;
for (i = 0; i < NR_FD_IOCTL_TRANS; i++)
if (cmd == fd_ioctl_trans_table[i].cmd32) {
kcmd = fd_ioctl_trans_table[i].cmd;
break;
}
if (!kcmd)
return -EINVAL;
switch (cmd) {
case FDSETPRM32:
case FDDEFPRM32:
case FDGETPRM32:
{
compat_uptr_t name;
struct compat_floppy_struct __user *uf;
struct floppy_struct *f;
uf = compat_ptr(arg);
f = karg = kmalloc(sizeof(struct floppy_struct), GFP_KERNEL);
if (!karg)
return -ENOMEM;
if (cmd == FDGETPRM32)
break;
err = __get_user(f->size, &uf->size);
err |= __get_user(f->sect, &uf->sect);
err |= __get_user(f->head, &uf->head);
err |= __get_user(f->track, &uf->track);
err |= __get_user(f->stretch, &uf->stretch);
err |= __get_user(f->gap, &uf->gap);
err |= __get_user(f->rate, &uf->rate);
err |= __get_user(f->spec1, &uf->spec1);
err |= __get_user(f->fmt_gap, &uf->fmt_gap);
err |= __get_user(name, &uf->name);
f->name = compat_ptr(name);
if (err) {
err = -EFAULT;
goto out;
}
break;
}
case FDSETDRVPRM32:
case FDGETDRVPRM32:
{
struct compat_floppy_drive_params __user *uf;
struct floppy_drive_params *f;
uf = compat_ptr(arg);
f = karg = kmalloc(sizeof(struct floppy_drive_params), GFP_KERNEL);
if (!karg)
return -ENOMEM;
if (cmd == FDGETDRVPRM32)
break;
err = __get_user(f->cmos, &uf->cmos);
err |= __get_user(f->max_dtr, &uf->max_dtr);
err |= __get_user(f->hlt, &uf->hlt);
err |= __get_user(f->hut, &uf->hut);
err |= __get_user(f->srt, &uf->srt);
err |= __get_user(f->spinup, &uf->spinup);
err |= __get_user(f->spindown, &uf->spindown);
err |= __get_user(f->spindown_offset, &uf->spindown_offset);
err |= __get_user(f->select_delay, &uf->select_delay);
err |= __get_user(f->rps, &uf->rps);
err |= __get_user(f->tracks, &uf->tracks);
err |= __get_user(f->timeout, &uf->timeout);
err |= __get_user(f->interleave_sect, &uf->interleave_sect);
err |= __copy_from_user(&f->max_errors, &uf->max_errors, sizeof(f->max_errors));
err |= __get_user(f->flags, &uf->flags);
err |= __get_user(f->read_track, &uf->read_track);
err |= __copy_from_user(f->autodetect, uf->autodetect, sizeof(f->autodetect));
err |= __get_user(f->checkfreq, &uf->checkfreq);
err |= __get_user(f->native_format, &uf->native_format);
if (err) {
err = -EFAULT;
goto out;
}
break;
}
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
karg = kmalloc(sizeof(struct floppy_drive_struct), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
case FDGETFDCSTAT32:
karg = kmalloc(sizeof(struct floppy_fdc_state), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
case FDWERRORGET32:
karg = kmalloc(sizeof(struct floppy_write_errors), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
default:
return -EINVAL;
}
set_fs(KERNEL_DS);
err = __blkdev_driver_ioctl(bdev, mode, kcmd, (unsigned long)karg);
set_fs(old_fs);
if (err)
goto out;
switch (cmd) {
case FDGETPRM32:
{
struct floppy_struct *f = karg;
struct compat_floppy_struct __user *uf = compat_ptr(arg);
err = __put_user(f->size, &uf->size);
err |= __put_user(f->sect, &uf->sect);
err |= __put_user(f->head, &uf->head);
err |= __put_user(f->track, &uf->track);
err |= __put_user(f->stretch, &uf->stretch);
err |= __put_user(f->gap, &uf->gap);
err |= __put_user(f->rate, &uf->rate);
err |= __put_user(f->spec1, &uf->spec1);
err |= __put_user(f->fmt_gap, &uf->fmt_gap);
err |= __put_user((u64)f->name, (compat_caddr_t __user *)&uf->name);
break;
}
case FDGETDRVPRM32:
{
struct compat_floppy_drive_params __user *uf;
struct floppy_drive_params *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->cmos, &uf->cmos);
err |= __put_user(f->max_dtr, &uf->max_dtr);
err |= __put_user(f->hlt, &uf->hlt);
err |= __put_user(f->hut, &uf->hut);
err |= __put_user(f->srt, &uf->srt);
err |= __put_user(f->spinup, &uf->spinup);
err |= __put_user(f->spindown, &uf->spindown);
err |= __put_user(f->spindown_offset, &uf->spindown_offset);
err |= __put_user(f->select_delay, &uf->select_delay);
err |= __put_user(f->rps, &uf->rps);
err |= __put_user(f->tracks, &uf->tracks);
err |= __put_user(f->timeout, &uf->timeout);
err |= __put_user(f->interleave_sect, &uf->interleave_sect);
err |= __copy_to_user(&uf->max_errors, &f->max_errors, sizeof(f->max_errors));
err |= __put_user(f->flags, &uf->flags);
err |= __put_user(f->read_track, &uf->read_track);
err |= __copy_to_user(uf->autodetect, f->autodetect, sizeof(f->autodetect));
err |= __put_user(f->checkfreq, &uf->checkfreq);
err |= __put_user(f->native_format, &uf->native_format);
break;
}
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
{
struct compat_floppy_drive_struct __user *uf;
struct floppy_drive_struct *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->flags, &uf->flags);
err |= __put_user(f->spinup_date, &uf->spinup_date);
err |= __put_user(f->select_date, &uf->select_date);
err |= __put_user(f->first_read_date, &uf->first_read_date);
err |= __put_user(f->probed_format, &uf->probed_format);
err |= __put_user(f->track, &uf->track);
err |= __put_user(f->maxblock, &uf->maxblock);
err |= __put_user(f->maxtrack, &uf->maxtrack);
err |= __put_user(f->generation, &uf->generation);
err |= __put_user(f->keep_data, &uf->keep_data);
err |= __put_user(f->fd_ref, &uf->fd_ref);
err |= __put_user(f->fd_device, &uf->fd_device);
err |= __put_user(f->last_checked, &uf->last_checked);
err |= __put_user((u64)f->dmabuf, &uf->dmabuf);
err |= __put_user((u64)f->bufblocks, &uf->bufblocks);
break;
}
case FDGETFDCSTAT32:
{
struct compat_floppy_fdc_state __user *uf;
struct floppy_fdc_state *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->spec1, &uf->spec1);
err |= __put_user(f->spec2, &uf->spec2);
err |= __put_user(f->dtr, &uf->dtr);
err |= __put_user(f->version, &uf->version);
err |= __put_user(f->dor, &uf->dor);
err |= __put_user(f->address, &uf->address);
err |= __copy_to_user((char __user *)&uf->address + sizeof(uf->address),
(char *)&f->address + sizeof(f->address), sizeof(int));
err |= __put_user(f->driver_version, &uf->driver_version);
err |= __copy_to_user(uf->track, f->track, sizeof(f->track));
break;
}
case FDWERRORGET32:
{
struct compat_floppy_write_errors __user *uf;
struct floppy_write_errors *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->write_errors, &uf->write_errors);
err |= __put_user(f->first_error_sector, &uf->first_error_sector);
err |= __put_user(f->first_error_generation, &uf->first_error_generation);
err |= __put_user(f->last_error_sector, &uf->last_error_sector);
err |= __put_user(f->last_error_generation, &uf->last_error_generation);
err |= __put_user(f->badness, &uf->badness);
break;
}
default:
break;
}
if (err)
err = -EFAULT;
out:
kfree(karg);
return err;
}
static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
switch (cmd) {
case HDIO_GET_UNMASKINTR:
case HDIO_GET_MULTCOUNT:
case HDIO_GET_KEEPSETTINGS:
case HDIO_GET_32BIT:
case HDIO_GET_NOWERR:
case HDIO_GET_DMA:
case HDIO_GET_NICE:
case HDIO_GET_WCACHE:
case HDIO_GET_ACOUSTIC:
case HDIO_GET_ADDRESS:
case HDIO_GET_BUSSTATE:
return compat_hdio_ioctl(bdev, mode, cmd, arg);
case FDSETPRM32:
case FDDEFPRM32:
case FDGETPRM32:
case FDSETDRVPRM32:
case FDGETDRVPRM32:
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
case FDGETFDCSTAT32:
case FDWERRORGET32:
return compat_fd_ioctl(bdev, mode, cmd, arg);
case CDROMREADAUDIO:
return compat_cdrom_read_audio(bdev, mode, cmd, arg);
case CDROM_SEND_PACKET:
return compat_cdrom_generic_command(bdev, mode, cmd, arg);
/*
* No handler required for the ones below, we just need to
* convert arg to a 64 bit pointer.
*/
case BLKSECTSET:
/*
* 0x03 -- HD/IDE ioctls used by hdparm and friends.
* Some need translations, these do not.
*/
case HDIO_GET_IDENTITY:
case HDIO_DRIVE_TASK:
case HDIO_DRIVE_CMD:
/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
case 0x330:
/* 0x02 -- Floppy ioctls */
case FDMSGON:
case FDMSGOFF:
case FDSETEMSGTRESH:
case FDFLUSH:
case FDWERRORCLR:
case FDSETMAXERRS:
case FDGETMAXERRS:
case FDGETDRVTYP:
case FDEJECT:
case FDCLRPRM:
case FDFMTBEG:
case FDFMTEND:
case FDRESET:
case FDTWADDLE:
case FDFMTTRK:
case FDRAWCMD:
/* CDROM stuff */
case CDROMPAUSE:
case CDROMRESUME:
case CDROMPLAYMSF:
case CDROMPLAYTRKIND:
case CDROMREADTOCHDR:
case CDROMREADTOCENTRY:
case CDROMSTOP:
case CDROMSTART:
case CDROMEJECT:
case CDROMVOLCTRL:
case CDROMSUBCHNL:
case CDROMMULTISESSION:
case CDROM_GET_MCN:
case CDROMRESET:
case CDROMVOLREAD:
case CDROMSEEK:
case CDROMPLAYBLK:
case CDROMCLOSETRAY:
case CDROM_DISC_STATUS:
case CDROM_CHANGER_NSLOTS:
case CDROM_GET_CAPABILITY:
/* Ignore cdrom.h about these next 5 ioctls, they absolutely do
* not take a struct cdrom_read, instead they take a struct cdrom_msf
* which is compatible.
*/
case CDROMREADMODE2:
case CDROMREADMODE1:
case CDROMREADRAW:
case CDROMREADCOOKED:
case CDROMREADALL:
/* DVD ioctls */
case DVD_READ_STRUCT:
case DVD_WRITE_STRUCT:
case DVD_AUTH:
arg = (unsigned long)compat_ptr(arg);
/* These interpret arg as an unsigned long, not as a pointer,
* so we must not do compat_ptr() conversion. */
case HDIO_SET_MULTCOUNT:
case HDIO_SET_UNMASKINTR:
case HDIO_SET_KEEPSETTINGS:
case HDIO_SET_32BIT:
case HDIO_SET_NOWERR:
case HDIO_SET_DMA:
case HDIO_SET_PIO_MODE:
case HDIO_SET_NICE:
case HDIO_SET_WCACHE:
case HDIO_SET_ACOUSTIC:
case HDIO_SET_BUSSTATE:
case HDIO_SET_ADDRESS:
case CDROMEJECT_SW:
case CDROM_SET_OPTIONS:
case CDROM_CLEAR_OPTIONS:
case CDROM_SELECT_SPEED:
case CDROM_SELECT_DISC:
case CDROM_MEDIA_CHANGED:
case CDROM_DRIVE_STATUS:
case CDROM_LOCKDOOR:
case CDROM_DEBUG:
break;
default:
/* unknown ioctl number */
return -ENOIOCTLCMD;
}
return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
/* Most of the generic ioctls are handled in the normal fallback path.
This assumes the blkdev's low level compat_ioctl always returns
ENOIOCTLCMD for unknown ioctls. */
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
int ret = -ENOIOCTLCMD;
struct inode *inode = file->f_mapping->host;
struct block_device *bdev = inode->i_bdev;
struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode;
struct backing_dev_info *bdi;
loff_t size;
unsigned int max_sectors;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
switch (cmd) {
case HDIO_GETGEO:
return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
case BLKPBSZGET:
return compat_put_uint(arg, bdev_physical_block_size(bdev));
case BLKIOMIN:
return compat_put_uint(arg, bdev_io_min(bdev));
case BLKIOOPT:
return compat_put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
case BLKSECDISCARD:
case BLKZEROOUT:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us
*/
case BLKRRPART:
return blkdev_ioctl(bdev, mode, cmd,
(unsigned long)compat_ptr(arg));
case BLKBSZSET_32:
return blkdev_ioctl(bdev, mode, BLKBSZSET,
(unsigned long)compat_ptr(arg));
case BLKPG:
return compat_blkpg_ioctl(bdev, mode, cmd, compat_ptr(arg));
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
return compat_put_long(arg,
(bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET: /* compatible */
return compat_put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
return compat_put_int(arg, block_size(bdev));
case BLKSSZGET: /* get block device hardware sector size */
return compat_put_int(arg, bdev_logical_block_size(bdev));
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return compat_put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return compat_put_ushort(arg,
!blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET: /* compatible, but no compat_ptr (!) */
case BLKFRASET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
return -EFBIG;
return compat_put_ulong(arg, size >> 9);
case BLKGETSIZE64_32:
return compat_put_u64(arg, i_size_read(bdev->bd_inode));
case BLKTRACESETUP32:
case BLKTRACESTART: /* compatible */
case BLKTRACESTOP: /* compatible */
case BLKTRACETEARDOWN: /* compatible */
ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
return ret;
default:
if (disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
if (ret == -ENOIOCTLCMD)
ret = compat_blkdev_driver_ioctl(bdev, mode, cmd, arg);
return ret;
}
}
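/*
 * Illustrative only -- the fallback above relies on the driver's
 * ->compat_ioctl() method returning -ENOIOCTLCMD for commands it does not
 * recognise, so that compat_blkdev_driver_ioctl() can try the generic
 * translation tables.  A hedged sketch; EXAMPLE_DRV_IOCTL and the example_*
 * name are assumptions, not part of this file.
 */
#define EXAMPLE_DRV_IOCTL	_IO(0xEE, 0x01)	/* hypothetical command */

static int example_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case EXAMPLE_DRV_IOCTL:
		/* commands the driver really implements are handled here */
		return 0;
	default:
		/* unknown: let the generic compat path take over */
		return -ENOIOCTLCMD;
	}
}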

476
block/deadline-iosched.c Normal file

@ -0,0 +1,476 @@
/*
* Deadline i/o scheduler.
*
* Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
/*
* See Documentation/block/deadline-iosched.txt
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
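/*
 * Illustrative example with the defaults above: once a request is picked,
 * up to fifo_batch=16 sector-contiguous requests are dispatched back to
 * back; while writes are waiting, reads win at most writes_starved=2 batch
 * selections before a write batch is forced; and when a new batch is
 * started, a read older than read_expire (500 ms) or a write older than
 * write_expire (5 s) makes dispatch restart from that FIFO's head.
 */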
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2];
unsigned int batching; /* number of sequential requests made */
sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2];
int fifo_batch;
int writes_starved;
int front_merges;
};
static void deadline_move_request(struct deadline_data *, struct request *);
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
return &dd->sort_list[rq_data_dir(rq)];
}
/*
* get the request after `rq' in sector-sorted order
*/
static inline struct request *
deadline_latter_request(struct request *rq)
{
struct rb_node *node = rb_next(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
struct rb_root *root = deadline_rb_root(dd, rq);
elv_rb_add(root, rq);
}
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
const int data_dir = rq_data_dir(rq);
if (dd->next_rq[data_dir] == rq)
dd->next_rq[data_dir] = deadline_latter_request(rq);
elv_rb_del(deadline_rb_root(dd, rq), rq);
}
/*
* add rq to rbtree and fifo
*/
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
deadline_add_rq_rb(dd, rq);
/*
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}
/*
* remove rq from rbtree and fifo.
*/
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
rq_fifo_clear(rq);
deadline_del_rq_rb(dd, rq);
}
static int
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
struct request *__rq;
int ret;
/*
* check for front merge
*/
if (dd->front_merges) {
sector_t sector = bio_end_sector(bio);
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
if (__rq) {
BUG_ON(sector != blk_rq_pos(__rq));
if (elv_rq_merge_ok(__rq, bio)) {
ret = ELEVATOR_FRONT_MERGE;
goto out;
}
}
}
return ELEVATOR_NO_MERGE;
out:
*req = __rq;
return ret;
}
static void deadline_merged_request(struct request_queue *q,
struct request *req, int type)
{
struct deadline_data *dd = q->elevator->elevator_data;
/*
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
elv_rb_del(deadline_rb_root(dd, req), req);
deadline_add_rq_rb(dd, req);
}
}
static void
deadline_merged_requests(struct request_queue *q, struct request *req,
struct request *next)
{
/*
* if next expires before rq, assign its expire time to rq
* and move into next position (next will be deleted) in fifo
*/
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
if (time_before(next->fifo_time, req->fifo_time)) {
list_move(&req->queuelist, &next->queuelist);
req->fifo_time = next->fifo_time;
}
}
/*
* kill knowledge of next, this one is a goner
*/
deadline_remove_request(q, next);
}
/*
* move request from sort list to dispatch queue.
*/
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
struct request_queue *q = rq->q;
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
}
/*
* move an entry to dispatch queue
*/
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
const int data_dir = rq_data_dir(rq);
dd->next_rq[READ] = NULL;
dd->next_rq[WRITE] = NULL;
dd->next_rq[data_dir] = deadline_latter_request(rq);
dd->last_sector = rq_end_sector(rq);
/*
* take it off the sort and fifo list, move
* to dispatch queue
*/
deadline_move_to_dispatch(dd, rq);
}
/*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
*/
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
/*
* rq is expired!
*/
if (time_after_eq(jiffies, rq->fifo_time))
return 1;
return 0;
}
/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
*/
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request and are still entitled to batch */
goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0;
data_dir = WRITE;
goto dispatch_find_request;
}
return 0;
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
}
dd->batching = 0;
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++;
deadline_move_request(dd, rq);
return 1;
}
static void deadline_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
kfree(dd);
}
/*
* initialize elevator private data (deadline_data).
*/
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
dd->sort_list[WRITE] = RB_ROOT;
dd->fifo_expire[READ] = read_expire;
dd->fifo_expire[WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
/*
* sysfs parts below
*/
static ssize_t
deadline_var_show(int var, char *page)
{
return sprintf(page, "%d\n", var);
}
static ssize_t
deadline_var_store(int *var, const char *page, size_t count)
{
char *p = (char *) page;
*var = simple_strtol(p, &p, 10);
return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data = __VAR; \
if (__CONV) \
__data = jiffies_to_msecs(__data); \
return deadline_var_show(__data, (page)); \
}
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data; \
int ret = deadline_var_store(&__data, (page), count); \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX)) \
__data = (MAX); \
if (__CONV) \
*(__PTR) = msecs_to_jiffies(__data); \
else \
*(__PTR) = __data; \
return ret; \
}
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION
#define DD_ATTR(name) \
__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
deadline_##name##_store)
static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
DD_ATTR(front_merges),
DD_ATTR(fifo_batch),
__ATTR_NULL
};
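/*
 * Illustrative only: with this scheduler active, the attributes above show
 * up under /sys/block/<disk>/queue/iosched/, so for example
 *
 *   echo 200 > /sys/block/sda/queue/iosched/read_expire
 *
 * (sda being just an example device) sets the read deadline to 200 ms; the
 * STORE_FUNCTION wrappers above convert the value with msecs_to_jiffies().
 */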
static struct elevator_type iosched_deadline = {
.ops = {
.elevator_merge_fn = deadline_merge,
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_add_req_fn = deadline_add_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
.elevator_init_fn = deadline_init_queue,
.elevator_exit_fn = deadline_exit_queue,
},
.elevator_attrs = deadline_attrs,
.elevator_name = "deadline",
.elevator_owner = THIS_MODULE,
};
static int __init deadline_init(void)
{
return elv_register(&iosched_deadline);
}
static void __exit deadline_exit(void)
{
elv_unregister(&iosched_deadline);
}
module_init(deadline_init);
module_exit(deadline_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deadline IO scheduler");

1051
block/elevator.c Normal file

File diff suppressed because it is too large

1888
block/genhd.c Normal file

File diff suppressed because it is too large

432
block/ioctl.c Normal file

@ -0,0 +1,432 @@
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
{
struct block_device *bdevp;
struct gendisk *disk;
struct hd_struct *part, *lpart;
struct blkpg_ioctl_arg a;
struct blkpg_partition p;
struct disk_part_iter piter;
long long start, length;
int partno;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg)))
return -EFAULT;
if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition)))
return -EFAULT;
disk = bdev->bd_disk;
if (bdev != bdev->bd_contains)
return -EINVAL;
partno = p.pno;
if (partno <= 0)
return -EINVAL;
switch (a.op) {
case BLKPG_ADD_PARTITION:
start = p.start >> 9;
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0 || partno > 65535)
return -EINVAL;
}
mutex_lock(&bdev->bd_mutex);
/* overlap? */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter))) {
if (!(start + length <= part->start_sect ||
start >= part->start_sect + part->nr_sects)) {
disk_part_iter_exit(&piter);
mutex_unlock(&bdev->bd_mutex);
return -EBUSY;
}
}
disk_part_iter_exit(&piter);
/* all seems OK */
part = add_partition(disk, partno, start, length,
ADDPART_FLAG_NONE, NULL);
mutex_unlock(&bdev->bd_mutex);
return PTR_ERR_OR_ZERO(part);
case BLKPG_DEL_PARTITION:
part = disk_get_part(disk, partno);
if (!part)
return -ENXIO;
bdevp = bdget(part_devt(part));
disk_put_part(part);
if (!bdevp)
return -ENOMEM;
mutex_lock(&bdevp->bd_mutex);
if (bdevp->bd_openers) {
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return -EBUSY;
}
/* all seems OK */
fsync_bdev(bdevp);
invalidate_bdev(bdevp);
mutex_lock_nested(&bdev->bd_mutex, 1);
delete_partition(disk, partno);
mutex_unlock(&bdev->bd_mutex);
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return 0;
case BLKPG_RESIZE_PARTITION:
start = p.start >> 9;
/* new length of partition in bytes */
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0)
return -EINVAL;
}
part = disk_get_part(disk, partno);
if (!part)
return -ENXIO;
bdevp = bdget(part_devt(part));
if (!bdevp) {
disk_put_part(part);
return -ENOMEM;
}
mutex_lock(&bdevp->bd_mutex);
mutex_lock_nested(&bdev->bd_mutex, 1);
if (start != part->start_sect) {
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EINVAL;
}
/* overlap? */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY);
while ((lpart = disk_part_iter_next(&piter))) {
if (lpart->partno != partno &&
!(start + length <= lpart->start_sect ||
start >= lpart->start_sect + lpart->nr_sects)
) {
disk_part_iter_exit(&piter);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EBUSY;
}
}
disk_part_iter_exit(&piter);
part_nr_sects_write(part, (sector_t)length);
i_size_write(bdevp->bd_inode, p.length);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return 0;
default:
return -EINVAL;
}
}
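/*
 * Illustrative only -- a hedged *userspace* sketch of driving the BLKPG
 * handler above.  Note that blkpg_partition.start/.length are byte offsets
 * (the code above shifts them down to sectors); the example_* name and the
 * sizes passed in by the caller are assumptions, not part of this file.
 */
#if 0	/* userspace illustration, not kernel code */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

static int example_add_partition(const char *dev, int pno,
				 long long start, long long length)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	int fd, ret;

	memset(&part, 0, sizeof(part));
	part.pno = pno;
	part.start = start;		/* bytes */
	part.length = length;		/* bytes */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_ADD_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return -1;
	ret = ioctl(fd, BLKPG, &arg);
	close(fd);
	return ret;
}
#endif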
static int blkdev_reread_part(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
int res;
if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!mutex_trylock(&bdev->bd_mutex))
return -EBUSY;
res = rescan_partitions(disk, bdev);
mutex_unlock(&bdev->bd_mutex);
return res;
}
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
uint64_t len, int secure)
{
unsigned long flags = 0;
if (start & 511)
return -EINVAL;
if (len & 511)
return -EINVAL;
start >>= 9;
len >>= 9;
if (start + len > (i_size_read(bdev->bd_inode) >> 9))
return -EINVAL;
if (secure)
flags |= BLKDEV_DISCARD_SECURE;
printk("%s %d:%d %llu %llu", secure?"SECDIS":"DIS", MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev),
(unsigned long long)start, (unsigned long long)len);
return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
uint64_t len)
{
if (start & 511)
return -EINVAL;
if (len & 511)
return -EINVAL;
start >>= 9;
len >>= 9;
if (start + len > (i_size_read(bdev->bd_inode) >> 9))
return -EINVAL;
return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
}
static int put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)arg);
}
static int put_int(unsigned long arg, int val)
{
return put_user(val, (int __user *)arg);
}
static int put_uint(unsigned long arg, unsigned int val)
{
return put_user(val, (unsigned int __user *)arg);
}
static int put_long(unsigned long arg, long val)
{
return put_user(val, (long __user *)arg);
}
static int put_ulong(unsigned long arg, unsigned long val)
{
return put_user(val, (unsigned long __user *)arg);
}
static int put_u64(unsigned long arg, u64 val)
{
return put_user(val, (u64 __user *)arg);
}
int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
if (disk->fops->ioctl)
return disk->fops->ioctl(bdev, mode, cmd, arg);
return -ENOTTY;
}
/*
* For the record: _GPL here is only because somebody decided to slap it
* on the previous export. Sheer idiocy, since it wasn't copyrightable
* at all and could be open-coded without any exports by anybody who cares.
*/
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
/*
* Is it an unrecognized ioctl? The correct returns are either
* ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
* fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
* code before returning.
*
* Confused drivers sometimes return EINVAL, which is wrong. It
* means "I understood the ioctl command, but the parameters to
* it were wrong".
*
* We should aim to just fix the broken drivers, the EINVAL case
* should go away.
*/
static inline int is_unrecognized_ioctl(int ret)
{
return ret == -EINVAL ||
ret == -ENOTTY ||
ret == -ENOIOCTLCMD;
}
/*
* always keep this in sync with compat_blkdev_ioctl()
*/
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
struct backing_dev_info *bdi;
loff_t size;
int ret, n;
unsigned int max_sectors;
switch(cmd) {
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
if (!is_unrecognized_ioctl(ret))
return ret;
fsync_bdev(bdev);
invalidate_bdev(bdev);
return 0;
case BLKROSET:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
if (!is_unrecognized_ioctl(ret))
return ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (get_user(n, (int __user *)(arg)))
return -EFAULT;
set_device_ro(bdev, n);
return 0;
case BLKDISCARD:
case BLKSECDISCARD: {
uint64_t range[2];
if (!(mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
return blk_ioctl_discard(bdev, range[0], range[1],
cmd == BLKSECDISCARD);
}
case BLKZEROOUT: {
uint64_t range[2];
if (!(mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
return blk_ioctl_zeroout(bdev, range[0], range[1]);
}
case HDIO_GETGEO: {
struct hd_geometry geo;
if (!arg)
return -EINVAL;
if (!disk->fops->getgeo)
return -ENOTTY;
/*
* We need to set the startsect first, the driver may
* want to override it.
*/
memset(&geo, 0, sizeof(geo));
geo.start = get_start_sect(bdev);
ret = disk->fops->getgeo(bdev, &geo);
if (ret)
return ret;
if (copy_to_user((struct hd_geometry __user *)arg, &geo,
sizeof(geo)))
return -EFAULT;
return 0;
}
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
return put_int(arg, block_size(bdev));
case BLKSSZGET: /* get block device logical block size */
return put_int(arg, bdev_logical_block_size(bdev));
case BLKPBSZGET: /* get block device physical block size */
return put_uint(arg, bdev_physical_block_size(bdev));
case BLKIOMIN:
return put_uint(arg, bdev_io_min(bdev));
case BLKIOOPT:
return put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKBSZSET:
/* set the logical block size */
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!arg)
return -EINVAL;
if (get_user(n, (int __user *) arg))
return -EFAULT;
if (!(mode & FMODE_EXCL)) {
bdgrab(bdev);
if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
return -EBUSY;
}
ret = set_blocksize(bdev, n);
if (!(mode & FMODE_EXCL))
blkdev_put(bdev, mode | FMODE_EXCL);
return ret;
case BLKPG:
ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
break;
case BLKRRPART:
ret = blkdev_reread_part(bdev);
break;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
return -EFBIG;
return put_ulong(arg, size >> 9);
case BLKGETSIZE64:
return put_u64(arg, i_size_read(bdev->bd_inode));
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
break;
default:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_ioctl);

243
block/ioprio.c Normal file

@ -0,0 +1,243 @@
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimic getpriority/setpriority; see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting the BE scheduling class with prio 2 is done as follows:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.txt
*
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret;
switch (class) {
case IOPRIO_CLASS_RT:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* fall through, rt has prio field too */
case IOPRIO_CLASS_BE:
if (data >= IOPRIO_BE_NR || data < 0)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
if (data)
return -EINVAL;
break;
default:
return -EINVAL;
}
ret = -ESRCH;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!uid_valid(uid))
break;
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
if (!uid_eq(task_uid(p), uid))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
} while_each_thread(g, p);
free_uid:
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}
static int get_task_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
if (p->io_context)
ret = p->io_context->ioprio;
out:
return ret;
}
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
unsigned short aclass;
unsigned short bclass;
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
aclass = IOPRIO_PRIO_CLASS(aprio);
bclass = IOPRIO_PRIO_CLASS(bprio);
if (aclass == bclass)
return min(aprio, bprio);
if (aclass > bclass)
return bprio;
else
return aprio;
}
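/*
 * Worked example (illustrative): ioprio_best(RT|3, BE|1) returns RT|3,
 * because IOPRIO_CLASS_RT (1) outranks IOPRIO_CLASS_BE (2); within the same
 * class the numerically smaller value, i.e. the higher priority, wins.
 */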
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret = -ESRCH;
int tmpio;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = get_task_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
if (!uid_eq(task_uid(p), user->uid))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_thread(g, p);
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}

124
block/noop-iosched.c Normal file

@ -0,0 +1,124 @@
/*
* elevator noop
*/
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
struct noop_data {
struct list_head queue;
};
static void noop_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
list_del_init(&next->queuelist);
}
static int noop_dispatch(struct request_queue *q, int force)
{
struct noop_data *nd = q->elevator->elevator_data;
if (!list_empty(&nd->queue)) {
struct request *rq;
rq = list_entry(nd->queue.next, struct request, queuelist);
list_del_init(&rq->queuelist);
elv_dispatch_sort(q, rq);
return 1;
}
return 0;
}
static void noop_add_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
list_add_tail(&rq->queuelist, &nd->queue);
}
static struct request *
noop_former_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
if (rq->queuelist.prev == &nd->queue)
return NULL;
return list_entry(rq->queuelist.prev, struct request, queuelist);
}
static struct request *
noop_latter_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
if (rq->queuelist.next == &nd->queue)
return NULL;
return list_entry(rq->queuelist.next, struct request, queuelist);
}
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
static void noop_exit_queue(struct elevator_queue *e)
{
struct noop_data *nd = e->elevator_data;
BUG_ON(!list_empty(&nd->queue));
kfree(nd);
}
static struct elevator_type elevator_noop = {
.ops = {
.elevator_merge_req_fn = noop_merged_requests,
.elevator_dispatch_fn = noop_dispatch,
.elevator_add_req_fn = noop_add_request,
.elevator_former_req_fn = noop_former_request,
.elevator_latter_req_fn = noop_latter_request,
.elevator_init_fn = noop_init_queue,
.elevator_exit_fn = noop_exit_queue,
},
.elevator_name = "noop",
.elevator_owner = THIS_MODULE,
};
static int __init noop_init(void)
{
return elv_register(&elevator_noop);
}
static void __exit noop_exit(void)
{
elv_unregister(&elevator_noop);
}
module_init(noop_init);
module_exit(noop_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op IO scheduler");

582
block/partition-generic.c Normal file

@ -0,0 +1,582 @@
/*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*
* We now have independent partition support from the
* block drivers, which allows all the partition code to
* be grouped in one location and to be mostly self-contained.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <linux/blktrace_api.h>
#include "partitions/check.h"
#ifdef CONFIG_BLK_DEV_MD
extern void md_autodetect_dev(dev_t dev);
#endif
/*
* disk_name() is used by partition check code and the genhd driver.
* It formats the devicename of the indicated disk into
* the supplied buffer (of size at least 32), and returns
* a pointer to that same buffer (for convenience).
*/
char *disk_name(struct gendisk *hd, int partno, char *buf)
{
if (!partno)
snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
else
snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
return buf;
}
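/*
 * Illustrative only: for a disk named "sda" and partno 3, disk_name() above
 * yields "sda3"; for a name ending in a digit, such as "mmcblk0", it yields
 * "mmcblk0p3".
 */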
const char *bdevname(struct block_device *bdev, char *buf)
{
return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
}
EXPORT_SYMBOL(bdevname);
/*
* There's very little reason to use this, you should really
* have a struct block_device just about everywhere and use
* bdevname() instead.
*/
const char *__bdevname(dev_t dev, char *buffer)
{
scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
MAJOR(dev), MINOR(dev));
return buffer;
}
EXPORT_SYMBOL(__bdevname);
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->partno);
}
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
}
ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
}
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->policy ? 1 : 0);
}
static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%u\n", p->discard_alignment);
}
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
int cpu;
cpu = part_stat_lock();
part_round_stats(cpu, p);
part_stat_unlock();
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
"%8u %8u %8u"
"\n",
part_stat_read(p, ios[READ]),
part_stat_read(p, merges[READ]),
(unsigned long long)part_stat_read(p, sectors[READ]),
jiffies_to_msecs(part_stat_read(p, ticks[READ])),
part_stat_read(p, ios[WRITE]),
part_stat_read(p, merges[WRITE]),
(unsigned long long)part_stat_read(p, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
}
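/*
 * The eleven fields above are, in order: read I/Os, read merges,
 * sectors read, milliseconds spent reading, write I/Os, write merges,
 * sectors written, milliseconds spent writing, I/Os currently in
 * flight, time spent doing I/Os (ms) and weighted time spent doing
 * I/Os (ms) -- the same layout as the per-disk 'stat' attribute.
 */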
ssize_t part_inflight_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
atomic_read(&p->in_flight[1]));
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->make_it_fail);
}
ssize_t part_fail_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct hd_struct *p = dev_to_part(dev);
int i;
if (count > 0 && sscanf(buf, "%d", &i) > 0)
p->make_it_fail = (i == 0) ? 0 : 1;
return count;
}
#endif
static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
#endif
static struct attribute *part_attrs[] = {
&dev_attr_partition.attr,
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_ro.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
NULL
};
static struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
static const struct attribute_group *part_attr_groups[] = {
&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
&blk_trace_attr_group,
#endif
NULL
};
static void part_release(struct device *dev)
{
struct hd_struct *p = dev_to_part(dev);
blk_free_devt(dev->devt);
free_part_stats(p);
free_part_info(p);
kfree(p);
}
static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
{
struct hd_struct *part = dev_to_part(dev);
add_uevent_var(env, "PARTN=%u", part->partno);
if (part->info && part->info->volname[0])
add_uevent_var(env, "PARTNAME=%s", part->info->volname);
return 0;
}
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
.uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
{
struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
part->start_sect = 0;
part->nr_sects = 0;
part_stat_set_all(part, 0);
put_device(part_to_dev(part));
}
void __delete_partition(struct hd_struct *part)
{
call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}
void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
struct hd_struct *part;
if (partno >= ptbl->len)
return;
part = ptbl->part[partno];
if (!part)
return;
rcu_assign_pointer(ptbl->part[partno], NULL);
rcu_assign_pointer(ptbl->last_lookup, NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
hd_struct_put(part);
}
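/*
 * Note: __delete_partition() defers the final put_device() through
 * call_rcu() because lookups such as disk_get_part() walk
 * ptbl->part[] and ptbl->last_lookup under rcu_read_lock(); clearing
 * the pointers here is safe only because the backing hd_struct
 * outlives the grace period.
 */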
static ssize_t whole_disk_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return 0;
}
static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
whole_disk_show, NULL);
struct hd_struct *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
struct hd_struct *p;
dev_t devt = MKDEV(0, 0);
struct device *ddev = disk_to_dev(disk);
struct device *pdev;
struct disk_part_tbl *ptbl;
const char *dname;
int err;
err = disk_expand_part_tbl(disk, partno);
if (err)
return ERR_PTR(err);
ptbl = disk->part_tbl;
if (ptbl->part[partno])
return ERR_PTR(-EBUSY);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-EBUSY);
if (!init_part_stats(p)) {
err = -ENOMEM;
goto out_free;
}
seqcount_init(&p->nr_sects_seq);
pdev = part_to_dev(p);
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
p->discard_alignment =
queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
if (info) {
struct partition_meta_info *pinfo = alloc_part_info(disk);
if (!pinfo)
goto out_free_stats;
memcpy(pinfo, info, sizeof(*info));
p->info = pinfo;
}
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
dev_set_name(pdev, "%sp%d", dname, partno);
else
dev_set_name(pdev, "%s%d", dname, partno);
device_initialize(pdev);
pdev->class = &block_class;
pdev->type = &part_type;
pdev->parent = ddev;
err = blk_alloc_devt(p, &devt);
if (err)
goto out_free_info;
pdev->devt = devt;
/* delay uevent until 'holders' subdir is created */
dev_set_uevent_suppress(pdev, 1);
err = device_add(pdev);
if (err)
goto out_put;
err = -ENOMEM;
p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
if (!p->holder_dir)
goto out_del;
dev_set_uevent_suppress(pdev, 0);
if (flags & ADDPART_FLAG_WHOLEDISK) {
err = device_create_file(pdev, &dev_attr_whole_disk);
if (err)
goto out_del;
}
/* everything is up and running, commence */
rcu_assign_pointer(ptbl->part[partno], p);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
hd_ref_init(p);
return p;
out_free_info:
free_part_info(p);
out_free_stats:
free_part_stats(p);
out_free:
kfree(p);
return ERR_PTR(err);
out_del:
kobject_put(p->holder_dir);
device_del(pdev);
out_put:
put_device(pdev);
blk_free_devt(devt);
return ERR_PTR(err);
}
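/*
 * Minimal caller sketch (illustrative only; rescan_partitions() below
 * is the real user).  Here 'disk', 'partno', 'from' and 'size' stand
 * for values taken from a parsed partition table, flags is 0 and no
 * partition_meta_info is passed:
 *
 *	struct hd_struct *part;
 *
 *	part = add_partition(disk, partno, from, size, 0, NULL);
 *	if (IS_ERR(part))
 *		pr_err("p%d could not be added: %ld\n",
 *		       partno, -PTR_ERR(part));
 */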
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
if (bdops->unlock_native_capacity &&
!(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
printk(KERN_CONT "enabling native capacity\n");
bdops->unlock_native_capacity(disk);
disk->flags |= GENHD_FL_NATIVE_CAPACITY;
return true;
} else {
printk(KERN_CONT "truncated\n");
return false;
}
}
static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct disk_part_iter piter;
struct hd_struct *part;
int res;
if (bdev->bd_part_count)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
return res;
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
delete_partition(disk, part->partno);
disk_part_iter_exit(&piter);
return 0;
}
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct parsed_partitions *state = NULL;
struct hd_struct *part;
int p, highest, res;
rescan:
if (state && !IS_ERR(state)) {
free_partitions(state);
state = NULL;
}
res = drop_partitions(disk, bdev);
if (res)
return res;
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
if (IS_ERR(state)) {
/*
* I/O error reading the partition table. If any
* partition code tried to read beyond EOD, retry
* after unlocking native capacity.
*/
if (PTR_ERR(state) == -ENOSPC) {
printk(KERN_WARNING "%s: partition table beyond EOD, ",
disk->disk_name);
if (disk_unlock_native_capacity(disk))
goto rescan;
}
return -EIO;
}
/*
* If any partition code tried to read beyond EOD, try
* unlocking native capacity even if partition table is
* successfully read as we could be missing some partitions.
*/
if (state->access_beyond_eod) {
printk(KERN_WARNING
"%s: partition table partially beyond EOD, ",
disk->disk_name);
if (disk_unlock_native_capacity(disk))
goto rescan;
}
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
/* Detect the highest partition number and preallocate
* disk->part_tbl. This is an optimization and not strictly
* necessary.
*/
for (p = 1, highest = 0; p < state->limit; p++)
if (state->parts[p].size)
highest = p;
disk_expand_part_tbl(disk, highest);
/* add partitions */
for (p = 1; p < state->limit; p++) {
sector_t size, from;
struct partition_meta_info *info = NULL;
size = state->parts[p].size;
if (!size)
continue;
from = state->parts[p].from;
if (from >= get_capacity(disk)) {
printk(KERN_WARNING
"%s: p%d start %llu is beyond EOD, ",
disk->disk_name, p, (unsigned long long) from);
if (disk_unlock_native_capacity(disk))
goto rescan;
continue;
}
if (from + size > get_capacity(disk)) {
printk(KERN_WARNING
"%s: p%d size %llu extends beyond EOD, ",
disk->disk_name, p, (unsigned long long) size);
if (disk_unlock_native_capacity(disk)) {
/* free state and restart */
goto rescan;
} else {
/*
* we can not ignore partitions of broken tables
* created by for example camera firmware, but
* we limit them to the end of the disk to avoid
* creating invalid block devices
*/
size = get_capacity(disk) - from;
}
}
if (state->parts[p].has_info)
info = &state->parts[p].info;
part = add_partition(disk, p, from, size,
state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part)) {
printk(KERN_ERR " %s: p%d could not be added: %ld\n",
disk->disk_name, p, -PTR_ERR(part));
continue;
}
#ifdef CONFIG_BLK_DEV_MD
if (state->parts[p].flags & ADDPART_FLAG_RAID)
md_autodetect_dev(part_to_dev(part)->devt);
#endif
}
free_partitions(state);
return 0;
}
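/*
 * Note: rescan_partitions() is normally reached from the block device
 * open path or from the BLKRRPART ioctl; in either case the
 * bd_part_count check in drop_partitions() above ensures no opener is
 * still holding any of the partitions being replaced.
 */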
int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
{
int res;
if (!bdev->bd_invalidated)
return 0;
res = drop_partitions(disk, bdev);
if (res)
return res;
set_capacity(disk, 0);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
return 0;
}
unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
struct page *page;
page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
NULL);
if (!IS_ERR(page)) {
if (PageError(page))
goto fail;
p->v = page;
return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
fail:
page_cache_release(page);
}
p->v = NULL;
return NULL;
}
EXPORT_SYMBOL(read_dev_sector);
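/*
 * Offset arithmetic, worked through (illustrative): with 4 KiB pages
 * (PAGE_CACHE_SHIFT == 12) each page caches 2^(12-9) = 8 sectors, so
 * sector n lives in page n >> 3 at byte offset (n & 7) << 9.  For
 * n == 10 that is page index 1, byte offset 1024.
 */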

269
block/partitions/Kconfig Normal file
View file

@ -0,0 +1,269 @@
#
# Partition configuration
#
config PARTITION_ADVANCED
bool "Advanced partition selection"
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under an operating system running on a different
architecture than your Linux system.
Note that the answer to this question won't directly affect the
kernel: saying N will just cause the configurator to skip all
the questions about foreign partitioning schemes.
If unsure, say N.
config ACORN_PARTITION
bool "Acorn partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
help
Support hard disks partitioned under Acorn operating systems.
config ACORN_PARTITION_CUMANA
bool "Cumana partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using the Cumana interface on Acorn machines.
config ACORN_PARTITION_EESOX
bool "EESOX partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
config ACORN_PARTITION_ICS
bool "ICS partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using the ICS interface on Acorn machines.
config ACORN_PARTITION_ADFS
bool "Native filecore partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
systems and the Acorn Archimedes range of machines. If you say
`Y' here, Linux will support disk partitions created under ADFS.
config ACORN_PARTITION_POWERTEC
bool "PowerTec partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Support reading partition tables created on Acorn machines using
the PowerTec SCSI drive.
config ACORN_PARTITION_RISCIX
bool "RISCiX partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Once upon a time, there was a native Unix port for the Acorn series
of machines called RISCiX. If you say 'Y' here, Linux will be able
to read disks partitioned under RISCiX.
config AIX_PARTITION
bool "AIX basic partition table support" if PARTITION_ADVANCED
help
Say Y here if you would like to be able to read the hard disk
partition table format used by IBM or Motorola PowerPC machines
running AIX. AIX actually uses a Logical Volume Manager, where
"logical volumes" can be spread across one or multiple disks,
but this driver works only for the simple case of partitions which
are contiguous.
Otherwise, say N.
config OSF_PARTITION
bool "Alpha OSF partition support" if PARTITION_ADVANCED
default y if ALPHA
help
Say Y here if you would like to use hard disks under Linux which
were partitioned on an Alpha machine.
config AMIGA_PARTITION
bool "Amiga partition table support" if PARTITION_ADVANCED
default y if (AMIGA || AFFS_FS=y)
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under AmigaOS.
config ATARI_PARTITION
bool "Atari partition table support" if PARTITION_ADVANCED
default y if ATARI
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under the Atari OS.
config IBM_PARTITION
bool "IBM disk label and partition support"
depends on PARTITION_ADVANCED && S390
help
Say Y here if you would like to be able to read the hard disk
partition table format used by IBM DASD disks operating under CMS.
Otherwise, say N.
config MAC_PARTITION
bool "Macintosh partition map support" if PARTITION_ADVANCED
default y if (MAC || PPC_PMAC)
help
Say Y here if you would like to use hard disks under Linux which
were partitioned on a Macintosh.
config MSDOS_PARTITION
bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
default y
help
Say Y here.
config BSD_DISKLABEL
bool "BSD disklabel (FreeBSD partition tables) support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
FreeBSD uses its own hard disk partition scheme on your PC. It
requires only one entry in the primary partition table of your disk
and manages it similarly to DOS extended partitions, putting in its
first sector a new partition table in BSD disklabel format. Saying Y
here allows you to read these disklabels and further mount FreeBSD
partitions from within Linux if you have also said Y to "UFS
file system support", above. If you don't know what all this is
about, say N.
config MINIX_SUBPARTITION
bool "Minix subpartition support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
Minix 2.0.0/2.0.2 subpartition table support for Linux.
Say Y here if you want to mount and use Minix 2.0.0/2.0.2
subpartitions.
config SOLARIS_X86_PARTITION
bool "Solaris (x86) partition table support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
Like most systems, Solaris x86 uses its own hard disk partition
table format, incompatible with all others. Saying Y here allows you
to read these partition tables and further mount Solaris x86
partitions from within Linux if you have also said Y to "UFS
file system support", above.
config UNIXWARE_DISKLABEL
bool "Unixware slices support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
---help---
Like some systems, UnixWare uses its own slice table inside a
partition (VTOC - Virtual Table of Contents). Its format is
incompatible with all other OSes. Saying Y here allows you to read
VTOC and further mount UnixWare partitions read-only from within
Linux if you have also said Y to "UFS file system support" or
"System V and Coherent file system support", above.
This is mainly used to carry data from a UnixWare box to your
Linux box via a removable medium like magneto-optical, ZIP or
removable IDE drives. Note, however, that a good portable way to
transport files and directories between unixes (and even other
operating systems) is given by the tar program ("man tar" or
preferably "info tar").
If you don't know what all this is about, say N.
config LDM_PARTITION
bool "Windows Logical Disk Manager (Dynamic Disk) support"
depends on PARTITION_ADVANCED
---help---
Say Y here if you would like to use hard disks under Linux which
were partitioned using Windows 2000's/XP's or Vista's Logical Disk
Manager. They are also known as "Dynamic Disks".
Note that this driver only supports Dynamic Disks with a protective MBR
label, i.e. a DOS partition table. It does not yet support GPT-labelled
Dynamic Disks such as those that Vista can create.
Windows 2000 introduced the concept of Dynamic Disks to get around
the limitations of the PC's partitioning scheme. The Logical Disk
Manager allows the user to repartition a disk and create spanned,
mirrored, striped or RAID volumes, all without the need for
rebooting.
Normal partitions are now called Basic Disks under Windows 2000, XP,
and Vista.
For a fuller description read <file:Documentation/ldm.txt>.
If unsure, say N.
config LDM_DEBUG
bool "Windows LDM extra logging"
depends on LDM_PARTITION
help
Say Y here if you would like LDM to log verbosely. This could be
helpful if the driver doesn't work as expected and you'd like to
report a bug.
If unsure, say N.
config SGI_PARTITION
bool "SGI partition support" if PARTITION_ADVANCED
default y if DEFAULT_SGI_PARTITION
help
Say Y here if you would like to be able to read the hard disk
partition table format used by SGI machines.
config ULTRIX_PARTITION
bool "Ultrix partition table support" if PARTITION_ADVANCED
default y if MACH_DECSTATION
help
Say Y here if you would like to be able to read the hard disk
partition table format used by DEC (now Compaq) Ultrix machines.
Otherwise, say N.
config SUN_PARTITION
bool "Sun partition tables support" if PARTITION_ADVANCED
default y if (SPARC || SUN3 || SUN3X)
---help---
Like most systems, SunOS uses its own hard disk partition table
format, incompatible with all others. Saying Y here allows you to
read these partition tables and further mount SunOS partitions from
within Linux if you have also said Y to "UFS file system support",
above. This is mainly used to carry data from a SPARC under SunOS to
your Linux box via a removable medium like magneto-optical or ZIP
drives; note however that a good portable way to transport files and
directories between unixes (and even other operating systems) is
given by the tar program ("man tar" or preferably "info tar"). If
you don't know what all this is about, say N.
config KARMA_PARTITION
bool "Karma Partition support"
depends on PARTITION_ADVANCED
help
Say Y here if you would like to mount the Rio Karma MP3 player, as it
uses a proprietary partition table.
config EFI_PARTITION
bool "EFI GUID Partition support" if PARTITION_ADVANCED
default y
select CRC32
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using EFI GPT.
config SYSV68_PARTITION
bool "SYSV68 partition table support" if PARTITION_ADVANCED
default y if VME
help
Say Y here if you would like to be able to read the hard disk
partition table format used by Motorola Delta machines (using
sysv68).
Otherwise, say N.
config CMDLINE_PARTITION
bool "Command line partition support" if PARTITION_ADVANCED
select BLK_CMDLINE_PARSER
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.

22
block/partitions/Makefile Normal file
View file

@ -0,0 +1,22 @@
#
# Makefile for the linux kernel.
#
obj-$(CONFIG_BLOCK) := check.o
obj-$(CONFIG_ACORN_PARTITION) += acorn.o
obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
obj-$(CONFIG_ATARI_PARTITION) += atari.o
obj-$(CONFIG_AIX_PARTITION) += aix.o
obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
obj-$(CONFIG_OSF_PARTITION) += osf.o
obj-$(CONFIG_SGI_PARTITION) += sgi.o
obj-$(CONFIG_SUN_PARTITION) += sun.o
obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
obj-$(CONFIG_IBM_PARTITION) += ibm.o
obj-$(CONFIG_EFI_PARTITION) += efi.o
obj-$(CONFIG_KARMA_PARTITION) += karma.o
obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
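# A new table format would be wired up the same way, e.g. (hypothetical):
#   obj-$(CONFIG_FOO_PARTITION) += foo.o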

556
block/partitions/acorn.c Normal file
View file

@ -0,0 +1,556 @@
/*
* linux/fs/partitions/acorn.c
*
* Copyright (c) 1996-2000 Russell King.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Scan ADFS partitions on hard disk drives. Unfortunately, there
* isn't a standard for partitioning drives on Acorn machines, so
* every single manufacturer of SCSI and IDE cards created their own
* method.
*/
#include <linux/buffer_head.h>
#include <linux/adfs_fs.h>
#include "check.h"
#include "acorn.h"
/*
* Partition types. (Oh for reusability)
*/
#define PARTITION_RISCIX_MFM 1
#define PARTITION_RISCIX_SCSI 2
#define PARTITION_LINUX 9
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static struct adfs_discrecord *
adfs_partition(struct parsed_partitions *state, char *name, char *data,
unsigned long first_sector, int slot)
{
struct adfs_discrecord *dr;
unsigned int nr_sects;
if (adfs_checkbblk(data))
return NULL;
dr = (struct adfs_discrecord *)(data + 0x1c0);
if (dr->disc_size == 0 && dr->disc_size_high == 0)
return NULL;
nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
(le32_to_cpu(dr->disc_size) >> 9);
if (name) {
strlcat(state->pp_buf, " [", PAGE_SIZE);
strlcat(state->pp_buf, name, PAGE_SIZE);
strlcat(state->pp_buf, "]", PAGE_SIZE);
}
put_partition(state, slot, first_sector, nr_sects);
return dr;
}
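/*
 * Size arithmetic (illustrative): the ADFS disc record holds the disc
 * size in bytes split across disc_size_high:disc_size, so
 * (high << 23) | (low >> 9) converts it to 512-byte sectors.  For
 * example, disc_size == 0x20000000 (512 MiB) with disc_size_high == 0
 * gives nr_sects == 1048576.
 */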
#endif
#ifdef CONFIG_ACORN_PARTITION_RISCIX
struct riscix_part {
__le32 start;
__le32 length;
__le32 one;
char name[16];
};
struct riscix_record {
__le32 magic;
#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
__le32 date;
struct riscix_part part[8];
};
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static int riscix_partition(struct parsed_partitions *state,
unsigned long first_sect, int slot,
unsigned long nr_sects)
{
Sector sect;
struct riscix_record *rr;
rr = read_part_sector(state, first_sect, &sect);
if (!rr)
return -1;
strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
if (rr->magic == RISCIX_MAGIC) {
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
int part;
strlcat(state->pp_buf, " <", PAGE_SIZE);
put_partition(state, slot++, first_sect, size);
for (part = 0; part < 8; part++) {
if (rr->part[part].one &&
memcmp(rr->part[part].name, "All\0", 4)) {
put_partition(state, slot++,
le32_to_cpu(rr->part[part].start),
le32_to_cpu(rr->part[part].length));
strlcat(state->pp_buf, "(", PAGE_SIZE);
strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
strlcat(state->pp_buf, ")", PAGE_SIZE);
}
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
} else {
put_partition(state, slot++, first_sect, nr_sects);
}
put_dev_sector(sect);
return slot;
}
#endif
#endif
#define LINUX_NATIVE_MAGIC 0xdeafa1de
#define LINUX_SWAP_MAGIC 0xdeafab1e
struct linux_part {
__le32 magic;
__le32 start_sect;
__le32 nr_sects;
};
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static int linux_partition(struct parsed_partitions *state,
unsigned long first_sect, int slot,
unsigned long nr_sects)
{
Sector sect;
struct linux_part *linuxp;
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
put_partition(state, slot++, first_sect, size);
linuxp = read_part_sector(state, first_sect, &sect);
if (!linuxp)
return -1;
strlcat(state->pp_buf, " <", PAGE_SIZE);
while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
if (slot == state->limit)
break;
put_partition(state, slot++, first_sect +
le32_to_cpu(linuxp->start_sect),
le32_to_cpu(linuxp->nr_sects));
linuxp ++;
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
put_dev_sector(sect);
return slot;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_CUMANA
int adfspart_check_CUMANA(struct parsed_partitions *state)
{
unsigned long first_sector = 0;
unsigned int start_blk = 0;
Sector sect;
unsigned char *data;
char *name = "CUMANA/ADFS";
int first = 1;
int slot = 1;
/*
* Try Cumana style partitions - sector 6 contains ADFS boot block
* with pointer to next 'drive'.
*
* There are unknowns in this code - is the 'cylinder number' of the
* next partition relative to the start of this one - I'm assuming
* it is.
*
* Also, which ID did Cumana use?
*
* This is totally unfinished, and will require more work to get it
* going. Hence it is totally untested.
*/
do {
struct adfs_discrecord *dr;
unsigned int nr_sects;
data = read_part_sector(state, start_blk * 2 + 6, &sect);
if (!data)
return -1;
if (slot == state->limit)
break;
dr = adfs_partition(state, name, data, first_sector, slot++);
if (!dr)
break;
name = NULL;
nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
(dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
dr->secspertrack;
if (!nr_sects)
break;
first = 0;
first_sector += nr_sects;
start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
nr_sects = 0; /* hmm - should be partition size */
switch (data[0x1fc] & 15) {
case 0: /* No partition / ADFS? */
break;
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
/* RISCiX - we don't know how to find the next one. */
slot = riscix_partition(state, first_sector, slot,
nr_sects);
break;
#endif
case PARTITION_LINUX:
slot = linux_partition(state, first_sector, slot,
nr_sects);
break;
}
put_dev_sector(sect);
if (slot == -1)
return -1;
} while (1);
put_dev_sector(sect);
return first ? 0 : 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
/*
* Purpose: allocate ADFS partitions.
*
* Params : hd - pointer to gendisk structure to store partition info.
* dev - device number to access.
*
* Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
*
* Alloc : hda = whole drive
* hda1 = ADFS partition on first drive.
* hda2 = non-ADFS partition.
*/
int adfspart_check_ADFS(struct parsed_partitions *state)
{
unsigned long start_sect, nr_sects, sectscyl, heads;
Sector sect;
unsigned char *data;
struct adfs_discrecord *dr;
unsigned char id;
int slot = 1;
data = read_part_sector(state, 6, &sect);
if (!data)
return -1;
dr = adfs_partition(state, "ADFS", data, 0, slot++);
if (!dr) {
put_dev_sector(sect);
return 0;
}
heads = dr->heads + ((dr->lowsector >> 6) & 1);
sectscyl = dr->secspertrack * heads;
start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
id = data[0x1fc] & 15;
put_dev_sector(sect);
/*
* Work out start of non-adfs partition.
*/
nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
if (start_sect) {
switch (id) {
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
case PARTITION_RISCIX_MFM:
slot = riscix_partition(state, start_sect, slot,
nr_sects);
break;
#endif
case PARTITION_LINUX:
slot = linux_partition(state, start_sect, slot,
nr_sects);
break;
}
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_ICS
struct ics_part {
__le32 start;
__le32 size;
};
static int adfspart_check_ICSLinux(struct parsed_partitions *state,
unsigned long block)
{
Sector sect;
unsigned char *data = read_part_sector(state, block, &sect);
int result = 0;
if (data) {
if (memcmp(data, "LinuxPart", 9) == 0)
result = 1;
put_dev_sector(sect);
}
return result;
}
/*
* Check for a valid ICS partition using the checksum.
*/
static inline int valid_ics_sector(const unsigned char *data)
{
unsigned long sum;
int i;
for (i = 0, sum = 0x50617274; i < 508; i++)
sum += data[i];
sum -= le32_to_cpu(*(__le32 *)(&data[508]));
return sum == 0;
}
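/*
 * In other words (illustrative): the sector is accepted when the sum
 * of its first 508 bytes, seeded with the magic constant 0x50617274
 * ("Part" in ASCII), matches the little-endian 32-bit checksum stored
 * at offset 508.
 */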
/*
* Purpose: allocate ICS partitions.
* Params : hd - pointer to gendisk structure to store partition info.
* dev - device number to access.
* Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
* Alloc : hda = whole drive
* hda1 = ADFS partition 0 on first drive.
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
int adfspart_check_ICS(struct parsed_partitions *state)
{
const unsigned char *data;
const struct ics_part *p;
int slot;
Sector sect;
/*
* Try ICS style partitions - sector 0 contains partition info.
*/
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
if (!valid_ics_sector(data)) {
put_dev_sector(sect);
return 0;
}
strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
u32 start = le32_to_cpu(p->start);
s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
if (slot == state->limit)
break;
/*
* Negative sizes tell the RISC OS ICS driver to ignore
* this partition - in effect it says that this does not
* contain an ADFS filesystem.
*/
if (size < 0) {
size = -size;
/*
* Our own extension - We use the first sector
* of the partition to identify what type this
* partition is. We must not make this visible
* to the filesystem.
*/
if (size > 1 && adfspart_check_ICSLinux(state, start)) {
start += 1;
size -= 1;
}
}
if (size)
put_partition(state, slot++, start, size);
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
struct ptec_part {
__le32 unused1;
__le32 unused2;
__le32 start;
__le32 size;
__le32 unused5;
char type[8];
};
static inline int valid_ptec_sector(const unsigned char *data)
{
unsigned char checksum = 0x2a;
int i;
/*
* If it looks like a PC/BIOS partition, then it
* probably isn't PowerTec.
*/
if (data[510] == 0x55 && data[511] == 0xaa)
return 0;
for (i = 0; i < 511; i++)
checksum += data[i];
return checksum == data[511];
}
/*
 * Purpose: allocate PowerTec partitions.
 * Params : hd - pointer to gendisk structure to store partition info.
 * dev - device number to access.
 * Returns: -1 on error, 0 for no PowerTec table, 1 for partitions ok.
 * Alloc : hda = whole drive
 * hda1 = ADFS partition 0 on first drive.
 * hda2 = ADFS partition 1 on first drive.
 * ..etc..
 */
int adfspart_check_POWERTEC(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
const struct ptec_part *p;
int slot = 1;
int i;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
if (!valid_ptec_sector(data)) {
put_dev_sector(sect);
return 0;
}
strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
u32 start = le32_to_cpu(p->start);
u32 size = le32_to_cpu(p->size);
if (size)
put_partition(state, slot++, start, size);
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
struct eesox_part {
char magic[6];
char name[10];
__le32 start;
__le32 unused6;
__le32 unused7;
__le32 unused8;
};
/*
* Guess who created this format?
*/
static const char eesox_name[] = {
'N', 'e', 'i', 'l', ' ',
'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
};
/*
* EESOX SCSI partition format.
*
* This is a goddamned awful partition format. We don't seem to store
* the size of the partition in this table, only the start addresses.
*
* There are two possibilities where the size comes from:
* 1. The individual ADFS boot block entries that are placed on the disk.
* 2. The start address of the next entry.
*/
int adfspart_check_EESOX(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
unsigned char buffer[256];
struct eesox_part *p;
sector_t start = 0;
int i, slot = 1;
data = read_part_sector(state, 7, &sect);
if (!data)
return -1;
/*
* "Decrypt" the partition table. God knows why...
*/
for (i = 0; i < 256; i++)
buffer[i] = data[i] ^ eesox_name[i & 15];
put_dev_sector(sect);
for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
sector_t next;
if (memcmp(p->magic, "Eesox", 6))
break;
next = le32_to_cpu(p->start);
if (i)
put_partition(state, slot++, start, next - start);
start = next;
}
if (i != 0) {
sector_t size;
size = get_capacity(state->bdev->bd_disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
return i ? 1 : 0;
}
#endif

14
block/partitions/acorn.h Normal file
View file

@ -0,0 +1,14 @@
/*
* linux/fs/partitions/acorn.h
*
* Copyright (C) 1996-2001 Russell King.
*
* I _hate_ this partitioning mess - why can't we have one defined
* format, and everyone stick to it?
*/
int adfspart_check_CUMANA(struct parsed_partitions *state);
int adfspart_check_ADFS(struct parsed_partitions *state);
int adfspart_check_ICS(struct parsed_partitions *state);
int adfspart_check_POWERTEC(struct parsed_partitions *state);
int adfspart_check_EESOX(struct parsed_partitions *state);

293
block/partitions/aix.c Normal file
View file

@ -0,0 +1,293 @@
/*
* fs/partitions/aix.c
*
* Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be>
*/
#include "check.h"
#include "aix.h"
struct lvm_rec {
char lvm_id[4]; /* "_LVM" */
char reserved4[16];
__be32 lvmarea_len;
__be32 vgda_len;
__be32 vgda_psn[2];
char reserved36[10];
__be16 pp_size; /* log2(pp_size) */
char reserved46[12];
__be16 version;
};
struct vgda {
__be32 secs;
__be32 usec;
char reserved8[16];
__be16 numlvs;
__be16 maxlvs;
__be16 pp_size;
__be16 numpvs;
__be16 total_vgdas;
__be16 vgda_size;
};
struct lvd {
__be16 lv_ix;
__be16 res2;
__be16 res4;
__be16 maxsize;
__be16 lv_state;
__be16 mirror;
__be16 mirror_policy;
__be16 num_lps;
__be16 res10[8];
};
struct lvname {
char name[64];
};
struct ppe {
__be16 lv_ix;
unsigned short res2;
unsigned short res4;
__be16 lp_ix;
unsigned short res8[12];
};
struct pvd {
char reserved0[16];
__be16 pp_count;
char reserved18[2];
__be32 psn_part1;
char reserved24[8];
struct ppe ppe[1016];
};
#define LVM_MAXLVS 256
/**
* last_lba(): return number of last logical block of device
* @bdev: block device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
return (bdev->bd_inode->i_size >> 9) - 1ULL;
}
/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state
* @lba
* @buffer
* @count
*
* Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
size_t count)
{
size_t totalreadcount = 0;
if (!buffer || lba + count / 512 > last_lba(state->bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
unsigned char *data = read_part_sector(state, lba++, &sect);
if (!data)
break;
if (copied > count)
copied = count;
memcpy(buffer, data, copied);
put_dev_sector(sect);
buffer += copied;
totalreadcount += copied;
count -= copied;
}
return totalreadcount;
}
/**
* alloc_pvd(): reads physical volume descriptor
* @state
* @lba
*
* Description: Returns pvd on success, NULL on error.
* Allocates space for pvd and fill it with disk blocks at @lba
* Notes: remember to free pvd when you're done!
*/
static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba)
{
size_t count = sizeof(struct pvd);
struct pvd *p;
p = kmalloc(count, GFP_KERNEL);
if (!p)
return NULL;
if (read_lba(state, lba, (u8 *) p, count) < count) {
kfree(p);
return NULL;
}
return p;
}
/**
* alloc_lvn(): reads logical volume names
* @state
* @lba
*
* Description: Returns lvn on success, NULL on error.
* Allocates space for lvn and fill it with disk blocks at @lba
* Notes: remember to free lvn when you're done!
*/
static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba)
{
size_t count = sizeof(struct lvname) * LVM_MAXLVS;
struct lvname *p;
p = kmalloc(count, GFP_KERNEL);
if (!p)
return NULL;
if (read_lba(state, lba, (u8 *) p, count) < count) {
kfree(p);
return NULL;
}
return p;
}
int aix_partition(struct parsed_partitions *state)
{
int ret = 0;
Sector sect;
unsigned char *d;
u32 pp_bytes_size;
u32 pp_blocks_size = 0;
u32 vgda_sector = 0;
u32 vgda_len = 0;
int numlvs = 0;
struct pvd *pvd;
struct lv_info {
unsigned short pps_per_lv;
unsigned short pps_found;
unsigned char lv_is_contiguous;
} *lvip;
struct lvname *n = NULL;
d = read_part_sector(state, 7, &sect);
if (d) {
struct lvm_rec *p = (struct lvm_rec *)d;
u16 lvm_version = be16_to_cpu(p->version);
char tmp[64];
if (lvm_version == 1) {
int pp_size_log2 = be16_to_cpu(p->pp_size);
pp_bytes_size = 1 << pp_size_log2;
pp_blocks_size = pp_bytes_size / 512;
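/* e.g. pp_size == 25 -> 2^25-byte (32 MiB) physical partitions,
 * i.e. 65536 512-byte blocks per PP (illustrative) */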
snprintf(tmp, sizeof(tmp),
" AIX LVM header version %u found\n",
lvm_version);
vgda_len = be32_to_cpu(p->vgda_len);
vgda_sector = be32_to_cpu(p->vgda_psn[0]);
} else {
snprintf(tmp, sizeof(tmp),
" unsupported AIX LVM version %d found\n",
lvm_version);
}
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
}
if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
struct vgda *p = (struct vgda *)d;
numlvs = be16_to_cpu(p->numlvs);
put_dev_sector(sect);
}
lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
if (!lvip)
return 0;
if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
struct lvd *p = (struct lvd *)d;
int i;
n = alloc_lvn(state, vgda_sector + vgda_len - 33);
if (n) {
int foundlvs = 0;
for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) {
lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps);
if (lvip[i].pps_per_lv)
foundlvs += 1;
}
}
put_dev_sector(sect);
}
pvd = alloc_pvd(state, vgda_sector + 17);
if (pvd) {
int numpps = be16_to_cpu(pvd->pp_count);
int psn_part1 = be32_to_cpu(pvd->psn_part1);
int i;
int cur_lv_ix = -1;
int next_lp_ix = 1;
int lp_ix;
for (i = 0; i < numpps; i += 1) {
struct ppe *p = pvd->ppe + i;
unsigned int lv_ix;
lp_ix = be16_to_cpu(p->lp_ix);
if (!lp_ix) {
next_lp_ix = 1;
continue;
}
lv_ix = be16_to_cpu(p->lv_ix) - 1;
if (lv_ix >= state->limit) {
cur_lv_ix = -1;
continue;
}
lvip[lv_ix].pps_found += 1;
if (lp_ix == 1) {
cur_lv_ix = lv_ix;
next_lp_ix = 1;
} else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) {
next_lp_ix = 1;
continue;
}
if (lp_ix == lvip[lv_ix].pps_per_lv) {
char tmp[70];
put_partition(state, lv_ix + 1,
(i + 1 - lp_ix) * pp_blocks_size + psn_part1,
lvip[lv_ix].pps_per_lv * pp_blocks_size);
snprintf(tmp, sizeof(tmp), " <%s>\n",
n[lv_ix].name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
lvip[lv_ix].lv_is_contiguous = 1;
ret = 1;
next_lp_ix = 1;
} else
next_lp_ix += 1;
}
for (i = 0; i < state->limit; i += 1)
if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
pr_warn("partition %s (%u pp's found) is "
"not contiguous\n",
n[i].name, lvip[i].pps_found);
kfree(pvd);
}
kfree(n);
kfree(lvip);
return ret;
}

1
block/partitions/aix.h Normal file
View file

@ -0,0 +1 @@
extern int aix_partition(struct parsed_partitions *state);

141
block/partitions/amiga.c Normal file
View file

@ -0,0 +1,141 @@
/*
* fs/partitions/amiga.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#define pr_fmt(fmt) fmt
#include <linux/types.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
#include "amiga.h"
static __inline__ u32
checksum_block(__be32 *m, int size)
{
u32 sum = 0;
while (size--)
sum += be32_to_cpu(*m++);
return sum;
}
int amiga_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
int start_sect, nr_sects, blk, part, res = 0;
int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
char b[BDEVNAME_SIZE];
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
pr_err("Dev %s: unable to read RDB block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
continue;
rdb = (struct RigidDiskBlock *)data;
if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
break;
/* Try again with 0xdc..0xdf zeroed, Windows might have
* trashed it.
*/
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
blk);
break;
}
pr_err("Dev %s: RDB in block %d has bad checksum\n",
bdevname(state->bdev, b), blk);
}
/* blksize is blocks per 512 byte standard block */
blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
{
char tmp[7 + 10 + 1 + 1];
/* Be more informative */
snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
blk *= blksize; /* Read in terms partition table understands */
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
pr_err("Dev %s: unable to read partition block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
pb = (struct PartitionBlock *)data;
blk = be32_to_cpu(pb->pb_Next);
if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
continue;
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
/* Tell Kernel about it */
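/*
 * pb_Environment[] indices follow the AmigaOS DosEnvec layout
 * (assumed here): [3] = surfaces, [4] = sectors per block,
 * [5] = blocks per track, [6] = reserved blocks, [9] = low cylinder,
 * [10] = high cylinder, [16] = dostype -- hence the
 * cylinder-range * surfaces * blocks-per-track products below.
 */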
nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
be32_to_cpu(pb->pb_Environment[9])) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
if (!nr_sects)
continue;
start_sect = be32_to_cpu(pb->pb_Environment[9]) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
char dostype[4];
char tmp[42];
__be32 *dt = (__be32 *)dostype;
*dt = pb->pb_Environment[16];
if (dostype[3] < ' ')
snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2], dostype[3] + '@' );
else
snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
res = 1;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
rdb_done:
return res;
}

6
block/partitions/amiga.h Normal file
View file

@ -0,0 +1,6 @@
/*
* fs/partitions/amiga.h
*/
int amiga_partition(struct parsed_partitions *state);

149
block/partitions/atari.c Normal file
View file

@ -0,0 +1,149 @@
/*
* fs/partitions/atari.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include <linux/ctype.h>
#include "check.h"
#include "atari.h"
/* ++guenther: this should be settable by the user ("make config")?.
*/
#define ICD_PARTS
/* check if a partition entry looks valid -- Atari format is assumed if at
least one of the primary entries is ok this way */
#define VALID_PARTITION(pi,hdsiz) \
(((pi)->flg & 1) && \
isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
be32_to_cpu((pi)->st) <= (hdsiz) && \
be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
static inline int OK_id(char *s)
{
return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
memcmp (s, "RAW", 3) == 0 ;
}
int atari_partition(struct parsed_partitions *state)
{
Sector sect;
struct rootsector *rs;
struct partition_info *pi;
u32 extensect;
u32 hd_size;
int slot;
#ifdef ICD_PARTS
int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
#endif
rs = read_part_sector(state, 0, &sect);
if (!rs)
return -1;
/* Verify this is an Atari rootsector: */
hd_size = state->bdev->bd_inode->i_size >> 9;
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
!VALID_PARTITION(&rs->part[3], hd_size)) {
/*
* if there's no valid primary partition, assume that no Atari
* format partition table (there's no reliable magic or the like
* :-()
*/
put_dev_sector(sect);
return 0;
}
pi = &rs->part[0];
strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
struct rootsector *xrs;
Sector sect2;
ulong partsect;
if ( !(pi->flg & 1) )
continue;
/* active partition */
if (memcmp (pi->id, "XGM", 3) != 0) {
/* we don't care about other id's */
put_partition (state, slot, be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
continue;
}
/* extension partition */
#ifdef ICD_PARTS
part_fmt = 1;
#endif
strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
xrs = read_part_sector(state, partsect, &sect2);
if (!xrs) {
printk (" block %ld read failed\n", partsect);
put_dev_sector(sect);
return -1;
}
/* ++roman: sanity check: bit 0 of flg field must be set */
if (!(xrs->part[0].flg & 1)) {
printk( "\nFirst sub-partition in extended partition is not valid!\n" );
put_dev_sector(sect2);
break;
}
put_partition(state, slot,
partsect + be32_to_cpu(xrs->part[0].st),
be32_to_cpu(xrs->part[0].siz));
if (!(xrs->part[1].flg & 1)) {
/* end of linked partition list */
put_dev_sector(sect2);
break;
}
if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
printk("\nID of extended partition is not XGM!\n");
put_dev_sector(sect2);
break;
}
partsect = be32_to_cpu(xrs->part[1].st) + extensect;
put_dev_sector(sect2);
if (++slot == state->limit) {
printk( "\nMaximum number of partitions reached!\n" );
break;
}
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
}
#ifdef ICD_PARTS
if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
pi = &rs->icdpart[0];
/* sanity check: no ICD format if first partition invalid */
if (OK_id(pi->id)) {
strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
/* accept only GEM,BGM,RAW,LNX,SWP partitions */
if (!((pi->flg & 1) && OK_id(pi->id)))
continue;
part_fmt = 2;
put_partition (state, slot,
be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
}
}
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

36
block/partitions/atari.h Normal file
View file

@ -0,0 +1,36 @@
/*
* fs/partitions/atari.h
* Moved by Russell King from:
*
* linux/include/linux/atari_rootsec.h
* definitions for Atari Rootsector layout
* by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
*
* modified for ICD/Supra partitioning scheme restricted to at most 12
* partitions
* by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
*/
#include <linux/compiler.h>
struct partition_info
{
u8 flg; /* bit 0: active; bit 7: bootable */
char id[3]; /* "GEM", "BGM", "XGM", or other */
__be32 st; /* start of partition */
__be32 siz; /* length of partition */
};
struct rootsector
{
char unused[0x156]; /* room for boot code */
struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
char unused2[0xc];
u32 hd_siz; /* size of disk in blocks */
struct partition_info part[4];
u32 bsl_st; /* start of bad sector list */
u32 bsl_cnt; /* length of bad sector list */
u16 checksum; /* checksum for bootable disks */
} __packed;
int atari_partition(struct parsed_partitions *state);

197
block/partitions/check.c Normal file
View file

@ -0,0 +1,197 @@
/*
* fs/partitions/check.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*
* We now have independent partition support from the
* block drivers, which allows all the partition code to
* be grouped in one location, and it to be mostly self
* contained.
*
* Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
*/
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
#include "check.h"
#include "acorn.h"
#include "amiga.h"
#include "atari.h"
#include "ldm.h"
#include "mac.h"
#include "msdos.h"
#include "osf.h"
#include "sgi.h"
#include "sun.h"
#include "ibm.h"
#include "ultrix.h"
#include "efi.h"
#include "karma.h"
#include "sysv68.h"
#include "cmdline.h"
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
static int (*check_part[])(struct parsed_partitions *) = {
/*
* Probe partition formats with tables at disk address 0
* that also have an ADFS boot block at 0xdc0.
*/
#ifdef CONFIG_ACORN_PARTITION_ICS
adfspart_check_ICS,
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
adfspart_check_POWERTEC,
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
adfspart_check_EESOX,
#endif
/*
* Now move on to formats that only have partition info at
* disk address 0xdc0. Since these may also have stale
* PC/BIOS partition tables, they need to come before
* the msdos entry.
*/
#ifdef CONFIG_ACORN_PARTITION_CUMANA
adfspart_check_CUMANA,
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
adfspart_check_ADFS,
#endif
#ifdef CONFIG_CMDLINE_PARTITION
cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
ldm_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
msdos_partition,
#endif
#ifdef CONFIG_OSF_PARTITION
osf_partition,
#endif
#ifdef CONFIG_SUN_PARTITION
sun_partition,
#endif
#ifdef CONFIG_AMIGA_PARTITION
amiga_partition,
#endif
#ifdef CONFIG_ATARI_PARTITION
atari_partition,
#endif
#ifdef CONFIG_MAC_PARTITION
mac_partition,
#endif
#ifdef CONFIG_ULTRIX_PARTITION
ultrix_partition,
#endif
#ifdef CONFIG_IBM_PARTITION
ibm_partition,
#endif
#ifdef CONFIG_KARMA_PARTITION
karma_partition,
#endif
#ifdef CONFIG_SYSV68_PARTITION
sysv68_partition,
#endif
NULL
};
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
struct parsed_partitions *state;
int nr;
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
return NULL;
nr = disk_max_parts(hd);
state->parts = vzalloc(nr * sizeof(state->parts[0]));
if (!state->parts) {
kfree(state);
return NULL;
}
state->limit = nr;
return state;
}
void free_partitions(struct parsed_partitions *state)
{
vfree(state->parts);
kfree(state);
}
struct parsed_partitions *
check_partition(struct gendisk *hd, struct block_device *bdev)
{
struct parsed_partitions *state;
int i, res, err;
state = allocate_partitions(hd);
if (!state)
return NULL;
state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf) {
free_partitions(state);
return NULL;
}
state->pp_buf[0] = '\0';
state->bdev = bdev;
disk_name(hd, 0, state->name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
i = res = err = 0;
while (!res && check_part[i]) {
memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
res = check_part[i++](state);
if (res < 0) {
/* We have hit an I/O error which we don't report now.
* But record it, and let the others do their job.
*/
err = res;
res = 0;
}
}
if (res > 0) {
printk(KERN_INFO "%s", state->pp_buf);
free_page((unsigned long)state->pp_buf);
return state;
}
if (state->access_beyond_eod)
err = -ENOSPC;
if (err)
/* The partition is unrecognized. So report I/O errors if there were any */
res = err;
if (!res)
strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
else if (warn_no_part)
strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
printk(KERN_INFO "%s", state->pp_buf);
free_page((unsigned long)state->pp_buf);
free_partitions(state);
return ERR_PTR(res);
}

54
block/partitions/check.h Normal file
View file

@ -0,0 +1,54 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
/*
* add_gd_partition adds a partitions details to the devices partition
* description.
*/
struct parsed_partitions {
struct block_device *bdev;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
sector_t size;
int flags;
bool has_info;
struct partition_meta_info info;
} *parts;
int next;
int limit;
bool access_beyond_eod;
char *pp_buf;
};
void free_partitions(struct parsed_partitions *state);
struct parsed_partitions *
check_partition(struct gendisk *, struct block_device *);
static inline void *read_part_sector(struct parsed_partitions *state,
sector_t n, Sector *p)
{
if (n >= get_capacity(state->bdev->bd_disk)) {
state->access_beyond_eod = true;
return NULL;
}
return read_dev_sector(state->bdev, n, p);
}
static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
if (n < p->limit) {
char tmp[1 + BDEVNAME_SIZE + 10 + 1];
p->parts[n].from = from;
p->parts[n].size = size;
snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
strlcat(p->pp_buf, tmp, PAGE_SIZE);
}
}
extern int warn_no_part;
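/*
 * Typical parser shape (illustrative sketch; the real parsers under
 * block/partitions/ follow this pattern): read a sector, record any
 * partitions found, release the sector, and return 1 for "found",
 * 0 for "not our format" or -1 for an I/O error.  is_my_format() is a
 * hypothetical check standing in for the real signature test.
 *
 *	Sector sect;
 *	unsigned char *data = read_part_sector(state, 0, &sect);
 *
 *	if (!data)
 *		return -1;
 *	if (!is_my_format(data)) {
 *		put_dev_sector(sect);
 *		return 0;
 *	}
 *	put_partition(state, 1, start, size);
 *	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 *	put_dev_sector(sect);
 *	return 1;
 */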

99
block/partitions/cmdline.c Normal file
View file

@ -0,0 +1,99 @@
/*
* Copyright (C) 2013 HUAWEI
* Author: Cai Zhiyong <caizhiyong@huawei.com>
*
* Read block device partition table from the command line.
* Typically used for fixed block (eMMC) embedded devices.
* It has no MBR, so saves storage space. Bootloader can be easily accessed
* by absolute address of data on the block device.
* Users can easily change the partition.
*
* The format for the command line is just like mtdparts.
*
* For further information, see "Documentation/block/cmdline-partition.txt"
*
*/
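/*
 * Command line format (sketch, following the mtdparts-style syntax
 * described in the documentation referenced above):
 *
 *	blkdevparts=<blkdev-def>[;<blkdev-def>]
 *	  <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
 *	    <partdef>  := <size>[@<offset>](part-name)
 *
 * e.g. blkdevparts=mmcblk0:1m(boot),512k(env),-(rootfs)
 * where "-" means "use the remaining space".
 */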
#include <linux/cmdline-parser.h>
#include "check.h"
#include "cmdline.h"
static char *cmdline;
static struct cmdline_parts *bdev_parts;
static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
{
int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
struct parsed_partitions *state = (struct parsed_partitions *)param;
if (slot >= state->limit)
return 1;
put_partition(state, slot, subpart->from >> 9,
subpart->size >> 9);
info = &state->parts[slot].info;
label_min = min_t(int, sizeof(info->volname) - 1,
sizeof(subpart->name));
strncpy(info->volname, subpart->name, label_min);
info->volname[label_min] = '\0';
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
state->parts[slot].has_info = true;
return 0;
}
static int __init cmdline_parts_setup(char *s)
{
cmdline = s;
return 1;
}
__setup("blkdevparts=", cmdline_parts_setup);
/*
* Purpose: allocate cmdline partitions.
* Returns:
* -1 if unable to read the partition table
* 0 if this isn't our partition table
* 1 if successful
*/
int cmdline_partition(struct parsed_partitions *state)
{
sector_t disk_size;
char bdev[BDEVNAME_SIZE];
struct cmdline_parts *parts;
if (cmdline) {
if (bdev_parts)
cmdline_parts_free(&bdev_parts);
if (cmdline_parts_parse(&bdev_parts, cmdline)) {
cmdline = NULL;
return -1;
}
cmdline = NULL;
}
if (!bdev_parts)
return 0;
bdevname(state->bdev, bdev);
parts = cmdline_parts_find(bdev_parts, bdev);
if (!parts)
return 0;
disk_size = get_capacity(state->bdev->bd_disk) << 9;
cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

2
block/partitions/cmdline.h Normal file
View file

@ -0,0 +1,2 @@
int cmdline_partition(struct parsed_partitions *state);

737
block/partitions/efi.c Normal file
View file

@ -0,0 +1,737 @@
/************************************************************
* EFI GUID Partition Table handling
*
* http://www.uefi.org/specs/
* http://www.intel.com/technology/efi/
*
* efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* TODO:
*
* Changelog:
* Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com>
* - detect hybrid MBRs, tighter pMBR checking & cleanups.
*
* Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
* - test for valid PMBR and valid PGPT before ever reading
* AGPT, allow override with 'gpt' kernel command line option.
* - check for first/last_usable_lba outside of size of disk
*
* Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Ported to 2.5.7-pre1 and 2.5.7-dj2
* - Applied patch to avoid fault in alternate header handling
* - cleaned up find_valid_gpt
* - On-disk structure and copy in memory is *always* LE now -
* swab fields as needed
* - remove print_gpt_header()
* - only use first max_p partition entries, to keep the kernel minor number
* and partition numbers tied.
*
* Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Removed __PRIPTR_PREFIX - not being used
*
* Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
*
* Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Added compare_gpts().
* - moved le_efi_guid_to_cpus() back into this file. GPT is the only
* thing that keeps EFI GUIDs on disk.
* - Changed gpt structure names and members to be simpler and more Linux-like.
*
* Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
*
* Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Changed function comments to DocBook style per Andreas Dilger suggestion.
*
* Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Change read_lba() to use the page cache per Al Viro's work.
* - print u64s properly on all architectures
* - fixed debug_printk(), now Dprintk()
*
* Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Style cleanups
* - made most functions static
* - Endianness addition
* - remove test for second alternate header, as it's not per spec,
* and is unnecessary. There's now a method to read/write the last
* sector of an odd-sized disk from user space. No tools have ever
* been released which used this code, so it's effectively dead.
* - Per Asit Mallick of Intel, added a test for a valid PMBR.
* - Added kernel command line option 'gpt' to override valid PMBR test.
*
* Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
* - added devfs volume UUID support (/dev/volumes/uuids) for
* mounting file systems by the partition GUID.
*
* Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Moved crc32() to linux/lib, added efi_crc32().
*
* Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Replaced Intel's CRC32 function with an equivalent
* non-license-restricted version.
*
* Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Fixed the last_lba() call to return the proper last block
*
* Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Thanks to Andries Brouwer for his debugging assistance.
* - Code works, detects all the partitions.
*
************************************************************/
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <linux/ctype.h>
#include <linux/math64.h>
#include <linux/slab.h>
#include "check.h"
#include "efi.h"
/* This allows a kernel command line option 'gpt' to override
* the test for invalid PMBR. Not __initdata because reloading
* the partition tables happens after init too.
*/
static int force_gpt;
static int __init
force_gpt_fn(char *str)
{
force_gpt = 1;
return 1;
}
__setup("gpt", force_gpt_fn);
/**
* efi_crc32() - EFI version of crc32 function
* @buf: buffer to calculate crc32 of
* @len: length of buf
*
* Description: Returns EFI-style CRC32 value for @buf
*
* This function uses the little endian Ethernet polynomial
* but seeds the function with ~0, and xor's with ~0 at the end.
* Note, the EFI Specification, v1.02, has a reference to
* Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
*/
static inline u32
efi_crc32(const void *buf, unsigned long len)
{
return (crc32(~0L, buf, len) ^ ~0L);
}
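/*
 * Minimal sketch of the check performed in is_gpt_valid() below,
 * assuming @gpt points at a header just read from disk:
 *
 *   u32 expected = le32_to_cpu(gpt->header_crc32);
 *   gpt->header_crc32 = 0;
 *   if (efi_crc32(gpt, le32_to_cpu(gpt->header_size)) != expected)
 *           ... reject the header ...
 *
 * Because of the ~0 seed and final XOR, the result matches the common
 * zlib/IEEE 802.3 CRC-32 that the UEFI specification calls for.
 */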
/**
* last_lba(): return number of last logical block of device
* @bdev: block device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
return div_u64(bdev->bd_inode->i_size,
bdev_logical_block_size(bdev)) - 1ULL;
}
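/*
 * Worked example: a 1 GiB device with 512-byte logical blocks has
 * i_size == 1073741824, so last_lba() returns 1073741824 / 512 - 1 =
 * 2097151, the zero-based index of the final addressable block.
 */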
static inline int pmbr_part_valid(gpt_mbr_record *part)
{
if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT)
goto invalid;
/* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */
if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA)
goto invalid;
return GPT_MBR_PROTECTIVE;
invalid:
return 0;
}
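/*
 * For reference, one sketch of a protective MBR record this check
 * accepts:
 *
 *   .os_type      = 0xEE (EFI_PMBR_OSTYPE_EFI_GPT)
 *   .starting_lba = cpu_to_le32(1)   (the primary GPT header)
 *   .size_in_lba  = cpu_to_le32(min(disk sectors - 1, 0xFFFFFFFF))
 *
 * Only the first two fields are validated here; the size is merely
 * sanity-checked (a pr_debug, not a rejection) in is_pmbr_valid().
 */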
/**
* is_pmbr_valid(): test Protective MBR for validity
* @mbr: pointer to a legacy mbr structure
* @total_sectors: amount of sectors in the device
*
* Description: Checks for a valid protective or hybrid
* master boot record (MBR). The validity of a pMBR depends
* on all of the following properties:
* 1) MSDOS signature is in the last two bytes of the MBR
* 2) One partition of type 0xEE is found
*
* In addition, a hybrid MBR will have up to three additional
* primary partitions, which point to the same space that's
* marked out by up to three GPT partitions.
*
* Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or
* GPT_MBR_HYBRID depending on the device layout.
*/
static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors)
{
uint32_t sz = 0;
int i, part = 0, ret = 0; /* invalid by default */
if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
goto done;
for (i = 0; i < 4; i++) {
ret = pmbr_part_valid(&mbr->partition_record[i]);
if (ret == GPT_MBR_PROTECTIVE) {
part = i;
/*
* Ok, we at least know that there's a protective MBR,
* now check if there are other partition types for
* hybrid MBR.
*/
goto check_hybrid;
}
}
if (ret != GPT_MBR_PROTECTIVE)
goto done;
check_hybrid:
for (i = 0; i < 4; i++)
if ((mbr->partition_record[i].os_type !=
EFI_PMBR_OSTYPE_EFI_GPT) &&
(mbr->partition_record[i].os_type != 0x00))
ret = GPT_MBR_HYBRID;
/*
* Protective MBRs take up the lesser of the whole disk
* or 2 TiB (32bit LBA), ignoring the rest of the disk.
* Some partitioning programs, nonetheless, choose to set
* the size to the maximum 32-bit limitation, disregarding
* the disk size.
*
* Hybrid MBRs do not necessarily comply with this.
*
* Consider a bad value here to be a warning to support dd'ing
* an image from a smaller disk to a larger disk.
*/
if (ret == GPT_MBR_PROTECTIVE) {
sz = le32_to_cpu(mbr->partition_record[part].size_in_lba);
if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF)
pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n",
sz, min_t(uint32_t,
total_sectors - 1, 0xFFFFFFFF));
}
done:
return ret;
}
/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
* @buffer: destination buffer
* @count: bytes to read
*
* Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state,
u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
struct block_device *bdev = state->bdev;
sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!buffer || lba > last_lba(bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
unsigned char *data = read_part_sector(state, n++, &sect);
if (!data)
break;
if (copied > count)
copied = count;
memcpy(buffer, data, copied);
put_dev_sector(sect);
buffer += copied;
totalreadcount += copied;
count -= copied;
}
return totalreadcount;
}
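/*
 * Example of the LBA-to-sector conversion above: with 4096-byte
 * logical blocks, bdev_logical_block_size() / 512 == 8, so lba 2 maps
 * to 512-byte sector 16 and the loop then copies one 512-byte chunk
 * per read_part_sector() call until @count is exhausted.
 */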
/**
* alloc_read_gpt_entries(): reads partition entries from disk
* @state: disk parsed partitions
* @gpt: GPT header
*
* Description: Returns ptes on success, NULL on error.
* Allocates space for PTEs based on information found in @gpt.
* Notes: remember to free pte when you're done!
*/
static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
gpt_header *gpt)
{
size_t count;
gpt_entry *pte;
if (!gpt)
return NULL;
count = le32_to_cpu(gpt->num_partition_entries) *
le32_to_cpu(gpt->sizeof_partition_entry);
if (!count)
return NULL;
pte = kmalloc(count, GFP_KERNEL);
if (!pte)
return NULL;
if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
(u8 *) pte, count) < count) {
kfree(pte);
pte = NULL;
return NULL;
}
return pte;
}
/**
* alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
* and fills a GPT header starting at @lba from @state->bdev.
* Note: remember to free gpt when finished with it.
*/
static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
u64 lba)
{
gpt_header *gpt;
unsigned ssz = bdev_logical_block_size(state->bdev);
gpt = kmalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt = NULL;
return NULL;
}
return gpt;
}
/**
* is_gpt_valid() - tests one GPT header and PTEs for validity
* @state: disk parsed partitions
* @lba: logical block address of the GPT header to test
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
*/
static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
gpt_header **gpt, gpt_entry **ptes)
{
u32 crc, origcrc;
u64 lastlba;
if (!ptes)
return 0;
if (!(*gpt = alloc_read_gpt_header(state, lba)))
return 0;
/* Check the GUID Partition Table signature */
if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
pr_debug("GUID Partition Table Header signature is wrong:"
"%lld != %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->signature),
(unsigned long long)GPT_HEADER_SIGNATURE);
goto fail;
}
/* Check that the GUID Partition Table header size is not too large */
if (le32_to_cpu((*gpt)->header_size) >
bdev_logical_block_size(state->bdev)) {
pr_debug("GUID Partition Table Header size is too large: %u > %u\n",
le32_to_cpu((*gpt)->header_size),
bdev_logical_block_size(state->bdev));
goto fail;
}
/* Check that the GUID Partition Table header size is not too small */
if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) {
pr_debug("GUID Partition Table Header size is too small: %u < %zu\n",
le32_to_cpu((*gpt)->header_size),
sizeof(gpt_header));
goto fail;
}
/* Check the GUID Partition Table CRC */
origcrc = le32_to_cpu((*gpt)->header_crc32);
(*gpt)->header_crc32 = 0;
crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
if (crc != origcrc) {
pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
crc, origcrc);
goto fail;
}
(*gpt)->header_crc32 = cpu_to_le32(origcrc);
/* Check that the my_lba entry points to the LBA that contains
* the GUID Partition Table */
if (le64_to_cpu((*gpt)->my_lba) != lba) {
pr_debug("GPT my_lba incorrect: %lld != %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->my_lba),
(unsigned long long)lba);
goto fail;
}
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
lastlba = last_lba(state->bdev);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
(unsigned long long)lastlba);
goto fail;
}
if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
(unsigned long long)lastlba);
goto fail;
}
if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) {
pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba));
goto fail;
}
/* Check that sizeof_partition_entry has the correct value */
if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
pr_debug("GUID Partitition Entry Size check failed.\n");
goto fail;
}
if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
goto fail;
/* Check the GUID Partition Entry Array CRC */
crc = efi_crc32((const unsigned char *) (*ptes),
le32_to_cpu((*gpt)->num_partition_entries) *
le32_to_cpu((*gpt)->sizeof_partition_entry));
if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
pr_debug("GUID Partitition Entry Array CRC check failed.\n");
goto fail_ptes;
}
/* We're done, all's well */
return 1;
fail_ptes:
kfree(*ptes);
*ptes = NULL;
fail:
kfree(*gpt);
*gpt = NULL;
return 0;
}
/**
* is_pte_valid() - tests one PTE for validity
* @pte:pte to check
* @lastlba: last lba of the disk
*
* Description: returns 1 if valid, 0 on error.
*/
static inline int
is_pte_valid(const gpt_entry *pte, const u64 lastlba)
{
if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
le64_to_cpu(pte->starting_lba) > lastlba ||
le64_to_cpu(pte->ending_lba) > lastlba)
return 0;
return 1;
}
/**
* compare_gpts() - Search disk for valid GPT headers and PTEs
* @pgpt: primary GPT header
* @agpt: alternate GPT header
* @lastlba: last LBA number
*
* Description: Returns nothing. Sanity checks pgpt and agpt fields
* and prints warnings on discrepancies.
*
*/
static void
compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
{
int error_found = 0;
if (!pgpt || !agpt)
return;
if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->my_lba),
(unsigned long long)le64_to_cpu(agpt->alternate_lba));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)le64_to_cpu(agpt->my_lba));
error_found++;
}
if (le64_to_cpu(pgpt->first_usable_lba) !=
le64_to_cpu(agpt->first_usable_lba)) {
pr_warn("GPT:first_usable_lbas don't match.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
(unsigned long long)le64_to_cpu(agpt->first_usable_lba));
error_found++;
}
if (le64_to_cpu(pgpt->last_usable_lba) !=
le64_to_cpu(agpt->last_usable_lba)) {
pr_warn("GPT:last_usable_lbas don't match.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
(unsigned long long)le64_to_cpu(agpt->last_usable_lba));
error_found++;
}
if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
pr_warn("GPT:disk_guids don't match.\n");
error_found++;
}
if (le32_to_cpu(pgpt->num_partition_entries) !=
le32_to_cpu(agpt->num_partition_entries)) {
pr_warn("GPT:num_partition_entries don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->num_partition_entries),
le32_to_cpu(agpt->num_partition_entries));
error_found++;
}
if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
le32_to_cpu(agpt->sizeof_partition_entry)) {
pr_warn("GPT:sizeof_partition_entry values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->sizeof_partition_entry),
le32_to_cpu(agpt->sizeof_partition_entry));
error_found++;
}
if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
le32_to_cpu(agpt->partition_entry_array_crc32)) {
pr_warn("GPT:partition_entry_array_crc32 values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->partition_entry_array_crc32),
le32_to_cpu(agpt->partition_entry_array_crc32));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)lastlba);
error_found++;
}
if (le64_to_cpu(agpt->my_lba) != lastlba) {
pr_warn("GPT:Alternate GPT header not at the end of the disk.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(agpt->my_lba),
(unsigned long long)lastlba);
error_found++;
}
if (error_found)
pr_warn("GPT: Use GNU Parted to correct GPT errors.\n");
return;
}
/**
* find_valid_gpt() - Search disk for valid GPT headers and PTEs
* @state: disk parsed partitions
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: Returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
* Validity depends on PMBR being valid (or being overridden by the
* 'gpt' kernel command line option) and finding either the Primary
* GPT header and PTEs valid, or the Alternate GPT header and PTEs
* valid. If the Primary GPT header is not valid, the Alternate GPT header
* is not checked unless the 'gpt' kernel command line option is passed.
* This protects against devices which misreport their size, and forces
* the user to decide to use the Alternate GPT.
*/
static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
gpt_entry **ptes)
{
int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
u64 lastlba;
if (!ptes)
return 0;
lastlba = last_lba(state->bdev);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
if (!legacymbr)
goto fail;
read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr));
good_pmbr = is_pmbr_valid(legacymbr, total_sectors);
kfree(legacymbr);
if (!good_pmbr)
goto fail;
pr_debug("Device has a %s MBR\n",
good_pmbr == GPT_MBR_PROTECTIVE ?
"protective" : "hybrid");
}
good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
&pgpt, &pptes);
if (good_pgpt)
good_agpt = is_gpt_valid(state,
le64_to_cpu(pgpt->alternate_lba),
&agpt, &aptes);
if (!good_agpt && force_gpt)
good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
goto fail;
compare_gpts(pgpt, agpt, lastlba);
/* The good cases */
if (good_pgpt) {
*gpt = pgpt;
*ptes = pptes;
kfree(agpt);
kfree(aptes);
if (!good_agpt)
pr_warn("Alternate GPT is invalid, using primary GPT.\n");
return 1;
}
else if (good_agpt) {
*gpt = agpt;
*ptes = aptes;
kfree(pgpt);
kfree(pptes);
pr_warn("Primary GPT is invalid, using alternate GPT.\n");
return 1;
}
fail:
kfree(pgpt);
kfree(agpt);
kfree(pptes);
kfree(aptes);
*gpt = NULL;
*ptes = NULL;
return 0;
}
/**
* efi_partition(struct parsed_partitions *state)
* @state: disk parsed partitions
*
* Description: called from check.c, if the disk contains GPT
* partitions, sets up partition entries in the kernel.
*
* If the first block on the disk is a legacy MBR,
* it will get handled by msdos_partition().
* If it's a Protective MBR, we'll handle it here.
*
* We do not create a Linux partition for GPT, but
* only for the actual data partitions.
* Returns:
* -1 if unable to read the partition table
* 0 if this isn't our partition table
* 1 if successful
*
*/
int efi_partition(struct parsed_partitions *state)
{
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
kfree(ptes);
return 0;
}
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
struct partition_meta_info *info;
unsigned label_count = 0;
unsigned label_max;
u64 start = le64_to_cpu(ptes[i].starting_lba);
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID))
state->parts[i + 1].flags = ADDPART_FLAG_RAID;
info = &state->parts[i + 1].info;
efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
/* Naively convert UTF16-LE to 7 bits. */
label_max = min(ARRAY_SIZE(info->volname) - 1,
ARRAY_SIZE(ptes[i].partition_name));
info->volname[label_max] = 0;
while (label_count < label_max) {
u8 c = ptes[i].partition_name[label_count] & 0xff;
if (c && !isprint(c))
c = '!';
info->volname[label_count] = c;
label_count++;
}
state->parts[i + 1].has_info = true;
}
kfree(ptes);
kfree(gpt);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
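/*
 * Example of the unit handling above: on a disk with 4096-byte logical
 * blocks, ssz == 8, so a GPT entry with starting_lba 256 and
 * ending_lba 511 becomes a partition at 512-byte sector 2048 with a
 * size of (511 - 256 + 1) * 8 = 2048 sectors, i.e. 1 MiB.
 */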

133
block/partitions/efi.h Normal file

@ -0,0 +1,133 @@
/************************************************************
* EFI GUID Partition Table
* Per Intel EFI Specification v1.02
* http://developer.intel.com/technology/efi/efi.htm
*
* By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
* Copyright 2000,2001 Dell Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
************************************************************/
#ifndef FS_PART_EFI_H_INCLUDED
#define FS_PART_EFI_H_INCLUDED
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/efi.h>
#include <linux/compiler.h>
#define MSDOS_MBR_SIGNATURE 0xaa55
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
#define GPT_MBR_PROTECTIVE 1
#define GPT_MBR_HYBRID 2
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
#define PARTITION_SYSTEM_GUID \
EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
#define LEGACY_MBR_PARTITION_GUID \
EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
#define PARTITION_MSFT_RESERVED_GUID \
EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
#define PARTITION_BASIC_DATA_GUID \
EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
#define PARTITION_LINUX_RAID_GUID \
EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
#define PARTITION_LINUX_SWAP_GUID \
EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
#define PARTITION_LINUX_LVM_GUID \
EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
typedef struct _gpt_header {
__le64 signature;
__le32 revision;
__le32 header_size;
__le32 header_crc32;
__le32 reserved1;
__le64 my_lba;
__le64 alternate_lba;
__le64 first_usable_lba;
__le64 last_usable_lba;
efi_guid_t disk_guid;
__le64 partition_entry_lba;
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
/* The rest of the logical block is reserved by UEFI and must be zero.
* EFI standard handles this by:
*
* uint8_t reserved2[ BlockSize - 92 ];
*/
} __packed gpt_header;
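/*
 * The 92 above is the size of the fixed part of this structure:
 * 8+4+4+4+4 (signature..reserved1) + 4*8 (the four LBA fields) + 16
 * (disk_guid) + 8 (partition_entry_lba) + 3*4 = 92 bytes, so with
 * 512-byte logical blocks the implied reserved2 area is 420 bytes.
 */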
typedef struct _gpt_entry_attributes {
u64 required_to_function:1;
u64 reserved:47;
u64 type_guid_specific:16;
} __packed gpt_entry_attributes;
typedef struct _gpt_entry {
efi_guid_t partition_type_guid;
efi_guid_t unique_partition_guid;
__le64 starting_lba;
__le64 ending_lba;
gpt_entry_attributes attributes;
efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
} __packed gpt_entry;
typedef struct _gpt_mbr_record {
u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */
u8 start_head; /* unused by EFI, pt start in CHS */
u8 start_sector; /* unused by EFI, pt start in CHS */
u8 start_track;
u8 os_type; /* EFI and legacy non-EFI OS types */
u8 end_head; /* unused by EFI, pt end in CHS */
u8 end_sector; /* unused by EFI, pt end in CHS */
u8 end_track; /* unused by EFI, pt end in CHS */
__le32 starting_lba; /* used by EFI - start addr of the on disk pt */
__le32 size_in_lba; /* used by EFI - size of pt in LBA */
} __packed gpt_mbr_record;
typedef struct _legacy_mbr {
u8 boot_code[440];
__le32 unique_mbr_signature;
__le16 unknown;
gpt_mbr_record partition_record[4];
__le16 signature;
} __packed legacy_mbr;
/* Functions */
extern int efi_partition(struct parsed_partitions *state);
#endif

364
block/partitions/ibm.c Normal file

@ -0,0 +1,364 @@
/*
* Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
* Volker Sameske <sameske@de.ibm.com>
* Bugreports.to..: <Linux390@de.ibm.com>
* Copyright IBM Corp. 1999, 2012
*/
#include <linux/buffer_head.h>
#include <linux/hdreg.h>
#include <linux/slab.h>
#include <asm/dasd.h>
#include <asm/ebcdic.h>
#include <asm/uaccess.h>
#include <asm/vtoc.h>
#include "check.h"
#include "ibm.h"
union label_t {
struct vtoc_volume_label_cdl vol;
struct vtoc_volume_label_ldl lnx;
struct vtoc_cms_label cms;
};
/*
* compute the block number from a
* cyl-cyl-head-head structure
*/
static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo)
{
sector_t cyl;
__u16 head;
/* decode cylinder and heads for large volumes */
cyl = ptr->hh & 0xFFF0;
cyl <<= 12;
cyl |= ptr->cc;
head = ptr->hh & 0x000F;
return cyl * geo->heads * geo->sectors +
head * geo->sectors;
}
/*
* compute the block number from a
* cyl-cyl-head-head-block structure
*/
static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
{
sector_t cyl;
__u16 head;
/* decode cylinder and heads for large volumes */
cyl = ptr->hh & 0xFFF0;
cyl <<= 12;
cyl |= ptr->cc;
head = ptr->hh & 0x000F;
return cyl * geo->heads * geo->sectors +
head * geo->sectors +
ptr->b;
}
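/*
 * Worked example for the decoding above, assuming a hypothetical
 * geometry of 15 heads and 12 sectors per track: cc = 2, hh = 0x0003
 * decodes to cylinder 2, head 3, so cchh2blk() yields
 * 2*15*12 + 3*12 = 396, and cchhb2blk() additionally adds the
 * block-in-track value b.
 */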
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
int blocksize,
sector_t *labelsect,
char name[],
char type[],
union label_t *label)
{
Sector sect;
unsigned char *data;
sector_t testsect[3];
unsigned char temp[5];
int found = 0;
int i, testcount;
/* There are three places where we may find a valid label:
* - on an ECKD disk it's block 2
* - on an FBA disk it's block 1
* - on a CMS-formatted FBA disk it is sector 1, even if the block size
* is larger than 512 bytes (possible if the DIAG discipline is used)
* If we have a valid info structure, then we know exactly which case we
* have; otherwise we just search through all possibilities.
*/
if (info) {
if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
(info->cu_type == 0x3880 && info->dev_type == 0x3370))
testsect[0] = info->label_block;
else
testsect[0] = info->label_block * (blocksize >> 9);
testcount = 1;
} else {
testsect[0] = 1;
testsect[1] = (blocksize >> 9);
testsect[2] = 2 * (blocksize >> 9);
testcount = 3;
}
for (i = 0; i < testcount; ++i) {
data = read_part_sector(state, testsect[i], &sect);
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
memcpy(temp, data, 4);
temp[4] = 0;
EBCASC(temp, 4);
put_dev_sector(sect);
if (!strcmp(temp, "VOL1") ||
!strcmp(temp, "LNX1") ||
!strcmp(temp, "CMS1")) {
if (!strcmp(temp, "VOL1")) {
strncpy(type, label->vol.vollbl, 4);
strncpy(name, label->vol.volid, 6);
} else {
strncpy(type, label->lnx.vollbl, 4);
strncpy(name, label->lnx.volid, 6);
}
EBCASC(type, 4);
EBCASC(name, 6);
*labelsect = testsect[i];
found = 1;
break;
}
}
if (!found)
memset(label, 0, sizeof(*label));
return found;
}
static int find_vol1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label)
{
sector_t blk;
int counter;
char tmp[64];
Sector sect;
unsigned char *data;
loff_t offset, size;
struct vtoc_format1_label f1;
int secperblk;
snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/*
* get start of VTOC from the disk label and then search for format1
* and format8 labels
*/
secperblk = blocksize >> 9;
blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
counter = 0;
data = read_part_sector(state, blk * secperblk, &sect);
while (data != NULL) {
memcpy(&f1, data, sizeof(struct vtoc_format1_label));
put_dev_sector(sect);
/* skip FMT4 / FMT5 / FMT7 labels */
if (f1.DS1FMTID == _ascebc['4']
|| f1.DS1FMTID == _ascebc['5']
|| f1.DS1FMTID == _ascebc['7']
|| f1.DS1FMTID == _ascebc['9']) {
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
continue;
}
/* only FMT1 and 8 labels valid at this point */
if (f1.DS1FMTID != _ascebc['1'] &&
f1.DS1FMTID != _ascebc['8'])
break;
/* OK, we got valid partition data */
offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
offset + geo->sectors;
offset *= secperblk;
size *= secperblk;
if (counter >= state->limit)
break;
put_partition(state, counter + 1, offset, size);
counter++;
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
if (!data)
return -1;
return 1;
}
static int find_lnx1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label,
sector_t labelsect,
loff_t i_size,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
char tmp[64];
int secperblk;
snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
secperblk = blocksize >> 9;
if (label->lnx.ldl_version == 0xf2) {
size = label->lnx.formatted_blocks * secperblk;
} else {
/*
* Formatted without large volume support. If the sanity check
* 'size based on geo == size based on i_size' is true, then
* we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information
* that we can only get from a real DASD device.
*/
geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk;
size = i_size >> 9;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
if (!strcmp(info->type, "ECKD"))
if (geo_size < size)
size = geo_size;
/* else keep size based on i_size */
}
}
/* first and only partition starts in the first block after the label */
offset = labelsect + secperblk;
put_partition(state, 1, offset, size - offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
static int find_cms1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label,
sector_t labelsect)
{
loff_t offset, size;
char tmp[64];
int secperblk;
/*
* VM style CMS1 labeled disk
*/
blocksize = label->cms.block_size;
secperblk = blocksize >> 9;
if (label->cms.disk_offset != 0) {
snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/* disk is reserved minidisk */
offset = label->cms.disk_offset * secperblk;
size = (label->cms.block_count - 1) * secperblk;
} else {
snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/*
* Special case for FBA devices:
* If an FBA device is CMS formatted with blocksize > 512 byte
* and the DIAG discipline is used, then the CMS label is found
* in sector 1 instead of block 1. However, the partition is
* still supposed to start in block 2.
*/
if (labelsect == 1)
offset = 2 * secperblk;
else
offset = labelsect + secperblk;
size = label->cms.block_count * secperblk;
}
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
/*
* This is the main function, called by check.c
*/
int ibm_partition(struct parsed_partitions *state)
{
struct block_device *bdev = state->bdev;
int blocksize, res;
loff_t i_size, offset, size;
dasd_information2_t *info;
struct hd_geometry *geo;
char type[5] = {0,};
char name[7] = {0,};
sector_t labelsect;
union label_t *label;
res = 0;
blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0)
goto out_exit;
i_size = i_size_read(bdev->bd_inode);
if (i_size == 0)
goto out_exit;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL)
goto out_exit;
geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
if (geo == NULL)
goto out_nogeo;
label = kmalloc(sizeof(union label_t), GFP_KERNEL);
if (label == NULL)
goto out_nolab;
if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
goto out_freeall;
if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) {
kfree(info);
info = NULL;
}
if (find_label(state, info, geo, blocksize, &labelsect, name, type,
label)) {
if (!strncmp(type, "VOL1", 4)) {
res = find_vol1_partitions(state, geo, blocksize, name,
label);
} else if (!strncmp(type, "LNX1", 4)) {
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, i_size,
info);
} else if (!strncmp(type, "CMS1", 4)) {
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
}
} else if (info) {
/*
* ugly but needed for backward compatibility:
* If the block device is a DASD (i.e. BIODASDINFO2 works),
* then we claim it in any case, even though it has no valid
* label. If it has the LDL format, then we simply define a
* partition as if it had an LNX1 label.
*/
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
} else
res = 0;
out_freeall:
kfree(label);
out_nolab:
kfree(geo);
out_nogeo:
kfree(info);
out_exit:
return res;
}

1
block/partitions/ibm.h Normal file

@ -0,0 +1 @@
int ibm_partition(struct parsed_partitions *);

58
block/partitions/karma.c Normal file

@ -0,0 +1,58 @@
/*
* fs/partitions/karma.c
* Rio Karma partition info.
*
* Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
* based on osf.c
*/
#include "check.h"
#include "karma.h"
#include <linux/compiler.h>
int karma_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
Sector sect;
unsigned char *data;
struct disklabel {
u8 d_reserved[270];
struct d_partition {
__le32 p_res;
u8 p_fstype;
u8 p_res2[3];
__le32 p_offset;
__le32 p_size;
} d_partitions[2];
u8 d_blank[208];
__le16 d_magic;
} __packed *label;
struct d_partition *p;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
label = (struct disklabel *)data;
if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
put_dev_sector(sect);
return 0;
}
p = label->d_partitions;
for (i = 0 ; i < 2; i++, p++) {
if (slot == state->limit)
break;
if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
put_partition(state, slot, le32_to_cpu(p->p_offset),
le32_to_cpu(p->p_size));
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

8
block/partitions/karma.h Normal file

@ -0,0 +1,8 @@
/*
* fs/partitions/karma.h
*/
#define KARMA_LABEL_MAGIC 0xAB56
int karma_partition(struct parsed_partitions *state);

1567
block/partitions/ldm.c Normal file

File diff suppressed because it is too large

215
block/partitions/ldm.h Normal file

@ -0,0 +1,215 @@
/**
* ldm - Part of the Linux-NTFS project.
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
* Copyright (c) 2001-2007 Anton Altaparmakov
* Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
*
* Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program (in the main directory of the Linux-NTFS source
* in the file COPYING); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _FS_PT_LDM_H_
#define _FS_PT_LDM_H_
#include <linux/types.h>
#include <linux/list.h>
#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
struct parsed_partitions;
/* Magic numbers in CPU format. */
#define MAGIC_VMDB 0x564D4442 /* VMDB */
#define MAGIC_VBLK 0x56424C4B /* VBLK */
#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
/* The defined vblk types. */
#define VBLK_VOL5 0x51 /* Volume, version 5 */
#define VBLK_CMP3 0x32 /* Component, version 3 */
#define VBLK_PRT3 0x33 /* Partition, version 3 */
#define VBLK_DSK3 0x34 /* Disk, version 3 */
#define VBLK_DSK4 0x44 /* Disk, version 4 */
#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
/* vblk flags indicating extra information will be present */
#define VBLK_FLAG_COMP_STRIPE 0x10
#define VBLK_FLAG_PART_INDEX 0x08
#define VBLK_FLAG_DGR3_IDS 0x08
#define VBLK_FLAG_DGR4_IDS 0x08
#define VBLK_FLAG_VOLU_ID1 0x08
#define VBLK_FLAG_VOLU_ID2 0x20
#define VBLK_FLAG_VOLU_SIZE 0x80
#define VBLK_FLAG_VOLU_DRIVE 0x02
/* size of a vblk's static parts */
#define VBLK_SIZE_HEAD 16
#define VBLK_SIZE_CMP3 22 /* Name and version */
#define VBLK_SIZE_DGR3 12
#define VBLK_SIZE_DGR4 44
#define VBLK_SIZE_DSK3 12
#define VBLK_SIZE_DSK4 45
#define VBLK_SIZE_PRT3 28
#define VBLK_SIZE_VOL5 58
/* component types */
#define COMP_STRIPE 0x01 /* Stripe-set */
#define COMP_BASIC 0x02 /* Basic disk */
#define COMP_RAID 0x03 /* Raid-set */
/* Other constants. */
#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
#define OFF_PRIV1 6 /* Offset of the first privhead
relative to the start of the
device in sectors */
/* Offsets to structures within the LDM Database in sectors. */
#define OFF_PRIV2 1856 /* Backup private headers. */
#define OFF_PRIV3 2047
#define OFF_TOCB1 1 /* Tables of contents. */
#define OFF_TOCB2 2
#define OFF_TOCB3 2045
#define OFF_TOCB4 2046
#define OFF_VMDB 17 /* List of partitions. */
#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
#define TOC_BITMAP1 "config" /* Names of the two defined */
#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
/* Borrowed from msdos.c */
#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
struct frag { /* VBLK Fragment handling */
struct list_head list;
u32 group;
u8 num; /* Total number of records */
u8 rec; /* This is record number n */
u8 map; /* Which portions are in use */
u8 data[0];
};
/* In memory LDM database structures. */
#define GUID_SIZE 16
struct privhead { /* Offsets and sizes are in sectors. */
u16 ver_major;
u16 ver_minor;
u64 logical_disk_start;
u64 logical_disk_size;
u64 config_start;
u64 config_size;
u8 disk_id[GUID_SIZE];
};
struct tocblock { /* We have exactly two bitmaps. */
u8 bitmap1_name[16];
u64 bitmap1_start;
u64 bitmap1_size;
u8 bitmap2_name[16];
u64 bitmap2_start;
u64 bitmap2_size;
};
struct vmdb { /* VMDB: The database header */
u16 ver_major;
u16 ver_minor;
u32 vblk_size;
u32 vblk_offset;
u32 last_vblk_seq;
};
struct vblk_comp { /* VBLK Component */
u8 state[16];
u64 parent_id;
u8 type;
u8 children;
u16 chunksize;
};
struct vblk_dgrp { /* VBLK Disk Group */
u8 disk_id[64];
};
struct vblk_disk { /* VBLK Disk */
u8 disk_id[GUID_SIZE];
u8 alt_name[128];
};
struct vblk_part { /* VBLK Partition */
u64 start;
u64 size; /* start, size and vol_off in sectors */
u64 volume_offset;
u64 parent_id;
u64 disk_id;
u8 partnum;
};
struct vblk_volu { /* VBLK Volume */
u8 volume_type[16];
u8 volume_state[16];
u8 guid[16];
u8 drive_hint[4];
u64 size;
u8 partition_type;
};
struct vblk_head { /* VBLK standard header */
u32 group;
u16 rec;
u16 nrec;
};
struct vblk { /* Generalised VBLK */
u8 name[64];
u64 obj_id;
u32 sequence;
u8 flags;
u8 type;
union {
struct vblk_comp comp;
struct vblk_dgrp dgrp;
struct vblk_disk disk;
struct vblk_part part;
struct vblk_volu volu;
} vblk;
struct list_head list;
};
struct ldmdb { /* Cache of the database */
struct privhead ph;
struct tocblock toc;
struct vmdb vm;
struct list_head v_dgrp;
struct list_head v_disk;
struct list_head v_volu;
struct list_head v_comp;
struct list_head v_part;
};
int ldm_partition(struct parsed_partitions *state);
#endif /* _FS_PT_LDM_H_ */

138
block/partitions/mac.c Normal file

@ -0,0 +1,138 @@
/*
* fs/partitions/mac.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include <linux/ctype.h>
#include "check.h"
#include "mac.h"
#ifdef CONFIG_PPC_PMAC
#include <asm/machdep.h>
extern void note_bootable_part(dev_t dev, int part, int goodness);
#endif
/*
* Code to understand MacOS partition tables.
*/
static inline void mac_fix_string(char *stg, int len)
{
int i;
for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
stg[i] = 0;
}
int mac_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
int slot, blocks_in_map;
unsigned secsize;
#ifdef CONFIG_PPC_PMAC
int found_root = 0;
int found_root_goodness = 0;
#endif
struct mac_partition *part;
struct mac_driver_desc *md;
/* Get 0th block and look at the first partition map entry. */
md = read_part_sector(state, 0, &sect);
if (!md)
return -1;
if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
put_dev_sector(sect);
return 0;
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
data = read_part_sector(state, secsize/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + secsize%512);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
return 0; /* not a MacOS disk */
}
blocks_in_map = be32_to_cpu(part->map_count);
if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
put_dev_sector(sect);
return 0;
}
if (blocks_in_map >= state->limit)
blocks_in_map = state->limit - 1;
strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
for (slot = 1; slot <= blocks_in_map; ++slot) {
int pos = slot * secsize;
put_dev_sector(sect);
data = read_part_sector(state, pos/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + pos%512);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
break;
put_partition(state, slot,
be32_to_cpu(part->start_block) * (secsize/512),
be32_to_cpu(part->block_count) * (secsize/512));
if (!strncasecmp(part->type, "Linux_RAID", 10))
state->parts[slot].flags = ADDPART_FLAG_RAID;
#ifdef CONFIG_PPC_PMAC
/*
* If this is the first bootable partition, tell the
* setup code, in case it wants to make this the root.
*/
if (machine_is(powermac)) {
int goodness = 0;
mac_fix_string(part->processor, 16);
mac_fix_string(part->name, 32);
mac_fix_string(part->type, 32);
if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
&& strcasecmp(part->processor, "powerpc") == 0)
goodness++;
if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
|| (strncasecmp(part->type, "Linux", 5) == 0
&& strcasecmp(part->type, "Linux_swap") != 0)) {
int i, l;
goodness++;
l = strlen(part->name);
if (strcmp(part->name, "/") == 0)
goodness++;
for (i = 0; i <= l - 4; ++i) {
if (strncasecmp(part->name + i, "root",
4) == 0) {
goodness += 2;
break;
}
}
if (strncasecmp(part->name, "swap", 4) == 0)
goodness--;
}
if (goodness > found_root_goodness) {
found_root = slot;
found_root_goodness = goodness;
}
}
#endif /* CONFIG_PPC_PMAC */
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
note_bootable_part(state->bdev->bd_dev, found_root,
found_root_goodness);
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

44
block/partitions/mac.h Normal file

@ -0,0 +1,44 @@
/*
* fs/partitions/mac.h
*/
#define MAC_PARTITION_MAGIC 0x504d
/* type field value for A/UX or other Unix partitions */
#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
struct mac_partition {
__be16 signature; /* expected to be MAC_PARTITION_MAGIC */
__be16 res1;
__be32 map_count; /* # blocks in partition map */
__be32 start_block; /* absolute starting block # of partition */
__be32 block_count; /* number of blocks in partition */
char name[32]; /* partition name */
char type[32]; /* string type description */
__be32 data_start; /* rel block # of first data block */
__be32 data_count; /* number of data blocks */
__be32 status; /* partition status bits */
__be32 boot_start;
__be32 boot_size;
__be32 boot_load;
__be32 boot_load2;
__be32 boot_entry;
__be32 boot_entry2;
__be32 boot_cksum;
char processor[16]; /* identifies ISA of boot */
/* there is more stuff after this that we don't need */
};
#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
#define MAC_DRIVER_MAGIC 0x4552
/* Driver descriptor structure, in block 0 */
struct mac_driver_desc {
__be16 signature; /* expected to be MAC_DRIVER_MAGIC */
__be16 block_size;
__be32 block_count;
/* ... more stuff */
};
int mac_partition(struct parsed_partitions *state);

582
block/partitions/msdos.c Normal file

@ -0,0 +1,582 @@
/*
* fs/partitions/msdos.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
*
* Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
* in the early extended-partition checks and added DM partitions
*
* Support for DiskManager v6.0x added by Mark Lord,
* with information provided by OnTrack. This now works for linux fdisk
* and LILO, as well as loadlin and bootln. Note that disks other than
* /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
*
* More flexible handling of extended partitions - aeb, 950831
*
* Check partition table on IDE disks for common CHS translations
*
* Re-organised Feb 1998 Russell King
*/
#include <linux/msdos_fs.h>
#include "check.h"
#include "msdos.h"
#include "efi.h"
#include "aix.h"
/*
* Many architectures don't like unaligned accesses, while
* the nr_sects and start_sect partition table entries are
* at a 2 (mod 4) address.
*/
#include <asm/unaligned.h>
#define SYS_IND(p) get_unaligned(&p->sys_ind)
static inline sector_t nr_sects(struct partition *p)
{
return (sector_t)get_unaligned_le32(&p->nr_sects);
}
static inline sector_t start_sect(struct partition *p)
{
return (sector_t)get_unaligned_le32(&p->start_sect);
}
static inline int is_extended_partition(struct partition *p)
{
return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
SYS_IND(p) == LINUX_EXTENDED_PARTITION);
}
#define MSDOS_LABEL_MAGIC1 0x55
#define MSDOS_LABEL_MAGIC2 0xAA
static inline int
msdos_magic_present(unsigned char *p)
{
return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
}
/* Value is EBCDIC 'IBMA' */
#define AIX_LABEL_MAGIC1 0xC9
#define AIX_LABEL_MAGIC2 0xC2
#define AIX_LABEL_MAGIC3 0xD4
#define AIX_LABEL_MAGIC4 0xC1
static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
{
struct partition *pt = (struct partition *) (p + 0x1be);
Sector sect;
unsigned char *d;
int slot, ret = 0;
if (!(p[0] == AIX_LABEL_MAGIC1 &&
p[1] == AIX_LABEL_MAGIC2 &&
p[2] == AIX_LABEL_MAGIC3 &&
p[3] == AIX_LABEL_MAGIC4))
return 0;
/* Assume the partition table is valid if Linux partitions exist */
for (slot = 1; slot <= 4; slot++, pt++) {
if (pt->sys_ind == LINUX_SWAP_PARTITION ||
pt->sys_ind == LINUX_RAID_PARTITION ||
pt->sys_ind == LINUX_DATA_PARTITION ||
pt->sys_ind == LINUX_LVM_PARTITION ||
is_extended_partition(pt))
return 0;
}
d = read_part_sector(state, 7, &sect);
if (d) {
if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
ret = 1;
put_dev_sector(sect);
}
return ret;
}
static void set_info(struct parsed_partitions *state, int slot,
u32 disksig)
{
struct partition_meta_info *info = &state->parts[slot].info;
snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
slot);
info->volname[0] = 0;
state->parts[slot].has_info = true;
}
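/*
 * Example of the resulting identifier: a disk signature of 0x0007b521
 * gives slot 1 the uuid string "0007b521-01", the same form used for
 * PARTUUID-style lookups of MBR partitions.
 */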
/*
* Create devices for each logical partition in an extended partition.
* The logical partitions form a linked list, with each entry being
* a partition table with two entries. The first entry
* is the real data partition (with a start relative to the partition
* table start). The second is a pointer to the next logical partition
* (with a start relative to the entire extended partition).
* We do not create a Linux partition for the partition tables, but
* only for the actual data partitions.
*/
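/*
 * Illustrative chain (hypothetical sector numbers): an extended
 * partition at sector 2048 holds an EBR whose first entry describes a
 * logical partition just after it and whose second entry links to the
 * next EBR, e.g. 2048 -> 206848 -> 411648, until an EBR without a
 * further link ends the list.
 */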
static void parse_extended(struct parsed_partitions *state,
sector_t first_sector, sector_t first_size,
u32 disksig)
{
struct partition *p;
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
this_sector = first_sector;
this_size = first_size;
while (1) {
if (++loopct > 100)
return;
if (state->next == state->limit)
return;
data = read_part_sector(state, this_sector, &sect);
if (!data)
return;
if (!msdos_magic_present(data + 510))
goto done;
p = (struct partition *) (data + 0x1be);
/*
* Usually, the first entry is the real data partition,
* the 2nd entry is the next extended partition, or empty,
* and the 3rd and 4th entries are unused.
* However, DRDOS sometimes has the extended partition as
* the first entry (when the data partition is empty),
* and OS/2 seems to use all four entries.
*/
/*
* First process the data partition(s)
*/
for (i = 0; i < 4; i++, p++) {
sector_t offs, size, next;
if (!nr_sects(p) || is_extended_partition(p))
continue;
/* Check the 3rd and 4th entries -
these sometimes contain random garbage */
offs = start_sect(p)*sector_size;
size = nr_sects(p)*sector_size;
next = this_sector + offs;
if (i >= 2) {
if (offs + size > this_size)
continue;
if (next < first_sector)
continue;
if (next + size > first_sector + first_size)
continue;
}
put_partition(state, state->next, next, size);
set_info(state, state->next, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[state->next].flags = ADDPART_FLAG_RAID;
loopct = 0;
if (++state->next == state->limit)
goto done;
}
/*
* Next, process the (first) extended partition, if present.
* (So far, there seems to be no reason to make
* parse_extended() recursive and allow a tree
* of extended partitions.)
* It should be a link to the next logical partition.
*/
p -= 4;
for (i = 0; i < 4; i++, p++)
if (nr_sects(p) && is_extended_partition(p))
break;
if (i == 4)
goto done; /* nothing left to do */
this_sector = first_sector + start_sect(p) * sector_size;
this_size = nr_sects(p) * sector_size;
put_dev_sector(sect);
}
done:
put_dev_sector(sect);
}
/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
indicates linux swap. Be careful before believing this is Solaris. */
static void parse_solaris_x86(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
struct solaris_x86_vtoc *v;
int i;
short max_nparts;
v = read_part_sector(state, offset + 1, &sect);
if (!v)
return;
if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
if (le32_to_cpu(v->v_version) != 1) {
char tmp[64];
snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
return;
}
/* Ensure we can handle previous case of VTOC with 8 entries gracefully */
max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i = 0; i < max_nparts && state->next < state->limit; i++) {
struct solaris_x86_slice *s = &v->v_slice[i];
char tmp[3 + 10 + 1 + 1];
if (s->s_size == 0)
continue;
snprintf(tmp, sizeof(tmp), " [s%d]", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/* solaris partitions are relative to current MS-DOS
* one; must add the offset of the current partition */
put_partition(state, state->next++,
le32_to_cpu(s->s_start)+offset,
le32_to_cpu(s->s_size));
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
#endif
}
#if defined(CONFIG_BSD_DISKLABEL)
/*
* Create devices for BSD partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
static void parse_bsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin, char *flavour,
int max_partitions)
{
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
char tmp[64];
l = read_part_sector(state, offset + 1, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
put_dev_sector(sect);
return;
}
snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
sector_t bsd_start, bsd_size;
if (state->next == state->limit)
break;
if (p->p_fstype == BSD_FS_UNUSED)
continue;
bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size);
if (offset == bsd_start && size == bsd_size)
/* full parent partition, we have it already */
continue;
if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
continue;
}
put_partition(state, state->next++, bsd_start, bsd_size);
}
put_dev_sector(sect);
if (le16_to_cpu(l->d_npartitions) > max_partitions) {
snprintf(tmp, sizeof(tmp), " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
}
#endif
static void parse_freebsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
#endif
}
static void parse_netbsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
#endif
}
static void parse_openbsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "openbsd",
OPENBSD_MAXPARTITIONS);
#endif
}
/*
* Create devices for Unixware partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
static void parse_unixware(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
struct unixware_disklabel *l;
struct unixware_slice *p;
l = read_part_sector(state, offset + 29, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
p = &l->vtoc.v_slice[1];
/* I omit the 0th slice as it is the same as whole disk. */
while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
if (state->next == state->limit)
break;
if (p->s_label != UNIXWARE_FS_UNUSED)
put_partition(state, state->next++,
le32_to_cpu(p->start_sect),
le32_to_cpu(p->nr_sects));
p++;
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
#endif
}
/*
* Minix 2.0.0/2.0.2 subpartition support.
* Anand Krishnamurthy <anandk@wiproge.med.ge.com>
* Rajeev V. Pillai <rajeevvp@yahoo.com>
*/
static void parse_minix(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
unsigned char *data;
struct partition *p;
int i;
data = read_part_sector(state, offset, &sect);
if (!data)
return;
p = (struct partition *)(data + 0x1be);
/* The first sector of a Minix partition can have either
* a secondary MBR describing its subpartitions, or
* the normal boot sector. */
if (msdos_magic_present(data + 510) &&
SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
if (state->next == state->limit)
break;
/* add each partition in use */
if (SYS_IND(p) == MINIX_PARTITION)
put_partition(state, state->next++,
start_sect(p), nr_sects(p));
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
}
put_dev_sector(sect);
#endif /* CONFIG_MINIX_SUBPARTITION */
}
static struct {
unsigned char id;
void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
{OPENBSD_PARTITION, parse_openbsd},
{MINIX_PARTITION, parse_minix},
{UNIXWARE_PARTITION, parse_unixware},
{SOLARIS_X86_PARTITION, parse_solaris_x86},
{NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
{0, NULL},
};
int msdos_partition(struct parsed_partitions *state)
{
sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
struct fat_boot_sector *fb;
int slot;
u32 disksig;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
/*
* Note order! (some AIX disks, e.g. unbootable kind,
* have no MSDOS 55aa)
*/
if (aix_magic_present(state, data)) {
put_dev_sector(sect);
#ifdef CONFIG_AIX_PARTITION
return aix_partition(state);
#else
strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
return 0;
#endif
}
if (!msdos_magic_present(data + 510)) {
put_dev_sector(sect);
return 0;
}
/*
* Now that the 55aa signature is present, this is probably
* either the boot sector of a FAT filesystem or a DOS-type
* partition table. Reject it if the boot indicator is
* neither 0 nor 0x80.
*/
p = (struct partition *) (data + 0x1be);
for (slot = 1; slot <= 4; slot++, p++) {
if (p->boot_ind != 0 && p->boot_ind != 0x80) {
/*
* Even without a valid boot indicator value
* it's still possible this is a valid FAT filesystem
* without a partition table.
*/
fb = (struct fat_boot_sector *) data;
if (slot == 1 && fb->reserved && fb->fats
&& fat_valid_media(fb->media)) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
} else {
put_dev_sector(sect);
return 0;
}
}
}
#ifdef CONFIG_EFI_PARTITION
p = (struct partition *) (data + 0x1be);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
/* If this is an EFI GPT disk, msdos should ignore it. */
if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
put_dev_sector(sect);
return 0;
}
}
#endif
p = (struct partition *) (data + 0x1be);
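/* 32-bit disk signature, stored little-endian at offset 0x1b8 of the MBR
 * and passed to set_info() below. */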
disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
/*
* Look for partitions in two passes:
* First find the primary and DOS-type extended partitions.
* On the second pass look inside *BSD, Unixware and Solaris partitions.
*/
state->next = 5;
for (slot = 1 ; slot <= 4 ; slot++, p++) {
sector_t start = start_sect(p)*sector_size;
sector_t size = nr_sects(p)*sector_size;
if (!size)
continue;
if (is_extended_partition(p)) {
/*
* prevent someone from doing mkfs or mkswap on an
* extended partition, but leave room for LILO.
* FIXME: this uses one logical sector for > 512b
* sectors, although it may not be enough/proper.
*/
sector_t n = 2;
n = min(size, max(sector_size, n));
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
continue;
}
put_partition(state, slot, start, size);
set_info(state, slot, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (SYS_IND(p) == DM6_PARTITION)
strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
if (SYS_IND(p) == EZD_PARTITION)
strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
/* second pass - output for each on a separate line */
p = (struct partition *) (0x1be + data);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
unsigned char id = SYS_IND(p);
int n;
if (!nr_sects(p))
continue;
for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
;
if (!subtypes[n].parse)
continue;
subtypes[n].parse(state, start_sect(p) * sector_size,
nr_sects(p) * sector_size, slot);
}
put_dev_sector(sect);
return 1;
}
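The layout that msdos_partition() walks is the classic MBR: 446 bytes of boot code, four 16-byte partition entries starting at offset 0x1be, the 32-bit disk signature at 0x1b8, and the 0x55AA magic at offset 510. As a hedged illustration (not part of the kernel sources), this minimal user-space sketch reads sector 0 and prints the same primary entries; the device path is an assumption and the field offsets simply mirror the structure accesses above.

/* Minimal user-space sketch: dump the four primary MBR entries.
 * Illustrative only; device path is an assumption. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned char sec[512];
	int fd = open(argc > 1 ? argv[1] : "/dev/sda", O_RDONLY);

	if (fd < 0 || read(fd, sec, sizeof(sec)) != sizeof(sec))
		return 1;
	if (sec[510] != 0x55 || sec[511] != 0xaa)	/* msdos_magic_present() */
		return 1;

	/* 32-bit disk signature at 0x1b8, little-endian */
	printf("disksig %02x%02x%02x%02x\n",
	       sec[0x1bb], sec[0x1ba], sec[0x1b9], sec[0x1b8]);

	for (int i = 0; i < 4; i++) {
		const unsigned char *p = sec + 0x1be + 16 * i;
		uint32_t start = p[8] | p[9] << 8 | p[10] << 16 | (uint32_t)p[11] << 24;
		uint32_t nsect = p[12] | p[13] << 8 | p[14] << 16 | (uint32_t)p[15] << 24;

		/* p[0] is the boot indicator, p[4] the system ID (SYS_IND) */
		printf("p%d: boot=%02x type=%02x start=%u sectors=%u\n",
		       i + 1, p[0], p[4], (unsigned)start, (unsigned)nsect);
	}
	close(fd);
	return 0;
}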

8
block/partitions/msdos.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/msdos.h
*/
#define MSDOS_LABEL_MAGIC 0xAA55
int msdos_partition(struct parsed_partitions *state);

86
block/partitions/osf.c Normal file
View file

@ -0,0 +1,86 @@
/*
* fs/partitions/osf.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include "check.h"
#include "osf.h"
#define MAX_OSF_PARTITIONS 18
int osf_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
unsigned int npartitions;
Sector sect;
unsigned char *data;
struct disklabel {
__le32 d_magic;
__le16 d_type,d_subtype;
u8 d_typename[16];
u8 d_packname[16];
__le32 d_secsize;
__le32 d_nsectors;
__le32 d_ntracks;
__le32 d_ncylinders;
__le32 d_secpercyl;
__le32 d_secprtunit;
__le16 d_sparespertrack;
__le16 d_sparespercyl;
__le32 d_acylinders;
__le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
__le32 d_headswitch, d_trkseek, d_flags;
__le32 d_drivedata[5];
__le32 d_spare[5];
__le32 d_magic2;
__le16 d_checksum;
__le16 d_npartitions;
__le32 d_bbsize, d_sbsize;
struct d_partition {
__le32 p_size;
__le32 p_offset;
__le32 p_fsize;
u8 p_fstype;
u8 p_frag;
__le16 p_cpg;
} d_partitions[MAX_OSF_PARTITIONS];
} * label;
struct d_partition * partition;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
label = (struct disklabel *) (data+64);
partition = label->d_partitions;
if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
put_dev_sector(sect);
return 0;
}
if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
put_dev_sector(sect);
return 0;
}
npartitions = le16_to_cpu(label->d_npartitions);
if (npartitions > MAX_OSF_PARTITIONS) {
put_dev_sector(sect);
return 0;
}
for (i = 0 ; i < npartitions; i++, partition++) {
if (slot == state->limit)
break;
if (le32_to_cpu(partition->p_size))
put_partition(state, slot,
le32_to_cpu(partition->p_offset),
le32_to_cpu(partition->p_size));
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

7
block/partitions/osf.h Normal file
View file

@ -0,0 +1,7 @@
/*
* fs/partitions/osf.h
*/
#define DISKLABELMAGIC (0x82564557UL)
int osf_partition(struct parsed_partitions *state);

82
block/partitions/sgi.c Normal file
View file

@ -0,0 +1,82 @@
/*
* fs/partitions/sgi.c
*
* Code extracted from drivers/block/genhd.c
*/
#include "check.h"
#include "sgi.h"
struct sgi_disklabel {
__be32 magic_mushroom; /* Big fat spliff... */
__be16 root_part_num; /* Root partition number */
__be16 swap_part_num; /* Swap partition number */
s8 boot_file[16]; /* Name of boot file for ARCS */
u8 _unused0[48]; /* Device parameter useless crapola.. */
struct sgi_volume {
s8 name[8]; /* Name of volume */
__be32 block_num; /* Logical block number */
__be32 num_bytes; /* How big, in bytes */
} volume[15];
struct sgi_partition {
__be32 num_blocks; /* Size in logical blocks */
__be32 first_block; /* First logical block */
__be32 type; /* Type of this partition */
} partitions[16];
__be32 csum; /* Disk label checksum */
__be32 _unused1; /* Padding */
};
int sgi_partition(struct parsed_partitions *state)
{
int i, csum;
__be32 magic;
int slot = 1;
unsigned int start, blocks;
__be32 *ui, cs;
Sector sect;
struct sgi_disklabel *label;
struct sgi_partition *p;
char b[BDEVNAME_SIZE];
label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
p = &label->partitions[0];
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
/*printk("Dev %s SGI disklabel: bad magic %08x\n",
bdevname(bdev, b), be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
ui = ((__be32 *) (label + 1)) - 1;
for(csum = 0; ui >= ((__be32 *) label);) {
cs = *ui--;
csum += be32_to_cpu(cs);
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
/* All SGI disk labels have 16 partitions, but disks under Linux only
* have 15 minors. Luckily there are always a few zero-length
* partitions which we don't care about, so we never overflow the
* current_minor.
*/
for(i = 0; i < 16; i++, p++) {
blocks = be32_to_cpu(p->num_blocks);
start = be32_to_cpu(p->first_block);
if (blocks) {
put_partition(state, slot, start, blocks);
if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}
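The checksum test above simply requires that the 32-bit big-endian words of the whole 512-byte SGI label sum to zero. A small user-space sketch of the same check, assuming the label has already been read into a 512-byte buffer:

#include <stddef.h>
#include <stdint.h>

/* Returns non-zero when the 512-byte SGI label checksums to zero,
 * mirroring the summation loop in sgi_partition() above. */
static int sgi_label_csum_ok(const unsigned char *label512)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < 512; i += 4)
		sum += (uint32_t)label512[i] << 24 | label512[i + 1] << 16 |
		       label512[i + 2] << 8  | label512[i + 3];
	return sum == 0;
}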

8
block/partitions/sgi.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/sgi.h
*/
extern int sgi_partition(struct parsed_partitions *state);
#define SGI_LABEL_MAGIC 0x0be5a941

122
block/partitions/sun.c Normal file
View file

@ -0,0 +1,122 @@
/*
* fs/partitions/sun.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include "check.h"
#include "sun.h"
int sun_partition(struct parsed_partitions *state)
{
int i;
__be16 csum;
int slot = 1;
__be16 *ush;
Sector sect;
struct sun_disklabel {
unsigned char info[128]; /* Informative text string */
struct sun_vtoc {
__be32 version; /* Layout version */
char volume[8]; /* Volume name */
__be16 nparts; /* Number of partitions */
struct sun_info { /* Partition hdrs, sec 2 */
__be16 id;
__be16 flags;
} infos[8];
__be16 padding; /* Alignment padding */
__be32 bootinfo[3]; /* Info needed by mboot */
__be32 sanity; /* To verify vtoc sanity */
__be32 reserved[10]; /* Free space */
__be32 timestamp[8]; /* Partition timestamp */
} vtoc;
__be32 write_reinstruct; /* sectors to skip, writes */
__be32 read_reinstruct; /* sectors to skip, reads */
unsigned char spare[148]; /* Padding */
__be16 rspeed; /* Disk rotational speed */
__be16 pcylcount; /* Physical cylinder count */
__be16 sparecyl; /* extra sects per cylinder */
__be16 obs1; /* gap1 */
__be16 obs2; /* gap2 */
__be16 ilfact; /* Interleave factor */
__be16 ncyl; /* Data cylinder count */
__be16 nacyl; /* Alt. cylinder count */
__be16 ntrks; /* Tracks per cylinder */
__be16 nsect; /* Sectors per track */
__be16 obs3; /* bhead - Label head offset */
__be16 obs4; /* ppart - Physical Partition */
struct sun_partition {
__be32 start_cylinder;
__be32 num_sectors;
} partitions[8];
__be16 magic; /* Magic number */
__be16 csum; /* Label xor'd checksum */
} * label;
struct sun_partition *p;
unsigned long spc;
char b[BDEVNAME_SIZE];
int use_vtoc;
int nparts;
label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
bdevname(bdev, b), be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
/* Look at the checksum */
ush = ((__be16 *) (label+1)) - 1;
for (csum = 0; ush >= ((__be16 *) label);)
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
/* Check to see if we can use the VTOC table */
use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
(be32_to_cpu(label->vtoc.version) == 1) &&
(be16_to_cpu(label->vtoc.nparts) <= 8));
/* Use 8 partition entries if not specified in validated VTOC */
nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
/*
* So that old Linux-Sun partitions continue to work,
* allow the VTOC to be used under the additional condition ...
*/
use_vtoc = use_vtoc || !(label->vtoc.sanity ||
label->vtoc.version || label->vtoc.nparts);
spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
for (i = 0; i < nparts; i++, p++) {
unsigned long st_sector;
unsigned int num_sectors;
st_sector = be32_to_cpu(p->start_cylinder) * spc;
num_sectors = be32_to_cpu(p->num_sectors);
if (num_sectors) {
put_partition(state, slot, st_sector, num_sectors);
state->parts[slot].flags = 0;
if (use_vtoc) {
if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
state->parts[slot].flags |= ADDPART_FLAG_RAID;
else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
}
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

8
block/partitions/sun.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/sun.h
*/
#define SUN_LABEL_MAGIC 0xDABE
#define SUN_VTOC_SANITY 0x600DDEEE
int sun_partition(struct parsed_partitions *state);

95
block/partitions/sysv68.c Normal file
View file

@ -0,0 +1,95 @@
/*
* fs/partitions/sysv68.c
*
* Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
*/
#include "check.h"
#include "sysv68.h"
/*
* Volume ID structure: on first 256-bytes sector of disk
*/
struct volumeid {
u8 vid_unused[248];
u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
};
/*
* config block: second 256-bytes sector on disk
*/
struct dkconfig {
u8 ios_unused0[128];
__be32 ios_slcblk; /* Slice table block number */
__be16 ios_slccnt; /* Number of entries in slice table */
u8 ios_unused1[122];
};
/*
* combined volumeid and dkconfig block
*/
struct dkblk0 {
struct volumeid dk_vid;
struct dkconfig dk_ios;
};
/*
* Slice Table Structure
*/
struct slice {
__be32 nblocks; /* slice size (in blocks) */
__be32 blkoff; /* block offset of slice */
};
int sysv68_partition(struct parsed_partitions *state)
{
int i, slices;
int slot = 1;
Sector sect;
unsigned char *data;
struct dkblk0 *b;
struct slice *slice;
char tmp[64];
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
b = (struct dkblk0 *)data;
if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
put_dev_sector(sect);
return 0;
}
slices = be16_to_cpu(b->dk_ios.ios_slccnt);
i = be32_to_cpu(b->dk_ios.ios_slcblk);
put_dev_sector(sect);
data = read_part_sector(state, i, &sect);
if (!data)
return -1;
slices -= 1; /* last slice is the whole disk */
snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
slice = (struct slice *)data;
for (i = 0; i < slices; i++, slice++) {
if (slot == state->limit)
break;
if (be32_to_cpu(slice->nblocks)) {
put_partition(state, slot,
be32_to_cpu(slice->blkoff),
be32_to_cpu(slice->nblocks));
snprintf(tmp, sizeof(tmp), "(s%u)", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

1
block/partitions/sysv68.h Normal file
View file

@ -0,0 +1 @@
extern int sysv68_partition(struct parsed_partitions *state);

48
block/partitions/ultrix.c Normal file
View file

@ -0,0 +1,48 @@
/*
* fs/partitions/ultrix.c
*
* Code extracted from drivers/block/genhd.c
*
* Re-organised Jul 1999 Russell King
*/
#include "check.h"
#include "ultrix.h"
int ultrix_partition(struct parsed_partitions *state)
{
int i;
Sector sect;
unsigned char *data;
struct ultrix_disklabel {
s32 pt_magic; /* magic no. indicating part. info exists */
s32 pt_valid; /* set by driver if pt is current */
struct pt_info {
s32 pi_nblocks; /* no. of sectors */
u32 pi_blkoff; /* block offset for start */
} pt_part[8];
} *label;
#define PT_MAGIC 0x032957 /* Partition magic number */
#define PT_VALID 1 /* Indicates if struct is valid */
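/* The label occupies the last sizeof(*label) bytes of the 16 KB boot
 * block, so read the sector containing that tail and pick the label up
 * at the end of the 512-byte buffer below. */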
data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
if (!data)
return -1;
label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
for (i=0; i<8; i++)
if (label->pt_part[i].pi_nblocks)
put_partition(state, i+1,
label->pt_part[i].pi_blkoff,
label->pt_part[i].pi_nblocks);
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
} else {
put_dev_sector(sect);
return 0;
}
}

5
block/partitions/ultrix.h Normal file
View file

@ -0,0 +1,5 @@
/*
* fs/partitions/ultrix.h
*/
int ultrix_partition(struct parsed_partitions *state);

761
block/scsi_ioctl.c Normal file
View file

@ -0,0 +1,761 @@
/*
* Copyright (C) 2001 Jens Axboe <axboe@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
#include <linux/uio.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
struct blk_cmd_filter {
unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
};
static struct blk_cmd_filter blk_default_cmd_filter;
/* Command group 3 is reserved and should never be used. */
const unsigned char scsi_command_size_tbl[8] =
{
6, 10, 10, 12,
16, 12, 10, 10
};
EXPORT_SYMBOL(scsi_command_size_tbl);
#include <scsi/sg.h>
static int sg_get_version(int __user *p)
{
static const int sg_version_num = 30527;
return put_user(sg_version_num, p);
}
static int scsi_get_idlun(struct request_queue *q, int __user *p)
{
return put_user(0, p);
}
static int scsi_get_bus(struct request_queue *q, int __user *p)
{
return put_user(0, p);
}
static int sg_get_timeout(struct request_queue *q)
{
return jiffies_to_clock_t(q->sg_timeout);
}
static int sg_set_timeout(struct request_queue *q, int __user *p)
{
int timeout, err = get_user(timeout, p);
if (!err)
q->sg_timeout = clock_t_to_jiffies(timeout);
return err;
}
static int max_sectors_bytes(struct request_queue *q)
{
unsigned int max_sectors = queue_max_sectors(q);
max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
return max_sectors << 9;
}
static int sg_get_reserved_size(struct request_queue *q, int __user *p)
{
int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
return put_user(val, p);
}
static int sg_set_reserved_size(struct request_queue *q, int __user *p)
{
int size, err = get_user(size, p);
if (err)
return err;
if (size < 0)
return -EINVAL;
q->sg_reserved_size = min(size, max_sectors_bytes(q));
return 0;
}
/*
* will always return that we are ATAPI even for a real SCSI drive; I'm not
* so sure this is worth doing anything about (why would you care?)
*/
static int sg_emulated_host(struct request_queue *q, int __user *p)
{
return put_user(1, p);
}
static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
{
/* Basic read-only commands */
__set_bit(TEST_UNIT_READY, filter->read_ok);
__set_bit(REQUEST_SENSE, filter->read_ok);
__set_bit(READ_6, filter->read_ok);
__set_bit(READ_10, filter->read_ok);
__set_bit(READ_12, filter->read_ok);
__set_bit(READ_16, filter->read_ok);
__set_bit(READ_BUFFER, filter->read_ok);
__set_bit(READ_DEFECT_DATA, filter->read_ok);
__set_bit(READ_CAPACITY, filter->read_ok);
__set_bit(READ_LONG, filter->read_ok);
__set_bit(INQUIRY, filter->read_ok);
__set_bit(MODE_SENSE, filter->read_ok);
__set_bit(MODE_SENSE_10, filter->read_ok);
__set_bit(LOG_SENSE, filter->read_ok);
__set_bit(START_STOP, filter->read_ok);
__set_bit(GPCMD_VERIFY_10, filter->read_ok);
__set_bit(VERIFY_16, filter->read_ok);
__set_bit(REPORT_LUNS, filter->read_ok);
__set_bit(SERVICE_ACTION_IN, filter->read_ok);
__set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
__set_bit(MAINTENANCE_IN, filter->read_ok);
__set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
/* Audio CD commands */
__set_bit(GPCMD_PLAY_CD, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
__set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
/* CD/DVD data reading */
__set_bit(GPCMD_READ_CD, filter->read_ok);
__set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
__set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
__set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
__set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
__set_bit(GPCMD_READ_HEADER, filter->read_ok);
__set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
__set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
__set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
__set_bit(GPCMD_REPORT_KEY, filter->read_ok);
__set_bit(GPCMD_SCAN, filter->read_ok);
__set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
__set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
__set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
__set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
__set_bit(GPCMD_SEEK, filter->read_ok);
__set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
/* Basic writing commands */
__set_bit(WRITE_6, filter->write_ok);
__set_bit(WRITE_10, filter->write_ok);
__set_bit(WRITE_VERIFY, filter->write_ok);
__set_bit(WRITE_12, filter->write_ok);
__set_bit(WRITE_VERIFY_12, filter->write_ok);
__set_bit(WRITE_16, filter->write_ok);
__set_bit(WRITE_LONG, filter->write_ok);
__set_bit(WRITE_LONG_2, filter->write_ok);
__set_bit(ERASE, filter->write_ok);
__set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
__set_bit(MODE_SELECT, filter->write_ok);
__set_bit(LOG_SELECT, filter->write_ok);
__set_bit(GPCMD_BLANK, filter->write_ok);
__set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
__set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
__set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
__set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
__set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
__set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
__set_bit(GPCMD_SEND_EVENT, filter->write_ok);
__set_bit(GPCMD_SEND_KEY, filter->write_ok);
__set_bit(GPCMD_SEND_OPC, filter->write_ok);
__set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
__set_bit(GPCMD_SET_SPEED, filter->write_ok);
__set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
__set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
__set_bit(GPCMD_SET_STREAMING, filter->write_ok);
__set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
}
int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
{
struct blk_cmd_filter *filter = &blk_default_cmd_filter;
/* root can do any command. */
if (capable(CAP_SYS_RAWIO))
return 0;
/* Anybody who can open the device can do a read-safe command */
if (test_bit(cmd[0], filter->read_ok))
return 0;
/* Write-safe commands require a writable open */
if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
return 0;
return -EPERM;
}
EXPORT_SYMBOL(blk_verify_command);
static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_hdr *hdr, fmode_t mode)
{
if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
return -EFAULT;
if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
return -EPERM;
/*
* fill in request structure
*/
rq->cmd_len = hdr->cmd_len;
rq->timeout = msecs_to_jiffies(hdr->timeout);
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
if (rq->timeout < BLK_MIN_SG_TIMEOUT)
rq->timeout = BLK_MIN_SG_TIMEOUT;
return 0;
}
static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
struct bio *bio)
{
int r, ret = 0;
/*
* fill in all the output members
*/
hdr->status = rq->errors & 0xff;
hdr->masked_status = status_byte(rq->errors);
hdr->msg_status = msg_byte(rq->errors);
hdr->host_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
hdr->resid = rq->resid_len;
hdr->sb_len_wr = 0;
if (rq->sense_len && hdr->sbp) {
int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
if (!copy_to_user(hdr->sbp, rq->sense, len))
hdr->sb_len_wr = len;
else
ret = -EFAULT;
}
r = blk_rq_unmap_user(bio);
if (!ret)
ret = r;
return ret;
}
static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
struct sg_io_hdr *hdr, fmode_t mode)
{
unsigned long start_time;
ssize_t ret = 0;
int writing = 0;
int at_head = 0;
struct request *rq;
char sense[SCSI_SENSE_BUFFERSIZE];
struct bio *bio;
if (hdr->interface_id != 'S')
return -EINVAL;
if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
return -EIO;
if (hdr->dxfer_len)
switch (hdr->dxfer_direction) {
default:
return -EINVAL;
case SG_DXFER_TO_DEV:
writing = 1;
break;
case SG_DXFER_TO_FROM_DEV:
case SG_DXFER_FROM_DEV:
break;
}
if (hdr->flags & SG_FLAG_Q_AT_HEAD)
at_head = 1;
ret = -ENOMEM;
rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
if (hdr->cmd_len > BLK_MAX_CDB) {
rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
if (!rq->cmd)
goto out_put_request;
}
ret = -EFAULT;
if (blk_fill_sghdr_rq(q, rq, hdr, mode))
goto out_free_cdb;
ret = 0;
if (hdr->iovec_count) {
size_t iov_data_len;
struct iovec *iov = NULL;
ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
0, NULL, &iov);
if (ret < 0) {
kfree(iov);
goto out_free_cdb;
}
iov_data_len = ret;
ret = 0;
/* SG_IO howto says that the shorter of the two wins */
if (hdr->dxfer_len < iov_data_len) {
hdr->iovec_count = iov_shorten(iov,
hdr->iovec_count,
hdr->dxfer_len);
iov_data_len = hdr->dxfer_len;
}
ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
hdr->iovec_count,
iov_data_len, GFP_KERNEL);
kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
GFP_KERNEL);
if (ret)
goto out_free_cdb;
bio = rq->bio;
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
rq->retries = 0;
start_time = jiffies;
/* ignore return value. All information is passed back to caller
* (if the caller doesn't check, that is their problem).
* N.B. a non-zero SCSI status is _not_ necessarily an error.
*/
blk_execute_rq(q, bd_disk, rq, at_head);
hdr->duration = jiffies_to_msecs(jiffies - start_time);
ret = blk_complete_sghdr_rq(rq, hdr, bio);
out_free_cdb:
if (rq->cmd != rq->__cmd)
kfree(rq->cmd);
out_put_request:
blk_put_request(rq);
return ret;
}
/**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
* @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down
* @disk: gendisk to operate on (optional)
* @sic: userspace structure describing the command to perform
*
* Send down the scsi command described by @sic to the device below
* the request queue @q. If @file is non-NULL it's used to perform
* fine-grained permission checks that allow users to send down
* non-destructive SCSI commands. If the caller has a struct gendisk
* available it should be passed in as @disk to allow the low level
* driver to use the information contained in it. A non-NULL @disk
* is only allowed if the caller knows that the low level driver doesn't
* need it (e.g. in the scsi subsystem).
*
* Notes:
* - This interface is deprecated - users should use the SG_IO
* interface instead, as this is a more flexible approach to
* performing SCSI commands on a device.
* - The SCSI command length is determined by examining the 1st byte
* of the given command. There is no way to override this.
* - Data transfers are limited to PAGE_SIZE
* - The length (x + y) must be at least OMAX_SB_LEN bytes long to
* accommodate the sense buffer when an error occurs.
* The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
* old code will not be surprised.
* - If a Unix error occurs (e.g. ENOMEM) then the user will receive
* a negative return and the Unix error code in 'errno'.
* If the SCSI command succeeds then 0 is returned.
* Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status.
*/
#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
struct scsi_ioctl_command __user *sic)
{
struct request *rq;
int err;
unsigned int in_len, out_len, bytes, opcode, cmdlen;
char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
if (!sic)
return -EINVAL;
/*
* get the in and out lengths, and verify they don't exceed a page worth of data
*/
if (get_user(in_len, &sic->inlen))
return -EFAULT;
if (get_user(out_len, &sic->outlen))
return -EFAULT;
if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
return -EINVAL;
if (get_user(opcode, sic->data))
return -EFAULT;
bytes = max(in_len, out_len);
if (bytes) {
buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
if (!buffer)
return -ENOMEM;
}
rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto error_free_buffer;
}
blk_rq_set_block_pc(rq);
cmdlen = COMMAND_SIZE(opcode);
/*
* get command and data to send to device, if any
*/
err = -EFAULT;
rq->cmd_len = cmdlen;
if (copy_from_user(rq->cmd, sic->data, cmdlen))
goto error;
if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
goto error;
err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
if (err)
goto error;
/* default; possibly overridden later */
rq->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
rq->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
break;
case MOVE_MEDIUM:
rq->timeout = MOVE_MEDIUM_TIMEOUT;
break;
case READ_ELEMENT_STATUS:
rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
rq->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
break;
}
if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
err = DRIVER_ERROR << 24;
goto error;
}
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
blk_execute_rq(q, disk, rq, 0);
err = rq->errors & 0xff; /* only 8 bit SCSI status */
if (err) {
if (rq->sense_len && rq->sense) {
bytes = (OMAX_SB_LEN > rq->sense_len) ?
rq->sense_len : OMAX_SB_LEN;
if (copy_to_user(sic->data, rq->sense, bytes))
err = -EFAULT;
}
} else {
if (copy_to_user(sic->data, buffer, out_len))
err = -EFAULT;
}
error:
blk_put_request(rq);
error_free_buffer:
kfree(buffer);
return err;
}
EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
/* Send basic block requests */
static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
int cmd, int data)
{
struct request *rq;
int err;
rq = blk_get_request(q, WRITE, __GFP_WAIT);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
rq->cmd[0] = cmd;
rq->cmd[4] = data;
rq->cmd_len = 6;
err = blk_execute_rq(q, bd_disk, rq, 0);
blk_put_request(rq);
return err;
}
static inline int blk_send_start_stop(struct request_queue *q,
struct gendisk *bd_disk, int data)
{
return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
}
int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
unsigned int cmd, void __user *arg)
{
int err;
if (!q)
return -ENXIO;
switch (cmd) {
/*
* new sgv3 interface
*/
case SG_GET_VERSION_NUM:
err = sg_get_version(arg);
break;
case SCSI_IOCTL_GET_IDLUN:
err = scsi_get_idlun(q, arg);
break;
case SCSI_IOCTL_GET_BUS_NUMBER:
err = scsi_get_bus(q, arg);
break;
case SG_SET_TIMEOUT:
err = sg_set_timeout(q, arg);
break;
case SG_GET_TIMEOUT:
err = sg_get_timeout(q);
break;
case SG_GET_RESERVED_SIZE:
err = sg_get_reserved_size(q, arg);
break;
case SG_SET_RESERVED_SIZE:
err = sg_set_reserved_size(q, arg);
break;
case SG_EMULATED_HOST:
err = sg_emulated_host(q, arg);
break;
case SG_IO: {
struct sg_io_hdr hdr;
err = -EFAULT;
if (copy_from_user(&hdr, arg, sizeof(hdr)))
break;
err = sg_io(q, bd_disk, &hdr, mode);
if (err == -EFAULT)
break;
if (copy_to_user(arg, &hdr, sizeof(hdr)))
err = -EFAULT;
break;
}
case CDROM_SEND_PACKET: {
struct cdrom_generic_command cgc;
struct sg_io_hdr hdr;
err = -EFAULT;
if (copy_from_user(&cgc, arg, sizeof(cgc)))
break;
cgc.timeout = clock_t_to_jiffies(cgc.timeout);
memset(&hdr, 0, sizeof(hdr));
hdr.interface_id = 'S';
hdr.cmd_len = sizeof(cgc.cmd);
hdr.dxfer_len = cgc.buflen;
err = 0;
switch (cgc.data_direction) {
case CGC_DATA_UNKNOWN:
hdr.dxfer_direction = SG_DXFER_UNKNOWN;
break;
case CGC_DATA_WRITE:
hdr.dxfer_direction = SG_DXFER_TO_DEV;
break;
case CGC_DATA_READ:
hdr.dxfer_direction = SG_DXFER_FROM_DEV;
break;
case CGC_DATA_NONE:
hdr.dxfer_direction = SG_DXFER_NONE;
break;
default:
err = -EINVAL;
}
if (err)
break;
hdr.dxferp = cgc.buffer;
hdr.sbp = cgc.sense;
if (hdr.sbp)
hdr.mx_sb_len = sizeof(struct request_sense);
hdr.timeout = jiffies_to_msecs(cgc.timeout);
hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
hdr.cmd_len = sizeof(cgc.cmd);
err = sg_io(q, bd_disk, &hdr, mode);
if (err == -EFAULT)
break;
if (hdr.status)
err = -EIO;
cgc.stat = err;
cgc.buflen = hdr.resid;
if (copy_to_user(arg, &cgc, sizeof(cgc)))
err = -EFAULT;
break;
}
/*
* old junk scsi send command ioctl
*/
case SCSI_IOCTL_SEND_COMMAND:
printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
err = -EINVAL;
if (!arg)
break;
err = sg_scsi_ioctl(q, bd_disk, mode, arg);
break;
case CDROMCLOSETRAY:
err = blk_send_start_stop(q, bd_disk, 0x03);
break;
case CDROMEJECT:
err = blk_send_start_stop(q, bd_disk, 0x02);
break;
default:
err = -ENOTTY;
}
return err;
}
EXPORT_SYMBOL(scsi_cmd_ioctl);
int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
{
if (bd && bd == bd->bd_contains)
return 0;
/* Actually none of these is particularly useful on a partition,
* but they are safe.
*/
switch (cmd) {
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
case SCSI_IOCTL_GET_PCI:
case SCSI_IOCTL_PROBE_HOST:
case SG_GET_VERSION_NUM:
case SG_SET_TIMEOUT:
case SG_GET_TIMEOUT:
case SG_GET_RESERVED_SIZE:
case SG_SET_RESERVED_SIZE:
case SG_EMULATED_HOST:
case SCSI_IOCTL_SECURITY_PROTOCOL_IN:
case SCSI_IOCTL_SECURITY_PROTOCOL_OUT:
return 0;
case CDROM_GET_CAPABILITY:
/* Keep this until we remove the printk below. udev sends it
* and we do not want to spam dmesg about it. CD-ROMs do
* not have partitions, so we get here only for disks.
*/
return -ENOIOCTLCMD;
default:
break;
}
if (capable(CAP_SYS_RAWIO))
return 0;
/* In particular, rule out all resets and host-specific ioctls. */
printk_ratelimited(KERN_WARNING
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
unsigned int cmd, void __user *arg)
{
int ret;
ret = scsi_verify_blk_ioctl(bd, cmd);
if (ret < 0)
return ret;
return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
}
EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
static int __init blk_scsi_ioctl_init(void)
{
blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
return 0;
}
fs_initcall(blk_scsi_ioctl_init);
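As the sg_scsi_ioctl() documentation above notes, new code should use the SG_IO interface rather than SCSI_IOCTL_SEND_COMMAND. The hedged user-space sketch below issues a standard 6-byte INQUIRY through SG_IO; the device path, transfer length and timeout are illustrative assumptions, and the sg_io_hdr fields mirror the ones sg_io() and the CDROM_SEND_PACKET path fill in above.

/* User-space SG_IO sketch: send a 6-byte INQUIRY and print status/vendor. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(int argc, char **argv)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96 bytes */
	unsigned char resp[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open(argc > 1 ? argv[1] : "/dev/sg0", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* checked by sg_io() above */
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxfer_len = sizeof(resp);
	hdr.dxferp = resp;
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0) {
		perror("SG_IO");
		return 1;
	}
	/* Vendor identification is bytes 8..15 of the standard INQUIRY data */
	printf("status 0x%x, vendor '%.8s'\n", hdr.status, (char *)resp + 8);
	close(fd);
	return 0;
}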

197
block/t10-pi.c Normal file
View file

@ -0,0 +1,197 @@
/*
* t10_pi.c - Functions for generating and verifying T10 Protection
* Information.
*
* Copyright (C) 2007, 2008, 2014 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/t10-pi.h>
#include <linux/blkdev.h>
#include <linux/crc-t10dif.h>
#include <net/checksum.h>
typedef __be16 (csum_fn) (void *, unsigned int);
static const __be16 APP_ESCAPE = (__force __be16) 0xffff;
static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff;
static __be16 t10_pi_crc_fn(void *data, unsigned int len)
{
return cpu_to_be16(crc_t10dif(data, len));
}
static __be16 t10_pi_ip_fn(void *data, unsigned int len)
{
return (__force __be16)ip_compute_csum(data, len);
}
/*
* Type 1 and Type 2 protection use the same format: 16 bit guard tag,
* 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
* tag.
*/
static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
unsigned int type)
{
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
pi->guard_tag = fn(iter->data_buf, iter->interval);
pi->app_tag = 0;
if (type == 1)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
iter->data_buf += iter->interval;
iter->prot_buf += sizeof(struct t10_pi_tuple);
iter->seed++;
}
return 0;
}
static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
unsigned int type)
{
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
__be16 csum;
switch (type) {
case 1:
case 2:
if (pi->app_tag == APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
lower_32_bits(iter->seed)) {
pr_err("%s: ref tag error at location %llu " \
"(rcvd %u)\n", iter->disk_name,
(unsigned long long)
iter->seed, be32_to_cpu(pi->ref_tag));
return -EILSEQ;
}
break;
case 3:
if (pi->app_tag == APP_ESCAPE &&
pi->ref_tag == REF_ESCAPE)
goto next;
break;
}
csum = fn(iter->data_buf, iter->interval);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %04x, want %04x)\n", iter->disk_name,
(unsigned long long)iter->seed,
be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
return -EILSEQ;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += sizeof(struct t10_pi_tuple);
iter->seed++;
}
return 0;
}
static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_crc_fn, 1);
}
static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_ip_fn, 1);
}
static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_crc_fn, 1);
}
static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_ip_fn, 1);
}
static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_crc_fn, 3);
}
static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_ip_fn, 3);
}
static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_crc_fn, 3);
}
static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
}
struct blk_integrity t10_pi_type1_crc = {
.name = "T10-DIF-TYPE1-CRC",
.generate_fn = t10_pi_type1_generate_crc,
.verify_fn = t10_pi_type1_verify_crc,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type1_crc);
struct blk_integrity t10_pi_type1_ip = {
.name = "T10-DIF-TYPE1-IP",
.generate_fn = t10_pi_type1_generate_ip,
.verify_fn = t10_pi_type1_verify_ip,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type1_ip);
struct blk_integrity t10_pi_type3_crc = {
.name = "T10-DIF-TYPE3-CRC",
.generate_fn = t10_pi_type3_generate_crc,
.verify_fn = t10_pi_type3_verify_crc,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type3_crc);
struct blk_integrity t10_pi_type3_ip = {
.name = "T10-DIF-TYPE3-IP",
.generate_fn = t10_pi_type3_generate_ip,
.verify_fn = t10_pi_type3_verify_ip,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type3_ip);
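For illustration only, the sketch below reproduces in user space what t10_pi_generate() emits for Type 1 protection: one 8-byte tuple per data interval with a big-endian CRC16 guard tag (polynomial 0x8BB7), a zero application tag, and the low 32 bits of the seed as reference tag. The bitwise CRC is an assumed stand-in for the kernel's crc_t10dif(), and the struct and helper names are hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <endian.h>

struct pi_tuple {		/* mirrors struct t10_pi_tuple: 8 bytes per interval */
	uint16_t guard_tag;	/* big-endian CRC of the interval */
	uint16_t app_tag;	/* left at zero by t10_pi_generate() */
	uint32_t ref_tag;	/* Type 1: low 32 bits of the seed (LBA) */
} __attribute__((packed));

/* Bitwise CRC-16/T10-DIF: poly 0x8BB7, init 0, no reflection
 * (assumed equivalent of the kernel's crc_t10dif()). */
static uint16_t crc16_t10dif(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;

	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)(crc << 1) ^ 0x8bb7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* Fill one Type 1 tuple for a single data interval, as the generate loop does. */
static void pi_type1_generate(struct pi_tuple *pi, const uint8_t *data,
			      size_t interval, uint64_t seed)
{
	pi->guard_tag = htobe16(crc16_t10dif(data, interval));
	pi->app_tag = 0;
	pi->ref_tag = htobe32((uint32_t)seed);	/* lower_32_bits(seed) */
}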