Fixed MTP to work with TWRP

Commit f6dfaef42e by awab228, 2018-06-19 23:16:04 +02:00
50820 changed files with 20846062 additions and 0 deletions

block/Kconfig (new file, 127 lines)
@@ -0,0 +1,127 @@
#
# Block layer core configuration
#
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
help
Provide block layer support for the kernel.
Disable this option to remove the block layer support from the
kernel. This may be useful for embedded devices.
If this option is disabled:
- block device files will become unusable
- some filesystems (such as ext3) will become unavailable.
Also, SCSI character devices and USB storage will be disabled since
they make use of various block layer definitions and facilities.
Say Y here unless you know you really don't want to mount disks and
suchlike.
if BLOCK
config LBDAF
bool "Support for large (2TB+) block devices and files"
depends on !64BIT
default y
help
Enable block devices or files of size 2TB and larger.
This option is required to support the full capacity of large
(2TB+) block devices, including RAID, disk, Network Block Device,
Logical Volume Manager (LVM) and loopback.
This option also enables support for single files larger than
2TB.
The ext4 filesystem requires that this feature be enabled in
order to support filesystems that have the huge_file feature
enabled. Otherwise, it will refuse to mount in the read-write
mode any filesystems that use the huge_file feature, which is
enabled by default by mke2fs.ext4.
The GFS2 filesystem also requires this feature.
If unsure, say Y.
config BLK_DEV_BSG
bool "Block layer SG support v4"
default y
help
Saying Y here will enable generic SG (SCSI generic) v4 support
for any block device.
Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4
can handle complicated SCSI commands: tagged variable length cdbs
with bidirectional data transfers and generic request/response
protocols (e.g. Task Management Functions and SMP in Serial
Attached SCSI).
This option is required by recent UDEV versions to properly
access device serial numbers, etc.
If unsure, say Y.
config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
default n
select BLK_DEV_BSG
help
Subsystems will normally enable this if needed. Users will not
normally need to manually enable this.
If unsure, say N.
config BLK_DEV_INTEGRITY
bool "Block layer data integrity support"
select CRC_T10DIF if BLK_DEV_INTEGRITY
---help---
Some storage devices allow extra information to be
stored/retrieved to help protect the data. The block layer
data integrity option provides hooks which can be used by
filesystems to ensure better data integrity.
Say yes here if you have a storage device that provides the
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
default n
---help---
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
one needs to mount and use blkio cgroup controller for creating
cgroups and specifying per device IO rate policies.
See Documentation/cgroups/blkio-controller.txt for more information.
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
---help---
Enabling this option allows you to specify the partition layout from
the kernel boot args. This is typically of use for embedded devices
which don't otherwise have any standardized method for listing the
partitions on a block device.
See Documentation/block/cmdline-partition.txt for more information.
menu "Partition Types"
source "block/partitions/Kconfig"
endmenu
endif # BLOCK
config BLOCK_COMPAT
bool
depends on BLOCK && COMPAT
default y
source block/Kconfig.iosched

block/Kconfig.iosched (new file, 68 lines)
@@ -0,0 +1,68 @@
if BLOCK
menu "IO Schedulers"
config IOSCHED_NOOP
bool
default y
---help---
The no-op I/O scheduler is a minimal scheduler that does basic merging
and sorting. Its main uses include non-disk based block devices like
memory devices, and specialised software or hardware environments
that do their own scheduling and require only minimal assistance from
the kernel.
config IOSCHED_DEADLINE
tristate "Deadline I/O scheduler"
default y
---help---
The deadline I/O scheduler is simple and compact. It will provide
CSCAN service with FIFO expiration of requests, switching to
a new point in the service tree and doing a batch of IO from there
in case of expiry.
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
among all processes in the system. It should provide a fair
and low latency working environment, suitable for both desktop
and server systems.
This is the default I/O scheduler.
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
help
Select the I/O scheduler which will be used by default for all
block devices.
config DEFAULT_DEADLINE
bool "Deadline" if IOSCHED_DEADLINE=y
config DEFAULT_CFQ
bool "CFQ" if IOSCHED_CFQ=y
config DEFAULT_NOOP
bool "No-op"
endchoice
config DEFAULT_IOSCHED
string
default "deadline" if DEFAULT_DEADLINE
default "cfq" if DEFAULT_CFQ
default "noop" if DEFAULT_NOOP
endmenu
endif
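
The choice above only fixes the built-in default; the elevator can still be switched per device at runtime through sysfs. A minimal user-space sketch, not part of this commit (the device name sda is an assumption):

/* Illustrative only: select the deadline elevator for /dev/sda at runtime. */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/sda/queue/scheduler", O_WRONLY);

	if (fd < 0)
		return 1;
	/* the kernel matches the written string against registered elevators */
	if (write(fd, "deadline", 8) != 8) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}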

block/Makefile (new file, 25 lines)
@@ -0,0 +1,25 @@
#
# Makefile for the kernel block layer
#
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o

block/bio-integrity.c (new file, 517 lines)
@@ -0,0 +1,517 @@
/*
* bio-integrity.c - bio data integrity extensions
*
* Copyright (C) 2007, 2008, 2009 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#define BIP_INLINE_VECS 4
static struct kmem_cache *bip_slab;
static struct workqueue_struct *kintegrityd_wq;
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
* @gfp_mask: Memory allocation mask
* @nr_vecs: Number of integrity metadata scatter-gather elements
*
* Description: This function prepares a bio for attaching integrity
* metadata. nr_vecs specifies the maximum number of pages containing
* integrity metadata that can be attached.
*/
struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
gfp_t gfp_mask,
unsigned int nr_vecs)
{
struct bio_integrity_payload *bip;
struct bio_set *bs = bio->bi_pool;
unsigned long idx = BIO_POOL_NONE;
unsigned inline_vecs;
if (!bs) {
bip = kmalloc(sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * nr_vecs, gfp_mask);
inline_vecs = nr_vecs;
} else {
bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
inline_vecs = BIP_INLINE_VECS;
}
if (unlikely(!bip))
return NULL;
memset(bip, 0, sizeof(*bip));
if (nr_vecs > inline_vecs) {
bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
bs->bvec_integrity_pool);
if (!bip->bip_vec)
goto err;
bip->bip_max_vcnt = bvec_nr_vecs(idx);
} else {
bip->bip_vec = bip->bip_inline_vecs;
bip->bip_max_vcnt = inline_vecs;
}
bip->bip_slab = idx;
bip->bip_bio = bio;
bio->bi_integrity = bip;
bio->bi_rw |= REQ_INTEGRITY;
return bip;
err:
mempool_free(bip, bs->bio_integrity_pool);
return NULL;
}
EXPORT_SYMBOL(bio_integrity_alloc);
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
*
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
kfree(page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset);
if (bs) {
if (bip->bip_slab != BIO_POOL_NONE)
bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
bip->bip_slab);
mempool_free(bip, bs->bio_integrity_pool);
} else {
kfree(bip);
}
bio->bi_integrity = NULL;
}
EXPORT_SYMBOL(bio_integrity_free);
/**
* bio_integrity_add_page - Attach integrity metadata
* @bio: bio to update
* @page: page containing integrity metadata
* @len: number of bytes of integrity metadata in page
* @offset: start offset within page
*
* Description: Attach a page containing integrity metadata to bio.
*/
int bio_integrity_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_vec *iv;
if (bip->bip_vcnt >= bip->bip_max_vcnt) {
printk(KERN_ERR "%s: bip_vec full\n", __func__);
return 0;
}
iv = bip->bip_vec + bip->bip_vcnt;
iv->bv_page = page;
iv->bv_len = len;
iv->bv_offset = offset;
bip->bip_vcnt++;
return len;
}
EXPORT_SYMBOL(bio_integrity_add_page);
/**
* bio_integrity_enabled - Check whether integrity can be passed
* @bio: bio to check
*
* Description: Determines whether bio_integrity_prep() can be called
* on this bio or not. bio data direction and target device must be
* set prior to calling. The function honors the write_generate and
* read_verify flags in sysfs.
*/
bool bio_integrity_enabled(struct bio *bio)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
if (!bio_is_rw(bio))
return false;
/* Already protected? */
if (bio_integrity(bio))
return false;
if (bi == NULL)
return false;
if (bio_data_dir(bio) == READ && bi->verify_fn != NULL &&
(bi->flags & BLK_INTEGRITY_VERIFY))
return true;
if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL &&
(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
return false;
}
EXPORT_SYMBOL(bio_integrity_enabled);
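/*
 * Typical call pattern in the submission path (illustrative sketch only,
 * not part of this file): probe with bio_integrity_enabled() before
 * paying the cost of bio_integrity_prep(), and fail the bio if the
 * protection buffer cannot be attached:
 *
 *	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
 *		bio_endio(bio, -EIO);
 *		return;
 *	}
 */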
/**
* bio_integrity_intervals - Return number of integrity intervals for a bio
* @bi: blk_integrity profile for device
* @sectors: Size of the bio in 512-byte sectors
*
* Description: The block layer calculates everything in 512 byte
* sectors but integrity metadata is done in terms of the data integrity
* interval size of the storage device. Convert the block layer sectors
* to the appropriate number of integrity intervals.
*/
static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi,
unsigned int sectors)
{
return sectors >> (ilog2(bi->interval) - 9);
}
static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
unsigned int sectors)
{
return bio_integrity_intervals(bi, sectors) * bi->tuple_size;
}
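/*
 * Worked example (illustrative): with a 4096-byte integrity interval and an
 * 8-byte tuple, a 32-sector (16 KiB) bio spans 32 >> (ilog2(4096) - 9) = 4
 * intervals, i.e. 4 * 8 = 32 bytes of protection information.
 */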
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
* @proc_fn: Pointer to the relevant processing function
*/
static int bio_integrity_process(struct bio *bio,
integrity_processing_fn *proc_fn)
{
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
struct bio_integrity_payload *bip = bio_integrity(bio);
unsigned int ret = 0;
void *prot_buf = page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset;
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = bi->interval;
iter.seed = bip_get_seed(bip);
iter.prot_buf = prot_buf;
bio_for_each_segment(bv, bio, bviter) {
void *kaddr = kmap_atomic(bv.bv_page);
iter.data_buf = kaddr + bv.bv_offset;
iter.data_size = bv.bv_len;
ret = proc_fn(&iter);
if (ret) {
kunmap_atomic(kaddr);
return ret;
}
kunmap_atomic(kaddr);
}
return ret;
}
/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
*
* Description: Allocates a buffer for integrity metadata, maps the
* pages and attaches them to a bio. The bio must have data
* direction, target device and start sector set prior to calling. In
* the WRITE case, integrity metadata will be generated using the
* block device's integrity function. In the READ case, the buffer
* will be prepared for DMA and a suitable end_io handler set up.
*/
int bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
struct blk_integrity *bi;
struct request_queue *q;
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
unsigned int intervals;
bi = bdev_get_integrity(bio->bi_bdev);
q = bdev_get_queue(bio->bi_bdev);
BUG_ON(bi == NULL);
BUG_ON(bio_integrity(bio));
intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
len = intervals * bi->tuple_size;
buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
return -ENOMEM;
}
end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ((unsigned long) buf) >> PAGE_SHIFT;
nr_pages = end - start;
/* Allocate bio integrity payload and integrity vectors */
bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
if (unlikely(bip == NULL)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
return -EIO;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
bip->bip_iter.bi_size = len;
bip_set_seed(bip, bio->bi_iter.bi_sector);
if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
bip->bip_flags |= BIP_IP_CHECKSUM;
/* Map it */
offset = offset_in_page(buf);
for (i = 0 ; i < nr_pages ; i++) {
int ret;
bytes = PAGE_SIZE - offset;
if (len <= 0)
break;
if (bytes > len)
bytes = len;
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);
if (ret == 0)
return 0;
if (ret < bytes)
break;
buf += bytes;
len -= bytes;
offset = 0;
}
/* Install custom I/O completion handler if read verify is enabled */
if (bio_data_dir(bio) == READ) {
bip->bip_end_io = bio->bi_end_io;
bio->bi_end_io = bio_integrity_endio;
}
/* Auto-generate integrity metadata if this is a write */
if (bio_data_dir(bio) == WRITE)
bio_integrity_process(bio, bi->generate_fn);
return 0;
}
EXPORT_SYMBOL(bio_integrity_prep);
/**
* bio_integrity_verify_fn - Integrity I/O completion worker
* @work: Work struct stored in bio to be verified
*
* Description: This workqueue function is called to complete a READ
* request. The function verifies the transferred integrity metadata
* and then calls the original bio end_io function.
*/
static void bio_integrity_verify_fn(struct work_struct *work)
{
struct bio_integrity_payload *bip =
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
int error;
error = bio_integrity_process(bio, bi->verify_fn);
/* Restore original bio completion handler */
bio->bi_end_io = bip->bip_end_io;
bio_endio_nodec(bio, error);
}
/**
* bio_integrity_endio - Integrity I/O completion function
* @bio: Protected bio
* @error: Pointer to errno
*
* Description: Completion for integrity I/O
*
* Normally I/O completion is done in interrupt context. However,
* verifying I/O integrity is a time-consuming task which must be run
* in process context. This function postpones completion
* accordingly.
*/
void bio_integrity_endio(struct bio *bio, int error)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
BUG_ON(bip->bip_bio != bio);
/* In case of an I/O error there is no point in verifying the
* integrity metadata. Restore original bio end_io handler
* and run it.
*/
if (error) {
bio->bi_end_io = bip->bip_end_io;
bio_endio_nodec(bio, error);
return;
}
INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
queue_work(kintegrityd_wq, &bip->bip_work);
}
EXPORT_SYMBOL(bio_integrity_endio);
/**
* bio_integrity_advance - Advance integrity vector
* @bio: bio whose integrity vector to update
* @bytes_done: number of data bytes that have been completed
*
* Description: This function calculates how many integrity bytes the
* number of completed data bytes correspond to and advances the
* integrity vector accordingly.
*/
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
EXPORT_SYMBOL(bio_integrity_advance);
/**
* bio_integrity_trim - Trim integrity vector
* @bio: bio whose integrity vector to update
* @offset: offset to first data sector
* @sectors: number of data sectors
*
* Description: Used to trim the integrity vector in a cloned bio.
* The ivec will be advanced corresponding to 'offset' data sectors
* and the length will be truncated corresponding to 'len' data
* sectors.
*/
void bio_integrity_trim(struct bio *bio, unsigned int offset,
unsigned int sectors)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
bio_integrity_advance(bio, offset << 9);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
}
EXPORT_SYMBOL(bio_integrity_trim);
/**
* bio_integrity_clone - Callback for cloning bios with integrity metadata
* @bio: New bio
* @bio_src: Original bio
* @gfp_mask: Memory allocation mask
*
* Description: Called to allocate a bip when cloning a bio
*/
int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
gfp_t gfp_mask)
{
struct bio_integrity_payload *bip_src = bio_integrity(bio_src);
struct bio_integrity_payload *bip;
BUG_ON(bip_src == NULL);
bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
if (bip == NULL)
return -EIO;
memcpy(bip->bip_vec, bip_src->bip_vec,
bip_src->bip_vcnt * sizeof(struct bio_vec));
bip->bip_vcnt = bip_src->bip_vcnt;
bip->bip_iter = bip_src->bip_iter;
return 0;
}
EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
if (bs->bio_integrity_pool)
return 0;
bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
if (!bs->bio_integrity_pool)
return -1;
bs->bvec_integrity_pool = biovec_create_pool(pool_size);
if (!bs->bvec_integrity_pool) {
mempool_destroy(bs->bio_integrity_pool);
return -1;
}
return 0;
}
EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
if (bs->bio_integrity_pool)
mempool_destroy(bs->bio_integrity_pool);
if (bs->bvec_integrity_pool)
mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
void __init bio_integrity_init(void)
{
/*
* kintegrityd won't block much but may burn a lot of CPU cycles.
* Make it highpri CPU intensive wq with max concurrency of 1.
*/
kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
if (!kintegrityd_wq)
panic("Failed to create kintegrityd\n");
bip_slab = kmem_cache_create("bio_integrity_payload",
sizeof(struct bio_integrity_payload) +
sizeof(struct bio_vec) * BIP_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
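
Callers that build their own protection buffer instead of relying on bio_integrity_prep() use the exported helpers above roughly as follows. This is a minimal sketch, not part of the commit; the function name attach_pi and the pi_page/pi_len/pi_offset parameters are hypothetical, and the bio is assumed to already have its data pages and target device set:

#include <linux/bio.h>
#include <linux/gfp.h>

/* Illustrative sketch: attach one page of caller-generated protection data. */
static int attach_pi(struct bio *bio, struct page *pi_page,
		     unsigned int pi_len, unsigned int pi_offset)
{
	struct bio_integrity_payload *bip;

	/* one integrity vector is enough for a single page of PI */
	bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
	if (!bip)
		return -ENOMEM;

	bip->bip_iter.bi_size = pi_len;
	bip_set_seed(bip, bio->bi_iter.bi_sector);

	if (bio_integrity_add_page(bio, pi_page, pi_len, pi_offset) < pi_len)
		return -EIO;	/* bip_vec full; caller tears down the bio */

	return 0;
}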

block/bio.c (new file, 2080 lines): diff suppressed because it is too large

block/blk-cgroup.c (new file, 1160 lines): diff suppressed because it is too large

block/blk-cgroup.h (new file, 603 lines)
@@ -0,0 +1,603 @@
#ifndef _BLK_CGROUP_H
#define _BLK_CGROUP_H
/*
* Common Block IO controller cgroup interface
*
* Based on ideas and code from CFQ, CFS and BFQ:
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*
* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
* Paolo Valente <paolo.valente@unimore.it>
*
* Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
* Nauman Rafique <nauman@google.com>
*/
#include <linux/cgroup.h>
#include <linux/u64_stats_sync.h>
#include <linux/seq_file.h>
#include <linux/radix-tree.h>
#include <linux/blkdev.h>
#include <linux/atomic.h>
/* Max limits for throttle policy */
#define THROTL_IOPS_MAX UINT_MAX
/* CFQ specific, out here for blkcg->cfq_weight */
#define CFQ_WEIGHT_MIN 10
#define CFQ_WEIGHT_MAX 1000
#define CFQ_WEIGHT_DEFAULT 500
#ifdef CONFIG_BLK_CGROUP
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
BLKG_RWSTAT_WRITE,
BLKG_RWSTAT_SYNC,
BLKG_RWSTAT_ASYNC,
BLKG_RWSTAT_NR,
BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
};
struct blkcg_gq;
struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
struct radix_tree_root blkg_tree;
struct blkcg_gq *blkg_hint;
struct hlist_head blkg_list;
/* TODO: per-policy storage in blkcg */
unsigned int cfq_weight; /* belongs to cfq */
unsigned int cfq_leaf_weight;
};
struct blkg_stat {
struct u64_stats_sync syncp;
uint64_t cnt;
};
struct blkg_rwstat {
struct u64_stats_sync syncp;
uint64_t cnt[BLKG_RWSTAT_NR];
};
/*
* A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
* request_queue (q). This is used by blkcg policies which need to track
* information per blkcg - q pair.
*
* There can be multiple active blkcg policies and each has its private
* data on each blkg, the size of which is determined by
* blkcg_policy->pd_size. blkcg core allocates and frees such areas
* together with blkg and invokes pd_init/exit_fn() methods.
*
* Such private data must embed struct blkg_policy_data (pd) at the
* beginning and pd_size can't be smaller than pd.
*/
struct blkg_policy_data {
/* the blkg and policy id this per-policy data belongs to */
struct blkcg_gq *blkg;
int plid;
/* used during policy activation */
struct list_head alloc_node;
};
/* association between a blk cgroup and a request queue */
struct blkcg_gq {
/* Pointer to the associated request_queue */
struct request_queue *q;
struct list_head q_node;
struct hlist_node blkcg_node;
struct blkcg *blkcg;
/* all non-root blkcg_gq's are guaranteed to have access to parent */
struct blkcg_gq *parent;
/* request allocation list for this blkcg-q pair */
struct request_list rl;
/* reference count */
atomic_t refcnt;
/* is this blkg online? protected by both blkcg and q locks */
bool online;
struct blkg_policy_data *pd[BLKCG_MAX_POLS];
struct rcu_head rcu_head;
};
typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
struct blkcg_policy {
int plid;
/* policy specific private data size */
size_t pd_size;
/* cgroup files for the policy */
struct cftype *cftypes;
/* operations */
blkcg_pol_init_pd_fn *pd_init_fn;
blkcg_pol_online_pd_fn *pd_online_fn;
blkcg_pol_offline_pd_fn *pd_offline_fn;
blkcg_pol_exit_pd_fn *pd_exit_fn;
blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
};
extern struct blkcg blkcg_root;
struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q);
int blkcg_init_queue(struct request_queue *q);
void blkcg_drain_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);
/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
u64 (*prfill)(struct seq_file *,
struct blkg_policy_data *, int),
const struct blkcg_policy *pol, int data,
bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
const struct blkg_rwstat *rwstat);
u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
int off);
u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
int off);
struct blkg_conf_ctx {
struct gendisk *disk;
struct blkcg_gq *blkg;
u64 v;
};
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
const char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct blkcg, css) : NULL;
}
static inline struct blkcg *task_blkcg(struct task_struct *tsk)
{
return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
}
static inline struct blkcg *bio_blkcg(struct bio *bio)
{
if (bio && bio->bi_css)
return css_to_blkcg(bio->bi_css);
return task_blkcg(current);
}
/**
* blkcg_parent - get the parent of a blkcg
* @blkcg: blkcg of interest
*
* Return the parent blkcg of @blkcg. Can be called anytime.
*/
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
return css_to_blkcg(blkcg->css.parent);
}
/**
* blkg_to_pd - get policy private data
* @blkg: blkg of interest
* @pol: policy of interest
*
* Return pointer to private data associated with the @blkg-@pol pair.
*/
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol)
{
return blkg ? blkg->pd[pol->plid] : NULL;
}
/**
* pd_to_blkg - get blkg associated with policy private data
* @pd: policy private data of interest
*
* @pd is policy private data. Determine the blkg it's associated with.
*/
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
return pd ? pd->blkg : NULL;
}
/**
* blkg_path - format cgroup path of blkg
* @blkg: blkg of interest
* @buf: target buffer
* @buflen: target buffer length
*
* Format the path of the cgroup of @blkg into @buf.
*/
static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
{
char *p;
p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
if (!p) {
strncpy(buf, "<unavailable>", buflen);
return -ENAMETOOLONG;
}
memmove(buf, p, buf + buflen - p);
return 0;
}
/**
* blkg_get - get a blkg reference
* @blkg: blkg to get
*
* The caller should be holding an existing reference.
*/
static inline void blkg_get(struct blkcg_gq *blkg)
{
WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
atomic_inc(&blkg->refcnt);
}
void __blkg_release_rcu(struct rcu_head *rcu);
/**
* blkg_put - put a blkg reference
* @blkg: blkg to put
*/
static inline void blkg_put(struct blkcg_gq *blkg)
{
WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
if (atomic_dec_and_test(&blkg->refcnt))
call_rcu(&blkg->rcu_head, __blkg_release_rcu);
}
struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
bool update_hint);
/**
* blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
* read locked. If called under either blkcg or queue lock, the iteration
* is guaranteed to include all and only online blkgs. The caller may
* update @pos_css by calling css_rightmost_descendant() to skip subtree.
* @p_blkg is included in the iteration and the first node to be visited.
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
/**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants
* @d_blkg: loop cursor pointing to the current descendant
* @pos_css: used for iteration
* @p_blkg: target blkg to walk descendants of
*
* Similar to blkg_for_each_descendant_pre() but performs post-order
* traversal instead. Synchronization rules are the same. @p_blkg is
* included in the iteration and the last node to be visited.
*/
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
/**
* blk_get_rl - get request_list to use
* @q: request_queue of interest
* @bio: bio which will be attached to the allocated request (may be %NULL)
*
* The caller wants to allocate a request from @q to use for @bio. Find
* the request_list to use and obtain a reference on it. Should be called
* under queue_lock. This function is guaranteed to return non-%NULL
* request_list.
*/
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio)
{
struct blkcg *blkcg;
struct blkcg_gq *blkg;
rcu_read_lock();
blkcg = bio_blkcg(bio);
/* bypass blkg lookup and use @q->root_rl directly for root */
if (blkcg == &blkcg_root)
goto root_rl;
/*
* Try to use blkg->rl. blkg lookup may fail under memory pressure
* or if either the blkcg or queue is going away. Fall back to
* root_rl in such cases.
*/
blkg = blkg_lookup_create(blkcg, q);
if (unlikely(IS_ERR(blkg)))
goto root_rl;
blkg_get(blkg);
rcu_read_unlock();
return &blkg->rl;
root_rl:
rcu_read_unlock();
return &q->root_rl;
}
/**
* blk_put_rl - put request_list
* @rl: request_list to put
*
* Put the reference acquired by blk_get_rl(). Should be called under
* queue_lock.
*/
static inline void blk_put_rl(struct request_list *rl)
{
/* root_rl may not have blkg set */
if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
blkg_put(rl->blkg);
}
/**
* blk_rq_set_rl - associate a request with a request_list
* @rq: request of interest
* @rl: target request_list
*
* Associate @rq with @rl so that accounting and freeing can know the
* request_list @rq came from.
*/
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
{
rq->rl = rl;
}
/**
* blk_rq_rl - return the request_list a request came from
* @rq: request of interest
*
* Return the request_list @rq is allocated from.
*/
static inline struct request_list *blk_rq_rl(struct request *rq)
{
return rq->rl;
}
struct request_list *__blk_queue_next_rl(struct request_list *rl,
struct request_queue *q);
/**
* blk_queue_for_each_rl - iterate through all request_lists of a request_queue
*
* Should be used under queue_lock.
*/
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
static inline void blkg_stat_init(struct blkg_stat *stat)
{
u64_stats_init(&stat->syncp);
}
/**
* blkg_stat_add - add a value to a blkg_stat
* @stat: target blkg_stat
* @val: value to add
*
* Add @val to @stat. The caller is responsible for synchronizing calls to
* this function.
*/
static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
{
u64_stats_update_begin(&stat->syncp);
stat->cnt += val;
u64_stats_update_end(&stat->syncp);
}
/**
* blkg_stat_read - read the current value of a blkg_stat
* @stat: blkg_stat to read
*
* Read the current value of @stat. This function can be called without
* synchronization and takes care of u64 atomicity.
*/
static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
{
unsigned int start;
uint64_t v;
do {
start = u64_stats_fetch_begin_irq(&stat->syncp);
v = stat->cnt;
} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
return v;
}
/**
* blkg_stat_reset - reset a blkg_stat
* @stat: blkg_stat to reset
*/
static inline void blkg_stat_reset(struct blkg_stat *stat)
{
stat->cnt = 0;
}
/**
* blkg_stat_merge - merge a blkg_stat into another
* @to: the destination blkg_stat
* @from: the source
*
* Add @from's count to @to.
*/
static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
{
blkg_stat_add(to, blkg_stat_read(from));
}
static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
{
u64_stats_init(&rwstat->syncp);
}
/**
* blkg_rwstat_add - add a value to a blkg_rwstat
* @rwstat: target blkg_rwstat
* @rw: mask of REQ_{WRITE|SYNC}
* @val: value to add
*
* Add @val to @rwstat. The counters are chosen according to @rw. The
* caller is responsible for synchronizing calls to this function.
*/
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
int rw, uint64_t val)
{
u64_stats_update_begin(&rwstat->syncp);
if (rw & REQ_WRITE)
rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
else
rwstat->cnt[BLKG_RWSTAT_READ] += val;
if (rw & REQ_SYNC)
rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
else
rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
u64_stats_update_end(&rwstat->syncp);
}
/**
* blkg_rwstat_read - read the current values of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
* Read and return the current snapshot of @rwstat. This function can be
* called without synchronization and takes care of u64 atomicity.
*/
static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
{
unsigned int start;
struct blkg_rwstat tmp;
do {
start = u64_stats_fetch_begin_irq(&rwstat->syncp);
tmp = *rwstat;
} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
return tmp;
}
/**
* blkg_rwstat_total - read the total count of a blkg_rwstat
* @rwstat: blkg_rwstat to read
*
* Return the total count of @rwstat regardless of the IO direction. This
* function can be called without synchronization and takes care of u64
* atomicity.
*/
static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
{
struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
}
/**
* blkg_rwstat_reset - reset a blkg_rwstat
* @rwstat: blkg_rwstat to reset
*/
static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
{
memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
}
/**
* blkg_rwstat_merge - merge a blkg_rwstat into another
* @to: the destination blkg_rwstat
* @from: the source
*
* Add @from's counts to @to.
*/
static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
struct blkg_rwstat *from)
{
struct blkg_rwstat v = blkg_rwstat_read(from);
int i;
u64_stats_update_begin(&to->syncp);
for (i = 0; i < BLKG_RWSTAT_NR; i++)
to->cnt[i] += v.cnt[i];
u64_stats_update_end(&to->syncp);
}
#else /* CONFIG_BLK_CGROUP */
struct cgroup;
struct blkcg;
struct blkg_policy_data {
};
struct blkcg_gq {
};
struct blkcg_policy {
};
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_drain_queue(struct request_queue *q) { }
static inline void blkcg_exit_queue(struct request_queue *q) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct request_queue *q,
const struct blkcg_policy *pol) { }
static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline struct request_list *blk_get_rl(struct request_queue *q,
struct bio *bio) { return &q->root_rl; }
static inline void blk_put_rl(struct request_list *rl) { }
static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
#endif /* CONFIG_BLK_CGROUP */
#endif /* _BLK_CGROUP_H */
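
A blkcg policy built on the interface above registers itself roughly as shown below. This is a minimal, hypothetical sketch (the demo_* names are invented; cgroup files, stats and error handling are omitted):

#include <linux/module.h>
#include "blk-cgroup.h"

/* Per-blkg private data; struct blkg_policy_data must come first. */
struct demo_pd {
	struct blkg_policy_data pd;
	u64 ios;
};

static struct blkcg_policy demo_policy;

static void demo_pd_init(struct blkcg_gq *blkg)
{
	struct demo_pd *dpd = container_of(blkg_to_pd(blkg, &demo_policy),
					   struct demo_pd, pd);

	dpd->ios = 0;
}

static struct blkcg_policy demo_policy = {
	.pd_size	= sizeof(struct demo_pd),
	.pd_init_fn	= demo_pd_init,
};

static int __init demo_init(void)
{
	/* blkcg_policy_register() assigns ->plid on success */
	return blkcg_policy_register(&demo_policy);
}

static void __exit demo_exit(void)
{
	blkcg_policy_unregister(&demo_policy);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");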

block/blk-core.c (new file, 3326 lines): diff suppressed because it is too large

block/blk-exec.c (new file, 143 lines)
@@ -0,0 +1,143 @@
/*
* Functions for executing prepared requests and waiting for their completion
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/sched/sysctl.h>
#include "blk.h"
/*
* for max sense size
*/
#include <scsi/scsi_cmnd.h>
/**
* blk_end_sync_rq - executes a completion event on a request
* @rq: request to complete
* @error: end I/O status of the request
*/
static void blk_end_sync_rq(struct request *rq, int error)
{
struct completion *waiting = rq->end_io_data;
rq->end_io_data = NULL;
/*
* complete last; if this is a stacked request, the process (and thus
* the rq pointer) could be invalid right after this complete()
*/
complete(waiting);
}
/**
* blk_execute_rq_nowait - insert a request into queue for execution
* @q: queue to insert the request in
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
* @done: I/O completion handler
*
* Description:
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution. Don't wait for completion.
*
* Note:
* This function will invoke @done directly if the queue is dead.
*/
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head,
rq_end_io_fn *done)
{
int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
bool is_pm_resume;
WARN_ON(irqs_disabled());
WARN_ON(rq->cmd_type == REQ_TYPE_FS);
rq->rq_disk = bd_disk;
rq->end_io = done;
/*
* don't check dying flag for MQ because the request won't
* be reused after the dying flag is set
*/
if (q->mq_ops) {
blk_mq_insert_request(rq, at_head, true, false);
return;
}
/*
* need to check this before __blk_run_queue(), because rq can
* be freed before that returns.
*/
is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME;
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
rq->cmd_flags |= REQ_QUIET;
rq->errors = -ENXIO;
__blk_end_request_all(rq, rq->errors);
spin_unlock_irq(q->queue_lock);
return;
}
__elv_add_request(q, rq, where);
__blk_run_queue(q);
/* the queue is stopped so it won't be run */
if (is_pm_resume)
__blk_run_queue_uncond(q);
spin_unlock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
/**
* blk_execute_rq - insert a request into queue for execution
* @q: queue to insert the request in
* @bd_disk: matching gendisk
* @rq: request to insert
* @at_head: insert request at head or tail of queue
*
* Description:
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
char sense[SCSI_SENSE_BUFFERSIZE];
int err = 0;
unsigned long hang_check;
if (!rq->sense) {
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
}
rq->end_io_data = &wait;
blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
else
wait_for_completion_io(&wait);
if (rq->errors)
err = -EIO;
if (rq->sense == sense) {
rq->sense = NULL;
rq->sense_len = 0;
}
return err;
}
EXPORT_SYMBOL(blk_execute_rq);
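
A typical caller allocates a request, fills it in, and lets blk_execute_rq() sleep until it completes. A hedged sketch, not from this tree (the demo_send_cmd name is invented; it assumes the ~3.18-era API in which blk_get_request() returns an ERR_PTR on failure, and that cmd_len fits in the request's inline CDB):

#include <linux/blkdev.h>
#include <linux/string.h>

/* Illustrative: issue a prepared passthrough command and wait for it. */
static int demo_send_cmd(struct request_queue *q, struct gendisk *disk,
			 const unsigned char *cmd, unsigned int cmd_len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	memcpy(rq->cmd, cmd, cmd_len);	/* caller ensures cmd_len <= BLK_MAX_CDB */
	rq->cmd_len = cmd_len;
	rq->timeout = 60 * HZ;

	err = blk_execute_rq(q, disk, rq, 0);	/* sleeps until completion */

	blk_put_request(rq);
	return err;
}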

block/blk-flush.c (new file, 529 lines)
@@ -0,0 +1,529 @@
/*
* Functions to sequence FLUSH and FUA writes.
*
* Copyright (C) 2011 Max Planck Institute for Gravitational Physics
* Copyright (C) 2011 Tejun Heo <tj@kernel.org>
*
* This file is released under the GPLv2.
*
* REQ_{FLUSH|FUA} requests are decomposed into sequences consisting of three
* optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request
* properties and hardware capability.
*
* If a request doesn't have data, only REQ_FLUSH makes sense, which
* indicates a simple flush request. If there is data, REQ_FLUSH indicates
* that the device cache should be flushed before the data is executed, and
* REQ_FUA means that the data must be on non-volatile media on request
* completion.
*
* If the device doesn't have writeback cache, FLUSH and FUA don't make any
* difference. The requests are either completed immediately if there's no
* data or executed as normal requests otherwise.
*
* If the device has writeback cache and supports FUA, REQ_FLUSH is
* translated to PREFLUSH but REQ_FUA is passed down directly with DATA.
*
* If the device has writeback cache and doesn't support FUA, REQ_FLUSH is
* translated to PREFLUSH and REQ_FUA to POSTFLUSH.
*
* The actual execution of flush is double buffered. Whenever a request
* needs to execute PRE or POSTFLUSH, it queues at
* fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a
* flush is issued and the pending_idx is toggled. When the flush
* completes, all the requests which were pending are proceeded to the next
* step. This allows arbitrary merging of different types of FLUSH/FUA
* requests.
*
* Currently, the following conditions are used to determine when to issue
* flush.
*
* C1. At any given time, only one flush shall be in progress. This makes
* double buffering sufficient.
*
* C2. Flush is deferred if any request is executing DATA of its sequence.
* This avoids issuing separate POSTFLUSHes for requests which shared
* PREFLUSH.
*
* C3. The second condition is ignored if there is a request which has
* waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid
* starvation in the unlikely case where there are continuous stream of
* FUA (without FLUSH) requests.
*
* For devices which support FUA, it isn't clear whether C2 (and thus C3)
* is beneficial.
*
* Note that a sequenced FLUSH/FUA request with DATA is completed twice.
* Once while executing DATA and again after the whole sequence is
* complete. The first completion updates the contained bio but doesn't
* finish it so that the bio submitter is notified only after the whole
* sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in
* req_bio_endio().
*
* The above peculiarity requires that each FLUSH/FUA request has only one
* bio attached to it, which is guaranteed as they aren't allowed to be
* merged in the usual way.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
/* FLUSH/FUA sequences */
enum {
REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */
REQ_FSEQ_DATA = (1 << 1), /* data write in progress */
REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */
REQ_FSEQ_DONE = (1 << 3),
REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA |
REQ_FSEQ_POSTFLUSH,
/*
* If flush has been pending longer than the following timeout,
* it's issued even if flush_data requests are still in flight.
*/
FLUSH_PENDING_TIMEOUT = 5 * HZ,
};
static bool blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq);
static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
unsigned int policy = 0;
if (blk_rq_sectors(rq))
policy |= REQ_FSEQ_DATA;
if (fflags & REQ_FLUSH) {
if (rq->cmd_flags & REQ_FLUSH)
policy |= REQ_FSEQ_PREFLUSH;
if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
policy |= REQ_FSEQ_POSTFLUSH;
}
return policy;
}
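/*
 * Worked example (illustrative): on a queue advertising REQ_FLUSH | REQ_FUA,
 * a data-carrying write with REQ_FLUSH | REQ_FUA set yields
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA (FUA is passed through, so no POSTFLUSH).
 * On a queue advertising only REQ_FLUSH, the same request yields
 * REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH.
 */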
static unsigned int blk_flush_cur_seq(struct request *rq)
{
return 1 << ffz(rq->flush.seq);
}
static void blk_flush_restore_request(struct request *rq)
{
/*
* After flush data completion, @rq->bio is %NULL but we need to
* complete the bio again. @rq->biotail is guaranteed to equal the
* original @rq->bio. Restore it.
*/
rq->bio = rq->biotail;
/* make @rq a normal request */
rq->cmd_flags &= ~REQ_FLUSH_SEQ;
rq->end_io = rq->flush.saved_end_io;
}
static bool blk_flush_queue_rq(struct request *rq, bool add_front)
{
if (rq->q->mq_ops) {
struct request_queue *q = rq->q;
blk_mq_add_to_requeue_list(rq, add_front);
blk_mq_kick_requeue_list(q);
return false;
} else {
if (add_front)
list_add(&rq->queuelist, &rq->q->queue_head);
else
list_add_tail(&rq->queuelist, &rq->q->queue_head);
return true;
}
}
/**
* blk_flush_complete_seq - complete flush sequence
* @rq: FLUSH/FUA request being sequenced
* @fq: flush queue
* @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero)
* @error: whether an error occurred
*
* @rq just completed @seq part of its flush sequence, record the
* completion and trigger the next step.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
*
* RETURNS:
* %true if requests were added to the dispatch queue, %false otherwise.
*/
static bool blk_flush_complete_seq(struct request *rq,
struct blk_flush_queue *fq,
unsigned int seq, int error)
{
struct request_queue *q = rq->q;
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
bool queued = false, kicked;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
if (likely(!error))
seq = blk_flush_cur_seq(rq);
else
seq = REQ_FSEQ_DONE;
switch (seq) {
case REQ_FSEQ_PREFLUSH:
case REQ_FSEQ_POSTFLUSH:
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies;
list_move_tail(&rq->flush.list, pending);
break;
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
queued = blk_flush_queue_rq(rq, true);
break;
case REQ_FSEQ_DONE:
/*
* @rq was previously adjusted by blk_insert_flush() for
* flush sequencing and may already have gone through the
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
BUG_ON(!list_empty(&rq->queuelist));
list_del_init(&rq->flush.list);
blk_flush_restore_request(rq);
if (q->mq_ops)
blk_mq_end_request(rq, error);
else
__blk_end_request_all(rq, error);
break;
default:
BUG();
}
kicked = blk_kick_flush(q, fq);
return kicked | queued;
}
static void flush_end_io(struct request *flush_rq, int error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
bool queued = false;
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
if (q->mq_ops) {
spin_lock_irqsave(&fq->mq_flush_lock, flags);
flush_rq->tag = -1;
}
running = &fq->flush_queue[fq->flush_running_idx];
BUG_ON(fq->flush_pending_idx == fq->flush_running_idx);
/* account completion of the flush request */
fq->flush_running_idx ^= 1;
if (!q->mq_ops)
elv_completed_request(q, flush_rq);
/* and push the waiting requests to the next stage */
list_for_each_entry_safe(rq, n, running, flush.list) {
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
queued |= blk_flush_complete_seq(rq, fq, seq, error);
}
/*
* Kick the queue to avoid stall for two cases:
* 1. Moving a request silently to empty queue_head may stall the
* queue.
* 2. When flush request is running in non-queueable queue, the
* queue is held. Restart the queue after the flush request is finished
* to avoid stall.
* This function is called from request completion path and calling
* directly into request_fn may confuse the driver. Always use
* kblockd.
*/
if (queued || fq->flush_queue_delayed) {
WARN_ON(q->mq_ops);
blk_run_queue_async(q);
}
fq->flush_queue_delayed = 0;
if (q->mq_ops)
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
/**
* blk_kick_flush - consider issuing flush request
* @q: request_queue being kicked
* @fq: flush queue
*
* Flush related states of @q have changed, consider issuing flush request.
* Please read the comment at the top of this file for more info.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
*
* RETURNS:
* %true if flush was issued, %false otherwise.
*/
static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
list_first_entry(pending, struct request, flush.list);
struct request *flush_rq = fq->flush_rq;
/* C1 described at the top of this file */
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return false;
/* C2 and C3 */
if (!list_empty(&fq->flush_data_in_flight) &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return false;
/*
* Issue flush and toggle pending_idx. This makes pending_idx
* different from running_idx, which means flush is in flight.
*/
fq->flush_pending_idx ^= 1;
blk_rq_init(q, flush_rq);
/*
* Borrow tag from the first request since they can't
* be in flight at the same time.
*/
if (q->mq_ops) {
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
}
flush_rq->cmd_type = REQ_TYPE_FS;
flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
return blk_flush_queue_rq(flush_rq, false);
}
static void flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
blk_run_queue_async(q);
}
static void mq_flush_data_end_io(struct request *rq, int error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx = rq->mq_ctx;
unsigned long flags;
struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
hctx = q->mq_ops->map_queue(q, ctx->cpu);
/*
* After populating an empty queue, kick it to avoid stall. Read
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
blk_mq_run_hw_queue(hctx, true);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
/**
* blk_insert_flush - insert a new FLUSH/FUA request
* @rq: request to insert
*
* To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions,
* or from __blk_mq_run_hw_queue() when dispatching a request.
* @rq is being submitted. Analyze what needs to be done and put it on the
* right queue.
*
* CONTEXT:
* spin_lock_irq(q->queue_lock) in !mq case
*/
void blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned int fflags = q->flush_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
/*
* @policy now records what operations need to be done. Adjust
* REQ_FLUSH and FUA for the driver.
*/
rq->cmd_flags &= ~REQ_FLUSH;
if (!(fflags & REQ_FUA))
rq->cmd_flags &= ~REQ_FUA;
/*
* An empty flush handed down from a stacking driver may
* translate into nothing if the underlying device does not
* advertise a write-back cache. In this case, simply
* complete the request.
*/
if (!policy) {
if (q->mq_ops)
blk_mq_end_request(rq, 0);
else
__blk_end_bidi_request(rq, 0, 0, 0);
return;
}
BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
/*
* If there's data but flush is not necessary, the request can be
* processed directly without going through flush machinery. Queue
* for normal execution.
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
if (q->mq_ops) {
blk_mq_insert_request(rq, false, false, true);
} else
list_add_tail(&rq->queuelist, &q->queue_head);
return;
}
/*
* @rq should go through flush machinery. Mark it part of flush
* sequence and submit for further processing.
*/
memset(&rq->flush, 0, sizeof(rq->flush));
INIT_LIST_HEAD(&rq->flush.list);
rq->cmd_flags |= REQ_FLUSH_SEQ;
rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
if (q->mq_ops) {
rq->end_io = mq_flush_data_end_io;
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
return;
}
rq->end_io = flush_data_end_io;
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
* @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
* wish to. If the WAIT flag is not passed, the caller may only check
* whether the request was pushed onto some internal queue for later handling.
*/
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
{
struct request_queue *q;
struct bio *bio;
int ret = 0;
if (bdev->bd_disk == NULL)
return -ENXIO;
q = bdev_get_queue(bdev);
if (!q)
return -ENXIO;
/*
* some block devices may not have their queue correctly set up here
* (e.g. loop device without a backing file) and so issuing a flush
* here will panic. Ensure there is a request function before issuing
* the flush.
*/
if (!q->make_request_fn)
return -ENXIO;
bio = bio_alloc(gfp_mask, 0);
bio->bi_bdev = bdev;
ret = submit_bio_wait(WRITE_FLUSH, bio);
/*
* The driver must store the error location in ->bi_sector, if
* it supports it. For non-stacked drivers, this should be
* copied from blk_rq_pos(rq).
*/
if (error_sector)
*error_sector = bio->bi_iter.bi_sector;
bio_put(bio);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
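/*
 * Typical use (illustrative, not from this file): a filesystem flushing its
 * backing device's write-back cache, e.g. from its ->sync_fs() path:
 *
 *	err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
 */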
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
int node, int cmd_size)
{
struct blk_flush_queue *fq;
int rq_sz = sizeof(struct request);
fq = kzalloc_node(sizeof(*fq), GFP_KERNEL, node);
if (!fq)
goto fail;
if (q->mq_ops) {
spin_lock_init(&fq->mq_flush_lock);
rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
}
fq->flush_rq = kzalloc_node(rq_sz, GFP_KERNEL, node);
if (!fq->flush_rq)
goto fail_rq;
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
return fq;
fail_rq:
kfree(fq);
fail:
return NULL;
}
void blk_free_flush_queue(struct blk_flush_queue *fq)
{
/* bio-based request queues have no flush queue */
if (!fq)
return;
kfree(fq->flush_rq);
kfree(fq);
}

block/blk-integrity.c (new file, 483 lines)
@@ -0,0 +1,483 @@
/*
* blk-integrity.c - Block layer data integrity extensions
*
* Copyright (C) 2007, 2008 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
#include <linux/scatterlist.h>
#include <linux/export.h>
#include <linux/slab.h>
#include "blk.h"
static struct kmem_cache *integrity_cachep;
static const char *bi_unsupported_name = "unsupported";
/**
* blk_rq_count_integrity_sg - Count number of integrity scatterlist elements
* @q: request queue
* @bio: bio with integrity metadata attached
*
* Description: Returns the number of elements required in a
* scatterlist corresponding to the integrity metadata in a bio.
*/
int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio)
{
struct bio_vec iv, ivprv = { NULL };
unsigned int segments = 0;
unsigned int seg_size = 0;
struct bvec_iter iter;
int prev = 0;
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
goto new_segment;
if (seg_size + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
seg_size += iv.bv_len;
} else {
new_segment:
segments++;
seg_size = iv.bv_len;
}
prev = 1;
ivprv = iv;
}
return segments;
}
EXPORT_SYMBOL(blk_rq_count_integrity_sg);
/**
* blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist
* @q: request queue
* @bio: bio with integrity metadata attached
* @sglist: target scatterlist
*
* Description: Map the integrity vectors in request into a
* scatterlist. The scatterlist must be big enough to hold all
* elements. I.e. sized using blk_rq_count_integrity_sg().
*/
int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist)
{
struct bio_vec iv, ivprv = { NULL };
struct scatterlist *sg = NULL;
unsigned int segments = 0;
struct bvec_iter iter;
int prev = 0;
bio_for_each_integrity_vec(iv, bio, iter) {
if (prev) {
if (!BIOVEC_PHYS_MERGEABLE(&ivprv, &iv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &ivprv, &iv))
goto new_segment;
if (sg->length + iv.bv_len > queue_max_segment_size(q))
goto new_segment;
sg->length += iv.bv_len;
} else {
new_segment:
if (!sg)
sg = sglist;
else {
sg_unmark_end(sg);
sg = sg_next(sg);
}
sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset);
segments++;
}
prev = 1;
ivprv = iv;
}
if (sg)
sg_mark_end(sg);
return segments;
}
EXPORT_SYMBOL(blk_rq_map_integrity_sg);
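/*
 * Usage sketch (illustrative, not from the original file): the two helpers
 * above are normally used as a pair - count the integrity segments first,
 * then fill a scatterlist of that size. example_map_prot() is hypothetical.
 */
static int example_map_prot(struct request_queue *q, struct request *rq,
			    struct scatterlist *prot_sglist, int max_ents)
{
	int nr;

	nr = blk_rq_count_integrity_sg(q, rq->bio);
	if (nr > max_ents)
		return -EIO;	/* does not fit the preallocated table */

	sg_init_table(prot_sglist, nr);
	return blk_rq_map_integrity_sg(q, rq->bio, prot_sglist);
}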
/**
* blk_integrity_compare - Compare integrity profile of two disks
* @gd1: Disk to compare
* @gd2: Disk to compare
*
* Description: Meta-devices like DM and MD need to verify that all
* sub-devices use the same integrity format before advertising to
* upper layers that they can send/receive integrity metadata. This
* function can be used to check whether two gendisk devices have
* compatible integrity formats.
*/
int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2)
{
struct blk_integrity *b1 = gd1->integrity;
struct blk_integrity *b2 = gd2->integrity;
if (!b1 && !b2)
return 0;
if (!b1 || !b2)
return -1;
if (b1->interval != b2->interval) {
pr_err("%s: %s/%s protection interval %u != %u\n",
__func__, gd1->disk_name, gd2->disk_name,
b1->interval, b2->interval);
return -1;
}
if (b1->tuple_size != b2->tuple_size) {
printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->tuple_size, b2->tuple_size);
return -1;
}
if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) {
printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->tag_size, b2->tag_size);
return -1;
}
if (strcmp(b1->name, b2->name)) {
printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__,
gd1->disk_name, gd2->disk_name,
b1->name, b2->name);
return -1;
}
return 0;
}
EXPORT_SYMBOL(blk_integrity_compare);
bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
struct request *next)
{
if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
return true;
if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
return false;
if (bio_integrity(req->bio)->bip_flags !=
bio_integrity(next->bio)->bip_flags)
return false;
if (req->nr_integrity_segments + next->nr_integrity_segments >
q->limits.max_integrity_segments)
return false;
return true;
}
EXPORT_SYMBOL(blk_integrity_merge_rq);
bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
struct bio *bio)
{
int nr_integrity_segs;
struct bio *next = bio->bi_next;
if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL)
return true;
if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL)
return false;
if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
return false;
bio->bi_next = NULL;
nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);
bio->bi_next = next;
if (req->nr_integrity_segments + nr_integrity_segs >
q->limits.max_integrity_segments)
return false;
req->nr_integrity_segments += nr_integrity_segs;
return true;
}
EXPORT_SYMBOL(blk_integrity_merge_bio);
struct integrity_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_integrity *, char *);
ssize_t (*store)(struct blk_integrity *, const char *, size_t);
};
static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
struct integrity_sysfs_entry *entry =
container_of(attr, struct integrity_sysfs_entry, attr);
return entry->show(bi, page);
}
static ssize_t integrity_attr_store(struct kobject *kobj,
struct attribute *attr, const char *page,
size_t count)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
struct integrity_sysfs_entry *entry =
container_of(attr, struct integrity_sysfs_entry, attr);
ssize_t ret = 0;
if (entry->store)
ret = entry->store(bi, page, count);
return ret;
}
static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
{
if (bi != NULL && bi->name != NULL)
return sprintf(page, "%s\n", bi->name);
else
return sprintf(page, "none\n");
}
static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
{
if (bi != NULL)
return sprintf(page, "%u\n", bi->tag_size);
else
return sprintf(page, "0\n");
}
static ssize_t integrity_verify_store(struct blk_integrity *bi,
const char *page, size_t count)
{
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
if (val)
bi->flags |= BLK_INTEGRITY_VERIFY;
else
bi->flags &= ~BLK_INTEGRITY_VERIFY;
return count;
}
static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0);
}
static ssize_t integrity_generate_store(struct blk_integrity *bi,
const char *page, size_t count)
{
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
if (val)
bi->flags |= BLK_INTEGRITY_GENERATE;
else
bi->flags &= ~BLK_INTEGRITY_GENERATE;
return count;
}
static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0);
}
static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
{
return sprintf(page, "%u\n",
(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0);
}
static struct integrity_sysfs_entry integrity_format_entry = {
.attr = { .name = "format", .mode = S_IRUGO },
.show = integrity_format_show,
};
static struct integrity_sysfs_entry integrity_tag_size_entry = {
.attr = { .name = "tag_size", .mode = S_IRUGO },
.show = integrity_tag_size_show,
};
static struct integrity_sysfs_entry integrity_verify_entry = {
.attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR },
.show = integrity_verify_show,
.store = integrity_verify_store,
};
static struct integrity_sysfs_entry integrity_generate_entry = {
.attr = { .name = "write_generate", .mode = S_IRUGO | S_IWUSR },
.show = integrity_generate_show,
.store = integrity_generate_store,
};
static struct integrity_sysfs_entry integrity_device_entry = {
.attr = { .name = "device_is_integrity_capable", .mode = S_IRUGO },
.show = integrity_device_show,
};
static struct attribute *integrity_attrs[] = {
&integrity_format_entry.attr,
&integrity_tag_size_entry.attr,
&integrity_verify_entry.attr,
&integrity_generate_entry.attr,
&integrity_device_entry.attr,
NULL,
};
static const struct sysfs_ops integrity_ops = {
.show = &integrity_attr_show,
.store = &integrity_attr_store,
};
static int __init blk_dev_integrity_init(void)
{
integrity_cachep = kmem_cache_create("blkdev_integrity",
sizeof(struct blk_integrity),
0, SLAB_PANIC, NULL);
return 0;
}
subsys_initcall(blk_dev_integrity_init);
static void blk_integrity_release(struct kobject *kobj)
{
struct blk_integrity *bi =
container_of(kobj, struct blk_integrity, kobj);
kmem_cache_free(integrity_cachep, bi);
}
static struct kobj_type integrity_ktype = {
.default_attrs = integrity_attrs,
.sysfs_ops = &integrity_ops,
.release = blk_integrity_release,
};
bool blk_integrity_is_initialized(struct gendisk *disk)
{
struct blk_integrity *bi = blk_get_integrity(disk);
return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0);
}
EXPORT_SYMBOL(blk_integrity_is_initialized);
/**
* blk_integrity_register - Register a gendisk as being integrity-capable
* @disk: struct gendisk pointer to make integrity-aware
* @template: optional integrity profile to register
*
* Description: When a device needs to advertise itself as being able
* to send/receive integrity metadata it must use this function to
* register the capability with the block layer. The template is a
* blk_integrity struct with values appropriate for the underlying
* hardware. If template is NULL the new profile is allocated but
* not filled out. See Documentation/block/data-integrity.txt.
*/
int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
{
struct blk_integrity *bi;
BUG_ON(disk == NULL);
if (disk->integrity == NULL) {
bi = kmem_cache_alloc(integrity_cachep,
GFP_KERNEL | __GFP_ZERO);
if (!bi)
return -1;
if (kobject_init_and_add(&bi->kobj, &integrity_ktype,
&disk_to_dev(disk)->kobj,
"%s", "integrity")) {
kmem_cache_free(integrity_cachep, bi);
return -1;
}
kobject_uevent(&bi->kobj, KOBJ_ADD);
bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE;
bi->interval = queue_logical_block_size(disk->queue);
disk->integrity = bi;
} else
bi = disk->integrity;
/* Use the provided profile as template */
if (template != NULL) {
bi->name = template->name;
bi->generate_fn = template->generate_fn;
bi->verify_fn = template->verify_fn;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
bi->flags |= template->flags;
} else
bi->name = bi_unsupported_name;
disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
return 0;
}
EXPORT_SYMBOL(blk_integrity_register);
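/*
 * Usage sketch (illustrative, not from the original file): a driver whose
 * hardware can generate/verify T10 protection information would typically
 * register a filled-in template at probe time. All example_* names are
 * hypothetical; the callbacks are left NULL purely to keep the sketch
 * self-contained.
 */
static struct blk_integrity example_integrity_template = {
	.name		= "EXAMPLE-DIF-TYPE1-CRC",
	.tuple_size	= 8,	/* guard + app + ref tag per interval */
	.tag_size	= 0,
	/* .generate_fn / .verify_fn would point at the driver's PI helpers */
};

static void example_register_integrity(struct gendisk *disk)
{
	if (blk_integrity_register(disk, &example_integrity_template))
		pr_warn("%s: failed to register integrity profile\n",
			disk->disk_name);
}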
/**
* blk_integrity_unregister - Remove block integrity profile
* @disk: disk whose integrity profile to deallocate
*
* Description: This function frees all memory used by the block
* integrity profile. To be called at device teardown.
*/
void blk_integrity_unregister(struct gendisk *disk)
{
struct blk_integrity *bi;
if (!disk || !disk->integrity)
return;
disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES;
bi = disk->integrity;
kobject_uevent(&bi->kobj, KOBJ_REMOVE);
kobject_del(&bi->kobj);
kobject_put(&bi->kobj);
disk->integrity = NULL;
}
EXPORT_SYMBOL(blk_integrity_unregister);

407
block/blk-ioc.c Normal file
View file

@ -0,0 +1,407 @@
/*
* Functions related to io context handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
}
EXPORT_SYMBOL(get_io_context);
static void icq_free_icq_rcu(struct rcu_head *head)
{
struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
kmem_cache_free(icq->__rcu_icq_cache, icq);
}
/* Exit an icq. Called with both ioc and q locked. */
static void ioc_exit_icq(struct io_cq *icq)
{
struct elevator_type *et = icq->q->elevator->type;
if (icq->flags & ICQ_EXITED)
return;
if (et->ops.elevator_exit_icq_fn)
et->ops.elevator_exit_icq_fn(icq);
icq->flags |= ICQ_EXITED;
}
/* Release an icq. Called with both ioc and q locked. */
static void ioc_destroy_icq(struct io_cq *icq)
{
struct io_context *ioc = icq->ioc;
struct request_queue *q = icq->q;
struct elevator_type *et = q->elevator->type;
lockdep_assert_held(&ioc->lock);
lockdep_assert_held(q->queue_lock);
radix_tree_delete(&ioc->icq_tree, icq->q->id);
hlist_del_init(&icq->ioc_node);
list_del_init(&icq->q_node);
/*
* Both setting lookup hint to and clearing it from @icq are done
* under queue_lock. If it's not pointing to @icq now, it never
* will. Hint assignment itself can race safely.
*/
if (rcu_access_pointer(ioc->icq_hint) == icq)
rcu_assign_pointer(ioc->icq_hint, NULL);
ioc_exit_icq(icq);
/*
* @icq->q might have gone away by the time RCU callback runs
* making it impossible to determine icq_cache. Record it in @icq.
*/
icq->__rcu_icq_cache = et->icq_cache;
call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}
/*
* Slow path for ioc release in put_io_context(). Performs double-lock
* dancing to unlink all icq's and then frees ioc.
*/
static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
unsigned long flags;
/*
* Exiting icq may call into put_io_context() through elevator
* which will trigger lockdep warning. The ioc's are guaranteed to
* be different, use a different locking subclass here. Use
* irqsave variant as there's no spin_lock_irq_nested().
*/
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
struct io_cq, ioc_node);
struct request_queue *q = icq->q;
if (spin_trylock(q->queue_lock)) {
ioc_destroy_icq(icq);
spin_unlock(q->queue_lock);
} else {
spin_unlock_irqrestore(&ioc->lock, flags);
cpu_relax();
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
}
}
spin_unlock_irqrestore(&ioc->lock, flags);
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL(put_io_context);
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
unsigned long flags;
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
/*
* Need ioc lock to walk icq_list and q lock to exit icq. Perform
* reverse double locking. Read comment in ioc_release_fn() for
* explanation on the nested locking annotation.
*/
retry:
spin_lock_irqsave_nested(&ioc->lock, flags, 1);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
if (spin_trylock(icq->q->queue_lock)) {
ioc_exit_icq(icq);
spin_unlock(icq->q->queue_lock);
} else {
spin_unlock_irqrestore(&ioc->lock, flags);
cpu_relax();
goto retry;
}
}
spin_unlock_irqrestore(&ioc->lock, flags);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
/**
* ioc_clear_queue - break any ioc association with the specified queue
* @q: request_queue being cleared
*
* Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
*/
void ioc_clear_queue(struct request_queue *q)
{
lockdep_assert_held(q->queue_lock);
while (!list_empty(&q->icq_list)) {
struct io_cq *icq = list_entry(q->icq_list.next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock(&ioc->lock);
ioc_destroy_icq(icq);
spin_unlock(&ioc->lock);
}
}
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
{
struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
return -ENOMEM;
/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
/*
* Try to install. ioc shouldn't be installed if someone else
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
*/
task_lock(task);
if (!task->io_context &&
(task == current || !(task->flags & PF_EXITING)))
task->io_context = ioc;
else
kmem_cache_free(iocontext_cachep, ioc);
ret = task->io_context ? 0 : -EBUSY;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfp_flags & __GFP_WAIT);
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
}
EXPORT_SYMBOL(get_task_io_context);
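/*
 * Usage sketch (illustrative, not from the original file): an I/O
 * scheduler or driver that wants per-task context looks it up (creating
 * it on first use) and must drop the reference with put_io_context().
 * example_touch_ioc() is hypothetical.
 */
static void example_touch_ioc(struct task_struct *tsk)
{
	struct io_context *ioc;

	ioc = get_task_io_context(tsk, GFP_NOIO, NUMA_NO_NODE);
	if (!ioc)
		return;		/* allocation failed */

	/* ... inspect or update per-task I/O state here ... */

	put_io_context(ioc);
}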
/**
* ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
struct io_cq *icq;
lockdep_assert_held(q->queue_lock);
/*
* icq's are indexed from @ioc using radix tree and hint pointer,
* both of which are protected with RCU. All removals are done
* holding both q and ioc locks, and we're holding q lock - if we
 * find an icq which points to us, it's guaranteed to be valid.
*/
rcu_read_lock();
icq = rcu_dereference(ioc->icq_hint);
if (icq && icq->q == q)
goto out;
icq = radix_tree_lookup(&ioc->icq_tree, q->id);
if (icq && icq->q == q)
rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
else
icq = NULL;
out:
rcu_read_unlock();
return icq;
}
EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest
* @gfp_mask: allocation mask
*
 * Make sure an io_cq linking @ioc and @q exists. If it doesn't exist, it
 * will be created using @gfp_mask.
*
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask)
{
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
icq->ioc = ioc;
icq->q = q;
INIT_LIST_HEAD(&icq->q_node);
INIT_HLIST_NODE(&icq->ioc_node);
/* lock both q and ioc and try to link @icq */
spin_lock_irq(q->queue_lock);
spin_lock(&ioc->lock);
if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
hlist_add_head(&icq->ioc_node, &ioc->icq_list);
list_add(&icq->q_node, &q->icq_list);
if (et->ops.elevator_init_icq_fn)
et->ops.elevator_init_icq_fn(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
spin_unlock(&ioc->lock);
spin_unlock_irq(q->queue_lock);
radix_tree_preload_end();
return icq;
}
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
sizeof(struct io_context), 0, SLAB_PANIC, NULL);
return 0;
}
subsys_initcall(blk_ioc_init);

224
block/blk-iopoll.c Normal file
View file

@ -0,0 +1,224 @@
/*
* Functions related to interrupt-poll handling in the block layer. This
* is similar to NAPI for network devices.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/blk-iopoll.h>
#include <linux/delay.h>
#include "blk.h"
static unsigned int blk_iopoll_budget __read_mostly = 256;
static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
/**
* blk_iopoll_sched - Schedule a run of the iopoll handler
* @iop: The parent iopoll structure
*
* Description:
* Add this blk_iopoll structure to the pending poll list and trigger the
* raise of the blk iopoll softirq. The driver must already have gotten a
* successful return from blk_iopoll_sched_prep() before calling this.
**/
void blk_iopoll_sched(struct blk_iopoll *iop)
{
unsigned long flags;
local_irq_save(flags);
list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_iopoll_sched);
/**
* __blk_iopoll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
* See blk_iopoll_complete(). This function must be called with interrupts
* disabled.
**/
void __blk_iopoll_complete(struct blk_iopoll *iop)
{
list_del(&iop->list);
smp_mb__before_atomic();
clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(__blk_iopoll_complete);
/**
* blk_iopoll_complete - Mark this @iop as un-polled again
* @iop: The parent iopoll structure
*
* Description:
* If a driver consumes less than the assigned budget in its run of the
* iopoll handler, it'll end the polled mode by calling this function. The
* iopoll handler will not be invoked again before blk_iopoll_sched_prep()
* is called.
**/
void blk_iopoll_complete(struct blk_iopoll *iop)
{
unsigned long flags;
local_irq_save(flags);
__blk_iopoll_complete(iop);
local_irq_restore(flags);
}
EXPORT_SYMBOL(blk_iopoll_complete);
static void blk_iopoll_softirq(struct softirq_action *h)
{
struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
int rearm = 0, budget = blk_iopoll_budget;
unsigned long start_time = jiffies;
local_irq_disable();
while (!list_empty(list)) {
struct blk_iopoll *iop;
int work, weight;
/*
* If softirq window is exhausted then punt.
*/
if (budget <= 0 || time_after(jiffies, start_time)) {
rearm = 1;
break;
}
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
iop = list_entry(list->next, struct blk_iopoll, list);
weight = iop->weight;
work = 0;
if (test_bit(IOPOLL_F_SCHED, &iop->state))
work = iop->poll(iop, weight);
budget -= work;
local_irq_disable();
/*
 * Drivers must not modify the iopoll state if they
 * consume their assigned weight (or more - some drivers
 * can't easily just stop processing, they have to
 * complete an entire mask of commands). In such cases
 * this code still "owns" the iopoll instance and
 * therefore can move the instance around on the list
 * at will.
if (work >= weight) {
if (blk_iopoll_disable_pending(iop))
__blk_iopoll_complete(iop);
else
list_move_tail(&iop->list, list);
}
}
if (rearm)
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
/**
* blk_iopoll_disable - Disable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Disable io polling and wait for any pending callbacks to have completed.
**/
void blk_iopoll_disable(struct blk_iopoll *iop)
{
set_bit(IOPOLL_F_DISABLE, &iop->state);
while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state))
msleep(1);
clear_bit(IOPOLL_F_DISABLE, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_disable);
/**
* blk_iopoll_enable - Enable iopoll on this @iop
* @iop: The parent iopoll structure
*
* Description:
* Enable iopoll on this @iop. Note that the handler run will not be
* scheduled, it will only mark it as active.
**/
void blk_iopoll_enable(struct blk_iopoll *iop)
{
BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state));
smp_mb__before_atomic();
clear_bit_unlock(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_enable);
/**
* blk_iopoll_init - Initialize this @iop
* @iop: The parent iopoll structure
* @weight: The default weight (or command completion budget)
* @poll_fn: The handler to invoke
*
* Description:
* Initialize this blk_iopoll structure. Before being actively used, the
* driver must call blk_iopoll_enable().
**/
void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn)
{
memset(iop, 0, sizeof(*iop));
INIT_LIST_HEAD(&iop->list);
iop->weight = weight;
iop->poll = poll_fn;
set_bit(IOPOLL_F_SCHED, &iop->state);
}
EXPORT_SYMBOL(blk_iopoll_init);
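/*
 * Usage sketch (illustrative, not from the original file): the typical
 * driver pattern is init + enable at setup time, switching to polled mode
 * from the interrupt handler, and completing back to IRQ mode from the
 * poll callback. All example_* names are hypothetical;
 * blk_iopoll_sched_prep() is the inline helper from <linux/blk-iopoll.h>
 * referred to in the comments above.
 */
static int example_poll(struct blk_iopoll *iop, int budget)
{
	int done = 0;

	/* ... reap up to @budget completed commands here ... */

	if (done < budget)
		blk_iopoll_complete(iop);	/* hand back to IRQ mode */
	return done;
}

static irqreturn_t example_irq(int irq, void *data)
{
	struct blk_iopoll *iop = data;

	if (blk_iopoll_sched_prep(iop))
		blk_iopoll_sched(iop);		/* defer work to the softirq */
	return IRQ_HANDLED;
}

static void example_setup(struct blk_iopoll *iop)
{
	blk_iopoll_init(iop, 32, example_poll);	/* weight of 32 is arbitrary */
	blk_iopoll_enable(iop);
}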
static int blk_iopoll_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_iopoll, cpu),
this_cpu_ptr(&blk_cpu_iopoll));
__raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
local_irq_enable();
}
return NOTIFY_OK;
}
static struct notifier_block blk_iopoll_cpu_notifier = {
.notifier_call = blk_iopoll_cpu_notify,
};
static __init int blk_iopoll_setup(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i));
open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq);
register_hotcpu_notifier(&blk_iopoll_cpu_notifier);
return 0;
}
subsys_initcall(blk_iopoll_setup);

307
block/blk-lib.c Normal file
View file

@ -0,0 +1,307 @@
/*
 * Functions related to generic block layer helpers
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include "blk.h"
struct bio_batch {
atomic_t done;
unsigned long flags;
struct completion *wait;
};
static void bio_batch_end_io(struct bio *bio, int err)
{
struct bio_batch *bb = bio->bi_private;
if (err && (err != -EOPNOTSUPP))
clear_bit(BIO_UPTODATE, &bb->flags);
if (atomic_dec_and_test(&bb->done))
complete(bb->wait);
bio_put(bio);
}
/**
* blkdev_issue_discard - queue a discard
* @bdev: blockdev to issue discard for
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
* @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
*/
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
int type = REQ_WRITE | REQ_DISCARD | REQ_PRIO;
unsigned int max_discard_sectors, granularity;
int alignment;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
struct blk_plug plug;
if (!q)
return -ENXIO;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
/* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(q->limits.discard_granularity >> 9, 1U);
alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
/*
* Ensure that max_discard_sectors is of the proper
* granularity, so that requests stay aligned after a split.
*/
max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors)) {
/* Avoid infinite loop below. Being cautious never hurts. */
return -EOPNOTSUPP;
}
if (flags & BLKDEV_DISCARD_SECURE) {
if (!blk_queue_secdiscard(q))
return -EOPNOTSUPP;
type |= REQ_SECURE;
}
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
blk_start_plug(&plug);
while (nr_sects) {
unsigned int req_sects;
sector_t end_sect, tmp;
bio = bio_alloc(gfp_mask, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
/*
* If splitting a request, and the next starting sector would be
* misaligned, stop the discard at the previous aligned sector.
*/
end_sect = sector + req_sects;
tmp = end_sect;
if (req_sects < nr_sects &&
sector_div(tmp, granularity) != alignment) {
end_sect = end_sect - alignment;
sector_div(end_sect, granularity);
end_sect = end_sect * granularity + alignment;
req_sects = end_sect - sector;
}
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
bio->bi_private = &bb;
bio->bi_iter.bi_size = req_sects << 9;
nr_sects -= req_sects;
sector = end_sect;
atomic_inc(&bb.done);
submit_bio(type, bio);
/*
* We can loop for a long time in here, if someone does
* full device discards (like mkfs). Be nice and allow
* us to schedule out to avoid softlocking if preempt
* is disabled.
*/
cond_resched();
}
blk_finish_plug(&plug);
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
ret = -EIO;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_discard);
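/*
 * Usage sketch (illustrative, not from the original file): how a
 * filesystem might discard a freed extent. Sectors are 512-byte units;
 * example_discard_extent() is hypothetical.
 */
static int example_discard_extent(struct block_device *bdev,
				  sector_t start, sector_t nr_sects)
{
	int ret;

	ret = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* device cannot discard - safe to ignore */
	return ret;
}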
/**
* blkdev_issue_write_same - queue a write same operation
* @bdev: target blockdev
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @page: page containing data to write
*
* Description:
* Issue a write same request for the sectors in question.
*/
int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask,
struct page *page)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q = bdev_get_queue(bdev);
unsigned int max_write_same_sectors;
struct bio_batch bb;
struct bio *bio;
int ret = 0;
if (!q)
return -ENXIO;
max_write_same_sectors = q->limits.max_write_same_sectors;
if (max_write_same_sectors == 0)
return -EOPNOTSUPP;
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
while (nr_sects) {
bio = bio_alloc(gfp_mask, 1);
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_iter.bi_sector = sector;
bio->bi_end_io = bio_batch_end_io;
bio->bi_bdev = bdev;
bio->bi_private = &bb;
bio->bi_vcnt = 1;
bio->bi_io_vec->bv_page = page;
bio->bi_io_vec->bv_offset = 0;
bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
if (nr_sects > max_write_same_sectors) {
bio->bi_iter.bi_size = max_write_same_sectors << 9;
nr_sects -= max_write_same_sectors;
sector += max_write_same_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
atomic_inc(&bb.done);
submit_bio(REQ_WRITE | REQ_WRITE_SAME, bio);
}
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
ret = -ENOTSUPP;
return ret;
}
EXPORT_SYMBOL(blkdev_issue_write_same);
/**
 * __blkdev_issue_zeroout - generate a number of zero-filled write bios
* @bdev: blockdev to issue
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
 * Generate and issue a number of bios with zero-filled pages.
*/
static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
int ret;
struct bio *bio;
struct bio_batch bb;
unsigned int sz;
DECLARE_COMPLETION_ONSTACK(wait);
atomic_set(&bb.done, 1);
bb.flags = 1 << BIO_UPTODATE;
bb.wait = &wait;
ret = 0;
while (nr_sects != 0) {
bio = bio_alloc(gfp_mask,
min(nr_sects, (sector_t)BIO_MAX_PAGES));
if (!bio) {
ret = -ENOMEM;
break;
}
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
bio->bi_end_io = bio_batch_end_io;
bio->bi_private = &bb;
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE >> 9, nr_sects);
ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
nr_sects -= ret >> 9;
sector += ret >> 9;
if (ret < (sz << 9))
break;
}
ret = 0;
atomic_inc(&bb.done);
submit_bio(WRITE, bio);
}
/* Wait for bios in-flight */
if (!atomic_dec_and_test(&bb.done))
wait_for_completion_io(&wait);
if (!test_bit(BIO_UPTODATE, &bb.flags))
/* One of bios in the batch was completed with error.*/
ret = -EIO;
return ret;
}
/**
* blkdev_issue_zeroout - zero-fill a block range
* @bdev: blockdev to write
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
 * Generate and issue a number of bios with zero-filled pages.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask)
{
if (bdev_write_same(bdev)) {
unsigned char bdn[BDEVNAME_SIZE];
if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
ZERO_PAGE(0)))
return 0;
bdevname(bdev, bdn);
pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
}
return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
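/*
 * Usage sketch (illustrative, not from the original file): zeroing a
 * range before exposing it, e.g. when initializing on-disk metadata. The
 * helper prefers WRITE SAME and silently falls back to plain zero-filled
 * writes, so the caller does not care which path was taken.
 * example_zero_range() is hypothetical.
 */
static int example_zero_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS);
}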

328
block/blk-map.c Normal file
View file

@ -0,0 +1,328 @@
/*
* Functions related to mapping data to requests
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#include "blk.h"
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio)
{
if (!rq->bio)
blk_rq_bio_prep(q, rq, bio);
else if (!ll_back_merge_fn(q, rq, bio))
return -EINVAL;
else {
rq->biotail->bi_next = bio;
rq->biotail = bio;
rq->__data_len += bio->bi_iter.bi_size;
}
return 0;
}
static int __blk_rq_unmap_user(struct bio *bio)
{
int ret = 0;
if (bio) {
if (bio_flagged(bio, BIO_USER_MAPPED))
bio_unmap_user(bio);
else
ret = bio_uncopy_user(bio);
}
return ret;
}
static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned int len, gfp_t gfp_mask)
{
unsigned long uaddr;
struct bio *bio, *orig_bio;
int reading, ret;
reading = rq_data_dir(rq) == READ;
/*
* if alignment requirement is satisfied, map in user pages for
* direct dma. else, set up kernel bounce buffers
*/
uaddr = (unsigned long) ubuf;
if (blk_rq_aligned(q, uaddr, len) && !map_data)
bio = bio_map_user(q, NULL, uaddr, len, reading, gfp_mask);
else
bio = bio_copy_user(q, map_data, uaddr, len, reading, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (map_data && map_data->null_mapped)
bio->bi_flags |= (1 << BIO_NULL_MAPPED);
orig_bio = bio;
blk_queue_bounce(q, &bio);
/*
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
bio_get(bio);
ret = blk_rq_append_bio(q, rq, bio);
if (!ret)
return bio->bi_iter.bi_size;
/* if it was bounced we must call the end io function */
bio_endio(bio, 0);
__blk_rq_unmap_user(orig_bio);
bio_put(bio);
return ret;
}
/**
* blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request structure to fill
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @ubuf: the user buffer
* @len: length of user data
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* before being submitted to the device, as pages mapped may be out of
 * reach. It's the caller's responsibility to make sure this happens. The
* original bio must be passed back in to blk_rq_unmap_user() for proper
* unmapping.
*/
int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
unsigned long bytes_read = 0;
struct bio *bio = NULL;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
return -EINVAL;
if (!len)
return -EINVAL;
if (!ubuf && (!map_data || !map_data->null_mapped))
return -EINVAL;
while (bytes_read != len) {
unsigned long map_len, end, start;
map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
>> PAGE_SHIFT;
start = (unsigned long)ubuf >> PAGE_SHIFT;
/*
* A bad offset could cause us to require BIO_MAX_PAGES + 1
* pages. If this happens we just lower the requested
* mapping len by a page so that we can fit
*/
if (end - start > BIO_MAX_PAGES)
map_len -= PAGE_SIZE;
ret = __blk_rq_map_user(q, rq, map_data, ubuf, map_len,
gfp_mask);
if (ret < 0)
goto unmap_rq;
if (!bio)
bio = rq->bio;
bytes_read += ret;
ubuf += ret;
if (map_data)
map_data->offset += ret;
}
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
return 0;
unmap_rq:
blk_rq_unmap_user(bio);
rq->bio = NULL;
return ret;
}
EXPORT_SYMBOL(blk_rq_map_user);
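/*
 * Usage sketch (illustrative, not from the original file): mapping a user
 * buffer into a passthrough (REQ_TYPE_BLOCK_PC) request, executing it and
 * unmapping afterwards. example_user_pc_io() is hypothetical;
 * blk_get_request()/blk_execute_rq() are the usual companions, and in
 * this kernel generation blk_get_request() is assumed to return an
 * ERR_PTR on failure.
 */
static int example_user_pc_io(struct request_queue *q, struct gendisk *disk,
			      void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int ret;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* rq->cmd / rq->cmd_len would be filled in with the actual CDB here */

	ret = blk_rq_map_user(q, rq, NULL, ubuf, len, GFP_KERNEL);
	if (ret)
		goto out;
	bio = rq->bio;	/* keep the original bio for blk_rq_unmap_user() */

	ret = blk_execute_rq(q, disk, rq, 0);

	if (blk_rq_unmap_user(bio) && !ret)
		ret = -EFAULT;
out:
	blk_put_request(rq);
	return ret;
}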
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to map data to
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iov: pointer to the iovec
* @iov_count: number of elements in the iovec
* @len: I/O byte count
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* before being submitted to the device, as pages mapped may be out of
 * reach. It's the caller's responsibility to make sure this happens. The
* original bio must be passed back in to blk_rq_unmap_user() for proper
* unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, const struct sg_iovec *iov,
int iov_count, unsigned int len, gfp_t gfp_mask)
{
struct bio *bio;
int i, read = rq_data_dir(rq) == READ;
int unaligned = 0;
if (!iov || iov_count <= 0)
return -EINVAL;
for (i = 0; i < iov_count; i++) {
unsigned long uaddr = (unsigned long)iov[i].iov_base;
if (!iov[i].iov_len)
return -EINVAL;
/*
* Keep going so we check length of all segments
*/
if (uaddr & queue_dma_alignment(q))
unaligned = 1;
}
if (unaligned || (q->dma_pad_mask & len) || map_data)
bio = bio_copy_user_iov(q, map_data, iov, iov_count, read,
gfp_mask);
else
bio = bio_map_user_iov(q, NULL, iov, iov_count, read, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (bio->bi_iter.bi_size != len) {
/*
* Grab an extra reference to this bio, as bio_unmap_user()
* expects to be able to drop it twice as it happens on the
* normal IO completion path
*/
bio_get(bio);
bio_endio(bio, 0);
__blk_rq_unmap_user(bio);
return -EINVAL;
}
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
blk_queue_bounce(q, &bio);
bio_get(bio);
blk_rq_bio_prep(q, rq, bio);
return 0;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
/**
* blk_rq_unmap_user - unmap a request with user data
* @bio: start of bio list
*
* Description:
* Unmap a rq previously mapped by blk_rq_map_user(). The caller must
* supply the original rq->bio from the blk_rq_map_user() return, since
* the I/O completion may have changed rq->bio.
*/
int blk_rq_unmap_user(struct bio *bio)
{
struct bio *mapped_bio;
int ret = 0, ret2;
while (bio) {
mapped_bio = bio;
if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
mapped_bio = bio->bi_private;
ret2 = __blk_rq_unmap_user(mapped_bio);
if (ret2 && !ret)
ret = ret2;
mapped_bio = bio;
bio = bio->bi_next;
bio_put(mapped_bio);
}
return ret;
}
EXPORT_SYMBOL(blk_rq_unmap_user);
/**
* blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
* @len: length of user data
* @gfp_mask: memory allocation flags
*
* Description:
* Data will be mapped directly if possible. Otherwise a bounce
* buffer is used. Can be called multiple times to append multiple
* buffers.
*/
int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
unsigned int len, gfp_t gfp_mask)
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
struct bio *bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
return -EINVAL;
if (!len || !kbuf)
return -EINVAL;
do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
if (do_copy)
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
if (!reading)
bio->bi_rw |= REQ_WRITE;
if (do_copy)
rq->cmd_flags |= REQ_COPY_USER;
ret = blk_rq_append_bio(q, rq, bio);
if (unlikely(ret)) {
/* request is too big */
bio_put(bio);
return ret;
}
blk_queue_bounce(q, &rq->bio);
return 0;
}
EXPORT_SYMBOL(blk_rq_map_kern);
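/*
 * Usage sketch (illustrative, not from the original file): sending a
 * kernel buffer with a passthrough request. Stack buffers are handled
 * transparently - blk_rq_map_kern() notices them and copies through a
 * bounce buffer. example_kern_pc_io() is hypothetical, and
 * blk_get_request() is again assumed to return an ERR_PTR on failure.
 */
static int example_kern_pc_io(struct request_queue *q, struct gendisk *disk,
			      void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	rq = blk_get_request(q, WRITE, GFP_KERNEL);
	if (IS_ERR(rq))
		return PTR_ERR(rq);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	/* rq->cmd / rq->cmd_len would carry the actual command here */

	ret = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!ret)
		ret = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return ret;
}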

627
block/blk-merge.c Normal file
View file

@ -0,0 +1,627 @@
/*
* Functions related to segment and merge handling
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>
#include "blk.h"
static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
struct bio *bio,
bool no_sg_merge)
{
struct bio_vec bv, bvprv = { NULL };
int cluster, high, highprv = 1;
unsigned int seg_size, nr_phys_segs;
struct bio *fbio, *bbio;
struct bvec_iter iter;
if (!bio)
return 0;
/*
* This should probably be returning 0, but blk_add_request_payload()
* (Christoph!!!!)
*/
if (bio->bi_rw & REQ_DISCARD)
return 1;
if (bio->bi_rw & REQ_WRITE_SAME)
return 1;
fbio = bio;
cluster = blk_queue_cluster(q);
seg_size = 0;
nr_phys_segs = 0;
high = 0;
for_each_bio(bio) {
bio_for_each_segment(bv, bio, iter) {
/*
* If SG merging is disabled, each bio vector is
* a segment
*/
if (no_sg_merge)
goto new_segment;
/*
* the trick here is making sure that a high page is
* never considered part of another segment, since
* that might change with the bounce page.
*/
high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
if (!high && !highprv && cluster) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
goto new_segment;
seg_size += bv.bv_len;
bvprv = bv;
continue;
}
new_segment:
if (nr_phys_segs == 1 && seg_size >
fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
nr_phys_segs++;
bvprv = bv;
seg_size = bv.bv_len;
highprv = high;
}
bbio = bio;
}
if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
fbio->bi_seg_front_size = seg_size;
if (seg_size > bbio->bi_seg_back_size)
bbio->bi_seg_back_size = seg_size;
return nr_phys_segs;
}
void blk_recalc_rq_segments(struct request *rq)
{
bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
&rq->q->queue_flags);
rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
no_sg_merge);
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
{
unsigned short seg_cnt;
/* estimate segment number by bi_vcnt for non-cloned bio */
if (bio_flagged(bio, BIO_CLONED))
seg_cnt = bio_segments(bio);
else
seg_cnt = bio->bi_vcnt;
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
(seg_cnt < queue_max_segments(q)))
bio->bi_phys_segments = seg_cnt;
else {
struct bio *nxt = bio->bi_next;
bio->bi_next = NULL;
bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
bio->bi_next = nxt;
}
bio->bi_flags |= (1 << BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);
static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
struct bio *nxt)
{
struct bio_vec end_bv = { NULL }, nxt_bv;
struct bvec_iter iter;
if (!blk_queue_cluster(q))
return 0;
if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
queue_max_segment_size(q))
return 0;
if (!bio_has_data(bio))
return 1;
bio_for_each_segment(end_bv, bio, iter)
if (end_bv.bv_len == iter.bi_size)
break;
nxt_bv = bio_iovec(nxt);
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
return 0;
/*
* bio and nxt are contiguous in memory; check if the queue allows
* these two to be merged into one
*/
if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
return 1;
return 0;
}
static inline void
__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
struct scatterlist *sglist, struct bio_vec *bvprv,
struct scatterlist **sg, int *nsegs, int *cluster)
{
int nbytes = bvec->bv_len;
if (*sg && *cluster) {
if ((*sg)->length + nbytes > queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
goto new_segment;
(*sg)->length += nbytes;
} else {
new_segment:
if (!*sg)
*sg = sglist;
else {
/*
* If the driver previously mapped a shorter
* list, we could see a termination bit
* prematurely unless it fully inits the sg
* table on each mapping. We KNOW that there
* must be more entries here or the driver
* would be buggy, so force clear the
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
(*nsegs)++;
}
*bvprv = *bvec;
}
static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
int nsegs, cluster;
nsegs = 0;
cluster = blk_queue_cluster(q);
if (bio->bi_rw & REQ_DISCARD) {
/*
* This is a hack - drivers should be neither modifying the
* biovec, nor relying on bi_vcnt - but because of
* blk_add_request_payload(), a discard bio may or may not have
* a payload we need to set up here (thank you Christoph) and
* bi_vcnt is really the only way of telling if we need to.
*/
if (bio->bi_vcnt)
goto single_segment;
return 0;
}
if (bio->bi_rw & REQ_WRITE_SAME) {
single_segment:
*sg = sglist;
bvec = bio_iovec(bio);
sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
return 1;
}
for_each_bio(bio)
bio_for_each_segment(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
&nsegs, &cluster);
return nsegs;
}
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
int blk_rq_map_sg(struct request_queue *q, struct request *rq,
struct scatterlist *sglist)
{
struct scatterlist *sg = NULL;
int nsegs = 0;
if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
(blk_rq_bytes(rq) & q->dma_pad_mask)) {
unsigned int pad_len =
(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
sg->length += pad_len;
rq->extra_len += pad_len;
}
if (q->dma_drain_size && q->dma_drain_needed(rq)) {
if (rq->cmd_flags & REQ_WRITE)
memset(q->dma_drain_buffer, 0, q->dma_drain_size);
sg->page_link &= ~0x02;
sg = sg_next(sg);
sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
q->dma_drain_size,
((unsigned long)q->dma_drain_buffer) &
(PAGE_SIZE - 1));
nsegs++;
rq->extra_len += q->dma_drain_size;
}
if (sg)
sg_mark_end(sg);
return nsegs;
}
EXPORT_SYMBOL(blk_rq_map_sg);
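/*
 * Usage sketch (illustrative, not from the original file): a driver
 * preparing a request for DMA sizes its scatterlist by
 * rq->nr_phys_segments and hands the count returned here to the DMA
 * mapping API. example_prep_sglist() is hypothetical.
 */
static int example_prep_sglist(struct request_queue *q, struct request *rq,
			       struct scatterlist *sglist, int max_ents)
{
	int nents;

	if (rq->nr_phys_segments > max_ents)
		return -EIO;	/* table too small for this request */

	sg_init_table(sglist, rq->nr_phys_segments);
	nents = blk_rq_map_sg(q, rq, sglist);

	/* nents <= nr_phys_segments: adjacent bvecs may have been merged */
	return nents;
}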
/**
* blk_bio_map_sg - map a bio to a scatterlist
* @q: request_queue in question
* @bio: bio being mapped
* @sglist: scatterlist being mapped
*
* Note:
* Caller must make sure sg can hold bio->bi_phys_segments entries
*
* Will return the number of sg entries setup
*/
int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist)
{
struct scatterlist *sg = NULL;
int nsegs;
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
bio->bi_next = next;
if (sg)
sg_mark_end(sg);
BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
return nsegs;
}
EXPORT_SYMBOL(blk_bio_map_sg);
static inline int ll_new_hw_segment(struct request_queue *q,
struct request *req,
struct bio *bio)
{
int nr_phys_segs = bio_phys_segments(q, bio);
if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
goto no_merge;
if (blk_integrity_merge_bio(q, req, bio) == false)
goto no_merge;
/*
* This will form the start of a new hw segment. Bump both
* counters.
*/
req->nr_phys_segments += nr_phys_segs;
return 1;
no_merge:
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
if (!bio_flagged(req->biotail, BIO_SEG_VALID))
blk_recount_segments(q, req->biotail);
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
return ll_new_hw_segment(q, req, bio);
}
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio)
{
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req)) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
q->last_merge = NULL;
return 0;
}
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
if (!bio_flagged(req->bio, BIO_SEG_VALID))
blk_recount_segments(q, req->bio);
return ll_new_hw_segment(q, req, bio);
}
/*
 * blk-mq uses req->special to carry normal driver per-request payload; it
* does not indicate a prepared command that we cannot merge with.
*/
static bool req_no_special_merge(struct request *req)
{
struct request_queue *q = req->q;
return !q->mq_ops && req->special;
}
static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
struct request *next)
{
int total_phys_segments;
unsigned int seg_size =
req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
/*
	 * First check whether either of the requests is a re-queued
	 * request. We can't merge them if they are.
*/
if (req_no_special_merge(req) || req_no_special_merge(next))
return 0;
/*
* Will it become too large?
*/
if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
blk_rq_get_max_sectors(req))
return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
if (req->nr_phys_segments == 1)
req->bio->bi_seg_front_size = seg_size;
if (next->nr_phys_segments == 1)
next->biotail->bi_seg_back_size = seg_size;
total_phys_segments--;
}
if (total_phys_segments > queue_max_segments(q))
return 0;
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
}
/**
* blk_rq_set_mixed_merge - mark a request as mixed merge
* @rq: request to mark as mixed merge
*
* Description:
* @rq is about to be mixed merged. Make sure the attributes
* which can be mixed are set in each bio and mark @rq as mixed
* merged.
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->cmd_flags & REQ_MIXED_MERGE)
return;
/*
* @rq will no longer represent mixable attributes for all the
* contained bios. It will just track those of the first one.
	 * Distribute the attributes to each bio.
*/
for (bio = rq->bio; bio; bio = bio->bi_next) {
WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
(bio->bi_rw & REQ_FAILFAST_MASK) != ff);
bio->bi_rw |= ff;
}
rq->cmd_flags |= REQ_MIXED_MERGE;
}
static void blk_account_io_merge(struct request *req)
{
if (blk_do_io_stat(req)) {
struct hd_struct *part;
int cpu;
cpu = part_stat_lock();
part = req->part;
part_round_stats(cpu, part);
part_dec_in_flight(part, rq_data_dir(req));
hd_struct_put(part);
part_stat_unlock();
}
}
/*
* Has to be called with the request spinlock acquired
*/
static int attempt_merge(struct request_queue *q, struct request *req,
struct request *next)
{
if (!rq_mergeable(req) || !rq_mergeable(next))
return 0;
if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags))
return 0;
/*
* not contiguous
*/
if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
return 0;
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
|| req_no_special_merge(next))
return 0;
if (req->cmd_flags & REQ_WRITE_SAME &&
!blk_write_same_mergeable(req->bio, next->bio))
return 0;
/*
* If we are allowed to merge, then append bio list
* from next to rq and release next. merge_requests_fn
* will have updated segment counts, update sector
* counts here.
*/
if (!ll_merge_requests_fn(q, req, next))
return 0;
/*
* If failfast settings disagree or any of the two is already
* a mixed merge, mark both as mixed before proceeding. This
* makes sure that all involved bios have mixable attributes
* set properly.
*/
if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
(req->cmd_flags & REQ_FAILFAST_MASK) !=
(next->cmd_flags & REQ_FAILFAST_MASK)) {
blk_rq_set_mixed_merge(req);
blk_rq_set_mixed_merge(next);
}
/*
* At this point we have either done a back merge
* or front merge. We need the smaller start_time of
* the merged requests to be the current request
* for accounting purposes.
*/
if (time_after(req->start_time, next->start_time))
req->start_time = next->start_time;
req->biotail->bi_next = next->bio;
req->biotail = next->biotail;
req->__data_len += blk_rq_bytes(next);
elv_merge_requests(q, req, next);
/*
* 'next' is going away, so update stats accordingly
*/
blk_account_io_merge(next);
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
if (blk_rq_cpu_valid(next))
req->cpu = next->cpu;
/* ownership of bio passed from next to req */
next->bio = NULL;
__blk_put_request(q, next);
return 1;
}
int attempt_back_merge(struct request_queue *q, struct request *rq)
{
struct request *next = elv_latter_request(q, rq);
if (next)
return attempt_merge(q, rq, next);
return 0;
}
int attempt_front_merge(struct request_queue *q, struct request *rq)
{
struct request *prev = elv_former_request(q, rq);
if (prev)
return attempt_merge(q, prev, rq);
return 0;
}
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next)
{
return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
if (!rq_mergeable(rq) || !bio_mergeable(bio))
return false;
if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
return false;
/* different data direction or already started, don't merge */
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
/* must be same device and not a special request */
if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
return false;
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
/* must be using the same buffer */
if (rq->cmd_flags & REQ_WRITE_SAME &&
!blk_write_same_mergeable(rq->bio, bio))
return false;
if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
struct bio_vec *bprev;
bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1];
if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
return false;
}
return true;
}
int blk_try_merge(struct request *rq, struct bio *bio)
{
if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}

67
block/blk-mq-cpu.c Normal file
View file

@ -0,0 +1,67 @@
/*
* CPU notifier helper code for blk-mq
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/llist.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
static LIST_HEAD(blk_mq_cpu_notify_list);
static DEFINE_RAW_SPINLOCK(blk_mq_cpu_notify_lock);
static int blk_mq_main_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long) hcpu;
struct blk_mq_cpu_notifier *notify;
int ret = NOTIFY_OK;
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
ret = notify->notify(notify->data, action, cpu);
if (ret != NOTIFY_OK)
break;
}
raw_spin_unlock(&blk_mq_cpu_notify_lock);
return ret;
}
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
BUG_ON(!notifier->notify);
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
raw_spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
{
raw_spin_lock(&blk_mq_cpu_notify_lock);
list_del(&notifier->list);
raw_spin_unlock(&blk_mq_cpu_notify_lock);
}
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
int (*fn)(void *, unsigned long, unsigned int),
void *data)
{
notifier->notify = fn;
notifier->data = data;
}
void __init blk_mq_cpu_init(void)
{
hotcpu_notifier(blk_mq_main_cpu_notify, 0);
}

119
block/blk-mq-cpumap.c Normal file
View file

@ -0,0 +1,119 @@
/*
* CPU <-> hardware queue mapping helpers
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
const int cpu)
{
return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}
static int get_first_sibling(unsigned int cpu)
{
unsigned int ret;
ret = cpumask_first(topology_thread_cpumask(cpu));
if (ret < nr_cpu_ids)
return ret;
return cpu;
}
int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
{
unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
cpumask_var_t cpus;
if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
return 1;
cpumask_clear(cpus);
nr_cpus = nr_uniq_cpus = 0;
for_each_online_cpu(i) {
nr_cpus++;
first_sibling = get_first_sibling(i);
if (!cpumask_test_cpu(first_sibling, cpus))
nr_uniq_cpus++;
cpumask_set_cpu(i, cpus);
}
queue = 0;
for_each_possible_cpu(i) {
if (!cpu_online(i)) {
map[i] = 0;
continue;
}
/*
* Easy case - we have equal or more hardware queues. Or
* there are no thread siblings to take into account. Do
* 1:1 if enough, or sequential mapping if less.
*/
if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
queue++;
continue;
}
/*
* Fewer than nr_cpus queues, and we have some number of
* threads per core. Map sibling threads to the same
* queue.
*/
first_sibling = get_first_sibling(i);
if (first_sibling == i) {
map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
queue);
queue++;
} else
map[i] = map[first_sibling];
}
free_cpumask_var(cpus);
return 0;
}
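/*
* Worked example (illustrative): 8 online CPUs with 2 hyperthreads per
* core (4 unique cores) and 2 hardware queues. Neither the 1:1 nor the
* uniform case applies, so siblings are collapsed first:
* cpu_to_queue_index(4, 2, q) == q / 2, so the first two cores map to
* queue 0, the last two cores to queue 1, and each sibling thread
* inherits its core's queue.
*/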
unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
{
unsigned int *map;
/* If cpus are offline, map them to first hctx */
map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
set->numa_node);
if (!map)
return NULL;
if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
return map;
kfree(map);
return NULL;
}
/*
* We have no quick way of doing reverse lookups. This is only used at
* queue init time, so runtime isn't important.
*/
int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
{
int i;
for_each_possible_cpu(i) {
if (index == mq_map[i])
return cpu_to_node(i);
}
return NUMA_NO_NODE;
}

461
block/blk-mq-sysfs.c Normal file

@@ -0,0 +1,461 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
#include "blk-mq-tag.h"
static void blk_mq_sysfs_release(struct kobject *kobj)
{
}
struct blk_mq_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_ctx *, char *);
ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
};
struct blk_mq_hw_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
};
static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
char *page)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(ctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct blk_mq_ctx_sysfs_entry *entry;
struct blk_mq_ctx *ctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
ctx = container_of(kobj, struct blk_mq_ctx, kobj);
q = ctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(ctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->show)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->show(hctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
struct attribute *attr, const char *page,
size_t length)
{
struct blk_mq_hw_ctx_sysfs_entry *entry;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
if (!entry->store)
return -EIO;
res = -ENOENT;
mutex_lock(&q->sysfs_lock);
if (!blk_queue_dying(q))
res = entry->store(hctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
ctx->rq_dispatched[0]);
}
static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu\n", ctx->rq_merged);
}
static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
{
return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
ctx->rq_completed[0]);
}
static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
{
char *start_page = page;
struct request *rq;
page += sprintf(page, "%s:\n", msg);
list_for_each_entry(rq, list, queuelist)
page += sprintf(page, "\t%p\n", rq);
return page - start_page;
}
static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
{
ssize_t ret;
spin_lock(&ctx->lock);
ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
spin_unlock(&ctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
return sprintf(page, "%lu\n", hctx->queued);
}
static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%lu\n", hctx->run);
}
static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
char *start_page = page;
int i;
page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
unsigned long d = 1U << (i - 1);
page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
}
return page - start_page;
}
static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
ssize_t ret;
spin_lock(&hctx->lock);
ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
spin_unlock(&hctx->lock);
return ret;
}
static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return blk_mq_tag_sysfs_show(hctx->tags, page);
}
static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
{
return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
}
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
unsigned int i, first = 1;
ssize_t ret = 0;
blk_mq_disable_hotplug();
for_each_cpu(i, hctx->cpumask) {
if (first)
ret += sprintf(ret + page, "%u", i);
else
ret += sprintf(ret + page, ", %u", i);
first = 0;
}
blk_mq_enable_hotplug();
ret += sprintf(ret + page, "\n");
return ret;
}
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
.attr = {.name = "merged", .mode = S_IRUGO },
.show = blk_mq_sysfs_merged_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
.attr = {.name = "completed", .mode = S_IRUGO },
.show = blk_mq_sysfs_completed_show,
};
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
.attr = {.name = "rq_list", .mode = S_IRUGO },
.show = blk_mq_sysfs_rq_list_show,
};
static struct attribute *default_ctx_attrs[] = {
&blk_mq_sysfs_dispatched.attr,
&blk_mq_sysfs_merged.attr,
&blk_mq_sysfs_completed.attr,
&blk_mq_sysfs_rq_list.attr,
NULL,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
.attr = {.name = "queued", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_queued_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
.attr = {.name = "run", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_run_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_dispatched_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
.attr = {.name = "active", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_active_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
.attr = {.name = "pending", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_rq_list_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
.attr = {.name = "tags", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = S_IRUGO },
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
&blk_mq_hw_sysfs_run.attr,
&blk_mq_hw_sysfs_dispatched.attr,
&blk_mq_hw_sysfs_pending.attr,
&blk_mq_hw_sysfs_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
NULL,
};
static const struct sysfs_ops blk_mq_sysfs_ops = {
.show = blk_mq_sysfs_show,
.store = blk_mq_sysfs_store,
};
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
.show = blk_mq_hw_sysfs_show,
.store = blk_mq_hw_sysfs_store,
};
static struct kobj_type blk_mq_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_ctx_ktype = {
.sysfs_ops = &blk_mq_sysfs_ops,
.default_attrs = default_ctx_attrs,
.release = blk_mq_sysfs_release,
};
static struct kobj_type blk_mq_hw_ktype = {
.sysfs_ops = &blk_mq_hw_sysfs_ops,
.default_attrs = default_hw_ctx_attrs,
.release = blk_mq_sysfs_release,
};
static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_ctx *ctx;
int i;
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
return;
hctx_for_each_ctx(hctx, ctx, i)
kobject_del(&ctx->kobj);
kobject_del(&hctx->kobj);
}
static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
int i, ret;
if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
return 0;
ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
if (ret)
return ret;
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
break;
}
return ret;
}
void blk_mq_unregister_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i, j;
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_unregister_hctx(hctx);
hctx_for_each_ctx(hctx, ctx, j)
kobject_put(&ctx->kobj);
kobject_put(&hctx->kobj);
}
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
kobject_put(&q->mq_kobj);
kobject_put(&disk_to_dev(disk)->kobj);
}
static void blk_mq_sysfs_init(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
int i;
kobject_init(&q->mq_kobj, &blk_mq_ktype);
queue_for_each_hw_ctx(q, hctx, i)
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
queue_for_each_ctx(q, ctx, i)
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
}
/* see blk_register_queue() */
void blk_mq_finish_init(struct request_queue *q)
{
percpu_ref_switch_to_percpu(&q->mq_usage_counter);
}
int blk_mq_register_disk(struct gendisk *disk)
{
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
int ret, i;
blk_mq_sysfs_init(q);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
return ret;
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
queue_for_each_hw_ctx(q, hctx, i) {
hctx->flags |= BLK_MQ_F_SYSFS_UP;
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
if (ret) {
blk_mq_unregister_disk(disk);
return ret;
}
return 0;
}
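/*
* Resulting layout (illustrative, for a hypothetical disk "nullb0" with
* two hardware queues): the kobjects registered above appear as
*
*	/sys/block/nullb0/mq/0/cpu0/
*	/sys/block/nullb0/mq/0/cpu1/
*	/sys/block/nullb0/mq/1/cpu2/
*	...
*
* with the per-hctx attributes (queued, run, dispatched, pending, tags,
* cpu_list, active) under mq/<n>/ and the per-ctx attributes
* (dispatched, merged, completed, rq_list) under mq/<n>/cpu<m>/.
*/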
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
}
int blk_mq_sysfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
break;
}
return ret;
}

609
block/blk-mq-tag.c Normal file

@@ -0,0 +1,609 @@
/*
* Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
* over multiple cachelines to avoid ping-pong between multiple submitters
* or submitter and completer. Uses rolling wakeups to avoid falling off
* the scaling cliff when we run out of tags and have to start putting
* submitters to sleep.
*
* Uses active queue tracking to support fairer distribution of tags
* between multiple submitters when a shared tag map is used.
*
* Copyright (C) 2013-2014 Jens Axboe
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
{
int i;
for (i = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
int ret;
ret = find_first_zero_bit(&bm->word, bm->depth);
if (ret < bm->depth)
return true;
}
return false;
}
bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
{
if (!tags)
return true;
return bt_has_free_tags(&tags->bitmap_tags);
}
static inline int bt_index_inc(int index)
{
return (index + 1) & (BT_WAIT_QUEUES - 1);
}
static inline void bt_index_atomic_inc(atomic_t *index)
{
int old = atomic_read(index);
int new = bt_index_inc(old);
atomic_cmpxchg(index, old, new);
}
/*
* If a previously inactive queue goes active, bump the active user count.
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
return true;
}
/*
* Wake up all waiters potentially sleeping on normal (non-reserved) tags
*/
static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
{
struct blk_mq_bitmap_tags *bt;
int i, wake_index;
bt = &tags->bitmap_tags;
wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait))
wake_up(&bs->wait);
wake_index = bt_index_inc(wake_index);
}
}
/*
* If a previously busy queue goes inactive, potential waiters could now
* be allowed to queue. Wake them up and check.
*/
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
blk_mq_tag_wakeup_all(tags);
}
/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
*/
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt)
{
unsigned int depth, users;
if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
return true;
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
/*
* Don't try dividing an ant
*/
if (bt->depth == 1)
return true;
users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;
/*
* Allow at least some tags
*/
depth = max((bt->depth + users - 1) / users, 4U);
return atomic_read(&hctx->nr_active) < depth;
}
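/*
* Worked example (illustrative): with a shared tag map of depth 128 and
* three active queues, each hctx may have up to
* max((128 + 3 - 1) / 3, 4U) == 43 requests in flight before it has to
* wait, so no single queue can monopolise the map.
*/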
static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{
int tag, org_last_tag, end;
bool wrap = last_tag != 0;
org_last_tag = last_tag;
end = bm->depth;
do {
restart:
tag = find_next_zero_bit(&bm->word, end, last_tag);
if (unlikely(tag >= end)) {
/*
* We started with an offset, start from 0 to
* exhaust the map.
*/
if (wrap) {
wrap = false;
end = org_last_tag;
last_tag = 0;
goto restart;
}
return -1;
}
last_tag = tag + 1;
} while (test_and_set_bit(tag, &bm->word));
return tag;
}
/*
* Straightforward bitmap tag implementation, where each bit is a tag
* (cleared == free, and set == busy). The small twist is using per-cpu
* last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
* contexts. This enables us to drastically limit the space searched,
* without dirtying an extra shared cacheline like we would if we stored
* the cache value inside the shared blk_mq_bitmap_tags structure. On top
* of that, each word of tags is in a separate cacheline. This means that
* multiple users will tend to stick to different cachelines, at least
* until the map is exhausted.
*/
static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
unsigned int *tag_cache)
{
unsigned int last_tag, org_last_tag;
int index, i, tag;
if (!hctx_may_queue(hctx, bt))
return -1;
last_tag = org_last_tag = *tag_cache;
index = TAG_TO_INDEX(bt, last_tag);
for (i = 0; i < bt->map_nr; i++) {
tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
if (tag != -1) {
tag += (index << bt->bits_per_word);
goto done;
}
last_tag = 0;
if (++index >= bt->map_nr)
index = 0;
}
*tag_cache = 0;
return -1;
/*
* Only update the cache from the allocation path, if we ended
* up using the specific cached tag.
*/
done:
if (tag == org_last_tag) {
last_tag = tag + 1;
if (last_tag >= bt->depth - 1)
last_tag = 0;
*tag_cache = last_tag;
}
return tag;
}
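/*
* Worked example (illustrative): with the default bits_per_word of 6
* (64 tags per blk_align_bitmap word), a cached last_tag of 130 starts
* the search at word TAG_TO_INDEX(bt, 130) == 130 >> 6 == 2 and bit
* TAG_TO_BIT(bt, 130) == 130 & 63 == 2, moving on to the other words
* only if that one is exhausted.
*/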
static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx)
{
struct bt_wait_state *bs;
int wait_index;
if (!hctx)
return &bt->bs[0];
wait_index = atomic_read(&hctx->wait_index);
bs = &bt->bs[wait_index];
bt_index_atomic_inc(&hctx->wait_index);
return bs;
}
static int bt_get(struct blk_mq_alloc_data *data,
struct blk_mq_bitmap_tags *bt,
struct blk_mq_hw_ctx *hctx,
unsigned int *last_tag)
{
struct bt_wait_state *bs;
DEFINE_WAIT(wait);
int tag;
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
return tag;
if (!(data->gfp & __GFP_WAIT))
return -1;
bs = bt_wait_ptr(bt, hctx);
do {
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
blk_mq_put_ctx(data->ctx);
io_schedule();
data->ctx = blk_mq_get_ctx(data->q);
data->hctx = data->q->mq_ops->map_queue(data->q,
data->ctx->cpu);
if (data->reserved) {
bt = &data->hctx->tags->breserved_tags;
} else {
last_tag = &data->ctx->last_tag;
hctx = data->hctx;
bt = &hctx->tags->bitmap_tags;
}
finish_wait(&bs->wait, &wait);
bs = bt_wait_ptr(bt, hctx);
} while (1);
finish_wait(&bs->wait, &wait);
return tag;
}
static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
int tag;
tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
&data->ctx->last_tag);
if (tag >= 0)
return tag + data->hctx->tags->nr_reserved_tags;
return BLK_MQ_TAG_FAIL;
}
static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
{
int tag, zero = 0;
if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
return tag;
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
if (!data->reserved)
return __blk_mq_get_tag(data);
return __blk_mq_get_reserved_tag(data);
}
static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
{
int i, wake_index;
wake_index = atomic_read(&bt->wake_index);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
struct bt_wait_state *bs = &bt->bs[wake_index];
if (waitqueue_active(&bs->wait)) {
int o = atomic_read(&bt->wake_index);
if (wake_index != o)
atomic_cmpxchg(&bt->wake_index, o, wake_index);
return bs;
}
wake_index = bt_index_inc(wake_index);
}
return NULL;
}
static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
{
const int index = TAG_TO_INDEX(bt, tag);
struct bt_wait_state *bs;
int wait_cnt;
clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
/* Ensure that the wait list checks occur after clear_bit(). */
smp_mb();
bs = bt_wake_ptr(bt);
if (!bs)
return;
wait_cnt = atomic_dec_return(&bs->wait_cnt);
if (unlikely(wait_cnt < 0))
wait_cnt = atomic_inc_return(&bs->wait_cnt);
if (wait_cnt == 0) {
atomic_add(bt->wake_cnt, &bs->wait_cnt);
bt_index_atomic_inc(&bt->wake_index);
wake_up(&bs->wait);
}
}
static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
BUG_ON(tag >= tags->nr_tags);
bt_clear_tag(&tags->bitmap_tags, tag);
}
static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
unsigned int tag)
{
BUG_ON(tag >= tags->nr_reserved_tags);
bt_clear_tag(&tags->breserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag)
{
struct blk_mq_tags *tags = hctx->tags;
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
__blk_mq_put_tag(tags, real_tag);
*last_tag = real_tag;
} else
__blk_mq_put_reserved_tag(tags, tag);
}
static void bt_for_each(struct blk_mq_hw_ctx *hctx,
struct blk_mq_bitmap_tags *bt, unsigned int off,
busy_iter_fn *fn, void *data, bool reserved)
{
struct request *rq;
int bit, i;
for (i = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
for (bit = find_first_bit(&bm->word, bm->depth);
bit < bm->depth;
bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
if (rq->q == hctx->queue)
fn(hctx, rq, data, reserved);
}
off += (1 << bt->bits_per_word);
}
}
void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
void *priv)
{
struct blk_mq_tags *tags = hctx->tags;
if (tags->nr_reserved_tags)
bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
false);
}
EXPORT_SYMBOL(blk_mq_tag_busy_iter);
static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
{
unsigned int i, used;
for (i = 0, used = 0; i < bt->map_nr; i++) {
struct blk_align_bitmap *bm = &bt->map[i];
used += bitmap_weight(&bm->word, bm->depth);
}
return bt->depth - used;
}
static void bt_update_count(struct blk_mq_bitmap_tags *bt,
unsigned int depth)
{
unsigned int tags_per_word = 1U << bt->bits_per_word;
unsigned int map_depth = depth;
if (depth) {
int i;
for (i = 0; i < bt->map_nr; i++) {
bt->map[i].depth = min(map_depth, tags_per_word);
map_depth -= bt->map[i].depth;
}
}
bt->wake_cnt = BT_WAIT_BATCH;
if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
bt->depth = depth;
}
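/*
* Worked example (illustrative): for depth == 256, depth / BT_WAIT_QUEUES
* is 32, which is larger than BT_WAIT_BATCH, so wake_cnt stays at 8.
* For depth == 32 it is clamped to max(1U, 32 / 8) == 4, so shallow maps
* wake waiters in smaller batches.
*/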
static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
int node, bool reserved)
{
int i;
bt->bits_per_word = ilog2(BITS_PER_LONG);
/*
* Depth can be zero for reserved tags, that's not a failure
* condition.
*/
if (depth) {
unsigned int nr, tags_per_word;
tags_per_word = (1 << bt->bits_per_word);
/*
* If the tag space is small, shrink the number of tags
* per word so we spread over a few cachelines, at least.
* If less than 4 tags, just forget about it, it's not
* going to work optimally anyway.
*/
if (depth >= 4) {
while (tags_per_word * 4 > depth) {
bt->bits_per_word--;
tags_per_word = (1 << bt->bits_per_word);
}
}
nr = ALIGN(depth, tags_per_word) / tags_per_word;
bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
GFP_KERNEL, node);
if (!bt->map)
return -ENOMEM;
bt->map_nr = nr;
}
bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
if (!bt->bs) {
kfree(bt->map);
bt->map = NULL;
return -ENOMEM;
}
bt_update_count(bt, depth);
for (i = 0; i < BT_WAIT_QUEUES; i++) {
init_waitqueue_head(&bt->bs[i].wait);
atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
}
return 0;
}
static void bt_free(struct blk_mq_bitmap_tags *bt)
{
kfree(bt->map);
kfree(bt->bs);
}
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
if (bt_alloc(&tags->bitmap_tags, depth, node, false))
goto enomem;
if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
goto enomem;
return tags;
enomem:
bt_free(&tags->bitmap_tags);
kfree(tags);
return NULL;
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, int node)
{
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
pr_err("blk-mq: tag depth too large\n");
return NULL;
}
tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
if (!tags)
return NULL;
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
return blk_mq_init_bitmap_tags(tags, node);
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
bt_free(&tags->bitmap_tags);
bt_free(&tags->breserved_tags);
kfree(tags);
}
void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
{
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
*tag = prandom_u32() % depth;
}
int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
{
tdepth -= tags->nr_reserved_tags;
if (tdepth > tags->nr_tags)
return -EINVAL;
/*
* Don't need (or can't) update reserved tags here, they remain
* static and should never need resizing.
*/
bt_update_count(&tags->bitmap_tags, tdepth);
blk_mq_tag_wakeup_all(tags);
return 0;
}
ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
{
char *orig_page = page;
unsigned int free, res;
if (!tags)
return 0;
page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
"bits_per_word=%u\n",
tags->nr_tags, tags->nr_reserved_tags,
tags->bitmap_tags.bits_per_word);
free = bt_unused_tags(&tags->bitmap_tags);
res = bt_unused_tags(&tags->breserved_tags);
page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
return page - orig_page;
}

88
block/blk-mq-tag.h Normal file

@@ -0,0 +1,88 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
#include "blk-mq.h"
enum {
BT_WAIT_QUEUES = 8,
BT_WAIT_BATCH = 8,
};
struct bt_wait_state {
atomic_t wait_cnt;
wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;
#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
struct blk_mq_bitmap_tags {
unsigned int depth;
unsigned int wake_cnt;
unsigned int bits_per_word;
unsigned int map_nr;
struct blk_align_bitmap *map;
atomic_t wake_index;
struct bt_wait_state *bs;
};
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct blk_mq_bitmap_tags bitmap_tags;
struct blk_mq_bitmap_tags breserved_tags;
struct request **rqs;
struct list_head page_list;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
enum {
BLK_MQ_TAG_CACHE_MIN = 1,
BLK_MQ_TAG_CACHE_MAX = 64,
};
enum {
BLK_MQ_TAG_FAIL = -1U,
BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
};
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return false;
return __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
return;
__blk_mq_tag_idle(hctx);
}
#endif

2142
block/blk-mq.c Normal file

File diff suppressed because it is too large

118
block/blk-mq.h Normal file

@@ -0,0 +1,118 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
struct blk_mq_tag_set;
struct blk_mq_ctx {
struct {
spinlock_t lock;
struct list_head rq_list;
} ____cacheline_aligned_in_smp;
unsigned int cpu;
unsigned int index_hw;
unsigned int last_tag ____cacheline_aligned_in_smp;
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void __blk_mq_complete_request(struct request *rq);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
void blk_mq_clone_flush_request(struct request *flush_rq,
struct request *orig_rq);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
/*
* CPU hotplug helpers
*/
struct blk_mq_cpu_notifier;
void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
int (*fn)(void *, unsigned long, unsigned int),
void *data);
void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
void blk_mq_cpu_init(void);
void blk_mq_enable_hotplug(void);
void blk_mq_disable_hotplug(void);
/*
* CPU -> queue mappings
*/
extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
/*
* sysfs helpers
*/
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
/*
* Basic implementation of a sparser bitmap, allowing the user to spread
* the bits over more cachelines.
*/
struct blk_align_bitmap {
unsigned long word;
unsigned long depth;
} ____cacheline_aligned_in_smp;
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
return per_cpu_ptr(q->queue_ctx, cpu);
}
/*
* This assumes per-cpu software queueing queues. They could be per-node
* as well, for instance. For now this is hardcoded as-is. Note that we don't
* care about preemption, since we know the ctx's are persistent. This does
* mean that we can't rely on ctx always matching the currently running CPU.
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
return __blk_mq_get_ctx(q, get_cpu());
}
static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
put_cpu();
}
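/*
* Usage sketch (illustrative): callers pair the two so the CPU is only
* pinned while the per-cpu ctx is in use.
*
*	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
*	... work on ctx, e.g. add a request to ctx->rq_list ...
*	blk_mq_put_ctx(ctx);
*/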
struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
gfp_t gfp;
bool reserved;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
struct request_queue *q, gfp_t gfp, bool reserved,
struct blk_mq_ctx *ctx,
struct blk_mq_hw_ctx *hctx)
{
data->q = q;
data->gfp = gfp;
data->reserved = reserved;
data->ctx = ctx;
data->hctx = hctx;
}
#endif

863
block/blk-settings.c Normal file

@@ -0,0 +1,863 @@
/*
* Functions related to setting various queue properties from drivers
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
#include <linux/gcd.h>
#include <linux/lcm.h>
#include <linux/jiffies.h>
#include <linux/gfp.h>
#include "blk.h"
unsigned long blk_max_low_pfn;
EXPORT_SYMBOL(blk_max_low_pfn);
unsigned long blk_max_pfn;
/**
* blk_queue_prep_rq - set a prepare_request function for queue
* @q: queue
* @pfn: prepare_request function
*
* It's possible for a queue to register a prepare_request callback which
* is invoked before the request is handed to the request_fn. The goal of
* the function is to prepare a request for I/O, it can be used to build a
* cdb from the request data for instance.
*
*/
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
{
q->prep_rq_fn = pfn;
}
EXPORT_SYMBOL(blk_queue_prep_rq);
/**
* blk_queue_unprep_rq - set an unprepare_request function for queue
* @q: queue
* @ufn: unprepare_request function
*
* It's possible for a queue to register an unprepare_request callback
* which is invoked before the request is finally completed. The goal
* of the function is to deallocate any data that was allocated in the
* prepare_request callback.
*
*/
void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
{
q->unprep_rq_fn = ufn;
}
EXPORT_SYMBOL(blk_queue_unprep_rq);
/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
* @mbfn: merge_bvec_fn
*
* Usually queues have static limitations on the max sectors or segments that
* we can put in a request. Stacking drivers may have some settings that
* are dynamic, and thus we have to query the queue whether it is ok to
* add a new bio_vec to a bio at a given offset or not. If the block device
* has such limitations, it needs to register a merge_bvec_fn to control
* the size of bios sent to it. Note that a block device *must* allow a
* single page to be added to an empty bio. The block device driver may want
* to use the bio_split() function to deal with these bios. By default
* no merge_bvec_fn is defined for a queue, and only the fixed limits are
* honored.
*/
void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
{
q->merge_bvec_fn = mbfn;
}
EXPORT_SYMBOL(blk_queue_merge_bvec);
void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
{
q->softirq_done_fn = fn;
}
EXPORT_SYMBOL(blk_queue_softirq_done);
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
q->rq_timeout = timeout;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
{
q->rq_timed_out_fn = fn;
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
{
q->lld_busy_fn = fn;
}
EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->discard_zeroes_data = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
lim->cluster = 1;
}
EXPORT_SYMBOL(blk_set_default_limits);
/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state. Should be used
* by stacking drivers like DM that have no internal limits.
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
blk_set_default_limits(lim);
/* Inherit limits from component devices */
lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* @mfn: the alternate make_request function
*
* Description:
* The normal way for &struct bios to be passed to a device
* driver is for them to be collected into requests on a request
* queue, and then to allow the device driver to select requests
* off that queue when it is ready. This works well for many block
* devices. However some block devices (typically virtual devices
* such as md or lvm) do not benefit from the processing on the
* request queue, and are served best by having the requests passed
* directly to them. This can be achieved by providing a function
* to blk_queue_make_request().
*
* Caveat:
* The driver that does this *must* be able to deal appropriately
* with buffers in "highmemory". This can be accomplished by either calling
* __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
* blk_queue_bounce() to create a buffer in normal memory.
**/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
/*
* set defaults
*/
q->nr_requests = BLKDEV_MAX_RQ;
q->make_request_fn = mfn;
blk_queue_dma_alignment(q, 511);
blk_queue_congestion_threshold(q);
q->nr_batching = BLK_BATCH_REQ;
blk_set_default_limits(&q->limits);
/*
* by default assume old behaviour and bounce for any highmem page
*/
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
EXPORT_SYMBOL(blk_queue_make_request);
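/*
* Usage sketch (hypothetical bio-based driver): my_make_request() is a
* placeholder for the driver's own make_request_fn, and q would come
* from blk_alloc_queue().
*
*	static void my_make_request(struct request_queue *q, struct bio *bio);
*
*	blk_queue_make_request(q, my_make_request);
*/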
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
* @max_addr: the maximum address the device can handle
*
* Description:
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
* buffers for doing I/O to pages residing above @max_addr.
**/
void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
{
unsigned long b_pfn = max_addr >> PAGE_SHIFT;
int dma = 0;
q->bounce_gfp = GFP_NOIO;
#if BITS_PER_LONG == 64
/*
* Assume anything <= 4GB can be handled by IOMMU. Actually
* some IOMMUs can handle everything, but I don't know of a
* way to test this here.
*/
if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
dma = 1;
q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
#else
if (b_pfn < blk_max_low_pfn)
dma = 1;
q->limits.bounce_pfn = b_pfn;
#endif
if (dma) {
init_emergency_isa_pool();
q->bounce_gfp = GFP_NOIO | GFP_DMA;
q->limits.bounce_pfn = b_pfn;
}
}
EXPORT_SYMBOL(blk_queue_bounce_limit);
/**
* blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request
* @limits: the queue limits
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
* Enables a low level driver to set a hard upper limit,
* max_hw_sectors, on the size of requests. max_hw_sectors is set by
* the device driver based upon the combined capabilities of I/O
* controller and storage device.
*
* max_sectors is a soft limit imposed by the block layer for
* filesystem type requests. This value can be overridden on a
* per-device basis in /sys/block/<device>/queue/max_sectors_kb.
* The soft limit can not exceed max_hw_sectors.
**/
void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors)
{
if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_hw_sectors);
}
limits->max_hw_sectors = max_hw_sectors;
limits->max_sectors = min_t(unsigned int, max_hw_sectors,
BLK_DEF_MAX_SECTORS);
}
EXPORT_SYMBOL(blk_limits_max_hw_sectors);
/**
* blk_queue_max_hw_sectors - set max sectors for a request for this queue
* @q: the request queue for the device
* @max_hw_sectors: max hardware sectors in the usual 512b unit
*
* Description:
* See description for blk_limits_max_hw_sectors().
**/
void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
{
blk_limits_max_hw_sectors(&q->limits, max_hw_sectors);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
/**
* blk_queue_chunk_sectors - set size of the chunk for this queue
* @q: the request queue for the device
* @chunk_sectors: chunk sectors in the usual 512b unit
*
* Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set
* this limit and prevent merging across chunks. Note that the chunk size
* must currently be a power-of-2 in sectors. Also note that the block
* layer must accept a page worth of data at any offset. So if the
* crossing of chunks is a hard limitation in the driver, it must still be
* prepared to split single page bios.
**/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{
BUG_ON(!is_power_of_2(chunk_sectors));
q->limits.chunk_sectors = chunk_sectors;
}
EXPORT_SYMBOL(blk_queue_chunk_sectors);
/**
* blk_queue_max_discard_sectors - set max sectors for a single discard
* @q: the request queue for the device
* @max_discard_sectors: maximum number of sectors to discard
**/
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
q->limits.max_discard_sectors = max_discard_sectors;
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
* blk_queue_max_write_same_sectors - set max sectors for a single write same
* @q: the request queue for the device
* @max_write_same_sectors: maximum number of sectors to write per command
**/
void blk_queue_max_write_same_sectors(struct request_queue *q,
unsigned int max_write_same_sectors)
{
q->limits.max_write_same_sectors = max_write_same_sectors;
}
EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
*
* Description:
* Enables a low level driver to set an upper limit on the number of
* hw data segments in a request.
**/
void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
{
if (!max_segments) {
max_segments = 1;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_segments);
}
q->limits.max_segments = max_segments;
}
EXPORT_SYMBOL(blk_queue_max_segments);
/**
* blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
* @q: the request queue for the device
* @max_size: max size of segment in bytes
*
* Description:
* Enables a low level driver to set an upper limit on the size of a
* coalesced segment
**/
void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_CACHE_SIZE) {
max_size = PAGE_CACHE_SIZE;
printk(KERN_INFO "%s: set to minimum %d\n",
__func__, max_size);
}
q->limits.max_segment_size = max_size;
}
EXPORT_SYMBOL(blk_queue_max_segment_size);
/**
* blk_queue_logical_block_size - set logical block size for the queue
* @q: the request queue for the device
* @size: the logical block size, in bytes
*
* Description:
* This should be set to the lowest possible block size that the
* storage device can address. The default of 512 covers most
* hardware.
**/
void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
{
q->limits.logical_block_size = size;
if (q->limits.physical_block_size < size)
q->limits.physical_block_size = size;
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_logical_block_size);
/**
* blk_queue_physical_block_size - set physical block size for the queue
* @q: the request queue for the device
* @size: the physical block size, in bytes
*
* Description:
* This should be set to the lowest possible sector size that the
* hardware can operate on without reverting to read-modify-write
* operations.
*/
void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
{
q->limits.physical_block_size = size;
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size;
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_physical_block_size);
/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
*
* Description:
* Some devices are naturally misaligned to compensate for things like
* the legacy DOS partition table 63-sector offset. Low-level drivers
* should call this function for devices whose first sector is not
* naturally aligned.
*/
void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
{
q->limits.alignment_offset =
offset & (q->limits.physical_block_size - 1);
q->limits.misaligned = 0;
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
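/*
* Worked example (illustrative): a drive with 4096-byte physical blocks
* whose data starts at the legacy 63-sector offset is 63 * 512 == 32256
* bytes into the media; 32256 & (4096 - 1) == 3584, so the queue reports
* an alignment_offset of 3584 bytes.
*/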
/**
* blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits
* @min: smallest I/O size in bytes
*
* Description:
* Some devices have an internal block size bigger than the reported
* hardware sector size. This function can be used to signal the
* smallest I/O the device can perform without incurring a performance
* penalty.
*/
void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
{
limits->io_min = min;
if (limits->io_min < limits->logical_block_size)
limits->io_min = limits->logical_block_size;
if (limits->io_min < limits->physical_block_size)
limits->io_min = limits->physical_block_size;
}
EXPORT_SYMBOL(blk_limits_io_min);
/**
* blk_queue_io_min - set minimum request size for the queue
* @q: the request queue for the device
* @min: smallest I/O size in bytes
*
* Description:
* Storage devices may report a granularity or preferred minimum I/O
* size which is the smallest request the device can perform without
* incurring a performance penalty. For disk drives this is often the
* physical block size. For RAID arrays it is often the stripe chunk
* size. A properly aligned multiple of minimum_io_size is the
* preferred request size for workloads where a high number of I/O
* operations is desired.
*/
void blk_queue_io_min(struct request_queue *q, unsigned int min)
{
blk_limits_io_min(&q->limits, min);
}
EXPORT_SYMBOL(blk_queue_io_min);
/**
* blk_limits_io_opt - set optimal request size for a device
* @limits: the queue limits
* @opt: optimal request size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
{
limits->io_opt = opt;
}
EXPORT_SYMBOL(blk_limits_io_opt);
/**
* blk_queue_io_opt - set optimal request size for the queue
* @q: the request queue for the device
* @opt: optimal request size in bytes
*
* Description:
* Storage devices may report an optimal I/O size, which is the
* device's preferred unit for sustained I/O. This is rarely reported
* for disk drives. For RAID arrays it is usually the stripe width or
* the internal track size. A properly aligned multiple of
* optimal_io_size is the preferred request size for workloads where
* sustained throughput is desired.
*/
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
}
EXPORT_SYMBOL(blk_queue_io_opt);
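/*
* Worked example (illustrative): a RAID5 array with a 64 KiB chunk and
* four data disks might advertise
*
*	blk_queue_io_min(q, 64 * 1024);		(the chunk size)
*	blk_queue_io_opt(q, 4 * 64 * 1024);	(the full stripe width)
*
* so that well-behaved users issue 256 KiB, stripe-aligned requests.
*/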
/**
* blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
* @t: the stacking driver (top)
* @b: the underlying device (bottom)
**/
void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
{
blk_stack_limits(&t->limits, &b->limits, 0);
}
EXPORT_SYMBOL(blk_queue_stack_limits);
/**
* blk_stack_limits - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @b: the underlying queue limits (bottom, component device)
* @start: first data sector within component device
*
* Description:
* This function is used by stacking drivers like MD and DM to ensure
* that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*
* Returns 0 if the top and bottom queue_limits are compatible. The
* top device's block sizes and alignment offsets may be adjusted to
* ensure alignment with the bottom device. If no compatible sizes
* and alignments exist, -1 is returned and the resulting top
* queue_limits will have the misaligned flag set to indicate that
* the alignment_offset is undefined.
*/
int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t start)
{
unsigned int top, bottom, alignment, ret = 0;
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
b->max_integrity_segments);
t->max_segment_size = min_not_zero(t->max_segment_size,
b->max_segment_size);
t->misaligned |= b->misaligned;
alignment = queue_limit_alignment_offset(b, start);
/* Bottom device has different alignment. Check that it is
* compatible with the current top alignment.
*/
if (t->alignment_offset != alignment) {
top = max(t->physical_block_size, t->io_min)
+ t->alignment_offset;
bottom = max(b->physical_block_size, b->io_min) + alignment;
/* Verify that top and bottom intervals line up */
if (max(top, bottom) % min(top, bottom)) {
t->misaligned = 1;
ret = -1;
}
}
t->logical_block_size = max(t->logical_block_size,
b->logical_block_size);
t->physical_block_size = max(t->physical_block_size,
b->physical_block_size);
t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm(t->io_opt, b->io_opt);
t->cluster &= b->cluster;
t->discard_zeroes_data &= b->discard_zeroes_data;
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
t->physical_block_size = t->logical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Minimum I/O a multiple of the physical block size? */
if (t->io_min & (t->physical_block_size - 1)) {
t->io_min = t->physical_block_size;
t->misaligned = 1;
ret = -1;
}
/* Optimal I/O a multiple of the physical block size? */
if (t->io_opt & (t->physical_block_size - 1)) {
t->io_opt = 0;
t->misaligned = 1;
ret = -1;
}
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
/* Find lowest common alignment_offset */
t->alignment_offset = lcm(t->alignment_offset, alignment)
% max(t->physical_block_size, t->io_min);
/* Verify that new alignment_offset is on a logical block boundary */
if (t->alignment_offset & (t->logical_block_size - 1)) {
t->misaligned = 1;
ret = -1;
}
/* Discard alignment and granularity */
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
if (t->discard_granularity != 0 &&
t->discard_alignment != alignment) {
top = t->discard_granularity + t->discard_alignment;
bottom = b->discard_granularity + alignment;
/* Verify that top and bottom intervals line up */
if ((max(top, bottom) % min(top, bottom)) != 0)
t->discard_misaligned = 1;
}
t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
b->max_discard_sectors);
t->discard_granularity = max(t->discard_granularity,
b->discard_granularity);
t->discard_alignment = lcm(t->discard_alignment, alignment) %
t->discard_granularity;
}
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
* bdev_stack_limits - adjust queue limits for stacked drivers
* @t: the stacking driver limits (top device)
* @bdev: the component block_device (bottom)
* @start: first data sector within component device
*
* Description:
* Merges queue limits for a top device and a block_device. Returns
* 0 if alignment didn't change. Returns -1 if adding the bottom
* device caused misalignment.
*/
int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
sector_t start)
{
struct request_queue *bq = bdev_get_queue(bdev);
start += get_start_sect(bdev);
return blk_stack_limits(t, &bq->limits, start);
}
EXPORT_SYMBOL(bdev_stack_limits);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
{
struct request_queue *t = disk->queue;
if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
disk_name(disk, 0, top);
bdevname(bdev, bottom);
printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
top, bottom);
}
}
EXPORT_SYMBOL(disk_stack_limits);
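/*
* Usage sketch (illustrative): a stacking driver starts from permissive
* defaults and folds in every component device; the names below are
* placeholders for the driver's own state.
*
*	blk_set_stacking_limits(&top_limits);
*	for each component bdev:
*		bdev_stack_limits(&top_limits, bdev, data_offset >> 9);
*
* disk_stack_limits() does the same for a gendisk-based top device and
* additionally warns if the result ends up misaligned.
*/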
/**
* blk_queue_dma_pad - set pad mask
* @q: the request queue for the device
* @mask: pad mask
*
* Set dma pad mask.
*
* Appending pad buffer to a request modifies the last entry of a
* scatter list such that it includes the pad buffer.
**/
void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
{
q->dma_pad_mask = mask;
}
EXPORT_SYMBOL(blk_queue_dma_pad);
/**
* blk_queue_update_dma_pad - update pad mask
* @q: the request queue for the device
* @mask: pad mask
*
* Update dma pad mask.
*
* Appending pad buffer to a request modifies the last entry of a
* scatter list such that it includes the pad buffer.
**/
void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
{
if (mask > q->dma_pad_mask)
q->dma_pad_mask = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
* blk_queue_dma_drain - Set up a drain buffer for excess dma.
* @q: the request queue for the device
* @dma_drain_needed: fn which returns non-zero if drain is necessary
* @buf: physically contiguous buffer
* @size: size of the buffer in bytes
*
* Some devices have excess DMA problems and can't simply discard (or
* zero fill) the unwanted piece of the transfer. They have to have a
* real area of memory to transfer it into. The use case for this is
* ATAPI devices in DMA mode. If the packet command causes a transfer
* bigger than the transfer size some HBAs will lock up if there
* aren't DMA elements to contain the excess transfer. What this API
* does is adjust the queue so that the buf is always appended
* silently to the scatterlist.
*
* Note: This routine adjusts max_hw_segments to make room for appending
* the drain buffer. If you call blk_queue_max_segments() after calling
* this routine, you must set the limit to one fewer than your device
* can support otherwise there won't be room for the drain buffer.
*/
int blk_queue_dma_drain(struct request_queue *q,
dma_drain_needed_fn *dma_drain_needed,
void *buf, unsigned int size)
{
if (queue_max_segments(q) < 2)
return -EINVAL;
/* make room for appending the drain */
blk_queue_max_segments(q, queue_max_segments(q) - 1);
q->dma_drain_needed = dma_drain_needed;
q->dma_drain_buffer = buf;
q->dma_drain_size = size;
return 0;
}
EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
**/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_CACHE_SIZE - 1) {
mask = PAGE_CACHE_SIZE - 1;
printk(KERN_INFO "%s: set to minimum %lx\n",
__func__, mask);
}
q->limits.seg_boundary_mask = mask;
}
EXPORT_SYMBOL(blk_queue_segment_boundary);
/**
* blk_queue_dma_alignment - set dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* set required memory and length alignment for direct dma transactions.
* this is used when building direct io requests for the queue.
*
**/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
{
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_dma_alignment);
/**
* blk_queue_update_dma_alignment - update dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
*
* description:
* update required memory and length alignment for direct dma transactions.
* If the requested alignment is larger than the current alignment, then
* the current queue alignment is updated to the new value, otherwise it
* is left alone. The design of this is to allow multiple objects
* (driver, device, transport etc) to set their respective
* alignments without having them interfere.
*
**/
void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
{
BUG_ON(mask > PAGE_SIZE);
if (mask > q->dma_alignment)
q->dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
/**
* blk_queue_flush - configure queue's cache flush capability
* @q: the request queue for the device
* @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
*
 * Tell the block layer the cache flush capability of @q. If it supports
* flushing, REQ_FLUSH should be set. If it supports bypassing
* write cache for individual writes, REQ_FUA should be set.
*/
void blk_queue_flush(struct request_queue *q, unsigned int flush)
{
WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
flush &= ~REQ_FUA;
q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
}
EXPORT_SYMBOL_GPL(blk_queue_flush);
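/*
* Example (illustrative sketch, not part of the original file): a driver
* for a device with a volatile write cache advertises REQ_FLUSH, and
* additionally REQ_FUA if the device honours forced-unit-access writes;
* a write-through device passes 0.
*/
static void example_advertise_cache(struct request_queue *q, bool wc, bool fua)
{
if (!wc)
blk_queue_flush(q, 0);
else
blk_queue_flush(q, fua ? REQ_FLUSH | REQ_FUA : REQ_FLUSH);
}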
void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
{
q->flush_not_queueable = !queueable;
}
EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
blk_max_pfn = max_pfn - 1;
return 0;
}
subsys_initcall(blk_settings_init);

186
block/blk-softirq.c Normal file
View file

@ -0,0 +1,186 @@
/*
* Functions related to softirq rq completions
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include "blk.h"
static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
/*
* Softirq action handler - move entries to local list and loop over them
* while passing them to the queue registered handler.
*/
static void blk_done_softirq(struct softirq_action *h)
{
struct list_head *cpu_list, local_list;
local_irq_disable();
cpu_list = this_cpu_ptr(&blk_cpu_done);
list_replace_init(cpu_list, &local_list);
local_irq_enable();
while (!list_empty(&local_list)) {
struct request *rq;
rq = list_entry(local_list.next, struct request, ipi_list);
list_del_init(&rq->ipi_list);
rq->q->softirq_done_fn(rq);
}
}
#ifdef CONFIG_SMP
static void trigger_softirq(void *data)
{
struct request *rq = data;
unsigned long flags;
struct list_head *list;
local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_restore(flags);
}
/*
* Setup and invoke a run of 'trigger_softirq' on the given cpu.
*/
static int raise_blk_irq(int cpu, struct request *rq)
{
if (cpu_online(cpu)) {
struct call_single_data *data = &rq->csd;
data->func = trigger_softirq;
data->info = rq;
data->flags = 0;
smp_call_function_single_async(cpu, data);
return 0;
}
return 1;
}
#else /* CONFIG_SMP */
static int raise_blk_irq(int cpu, struct request *rq)
{
return 1;
}
#endif
static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
void *hcpu)
{
/*
* If a CPU goes away, splice its entries to the current CPU
* and trigger a run of the softirq
*/
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
int cpu = (unsigned long) hcpu;
local_irq_disable();
list_splice_init(&per_cpu(blk_cpu_done, cpu),
this_cpu_ptr(&blk_cpu_done));
raise_softirq_irqoff(BLOCK_SOFTIRQ);
local_irq_enable();
}
return NOTIFY_OK;
}
static struct notifier_block blk_cpu_notifier = {
.notifier_call = blk_cpu_notify,
};
void __blk_complete_request(struct request *req)
{
int ccpu, cpu;
struct request_queue *q = req->q;
unsigned long flags;
bool shared = false;
BUG_ON(!q->softirq_done_fn);
local_irq_save(flags);
cpu = smp_processor_id();
/*
* Select completion CPU
*/
if (req->cpu != -1) {
ccpu = req->cpu;
if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
shared = cpus_share_cache(cpu, ccpu);
} else
ccpu = cpu;
/*
* If the current CPU and the requested CPU share a cache, run the
* softirq on the current CPU. This might look like it defeats
* QUEUE_FLAG_SAME_FORCE, but it doesn't: blk_complete_request() runs
* from the interrupt handler, and since the I/O controller doesn't
* support multiple interrupts, the current CPU is effectively unique
* here. Handling the completion locally avoids sending an IPI from
* the current CPU to the first CPU of the group.
*/
if (ccpu == cpu || shared) {
struct list_head *list;
do_local:
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&req->ipi_list, list);
/*
* if the list only contains our just added request,
* signal a raise of the softirq. If there are already
* entries there, someone already raised the irq but it
* hasn't run yet.
*/
if (list->next == &req->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
} else if (raise_blk_irq(ccpu, req))
goto do_local;
local_irq_restore(flags);
}
/**
* blk_complete_request - end I/O on a request
* @req: the request being processed
*
* Description:
* Ends all I/O on a request. It does not handle partial completions,
* unless the driver actually implements this in its completion callback
* through requeueing. The actual completion happens out-of-order,
* through a softirq handler. The user must have registered a completion
* callback through blk_queue_softirq_done().
**/
void blk_complete_request(struct request *req)
{
if (unlikely(blk_should_fake_timeout(req->q)))
return;
if (!blk_mark_rq_complete(req))
__blk_complete_request(req);
}
EXPORT_SYMBOL(blk_complete_request);
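/*
* Example (illustrative sketch, not part of the original file): the
* expected pairing.  The driver registers a completion callback with
* blk_queue_softirq_done() at init time and calls blk_complete_request()
* from its hard-IRQ path; the heavier end-of-request work then runs in
* BLOCK_SOFTIRQ context via the callback.  The helper names are made up.
*/
static void example_softirq_done(struct request *rq)
{
/* runs from blk_done_softirq(), not from the hard IRQ */
blk_end_request_all(rq, 0);
}
static void example_init_completions(struct request_queue *q)
{
blk_queue_softirq_done(q, example_softirq_done);
}
static void example_irq_saw_completion(struct request *rq)
{
/* cheap part of completion; defers the rest to the softirq */
blk_complete_request(rq);
}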
static __init int blk_softirq_init(void)
{
int i;
for_each_possible_cpu(i)
INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
register_hotcpu_notifier(&blk_cpu_notifier);
return 0;
}
subsys_initcall(blk_softirq_init);

612
block/blk-sysfs.c Normal file
View file

@ -0,0 +1,612 @@
/*
* Functions related to sysfs handling
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-mq.h"
struct queue_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct request_queue *, char *);
ssize_t (*store)(struct request_queue *, const char *, size_t);
};
static ssize_t
queue_var_show(unsigned long var, char *page)
{
return sprintf(page, "%lu\n", var);
}
static ssize_t
queue_var_store(unsigned long *var, const char *page, size_t count)
{
int err;
unsigned long v;
err = kstrtoul(page, 10, &v);
if (err || v > UINT_MAX)
return -EINVAL;
*var = v;
return count;
}
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
return queue_var_show(q->nr_requests, (page));
}
static ssize_t
queue_requests_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
if (!q->request_fn && !q->mq_ops)
return -EINVAL;
ret = queue_var_store(&nr, page, count);
if (ret < 0)
return ret;
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
if (q->request_fn)
err = blk_update_nr_requests(q, nr);
else
err = blk_mq_update_nr_requests(q, nr);
if (err)
return err;
return ret;
}
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
unsigned long ra_kb = q->backing_dev_info.ra_pages <<
(PAGE_CACHE_SHIFT - 10);
return queue_var_show(ra_kb, (page));
}
static ssize_t
queue_ra_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
return ret;
}
static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
{
int max_sectors_kb = queue_max_sectors(q) >> 1;
return queue_var_show(max_sectors_kb, (page));
}
static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_max_segments(q), (page));
}
static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.max_integrity_segments, (page));
}
static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
{
if (blk_queue_cluster(q))
return queue_var_show(queue_max_segment_size(q), (page));
return queue_var_show(PAGE_CACHE_SIZE, (page));
}
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_logical_block_size(q), page);
}
static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_physical_block_size(q), page);
}
static ssize_t queue_io_min_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_io_min(q), page);
}
static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_io_opt(q), page);
}
static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page)
{
return queue_var_show(q->limits.discard_granularity, page);
}
static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)q->limits.max_discard_sectors << 9);
}
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
return queue_var_show(queue_discard_zeroes_data(q), page);
}
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
{
return sprintf(page, "%llu\n",
(unsigned long long)q->limits.max_write_same_sectors << 9);
}
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
return -EINVAL;
spin_lock_irq(q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
spin_unlock_irq(q->queue_lock);
return ret;
}
static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
{
int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
return queue_var_show(max_hw_sectors_kb, (page));
}
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
queue_show_##name(struct request_queue *q, char *page) \
{ \
int bit; \
bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
return queue_var_show(neg ? !bit : bit, page); \
} \
static ssize_t \
queue_store_##name(struct request_queue *q, const char *page, size_t count) \
{ \
unsigned long val; \
ssize_t ret; \
ret = queue_var_store(&val, page, count); \
if (ret < 0) \
return ret; \
if (neg) \
val = !val; \
\
spin_lock_irq(q->queue_lock); \
if (val) \
queue_flag_set(QUEUE_FLAG_##flag, q); \
else \
queue_flag_clear(QUEUE_FLAG_##flag, q); \
spin_unlock_irq(q->queue_lock); \
return ret; \
}
QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
{
return queue_var_show((blk_queue_nomerges(q) << 1) |
blk_queue_noxmerges(q), page);
}
static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
size_t count)
{
unsigned long nm;
ssize_t ret = queue_var_store(&nm, page, count);
if (ret < 0)
return ret;
spin_lock_irq(q->queue_lock);
queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
if (nm == 2)
queue_flag_set(QUEUE_FLAG_NOMERGES, q);
else if (nm)
queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
spin_unlock_irq(q->queue_lock);
return ret;
}
static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
{
bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
return queue_var_show(set << force, page);
}
static ssize_t
queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
{
ssize_t ret = -EINVAL;
#ifdef CONFIG_SMP
unsigned long val;
ret = queue_var_store(&val, page, count);
if (ret < 0)
return ret;
spin_lock_irq(q->queue_lock);
if (val == 2) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
} else if (val == 1) {
queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
} else if (val == 0) {
queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
}
spin_unlock_irq(q->queue_lock);
#endif
return ret;
}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
.store = queue_requests_store,
};
static struct queue_sysfs_entry queue_ra_entry = {
.attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
.show = queue_ra_show,
.store = queue_ra_store,
};
static struct queue_sysfs_entry queue_max_sectors_entry = {
.attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
.show = queue_max_sectors_show,
.store = queue_max_sectors_store,
};
static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
.attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
.show = queue_max_hw_sectors_show,
};
static struct queue_sysfs_entry queue_max_segments_entry = {
.attr = {.name = "max_segments", .mode = S_IRUGO },
.show = queue_max_segments_show,
};
static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
.attr = {.name = "max_integrity_segments", .mode = S_IRUGO },
.show = queue_max_integrity_segments_show,
};
static struct queue_sysfs_entry queue_max_segment_size_entry = {
.attr = {.name = "max_segment_size", .mode = S_IRUGO },
.show = queue_max_segment_size_show,
};
static struct queue_sysfs_entry queue_iosched_entry = {
.attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
.show = elv_iosched_show,
.store = elv_iosched_store,
};
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_logical_block_size_entry = {
.attr = {.name = "logical_block_size", .mode = S_IRUGO },
.show = queue_logical_block_size_show,
};
static struct queue_sysfs_entry queue_physical_block_size_entry = {
.attr = {.name = "physical_block_size", .mode = S_IRUGO },
.show = queue_physical_block_size_show,
};
static struct queue_sysfs_entry queue_io_min_entry = {
.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
.show = queue_io_min_show,
};
static struct queue_sysfs_entry queue_io_opt_entry = {
.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
.show = queue_io_opt_show,
};
static struct queue_sysfs_entry queue_discard_granularity_entry = {
.attr = {.name = "discard_granularity", .mode = S_IRUGO },
.show = queue_discard_granularity_show,
};
static struct queue_sysfs_entry queue_discard_max_entry = {
.attr = {.name = "discard_max_bytes", .mode = S_IRUGO },
.show = queue_discard_max_show,
};
static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
.attr = {.name = "discard_zeroes_data", .mode = S_IRUGO },
.show = queue_discard_zeroes_data_show,
};
static struct queue_sysfs_entry queue_write_same_max_entry = {
.attr = {.name = "write_same_max_bytes", .mode = S_IRUGO },
.show = queue_write_same_max_show,
};
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_nonrot,
.store = queue_store_nonrot,
};
static struct queue_sysfs_entry queue_nomerges_entry = {
.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
.show = queue_nomerges_show,
.store = queue_nomerges_store,
};
static struct queue_sysfs_entry queue_rq_affinity_entry = {
.attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
.show = queue_rq_affinity_show,
.store = queue_rq_affinity_store,
};
static struct queue_sysfs_entry queue_iostats_entry = {
.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_iostats,
.store = queue_store_iostats,
};
static struct queue_sysfs_entry queue_random_entry = {
.attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR },
.show = queue_show_random,
.store = queue_store_random,
};
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
&queue_io_min_entry.attr,
&queue_io_opt_entry.attr,
&queue_discard_granularity_entry.attr,
&queue_discard_max_entry.attr,
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
&queue_random_entry.attr,
NULL,
};
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
ssize_t res;
if (!entry->show)
return -EIO;
mutex_lock(&q->sysfs_lock);
if (blk_queue_dying(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
res = entry->show(q, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
static ssize_t
queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
struct request_queue *q;
ssize_t res;
if (!entry->store)
return -EIO;
q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
if (blk_queue_dying(q)) {
mutex_unlock(&q->sysfs_lock);
return -ENOENT;
}
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
kmem_cache_free(blk_requestq_cachep, q);
}
/**
* blk_release_queue: - release a &struct request_queue when it is no longer needed
* @kobj: the kobj belonging to the request queue to be released
*
* Description:
* blk_release_queue is the pair to blk_init_queue() or
* blk_queue_make_request(). It should be called when a request queue is
* being released; typically when a block device is being de-registered.
 * Currently, its primary task is to free all the &struct request
* structures that were allocated to the queue and the queue itself.
*
* Note:
* The low level driver must have finished any outstanding requests first
* via blk_cleanup_queue().
**/
static void blk_release_queue(struct kobject *kobj)
{
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
blkcg_exit_queue(q);
if (q->elevator) {
spin_lock_irq(q->queue_lock);
ioc_clear_queue(q);
spin_unlock_irq(q->queue_lock);
elevator_exit(q->elevator);
}
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
__blk_queue_free_tags(q);
if (!q->mq_ops)
blk_free_flush_queue(q->fq);
blk_trace_shutdown(q);
bdi_destroy(&q->backing_dev_info);
ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
};
struct kobj_type blk_queue_ktype = {
.sysfs_ops = &queue_sysfs_ops,
.default_attrs = default_attrs,
.release = blk_release_queue,
};
int blk_register_queue(struct gendisk *disk)
{
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
if (WARN_ON(!q))
return -ENXIO;
/*
* SCSI probing may synchronously create and destroy a lot of
* request_queues for non-existent devices. Shutting down a fully
 * functional queue takes measurable wallclock time as RCU grace
* periods are involved. To avoid excessive latency in these
* cases, a request_queue starts out in a degraded mode which is
* faster to shut down and is made fully functional here as
* request_queues for non-existent devices never get registered.
*/
if (!blk_queue_init_done(q)) {
queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
blk_queue_bypass_end(q);
if (q->mq_ops)
blk_mq_finish_init(q);
}
ret = blk_trace_init_sysfs(dev);
if (ret)
return ret;
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
blk_trace_remove_sysfs(dev);
return ret;
}
kobject_uevent(&q->kobj, KOBJ_ADD);
if (q->mq_ops)
blk_mq_register_disk(disk);
if (!q->request_fn)
return 0;
ret = elv_register_queue(q);
if (ret) {
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
return 0;
}
void blk_unregister_queue(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
if (WARN_ON(!q))
return;
if (q->mq_ops)
blk_mq_unregister_disk(disk);
if (q->request_fn)
elv_unregister_queue(q);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(disk_to_dev(disk));
kobject_put(&disk_to_dev(disk)->kobj);
}

385
block/blk-tag.c Normal file
View file

@ -0,0 +1,385 @@
/*
* Functions related to tagged command queuing
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk.h"
/**
* blk_queue_find_tag - find a request by its tag and queue
* @q: The request queue for the device
* @tag: The tag of the request
*
* Notes:
* Should be used when a device returns a tag and you want to match
* it with a request.
*
* no locks need be held.
**/
struct request *blk_queue_find_tag(struct request_queue *q, int tag)
{
return blk_map_queue_find_tag(q->queue_tags, tag);
}
EXPORT_SYMBOL(blk_queue_find_tag);
/**
* blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
* Drop the reference count on @bqt and frees it when the last reference
* is dropped.
*/
void blk_free_tags(struct blk_queue_tag *bqt)
{
if (atomic_dec_and_test(&bqt->refcnt)) {
BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
bqt->max_depth);
kfree(bqt->tag_index);
bqt->tag_index = NULL;
kfree(bqt->tag_map);
bqt->tag_map = NULL;
kfree(bqt);
}
}
EXPORT_SYMBOL(blk_free_tags);
/**
* __blk_queue_free_tags - release tag maintenance info
* @q: the request queue for the device
*
* Notes:
* blk_cleanup_queue() will take care of calling this function, if tagging
* has been used. So there's no need to call this directly.
**/
void __blk_queue_free_tags(struct request_queue *q)
{
struct blk_queue_tag *bqt = q->queue_tags;
if (!bqt)
return;
blk_free_tags(bqt);
q->queue_tags = NULL;
queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
}
/**
* blk_queue_free_tags - release tag maintenance info
* @q: the request queue for the device
*
* Notes:
 *  This is used to disable tagged queuing on a device, while leaving
 *  the queue itself functional.
**/
void blk_queue_free_tags(struct request_queue *q)
{
queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
}
EXPORT_SYMBOL(blk_queue_free_tags);
static int
init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
{
struct request **tag_index;
unsigned long *tag_map;
int nr_ulongs;
if (q && depth > q->nr_requests * 2) {
depth = q->nr_requests * 2;
printk(KERN_ERR "%s: adjusted depth to %d\n",
__func__, depth);
}
tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
if (!tag_index)
goto fail;
nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
if (!tag_map)
goto fail;
tags->real_max_depth = depth;
tags->max_depth = depth;
tags->tag_index = tag_index;
tags->tag_map = tag_map;
return 0;
fail:
kfree(tag_index);
return -ENOMEM;
}
static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
int depth)
{
struct blk_queue_tag *tags;
tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
if (!tags)
goto fail;
if (init_tag_map(q, tags, depth))
goto fail;
atomic_set(&tags->refcnt, 1);
return tags;
fail:
kfree(tags);
return NULL;
}
/**
* blk_init_tags - initialize the tag info for an external tag map
* @depth: the maximum queue depth supported
**/
struct blk_queue_tag *blk_init_tags(int depth)
{
return __blk_queue_init_tags(NULL, depth);
}
EXPORT_SYMBOL(blk_init_tags);
/**
* blk_queue_init_tags - initialize the queue tag info
* @q: the request queue for the device
* @depth: the maximum queue depth supported
* @tags: the tag to use
*
* Queue lock must be held here if the function is called to resize an
* existing map.
**/
int blk_queue_init_tags(struct request_queue *q, int depth,
struct blk_queue_tag *tags)
{
int rc;
BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
if (!tags && !q->queue_tags) {
tags = __blk_queue_init_tags(q, depth);
if (!tags)
return -ENOMEM;
} else if (q->queue_tags) {
rc = blk_queue_resize_tags(q, depth);
if (rc)
return rc;
queue_flag_set(QUEUE_FLAG_QUEUED, q);
return 0;
} else
atomic_inc(&tags->refcnt);
/*
* assign it, all done
*/
q->queue_tags = tags;
queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
INIT_LIST_HEAD(&q->tag_busy_list);
return 0;
}
EXPORT_SYMBOL(blk_queue_init_tags);
/**
* blk_queue_resize_tags - change the queueing depth
* @q: the request queue for the device
* @new_depth: the new max command queueing depth
*
* Notes:
* Must be called with the queue lock held.
**/
int blk_queue_resize_tags(struct request_queue *q, int new_depth)
{
struct blk_queue_tag *bqt = q->queue_tags;
struct request **tag_index;
unsigned long *tag_map;
int max_depth, nr_ulongs;
if (!bqt)
return -ENXIO;
/*
* If real_max_depth is already large enough, just adjust
* max_depth. *NOTE* requests with tag values between
* new_depth and real_max_depth can still be in flight, so the
* tag map cannot be shrunk blindly here.
*/
if (new_depth <= bqt->real_max_depth) {
bqt->max_depth = new_depth;
return 0;
}
/*
* Currently cannot replace a shared tag map with a new
* one, so error out if this is the case
*/
if (atomic_read(&bqt->refcnt) != 1)
return -EBUSY;
/*
* save the old state info, so we can copy it back
*/
tag_index = bqt->tag_index;
tag_map = bqt->tag_map;
max_depth = bqt->real_max_depth;
if (init_tag_map(q, bqt, new_depth))
return -ENOMEM;
memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
kfree(tag_index);
kfree(tag_map);
return 0;
}
EXPORT_SYMBOL(blk_queue_resize_tags);
/**
* blk_queue_end_tag - end tag operations for a request
* @q: the request queue for the device
* @rq: the request that has completed
*
* Description:
* Typically called when end_that_request_first() returns %0, meaning
* all transfers have been done for a request. It's important to call
* this function before end_that_request_last(), as that will put the
* request back on the free list thus corrupting the internal tag list.
*
* Notes:
* queue lock must be held.
**/
void blk_queue_end_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
unsigned tag = rq->tag; /* negative tags invalid */
BUG_ON(tag >= bqt->real_max_depth);
list_del_init(&rq->queuelist);
rq->cmd_flags &= ~REQ_QUEUED;
rq->tag = -1;
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
__func__, tag);
bqt->tag_index[tag] = NULL;
if (unlikely(!test_bit(tag, bqt->tag_map))) {
printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
__func__, tag);
return;
}
/*
* The tag_map bit acts as a lock for tag_index[bit], so we need
* unlock memory barrier semantics.
*/
clear_bit_unlock(tag, bqt->tag_map);
}
EXPORT_SYMBOL(blk_queue_end_tag);
/**
* blk_queue_start_tag - find a free tag and assign it
* @q: the request queue for the device
* @rq: the block request that needs tagging
*
* Description:
* This can either be used as a stand-alone helper, or possibly be
* assigned as the queue &prep_rq_fn (in which case &struct request
* automagically gets a tag assigned). Note that this function
 *  assumes that any type of request can be queued! If this is not
 *  true for your device, you must check the request type before
 *  calling this function.  The request will also be removed from
 *  the request queue, so it's the driver's responsibility to re-add
 *  it if it needs to be restarted for some reason.
*
* Notes:
* queue lock must be held.
**/
int blk_queue_start_tag(struct request_queue *q, struct request *rq)
{
struct blk_queue_tag *bqt = q->queue_tags;
unsigned max_depth;
int tag;
if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
printk(KERN_ERR
"%s: request %p for device [%s] already tagged %d",
__func__, rq,
rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
BUG();
}
/*
* Protect against shared tag maps, as we may not have exclusive
* access to the tag map.
*
* We reserve a few tags just for sync IO, since we don't want
* to starve sync IO on behalf of flooding async IO.
*/
max_depth = bqt->max_depth;
if (!rq_is_sync(rq) && max_depth > 1) {
switch (max_depth) {
case 2:
max_depth = 1;
break;
case 3:
max_depth = 2;
break;
default:
max_depth -= 2;
}
if (q->in_flight[BLK_RW_ASYNC] > max_depth)
return 1;
}
do {
tag = find_first_zero_bit(bqt->tag_map, max_depth);
if (tag >= max_depth)
return 1;
} while (test_and_set_bit_lock(tag, bqt->tag_map));
/*
* We need lock ordering semantics given by test_and_set_bit_lock.
* See blk_queue_end_tag for details.
*/
rq->cmd_flags |= REQ_QUEUED;
rq->tag = tag;
bqt->tag_index[tag] = rq;
blk_start_request(rq);
list_add(&rq->queuelist, &q->tag_busy_list);
return 0;
}
EXPORT_SYMBOL(blk_queue_start_tag);
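/*
* Example (illustrative sketch, not part of the original file): the shape
* of a request_fn for a tag-aware driver whose queue was set up with
* blk_queue_init_tags().  blk_queue_start_tag() dequeues and starts the
* request itself, so the driver only peeks; on completion the tag is
* released before the request is ended.  Both paths run under the queue
* lock, as the tag helpers expect.
*/
static void example_request_fn(struct request_queue *q)
{
struct request *rq;
while ((rq = blk_peek_request(q)) != NULL) {
if (blk_queue_start_tag(q, rq))
break;	/* out of tags, retry on the next run */
/* hand rq (now carrying rq->tag) to the hardware here */
}
}
static void example_complete_tagged(struct request_queue *q, struct request *rq)
{
blk_queue_end_tag(q, rq);
__blk_end_request_all(rq, 0);
}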
/**
* blk_queue_invalidate_tags - invalidate all pending tags
* @q: the request queue for the device
*
* Description:
* Hardware conditions may dictate a need to stop all pending requests.
* In this case, we will safely clear the block side of the tag queue and
 *   re-add all requests to the request queue in the right order.
*
* Notes:
* queue lock must be held.
**/
void blk_queue_invalidate_tags(struct request_queue *q)
{
struct list_head *tmp, *n;
list_for_each_safe(tmp, n, &q->tag_busy_list)
blk_requeue_request(q, list_entry_rq(tmp));
}
EXPORT_SYMBOL(blk_queue_invalidate_tags);

1699
block/blk-throttle.c Normal file

File diff suppressed because it is too large

232
block/blk-timeout.c Normal file
View file

@ -0,0 +1,232 @@
/*
* Functions related to generic timeout handling of requests.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/fault-inject.h>
#include "blk.h"
#include "blk-mq.h"
#ifdef CONFIG_FAIL_IO_TIMEOUT
static DECLARE_FAULT_ATTR(fail_io_timeout);
static int __init setup_fail_io_timeout(char *str)
{
return setup_fault_attr(&fail_io_timeout, str);
}
__setup("fail_io_timeout=", setup_fail_io_timeout);
int blk_should_fake_timeout(struct request_queue *q)
{
if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
return 0;
return should_fail(&fail_io_timeout, 1);
}
static int __init fail_io_timeout_debugfs(void)
{
struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout",
NULL, &fail_io_timeout);
return PTR_ERR_OR_ZERO(dir);
}
late_initcall(fail_io_timeout_debugfs);
ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags);
return sprintf(buf, "%d\n", set != 0);
}
ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
int val;
if (count) {
struct request_queue *q = disk->queue;
char *p = (char *) buf;
val = simple_strtoul(p, &p, 10);
spin_lock_irq(q->queue_lock);
if (val)
queue_flag_set(QUEUE_FLAG_FAIL_IO, q);
else
queue_flag_clear(QUEUE_FLAG_FAIL_IO, q);
spin_unlock_irq(q->queue_lock);
}
return count;
}
#endif /* CONFIG_FAIL_IO_TIMEOUT */
/*
* blk_delete_timer - Delete/cancel timer for a given function.
* @req: request that we are canceling timer for
*
*/
void blk_delete_timer(struct request *req)
{
list_del_init(&req->timeout_list);
}
static void blk_rq_timed_out(struct request *req)
{
struct request_queue *q = req->q;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
if (q->rq_timed_out_fn)
ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
/* Can we use req->errors here? */
__blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
blk_add_timer(req);
blk_clear_rq_complete(req);
break;
case BLK_EH_NOT_HANDLED:
/*
* LLD handles this for now but in the future
* we can send a request msg to abort the command
* and we can move more of the generic scsi eh code to
* the blk layer.
*/
break;
default:
printk(KERN_ERR "block: bad eh return: %d\n", ret);
break;
}
}
static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
unsigned int *next_set)
{
if (time_after_eq(jiffies, rq->deadline)) {
list_del_init(&rq->timeout_list);
/*
* Check if we raced with end io completion
*/
if (!blk_mark_rq_complete(rq))
blk_rq_timed_out(rq);
} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
*next_timeout = rq->deadline;
*next_set = 1;
}
}
void blk_rq_timed_out_timer(unsigned long data)
{
struct request_queue *q = (struct request_queue *) data;
unsigned long flags, next = 0;
struct request *rq, *tmp;
int next_set = 0;
spin_lock_irqsave(q->queue_lock, flags);
list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
blk_rq_check_expired(rq, &next, &next_set);
if (next_set)
mod_timer(&q->timeout, round_jiffies_up(next));
spin_unlock_irqrestore(q->queue_lock, flags);
}
/**
* blk_abort_request -- Request request recovery for the specified command
* @req: pointer to the request of interest
*
* This function requests that the block layer start recovery for the
* request by deleting the timer and calling the q's timeout function.
* LLDDs who implement their own error recovery MAY ignore the timeout
 * event if they generated blk_abort_request. Must hold queue lock.
*/
void blk_abort_request(struct request *req)
{
if (blk_mark_rq_complete(req))
return;
blk_delete_timer(req);
if (req->q->mq_ops)
blk_mq_rq_timed_out(req, false);
else
blk_rq_timed_out(req);
}
EXPORT_SYMBOL_GPL(blk_abort_request);
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
return timeout;
}
/**
* blk_add_timer - Start timeout timer for a single request
* @req: request that is about to start running.
*
* Notes:
* Each request has its own timer, and as it is added to the queue, we
* set up the timer. When the request completes, we cancel the timer.
*/
void blk_add_timer(struct request *req)
{
struct request_queue *q = req->q;
unsigned long expiry;
/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
if (!q->mq_ops && !q->rq_timed_out_fn)
return;
BUG_ON(!list_empty(&req->timeout_list));
/*
* Some LLDs, like scsi, peek at the timeout to prevent a
* command from being retried forever.
*/
if (!req->timeout)
req->timeout = q->rq_timeout;
req->deadline = jiffies + req->timeout;
if (!q->mq_ops)
list_add_tail(&req->timeout_list, &req->q->timeout_list);
/*
* If the timer isn't already pending or this timeout is earlier
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
unsigned long diff = q->timeout.expires - expiry;
/*
* Due to added timer slack to group timers, the timer
* will often be a little in front of what we asked for.
* So apply some tolerance here too, otherwise we keep
* modifying the timer because expires for value X
* will be X + something.
*/
if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
mod_timer(&q->timeout, expiry);
}
}
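/*
* Example (illustrative sketch, not part of the original file): a driver
* opts into timeout handling by installing a handler and a default
* timeout at queue-setup time; blk_add_timer() is then armed by the
* block layer as each request is started.  blk_queue_rq_timed_out() and
* blk_queue_rq_timeout() are assumed to have their usual prototypes from
* <linux/blkdev.h>; the 30 second value is arbitrary.
*/
static enum blk_eh_timer_return example_timed_out(struct request *rq)
{
/* give a stuck command one more interval before escalating */
return BLK_EH_RESET_TIMER;
}
static void example_setup_timeouts(struct request_queue *q)
{
blk_queue_rq_timed_out(q, example_timed_out);
blk_queue_rq_timeout(q, 30 * HZ);
}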

284
block/blk.h Normal file
View file

@ -0,0 +1,284 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include "blk-mq.h"
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME (HZ/50UL)
/* Number of requests a "batching" process may submit */
#define BLK_BATCH_REQ 32
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
struct blk_flush_queue {
unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
struct request *flush_rq;
spinlock_t mq_flush_lock;
};
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *request_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *blk_get_flush_queue(
struct request_queue *q, struct blk_mq_ctx *ctx)
{
struct blk_mq_hw_ctx *hctx;
if (!q->mq_ops)
return q->fq;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
return hctx->fq;
}
static inline void __blk_get_queue(struct request_queue *q)
{
kobject_get(&q->kobj);
}
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
int node, int cmd_size);
void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_list *rl);
void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
int blk_rq_append_bio(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_rq_timed_out_timer(unsigned long data);
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
void blk_delete_timer(struct request *);
bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int *request_count);
void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req);
/*
* Internal atomic flags for request handling
*/
enum rq_atomic_flags {
REQ_ATOM_COMPLETE = 0,
REQ_ATOM_STARTED,
};
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
* sure that only one of them succeeds
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
}
/*
* Internal elevator interface
*/
#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED)
void blk_insert_flush(struct request *rq);
static inline struct request *__elv_next_request(struct request_queue *q)
{
struct request *rq;
struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
while (1) {
if (!list_empty(&q->queue_head)) {
rq = list_entry_rq(q->queue_head.next);
return rq;
}
/*
* A flush request is in flight and flush requests aren't queueable
* in the drive, so we can hold the queue until the flush request
* finishes. Even if we didn't do this, the driver couldn't dispatch
* further requests and would just requeue them; holding the queue can
* also improve throughput. For example, with requests flush1, write1,
* flush2: flush1 is dispatched, the queue is held and write1 isn't
* inserted. After flush1 finishes, flush2 is dispatched; since the
* disk cache is already clean, flush2 completes very quickly, so it
* effectively gets folded into flush1.
* While the queue is held, a flag is set to indicate that it should
* be restarted later. See flush_end_io() for details.
*/
if (fq->flush_pending_idx != fq->flush_running_idx &&
!queue_flush_queueable(q)) {
fq->flush_queue_delayed = 1;
return NULL;
}
if (unlikely(blk_queue_bypass(q)) ||
!q->elevator->type->ops.elevator_dispatch_fn(q, 0))
return NULL;
}
}
static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->type->ops.elevator_activate_req_fn)
e->type->ops.elevator_activate_req_fn(q, rq);
}
static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
if (e->type->ops.elevator_deactivate_req_fn)
e->type->ops.elevator_deactivate_req_fn(q, rq);
}
#ifdef CONFIG_FAIL_IO_TIMEOUT
int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
#else
static inline int blk_should_fake_timeout(struct request_queue *q)
{
return 0;
}
#endif
int ll_back_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
int blk_try_merge(struct request *rq, struct bio *bio);
void blk_queue_congestion_threshold(struct request_queue *q);
void __blk_run_queue_uncond(struct request_queue *q);
int blk_dev_init(void);
/*
* Return the threshold (number of used requests) at which the queue is
 * considered to be congested.  It includes a little hysteresis to keep the
* context switch rate down.
*/
static inline int queue_congestion_on_threshold(struct request_queue *q)
{
return q->nr_congestion_on;
}
/*
* The threshold at which a queue is considered to be uncongested
*/
static inline int queue_congestion_off_threshold(struct request_queue *q)
{
return q->nr_congestion_off;
}
extern int blk_update_nr_requests(struct request_queue *, unsigned int);
/*
* Contribute to IO statistics IFF:
*
* a) it's attached to a gendisk, and
* b) the queue had IO stats enabled when this request was started, and
* c) it's a file system request
*/
static inline int blk_do_io_stat(struct request *rq)
{
return rq->rq_disk &&
(rq->cmd_flags & REQ_IO_STAT) &&
(rq->cmd_type == REQ_TYPE_FS);
}
/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask);
void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/**
* create_io_context - try to create task->io_context
* @gfp_mask: allocation mask
* @node: allocation node
*
* If %current->io_context is %NULL, allocate a new io_context and install
* it. Returns the current %current->io_context which may be %NULL if
* allocation failed.
*
* Note that this function can't be called with IRQ disabled because
* task_lock which protects %current->io_context is IRQ-unsafe.
*/
static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
{
WARN_ON_ONCE(irqs_disabled());
if (unlikely(!current->io_context))
create_task_io_context(current, gfp_mask, node);
return current->io_context;
}
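/*
* Example (illustrative sketch, not part of the original header): the
* typical call-site pattern in a request allocation path.  The io_context
* is created opportunistically, with IRQs enabled, before the caller may
* go to sleep; callers must still cope with current->io_context being
* NULL afterwards.
*/
static inline void example_prepare_ioc(struct request_queue *q, gfp_t gfp_mask)
{
create_io_context(gfp_mask, q->node);
}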
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
return false;
}
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#endif /* BLK_INTERNAL_H */

290
block/bounce.c Normal file
View file

@ -0,0 +1,290 @@
/* bounce buffer handling for block devices
*
* - Split from highmem.c
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static mempool_t *page_pool, *isa_page_pool;
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
static __init int init_emergency_pool(void)
{
#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
pr_info("pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
#endif
#ifdef CONFIG_HIGHMEM
/*
* highmem version, map in to vec
*/
static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
{
unsigned long flags;
unsigned char *vto;
local_irq_save(flags);
vto = kmap_atomic(to->bv_page);
memcpy(vto + to->bv_offset, vfrom, to->bv_len);
kunmap_atomic(vto);
local_irq_restore(flags);
}
#else /* CONFIG_HIGHMEM */
#define bounce_copy_vec(to, vfrom) \
memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
#endif /* CONFIG_HIGHMEM */
/*
* allocate pages in the DMA region for the ISA pool
*/
static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
{
return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
}
/*
 * Gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA
* as the max address, so check if the pool has already been created.
*/
int init_emergency_isa_pool(void)
{
if (isa_page_pool)
return 0;
isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
mempool_free_pages, (void *) 0);
BUG_ON(!isa_page_pool);
pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
return 0;
}
/*
* Simple bounce buffer support for highmem pages. Depending on the
* queue gfp mask set, *to may or may not be a highmem page. kmap it
* always, it will do the Right Thing
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
struct bio_vec tovec, *fromvec = from->bi_io_vec;
struct bvec_iter iter;
bio_for_each_segment(tovec, to, iter) {
if (tovec.bv_page != fromvec->bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
 * copy; bounce_copy_vec already uses tovec->bv_len
*/
vfrom = page_address(fromvec->bv_page) +
tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page);
}
fromvec++;
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, *org_vec;
int i;
if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
org_vec = bio_orig->bi_io_vec + i;
if (bvec->bv_page == org_vec->bv_page)
continue;
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
mempool_free(bvec->bv_page, pool);
}
bio_endio(bio_orig, err);
bio_put(bio);
}
static void bounce_end_io_write(struct bio *bio, int err)
{
bounce_end_io(bio, page_pool, err);
}
static void bounce_end_io_write_isa(struct bio *bio, int err)
{
bounce_end_io(bio, isa_page_pool, err);
}
static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
{
struct bio *bio_orig = bio->bi_private;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
copy_to_high_bio_irq(bio_orig, bio);
bounce_end_io(bio, pool, err);
}
static void bounce_end_io_read(struct bio *bio, int err)
{
__bounce_end_io_read(bio, page_pool, err);
}
static void bounce_end_io_read_isa(struct bio *bio, int err)
{
__bounce_end_io_read(bio, isa_page_pool, err);
}
#ifdef CONFIG_NEED_BOUNCE_POOL
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
{
if (bio_data_dir(bio) != WRITE)
return 0;
if (!bdi_cap_stable_pages_required(&q->backing_dev_info))
return 0;
return test_bit(BIO_SNAP_STABLE, &bio->bi_flags);
}
#else
static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
{
return 0;
}
#endif /* CONFIG_NEED_BOUNCE_POOL */
static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
mempool_t *pool, int force)
{
struct bio *bio;
int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
unsigned i;
if (force)
goto bounce;
bio_for_each_segment(from, *bio_orig, iter)
if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q))
goto bounce;
return;
bounce:
bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
continue;
inc_zone_page_state(to->bv_page, NR_BOUNCE);
to->bv_page = mempool_alloc(pool, q->bounce_gfp);
if (rw == WRITE) {
char *vto, *vfrom;
flush_dcache_page(page);
vto = page_address(to->bv_page) + to->bv_offset;
vfrom = kmap_atomic(page) + to->bv_offset;
memcpy(vto, vfrom, to->bv_len);
kunmap_atomic(vfrom);
}
}
trace_block_bio_bounce(q, *bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
if (pool == page_pool) {
bio->bi_end_io = bounce_end_io_write;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read;
} else {
bio->bi_end_io = bounce_end_io_write_isa;
if (rw == READ)
bio->bi_end_io = bounce_end_io_read_isa;
}
bio->bi_private = *bio_orig;
*bio_orig = bio;
}
void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
{
int must_bounce;
mempool_t *pool;
/*
* Data-less bio, nothing to bounce
*/
if (!bio_has_data(*bio_orig))
return;
must_bounce = must_snapshot_stable_pages(q, *bio_orig);
/*
* for non-isa bounce case, just check if the bounce pfn is equal
* to or bigger than the highest pfn in the system -- in that case,
* don't waste time iterating over bio segments
*/
if (!(q->bounce_gfp & GFP_DMA)) {
if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce)
return;
pool = page_pool;
} else {
BUG_ON(!isa_page_pool);
pool = isa_page_pool;
}
/*
* slow path
*/
__blk_queue_bounce(q, bio_orig, pool, must_bounce);
}
EXPORT_SYMBOL(blk_queue_bounce);
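/*
* Example (illustrative sketch, not part of the original file): a driver
* whose hardware cannot DMA to highmem sets a bounce limit when creating
* its queue, and a bio-based driver that builds requests itself calls
* blk_queue_bounce() before mapping the bio for DMA.  Function names are
* made up; blk_queue_bounce_limit() and BLK_BOUNCE_HIGH are assumed from
* <linux/blkdev.h>.
*/
static void example_limit_to_lowmem(struct request_queue *q)
{
/* pages above the limit will be bounced through the page_pool */
blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
static void example_make_request(struct request_queue *q, struct bio *bio)
{
blk_queue_bounce(q, &bio);
/* from here on, bio only references pages the device can reach */
}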

232
block/bsg-lib.c Normal file
View file

@ -0,0 +1,232 @@
/*
* BSG helper library
*
* Copyright (C) 2008 James Smart, Emulex Corporation
* Copyright (C) 2011 Red Hat, Inc. All rights reserved.
* Copyright (C) 2011 Mike Christie
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>
#include <linux/bsg-lib.h>
#include <linux/export.h>
#include <scsi/scsi_cmnd.h>
/**
* bsg_destroy_job - routine to teardown/delete a bsg job
* @job: bsg_job that is to be torn down
*/
static void bsg_destroy_job(struct bsg_job *job)
{
put_device(job->dev); /* release reference for the request */
kfree(job->request_payload.sg_list);
kfree(job->reply_payload.sg_list);
kfree(job);
}
/**
* bsg_job_done - completion routine for bsg requests
* @job: bsg_job that is complete
* @result: job reply result
* @reply_payload_rcv_len: length of payload recvd
*
* The LLD should call this when the bsg job has completed.
*/
void bsg_job_done(struct bsg_job *job, int result,
unsigned int reply_payload_rcv_len)
{
struct request *req = job->req;
struct request *rsp = req->next_rq;
int err;
err = job->req->errors = result;
if (err < 0)
/* we're only returning the result field in the reply */
job->req->sense_len = sizeof(u32);
else
job->req->sense_len = job->reply_len;
/* we assume all request payload was transferred, residual == 0 */
req->resid_len = 0;
if (rsp) {
WARN_ON(reply_payload_rcv_len > rsp->resid_len);
/* set reply (bidi) residual */
rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
}
blk_complete_request(req);
}
EXPORT_SYMBOL_GPL(bsg_job_done);
/**
* bsg_softirq_done - softirq done routine for destroying the bsg requests
* @rq: BSG request that holds the job to be destroyed
*/
static void bsg_softirq_done(struct request *rq)
{
struct bsg_job *job = rq->special;
blk_end_request_all(rq, rq->errors);
bsg_destroy_job(job);
}
static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
{
size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
BUG_ON(!req->nr_phys_segments);
buf->sg_list = kzalloc(sz, GFP_KERNEL);
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
buf->payload_len = blk_rq_bytes(req);
return 0;
}
/**
* bsg_create_job - create the bsg_job structure for the bsg request
* @dev: device that is being sent the bsg request
* @req: BSG request that needs a job structure
*/
static int bsg_create_job(struct device *dev, struct request *req)
{
struct request *rsp = req->next_rq;
struct request_queue *q = req->q;
struct bsg_job *job;
int ret;
BUG_ON(req->special);
job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
if (!job)
return -ENOMEM;
req->special = job;
job->req = req;
if (q->bsg_job_size)
job->dd_data = (void *)&job[1];
job->request = req->cmd;
job->request_len = req->cmd_len;
job->reply = req->sense;
job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer
* allocated */
if (req->bio) {
ret = bsg_map_buffer(&job->request_payload, req);
if (ret)
goto failjob_rls_job;
}
if (rsp && rsp->bio) {
ret = bsg_map_buffer(&job->reply_payload, rsp);
if (ret)
goto failjob_rls_rqst_payload;
}
job->dev = dev;
/* take a reference for the request */
get_device(job->dev);
return 0;
failjob_rls_rqst_payload:
kfree(job->request_payload.sg_list);
failjob_rls_job:
kfree(job);
return -ENOMEM;
}
/**
* bsg_request_fn - generic handler for bsg requests
* @q: request queue to manage
*
 * On error, the bsg_create_job() function returns a -Exyz error value
 * that will be stored in req->errors.
*
* Drivers/subsys should pass this to the queue init function.
*/
void bsg_request_fn(struct request_queue *q)
{
struct device *dev = q->queuedata;
struct request *req;
struct bsg_job *job;
int ret;
if (!get_device(dev))
return;
while (1) {
req = blk_fetch_request(q);
if (!req)
break;
spin_unlock_irq(q->queue_lock);
ret = bsg_create_job(dev, req);
if (ret) {
req->errors = ret;
blk_end_request_all(req, ret);
spin_lock_irq(q->queue_lock);
continue;
}
job = req->special;
ret = q->bsg_job_fn(job);
spin_lock_irq(q->queue_lock);
if (ret)
break;
}
spin_unlock_irq(q->queue_lock);
put_device(dev);
spin_lock_irq(q->queue_lock);
}
EXPORT_SYMBOL_GPL(bsg_request_fn);
/**
* bsg_setup_queue - Create and add the bsg hooks so we can receive requests
* @dev: device to attach bsg device to
* @q: request queue setup by caller
* @name: device to give bsg device
* @job_fn: bsg job handler
* @dd_job_size: size of LLD data needed for each job
*
 * The caller should have set up the request queue with bsg_request_fn
* as the request_fn.
*/
int bsg_setup_queue(struct device *dev, struct request_queue *q,
char *name, bsg_job_fn *job_fn, int dd_job_size)
{
int ret;
q->queuedata = dev;
q->bsg_job_size = dd_job_size;
q->bsg_job_fn = job_fn;
queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
blk_queue_softirq_done(q, bsg_softirq_done);
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
ret = bsg_register_queue(q, dev, name, NULL);
if (ret) {
printk(KERN_ERR "%s: bsg interface failed to "
"initialize - register queue\n", dev->kobj.name);
return ret;
}
return 0;
}
EXPORT_SYMBOL_GPL(bsg_setup_queue);
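/*
* Example (illustrative sketch, not part of the original file): the
* minimal wiring an LLD does to receive bsg jobs.  The handler and the
* attach helper are hypothetical; a real handler would start the
* transport command described by job->request and call bsg_job_done()
* from its completion path rather than synchronously as shown here.
*/
static int example_bsg_job_fn(struct bsg_job *job)
{
bsg_job_done(job, 0, 0);
return 0;
}
static int example_attach_bsg(struct device *dev, char *name, spinlock_t *lock)
{
struct request_queue *q = blk_init_queue(bsg_request_fn, lock);
if (!q)
return -ENOMEM;
return bsg_setup_queue(dev, q, name, example_bsg_job_fn, 0);
}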

1119
block/bsg.c Normal file

File diff suppressed because it is too large

4675
block/cfq-iosched.c Normal file

File diff suppressed because it is too large

254
block/cmdline-parser.c Normal file
View file

@ -0,0 +1,254 @@
/*
* Parse command line, get partition information
*
* Written by Cai Zhiyong <caizhiyong@huawei.com>
*
*/
#include <linux/export.h>
#include <linux/cmdline-parser.h>
static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
{
int ret = 0;
struct cmdline_subpart *new_subpart;
*subpart = NULL;
new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
if (!new_subpart)
return -ENOMEM;
if (*partdef == '-') {
new_subpart->size = (sector_t)(~0ULL);
partdef++;
} else {
new_subpart->size = (sector_t)memparse(partdef, &partdef);
if (new_subpart->size < (sector_t)PAGE_SIZE) {
pr_warn("cmdline partition size is invalid.");
ret = -EINVAL;
goto fail;
}
}
if (*partdef == '@') {
partdef++;
new_subpart->from = (sector_t)memparse(partdef, &partdef);
} else {
new_subpart->from = (sector_t)(~0ULL);
}
if (*partdef == '(') {
int length;
char *next = strchr(++partdef, ')');
if (!next) {
pr_warn("cmdline partition format is invalid.");
ret = -EINVAL;
goto fail;
}
length = min_t(int, next - partdef,
sizeof(new_subpart->name) - 1);
strncpy(new_subpart->name, partdef, length);
new_subpart->name[length] = '\0';
partdef = ++next;
} else
new_subpart->name[0] = '\0';
new_subpart->flags = 0;
if (!strncmp(partdef, "ro", 2)) {
new_subpart->flags |= PF_RDONLY;
partdef += 2;
}
if (!strncmp(partdef, "lk", 2)) {
new_subpart->flags |= PF_POWERUP_LOCK;
partdef += 2;
}
*subpart = new_subpart;
return 0;
fail:
kfree(new_subpart);
return ret;
}
static void free_subpart(struct cmdline_parts *parts)
{
struct cmdline_subpart *subpart;
while (parts->subpart) {
subpart = parts->subpart;
parts->subpart = subpart->next_subpart;
kfree(subpart);
}
}
static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
{
int ret = -EINVAL;
char *next;
int length;
struct cmdline_subpart **next_subpart;
struct cmdline_parts *newparts;
char buf[BDEVNAME_SIZE + 32 + 4];
*parts = NULL;
newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
if (!newparts)
return -ENOMEM;
next = strchr(bdevdef, ':');
if (!next) {
pr_warn("cmdline partition has no block device.");
goto fail;
}
length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
strncpy(newparts->name, bdevdef, length);
newparts->name[length] = '\0';
newparts->nr_subparts = 0;
next_subpart = &newparts->subpart;
while (next && *(++next)) {
bdevdef = next;
next = strchr(bdevdef, ',');
length = (!next) ? (sizeof(buf) - 1) :
min_t(int, next - bdevdef, sizeof(buf) - 1);
strncpy(buf, bdevdef, length);
buf[length] = '\0';
ret = parse_subpart(next_subpart, buf);
if (ret)
goto fail;
newparts->nr_subparts++;
next_subpart = &(*next_subpart)->next_subpart;
}
if (!newparts->subpart) {
pr_warn("cmdline partition has no valid partition.");
ret = -EINVAL;
goto fail;
}
*parts = newparts;
return 0;
fail:
free_subpart(newparts);
kfree(newparts);
return ret;
}
void cmdline_parts_free(struct cmdline_parts **parts)
{
struct cmdline_parts *next_parts;
while (*parts) {
next_parts = (*parts)->next_parts;
free_subpart(*parts);
kfree(*parts);
*parts = next_parts;
}
}
EXPORT_SYMBOL(cmdline_parts_free);
int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
{
int ret;
char *buf;
char *pbuf;
char *next;
struct cmdline_parts **next_parts;
*parts = NULL;
next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
if (!buf)
return -ENOMEM;
next_parts = parts;
while (next && *pbuf) {
next = strchr(pbuf, ';');
if (next)
*next = '\0';
ret = parse_parts(next_parts, pbuf);
if (ret)
goto fail;
if (next)
pbuf = ++next;
next_parts = &(*next_parts)->next_parts;
}
if (!*parts) {
pr_warn("cmdline partition has no valid partition.");
ret = -EINVAL;
goto fail;
}
ret = 0;
done:
kfree(buf);
return ret;
fail:
cmdline_parts_free(parts);
goto done;
}
EXPORT_SYMBOL(cmdline_parts_parse);
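/*
 * Illustrative only -- the parser above accepts a boot argument of the form
 *
 *   <bdev>:<size>[@<offset>][(<name>)][ro][lk][,<size>...][;<bdev>:...]
 *
 * where <size> and <offset> are memparse() strings and "-" means "the rest
 * of the device".  A hedged caller sketch; the example_* name and the sample
 * string below are assumptions, not part of this file.
 */
static int example_parse_bootarg(void)
{
	struct cmdline_parts *parts;
	int ret;

	ret = cmdline_parts_parse(&parts,
			"mmcblk0:1G(boot),512M@2G(cache)ro,-(data)");
	if (ret)
		return ret;
	/* ... look up the disk with cmdline_parts_find() and apply it ... */
	cmdline_parts_free(&parts);
	return 0;
}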
struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
const char *bdev)
{
while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
parts = parts->next_parts;
return parts;
}
EXPORT_SYMBOL(cmdline_parts_find);
/*
* add_part()
* 0 success.
* 1 can not add so many partitions.
*/
int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
int slot,
int (*add_part)(int, struct cmdline_subpart *, void *),
void *param)
{
sector_t from = 0;
struct cmdline_subpart *subpart;
for (subpart = parts->subpart; subpart;
subpart = subpart->next_subpart, slot++) {
if (subpart->from == (sector_t)(~0ULL))
subpart->from = from;
else
from = subpart->from;
if (from >= disk_size)
break;
if (subpart->size > (disk_size - from))
subpart->size = disk_size - from;
from += subpart->size;
if (add_part(slot, subpart, param))
break;
}
return slot;
}
EXPORT_SYMBOL(cmdline_parts_set);
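/*
 * Illustrative only -- a hedged sketch of the add_part() callback expected
 * by cmdline_parts_set() above.  EXAMPLE_MAX_PARTS and the example_* name
 * are assumptions, not part of this file.
 */
#define EXAMPLE_MAX_PARTS	16

static int example_add_part(int slot, struct cmdline_subpart *subpart,
			    void *param)
{
	if (slot >= EXAMPLE_MAX_PARTS)
		return 1;	/* can not add so many partitions */

	pr_info("slot %d: \"%s\" %llu sectors @ %llu%s\n", slot, subpart->name,
		(unsigned long long)subpart->size,
		(unsigned long long)subpart->from,
		(subpart->flags & PF_RDONLY) ? " (ro)" : "");
	return 0;
}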

756
block/compat_ioctl.c Normal file

@ -0,0 +1,756 @@
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/blktrace_api.h>
#include <linux/cdrom.h>
#include <linux/compat.h>
#include <linux/elevator.h>
#include <linux/fd.h>
#include <linux/hdreg.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/types.h>
#include <linux/uaccess.h>
static int compat_put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)compat_ptr(arg));
}
static int compat_put_int(unsigned long arg, int val)
{
return put_user(val, (compat_int_t __user *)compat_ptr(arg));
}
static int compat_put_uint(unsigned long arg, unsigned int val)
{
return put_user(val, (compat_uint_t __user *)compat_ptr(arg));
}
static int compat_put_long(unsigned long arg, long val)
{
return put_user(val, (compat_long_t __user *)compat_ptr(arg));
}
static int compat_put_ulong(unsigned long arg, compat_ulong_t val)
{
return put_user(val, (compat_ulong_t __user *)compat_ptr(arg));
}
static int compat_put_u64(unsigned long arg, u64 val)
{
return put_user(val, (compat_u64 __user *)compat_ptr(arg));
}
struct compat_hd_geometry {
unsigned char heads;
unsigned char sectors;
unsigned short cylinders;
u32 start;
};
static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
struct compat_hd_geometry __user *ugeo)
{
struct hd_geometry geo;
int ret;
if (!ugeo)
return -EINVAL;
if (!disk->fops->getgeo)
return -ENOTTY;
memset(&geo, 0, sizeof(geo));
/*
* We need to set the startsect first, the driver may
* want to override it.
*/
geo.start = get_start_sect(bdev);
ret = disk->fops->getgeo(bdev, &geo);
if (ret)
return ret;
ret = copy_to_user(ugeo, &geo, 4);
ret |= put_user(geo.start, &ugeo->start);
if (ret)
ret = -EFAULT;
return ret;
}
static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
mm_segment_t old_fs = get_fs();
unsigned long kval;
unsigned int __user *uvp;
int error;
set_fs(KERNEL_DS);
error = __blkdev_driver_ioctl(bdev, mode,
cmd, (unsigned long)(&kval));
set_fs(old_fs);
if (error == 0) {
uvp = compat_ptr(arg);
if (put_user(kval, uvp))
error = -EFAULT;
}
return error;
}
struct compat_cdrom_read_audio {
union cdrom_addr addr;
u8 addr_format;
compat_int_t nframes;
compat_caddr_t buf;
};
struct compat_cdrom_generic_command {
unsigned char cmd[CDROM_PACKET_SIZE];
compat_caddr_t buffer;
compat_uint_t buflen;
compat_int_t stat;
compat_caddr_t sense;
unsigned char data_direction;
compat_int_t quiet;
compat_int_t timeout;
compat_caddr_t reserved[1];
};
static int compat_cdrom_read_audio(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cdrom_read_audio __user *cdread_audio;
struct compat_cdrom_read_audio __user *cdread_audio32;
__u32 data;
void __user *datap;
cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio));
cdread_audio32 = compat_ptr(arg);
if (copy_in_user(&cdread_audio->addr,
&cdread_audio32->addr,
(sizeof(*cdread_audio32) -
sizeof(compat_caddr_t))))
return -EFAULT;
if (get_user(data, &cdread_audio32->buf))
return -EFAULT;
datap = compat_ptr(data);
if (put_user(datap, &cdread_audio->buf))
return -EFAULT;
return __blkdev_driver_ioctl(bdev, mode, cmd,
(unsigned long)cdread_audio);
}
static int compat_cdrom_generic_command(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct cdrom_generic_command __user *cgc;
struct compat_cdrom_generic_command __user *cgc32;
u32 data;
unsigned char dir;
int itmp;
cgc = compat_alloc_user_space(sizeof(*cgc));
cgc32 = compat_ptr(arg);
if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) ||
get_user(data, &cgc32->buffer) ||
put_user(compat_ptr(data), &cgc->buffer) ||
copy_in_user(&cgc->buflen, &cgc32->buflen,
(sizeof(unsigned int) + sizeof(int))) ||
get_user(data, &cgc32->sense) ||
put_user(compat_ptr(data), &cgc->sense) ||
get_user(dir, &cgc32->data_direction) ||
put_user(dir, &cgc->data_direction) ||
get_user(itmp, &cgc32->quiet) ||
put_user(itmp, &cgc->quiet) ||
get_user(itmp, &cgc32->timeout) ||
put_user(itmp, &cgc->timeout) ||
get_user(data, &cgc32->reserved[0]) ||
put_user(compat_ptr(data), &cgc->reserved[0]))
return -EFAULT;
return __blkdev_driver_ioctl(bdev, mode, cmd, (unsigned long)cgc);
}
struct compat_blkpg_ioctl_arg {
compat_int_t op;
compat_int_t flags;
compat_int_t datalen;
compat_caddr_t data;
};
static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, struct compat_blkpg_ioctl_arg __user *ua32)
{
struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
compat_caddr_t udata;
compat_int_t n;
int err;
err = get_user(n, &ua32->op);
err |= put_user(n, &a->op);
err |= get_user(n, &ua32->flags);
err |= put_user(n, &a->flags);
err |= get_user(n, &ua32->datalen);
err |= put_user(n, &a->datalen);
err |= get_user(udata, &ua32->data);
err |= put_user(compat_ptr(udata), &a->data);
if (err)
return err;
return blkdev_ioctl(bdev, mode, cmd, (unsigned long)a);
}
#define BLKBSZGET_32 _IOR(0x12, 112, int)
#define BLKBSZSET_32 _IOW(0x12, 113, int)
#define BLKGETSIZE64_32 _IOR(0x12, 114, int)
struct compat_floppy_drive_params {
char cmos;
compat_ulong_t max_dtr;
compat_ulong_t hlt;
compat_ulong_t hut;
compat_ulong_t srt;
compat_ulong_t spinup;
compat_ulong_t spindown;
unsigned char spindown_offset;
unsigned char select_delay;
unsigned char rps;
unsigned char tracks;
compat_ulong_t timeout;
unsigned char interleave_sect;
struct floppy_max_errors max_errors;
char flags;
char read_track;
short autodetect[8];
compat_int_t checkfreq;
compat_int_t native_format;
};
struct compat_floppy_drive_struct {
signed char flags;
compat_ulong_t spinup_date;
compat_ulong_t select_date;
compat_ulong_t first_read_date;
short probed_format;
short track;
short maxblock;
short maxtrack;
compat_int_t generation;
compat_int_t keep_data;
compat_int_t fd_ref;
compat_int_t fd_device;
compat_int_t last_checked;
compat_caddr_t dmabuf;
compat_int_t bufblocks;
};
struct compat_floppy_fdc_state {
compat_int_t spec1;
compat_int_t spec2;
compat_int_t dtr;
unsigned char version;
unsigned char dor;
compat_ulong_t address;
unsigned int rawcmd:2;
unsigned int reset:1;
unsigned int need_configure:1;
unsigned int perp_mode:2;
unsigned int has_fifo:1;
unsigned int driver_version;
unsigned char track[4];
};
struct compat_floppy_write_errors {
unsigned int write_errors;
compat_ulong_t first_error_sector;
compat_int_t first_error_generation;
compat_ulong_t last_error_sector;
compat_int_t last_error_generation;
compat_uint_t badness;
};
#define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct)
#define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct)
#define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params)
#define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params)
#define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)
#define FDPOLLDRVSTAT32 _IOR(2, 0x13, struct compat_floppy_drive_struct)
#define FDGETFDCSTAT32 _IOR(2, 0x15, struct compat_floppy_fdc_state)
#define FDWERRORGET32 _IOR(2, 0x17, struct compat_floppy_write_errors)
static struct {
unsigned int cmd32;
unsigned int cmd;
} fd_ioctl_trans_table[] = {
{ FDSETPRM32, FDSETPRM },
{ FDDEFPRM32, FDDEFPRM },
{ FDGETPRM32, FDGETPRM },
{ FDSETDRVPRM32, FDSETDRVPRM },
{ FDGETDRVPRM32, FDGETDRVPRM },
{ FDGETDRVSTAT32, FDGETDRVSTAT },
{ FDPOLLDRVSTAT32, FDPOLLDRVSTAT },
{ FDGETFDCSTAT32, FDGETFDCSTAT },
{ FDWERRORGET32, FDWERRORGET }
};
#define NR_FD_IOCTL_TRANS ARRAY_SIZE(fd_ioctl_trans_table)
static int compat_fd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
mm_segment_t old_fs = get_fs();
void *karg = NULL;
unsigned int kcmd = 0;
int i, err;
for (i = 0; i < NR_FD_IOCTL_TRANS; i++)
if (cmd == fd_ioctl_trans_table[i].cmd32) {
kcmd = fd_ioctl_trans_table[i].cmd;
break;
}
if (!kcmd)
return -EINVAL;
switch (cmd) {
case FDSETPRM32:
case FDDEFPRM32:
case FDGETPRM32:
{
compat_uptr_t name;
struct compat_floppy_struct __user *uf;
struct floppy_struct *f;
uf = compat_ptr(arg);
f = karg = kmalloc(sizeof(struct floppy_struct), GFP_KERNEL);
if (!karg)
return -ENOMEM;
if (cmd == FDGETPRM32)
break;
err = __get_user(f->size, &uf->size);
err |= __get_user(f->sect, &uf->sect);
err |= __get_user(f->head, &uf->head);
err |= __get_user(f->track, &uf->track);
err |= __get_user(f->stretch, &uf->stretch);
err |= __get_user(f->gap, &uf->gap);
err |= __get_user(f->rate, &uf->rate);
err |= __get_user(f->spec1, &uf->spec1);
err |= __get_user(f->fmt_gap, &uf->fmt_gap);
err |= __get_user(name, &uf->name);
f->name = compat_ptr(name);
if (err) {
err = -EFAULT;
goto out;
}
break;
}
case FDSETDRVPRM32:
case FDGETDRVPRM32:
{
struct compat_floppy_drive_params __user *uf;
struct floppy_drive_params *f;
uf = compat_ptr(arg);
f = karg = kmalloc(sizeof(struct floppy_drive_params), GFP_KERNEL);
if (!karg)
return -ENOMEM;
if (cmd == FDGETDRVPRM32)
break;
err = __get_user(f->cmos, &uf->cmos);
err |= __get_user(f->max_dtr, &uf->max_dtr);
err |= __get_user(f->hlt, &uf->hlt);
err |= __get_user(f->hut, &uf->hut);
err |= __get_user(f->srt, &uf->srt);
err |= __get_user(f->spinup, &uf->spinup);
err |= __get_user(f->spindown, &uf->spindown);
err |= __get_user(f->spindown_offset, &uf->spindown_offset);
err |= __get_user(f->select_delay, &uf->select_delay);
err |= __get_user(f->rps, &uf->rps);
err |= __get_user(f->tracks, &uf->tracks);
err |= __get_user(f->timeout, &uf->timeout);
err |= __get_user(f->interleave_sect, &uf->interleave_sect);
err |= __copy_from_user(&f->max_errors, &uf->max_errors, sizeof(f->max_errors));
err |= __get_user(f->flags, &uf->flags);
err |= __get_user(f->read_track, &uf->read_track);
err |= __copy_from_user(f->autodetect, uf->autodetect, sizeof(f->autodetect));
err |= __get_user(f->checkfreq, &uf->checkfreq);
err |= __get_user(f->native_format, &uf->native_format);
if (err) {
err = -EFAULT;
goto out;
}
break;
}
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
karg = kmalloc(sizeof(struct floppy_drive_struct), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
case FDGETFDCSTAT32:
karg = kmalloc(sizeof(struct floppy_fdc_state), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
case FDWERRORGET32:
karg = kmalloc(sizeof(struct floppy_write_errors), GFP_KERNEL);
if (!karg)
return -ENOMEM;
break;
default:
return -EINVAL;
}
set_fs(KERNEL_DS);
err = __blkdev_driver_ioctl(bdev, mode, kcmd, (unsigned long)karg);
set_fs(old_fs);
if (err)
goto out;
switch (cmd) {
case FDGETPRM32:
{
struct floppy_struct *f = karg;
struct compat_floppy_struct __user *uf = compat_ptr(arg);
err = __put_user(f->size, &uf->size);
err |= __put_user(f->sect, &uf->sect);
err |= __put_user(f->head, &uf->head);
err |= __put_user(f->track, &uf->track);
err |= __put_user(f->stretch, &uf->stretch);
err |= __put_user(f->gap, &uf->gap);
err |= __put_user(f->rate, &uf->rate);
err |= __put_user(f->spec1, &uf->spec1);
err |= __put_user(f->fmt_gap, &uf->fmt_gap);
err |= __put_user((u64)f->name, (compat_caddr_t __user *)&uf->name);
break;
}
case FDGETDRVPRM32:
{
struct compat_floppy_drive_params __user *uf;
struct floppy_drive_params *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->cmos, &uf->cmos);
err |= __put_user(f->max_dtr, &uf->max_dtr);
err |= __put_user(f->hlt, &uf->hlt);
err |= __put_user(f->hut, &uf->hut);
err |= __put_user(f->srt, &uf->srt);
err |= __put_user(f->spinup, &uf->spinup);
err |= __put_user(f->spindown, &uf->spindown);
err |= __put_user(f->spindown_offset, &uf->spindown_offset);
err |= __put_user(f->select_delay, &uf->select_delay);
err |= __put_user(f->rps, &uf->rps);
err |= __put_user(f->tracks, &uf->tracks);
err |= __put_user(f->timeout, &uf->timeout);
err |= __put_user(f->interleave_sect, &uf->interleave_sect);
err |= __copy_to_user(&uf->max_errors, &f->max_errors, sizeof(f->max_errors));
err |= __put_user(f->flags, &uf->flags);
err |= __put_user(f->read_track, &uf->read_track);
err |= __copy_to_user(uf->autodetect, f->autodetect, sizeof(f->autodetect));
err |= __put_user(f->checkfreq, &uf->checkfreq);
err |= __put_user(f->native_format, &uf->native_format);
break;
}
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
{
struct compat_floppy_drive_struct __user *uf;
struct floppy_drive_struct *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->flags, &uf->flags);
err |= __put_user(f->spinup_date, &uf->spinup_date);
err |= __put_user(f->select_date, &uf->select_date);
err |= __put_user(f->first_read_date, &uf->first_read_date);
err |= __put_user(f->probed_format, &uf->probed_format);
err |= __put_user(f->track, &uf->track);
err |= __put_user(f->maxblock, &uf->maxblock);
err |= __put_user(f->maxtrack, &uf->maxtrack);
err |= __put_user(f->generation, &uf->generation);
err |= __put_user(f->keep_data, &uf->keep_data);
err |= __put_user(f->fd_ref, &uf->fd_ref);
err |= __put_user(f->fd_device, &uf->fd_device);
err |= __put_user(f->last_checked, &uf->last_checked);
err |= __put_user((u64)f->dmabuf, &uf->dmabuf);
err |= __put_user((u64)f->bufblocks, &uf->bufblocks);
break;
}
case FDGETFDCSTAT32:
{
struct compat_floppy_fdc_state __user *uf;
struct floppy_fdc_state *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->spec1, &uf->spec1);
err |= __put_user(f->spec2, &uf->spec2);
err |= __put_user(f->dtr, &uf->dtr);
err |= __put_user(f->version, &uf->version);
err |= __put_user(f->dor, &uf->dor);
err |= __put_user(f->address, &uf->address);
err |= __copy_to_user((char __user *)&uf->address + sizeof(uf->address),
(char *)&f->address + sizeof(f->address), sizeof(int));
err |= __put_user(f->driver_version, &uf->driver_version);
err |= __copy_to_user(uf->track, f->track, sizeof(f->track));
break;
}
case FDWERRORGET32:
{
struct compat_floppy_write_errors __user *uf;
struct floppy_write_errors *f = karg;
uf = compat_ptr(arg);
err = __put_user(f->write_errors, &uf->write_errors);
err |= __put_user(f->first_error_sector, &uf->first_error_sector);
err |= __put_user(f->first_error_generation, &uf->first_error_generation);
err |= __put_user(f->last_error_sector, &uf->last_error_sector);
err |= __put_user(f->last_error_generation, &uf->last_error_generation);
err |= __put_user(f->badness, &uf->badness);
break;
}
default:
break;
}
if (err)
err = -EFAULT;
out:
kfree(karg);
return err;
}
static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
switch (cmd) {
case HDIO_GET_UNMASKINTR:
case HDIO_GET_MULTCOUNT:
case HDIO_GET_KEEPSETTINGS:
case HDIO_GET_32BIT:
case HDIO_GET_NOWERR:
case HDIO_GET_DMA:
case HDIO_GET_NICE:
case HDIO_GET_WCACHE:
case HDIO_GET_ACOUSTIC:
case HDIO_GET_ADDRESS:
case HDIO_GET_BUSSTATE:
return compat_hdio_ioctl(bdev, mode, cmd, arg);
case FDSETPRM32:
case FDDEFPRM32:
case FDGETPRM32:
case FDSETDRVPRM32:
case FDGETDRVPRM32:
case FDGETDRVSTAT32:
case FDPOLLDRVSTAT32:
case FDGETFDCSTAT32:
case FDWERRORGET32:
return compat_fd_ioctl(bdev, mode, cmd, arg);
case CDROMREADAUDIO:
return compat_cdrom_read_audio(bdev, mode, cmd, arg);
case CDROM_SEND_PACKET:
return compat_cdrom_generic_command(bdev, mode, cmd, arg);
/*
* No handler required for the ones below, we just need to
* convert arg to a 64 bit pointer.
*/
case BLKSECTSET:
/*
* 0x03 -- HD/IDE ioctls used by hdparm and friends.
* Some need translations, these do not.
*/
case HDIO_GET_IDENTITY:
case HDIO_DRIVE_TASK:
case HDIO_DRIVE_CMD:
/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
case 0x330:
/* 0x02 -- Floppy ioctls */
case FDMSGON:
case FDMSGOFF:
case FDSETEMSGTRESH:
case FDFLUSH:
case FDWERRORCLR:
case FDSETMAXERRS:
case FDGETMAXERRS:
case FDGETDRVTYP:
case FDEJECT:
case FDCLRPRM:
case FDFMTBEG:
case FDFMTEND:
case FDRESET:
case FDTWADDLE:
case FDFMTTRK:
case FDRAWCMD:
/* CDROM stuff */
case CDROMPAUSE:
case CDROMRESUME:
case CDROMPLAYMSF:
case CDROMPLAYTRKIND:
case CDROMREADTOCHDR:
case CDROMREADTOCENTRY:
case CDROMSTOP:
case CDROMSTART:
case CDROMEJECT:
case CDROMVOLCTRL:
case CDROMSUBCHNL:
case CDROMMULTISESSION:
case CDROM_GET_MCN:
case CDROMRESET:
case CDROMVOLREAD:
case CDROMSEEK:
case CDROMPLAYBLK:
case CDROMCLOSETRAY:
case CDROM_DISC_STATUS:
case CDROM_CHANGER_NSLOTS:
case CDROM_GET_CAPABILITY:
/* Ignore cdrom.h about these next 5 ioctls, they absolutely do
* not take a struct cdrom_read, instead they take a struct cdrom_msf
* which is compatible.
*/
case CDROMREADMODE2:
case CDROMREADMODE1:
case CDROMREADRAW:
case CDROMREADCOOKED:
case CDROMREADALL:
/* DVD ioctls */
case DVD_READ_STRUCT:
case DVD_WRITE_STRUCT:
case DVD_AUTH:
arg = (unsigned long)compat_ptr(arg);
/* These interpret arg as an unsigned long, not as a pointer,
* so we must not do compat_ptr() conversion. */
case HDIO_SET_MULTCOUNT:
case HDIO_SET_UNMASKINTR:
case HDIO_SET_KEEPSETTINGS:
case HDIO_SET_32BIT:
case HDIO_SET_NOWERR:
case HDIO_SET_DMA:
case HDIO_SET_PIO_MODE:
case HDIO_SET_NICE:
case HDIO_SET_WCACHE:
case HDIO_SET_ACOUSTIC:
case HDIO_SET_BUSSTATE:
case HDIO_SET_ADDRESS:
case CDROMEJECT_SW:
case CDROM_SET_OPTIONS:
case CDROM_CLEAR_OPTIONS:
case CDROM_SELECT_SPEED:
case CDROM_SELECT_DISC:
case CDROM_MEDIA_CHANGED:
case CDROM_DRIVE_STATUS:
case CDROM_LOCKDOOR:
case CDROM_DEBUG:
break;
default:
/* unknown ioctl number */
return -ENOIOCTLCMD;
}
return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
/* Most of the generic ioctls are handled in the normal fallback path.
This assumes the blkdev's low level compat_ioctl always returns
ENOIOCTLCMD for unknown ioctls. */
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
int ret = -ENOIOCTLCMD;
struct inode *inode = file->f_mapping->host;
struct block_device *bdev = inode->i_bdev;
struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode;
struct backing_dev_info *bdi;
loff_t size;
unsigned int max_sectors;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
switch (cmd) {
case HDIO_GETGEO:
return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
case BLKPBSZGET:
return compat_put_uint(arg, bdev_physical_block_size(bdev));
case BLKIOMIN:
return compat_put_uint(arg, bdev_io_min(bdev));
case BLKIOOPT:
return compat_put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
case BLKSECDISCARD:
case BLKZEROOUT:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us
*/
case BLKRRPART:
return blkdev_ioctl(bdev, mode, cmd,
(unsigned long)compat_ptr(arg));
case BLKBSZSET_32:
return blkdev_ioctl(bdev, mode, BLKBSZSET,
(unsigned long)compat_ptr(arg));
case BLKPG:
return compat_blkpg_ioctl(bdev, mode, cmd, compat_ptr(arg));
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
return compat_put_long(arg,
(bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET: /* compatible */
return compat_put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
return compat_put_int(arg, block_size(bdev));
case BLKSSZGET: /* get block device hardware sector size */
return compat_put_int(arg, bdev_logical_block_size(bdev));
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return compat_put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return compat_put_ushort(arg,
!blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET: /* compatible, but no compat_ptr (!) */
case BLKFRASET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
return -EFBIG;
return compat_put_ulong(arg, size >> 9);
case BLKGETSIZE64_32:
return compat_put_u64(arg, i_size_read(bdev->bd_inode));
case BLKTRACESETUP32:
case BLKTRACESTART: /* compatible */
case BLKTRACESTOP: /* compatible */
case BLKTRACETEARDOWN: /* compatible */
ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
return ret;
default:
if (disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
if (ret == -ENOIOCTLCMD)
ret = compat_blkdev_driver_ioctl(bdev, mode, cmd, arg);
return ret;
}
}
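/*
 * Illustrative only -- the fallback above relies on the driver's
 * ->compat_ioctl() method returning -ENOIOCTLCMD for commands it does not
 * recognise, so that compat_blkdev_driver_ioctl() can try the generic
 * translation tables.  A hedged sketch; EXAMPLE_DRV_IOCTL and the example_*
 * name are assumptions, not part of this file.
 */
#define EXAMPLE_DRV_IOCTL	_IO(0xEE, 0x01)	/* hypothetical command */

static int example_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case EXAMPLE_DRV_IOCTL:
		/* commands the driver really implements are handled here */
		return 0;
	default:
		/* unknown: let the generic compat path take over */
		return -ENOIOCTLCMD;
	}
}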

476
block/deadline-iosched.c Normal file

@ -0,0 +1,476 @@
/*
* Deadline i/o scheduler.
*
* Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
/*
* See Documentation/block/deadline-iosched.txt
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
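/*
 * Illustrative example with the defaults above: once a request is picked,
 * up to fifo_batch=16 sector-contiguous requests are dispatched back to
 * back; while writes are waiting, reads win at most writes_starved=2 batch
 * selections before a write batch is forced; and when a new batch is
 * started, a read older than read_expire (500 ms) or a write older than
 * write_expire (5 s) makes dispatch restart from that FIFO's head.
 */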
struct deadline_data {
/*
* run time data
*/
/*
* requests (deadline_rq s) are present on both sort_list and fifo_list
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
/*
* next in sort order. read, write or both are NULL
*/
struct request *next_rq[2];
unsigned int batching; /* number of sequential requests made */
sector_t last_sector; /* head position */
unsigned int starved; /* times reads have starved writes */
/*
* settings that change how the i/o scheduler behaves
*/
int fifo_expire[2];
int fifo_batch;
int writes_starved;
int front_merges;
};
static void deadline_move_request(struct deadline_data *, struct request *);
static inline struct rb_root *
deadline_rb_root(struct deadline_data *dd, struct request *rq)
{
return &dd->sort_list[rq_data_dir(rq)];
}
/*
* get the request after `rq' in sector-sorted order
*/
static inline struct request *
deadline_latter_request(struct request *rq)
{
struct rb_node *node = rb_next(&rq->rb_node);
if (node)
return rb_entry_rq(node);
return NULL;
}
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
struct rb_root *root = deadline_rb_root(dd, rq);
elv_rb_add(root, rq);
}
static inline void
deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
{
const int data_dir = rq_data_dir(rq);
if (dd->next_rq[data_dir] == rq)
dd->next_rq[data_dir] = deadline_latter_request(rq);
elv_rb_del(deadline_rb_root(dd, rq), rq);
}
/*
* add rq to rbtree and fifo
*/
static void
deadline_add_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int data_dir = rq_data_dir(rq);
deadline_add_rq_rb(dd, rq);
/*
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
}
/*
* remove rq from rbtree and fifo.
*/
static void deadline_remove_request(struct request_queue *q, struct request *rq)
{
struct deadline_data *dd = q->elevator->elevator_data;
rq_fifo_clear(rq);
deadline_del_rq_rb(dd, rq);
}
static int
deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
struct request *__rq;
int ret;
/*
* check for front merge
*/
if (dd->front_merges) {
sector_t sector = bio_end_sector(bio);
__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
if (__rq) {
BUG_ON(sector != blk_rq_pos(__rq));
if (elv_rq_merge_ok(__rq, bio)) {
ret = ELEVATOR_FRONT_MERGE;
goto out;
}
}
}
return ELEVATOR_NO_MERGE;
out:
*req = __rq;
return ret;
}
static void deadline_merged_request(struct request_queue *q,
struct request *req, int type)
{
struct deadline_data *dd = q->elevator->elevator_data;
/*
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
elv_rb_del(deadline_rb_root(dd, req), req);
deadline_add_rq_rb(dd, req);
}
}
static void
deadline_merged_requests(struct request_queue *q, struct request *req,
struct request *next)
{
/*
* if next expires before rq, assign its expire time to rq
* and move into next position (next will be deleted) in fifo
*/
if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
if (time_before(next->fifo_time, req->fifo_time)) {
list_move(&req->queuelist, &next->queuelist);
req->fifo_time = next->fifo_time;
}
}
/*
* kill knowledge of next, this one is a goner
*/
deadline_remove_request(q, next);
}
/*
* move request from sort list to dispatch queue.
*/
static inline void
deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
{
struct request_queue *q = rq->q;
deadline_remove_request(q, rq);
elv_dispatch_add_tail(q, rq);
}
/*
* move an entry to dispatch queue
*/
static void
deadline_move_request(struct deadline_data *dd, struct request *rq)
{
const int data_dir = rq_data_dir(rq);
dd->next_rq[READ] = NULL;
dd->next_rq[WRITE] = NULL;
dd->next_rq[data_dir] = deadline_latter_request(rq);
dd->last_sector = rq_end_sector(rq);
/*
* take it off the sort and fifo list, move
* to dispatch queue
*/
deadline_move_to_dispatch(dd, rq);
}
/*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
*/
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
{
struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
/*
* rq is expired!
*/
if (time_after_eq(jiffies, rq->fifo_time))
return 1;
return 0;
}
/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
*/
static int deadline_dispatch_requests(struct request_queue *q, int force)
{
struct deadline_data *dd = q->elevator->elevator_data;
const int reads = !list_empty(&dd->fifo_list[READ]);
const int writes = !list_empty(&dd->fifo_list[WRITE]);
struct request *rq;
int data_dir;
/*
* batches are currently reads XOR writes
*/
if (dd->next_rq[WRITE])
rq = dd->next_rq[WRITE];
else
rq = dd->next_rq[READ];
if (rq && dd->batching < dd->fifo_batch)
/* we have a next request and are still entitled to batch */
goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
if (reads) {
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
if (writes && (dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
data_dir = READ;
goto dispatch_find_request;
}
/*
* there are either no reads or writes have been starved
*/
if (writes) {
dispatch_writes:
BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
dd->starved = 0;
data_dir = WRITE;
goto dispatch_find_request;
}
return 0;
dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
} else {
/*
* The last req was the same dir and we have a next request in
* sort order. No expired requests so continue on from here.
*/
rq = dd->next_rq[data_dir];
}
dd->batching = 0;
dispatch_request:
/*
* rq is the selected appropriate request.
*/
dd->batching++;
deadline_move_request(dd, rq);
return 1;
}
static void deadline_exit_queue(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
kfree(dd);
}
/*
* initialize elevator private data (deadline_data).
*/
static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
if (!dd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = dd;
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
dd->sort_list[WRITE] = RB_ROOT;
dd->fifo_expire[READ] = read_expire;
dd->fifo_expire[WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1;
dd->fifo_batch = fifo_batch;
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
/*
* sysfs parts below
*/
static ssize_t
deadline_var_show(int var, char *page)
{
return sprintf(page, "%d\n", var);
}
static ssize_t
deadline_var_store(int *var, const char *page, size_t count)
{
char *p = (char *) page;
*var = simple_strtol(p, &p, 10);
return count;
}
#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data = __VAR; \
if (__CONV) \
__data = jiffies_to_msecs(__data); \
return deadline_var_show(__data, (page)); \
}
SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct deadline_data *dd = e->elevator_data; \
int __data; \
int ret = deadline_var_store(&__data, (page), count); \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX)) \
__data = (MAX); \
if (__CONV) \
*(__PTR) = msecs_to_jiffies(__data); \
else \
*(__PTR) = __data; \
return ret; \
}
STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
#undef STORE_FUNCTION
#define DD_ATTR(name) \
__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
deadline_##name##_store)
static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(read_expire),
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
DD_ATTR(front_merges),
DD_ATTR(fifo_batch),
__ATTR_NULL
};
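/*
 * Illustrative only: with this scheduler active, the attributes above show
 * up under /sys/block/<disk>/queue/iosched/, so for example
 *
 *   echo 200 > /sys/block/sda/queue/iosched/read_expire
 *
 * (sda being just an example device) sets the read deadline to 200 ms; the
 * STORE_FUNCTION wrappers above convert the value with msecs_to_jiffies().
 */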
static struct elevator_type iosched_deadline = {
.ops = {
.elevator_merge_fn = deadline_merge,
.elevator_merged_fn = deadline_merged_request,
.elevator_merge_req_fn = deadline_merged_requests,
.elevator_dispatch_fn = deadline_dispatch_requests,
.elevator_add_req_fn = deadline_add_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
.elevator_init_fn = deadline_init_queue,
.elevator_exit_fn = deadline_exit_queue,
},
.elevator_attrs = deadline_attrs,
.elevator_name = "deadline",
.elevator_owner = THIS_MODULE,
};
static int __init deadline_init(void)
{
return elv_register(&iosched_deadline);
}
static void __exit deadline_exit(void)
{
elv_unregister(&iosched_deadline);
}
module_init(deadline_init);
module_exit(deadline_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("deadline IO scheduler");

1051
block/elevator.c Normal file

File diff suppressed because it is too large

1888
block/genhd.c Normal file

File diff suppressed because it is too large

432
block/ioctl.c Normal file

@ -0,0 +1,432 @@
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
{
struct block_device *bdevp;
struct gendisk *disk;
struct hd_struct *part, *lpart;
struct blkpg_ioctl_arg a;
struct blkpg_partition p;
struct disk_part_iter piter;
long long start, length;
int partno;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&a, arg, sizeof(struct blkpg_ioctl_arg)))
return -EFAULT;
if (copy_from_user(&p, a.data, sizeof(struct blkpg_partition)))
return -EFAULT;
disk = bdev->bd_disk;
if (bdev != bdev->bd_contains)
return -EINVAL;
partno = p.pno;
if (partno <= 0)
return -EINVAL;
switch (a.op) {
case BLKPG_ADD_PARTITION:
start = p.start >> 9;
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0 || partno > 65535)
return -EINVAL;
}
mutex_lock(&bdev->bd_mutex);
/* overlap? */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter))) {
if (!(start + length <= part->start_sect ||
start >= part->start_sect + part->nr_sects)) {
disk_part_iter_exit(&piter);
mutex_unlock(&bdev->bd_mutex);
return -EBUSY;
}
}
disk_part_iter_exit(&piter);
/* all seems OK */
part = add_partition(disk, partno, start, length,
ADDPART_FLAG_NONE, NULL);
mutex_unlock(&bdev->bd_mutex);
return PTR_ERR_OR_ZERO(part);
case BLKPG_DEL_PARTITION:
part = disk_get_part(disk, partno);
if (!part)
return -ENXIO;
bdevp = bdget(part_devt(part));
disk_put_part(part);
if (!bdevp)
return -ENOMEM;
mutex_lock(&bdevp->bd_mutex);
if (bdevp->bd_openers) {
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return -EBUSY;
}
/* all seems OK */
fsync_bdev(bdevp);
invalidate_bdev(bdevp);
mutex_lock_nested(&bdev->bd_mutex, 1);
delete_partition(disk, partno);
mutex_unlock(&bdev->bd_mutex);
mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
return 0;
case BLKPG_RESIZE_PARTITION:
start = p.start >> 9;
/* new length of partition in bytes */
length = p.length >> 9;
/* check for fit in a hd_struct */
if (sizeof(sector_t) == sizeof(long) &&
sizeof(long long) > sizeof(long)) {
long pstart = start, plength = length;
if (pstart != start || plength != length
|| pstart < 0 || plength < 0)
return -EINVAL;
}
part = disk_get_part(disk, partno);
if (!part)
return -ENXIO;
bdevp = bdget(part_devt(part));
if (!bdevp) {
disk_put_part(part);
return -ENOMEM;
}
mutex_lock(&bdevp->bd_mutex);
mutex_lock_nested(&bdev->bd_mutex, 1);
if (start != part->start_sect) {
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EINVAL;
}
/* overlap? */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY);
while ((lpart = disk_part_iter_next(&piter))) {
if (lpart->partno != partno &&
!(start + length <= lpart->start_sect ||
start >= lpart->start_sect + lpart->nr_sects)
) {
disk_part_iter_exit(&piter);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return -EBUSY;
}
}
disk_part_iter_exit(&piter);
part_nr_sects_write(part, (sector_t)length);
i_size_write(bdevp->bd_inode, p.length);
mutex_unlock(&bdevp->bd_mutex);
mutex_unlock(&bdev->bd_mutex);
bdput(bdevp);
disk_put_part(part);
return 0;
default:
return -EINVAL;
}
}
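/*
 * Illustrative only -- a hedged *userspace* sketch of driving the BLKPG
 * handler above.  Note that blkpg_partition.start/.length are byte offsets
 * (the code above shifts them down to sectors); the example_* name and the
 * sizes passed in by the caller are assumptions, not part of this file.
 */
#if 0	/* userspace illustration, not kernel code */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkpg.h>

static int example_add_partition(const char *dev, int pno,
				 long long start, long long length)
{
	struct blkpg_partition part;
	struct blkpg_ioctl_arg arg;
	int fd, ret;

	memset(&part, 0, sizeof(part));
	part.pno = pno;
	part.start = start;		/* bytes */
	part.length = length;		/* bytes */

	memset(&arg, 0, sizeof(arg));
	arg.op = BLKPG_ADD_PARTITION;
	arg.datalen = sizeof(part);
	arg.data = &part;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		return -1;
	ret = ioctl(fd, BLKPG, &arg);
	close(fd);
	return ret;
}
#endif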
static int blkdev_reread_part(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
int res;
if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!mutex_trylock(&bdev->bd_mutex))
return -EBUSY;
res = rescan_partitions(disk, bdev);
mutex_unlock(&bdev->bd_mutex);
return res;
}
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
uint64_t len, int secure)
{
unsigned long flags = 0;
if (start & 511)
return -EINVAL;
if (len & 511)
return -EINVAL;
start >>= 9;
len >>= 9;
if (start + len > (i_size_read(bdev->bd_inode) >> 9))
return -EINVAL;
if (secure)
flags |= BLKDEV_DISCARD_SECURE;
printk("%s %d:%d %llu %llu", secure?"SECDIS":"DIS", MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev),
(unsigned long long)start, (unsigned long long)len);
return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags);
}
static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
uint64_t len)
{
if (start & 511)
return -EINVAL;
if (len & 511)
return -EINVAL;
start >>= 9;
len >>= 9;
if (start + len > (i_size_read(bdev->bd_inode) >> 9))
return -EINVAL;
return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
}
static int put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)arg);
}
static int put_int(unsigned long arg, int val)
{
return put_user(val, (int __user *)arg);
}
static int put_uint(unsigned long arg, unsigned int val)
{
return put_user(val, (unsigned int __user *)arg);
}
static int put_long(unsigned long arg, long val)
{
return put_user(val, (long __user *)arg);
}
static int put_ulong(unsigned long arg, unsigned long val)
{
return put_user(val, (unsigned long __user *)arg);
}
static int put_u64(unsigned long arg, u64 val)
{
return put_user(val, (u64 __user *)arg);
}
int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
if (disk->fops->ioctl)
return disk->fops->ioctl(bdev, mode, cmd, arg);
return -ENOTTY;
}
/*
* For the record: _GPL here is only because somebody decided to slap it
* on the previous export. Sheer idiocy, since it wasn't copyrightable
* at all and could be open-coded without any exports by anybody who cares.
*/
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
/*
* Is it an unrecognized ioctl? The correct returns are either
* ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
* fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
* code before returning.
*
* Confused drivers sometimes return EINVAL, which is wrong. It
* means "I understood the ioctl command, but the parameters to
* it were wrong".
*
* We should aim to just fix the broken drivers, the EINVAL case
* should go away.
*/
static inline int is_unrecognized_ioctl(int ret)
{
return ret == -EINVAL ||
ret == -ENOTTY ||
ret == -ENOIOCTLCMD;
}
/*
* always keep this in sync with compat_blkdev_ioctl()
*/
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
struct backing_dev_info *bdi;
loff_t size;
int ret, n;
unsigned int max_sectors;
switch(cmd) {
case BLKFLSBUF:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
if (!is_unrecognized_ioctl(ret))
return ret;
fsync_bdev(bdev);
invalidate_bdev(bdev);
return 0;
case BLKROSET:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
if (!is_unrecognized_ioctl(ret))
return ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (get_user(n, (int __user *)(arg)))
return -EFAULT;
set_device_ro(bdev, n);
return 0;
case BLKDISCARD:
case BLKSECDISCARD: {
uint64_t range[2];
if (!(mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
return blk_ioctl_discard(bdev, range[0], range[1],
cmd == BLKSECDISCARD);
}
case BLKZEROOUT: {
uint64_t range[2];
if (!(mode & FMODE_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
return blk_ioctl_zeroout(bdev, range[0], range[1]);
}
case HDIO_GETGEO: {
struct hd_geometry geo;
if (!arg)
return -EINVAL;
if (!disk->fops->getgeo)
return -ENOTTY;
/*
* We need to set the startsect first, the driver may
* want to override it.
*/
memset(&geo, 0, sizeof(geo));
geo.start = get_start_sect(bdev);
ret = disk->fops->getgeo(bdev, &geo);
if (ret)
return ret;
if (copy_to_user((struct hd_geometry __user *)arg, &geo,
sizeof(geo)))
return -EFAULT;
return 0;
}
case BLKRAGET:
case BLKFRAGET:
if (!arg)
return -EINVAL;
bdi = blk_get_backing_dev_info(bdev);
return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
return put_int(arg, block_size(bdev));
case BLKSSZGET: /* get block device logical block size */
return put_int(arg, bdev_logical_block_size(bdev));
case BLKPBSZGET: /* get block device physical block size */
return put_uint(arg, bdev_physical_block_size(bdev));
case BLKIOMIN:
return put_uint(arg, bdev_io_min(bdev));
case BLKIOOPT:
return put_uint(arg, bdev_io_opt(bdev));
case BLKALIGNOFF:
return put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
return put_ushort(arg, max_sectors);
case BLKROTATIONAL:
return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
bdi = blk_get_backing_dev_info(bdev);
bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
return 0;
case BLKBSZSET:
/* set the logical block size */
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!arg)
return -EINVAL;
if (get_user(n, (int __user *) arg))
return -EFAULT;
if (!(mode & FMODE_EXCL)) {
bdgrab(bdev);
if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
return -EBUSY;
}
ret = set_blocksize(bdev, n);
if (!(mode & FMODE_EXCL))
blkdev_put(bdev, mode | FMODE_EXCL);
return ret;
case BLKPG:
ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
break;
case BLKRRPART:
ret = blkdev_reread_part(bdev);
break;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
return -EFBIG;
return put_ulong(arg, size >> 9);
case BLKGETSIZE64:
return put_u64(arg, i_size_read(bdev->bd_inode));
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
break;
default:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
}
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_ioctl);

243
block/ioprio.c Normal file

@ -0,0 +1,243 @@
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimic getpriority/setpriority; see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting the BE scheduling class with prio 2 is done as follows:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.txt
*
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int data = IOPRIO_PRIO_DATA(ioprio);
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret;
switch (class) {
case IOPRIO_CLASS_RT:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* fall through, rt has prio field too */
case IOPRIO_CLASS_BE:
if (data >= IOPRIO_BE_NR || data < 0)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
if (data)
return -EINVAL;
break;
default:
return -EINVAL;
}
ret = -ESRCH;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!uid_valid(uid))
break;
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
if (!uid_eq(task_uid(p), uid))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
} while_each_thread(g, p);
free_uid:
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}
static int get_task_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
if (p->io_context)
ret = p->io_context->ioprio;
out:
return ret;
}
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
unsigned short aclass;
unsigned short bclass;
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
aclass = IOPRIO_PRIO_CLASS(aprio);
bclass = IOPRIO_PRIO_CLASS(bprio);
if (aclass == bclass)
return min(aprio, bprio);
if (aclass > bclass)
return bprio;
else
return aprio;
}
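/*
 * Worked example (illustrative): ioprio_best(RT|3, BE|1) returns RT|3,
 * because IOPRIO_CLASS_RT (1) outranks IOPRIO_CLASS_BE (2); within the same
 * class the numerically smaller value, i.e. the higher priority, wins.
 */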
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret = -ESRCH;
int tmpio;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = get_task_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
do_each_thread(g, p) {
if (!uid_eq(task_uid(p), user->uid))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_thread(g, p);
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}

124
block/noop-iosched.c Normal file

@ -0,0 +1,124 @@
/*
* elevator noop
*/
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
struct noop_data {
struct list_head queue;
};
static void noop_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
{
list_del_init(&next->queuelist);
}
static int noop_dispatch(struct request_queue *q, int force)
{
struct noop_data *nd = q->elevator->elevator_data;
if (!list_empty(&nd->queue)) {
struct request *rq;
rq = list_entry(nd->queue.next, struct request, queuelist);
list_del_init(&rq->queuelist);
elv_dispatch_sort(q, rq);
return 1;
}
return 0;
}
static void noop_add_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
list_add_tail(&rq->queuelist, &nd->queue);
}
static struct request *
noop_former_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
if (rq->queuelist.prev == &nd->queue)
return NULL;
return list_entry(rq->queuelist.prev, struct request, queuelist);
}
static struct request *
noop_latter_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
if (rq->queuelist.next == &nd->queue)
return NULL;
return list_entry(rq->queuelist.next, struct request, queuelist);
}
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
struct elevator_queue *eq;
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
spin_lock_irq(q->queue_lock);
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}
static void noop_exit_queue(struct elevator_queue *e)
{
struct noop_data *nd = e->elevator_data;
BUG_ON(!list_empty(&nd->queue));
kfree(nd);
}
static struct elevator_type elevator_noop = {
.ops = {
.elevator_merge_req_fn = noop_merged_requests,
.elevator_dispatch_fn = noop_dispatch,
.elevator_add_req_fn = noop_add_request,
.elevator_former_req_fn = noop_former_request,
.elevator_latter_req_fn = noop_latter_request,
.elevator_init_fn = noop_init_queue,
.elevator_exit_fn = noop_exit_queue,
},
.elevator_name = "noop",
.elevator_owner = THIS_MODULE,
};
static int __init noop_init(void)
{
return elv_register(&elevator_noop);
}
static void __exit noop_exit(void)
{
elv_unregister(&elevator_noop);
}
module_init(noop_init);
module_exit(noop_exit);
MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op IO scheduler");

582
block/partition-generic.c Normal file

@ -0,0 +1,582 @@
/*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*
* We now have independent partition support from the
* block drivers, which allows all the partition code to
* be grouped in one location and to be mostly self-contained.
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
#include <linux/blktrace_api.h>
#include "partitions/check.h"
#ifdef CONFIG_BLK_DEV_MD
extern void md_autodetect_dev(dev_t dev);
#endif
/*
* disk_name() is used by partition check code and the genhd driver.
* It formats the devicename of the indicated disk into
* the supplied buffer (of size at least 32), and returns
* a pointer to that same buffer (for convenience).
*/
char *disk_name(struct gendisk *hd, int partno, char *buf)
{
if (!partno)
snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
else
snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
return buf;
}
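/*
 * Illustrative only: for a disk named "sda" and partno 3, disk_name() above
 * yields "sda3"; for a name ending in a digit, such as "mmcblk0", it yields
 * "mmcblk0p3".
 */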
const char *bdevname(struct block_device *bdev, char *buf)
{
return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
}
EXPORT_SYMBOL(bdevname);
/*
* There's very little reason to use this, you should really
* have a struct block_device just about everywhere and use
* bdevname() instead.
*/
const char *__bdevname(dev_t dev, char *buffer)
{
scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)",
MAJOR(dev), MINOR(dev));
return buffer;
}
EXPORT_SYMBOL(__bdevname);
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->partno);
}
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
}
ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)part_nr_sects_read(p));
}
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->policy ? 1 : 0);
}
static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%u\n", p->discard_alignment);
}
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
int cpu;
cpu = part_stat_lock();
part_round_stats(cpu, p);
part_stat_unlock();
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
"%8u %8u %8u"
"\n",
part_stat_read(p, ios[READ]),
part_stat_read(p, merges[READ]),
(unsigned long long)part_stat_read(p, sectors[READ]),
jiffies_to_msecs(part_stat_read(p, ticks[READ])),
part_stat_read(p, ios[WRITE]),
part_stat_read(p, merges[WRITE]),
(unsigned long long)part_stat_read(p, sectors[WRITE]),
jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
part_in_flight(p),
jiffies_to_msecs(part_stat_read(p, io_ticks)),
jiffies_to_msecs(part_stat_read(p, time_in_queue)));
}
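/*
 * The eleven fields above are, in order: read I/Os, read merges,
 * sectors read, milliseconds spent reading, write I/Os, write merges,
 * sectors written, milliseconds spent writing, I/Os currently in
 * flight, time spent doing I/Os (ms) and weighted time spent doing
 * I/Os (ms) -- the same layout as the per-disk 'stat' attribute.
 */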
ssize_t part_inflight_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
atomic_read(&p->in_flight[1]));
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->make_it_fail);
}
ssize_t part_fail_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct hd_struct *p = dev_to_part(dev);
int i;
if (count > 0 && sscanf(buf, "%d", &i) > 0)
p->make_it_fail = (i == 0) ? 0 : 1;
return count;
}
#endif
static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
static struct device_attribute dev_attr_fail =
__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
#endif
static struct attribute *part_attrs[] = {
&dev_attr_partition.attr,
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_ro.attr,
&dev_attr_alignment_offset.attr,
&dev_attr_discard_alignment.attr,
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
NULL
};
static struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
static const struct attribute_group *part_attr_groups[] = {
&part_attr_group,
#ifdef CONFIG_BLK_DEV_IO_TRACE
&blk_trace_attr_group,
#endif
NULL
};
static void part_release(struct device *dev)
{
struct hd_struct *p = dev_to_part(dev);
blk_free_devt(dev->devt);
free_part_stats(p);
free_part_info(p);
kfree(p);
}
static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
{
struct hd_struct *part = dev_to_part(dev);
add_uevent_var(env, "PARTN=%u", part->partno);
if (part->info && part->info->volname[0])
add_uevent_var(env, "PARTNAME=%s", part->info->volname);
return 0;
}
struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
.uevent = part_uevent,
};
static void delete_partition_rcu_cb(struct rcu_head *head)
{
struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
part->start_sect = 0;
part->nr_sects = 0;
part_stat_set_all(part, 0);
put_device(part_to_dev(part));
}
void __delete_partition(struct hd_struct *part)
{
call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}
void delete_partition(struct gendisk *disk, int partno)
{
struct disk_part_tbl *ptbl = disk->part_tbl;
struct hd_struct *part;
if (partno >= ptbl->len)
return;
part = ptbl->part[partno];
if (!part)
return;
rcu_assign_pointer(ptbl->part[partno], NULL);
rcu_assign_pointer(ptbl->last_lookup, NULL);
kobject_put(part->holder_dir);
device_del(part_to_dev(part));
hd_struct_put(part);
}
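/*
 * Note: __delete_partition() defers the final put_device() through
 * call_rcu() because lookups such as disk_get_part() walk
 * ptbl->part[] and ptbl->last_lookup under rcu_read_lock(); clearing
 * the pointers here is safe only because the backing hd_struct
 * outlives the grace period.
 */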
static ssize_t whole_disk_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return 0;
}
static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
whole_disk_show, NULL);
struct hd_struct *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
struct hd_struct *p;
dev_t devt = MKDEV(0, 0);
struct device *ddev = disk_to_dev(disk);
struct device *pdev;
struct disk_part_tbl *ptbl;
const char *dname;
int err;
err = disk_expand_part_tbl(disk, partno);
if (err)
return ERR_PTR(err);
ptbl = disk->part_tbl;
if (ptbl->part[partno])
return ERR_PTR(-EBUSY);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return ERR_PTR(-EBUSY);
if (!init_part_stats(p)) {
err = -ENOMEM;
goto out_free;
}
seqcount_init(&p->nr_sects_seq);
pdev = part_to_dev(p);
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
p->discard_alignment =
queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
if (info) {
struct partition_meta_info *pinfo = alloc_part_info(disk);
if (!pinfo)
goto out_free_stats;
memcpy(pinfo, info, sizeof(*info));
p->info = pinfo;
}
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
dev_set_name(pdev, "%sp%d", dname, partno);
else
dev_set_name(pdev, "%s%d", dname, partno);
device_initialize(pdev);
pdev->class = &block_class;
pdev->type = &part_type;
pdev->parent = ddev;
err = blk_alloc_devt(p, &devt);
if (err)
goto out_free_info;
pdev->devt = devt;
/* delay uevent until 'holders' subdir is created */
dev_set_uevent_suppress(pdev, 1);
err = device_add(pdev);
if (err)
goto out_put;
err = -ENOMEM;
p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
if (!p->holder_dir)
goto out_del;
dev_set_uevent_suppress(pdev, 0);
if (flags & ADDPART_FLAG_WHOLEDISK) {
err = device_create_file(pdev, &dev_attr_whole_disk);
if (err)
goto out_del;
}
/* everything is up and running, commence */
rcu_assign_pointer(ptbl->part[partno], p);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
hd_ref_init(p);
return p;
out_free_info:
free_part_info(p);
out_free_stats:
free_part_stats(p);
out_free:
kfree(p);
return ERR_PTR(err);
out_del:
kobject_put(p->holder_dir);
device_del(pdev);
out_put:
put_device(pdev);
blk_free_devt(devt);
return ERR_PTR(err);
}
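/*
 * Minimal caller sketch (illustrative only; rescan_partitions() below
 * is the real user).  Here 'disk', 'partno', 'from' and 'size' stand
 * for values taken from a parsed partition table, flags is 0 and no
 * partition_meta_info is passed:
 *
 *	struct hd_struct *part;
 *
 *	part = add_partition(disk, partno, from, size, 0, NULL);
 *	if (IS_ERR(part))
 *		pr_err("p%d could not be added: %ld\n",
 *		       partno, -PTR_ERR(part));
 */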
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
if (bdops->unlock_native_capacity &&
!(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
printk(KERN_CONT "enabling native capacity\n");
bdops->unlock_native_capacity(disk);
disk->flags |= GENHD_FL_NATIVE_CAPACITY;
return true;
} else {
printk(KERN_CONT "truncated\n");
return false;
}
}
static int drop_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct disk_part_iter piter;
struct hd_struct *part;
int res;
if (bdev->bd_part_count)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
return res;
disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
while ((part = disk_part_iter_next(&piter)))
delete_partition(disk, part->partno);
disk_part_iter_exit(&piter);
return 0;
}
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
struct parsed_partitions *state = NULL;
struct hd_struct *part;
int p, highest, res;
rescan:
if (state && !IS_ERR(state)) {
free_partitions(state);
state = NULL;
}
res = drop_partitions(disk, bdev);
if (res)
return res;
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
if (IS_ERR(state)) {
/*
* I/O error reading the partition table. If any
* partition code tried to read beyond EOD, retry
* after unlocking native capacity.
*/
if (PTR_ERR(state) == -ENOSPC) {
printk(KERN_WARNING "%s: partition table beyond EOD, ",
disk->disk_name);
if (disk_unlock_native_capacity(disk))
goto rescan;
}
return -EIO;
}
/*
* If any partition code tried to read beyond EOD, try
* unlocking native capacity even if partition table is
* successfully read as we could be missing some partitions.
*/
if (state->access_beyond_eod) {
printk(KERN_WARNING
"%s: partition table partially beyond EOD, ",
disk->disk_name);
if (disk_unlock_native_capacity(disk))
goto rescan;
}
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
/* Detect the highest partition number and preallocate
* disk->part_tbl. This is an optimization and not strictly
* necessary.
*/
for (p = 1, highest = 0; p < state->limit; p++)
if (state->parts[p].size)
highest = p;
disk_expand_part_tbl(disk, highest);
/* add partitions */
for (p = 1; p < state->limit; p++) {
sector_t size, from;
struct partition_meta_info *info = NULL;
size = state->parts[p].size;
if (!size)
continue;
from = state->parts[p].from;
if (from >= get_capacity(disk)) {
printk(KERN_WARNING
"%s: p%d start %llu is beyond EOD, ",
disk->disk_name, p, (unsigned long long) from);
if (disk_unlock_native_capacity(disk))
goto rescan;
continue;
}
if (from + size > get_capacity(disk)) {
printk(KERN_WARNING
"%s: p%d size %llu extends beyond EOD, ",
disk->disk_name, p, (unsigned long long) size);
if (disk_unlock_native_capacity(disk)) {
/* free state and restart */
goto rescan;
} else {
/*
* we can not ignore partitions of broken tables
* created by for example camera firmware, but
* we limit them to the end of the disk to avoid
* creating invalid block devices
*/
size = get_capacity(disk) - from;
}
}
if (state->parts[p].has_info)
info = &state->parts[p].info;
part = add_partition(disk, p, from, size,
state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part)) {
printk(KERN_ERR " %s: p%d could not be added: %ld\n",
disk->disk_name, p, -PTR_ERR(part));
continue;
}
#ifdef CONFIG_BLK_DEV_MD
if (state->parts[p].flags & ADDPART_FLAG_RAID)
md_autodetect_dev(part_to_dev(part)->devt);
#endif
}
free_partitions(state);
return 0;
}
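/*
 * Note: rescan_partitions() is normally reached from the block device
 * open path or from the BLKRRPART ioctl; in either case the
 * bd_part_count check in drop_partitions() above ensures no opener is
 * still holding any of the partitions being replaced.
 */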
int invalidate_partitions(struct gendisk *disk, struct block_device *bdev)
{
int res;
if (!bdev->bd_invalidated)
return 0;
res = drop_partitions(disk, bdev);
if (res)
return res;
set_capacity(disk, 0);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
return 0;
}
unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
struct page *page;
page = read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
NULL);
if (!IS_ERR(page)) {
if (PageError(page))
goto fail;
p->v = page;
return (unsigned char *)page_address(page) + ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
fail:
page_cache_release(page);
}
p->v = NULL;
return NULL;
}
EXPORT_SYMBOL(read_dev_sector);
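/*
 * Offset arithmetic, worked through (illustrative): with 4 KiB pages
 * (PAGE_CACHE_SHIFT == 12) each page caches 2^(12-9) = 8 sectors, so
 * sector n lives in page n >> 3 at byte offset (n & 7) << 9.  For
 * n == 10 that is page index 1, byte offset 1024.
 */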

269
block/partitions/Kconfig Normal file
View file

@ -0,0 +1,269 @@
#
# Partition configuration
#
config PARTITION_ADVANCED
bool "Advanced partition selection"
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under an operating system running on a different
architecture than your Linux system.
Note that the answer to this question won't directly affect the
kernel: saying N will just cause the configurator to skip all
the questions about foreign partitioning schemes.
If unsure, say N.
config ACORN_PARTITION
bool "Acorn partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
help
Support hard disks partitioned under Acorn operating systems.
config ACORN_PARTITION_CUMANA
bool "Cumana partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using the Cumana interface on Acorn machines.
config ACORN_PARTITION_EESOX
bool "EESOX partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
config ACORN_PARTITION_ICS
bool "ICS partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using the ICS interface on Acorn machines.
config ACORN_PARTITION_ADFS
bool "Native filecore partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
systems and the Acorn Archimedes range of machines. If you say
`Y' here, Linux will support disk partitions created under ADFS.
config ACORN_PARTITION_POWERTEC
bool "PowerTec partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Support reading partition tables created on Acorn machines using
the PowerTec SCSI drive.
config ACORN_PARTITION_RISCIX
bool "RISCiX partition support" if PARTITION_ADVANCED
default y if ARCH_ACORN
depends on ACORN_PARTITION
help
Once upon a time, there was a native Unix port for the Acorn series
of machines called RISCiX. If you say 'Y' here, Linux will be able
to read disks partitioned under RISCiX.
config AIX_PARTITION
bool "AIX basic partition table support" if PARTITION_ADVANCED
help
Say Y here if you would like to be able to read the hard disk
partition table format used by IBM or Motorola PowerPC machines
running AIX. AIX actually uses a Logical Volume Manager, where
"logical volumes" can be spread across one or multiple disks,
but this driver works only for the simple case of partitions which
are contiguous.
Otherwise, say N.
config OSF_PARTITION
bool "Alpha OSF partition support" if PARTITION_ADVANCED
default y if ALPHA
help
Say Y here if you would like to use hard disks under Linux which
were partitioned on an Alpha machine.
config AMIGA_PARTITION
bool "Amiga partition table support" if PARTITION_ADVANCED
default y if (AMIGA || AFFS_FS=y)
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under AmigaOS.
config ATARI_PARTITION
bool "Atari partition table support" if PARTITION_ADVANCED
default y if ATARI
help
Say Y here if you would like to use hard disks under Linux which
were partitioned under the Atari OS.
config IBM_PARTITION
bool "IBM disk label and partition support"
depends on PARTITION_ADVANCED && S390
help
Say Y here if you would like to be able to read the hard disk
partition table format used by IBM DASD disks operating under CMS.
Otherwise, say N.
config MAC_PARTITION
bool "Macintosh partition map support" if PARTITION_ADVANCED
default y if (MAC || PPC_PMAC)
help
Say Y here if you would like to use hard disks under Linux which
were partitioned on a Macintosh.
config MSDOS_PARTITION
bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
default y
help
Say Y here.
config BSD_DISKLABEL
bool "BSD disklabel (FreeBSD partition tables) support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
FreeBSD uses its own hard disk partition scheme on your PC. It
requires only one entry in the primary partition table of your disk
and manages it similarly to DOS extended partitions, putting in its
first sector a new partition table in BSD disklabel format. Saying Y
here allows you to read these disklabels and further mount FreeBSD
partitions from within Linux if you have also said Y to "UFS
file system support", above. If you don't know what all this is
about, say N.
config MINIX_SUBPARTITION
bool "Minix subpartition support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
Minix 2.0.0/2.0.2 subpartition table support for Linux.
Say Y here if you want to mount and use Minix 2.0.0/2.0.2
subpartitions.
config SOLARIS_X86_PARTITION
bool "Solaris (x86) partition table support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
help
Like most systems, Solaris x86 uses its own hard disk partition
table format, incompatible with all others. Saying Y here allows you
to read these partition tables and further mount Solaris x86
partitions from within Linux if you have also said Y to "UFS
file system support", above.
config UNIXWARE_DISKLABEL
bool "Unixware slices support"
depends on PARTITION_ADVANCED && MSDOS_PARTITION
---help---
Like some systems, UnixWare uses its own slice table inside a
partition (VTOC - Virtual Table of Contents). Its format is
incompatible with all other OSes. Saying Y here allows you to read
VTOC and further mount UnixWare partitions read-only from within
Linux if you have also said Y to "UFS file system support" or
"System V and Coherent file system support", above.
This is mainly used to carry data from a UnixWare box to your
Linux box via a removable medium like magneto-optical, ZIP or
removable IDE drives. Note, however, that a good portable way to
transport files and directories between unixes (and even other
operating systems) is given by the tar program ("man tar" or
preferably "info tar").
If you don't know what all this is about, say N.
config LDM_PARTITION
bool "Windows Logical Disk Manager (Dynamic Disk) support"
depends on PARTITION_ADVANCED
---help---
Say Y here if you would like to use hard disks under Linux which
were partitioned using Windows 2000's/XP's or Vista's Logical Disk
Manager. They are also known as "Dynamic Disks".
Note that this driver only supports Dynamic Disks with a protective MBR
label, i.e. a DOS partition table. It does not yet support GPT-labelled
Dynamic Disks such as those that Vista can create.
Windows 2000 introduced the concept of Dynamic Disks to get around
the limitations of the PC's partitioning scheme. The Logical Disk
Manager allows the user to repartition a disk and create spanned,
mirrored, striped or RAID volumes, all without the need for
rebooting.
Normal partitions are now called Basic Disks under Windows 2000, XP,
and Vista.
For a fuller description read <file:Documentation/ldm.txt>.
If unsure, say N.
config LDM_DEBUG
bool "Windows LDM extra logging"
depends on LDM_PARTITION
help
Say Y here if you would like LDM to log verbosely. This could be
helpful if the driver doesn't work as expected and you'd like to
report a bug.
If unsure, say N.
config SGI_PARTITION
bool "SGI partition support" if PARTITION_ADVANCED
default y if DEFAULT_SGI_PARTITION
help
Say Y here if you would like to be able to read the hard disk
partition table format used by SGI machines.
config ULTRIX_PARTITION
bool "Ultrix partition table support" if PARTITION_ADVANCED
default y if MACH_DECSTATION
help
Say Y here if you would like to be able to read the hard disk
partition table format used by DEC (now Compaq) Ultrix machines.
Otherwise, say N.
config SUN_PARTITION
bool "Sun partition tables support" if PARTITION_ADVANCED
default y if (SPARC || SUN3 || SUN3X)
---help---
Like most systems, SunOS uses its own hard disk partition table
format, incompatible with all others. Saying Y here allows you to
read these partition tables and further mount SunOS partitions from
within Linux if you have also said Y to "UFS file system support",
above. This is mainly used to carry data from a SPARC under SunOS to
your Linux box via a removable medium like magneto-optical or ZIP
drives; note however that a good portable way to transport files and
directories between unixes (and even other operating systems) is
given by the tar program ("man tar" or preferably "info tar"). If
you don't know what all this is about, say N.
config KARMA_PARTITION
bool "Karma Partition support"
depends on PARTITION_ADVANCED
help
Say Y here if you would like to mount the Rio Karma MP3 player, as it
uses a proprietary partition table.
config EFI_PARTITION
bool "EFI GUID Partition support" if PARTITION_ADVANCED
default y
select CRC32
help
Say Y here if you would like to use hard disks under Linux which
were partitioned using EFI GPT.
config SYSV68_PARTITION
bool "SYSV68 partition table support" if PARTITION_ADVANCED
default y if VME
help
Say Y here if you would like to be able to read the hard disk
partition table format used by Motorola Delta machines (using
sysv68).
Otherwise, say N.
config CMDLINE_PARTITION
bool "Command line partition support" if PARTITION_ADVANCED
select BLK_CMDLINE_PARSER
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.

22
block/partitions/Makefile Normal file
View file

@ -0,0 +1,22 @@
#
# Makefile for the linux kernel.
#
obj-$(CONFIG_BLOCK) := check.o
obj-$(CONFIG_ACORN_PARTITION) += acorn.o
obj-$(CONFIG_AMIGA_PARTITION) += amiga.o
obj-$(CONFIG_ATARI_PARTITION) += atari.o
obj-$(CONFIG_AIX_PARTITION) += aix.o
obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o
obj-$(CONFIG_MAC_PARTITION) += mac.o
obj-$(CONFIG_LDM_PARTITION) += ldm.o
obj-$(CONFIG_MSDOS_PARTITION) += msdos.o
obj-$(CONFIG_OSF_PARTITION) += osf.o
obj-$(CONFIG_SGI_PARTITION) += sgi.o
obj-$(CONFIG_SUN_PARTITION) += sun.o
obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o
obj-$(CONFIG_IBM_PARTITION) += ibm.o
obj-$(CONFIG_EFI_PARTITION) += efi.o
obj-$(CONFIG_KARMA_PARTITION) += karma.o
obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o
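# A new table format would be wired up the same way, e.g. (hypothetical):
#   obj-$(CONFIG_FOO_PARTITION) += foo.o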

556
block/partitions/acorn.c Normal file
View file

@ -0,0 +1,556 @@
/*
* linux/fs/partitions/acorn.c
*
* Copyright (c) 1996-2000 Russell King.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Scan ADFS partitions on hard disk drives. Unfortunately, there
* isn't a standard for partitioning drives on Acorn machines, so
* every single manufacturer of SCSI and IDE cards created their own
* method.
*/
#include <linux/buffer_head.h>
#include <linux/adfs_fs.h>
#include "check.h"
#include "acorn.h"
/*
* Partition types. (Oh for reusability)
*/
#define PARTITION_RISCIX_MFM 1
#define PARTITION_RISCIX_SCSI 2
#define PARTITION_LINUX 9
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static struct adfs_discrecord *
adfs_partition(struct parsed_partitions *state, char *name, char *data,
unsigned long first_sector, int slot)
{
struct adfs_discrecord *dr;
unsigned int nr_sects;
if (adfs_checkbblk(data))
return NULL;
dr = (struct adfs_discrecord *)(data + 0x1c0);
if (dr->disc_size == 0 && dr->disc_size_high == 0)
return NULL;
nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
(le32_to_cpu(dr->disc_size) >> 9);
if (name) {
strlcat(state->pp_buf, " [", PAGE_SIZE);
strlcat(state->pp_buf, name, PAGE_SIZE);
strlcat(state->pp_buf, "]", PAGE_SIZE);
}
put_partition(state, slot, first_sector, nr_sects);
return dr;
}
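/*
 * Size arithmetic (illustrative): the ADFS disc record holds the disc
 * size in bytes split across disc_size_high:disc_size, so
 * (high << 23) | (low >> 9) converts it to 512-byte sectors.  For
 * example, disc_size == 0x20000000 (512 MiB) with disc_size_high == 0
 * gives nr_sects == 1048576.
 */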
#endif
#ifdef CONFIG_ACORN_PARTITION_RISCIX
struct riscix_part {
__le32 start;
__le32 length;
__le32 one;
char name[16];
};
struct riscix_record {
__le32 magic;
#define RISCIX_MAGIC cpu_to_le32(0x4a657320)
__le32 date;
struct riscix_part part[8];
};
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static int riscix_partition(struct parsed_partitions *state,
unsigned long first_sect, int slot,
unsigned long nr_sects)
{
Sector sect;
struct riscix_record *rr;
rr = read_part_sector(state, first_sect, &sect);
if (!rr)
return -1;
strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
if (rr->magic == RISCIX_MAGIC) {
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
int part;
strlcat(state->pp_buf, " <", PAGE_SIZE);
put_partition(state, slot++, first_sect, size);
for (part = 0; part < 8; part++) {
if (rr->part[part].one &&
memcmp(rr->part[part].name, "All\0", 4)) {
put_partition(state, slot++,
le32_to_cpu(rr->part[part].start),
le32_to_cpu(rr->part[part].length));
strlcat(state->pp_buf, "(", PAGE_SIZE);
strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
strlcat(state->pp_buf, ")", PAGE_SIZE);
}
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
} else {
put_partition(state, slot++, first_sect, nr_sects);
}
put_dev_sector(sect);
return slot;
}
#endif
#endif
#define LINUX_NATIVE_MAGIC 0xdeafa1de
#define LINUX_SWAP_MAGIC 0xdeafab1e
struct linux_part {
__le32 magic;
__le32 start_sect;
__le32 nr_sects;
};
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
static int linux_partition(struct parsed_partitions *state,
unsigned long first_sect, int slot,
unsigned long nr_sects)
{
Sector sect;
struct linux_part *linuxp;
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
put_partition(state, slot++, first_sect, size);
linuxp = read_part_sector(state, first_sect, &sect);
if (!linuxp)
return -1;
strlcat(state->pp_buf, " <", PAGE_SIZE);
while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
if (slot == state->limit)
break;
put_partition(state, slot++, first_sect +
le32_to_cpu(linuxp->start_sect),
le32_to_cpu(linuxp->nr_sects));
linuxp ++;
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
put_dev_sector(sect);
return slot;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_CUMANA
int adfspart_check_CUMANA(struct parsed_partitions *state)
{
unsigned long first_sector = 0;
unsigned int start_blk = 0;
Sector sect;
unsigned char *data;
char *name = "CUMANA/ADFS";
int first = 1;
int slot = 1;
/*
* Try Cumana style partitions - sector 6 contains ADFS boot block
* with pointer to next 'drive'.
*
* There are unknowns in this code - is the 'cylinder number' of the
* next partition relative to the start of this one - I'm assuming
* it is.
*
* Also, which ID did Cumana use?
*
* This is totally unfinished, and will require more work to get it
* going. Hence it is totally untested.
*/
do {
struct adfs_discrecord *dr;
unsigned int nr_sects;
data = read_part_sector(state, start_blk * 2 + 6, &sect);
if (!data)
return -1;
if (slot == state->limit)
break;
dr = adfs_partition(state, name, data, first_sector, slot++);
if (!dr)
break;
name = NULL;
nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) *
(dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) *
dr->secspertrack;
if (!nr_sects)
break;
first = 0;
first_sector += nr_sects;
start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9);
nr_sects = 0; /* hmm - should be partition size */
switch (data[0x1fc] & 15) {
case 0: /* No partition / ADFS? */
break;
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
/* RISCiX - we don't know how to find the next one. */
slot = riscix_partition(state, first_sector, slot,
nr_sects);
break;
#endif
case PARTITION_LINUX:
slot = linux_partition(state, first_sector, slot,
nr_sects);
break;
}
put_dev_sector(sect);
if (slot == -1)
return -1;
} while (1);
put_dev_sector(sect);
return first ? 0 : 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
/*
* Purpose: allocate ADFS partitions.
*
* Params : hd - pointer to gendisk structure to store partition info.
* dev - device number to access.
*
* Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok.
*
* Alloc : hda = whole drive
* hda1 = ADFS partition on first drive.
* hda2 = non-ADFS partition.
*/
int adfspart_check_ADFS(struct parsed_partitions *state)
{
unsigned long start_sect, nr_sects, sectscyl, heads;
Sector sect;
unsigned char *data;
struct adfs_discrecord *dr;
unsigned char id;
int slot = 1;
data = read_part_sector(state, 6, &sect);
if (!data)
return -1;
dr = adfs_partition(state, "ADFS", data, 0, slot++);
if (!dr) {
put_dev_sector(sect);
return 0;
}
heads = dr->heads + ((dr->lowsector >> 6) & 1);
sectscyl = dr->secspertrack * heads;
start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl;
id = data[0x1fc] & 15;
put_dev_sector(sect);
/*
* Work out start of non-adfs partition.
*/
nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
if (start_sect) {
switch (id) {
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
case PARTITION_RISCIX_MFM:
slot = riscix_partition(state, start_sect, slot,
nr_sects);
break;
#endif
case PARTITION_LINUX:
slot = linux_partition(state, start_sect, slot,
nr_sects);
break;
}
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_ICS
struct ics_part {
__le32 start;
__le32 size;
};
static int adfspart_check_ICSLinux(struct parsed_partitions *state,
unsigned long block)
{
Sector sect;
unsigned char *data = read_part_sector(state, block, &sect);
int result = 0;
if (data) {
if (memcmp(data, "LinuxPart", 9) == 0)
result = 1;
put_dev_sector(sect);
}
return result;
}
/*
* Check for a valid ICS partition using the checksum.
*/
static inline int valid_ics_sector(const unsigned char *data)
{
unsigned long sum;
int i;
for (i = 0, sum = 0x50617274; i < 508; i++)
sum += data[i];
sum -= le32_to_cpu(*(__le32 *)(&data[508]));
return sum == 0;
}
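/*
 * In other words (illustrative): the sector is accepted when the sum
 * of its first 508 bytes, seeded with the magic constant 0x50617274
 * ("Part" in ASCII), matches the little-endian 32-bit checksum stored
 * at offset 508.
 */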
/*
* Purpose: allocate ICS partitions.
* Params : hd - pointer to gendisk structure to store partition info.
* dev - device number to access.
* Returns: -1 on error, 0 for no ICS table, 1 for partitions ok.
* Alloc : hda = whole drive
* hda1 = ADFS partition 0 on first drive.
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
int adfspart_check_ICS(struct parsed_partitions *state)
{
const unsigned char *data;
const struct ics_part *p;
int slot;
Sector sect;
/*
* Try ICS style partitions - sector 0 contains partition info.
*/
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
if (!valid_ics_sector(data)) {
put_dev_sector(sect);
return 0;
}
strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
u32 start = le32_to_cpu(p->start);
s32 size = le32_to_cpu(p->size); /* yes, it's signed. */
if (slot == state->limit)
break;
/*
* Negative sizes tell the RISC OS ICS driver to ignore
* this partition - in effect it says that this does not
* contain an ADFS filesystem.
*/
if (size < 0) {
size = -size;
/*
* Our own extension - We use the first sector
* of the partition to identify what type this
* partition is. We must not make this visible
* to the filesystem.
*/
if (size > 1 && adfspart_check_ICSLinux(state, start)) {
start += 1;
size -= 1;
}
}
if (size)
put_partition(state, slot++, start, size);
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
struct ptec_part {
__le32 unused1;
__le32 unused2;
__le32 start;
__le32 size;
__le32 unused5;
char type[8];
};
static inline int valid_ptec_sector(const unsigned char *data)
{
unsigned char checksum = 0x2a;
int i;
/*
* If it looks like a PC/BIOS partition, then it
* probably isn't PowerTec.
*/
if (data[510] == 0x55 && data[511] == 0xaa)
return 0;
for (i = 0; i < 511; i++)
checksum += data[i];
return checksum == data[511];
}
/*
 * Purpose: allocate PowerTec partitions.
 * Params : hd - pointer to gendisk structure to store partition info.
 * dev - device number to access.
 * Returns: -1 on error, 0 for no PowerTec table, 1 for partitions ok.
 * Alloc : hda = whole drive
 * hda1 = ADFS partition 0 on first drive.
 * hda2 = ADFS partition 1 on first drive.
 * ..etc..
 */
int adfspart_check_POWERTEC(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
const struct ptec_part *p;
int slot = 1;
int i;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
if (!valid_ptec_sector(data)) {
put_dev_sector(sect);
return 0;
}
strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
u32 start = le32_to_cpu(p->start);
u32 size = le32_to_cpu(p->size);
if (size)
put_partition(state, slot++, start, size);
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
struct eesox_part {
char magic[6];
char name[10];
__le32 start;
__le32 unused6;
__le32 unused7;
__le32 unused8;
};
/*
* Guess who created this format?
*/
static const char eesox_name[] = {
'N', 'e', 'i', 'l', ' ',
'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' '
};
/*
* EESOX SCSI partition format.
*
* This is a goddamned awful partition format. We don't seem to store
* the size of the partition in this table, only the start addresses.
*
* There are two possibilities where the size comes from:
* 1. The individual ADFS boot block entries that are placed on the disk.
* 2. The start address of the next entry.
*/
int adfspart_check_EESOX(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
unsigned char buffer[256];
struct eesox_part *p;
sector_t start = 0;
int i, slot = 1;
data = read_part_sector(state, 7, &sect);
if (!data)
return -1;
/*
* "Decrypt" the partition table. God knows why...
*/
for (i = 0; i < 256; i++)
buffer[i] = data[i] ^ eesox_name[i & 15];
put_dev_sector(sect);
for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) {
sector_t next;
if (memcmp(p->magic, "Eesox", 6))
break;
next = le32_to_cpu(p->start);
if (i)
put_partition(state, slot++, start, next - start);
start = next;
}
if (i != 0) {
sector_t size;
size = get_capacity(state->bdev->bd_disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
return i ? 1 : 0;
}
#endif

14
block/partitions/acorn.h Normal file
View file

@ -0,0 +1,14 @@
/*
* linux/fs/partitions/acorn.h
*
* Copyright (C) 1996-2001 Russell King.
*
* I _hate_ this partitioning mess - why can't we have one defined
* format, and everyone stick to it?
*/
int adfspart_check_CUMANA(struct parsed_partitions *state);
int adfspart_check_ADFS(struct parsed_partitions *state);
int adfspart_check_ICS(struct parsed_partitions *state);
int adfspart_check_POWERTEC(struct parsed_partitions *state);
int adfspart_check_EESOX(struct parsed_partitions *state);

293
block/partitions/aix.c Normal file
View file

@ -0,0 +1,293 @@
/*
* fs/partitions/aix.c
*
* Copyright (C) 2012-2013 Philippe De Muyter <phdm@macqel.be>
*/
#include "check.h"
#include "aix.h"
struct lvm_rec {
char lvm_id[4]; /* "_LVM" */
char reserved4[16];
__be32 lvmarea_len;
__be32 vgda_len;
__be32 vgda_psn[2];
char reserved36[10];
__be16 pp_size; /* log2(pp_size) */
char reserved46[12];
__be16 version;
};
struct vgda {
__be32 secs;
__be32 usec;
char reserved8[16];
__be16 numlvs;
__be16 maxlvs;
__be16 pp_size;
__be16 numpvs;
__be16 total_vgdas;
__be16 vgda_size;
};
struct lvd {
__be16 lv_ix;
__be16 res2;
__be16 res4;
__be16 maxsize;
__be16 lv_state;
__be16 mirror;
__be16 mirror_policy;
__be16 num_lps;
__be16 res10[8];
};
struct lvname {
char name[64];
};
struct ppe {
__be16 lv_ix;
unsigned short res2;
unsigned short res4;
__be16 lp_ix;
unsigned short res8[12];
};
struct pvd {
char reserved0[16];
__be16 pp_count;
char reserved18[2];
__be32 psn_part1;
char reserved24[8];
struct ppe ppe[1016];
};
#define LVM_MAXLVS 256
/**
* last_lba(): return number of last logical block of device
* @bdev: block device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
return (bdev->bd_inode->i_size >> 9) - 1ULL;
}
/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state
* @lba
* @buffer
* @count
*
* Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
size_t count)
{
size_t totalreadcount = 0;
if (!buffer || lba + count / 512 > last_lba(state->bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
unsigned char *data = read_part_sector(state, lba++, &sect);
if (!data)
break;
if (copied > count)
copied = count;
memcpy(buffer, data, copied);
put_dev_sector(sect);
buffer += copied;
totalreadcount += copied;
count -= copied;
}
return totalreadcount;
}
/**
* alloc_pvd(): reads physical volume descriptor
* @state
* @lba
*
* Description: Returns pvd on success, NULL on error.
* Allocates space for pvd and fill it with disk blocks at @lba
* Notes: remember to free pvd when you're done!
*/
static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba)
{
size_t count = sizeof(struct pvd);
struct pvd *p;
p = kmalloc(count, GFP_KERNEL);
if (!p)
return NULL;
if (read_lba(state, lba, (u8 *) p, count) < count) {
kfree(p);
return NULL;
}
return p;
}
/**
* alloc_lvn(): reads logical volume names
* @state
* @lba
*
* Description: Returns lvn on success, NULL on error.
* Allocates space for lvn and fill it with disk blocks at @lba
* Notes: remember to free lvn when you're done!
*/
static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba)
{
size_t count = sizeof(struct lvname) * LVM_MAXLVS;
struct lvname *p;
p = kmalloc(count, GFP_KERNEL);
if (!p)
return NULL;
if (read_lba(state, lba, (u8 *) p, count) < count) {
kfree(p);
return NULL;
}
return p;
}
int aix_partition(struct parsed_partitions *state)
{
int ret = 0;
Sector sect;
unsigned char *d;
u32 pp_bytes_size;
u32 pp_blocks_size = 0;
u32 vgda_sector = 0;
u32 vgda_len = 0;
int numlvs = 0;
struct pvd *pvd;
struct lv_info {
unsigned short pps_per_lv;
unsigned short pps_found;
unsigned char lv_is_contiguous;
} *lvip;
struct lvname *n = NULL;
d = read_part_sector(state, 7, &sect);
if (d) {
struct lvm_rec *p = (struct lvm_rec *)d;
u16 lvm_version = be16_to_cpu(p->version);
char tmp[64];
if (lvm_version == 1) {
int pp_size_log2 = be16_to_cpu(p->pp_size);
pp_bytes_size = 1 << pp_size_log2;
pp_blocks_size = pp_bytes_size / 512;
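/* e.g. pp_size == 25 -> 2^25-byte (32 MiB) physical partitions,
 * i.e. 65536 512-byte blocks per PP (illustrative) */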
snprintf(tmp, sizeof(tmp),
" AIX LVM header version %u found\n",
lvm_version);
vgda_len = be32_to_cpu(p->vgda_len);
vgda_sector = be32_to_cpu(p->vgda_psn[0]);
} else {
snprintf(tmp, sizeof(tmp),
" unsupported AIX LVM version %d found\n",
lvm_version);
}
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
}
if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
struct vgda *p = (struct vgda *)d;
numlvs = be16_to_cpu(p->numlvs);
put_dev_sector(sect);
}
lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL);
if (!lvip)
return 0;
if (numlvs && (d = read_part_sector(state, vgda_sector + 1, &sect))) {
struct lvd *p = (struct lvd *)d;
int i;
n = alloc_lvn(state, vgda_sector + vgda_len - 33);
if (n) {
int foundlvs = 0;
for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) {
lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps);
if (lvip[i].pps_per_lv)
foundlvs += 1;
}
}
put_dev_sector(sect);
}
pvd = alloc_pvd(state, vgda_sector + 17);
if (pvd) {
int numpps = be16_to_cpu(pvd->pp_count);
int psn_part1 = be32_to_cpu(pvd->psn_part1);
int i;
int cur_lv_ix = -1;
int next_lp_ix = 1;
int lp_ix;
for (i = 0; i < numpps; i += 1) {
struct ppe *p = pvd->ppe + i;
unsigned int lv_ix;
lp_ix = be16_to_cpu(p->lp_ix);
if (!lp_ix) {
next_lp_ix = 1;
continue;
}
lv_ix = be16_to_cpu(p->lv_ix) - 1;
if (lv_ix >= state->limit) {
cur_lv_ix = -1;
continue;
}
lvip[lv_ix].pps_found += 1;
if (lp_ix == 1) {
cur_lv_ix = lv_ix;
next_lp_ix = 1;
} else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) {
next_lp_ix = 1;
continue;
}
if (lp_ix == lvip[lv_ix].pps_per_lv) {
char tmp[70];
put_partition(state, lv_ix + 1,
(i + 1 - lp_ix) * pp_blocks_size + psn_part1,
lvip[lv_ix].pps_per_lv * pp_blocks_size);
snprintf(tmp, sizeof(tmp), " <%s>\n",
n[lv_ix].name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
lvip[lv_ix].lv_is_contiguous = 1;
ret = 1;
next_lp_ix = 1;
} else
next_lp_ix += 1;
}
for (i = 0; i < state->limit; i += 1)
if (lvip[i].pps_found && !lvip[i].lv_is_contiguous)
pr_warn("partition %s (%u pp's found) is "
"not contiguous\n",
n[i].name, lvip[i].pps_found);
kfree(pvd);
}
kfree(n);
kfree(lvip);
return ret;
}

1
block/partitions/aix.h Normal file
View file

@ -0,0 +1 @@
extern int aix_partition(struct parsed_partitions *state);

141
block/partitions/amiga.c Normal file
View file

@ -0,0 +1,141 @@
/*
* fs/partitions/amiga.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#define pr_fmt(fmt) fmt
#include <linux/types.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
#include "amiga.h"
static __inline__ u32
checksum_block(__be32 *m, int size)
{
u32 sum = 0;
while (size--)
sum += be32_to_cpu(*m++);
return sum;
}
int amiga_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
int start_sect, nr_sects, blk, part, res = 0;
int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
char b[BDEVNAME_SIZE];
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
pr_err("Dev %s: unable to read RDB block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK))
continue;
rdb = (struct RigidDiskBlock *)data;
if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0)
break;
/* Try again with 0xdc..0xdf zeroed, Windows might have
* trashed it.
*/
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
blk);
break;
}
pr_err("Dev %s: RDB in block %d has bad checksum\n",
bdevname(state->bdev, b), blk);
}
/* blksize is blocks per 512 byte standard block */
blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
{
char tmp[7 + 10 + 1 + 1];
/* Be more informative */
snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
blk *= blksize; /* Read in terms partition table understands */
data = read_part_sector(state, blk, &sect);
if (!data) {
if (warn_no_part)
pr_err("Dev %s: unable to read partition block %d\n",
bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
pb = (struct PartitionBlock *)data;
blk = be32_to_cpu(pb->pb_Next);
if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION))
continue;
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
/* Tell Kernel about it */
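/*
 * pb_Environment[] indices follow the AmigaOS DosEnvec layout
 * (assumed here): [3] = surfaces, [4] = sectors per block,
 * [5] = blocks per track, [6] = reserved blocks, [9] = low cylinder,
 * [10] = high cylinder, [16] = dostype -- hence the
 * cylinder-range * surfaces * blocks-per-track products below.
 */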
nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
be32_to_cpu(pb->pb_Environment[9])) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
if (!nr_sects)
continue;
start_sect = be32_to_cpu(pb->pb_Environment[9]) *
be32_to_cpu(pb->pb_Environment[3]) *
be32_to_cpu(pb->pb_Environment[5]) *
blksize;
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
char dostype[4];
char tmp[42];
__be32 *dt = (__be32 *)dostype;
*dt = pb->pb_Environment[16];
if (dostype[3] < ' ')
snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2], dostype[3] + '@' );
else
snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
res = 1;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
rdb_done:
return res;
}

6
block/partitions/amiga.h Normal file
View file

@ -0,0 +1,6 @@
/*
* fs/partitions/amiga.h
*/
int amiga_partition(struct parsed_partitions *state);

149
block/partitions/atari.c Normal file
View file

@ -0,0 +1,149 @@
/*
* fs/partitions/atari.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include <linux/ctype.h>
#include "check.h"
#include "atari.h"
/* ++guenther: this should be settable by the user ("make config")?.
*/
#define ICD_PARTS
/* check if a partition entry looks valid -- Atari format is assumed if at
least one of the primary entries is ok this way */
#define VALID_PARTITION(pi,hdsiz) \
(((pi)->flg & 1) && \
isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \
be32_to_cpu((pi)->st) <= (hdsiz) && \
be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz))
static inline int OK_id(char *s)
{
return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 ||
memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 ||
memcmp (s, "RAW", 3) == 0 ;
}
int atari_partition(struct parsed_partitions *state)
{
Sector sect;
struct rootsector *rs;
struct partition_info *pi;
u32 extensect;
u32 hd_size;
int slot;
#ifdef ICD_PARTS
int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
#endif
rs = read_part_sector(state, 0, &sect);
if (!rs)
return -1;
/* Verify this is an Atari rootsector: */
hd_size = state->bdev->bd_inode->i_size >> 9;
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
!VALID_PARTITION(&rs->part[3], hd_size)) {
/*
* if there's no valid primary partition, assume that no Atari
* format partition table (there's no reliable magic or the like
* :-()
*/
put_dev_sector(sect);
return 0;
}
pi = &rs->part[0];
strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
struct rootsector *xrs;
Sector sect2;
ulong partsect;
if ( !(pi->flg & 1) )
continue;
/* active partition */
if (memcmp (pi->id, "XGM", 3) != 0) {
/* we don't care about other id's */
put_partition (state, slot, be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
continue;
}
/* extension partition */
#ifdef ICD_PARTS
part_fmt = 1;
#endif
strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
xrs = read_part_sector(state, partsect, &sect2);
if (!xrs) {
printk (" block %ld read failed\n", partsect);
put_dev_sector(sect);
return -1;
}
/* ++roman: sanity check: bit 0 of flg field must be set */
if (!(xrs->part[0].flg & 1)) {
printk( "\nFirst sub-partition in extended partition is not valid!\n" );
put_dev_sector(sect2);
break;
}
put_partition(state, slot,
partsect + be32_to_cpu(xrs->part[0].st),
be32_to_cpu(xrs->part[0].siz));
if (!(xrs->part[1].flg & 1)) {
/* end of linked partition list */
put_dev_sector(sect2);
break;
}
if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) {
printk("\nID of extended partition is not XGM!\n");
put_dev_sector(sect2);
break;
}
partsect = be32_to_cpu(xrs->part[1].st) + extensect;
put_dev_sector(sect2);
if (++slot == state->limit) {
printk( "\nMaximum number of partitions reached!\n" );
break;
}
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
}
#ifdef ICD_PARTS
if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
pi = &rs->icdpart[0];
/* sanity check: no ICD format if first partition invalid */
if (OK_id(pi->id)) {
strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
/* accept only GEM,BGM,RAW,LNX,SWP partitions */
if (!((pi->flg & 1) && OK_id(pi->id)))
continue;
part_fmt = 2;
put_partition (state, slot,
be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
}
}
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

36
block/partitions/atari.h Normal file
View file

@ -0,0 +1,36 @@
/*
* fs/partitions/atari.h
* Moved by Russell King from:
*
* linux/include/linux/atari_rootsec.h
* definitions for Atari Rootsector layout
* by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de)
*
* modified for ICD/Supra partitioning scheme restricted to at most 12
* partitions
* by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de)
*/
#include <linux/compiler.h>
struct partition_info
{
u8 flg; /* bit 0: active; bit 7: bootable */
char id[3]; /* "GEM", "BGM", "XGM", or other */
__be32 st; /* start of partition */
__be32 siz; /* length of partition */
};
struct rootsector
{
char unused[0x156]; /* room for boot code */
struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */
char unused2[0xc];
u32 hd_siz; /* size of disk in blocks */
struct partition_info part[4];
u32 bsl_st; /* start of bad sector list */
u32 bsl_cnt; /* length of bad sector list */
u16 checksum; /* checksum for bootable disks */
} __packed;
int atari_partition(struct parsed_partitions *state);

197
block/partitions/check.c Normal file
View file

@ -0,0 +1,197 @@
/*
* fs/partitions/check.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*
* We now have independent partition support from the
* block drivers, which allows all the partition code to
* be grouped in one location, and it to be mostly self
* contained.
*
* Added needed MAJORS for new pairs, {hdi,hdj}, {hdk,hdl}
*/
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
#include "check.h"
#include "acorn.h"
#include "amiga.h"
#include "atari.h"
#include "ldm.h"
#include "mac.h"
#include "msdos.h"
#include "osf.h"
#include "sgi.h"
#include "sun.h"
#include "ibm.h"
#include "ultrix.h"
#include "efi.h"
#include "karma.h"
#include "sysv68.h"
#include "cmdline.h"
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
static int (*check_part[])(struct parsed_partitions *) = {
/*
* Probe partition formats with tables at disk address 0
* that also have an ADFS boot block at 0xdc0.
*/
#ifdef CONFIG_ACORN_PARTITION_ICS
adfspart_check_ICS,
#endif
#ifdef CONFIG_ACORN_PARTITION_POWERTEC
adfspart_check_POWERTEC,
#endif
#ifdef CONFIG_ACORN_PARTITION_EESOX
adfspart_check_EESOX,
#endif
/*
* Now move on to formats that only have partition info at
* disk address 0xdc0. Since these may also have stale
* PC/BIOS partition tables, they need to come before
* the msdos entry.
*/
#ifdef CONFIG_ACORN_PARTITION_CUMANA
adfspart_check_CUMANA,
#endif
#ifdef CONFIG_ACORN_PARTITION_ADFS
adfspart_check_ADFS,
#endif
#ifdef CONFIG_CMDLINE_PARTITION
cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
efi_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
ldm_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
msdos_partition,
#endif
#ifdef CONFIG_OSF_PARTITION
osf_partition,
#endif
#ifdef CONFIG_SUN_PARTITION
sun_partition,
#endif
#ifdef CONFIG_AMIGA_PARTITION
amiga_partition,
#endif
#ifdef CONFIG_ATARI_PARTITION
atari_partition,
#endif
#ifdef CONFIG_MAC_PARTITION
mac_partition,
#endif
#ifdef CONFIG_ULTRIX_PARTITION
ultrix_partition,
#endif
#ifdef CONFIG_IBM_PARTITION
ibm_partition,
#endif
#ifdef CONFIG_KARMA_PARTITION
karma_partition,
#endif
#ifdef CONFIG_SYSV68_PARTITION
sysv68_partition,
#endif
NULL
};
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
struct parsed_partitions *state;
int nr;
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
return NULL;
nr = disk_max_parts(hd);
state->parts = vzalloc(nr * sizeof(state->parts[0]));
if (!state->parts) {
kfree(state);
return NULL;
}
state->limit = nr;
return state;
}
void free_partitions(struct parsed_partitions *state)
{
vfree(state->parts);
kfree(state);
}
struct parsed_partitions *
check_partition(struct gendisk *hd, struct block_device *bdev)
{
struct parsed_partitions *state;
int i, res, err;
state = allocate_partitions(hd);
if (!state)
return NULL;
state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf) {
free_partitions(state);
return NULL;
}
state->pp_buf[0] = '\0';
state->bdev = bdev;
disk_name(hd, 0, state->name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
i = res = err = 0;
while (!res && check_part[i]) {
memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
res = check_part[i++](state);
if (res < 0) {
/* We have hit an I/O error which we don't report now.
* But record it, and let the others do their job.
*/
err = res;
res = 0;
}
}
if (res > 0) {
printk(KERN_INFO "%s", state->pp_buf);
free_page((unsigned long)state->pp_buf);
return state;
}
if (state->access_beyond_eod)
err = -ENOSPC;
if (err)
/* The partition is unrecognized. So report I/O errors if there were any */
res = err;
if (!res)
strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
else if (warn_no_part)
strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
printk(KERN_INFO "%s", state->pp_buf);
free_page((unsigned long)state->pp_buf);
free_partitions(state);
return ERR_PTR(res);
}

54
block/partitions/check.h Normal file
View file

@ -0,0 +1,54 @@
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
/*
* add_gd_partition adds a partitions details to the devices partition
* description.
*/
struct parsed_partitions {
struct block_device *bdev;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
sector_t size;
int flags;
bool has_info;
struct partition_meta_info info;
} *parts;
int next;
int limit;
bool access_beyond_eod;
char *pp_buf;
};
void free_partitions(struct parsed_partitions *state);
struct parsed_partitions *
check_partition(struct gendisk *, struct block_device *);
static inline void *read_part_sector(struct parsed_partitions *state,
sector_t n, Sector *p)
{
if (n >= get_capacity(state->bdev->bd_disk)) {
state->access_beyond_eod = true;
return NULL;
}
return read_dev_sector(state->bdev, n, p);
}
static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
if (n < p->limit) {
char tmp[1 + BDEVNAME_SIZE + 10 + 1];
p->parts[n].from = from;
p->parts[n].size = size;
snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
strlcat(p->pp_buf, tmp, PAGE_SIZE);
}
}
extern int warn_no_part;
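/*
 * Typical parser shape (illustrative sketch; the real parsers under
 * block/partitions/ follow this pattern): read a sector, record any
 * partitions found, release the sector, and return 1 for "found",
 * 0 for "not our format" or -1 for an I/O error.  is_my_format() is a
 * hypothetical check standing in for the real signature test.
 *
 *	Sector sect;
 *	unsigned char *data = read_part_sector(state, 0, &sect);
 *
 *	if (!data)
 *		return -1;
 *	if (!is_my_format(data)) {
 *		put_dev_sector(sect);
 *		return 0;
 *	}
 *	put_partition(state, 1, start, size);
 *	strlcat(state->pp_buf, "\n", PAGE_SIZE);
 *	put_dev_sector(sect);
 *	return 1;
 */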

99
block/partitions/cmdline.c Normal file
View file

@ -0,0 +1,99 @@
/*
* Copyright (C) 2013 HUAWEI
* Author: Cai Zhiyong <caizhiyong@huawei.com>
*
* Read block device partition table from the command line.
* Typically used for fixed block (eMMC) embedded devices.
* It has no MBR, so saves storage space. Bootloader can be easily accessed
* by absolute address of data on the block device.
* Users can easily change the partition.
*
* The format for the command line is just like mtdparts.
*
* For further information, see "Documentation/block/cmdline-partition.txt"
*
*/
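/*
 * Command line format (sketch, following the mtdparts-style syntax
 * described in the documentation referenced above):
 *
 *	blkdevparts=<blkdev-def>[;<blkdev-def>]
 *	  <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
 *	    <partdef>  := <size>[@<offset>](part-name)
 *
 * e.g. blkdevparts=mmcblk0:1m(boot),512k(env),-(rootfs)
 * where "-" means "use the remaining space".
 */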
#include <linux/cmdline-parser.h>
#include "check.h"
#include "cmdline.h"
static char *cmdline;
static struct cmdline_parts *bdev_parts;
static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
{
int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
struct parsed_partitions *state = (struct parsed_partitions *)param;
if (slot >= state->limit)
return 1;
put_partition(state, slot, subpart->from >> 9,
subpart->size >> 9);
info = &state->parts[slot].info;
label_min = min_t(int, sizeof(info->volname) - 1,
sizeof(subpart->name));
strncpy(info->volname, subpart->name, label_min);
info->volname[label_min] = '\0';
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
state->parts[slot].has_info = true;
return 0;
}
static int __init cmdline_parts_setup(char *s)
{
cmdline = s;
return 1;
}
__setup("blkdevparts=", cmdline_parts_setup);
/*
* Purpose: allocate cmdline partitions.
* Returns:
* -1 if unable to read the partition table
* 0 if this isn't our partition table
* 1 if successful
*/
int cmdline_partition(struct parsed_partitions *state)
{
sector_t disk_size;
char bdev[BDEVNAME_SIZE];
struct cmdline_parts *parts;
if (cmdline) {
if (bdev_parts)
cmdline_parts_free(&bdev_parts);
if (cmdline_parts_parse(&bdev_parts, cmdline)) {
cmdline = NULL;
return -1;
}
cmdline = NULL;
}
if (!bdev_parts)
return 0;
bdevname(state->bdev, bdev);
parts = cmdline_parts_find(bdev_parts, bdev);
if (!parts)
return 0;
disk_size = get_capacity(state->bdev->bd_disk) << 9;
cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

2
block/partitions/cmdline.h Normal file
View file

@ -0,0 +1,2 @@
int cmdline_partition(struct parsed_partitions *state);

737
block/partitions/efi.c Normal file
View file

@ -0,0 +1,737 @@
/************************************************************
* EFI GUID Partition Table handling
*
* http://www.uefi.org/specs/
* http://www.intel.com/technology/efi/
*
* efi.[ch] by Matt Domsch <Matt_Domsch@dell.com>
* Copyright 2000,2001,2002,2004 Dell Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*
* TODO:
*
* Changelog:
* Mon August 5th, 2013 Davidlohr Bueso <davidlohr@hp.com>
* - detect hybrid MBRs, tighter pMBR checking & cleanups.
*
* Mon Nov 09 2004 Matt Domsch <Matt_Domsch@dell.com>
* - test for valid PMBR and valid PGPT before ever reading
* AGPT, allow override with 'gpt' kernel command line option.
* - check for first/last_usable_lba outside of size of disk
*
* Tue Mar 26 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Ported to 2.5.7-pre1 and 2.5.7-dj2
* - Applied patch to avoid fault in alternate header handling
* - cleaned up find_valid_gpt
* - On-disk structure and copy in memory is *always* LE now -
* swab fields as needed
* - remove print_gpt_header()
* - only use first max_p partition entries, to keep the kernel minor number
* and partition numbers tied.
*
* Mon Feb 04 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Removed __PRIPTR_PREFIX - not being used
*
* Mon Jan 14 2002 Matt Domsch <Matt_Domsch@dell.com>
* - Ported to 2.5.2-pre11 + library crc32 patch Linus applied
*
* Thu Dec 6 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Added compare_gpts().
* - moved le_efi_guid_to_cpus() back into this file. GPT is the only
* thing that keeps EFI GUIDs on disk.
* - Changed gpt structure names and members to be simpler and more Linux-like.
*
* Wed Oct 17 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck
*
* Wed Oct 10 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Changed function comments to DocBook style per Andreas Dilger suggestion.
*
* Mon Oct 08 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Change read_lba() to use the page cache per Al Viro's work.
* - print u64s properly on all architectures
* - fixed debug_printk(), now Dprintk()
*
* Mon Oct 01 2001 Matt Domsch <Matt_Domsch@dell.com>
* - Style cleanups
* - made most functions static
* - Endianness addition
* - remove test for second alternate header, as it's not per spec,
* and is unnecessary. There's now a method to read/write the last
* sector of an odd-sized disk from user space. No tools have ever
* been released which used this code, so it's effectively dead.
* - Per Asit Mallick of Intel, added a test for a valid PMBR.
* - Added kernel command line option 'gpt' to override valid PMBR test.
*
* Wed Jun 6 2001 Martin Wilck <Martin.Wilck@Fujitsu-Siemens.com>
* - added devfs volume UUID support (/dev/volumes/uuids) for
* mounting file systems by the partition GUID.
*
* Tue Dec 5 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Moved crc32() to linux/lib, added efi_crc32().
*
* Thu Nov 30 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Replaced Intel's CRC32 function with an equivalent
* non-license-restricted version.
*
* Wed Oct 25 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Fixed the last_lba() call to return the proper last block
*
* Thu Oct 12 2000 Matt Domsch <Matt_Domsch@dell.com>
* - Thanks to Andries Brouwer for his debugging assistance.
* - Code works, detects all the partitions.
*
************************************************************/
#include <linux/kernel.h>
#include <linux/crc32.h>
#include <linux/ctype.h>
#include <linux/math64.h>
#include <linux/slab.h>
#include "check.h"
#include "efi.h"
/* This allows a kernel command line option 'gpt' to override
* the test for invalid PMBR. Not __initdata because reloading
* the partition tables happens after init too.
*/
static int force_gpt;
static int __init
force_gpt_fn(char *str)
{
force_gpt = 1;
return 1;
}
__setup("gpt", force_gpt_fn);
/**
* efi_crc32() - EFI version of crc32 function
* @buf: buffer to calculate crc32 of
* @len: length of buf
*
* Description: Returns EFI-style CRC32 value for @buf
*
* This function uses the little endian Ethernet polynomial
* but seeds the function with ~0, and xor's with ~0 at the end.
* Note, the EFI Specification, v1.02, has a reference to
* Dr. Dobbs Journal, May 1994 (actually it's in May 1992).
*/
static inline u32
efi_crc32(const void *buf, unsigned long len)
{
return (crc32(~0L, buf, len) ^ ~0L);
}
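/*
 * Minimal sketch of the check performed in is_gpt_valid() below,
 * assuming @gpt points at a header just read from disk:
 *
 *   u32 expected = le32_to_cpu(gpt->header_crc32);
 *   gpt->header_crc32 = 0;
 *   if (efi_crc32(gpt, le32_to_cpu(gpt->header_size)) != expected)
 *           ... reject the header ...
 *
 * Because of the ~0 seed and final XOR, the result matches the common
 * zlib/IEEE 802.3 CRC-32 that the UEFI specification calls for.
 */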
/**
* last_lba(): return number of last logical block of device
* @bdev: block device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
return div_u64(bdev->bd_inode->i_size,
bdev_logical_block_size(bdev)) - 1ULL;
}
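/*
 * Worked example: a 1 GiB device with 512-byte logical blocks has
 * i_size == 1073741824, so last_lba() returns 1073741824 / 512 - 1 =
 * 2097151, the zero-based index of the final addressable block.
 */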
static inline int pmbr_part_valid(gpt_mbr_record *part)
{
if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT)
goto invalid;
/* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */
if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA)
goto invalid;
return GPT_MBR_PROTECTIVE;
invalid:
return 0;
}
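/*
 * For reference, one sketch of a protective MBR record this check
 * accepts:
 *
 *   .os_type      = 0xEE (EFI_PMBR_OSTYPE_EFI_GPT)
 *   .starting_lba = cpu_to_le32(1)   (the primary GPT header)
 *   .size_in_lba  = cpu_to_le32(min(disk sectors - 1, 0xFFFFFFFF))
 *
 * Only the first two fields are validated here; the size is merely
 * sanity-checked (a pr_debug, not a rejection) in is_pmbr_valid().
 */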
/**
* is_pmbr_valid(): test Protective MBR for validity
* @mbr: pointer to a legacy mbr structure
* @total_sectors: amount of sectors in the device
*
* Description: Checks for a valid protective or hybrid
* master boot record (MBR). The validity of a pMBR depends
* on all of the following properties:
* 1) MSDOS signature is in the last two bytes of the MBR
* 2) One partition of type 0xEE is found
*
* In addition, a hybrid MBR will have up to three additional
* primary partitions, which point to the same space that's
* marked out by up to three GPT partitions.
*
* Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or
* GPT_MBR_HYBRID depending on the device layout.
*/
static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors)
{
uint32_t sz = 0;
int i, part = 0, ret = 0; /* invalid by default */
if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE)
goto done;
for (i = 0; i < 4; i++) {
ret = pmbr_part_valid(&mbr->partition_record[i]);
if (ret == GPT_MBR_PROTECTIVE) {
part = i;
/*
* Ok, we at least know that there's a protective MBR,
* now check if there are other partition types for
* hybrid MBR.
*/
goto check_hybrid;
}
}
if (ret != GPT_MBR_PROTECTIVE)
goto done;
check_hybrid:
for (i = 0; i < 4; i++)
if ((mbr->partition_record[i].os_type !=
EFI_PMBR_OSTYPE_EFI_GPT) &&
(mbr->partition_record[i].os_type != 0x00))
ret = GPT_MBR_HYBRID;
/*
* Protective MBRs take up the lesser of the whole disk
* or 2 TiB (32bit LBA), ignoring the rest of the disk.
* Some partitioning programs, nonetheless, choose to set
* the size to the maximum 32-bit limitation, disregarding
* the disk size.
*
* Hybrid MBRs do not necessarily comply with this.
*
* Consider a bad value here to be a warning to support dd'ing
* an image from a smaller disk to a larger disk.
*/
if (ret == GPT_MBR_PROTECTIVE) {
sz = le32_to_cpu(mbr->partition_record[part].size_in_lba);
if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF)
pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n",
sz, min_t(uint32_t,
total_sectors - 1, 0xFFFFFFFF));
}
done:
return ret;
}
/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
* @buffer: destination buffer
* @count: bytes to read
*
* Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state,
u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
struct block_device *bdev = state->bdev;
sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
if (!buffer || lba > last_lba(bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
unsigned char *data = read_part_sector(state, n++, &sect);
if (!data)
break;
if (copied > count)
copied = count;
memcpy(buffer, data, copied);
put_dev_sector(sect);
buffer += copied;
totalreadcount += copied;
count -= copied;
}
return totalreadcount;
}
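/*
 * Example of the LBA-to-sector conversion above: with 4096-byte
 * logical blocks, bdev_logical_block_size() / 512 == 8, so lba 2 maps
 * to 512-byte sector 16 and the loop then copies one 512-byte chunk
 * per read_part_sector() call until @count is exhausted.
 */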
/**
* alloc_read_gpt_entries(): reads partition entries from disk
* @state: disk parsed partitions
* @gpt: GPT header
*
* Description: Returns ptes on success, NULL on error.
* Allocates space for PTEs based on information found in @gpt.
* Notes: remember to free pte when you're done!
*/
static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
gpt_header *gpt)
{
size_t count;
gpt_entry *pte;
if (!gpt)
return NULL;
count = le32_to_cpu(gpt->num_partition_entries) *
le32_to_cpu(gpt->sizeof_partition_entry);
if (!count)
return NULL;
pte = kmalloc(count, GFP_KERNEL);
if (!pte)
return NULL;
if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
(u8 *) pte, count) < count) {
kfree(pte);
pte = NULL;
return NULL;
}
return pte;
}
/**
* alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
* @state: disk parsed partitions
* @lba: the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
* and fills a GPT header starting at @lba from @state->bdev.
* Note: remember to free gpt when finished with it.
*/
static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
u64 lba)
{
gpt_header *gpt;
unsigned ssz = bdev_logical_block_size(state->bdev);
gpt = kmalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt = NULL;
return NULL;
}
return gpt;
}
/**
* is_gpt_valid() - tests one GPT header and PTEs for validity
* @state: disk parsed partitions
* @lba: logical block address of the GPT header to test
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
*/
static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
gpt_header **gpt, gpt_entry **ptes)
{
u32 crc, origcrc;
u64 lastlba;
if (!ptes)
return 0;
if (!(*gpt = alloc_read_gpt_header(state, lba)))
return 0;
/* Check the GUID Partition Table signature */
if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
pr_debug("GUID Partition Table Header signature is wrong:"
"%lld != %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->signature),
(unsigned long long)GPT_HEADER_SIGNATURE);
goto fail;
}
/* Check that the GUID Partition Table header size is not too large */
if (le32_to_cpu((*gpt)->header_size) >
bdev_logical_block_size(state->bdev)) {
pr_debug("GUID Partition Table Header size is too large: %u > %u\n",
le32_to_cpu((*gpt)->header_size),
bdev_logical_block_size(state->bdev));
goto fail;
}
/* Check that the GUID Partition Table header size is not too small */
if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) {
pr_debug("GUID Partition Table Header size is too small: %u < %zu\n",
le32_to_cpu((*gpt)->header_size),
sizeof(gpt_header));
goto fail;
}
/* Check the GUID Partition Table CRC */
origcrc = le32_to_cpu((*gpt)->header_crc32);
(*gpt)->header_crc32 = 0;
crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
if (crc != origcrc) {
pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
crc, origcrc);
goto fail;
}
(*gpt)->header_crc32 = cpu_to_le32(origcrc);
/* Check that the my_lba entry points to the LBA that contains
* the GUID Partition Table */
if (le64_to_cpu((*gpt)->my_lba) != lba) {
pr_debug("GPT my_lba incorrect: %lld != %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->my_lba),
(unsigned long long)lba);
goto fail;
}
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
lastlba = last_lba(state->bdev);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
(unsigned long long)lastlba);
goto fail;
}
if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
(unsigned long long)lastlba);
goto fail;
}
if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) {
pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba));
goto fail;
}
/* Check that sizeof_partition_entry has the correct value */
if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) {
pr_debug("GUID Partitition Entry Size check failed.\n");
goto fail;
}
if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
goto fail;
/* Check the GUID Partition Entry Array CRC */
crc = efi_crc32((const unsigned char *) (*ptes),
le32_to_cpu((*gpt)->num_partition_entries) *
le32_to_cpu((*gpt)->sizeof_partition_entry));
if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
pr_debug("GUID Partitition Entry Array CRC check failed.\n");
goto fail_ptes;
}
/* We're done, all's well */
return 1;
fail_ptes:
kfree(*ptes);
*ptes = NULL;
fail:
kfree(*gpt);
*gpt = NULL;
return 0;
}
/**
* is_pte_valid() - tests one PTE for validity
* @pte:pte to check
* @lastlba: last lba of the disk
*
* Description: returns 1 if valid, 0 on error.
*/
static inline int
is_pte_valid(const gpt_entry *pte, const u64 lastlba)
{
if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) ||
le64_to_cpu(pte->starting_lba) > lastlba ||
le64_to_cpu(pte->ending_lba) > lastlba)
return 0;
return 1;
}
/**
* compare_gpts() - Search disk for valid GPT headers and PTEs
* @pgpt: primary GPT header
* @agpt: alternate GPT header
* @lastlba: last LBA number
*
* Description: Returns nothing. Sanity checks pgpt and agpt fields
* and prints warnings on discrepancies.
*
*/
static void
compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
{
int error_found = 0;
if (!pgpt || !agpt)
return;
if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) {
pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->my_lba),
(unsigned long long)le64_to_cpu(agpt->alternate_lba));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) {
pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)le64_to_cpu(agpt->my_lba));
error_found++;
}
if (le64_to_cpu(pgpt->first_usable_lba) !=
le64_to_cpu(agpt->first_usable_lba)) {
pr_warn("GPT:first_usable_lbas don't match.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->first_usable_lba),
(unsigned long long)le64_to_cpu(agpt->first_usable_lba));
error_found++;
}
if (le64_to_cpu(pgpt->last_usable_lba) !=
le64_to_cpu(agpt->last_usable_lba)) {
pr_warn("GPT:last_usable_lbas don't match.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->last_usable_lba),
(unsigned long long)le64_to_cpu(agpt->last_usable_lba));
error_found++;
}
if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) {
pr_warn("GPT:disk_guids don't match.\n");
error_found++;
}
if (le32_to_cpu(pgpt->num_partition_entries) !=
le32_to_cpu(agpt->num_partition_entries)) {
pr_warn("GPT:num_partition_entries don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->num_partition_entries),
le32_to_cpu(agpt->num_partition_entries));
error_found++;
}
if (le32_to_cpu(pgpt->sizeof_partition_entry) !=
le32_to_cpu(agpt->sizeof_partition_entry)) {
pr_warn("GPT:sizeof_partition_entry values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->sizeof_partition_entry),
le32_to_cpu(agpt->sizeof_partition_entry));
error_found++;
}
if (le32_to_cpu(pgpt->partition_entry_array_crc32) !=
le32_to_cpu(agpt->partition_entry_array_crc32)) {
pr_warn("GPT:partition_entry_array_crc32 values don't match: "
"0x%x != 0x%x\n",
le32_to_cpu(pgpt->partition_entry_array_crc32),
le32_to_cpu(agpt->partition_entry_array_crc32));
error_found++;
}
if (le64_to_cpu(pgpt->alternate_lba) != lastlba) {
pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(pgpt->alternate_lba),
(unsigned long long)lastlba);
error_found++;
}
if (le64_to_cpu(agpt->my_lba) != lastlba) {
pr_warn("GPT:Alternate GPT header not at the end of the disk.\n");
pr_warn("GPT:%lld != %lld\n",
(unsigned long long)le64_to_cpu(agpt->my_lba),
(unsigned long long)lastlba);
error_found++;
}
if (error_found)
pr_warn("GPT: Use GNU Parted to correct GPT errors.\n");
return;
}
/**
* find_valid_gpt() - Search disk for valid GPT headers and PTEs
* @state: disk parsed partitions
* @gpt: GPT header ptr, filled on return.
* @ptes: PTEs ptr, filled on return.
*
* Description: Returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
* Validity depends on PMBR being valid (or being overridden by the
* 'gpt' kernel command line option) and finding either the Primary
* GPT header and PTEs valid, or the Alternate GPT header and PTEs
* valid. If the Primary GPT header is not valid, the Alternate GPT header
* is not checked unless the 'gpt' kernel command line option is passed.
* This protects against devices which misreport their size, and forces
* the user to decide to use the Alternate GPT.
*/
static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
gpt_entry **ptes)
{
int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
u64 lastlba;
if (!ptes)
return 0;
lastlba = last_lba(state->bdev);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
if (!legacymbr)
goto fail;
read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr));
good_pmbr = is_pmbr_valid(legacymbr, total_sectors);
kfree(legacymbr);
if (!good_pmbr)
goto fail;
pr_debug("Device has a %s MBR\n",
good_pmbr == GPT_MBR_PROTECTIVE ?
"protective" : "hybrid");
}
good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
&pgpt, &pptes);
if (good_pgpt)
good_agpt = is_gpt_valid(state,
le64_to_cpu(pgpt->alternate_lba),
&agpt, &aptes);
if (!good_agpt && force_gpt)
good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
goto fail;
compare_gpts(pgpt, agpt, lastlba);
/* The good cases */
if (good_pgpt) {
*gpt = pgpt;
*ptes = pptes;
kfree(agpt);
kfree(aptes);
if (!good_agpt)
pr_warn("Alternate GPT is invalid, using primary GPT.\n");
return 1;
}
else if (good_agpt) {
*gpt = agpt;
*ptes = aptes;
kfree(pgpt);
kfree(pptes);
pr_warn("Primary GPT is invalid, using alternate GPT.\n");
return 1;
}
fail:
kfree(pgpt);
kfree(agpt);
kfree(pptes);
kfree(aptes);
*gpt = NULL;
*ptes = NULL;
return 0;
}
/**
* efi_partition(struct parsed_partitions *state)
* @state: disk parsed partitions
*
* Description: called from check.c, if the disk contains GPT
* partitions, sets up partition entries in the kernel.
*
* If the first block on the disk is a legacy MBR,
* it will get handled by msdos_partition().
* If it's a Protective MBR, we'll handle it here.
*
* We do not create a Linux partition for GPT, but
* only for the actual data partitions.
* Returns:
* -1 if unable to read the partition table
* 0 if this isn't our partition table
* 1 if successful
*
*/
int efi_partition(struct parsed_partitions *state)
{
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
kfree(ptes);
return 0;
}
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
struct partition_meta_info *info;
unsigned label_count = 0;
unsigned label_max;
u64 start = le64_to_cpu(ptes[i].starting_lba);
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
/* If this is a RAID volume, tell md */
if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID))
state->parts[i + 1].flags = ADDPART_FLAG_RAID;
info = &state->parts[i + 1].info;
efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid);
/* Naively convert UTF16-LE to 7 bits. */
label_max = min(ARRAY_SIZE(info->volname) - 1,
ARRAY_SIZE(ptes[i].partition_name));
info->volname[label_max] = 0;
while (label_count < label_max) {
u8 c = ptes[i].partition_name[label_count] & 0xff;
if (c && !isprint(c))
c = '!';
info->volname[label_count] = c;
label_count++;
}
state->parts[i + 1].has_info = true;
}
kfree(ptes);
kfree(gpt);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
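/*
 * Example of the unit handling above: on a disk with 4096-byte logical
 * blocks, ssz == 8, so a GPT entry with starting_lba 256 and
 * ending_lba 511 becomes a partition at 512-byte sector 2048 with a
 * size of (511 - 256 + 1) * 8 = 2048 sectors, i.e. 1 MiB.
 */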

133
block/partitions/efi.h Normal file

@ -0,0 +1,133 @@
/************************************************************
* EFI GUID Partition Table
* Per Intel EFI Specification v1.02
* http://developer.intel.com/technology/efi/efi.htm
*
* By Matt Domsch <Matt_Domsch@dell.com> Fri Sep 22 22:15:56 CDT 2000
* Copyright 2000,2001 Dell Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
************************************************************/
#ifndef FS_PART_EFI_H_INCLUDED
#define FS_PART_EFI_H_INCLUDED
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/efi.h>
#include <linux/compiler.h>
#define MSDOS_MBR_SIGNATURE 0xaa55
#define EFI_PMBR_OSTYPE_EFI 0xEF
#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE
#define GPT_MBR_PROTECTIVE 1
#define GPT_MBR_HYBRID 2
#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL
#define GPT_HEADER_REVISION_V1 0x00010000
#define GPT_PRIMARY_PARTITION_TABLE_LBA 1
#define PARTITION_SYSTEM_GUID \
EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \
0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B)
#define LEGACY_MBR_PARTITION_GUID \
EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \
0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F)
#define PARTITION_MSFT_RESERVED_GUID \
EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \
0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE)
#define PARTITION_BASIC_DATA_GUID \
EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \
0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7)
#define PARTITION_LINUX_RAID_GUID \
EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \
0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e)
#define PARTITION_LINUX_SWAP_GUID \
EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \
0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f)
#define PARTITION_LINUX_LVM_GUID \
EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \
0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28)
typedef struct _gpt_header {
__le64 signature;
__le32 revision;
__le32 header_size;
__le32 header_crc32;
__le32 reserved1;
__le64 my_lba;
__le64 alternate_lba;
__le64 first_usable_lba;
__le64 last_usable_lba;
efi_guid_t disk_guid;
__le64 partition_entry_lba;
__le32 num_partition_entries;
__le32 sizeof_partition_entry;
__le32 partition_entry_array_crc32;
/* The rest of the logical block is reserved by UEFI and must be zero.
* EFI standard handles this by:
*
* uint8_t reserved2[ BlockSize - 92 ];
*/
} __packed gpt_header;
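/*
 * The 92 above is the size of the fixed part of this structure:
 * 8+4+4+4+4 (signature..reserved1) + 4*8 (the four LBA fields) + 16
 * (disk_guid) + 8 (partition_entry_lba) + 3*4 = 92 bytes, so with
 * 512-byte logical blocks the implied reserved2 area is 420 bytes.
 */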
typedef struct _gpt_entry_attributes {
u64 required_to_function:1;
u64 reserved:47;
u64 type_guid_specific:16;
} __packed gpt_entry_attributes;
typedef struct _gpt_entry {
efi_guid_t partition_type_guid;
efi_guid_t unique_partition_guid;
__le64 starting_lba;
__le64 ending_lba;
gpt_entry_attributes attributes;
efi_char16_t partition_name[72 / sizeof (efi_char16_t)];
} __packed gpt_entry;
typedef struct _gpt_mbr_record {
u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */
u8 start_head; /* unused by EFI, pt start in CHS */
u8 start_sector; /* unused by EFI, pt start in CHS */
u8 start_track;
u8 os_type; /* EFI and legacy non-EFI OS types */
u8 end_head; /* unused by EFI, pt end in CHS */
u8 end_sector; /* unused by EFI, pt end in CHS */
u8 end_track; /* unused by EFI, pt end in CHS */
__le32 starting_lba; /* used by EFI - start addr of the on disk pt */
__le32 size_in_lba; /* used by EFI - size of pt in LBA */
} __packed gpt_mbr_record;
typedef struct _legacy_mbr {
u8 boot_code[440];
__le32 unique_mbr_signature;
__le16 unknown;
gpt_mbr_record partition_record[4];
__le16 signature;
} __packed legacy_mbr;
/* Functions */
extern int efi_partition(struct parsed_partitions *state);
#endif

364
block/partitions/ibm.c Normal file

@ -0,0 +1,364 @@
/*
* Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
* Volker Sameske <sameske@de.ibm.com>
* Bugreports.to..: <Linux390@de.ibm.com>
* Copyright IBM Corp. 1999, 2012
*/
#include <linux/buffer_head.h>
#include <linux/hdreg.h>
#include <linux/slab.h>
#include <asm/dasd.h>
#include <asm/ebcdic.h>
#include <asm/uaccess.h>
#include <asm/vtoc.h>
#include "check.h"
#include "ibm.h"
union label_t {
struct vtoc_volume_label_cdl vol;
struct vtoc_volume_label_ldl lnx;
struct vtoc_cms_label cms;
};
/*
* compute the block number from a
* cyl-cyl-head-head structure
*/
static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo)
{
sector_t cyl;
__u16 head;
/* decode cylinder and heads for large volumes */
cyl = ptr->hh & 0xFFF0;
cyl <<= 12;
cyl |= ptr->cc;
head = ptr->hh & 0x000F;
return cyl * geo->heads * geo->sectors +
head * geo->sectors;
}
/*
* compute the block number from a
* cyl-cyl-head-head-block structure
*/
static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
{
sector_t cyl;
__u16 head;
/* decode cylinder and heads for large volumes */
cyl = ptr->hh & 0xFFF0;
cyl <<= 12;
cyl |= ptr->cc;
head = ptr->hh & 0x000F;
return cyl * geo->heads * geo->sectors +
head * geo->sectors +
ptr->b;
}
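/*
 * Worked example for the decoding above, assuming a hypothetical
 * geometry of 15 heads and 12 sectors per track: cc = 2, hh = 0x0003
 * decodes to cylinder 2, head 3, so cchh2blk() yields
 * 2*15*12 + 3*12 = 396, and cchhb2blk() additionally adds the
 * block-in-track value b.
 */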
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
int blocksize,
sector_t *labelsect,
char name[],
char type[],
union label_t *label)
{
Sector sect;
unsigned char *data;
sector_t testsect[3];
unsigned char temp[5];
int found = 0;
int i, testcount;
/* There are three places where we may find a valid label:
* - on an ECKD disk it's block 2
* - on an FBA disk it's block 1
* - on a CMS-formatted FBA disk it is sector 1, even if the block size
* is larger than 512 bytes (possible if the DIAG discipline is used)
* If we have a valid info structure, then we know exactly which case we
* have; otherwise we just search through all possibilities.
*/
if (info) {
if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
(info->cu_type == 0x3880 && info->dev_type == 0x3370))
testsect[0] = info->label_block;
else
testsect[0] = info->label_block * (blocksize >> 9);
testcount = 1;
} else {
testsect[0] = 1;
testsect[1] = (blocksize >> 9);
testsect[2] = 2 * (blocksize >> 9);
testcount = 3;
}
for (i = 0; i < testcount; ++i) {
data = read_part_sector(state, testsect[i], &sect);
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
memcpy(temp, data, 4);
temp[4] = 0;
EBCASC(temp, 4);
put_dev_sector(sect);
if (!strcmp(temp, "VOL1") ||
!strcmp(temp, "LNX1") ||
!strcmp(temp, "CMS1")) {
if (!strcmp(temp, "VOL1")) {
strncpy(type, label->vol.vollbl, 4);
strncpy(name, label->vol.volid, 6);
} else {
strncpy(type, label->lnx.vollbl, 4);
strncpy(name, label->lnx.volid, 6);
}
EBCASC(type, 4);
EBCASC(name, 6);
*labelsect = testsect[i];
found = 1;
break;
}
}
if (!found)
memset(label, 0, sizeof(*label));
return found;
}
static int find_vol1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label)
{
sector_t blk;
int counter;
char tmp[64];
Sector sect;
unsigned char *data;
loff_t offset, size;
struct vtoc_format1_label f1;
int secperblk;
snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/*
* get start of VTOC from the disk label and then search for format1
* and format8 labels
*/
secperblk = blocksize >> 9;
blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
counter = 0;
data = read_part_sector(state, blk * secperblk, &sect);
while (data != NULL) {
memcpy(&f1, data, sizeof(struct vtoc_format1_label));
put_dev_sector(sect);
/* skip FMT4 / FMT5 / FMT7 labels */
if (f1.DS1FMTID == _ascebc['4']
|| f1.DS1FMTID == _ascebc['5']
|| f1.DS1FMTID == _ascebc['7']
|| f1.DS1FMTID == _ascebc['9']) {
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
continue;
}
/* only FMT1 and 8 labels valid at this point */
if (f1.DS1FMTID != _ascebc['1'] &&
f1.DS1FMTID != _ascebc['8'])
break;
/* OK, we got valid partition data */
offset = cchh2blk(&f1.DS1EXT1.llimit, geo);
size = cchh2blk(&f1.DS1EXT1.ulimit, geo) -
offset + geo->sectors;
offset *= secperblk;
size *= secperblk;
if (counter >= state->limit)
break;
put_partition(state, counter + 1, offset, size);
counter++;
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
if (!data)
return -1;
return 1;
}
static int find_lnx1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label,
sector_t labelsect,
loff_t i_size,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
char tmp[64];
int secperblk;
snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
secperblk = blocksize >> 9;
if (label->lnx.ldl_version == 0xf2) {
size = label->lnx.formatted_blocks * secperblk;
} else {
/*
* Formatted without large volume support. If the sanity check
* 'size based on geo == size based on i_size' is true, then
* we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information
* that we can only get from a real DASD device.
*/
geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk;
size = i_size >> 9;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
if (!strcmp(info->type, "ECKD"))
if (geo_size < size)
size = geo_size;
/* else keep size based on i_size */
}
}
/* first and only partition starts in the first block after the label */
offset = labelsect + secperblk;
put_partition(state, 1, offset, size - offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
static int find_cms1_partitions(struct parsed_partitions *state,
struct hd_geometry *geo,
int blocksize,
char name[],
union label_t *label,
sector_t labelsect)
{
loff_t offset, size;
char tmp[64];
int secperblk;
/*
* VM style CMS1 labeled disk
*/
blocksize = label->cms.block_size;
secperblk = blocksize >> 9;
if (label->cms.disk_offset != 0) {
snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/* disk is reserved minidisk */
offset = label->cms.disk_offset * secperblk;
size = (label->cms.block_count - 1) * secperblk;
} else {
snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/*
* Special case for FBA devices:
* If an FBA device is CMS formatted with blocksize > 512 byte
* and the DIAG discipline is used, then the CMS label is found
* in sector 1 instead of block 1. However, the partition is
* still supposed to start in block 2.
*/
if (labelsect == 1)
offset = 2 * secperblk;
else
offset = labelsect + secperblk;
size = label->cms.block_count * secperblk;
}
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}
/*
* This is the main function, called by check.c
*/
int ibm_partition(struct parsed_partitions *state)
{
struct block_device *bdev = state->bdev;
int blocksize, res;
loff_t i_size, offset, size;
dasd_information2_t *info;
struct hd_geometry *geo;
char type[5] = {0,};
char name[7] = {0,};
sector_t labelsect;
union label_t *label;
res = 0;
blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0)
goto out_exit;
i_size = i_size_read(bdev->bd_inode);
if (i_size == 0)
goto out_exit;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL)
goto out_exit;
geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL);
if (geo == NULL)
goto out_nogeo;
label = kmalloc(sizeof(union label_t), GFP_KERNEL);
if (label == NULL)
goto out_nolab;
if (ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
goto out_freeall;
if (ioctl_by_bdev(bdev, BIODASDINFO2, (unsigned long)info) != 0) {
kfree(info);
info = NULL;
}
if (find_label(state, info, geo, blocksize, &labelsect, name, type,
label)) {
if (!strncmp(type, "VOL1", 4)) {
res = find_vol1_partitions(state, geo, blocksize, name,
label);
} else if (!strncmp(type, "LNX1", 4)) {
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, i_size,
info);
} else if (!strncmp(type, "CMS1", 4)) {
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
}
} else if (info) {
/*
* ugly but needed for backward compatibility:
* If the block device is a DASD (i.e. BIODASDINFO2 works),
* then we claim it in any case, even though it has no valid
* label. If it has the LDL format, then we simply define a
* partition as if it had an LNX1 label.
*/
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
} else
res = 0;
out_freeall:
kfree(label);
out_nolab:
kfree(geo);
out_nogeo:
kfree(info);
out_exit:
return res;
}

1
block/partitions/ibm.h Normal file

@ -0,0 +1 @@
int ibm_partition(struct parsed_partitions *);

58
block/partitions/karma.c Normal file

@ -0,0 +1,58 @@
/*
* fs/partitions/karma.c
* Rio Karma partition info.
*
* Copyright (C) 2006 Bob Copeland (me@bobcopeland.com)
* based on osf.c
*/
#include "check.h"
#include "karma.h"
#include <linux/compiler.h>
int karma_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
Sector sect;
unsigned char *data;
struct disklabel {
u8 d_reserved[270];
struct d_partition {
__le32 p_res;
u8 p_fstype;
u8 p_res2[3];
__le32 p_offset;
__le32 p_size;
} d_partitions[2];
u8 d_blank[208];
__le16 d_magic;
} __packed *label;
struct d_partition *p;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
label = (struct disklabel *)data;
if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) {
put_dev_sector(sect);
return 0;
}
p = label->d_partitions;
for (i = 0 ; i < 2; i++, p++) {
if (slot == state->limit)
break;
if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) {
put_partition(state, slot, le32_to_cpu(p->p_offset),
le32_to_cpu(p->p_size));
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

8
block/partitions/karma.h Normal file

@ -0,0 +1,8 @@
/*
* fs/partitions/karma.h
*/
#define KARMA_LABEL_MAGIC 0xAB56
int karma_partition(struct parsed_partitions *state);

1567
block/partitions/ldm.c Normal file

File diff suppressed because it is too large

215
block/partitions/ldm.h Normal file

@ -0,0 +1,215 @@
/**
* ldm - Part of the Linux-NTFS project.
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
* Copyright (c) 2001-2007 Anton Altaparmakov
* Copyright (C) 2001,2002 Jakob Kemi <jakob.kemi@telia.com>
*
* Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program (in the main directory of the Linux-NTFS source
* in the file COPYING); if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _FS_PT_LDM_H_
#define _FS_PT_LDM_H_
#include <linux/types.h>
#include <linux/list.h>
#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
struct parsed_partitions;
/* Magic numbers in CPU format. */
#define MAGIC_VMDB 0x564D4442 /* VMDB */
#define MAGIC_VBLK 0x56424C4B /* VBLK */
#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */
#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */
/* The defined vblk types. */
#define VBLK_VOL5 0x51 /* Volume, version 5 */
#define VBLK_CMP3 0x32 /* Component, version 3 */
#define VBLK_PRT3 0x33 /* Partition, version 3 */
#define VBLK_DSK3 0x34 /* Disk, version 3 */
#define VBLK_DSK4 0x44 /* Disk, version 4 */
#define VBLK_DGR3 0x35 /* Disk Group, version 3 */
#define VBLK_DGR4 0x45 /* Disk Group, version 4 */
/* vblk flags indicating extra information will be present */
#define VBLK_FLAG_COMP_STRIPE 0x10
#define VBLK_FLAG_PART_INDEX 0x08
#define VBLK_FLAG_DGR3_IDS 0x08
#define VBLK_FLAG_DGR4_IDS 0x08
#define VBLK_FLAG_VOLU_ID1 0x08
#define VBLK_FLAG_VOLU_ID2 0x20
#define VBLK_FLAG_VOLU_SIZE 0x80
#define VBLK_FLAG_VOLU_DRIVE 0x02
/* size of a vblk's static parts */
#define VBLK_SIZE_HEAD 16
#define VBLK_SIZE_CMP3 22 /* Name and version */
#define VBLK_SIZE_DGR3 12
#define VBLK_SIZE_DGR4 44
#define VBLK_SIZE_DSK3 12
#define VBLK_SIZE_DSK4 45
#define VBLK_SIZE_PRT3 28
#define VBLK_SIZE_VOL5 58
/* component types */
#define COMP_STRIPE 0x01 /* Stripe-set */
#define COMP_BASIC 0x02 /* Basic disk */
#define COMP_RAID 0x03 /* Raid-set */
/* Other constants. */
#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */
#define OFF_PRIV1 6 /* Offset of the first privhead
relative to the start of the
device in sectors */
/* Offsets to structures within the LDM Database in sectors. */
#define OFF_PRIV2 1856 /* Backup private headers. */
#define OFF_PRIV3 2047
#define OFF_TOCB1 1 /* Tables of contents. */
#define OFF_TOCB2 2
#define OFF_TOCB3 2045
#define OFF_TOCB4 2046
#define OFF_VMDB 17 /* List of partitions. */
#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */
#define TOC_BITMAP1 "config" /* Names of the two defined */
#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
/* Borrowed from msdos.c */
#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
struct frag { /* VBLK Fragment handling */
struct list_head list;
u32 group;
u8 num; /* Total number of records */
u8 rec; /* This is record number n */
u8 map; /* Which portions are in use */
u8 data[0];
};
/* In memory LDM database structures. */
#define GUID_SIZE 16
struct privhead { /* Offsets and sizes are in sectors. */
u16 ver_major;
u16 ver_minor;
u64 logical_disk_start;
u64 logical_disk_size;
u64 config_start;
u64 config_size;
u8 disk_id[GUID_SIZE];
};
struct tocblock { /* We have exactly two bitmaps. */
u8 bitmap1_name[16];
u64 bitmap1_start;
u64 bitmap1_size;
u8 bitmap2_name[16];
u64 bitmap2_start;
u64 bitmap2_size;
};
struct vmdb { /* VMDB: The database header */
u16 ver_major;
u16 ver_minor;
u32 vblk_size;
u32 vblk_offset;
u32 last_vblk_seq;
};
struct vblk_comp { /* VBLK Component */
u8 state[16];
u64 parent_id;
u8 type;
u8 children;
u16 chunksize;
};
struct vblk_dgrp { /* VBLK Disk Group */
u8 disk_id[64];
};
struct vblk_disk { /* VBLK Disk */
u8 disk_id[GUID_SIZE];
u8 alt_name[128];
};
struct vblk_part { /* VBLK Partition */
u64 start;
u64 size; /* start, size and vol_off in sectors */
u64 volume_offset;
u64 parent_id;
u64 disk_id;
u8 partnum;
};
struct vblk_volu { /* VBLK Volume */
u8 volume_type[16];
u8 volume_state[16];
u8 guid[16];
u8 drive_hint[4];
u64 size;
u8 partition_type;
};
struct vblk_head { /* VBLK standard header */
u32 group;
u16 rec;
u16 nrec;
};
struct vblk { /* Generalised VBLK */
u8 name[64];
u64 obj_id;
u32 sequence;
u8 flags;
u8 type;
union {
struct vblk_comp comp;
struct vblk_dgrp dgrp;
struct vblk_disk disk;
struct vblk_part part;
struct vblk_volu volu;
} vblk;
struct list_head list;
};
struct ldmdb { /* Cache of the database */
struct privhead ph;
struct tocblock toc;
struct vmdb vm;
struct list_head v_dgrp;
struct list_head v_disk;
struct list_head v_volu;
struct list_head v_comp;
struct list_head v_part;
};
int ldm_partition(struct parsed_partitions *state);
#endif /* _FS_PT_LDM_H_ */

138
block/partitions/mac.c Normal file

@ -0,0 +1,138 @@
/*
* fs/partitions/mac.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include <linux/ctype.h>
#include "check.h"
#include "mac.h"
#ifdef CONFIG_PPC_PMAC
#include <asm/machdep.h>
extern void note_bootable_part(dev_t dev, int part, int goodness);
#endif
/*
* Code to understand MacOS partition tables.
*/
static inline void mac_fix_string(char *stg, int len)
{
int i;
for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
stg[i] = 0;
}
int mac_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
int slot, blocks_in_map;
unsigned secsize;
#ifdef CONFIG_PPC_PMAC
int found_root = 0;
int found_root_goodness = 0;
#endif
struct mac_partition *part;
struct mac_driver_desc *md;
/* Get 0th block and look at the first partition map entry. */
md = read_part_sector(state, 0, &sect);
if (!md)
return -1;
if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
put_dev_sector(sect);
return 0;
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
data = read_part_sector(state, secsize/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + secsize%512);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) {
put_dev_sector(sect);
return 0; /* not a MacOS disk */
}
blocks_in_map = be32_to_cpu(part->map_count);
if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
put_dev_sector(sect);
return 0;
}
if (blocks_in_map >= state->limit)
blocks_in_map = state->limit - 1;
strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
for (slot = 1; slot <= blocks_in_map; ++slot) {
int pos = slot * secsize;
put_dev_sector(sect);
data = read_part_sector(state, pos/512, &sect);
if (!data)
return -1;
part = (struct mac_partition *) (data + pos%512);
if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC)
break;
put_partition(state, slot,
be32_to_cpu(part->start_block) * (secsize/512),
be32_to_cpu(part->block_count) * (secsize/512));
if (!strncasecmp(part->type, "Linux_RAID", 10))
state->parts[slot].flags = ADDPART_FLAG_RAID;
#ifdef CONFIG_PPC_PMAC
/*
* If this is the first bootable partition, tell the
* setup code, in case it wants to make this the root.
*/
if (machine_is(powermac)) {
int goodness = 0;
mac_fix_string(part->processor, 16);
mac_fix_string(part->name, 32);
mac_fix_string(part->type, 32);
if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE)
&& strcasecmp(part->processor, "powerpc") == 0)
goodness++;
if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0
|| (strncasecmp(part->type, "Linux", 5) == 0
&& strcasecmp(part->type, "Linux_swap") != 0)) {
int i, l;
goodness++;
l = strlen(part->name);
if (strcmp(part->name, "/") == 0)
goodness++;
for (i = 0; i <= l - 4; ++i) {
if (strncasecmp(part->name + i, "root",
4) == 0) {
goodness += 2;
break;
}
}
if (strncasecmp(part->name, "swap", 4) == 0)
goodness--;
}
if (goodness > found_root_goodness) {
found_root = slot;
found_root_goodness = goodness;
}
}
#endif /* CONFIG_PPC_PMAC */
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
note_bootable_part(state->bdev->bd_dev, found_root,
found_root_goodness);
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
}

44
block/partitions/mac.h Normal file

@ -0,0 +1,44 @@
/*
* fs/partitions/mac.h
*/
#define MAC_PARTITION_MAGIC 0x504d
/* type field value for A/UX or other Unix partitions */
#define APPLE_AUX_TYPE "Apple_UNIX_SVR2"
struct mac_partition {
__be16 signature; /* expected to be MAC_PARTITION_MAGIC */
__be16 res1;
__be32 map_count; /* # blocks in partition map */
__be32 start_block; /* absolute starting block # of partition */
__be32 block_count; /* number of blocks in partition */
char name[32]; /* partition name */
char type[32]; /* string type description */
__be32 data_start; /* rel block # of first data block */
__be32 data_count; /* number of data blocks */
__be32 status; /* partition status bits */
__be32 boot_start;
__be32 boot_size;
__be32 boot_load;
__be32 boot_load2;
__be32 boot_entry;
__be32 boot_entry2;
__be32 boot_cksum;
char processor[16]; /* identifies ISA of boot */
/* there is more stuff after this that we don't need */
};
#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */
#define MAC_DRIVER_MAGIC 0x4552
/* Driver descriptor structure, in block 0 */
struct mac_driver_desc {
__be16 signature; /* expected to be MAC_DRIVER_MAGIC */
__be16 block_size;
__be32 block_count;
/* ... more stuff */
};
int mac_partition(struct parsed_partitions *state);

582
block/partitions/msdos.c Normal file

@ -0,0 +1,582 @@
/*
* fs/partitions/msdos.c
*
* Code extracted from drivers/block/genhd.c
* Copyright (C) 1991-1998 Linus Torvalds
*
* Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
* in the early extended-partition checks and added DM partitions
*
* Support for DiskManager v6.0x added by Mark Lord,
* with information provided by OnTrack. This now works for linux fdisk
* and LILO, as well as loadlin and bootln. Note that disks other than
* /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1).
*
* More flexible handling of extended partitions - aeb, 950831
*
* Check partition table on IDE disks for common CHS translations
*
* Re-organised Feb 1998 Russell King
*/
#include <linux/msdos_fs.h>
#include "check.h"
#include "msdos.h"
#include "efi.h"
#include "aix.h"
/*
* Many architectures don't like unaligned accesses, while
* the nr_sects and start_sect partition table entries are
* at a 2 (mod 4) address.
*/
#include <asm/unaligned.h>
#define SYS_IND(p) get_unaligned(&p->sys_ind)
static inline sector_t nr_sects(struct partition *p)
{
return (sector_t)get_unaligned_le32(&p->nr_sects);
}
static inline sector_t start_sect(struct partition *p)
{
return (sector_t)get_unaligned_le32(&p->start_sect);
}
static inline int is_extended_partition(struct partition *p)
{
return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
SYS_IND(p) == LINUX_EXTENDED_PARTITION);
}
#define MSDOS_LABEL_MAGIC1 0x55
#define MSDOS_LABEL_MAGIC2 0xAA
static inline int
msdos_magic_present(unsigned char *p)
{
return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2);
}
/* Value is EBCDIC 'IBMA' */
#define AIX_LABEL_MAGIC1 0xC9
#define AIX_LABEL_MAGIC2 0xC2
#define AIX_LABEL_MAGIC3 0xD4
#define AIX_LABEL_MAGIC4 0xC1
static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
{
struct partition *pt = (struct partition *) (p + 0x1be);
Sector sect;
unsigned char *d;
int slot, ret = 0;
if (!(p[0] == AIX_LABEL_MAGIC1 &&
p[1] == AIX_LABEL_MAGIC2 &&
p[2] == AIX_LABEL_MAGIC3 &&
p[3] == AIX_LABEL_MAGIC4))
return 0;
/* Assume the partition table is valid if Linux partitions exist */
for (slot = 1; slot <= 4; slot++, pt++) {
if (pt->sys_ind == LINUX_SWAP_PARTITION ||
pt->sys_ind == LINUX_RAID_PARTITION ||
pt->sys_ind == LINUX_DATA_PARTITION ||
pt->sys_ind == LINUX_LVM_PARTITION ||
is_extended_partition(pt))
return 0;
}
d = read_part_sector(state, 7, &sect);
if (d) {
if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
ret = 1;
put_dev_sector(sect);
}
return ret;
}
static void set_info(struct parsed_partitions *state, int slot,
u32 disksig)
{
struct partition_meta_info *info = &state->parts[slot].info;
snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig,
slot);
info->volname[0] = 0;
state->parts[slot].has_info = true;
}
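/*
 * Example of the resulting identifier: a disk signature of 0x0007b521
 * gives slot 1 the uuid string "0007b521-01", the same form used for
 * PARTUUID-style lookups of MBR partitions.
 */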
/*
* Create devices for each logical partition in an extended partition.
* The logical partitions form a linked list, with each entry being
* a partition table with two entries. The first entry
* is the real data partition (with a start relative to the partition
* table start). The second is a pointer to the next logical partition
* (with a start relative to the entire extended partition).
* We do not create a Linux partition for the partition tables, but
* only for the actual data partitions.
*/
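/*
 * Illustrative chain (hypothetical sector numbers): an extended
 * partition at sector 2048 holds an EBR whose first entry describes a
 * logical partition just after it and whose second entry links to the
 * next EBR, e.g. 2048 -> 206848 -> 411648, until an EBR without a
 * further link ends the list.
 */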
static void parse_extended(struct parsed_partitions *state,
sector_t first_sector, sector_t first_size,
u32 disksig)
{
struct partition *p;
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
this_sector = first_sector;
this_size = first_size;
while (1) {
if (++loopct > 100)
return;
if (state->next == state->limit)
return;
data = read_part_sector(state, this_sector, &sect);
if (!data)
return;
if (!msdos_magic_present(data + 510))
goto done;
p = (struct partition *) (data + 0x1be);
/*
* Usually, the first entry is the real data partition,
* the 2nd entry is the next extended partition, or empty,
* and the 3rd and 4th entries are unused.
* However, DRDOS sometimes has the extended partition as
* the first entry (when the data partition is empty),
* and OS/2 seems to use all four entries.
*/
/*
* First process the data partition(s)
*/
for (i = 0; i < 4; i++, p++) {
sector_t offs, size, next;
if (!nr_sects(p) || is_extended_partition(p))
continue;
/* Check the 3rd and 4th entries -
these sometimes contain random garbage */
offs = start_sect(p)*sector_size;
size = nr_sects(p)*sector_size;
next = this_sector + offs;
if (i >= 2) {
if (offs + size > this_size)
continue;
if (next < first_sector)
continue;
if (next + size > first_sector + first_size)
continue;
}
put_partition(state, state->next, next, size);
set_info(state, state->next, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[state->next].flags = ADDPART_FLAG_RAID;
loopct = 0;
if (++state->next == state->limit)
goto done;
}
/*
* Next, process the (first) extended partition, if present.
* (So far, there seems to be no reason to make
* parse_extended() recursive and allow a tree
* of extended partitions.)
* It should be a link to the next logical partition.
*/
p -= 4;
for (i = 0; i < 4; i++, p++)
if (nr_sects(p) && is_extended_partition(p))
break;
if (i == 4)
goto done; /* nothing left to do */
this_sector = first_sector + start_sect(p) * sector_size;
this_size = nr_sects(p) * sector_size;
put_dev_sector(sect);
}
done:
put_dev_sector(sect);
}
/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
indicates linux swap. Be careful before believing this is Solaris. */
static void parse_solaris_x86(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
struct solaris_x86_vtoc *v;
int i;
short max_nparts;
v = read_part_sector(state, offset + 1, &sect);
if (!v)
return;
if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
if (le32_to_cpu(v->v_version) != 1) {
char tmp[64];
snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
return;
}
/* Ensure we can handle previous case of VTOC with 8 entries gracefully */
max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i = 0; i < max_nparts && state->next < state->limit; i++) {
struct solaris_x86_slice *s = &v->v_slice[i];
char tmp[3 + 10 + 1 + 1];
if (s->s_size == 0)
continue;
snprintf(tmp, sizeof(tmp), " [s%d]", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
/* solaris partitions are relative to current MS-DOS
* one; must add the offset of the current partition */
put_partition(state, state->next++,
le32_to_cpu(s->s_start)+offset,
le32_to_cpu(s->s_size));
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
#endif
}
#if defined(CONFIG_BSD_DISKLABEL)
/*
* Create devices for BSD partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
static void parse_bsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin, char *flavour,
int max_partitions)
{
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
char tmp[64];
l = read_part_sector(state, offset + 1, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
put_dev_sector(sect);
return;
}
snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
sector_t bsd_start, bsd_size;
if (state->next == state->limit)
break;
if (p->p_fstype == BSD_FS_UNUSED)
continue;
bsd_start = le32_to_cpu(p->p_offset);
bsd_size = le32_to_cpu(p->p_size);
if (offset == bsd_start && size == bsd_size)
/* full parent partition, we have it already */
continue;
if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
continue;
}
put_partition(state, state->next++, bsd_start, bsd_size);
}
put_dev_sector(sect);
if (le16_to_cpu(l->d_npartitions) > max_partitions) {
snprintf(tmp, sizeof(tmp), " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
}
#endif
static void parse_freebsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
#endif
}
static void parse_netbsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
#endif
}
static void parse_openbsd(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
parse_bsd(state, offset, size, origin, "openbsd",
OPENBSD_MAXPARTITIONS);
#endif
}
/*
* Create devices for Unixware partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
static void parse_unixware(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
struct unixware_disklabel *l;
struct unixware_slice *p;
l = read_part_sector(state, offset + 29, &sect);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) {
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
p = &l->vtoc.v_slice[1];
/* I omit the 0th slice as it is the same as whole disk. */
while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
if (state->next == state->limit)
break;
if (p->s_label != UNIXWARE_FS_UNUSED)
put_partition(state, state->next++,
le32_to_cpu(p->start_sect),
le32_to_cpu(p->nr_sects));
p++;
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
#endif
}
/*
* Minix 2.0.0/2.0.2 subpartition support.
* Anand Krishnamurthy <anandk@wiproge.med.ge.com>
* Rajeev V. Pillai <rajeevvp@yahoo.com>
*/
static void parse_minix(struct parsed_partitions *state,
sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
unsigned char *data;
struct partition *p;
int i;
data = read_part_sector(state, offset, &sect);
if (!data)
return;
p = (struct partition *)(data + 0x1be);
/* The first sector of a Minix partition can have either
* a secondary MBR describing its subpartitions, or
* the normal boot sector. */
if (msdos_magic_present(data + 510) &&
SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
if (state->next == state->limit)
break;
/* add each partition in use */
if (SYS_IND(p) == MINIX_PARTITION)
put_partition(state, state->next++,
start_sect(p), nr_sects(p));
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
}
put_dev_sector(sect);
#endif /* CONFIG_MINIX_SUBPARTITION */
}
static struct {
unsigned char id;
void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
{OPENBSD_PARTITION, parse_openbsd},
{MINIX_PARTITION, parse_minix},
{UNIXWARE_PARTITION, parse_unixware},
{SOLARIS_X86_PARTITION, parse_solaris_x86},
{NEW_SOLARIS_X86_PARTITION, parse_solaris_x86},
{0, NULL},
};
int msdos_partition(struct parsed_partitions *state)
{
sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
struct fat_boot_sector *fb;
int slot;
u32 disksig;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
/*
* Note order! (some AIX disks, e.g. unbootable kind,
* have no MSDOS 55aa)
*/
if (aix_magic_present(state, data)) {
put_dev_sector(sect);
#ifdef CONFIG_AIX_PARTITION
return aix_partition(state);
#else
strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
return 0;
#endif
}
if (!msdos_magic_present(data + 510)) {
put_dev_sector(sect);
return 0;
}
/*
* Now that the 55aa signature is present, this is probably
* either the boot sector of a FAT filesystem or a DOS-type
* partition table. Reject it if the boot indicator is
* neither 0 nor 0x80.
*/
p = (struct partition *) (data + 0x1be);
for (slot = 1; slot <= 4; slot++, p++) {
if (p->boot_ind != 0 && p->boot_ind != 0x80) {
/*
* Even without a valid boot indicator value
* it's still possible this is a valid FAT filesystem
* without a partition table.
*/
fb = (struct fat_boot_sector *) data;
if (slot == 1 && fb->reserved && fb->fats
&& fat_valid_media(fb->media)) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
} else {
put_dev_sector(sect);
return 0;
}
}
}
#ifdef CONFIG_EFI_PARTITION
p = (struct partition *) (data + 0x1be);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
/* If this is an EFI GPT disk, msdos should ignore it. */
if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
put_dev_sector(sect);
return 0;
}
}
#endif
p = (struct partition *) (data + 0x1be);
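/* 32-bit disk signature, stored little-endian at offset 0x1b8 of the MBR
 * and passed to set_info() below. */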
disksig = le32_to_cpup((__le32 *)(data + 0x1b8));
/*
* Look for partitions in two passes:
* First find the primary and DOS-type extended partitions.
* On the second pass look inside *BSD, Unixware and Solaris partitions.
*/
state->next = 5;
for (slot = 1 ; slot <= 4 ; slot++, p++) {
sector_t start = start_sect(p)*sector_size;
sector_t size = nr_sects(p)*sector_size;
if (!size)
continue;
if (is_extended_partition(p)) {
/*
* prevent someone from doing mkfs or mkswap on an
* extended partition, but leave room for LILO.
* FIXME: this uses one logical sector for > 512b
* sectors, although it may not be enough/proper.
*/
sector_t n = 2;
n = min(size, max(sector_size, n));
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
continue;
}
put_partition(state, slot, start, size);
set_info(state, slot, disksig);
if (SYS_IND(p) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (SYS_IND(p) == DM6_PARTITION)
strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
if (SYS_IND(p) == EZD_PARTITION)
strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
/* second pass - output for each on a separate line */
p = (struct partition *) (0x1be + data);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
unsigned char id = SYS_IND(p);
int n;
if (!nr_sects(p))
continue;
for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
;
if (!subtypes[n].parse)
continue;
subtypes[n].parse(state, start_sect(p) * sector_size,
nr_sects(p) * sector_size, slot);
}
put_dev_sector(sect);
return 1;
}
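The layout that msdos_partition() walks is the classic MBR: 446 bytes of boot code, four 16-byte partition entries starting at offset 0x1be, the 32-bit disk signature at 0x1b8, and the 0x55AA magic at offset 510. As a hedged illustration (not part of the kernel sources), this minimal user-space sketch reads sector 0 and prints the same primary entries; the device path is an assumption and the field offsets simply mirror the structure accesses above.

/* Minimal user-space sketch: dump the four primary MBR entries.
 * Illustrative only; device path is an assumption. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned char sec[512];
	int fd = open(argc > 1 ? argv[1] : "/dev/sda", O_RDONLY);

	if (fd < 0 || read(fd, sec, sizeof(sec)) != sizeof(sec))
		return 1;
	if (sec[510] != 0x55 || sec[511] != 0xaa)	/* msdos_magic_present() */
		return 1;

	/* 32-bit disk signature at 0x1b8, little-endian */
	printf("disksig %02x%02x%02x%02x\n",
	       sec[0x1bb], sec[0x1ba], sec[0x1b9], sec[0x1b8]);

	for (int i = 0; i < 4; i++) {
		const unsigned char *p = sec + 0x1be + 16 * i;
		uint32_t start = p[8] | p[9] << 8 | p[10] << 16 | (uint32_t)p[11] << 24;
		uint32_t nsect = p[12] | p[13] << 8 | p[14] << 16 | (uint32_t)p[15] << 24;

		/* p[0] is the boot indicator, p[4] the system ID (SYS_IND) */
		printf("p%d: boot=%02x type=%02x start=%u sectors=%u\n",
		       i + 1, p[0], p[4], (unsigned)start, (unsigned)nsect);
	}
	close(fd);
	return 0;
}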

8
block/partitions/msdos.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/msdos.h
*/
#define MSDOS_LABEL_MAGIC 0xAA55
int msdos_partition(struct parsed_partitions *state);

86
block/partitions/osf.c Normal file
View file

@ -0,0 +1,86 @@
/*
* fs/partitions/osf.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include "check.h"
#include "osf.h"
#define MAX_OSF_PARTITIONS 18
int osf_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
unsigned int npartitions;
Sector sect;
unsigned char *data;
struct disklabel {
__le32 d_magic;
__le16 d_type,d_subtype;
u8 d_typename[16];
u8 d_packname[16];
__le32 d_secsize;
__le32 d_nsectors;
__le32 d_ntracks;
__le32 d_ncylinders;
__le32 d_secpercyl;
__le32 d_secprtunit;
__le16 d_sparespertrack;
__le16 d_sparespercyl;
__le32 d_acylinders;
__le16 d_rpm, d_interleave, d_trackskew, d_cylskew;
__le32 d_headswitch, d_trkseek, d_flags;
__le32 d_drivedata[5];
__le32 d_spare[5];
__le32 d_magic2;
__le16 d_checksum;
__le16 d_npartitions;
__le32 d_bbsize, d_sbsize;
struct d_partition {
__le32 p_size;
__le32 p_offset;
__le32 p_fsize;
u8 p_fstype;
u8 p_frag;
__le16 p_cpg;
} d_partitions[MAX_OSF_PARTITIONS];
} * label;
struct d_partition * partition;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
label = (struct disklabel *) (data+64);
partition = label->d_partitions;
if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) {
put_dev_sector(sect);
return 0;
}
if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) {
put_dev_sector(sect);
return 0;
}
npartitions = le16_to_cpu(label->d_npartitions);
if (npartitions > MAX_OSF_PARTITIONS) {
put_dev_sector(sect);
return 0;
}
for (i = 0 ; i < npartitions; i++, partition++) {
if (slot == state->limit)
break;
if (le32_to_cpu(partition->p_size))
put_partition(state, slot,
le32_to_cpu(partition->p_offset),
le32_to_cpu(partition->p_size));
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

7
block/partitions/osf.h Normal file
View file

@ -0,0 +1,7 @@
/*
* fs/partitions/osf.h
*/
#define DISKLABELMAGIC (0x82564557UL)
int osf_partition(struct parsed_partitions *state);

82
block/partitions/sgi.c Normal file
View file

@ -0,0 +1,82 @@
/*
* fs/partitions/sgi.c
*
* Code extracted from drivers/block/genhd.c
*/
#include "check.h"
#include "sgi.h"
struct sgi_disklabel {
__be32 magic_mushroom; /* Big fat spliff... */
__be16 root_part_num; /* Root partition number */
__be16 swap_part_num; /* Swap partition number */
s8 boot_file[16]; /* Name of boot file for ARCS */
u8 _unused0[48]; /* Device parameter useless crapola.. */
struct sgi_volume {
s8 name[8]; /* Name of volume */
__be32 block_num; /* Logical block number */
__be32 num_bytes; /* How big, in bytes */
} volume[15];
struct sgi_partition {
__be32 num_blocks; /* Size in logical blocks */
__be32 first_block; /* First logical block */
__be32 type; /* Type of this partition */
} partitions[16];
__be32 csum; /* Disk label checksum */
__be32 _unused1; /* Padding */
};
int sgi_partition(struct parsed_partitions *state)
{
int i, csum;
__be32 magic;
int slot = 1;
unsigned int start, blocks;
__be32 *ui, cs;
Sector sect;
struct sgi_disklabel *label;
struct sgi_partition *p;
char b[BDEVNAME_SIZE];
label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
p = &label->partitions[0];
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
/*printk("Dev %s SGI disklabel: bad magic %08x\n",
bdevname(bdev, b), be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
ui = ((__be32 *) (label + 1)) - 1;
for(csum = 0; ui >= ((__be32 *) label);) {
cs = *ui--;
csum += be32_to_cpu(cs);
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
/* All SGI disk labels have 16 partitions, but disks under Linux only
* have 15 minors. Luckily there are always a few zero-length
* partitions which we don't care about, so we never overflow the
* current_minor.
*/
for(i = 0; i < 16; i++, p++) {
blocks = be32_to_cpu(p->num_blocks);
start = be32_to_cpu(p->first_block);
if (blocks) {
put_partition(state, slot, start, blocks);
if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}
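The checksum test above simply requires that the 32-bit big-endian words of the whole 512-byte SGI label sum to zero. A small user-space sketch of the same check, assuming the label has already been read into a 512-byte buffer:

#include <stddef.h>
#include <stdint.h>

/* Returns non-zero when the 512-byte SGI label checksums to zero,
 * mirroring the summation loop in sgi_partition() above. */
static int sgi_label_csum_ok(const unsigned char *label512)
{
	uint32_t sum = 0;

	for (size_t i = 0; i < 512; i += 4)
		sum += (uint32_t)label512[i] << 24 | label512[i + 1] << 16 |
		       label512[i + 2] << 8  | label512[i + 3];
	return sum == 0;
}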

8
block/partitions/sgi.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/sgi.h
*/
extern int sgi_partition(struct parsed_partitions *state);
#define SGI_LABEL_MAGIC 0x0be5a941

122
block/partitions/sun.c Normal file
View file

@ -0,0 +1,122 @@
/*
* fs/partitions/sun.c
*
* Code extracted from drivers/block/genhd.c
*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
*/
#include "check.h"
#include "sun.h"
int sun_partition(struct parsed_partitions *state)
{
int i;
__be16 csum;
int slot = 1;
__be16 *ush;
Sector sect;
struct sun_disklabel {
unsigned char info[128]; /* Informative text string */
struct sun_vtoc {
__be32 version; /* Layout version */
char volume[8]; /* Volume name */
__be16 nparts; /* Number of partitions */
struct sun_info { /* Partition hdrs, sec 2 */
__be16 id;
__be16 flags;
} infos[8];
__be16 padding; /* Alignment padding */
__be32 bootinfo[3]; /* Info needed by mboot */
__be32 sanity; /* To verify vtoc sanity */
__be32 reserved[10]; /* Free space */
__be32 timestamp[8]; /* Partition timestamp */
} vtoc;
__be32 write_reinstruct; /* sectors to skip, writes */
__be32 read_reinstruct; /* sectors to skip, reads */
unsigned char spare[148]; /* Padding */
__be16 rspeed; /* Disk rotational speed */
__be16 pcylcount; /* Physical cylinder count */
__be16 sparecyl; /* extra sects per cylinder */
__be16 obs1; /* gap1 */
__be16 obs2; /* gap2 */
__be16 ilfact; /* Interleave factor */
__be16 ncyl; /* Data cylinder count */
__be16 nacyl; /* Alt. cylinder count */
__be16 ntrks; /* Tracks per cylinder */
__be16 nsect; /* Sectors per track */
__be16 obs3; /* bhead - Label head offset */
__be16 obs4; /* ppart - Physical Partition */
struct sun_partition {
__be32 start_cylinder;
__be32 num_sectors;
} partitions[8];
__be16 magic; /* Magic number */
__be16 csum; /* Label xor'd checksum */
} * label;
struct sun_partition *p;
unsigned long spc;
char b[BDEVNAME_SIZE];
int use_vtoc;
int nparts;
label = read_part_sector(state, 0, &sect);
if (!label)
return -1;
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
bdevname(bdev, b), be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
/* Look at the checksum */
ush = ((__be16 *) (label+1)) - 1;
for (csum = 0; ush >= ((__be16 *) label);)
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
/* Check to see if we can use the VTOC table */
use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) &&
(be32_to_cpu(label->vtoc.version) == 1) &&
(be16_to_cpu(label->vtoc.nparts) <= 8));
/* Use 8 partition entries if not specified in validated VTOC */
nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8;
/*
* So that old Linux-Sun partitions continue to work,
* allow the VTOC to be used under the additional condition ...
*/
use_vtoc = use_vtoc || !(label->vtoc.sanity ||
label->vtoc.version || label->vtoc.nparts);
spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect);
for (i = 0; i < nparts; i++, p++) {
unsigned long st_sector;
unsigned int num_sectors;
st_sector = be32_to_cpu(p->start_cylinder) * spc;
num_sectors = be32_to_cpu(p->num_sectors);
if (num_sectors) {
put_partition(state, slot, st_sector, num_sectors);
state->parts[slot].flags = 0;
if (use_vtoc) {
if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION)
state->parts[slot].flags |= ADDPART_FLAG_RAID;
else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK)
state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK;
}
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

8
block/partitions/sun.h Normal file
View file

@ -0,0 +1,8 @@
/*
* fs/partitions/sun.h
*/
#define SUN_LABEL_MAGIC 0xDABE
#define SUN_VTOC_SANITY 0x600DDEEE
int sun_partition(struct parsed_partitions *state);

95
block/partitions/sysv68.c Normal file
View file

@ -0,0 +1,95 @@
/*
* fs/partitions/sysv68.c
*
* Copyright (C) 2007 Philippe De Muyter <phdm@macqel.be>
*/
#include "check.h"
#include "sysv68.h"
/*
* Volume ID structure: on first 256-bytes sector of disk
*/
struct volumeid {
u8 vid_unused[248];
u8 vid_mac[8]; /* ASCII string "MOTOROLA" */
};
/*
* config block: second 256-bytes sector on disk
*/
struct dkconfig {
u8 ios_unused0[128];
__be32 ios_slcblk; /* Slice table block number */
__be16 ios_slccnt; /* Number of entries in slice table */
u8 ios_unused1[122];
};
/*
* combined volumeid and dkconfig block
*/
struct dkblk0 {
struct volumeid dk_vid;
struct dkconfig dk_ios;
};
/*
* Slice Table Structure
*/
struct slice {
__be32 nblocks; /* slice size (in blocks) */
__be32 blkoff; /* block offset of slice */
};
int sysv68_partition(struct parsed_partitions *state)
{
int i, slices;
int slot = 1;
Sector sect;
unsigned char *data;
struct dkblk0 *b;
struct slice *slice;
char tmp[64];
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
b = (struct dkblk0 *)data;
if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) {
put_dev_sector(sect);
return 0;
}
slices = be16_to_cpu(b->dk_ios.ios_slccnt);
i = be32_to_cpu(b->dk_ios.ios_slcblk);
put_dev_sector(sect);
data = read_part_sector(state, i, &sect);
if (!data)
return -1;
slices -= 1; /* last slice is the whole disk */
snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
slice = (struct slice *)data;
for (i = 0; i < slices; i++, slice++) {
if (slot == state->limit)
break;
if (be32_to_cpu(slice->nblocks)) {
put_partition(state, slot,
be32_to_cpu(slice->blkoff),
be32_to_cpu(slice->nblocks));
snprintf(tmp, sizeof(tmp), "(s%u)", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
put_dev_sector(sect);
return 1;
}

1
block/partitions/sysv68.h Normal file
View file

@ -0,0 +1 @@
extern int sysv68_partition(struct parsed_partitions *state);

48
block/partitions/ultrix.c Normal file
View file

@ -0,0 +1,48 @@
/*
* fs/partitions/ultrix.c
*
* Code extracted from drivers/block/genhd.c
*
* Re-organised Jul 1999 Russell King
*/
#include "check.h"
#include "ultrix.h"
int ultrix_partition(struct parsed_partitions *state)
{
int i;
Sector sect;
unsigned char *data;
struct ultrix_disklabel {
s32 pt_magic; /* magic no. indicating part. info exists */
s32 pt_valid; /* set by driver if pt is current */
struct pt_info {
s32 pi_nblocks; /* no. of sectors */
u32 pi_blkoff; /* block offset for start */
} pt_part[8];
} *label;
#define PT_MAGIC 0x032957 /* Partition magic number */
#define PT_VALID 1 /* Indicates if struct is valid */
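/* The label occupies the last sizeof(*label) bytes of the 16 KB boot
 * block, so read the sector containing that tail and pick the label up
 * at the end of the 512-byte buffer below. */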
data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
if (!data)
return -1;
label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label));
if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) {
for (i=0; i<8; i++)
if (label->pt_part[i].pi_nblocks)
put_partition(state, i+1,
label->pt_part[i].pi_blkoff,
label->pt_part[i].pi_nblocks);
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
return 1;
} else {
put_dev_sector(sect);
return 0;
}
}

5
block/partitions/ultrix.h Normal file
View file

@ -0,0 +1,5 @@
/*
* fs/partitions/ultrix.h
*/
int ultrix_partition(struct parsed_partitions *state);

761
block/scsi_ioctl.c Normal file
View file

@ -0,0 +1,761 @@
/*
* Copyright (C) 2001 Jens Axboe <axboe@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
*
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/cdrom.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
#include <linux/times.h>
#include <linux/uio.h>
#include <asm/uaccess.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
struct blk_cmd_filter {
unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
};
static struct blk_cmd_filter blk_default_cmd_filter;
/* Command group 3 is reserved and should never be used. */
const unsigned char scsi_command_size_tbl[8] =
{
6, 10, 10, 12,
16, 12, 10, 10
};
EXPORT_SYMBOL(scsi_command_size_tbl);
#include <scsi/sg.h>
static int sg_get_version(int __user *p)
{
static const int sg_version_num = 30527;
return put_user(sg_version_num, p);
}
static int scsi_get_idlun(struct request_queue *q, int __user *p)
{
return put_user(0, p);
}
static int scsi_get_bus(struct request_queue *q, int __user *p)
{
return put_user(0, p);
}
static int sg_get_timeout(struct request_queue *q)
{
return jiffies_to_clock_t(q->sg_timeout);
}
static int sg_set_timeout(struct request_queue *q, int __user *p)
{
int timeout, err = get_user(timeout, p);
if (!err)
q->sg_timeout = clock_t_to_jiffies(timeout);
return err;
}
static int max_sectors_bytes(struct request_queue *q)
{
unsigned int max_sectors = queue_max_sectors(q);
max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
return max_sectors << 9;
}
static int sg_get_reserved_size(struct request_queue *q, int __user *p)
{
int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
return put_user(val, p);
}
static int sg_set_reserved_size(struct request_queue *q, int __user *p)
{
int size, err = get_user(size, p);
if (err)
return err;
if (size < 0)
return -EINVAL;
q->sg_reserved_size = min(size, max_sectors_bytes(q));
return 0;
}
/*
* will always return that we are ATAPI even for a real SCSI drive; I'm not
* so sure this is worth doing anything about (why would you care?)
*/
static int sg_emulated_host(struct request_queue *q, int __user *p)
{
return put_user(1, p);
}
static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
{
/* Basic read-only commands */
__set_bit(TEST_UNIT_READY, filter->read_ok);
__set_bit(REQUEST_SENSE, filter->read_ok);
__set_bit(READ_6, filter->read_ok);
__set_bit(READ_10, filter->read_ok);
__set_bit(READ_12, filter->read_ok);
__set_bit(READ_16, filter->read_ok);
__set_bit(READ_BUFFER, filter->read_ok);
__set_bit(READ_DEFECT_DATA, filter->read_ok);
__set_bit(READ_CAPACITY, filter->read_ok);
__set_bit(READ_LONG, filter->read_ok);
__set_bit(INQUIRY, filter->read_ok);
__set_bit(MODE_SENSE, filter->read_ok);
__set_bit(MODE_SENSE_10, filter->read_ok);
__set_bit(LOG_SENSE, filter->read_ok);
__set_bit(START_STOP, filter->read_ok);
__set_bit(GPCMD_VERIFY_10, filter->read_ok);
__set_bit(VERIFY_16, filter->read_ok);
__set_bit(REPORT_LUNS, filter->read_ok);
__set_bit(SERVICE_ACTION_IN, filter->read_ok);
__set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
__set_bit(MAINTENANCE_IN, filter->read_ok);
__set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
/* Audio CD commands */
__set_bit(GPCMD_PLAY_CD, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
__set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
__set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
/* CD/DVD data reading */
__set_bit(GPCMD_READ_CD, filter->read_ok);
__set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
__set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
__set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
__set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
__set_bit(GPCMD_READ_HEADER, filter->read_ok);
__set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
__set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
__set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
__set_bit(GPCMD_REPORT_KEY, filter->read_ok);
__set_bit(GPCMD_SCAN, filter->read_ok);
__set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
__set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
__set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
__set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
__set_bit(GPCMD_SEEK, filter->read_ok);
__set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
/* Basic writing commands */
__set_bit(WRITE_6, filter->write_ok);
__set_bit(WRITE_10, filter->write_ok);
__set_bit(WRITE_VERIFY, filter->write_ok);
__set_bit(WRITE_12, filter->write_ok);
__set_bit(WRITE_VERIFY_12, filter->write_ok);
__set_bit(WRITE_16, filter->write_ok);
__set_bit(WRITE_LONG, filter->write_ok);
__set_bit(WRITE_LONG_2, filter->write_ok);
__set_bit(ERASE, filter->write_ok);
__set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
__set_bit(MODE_SELECT, filter->write_ok);
__set_bit(LOG_SELECT, filter->write_ok);
__set_bit(GPCMD_BLANK, filter->write_ok);
__set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
__set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
__set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
__set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
__set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
__set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
__set_bit(GPCMD_SEND_EVENT, filter->write_ok);
__set_bit(GPCMD_SEND_KEY, filter->write_ok);
__set_bit(GPCMD_SEND_OPC, filter->write_ok);
__set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
__set_bit(GPCMD_SET_SPEED, filter->write_ok);
__set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
__set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
__set_bit(GPCMD_SET_STREAMING, filter->write_ok);
__set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
}
int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
{
struct blk_cmd_filter *filter = &blk_default_cmd_filter;
/* root can do any command. */
if (capable(CAP_SYS_RAWIO))
return 0;
/* Anybody who can open the device can do a read-safe command */
if (test_bit(cmd[0], filter->read_ok))
return 0;
/* Write-safe commands require a writable open */
if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
return 0;
return -EPERM;
}
EXPORT_SYMBOL(blk_verify_command);
static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
struct sg_io_hdr *hdr, fmode_t mode)
{
if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
return -EFAULT;
if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
return -EPERM;
/*
* fill in request structure
*/
rq->cmd_len = hdr->cmd_len;
rq->timeout = msecs_to_jiffies(hdr->timeout);
if (!rq->timeout)
rq->timeout = q->sg_timeout;
if (!rq->timeout)
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
if (rq->timeout < BLK_MIN_SG_TIMEOUT)
rq->timeout = BLK_MIN_SG_TIMEOUT;
return 0;
}
static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
struct bio *bio)
{
int r, ret = 0;
/*
* fill in all the output members
*/
hdr->status = rq->errors & 0xff;
hdr->masked_status = status_byte(rq->errors);
hdr->msg_status = msg_byte(rq->errors);
hdr->host_status = host_byte(rq->errors);
hdr->driver_status = driver_byte(rq->errors);
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
hdr->resid = rq->resid_len;
hdr->sb_len_wr = 0;
if (rq->sense_len && hdr->sbp) {
int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
if (!copy_to_user(hdr->sbp, rq->sense, len))
hdr->sb_len_wr = len;
else
ret = -EFAULT;
}
r = blk_rq_unmap_user(bio);
if (!ret)
ret = r;
return ret;
}
static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
struct sg_io_hdr *hdr, fmode_t mode)
{
unsigned long start_time;
ssize_t ret = 0;
int writing = 0;
int at_head = 0;
struct request *rq;
char sense[SCSI_SENSE_BUFFERSIZE];
struct bio *bio;
if (hdr->interface_id != 'S')
return -EINVAL;
if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
return -EIO;
if (hdr->dxfer_len)
switch (hdr->dxfer_direction) {
default:
return -EINVAL;
case SG_DXFER_TO_DEV:
writing = 1;
break;
case SG_DXFER_TO_FROM_DEV:
case SG_DXFER_FROM_DEV:
break;
}
if (hdr->flags & SG_FLAG_Q_AT_HEAD)
at_head = 1;
ret = -ENOMEM;
rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
if (hdr->cmd_len > BLK_MAX_CDB) {
rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
if (!rq->cmd)
goto out_put_request;
}
ret = -EFAULT;
if (blk_fill_sghdr_rq(q, rq, hdr, mode))
goto out_free_cdb;
ret = 0;
if (hdr->iovec_count) {
size_t iov_data_len;
struct iovec *iov = NULL;
ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
0, NULL, &iov);
if (ret < 0) {
kfree(iov);
goto out_free_cdb;
}
iov_data_len = ret;
ret = 0;
/* SG_IO howto says that the shorter of the two wins */
if (hdr->dxfer_len < iov_data_len) {
hdr->iovec_count = iov_shorten(iov,
hdr->iovec_count,
hdr->dxfer_len);
iov_data_len = hdr->dxfer_len;
}
ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
hdr->iovec_count,
iov_data_len, GFP_KERNEL);
kfree(iov);
} else if (hdr->dxfer_len)
ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
GFP_KERNEL);
if (ret)
goto out_free_cdb;
bio = rq->bio;
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
rq->retries = 0;
start_time = jiffies;
/* ignore return value. All information is passed back to caller
* (if the caller doesn't check, that is their problem).
* N.B. a non-zero SCSI status is _not_ necessarily an error.
*/
blk_execute_rq(q, bd_disk, rq, at_head);
hdr->duration = jiffies_to_msecs(jiffies - start_time);
ret = blk_complete_sghdr_rq(rq, hdr, bio);
out_free_cdb:
if (rq->cmd != rq->__cmd)
kfree(rq->cmd);
out_put_request:
blk_put_request(rq);
return ret;
}
/**
* sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
* @file: file this ioctl operates on (optional)
* @q: request queue to send scsi commands down
* @disk: gendisk to operate on (optional)
* @sic: userspace structure describing the command to perform
*
* Send down the scsi command described by @sic to the device below
* the request queue @q. If @file is non-NULL it's used to perform
* fine-grained permission checks that allow users to send down
* non-destructive SCSI commands. If the caller has a struct gendisk
* available it should be passed in as @disk to allow the low level
* driver to use the information contained in it. A non-NULL @disk
* is only allowed if the caller knows that the low level driver doesn't
* need it (e.g. in the scsi subsystem).
*
* Notes:
* - This interface is deprecated - users should use the SG_IO
* interface instead, as this is a more flexible approach to
* performing SCSI commands on a device.
* - The SCSI command length is determined by examining the 1st byte
* of the given command. There is no way to override this.
* - Data transfers are limited to PAGE_SIZE
* - The length (x + y) must be at least OMAX_SB_LEN bytes long to
* accommodate the sense buffer when an error occurs.
* The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
* old code will not be surprised.
* - If a Unix error occurs (e.g. ENOMEM) then the user will receive
* a negative return and the Unix error code in 'errno'.
* If the SCSI command succeeds then 0 is returned.
* Positive numbers returned are the compacted SCSI error codes (4
* bytes in one int) where the lowest byte is the SCSI status.
*/
#define OMAX_SB_LEN 16 /* For backward compatibility */
int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
struct scsi_ioctl_command __user *sic)
{
struct request *rq;
int err;
unsigned int in_len, out_len, bytes, opcode, cmdlen;
char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
if (!sic)
return -EINVAL;
/*
* get the in and out lengths, and verify they don't exceed a page worth of data
*/
if (get_user(in_len, &sic->inlen))
return -EFAULT;
if (get_user(out_len, &sic->outlen))
return -EFAULT;
if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
return -EINVAL;
if (get_user(opcode, sic->data))
return -EFAULT;
bytes = max(in_len, out_len);
if (bytes) {
buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
if (!buffer)
return -ENOMEM;
}
rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto error_free_buffer;
}
blk_rq_set_block_pc(rq);
cmdlen = COMMAND_SIZE(opcode);
/*
* get command and data to send to device, if any
*/
err = -EFAULT;
rq->cmd_len = cmdlen;
if (copy_from_user(rq->cmd, sic->data, cmdlen))
goto error;
if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
goto error;
err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
if (err)
goto error;
/* default; possibly overridden later */
rq->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
rq->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
break;
case MOVE_MEDIUM:
rq->timeout = MOVE_MEDIUM_TIMEOUT;
break;
case READ_ELEMENT_STATUS:
rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
rq->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
break;
}
if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
err = DRIVER_ERROR << 24;
goto error;
}
memset(sense, 0, sizeof(sense));
rq->sense = sense;
rq->sense_len = 0;
blk_execute_rq(q, disk, rq, 0);
err = rq->errors & 0xff; /* only 8 bit SCSI status */
if (err) {
if (rq->sense_len && rq->sense) {
bytes = (OMAX_SB_LEN > rq->sense_len) ?
rq->sense_len : OMAX_SB_LEN;
if (copy_to_user(sic->data, rq->sense, bytes))
err = -EFAULT;
}
} else {
if (copy_to_user(sic->data, buffer, out_len))
err = -EFAULT;
}
error:
blk_put_request(rq);
error_free_buffer:
kfree(buffer);
return err;
}
EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
/* Send basic block requests */
static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
int cmd, int data)
{
struct request *rq;
int err;
rq = blk_get_request(q, WRITE, __GFP_WAIT);
if (IS_ERR(rq))
return PTR_ERR(rq);
blk_rq_set_block_pc(rq);
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
rq->cmd[0] = cmd;
rq->cmd[4] = data;
rq->cmd_len = 6;
err = blk_execute_rq(q, bd_disk, rq, 0);
blk_put_request(rq);
return err;
}
static inline int blk_send_start_stop(struct request_queue *q,
struct gendisk *bd_disk, int data)
{
return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
}
int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
unsigned int cmd, void __user *arg)
{
int err;
if (!q)
return -ENXIO;
switch (cmd) {
/*
* new sgv3 interface
*/
case SG_GET_VERSION_NUM:
err = sg_get_version(arg);
break;
case SCSI_IOCTL_GET_IDLUN:
err = scsi_get_idlun(q, arg);
break;
case SCSI_IOCTL_GET_BUS_NUMBER:
err = scsi_get_bus(q, arg);
break;
case SG_SET_TIMEOUT:
err = sg_set_timeout(q, arg);
break;
case SG_GET_TIMEOUT:
err = sg_get_timeout(q);
break;
case SG_GET_RESERVED_SIZE:
err = sg_get_reserved_size(q, arg);
break;
case SG_SET_RESERVED_SIZE:
err = sg_set_reserved_size(q, arg);
break;
case SG_EMULATED_HOST:
err = sg_emulated_host(q, arg);
break;
case SG_IO: {
struct sg_io_hdr hdr;
err = -EFAULT;
if (copy_from_user(&hdr, arg, sizeof(hdr)))
break;
err = sg_io(q, bd_disk, &hdr, mode);
if (err == -EFAULT)
break;
if (copy_to_user(arg, &hdr, sizeof(hdr)))
err = -EFAULT;
break;
}
case CDROM_SEND_PACKET: {
struct cdrom_generic_command cgc;
struct sg_io_hdr hdr;
err = -EFAULT;
if (copy_from_user(&cgc, arg, sizeof(cgc)))
break;
cgc.timeout = clock_t_to_jiffies(cgc.timeout);
memset(&hdr, 0, sizeof(hdr));
hdr.interface_id = 'S';
hdr.cmd_len = sizeof(cgc.cmd);
hdr.dxfer_len = cgc.buflen;
err = 0;
switch (cgc.data_direction) {
case CGC_DATA_UNKNOWN:
hdr.dxfer_direction = SG_DXFER_UNKNOWN;
break;
case CGC_DATA_WRITE:
hdr.dxfer_direction = SG_DXFER_TO_DEV;
break;
case CGC_DATA_READ:
hdr.dxfer_direction = SG_DXFER_FROM_DEV;
break;
case CGC_DATA_NONE:
hdr.dxfer_direction = SG_DXFER_NONE;
break;
default:
err = -EINVAL;
}
if (err)
break;
hdr.dxferp = cgc.buffer;
hdr.sbp = cgc.sense;
if (hdr.sbp)
hdr.mx_sb_len = sizeof(struct request_sense);
hdr.timeout = jiffies_to_msecs(cgc.timeout);
hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
hdr.cmd_len = sizeof(cgc.cmd);
err = sg_io(q, bd_disk, &hdr, mode);
if (err == -EFAULT)
break;
if (hdr.status)
err = -EIO;
cgc.stat = err;
cgc.buflen = hdr.resid;
if (copy_to_user(arg, &cgc, sizeof(cgc)))
err = -EFAULT;
break;
}
/*
* old junk scsi send command ioctl
*/
case SCSI_IOCTL_SEND_COMMAND:
printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
err = -EINVAL;
if (!arg)
break;
err = sg_scsi_ioctl(q, bd_disk, mode, arg);
break;
case CDROMCLOSETRAY:
err = blk_send_start_stop(q, bd_disk, 0x03);
break;
case CDROMEJECT:
err = blk_send_start_stop(q, bd_disk, 0x02);
break;
default:
err = -ENOTTY;
}
return err;
}
EXPORT_SYMBOL(scsi_cmd_ioctl);
int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
{
if (bd && bd == bd->bd_contains)
return 0;
/* Actually none of these is particularly useful on a partition,
* but they are safe.
*/
switch (cmd) {
case SCSI_IOCTL_GET_IDLUN:
case SCSI_IOCTL_GET_BUS_NUMBER:
case SCSI_IOCTL_GET_PCI:
case SCSI_IOCTL_PROBE_HOST:
case SG_GET_VERSION_NUM:
case SG_SET_TIMEOUT:
case SG_GET_TIMEOUT:
case SG_GET_RESERVED_SIZE:
case SG_SET_RESERVED_SIZE:
case SG_EMULATED_HOST:
case SCSI_IOCTL_SECURITY_PROTOCOL_IN:
case SCSI_IOCTL_SECURITY_PROTOCOL_OUT:
return 0;
case CDROM_GET_CAPABILITY:
/* Keep this until we remove the printk below. udev sends it
* and we do not want to spam dmesg about it. CD-ROMs do
* not have partitions, so we get here only for disks.
*/
return -ENOIOCTLCMD;
default:
break;
}
if (capable(CAP_SYS_RAWIO))
return 0;
/* In particular, rule out all resets and host-specific ioctls. */
printk_ratelimited(KERN_WARNING
"%s: sending ioctl %x to a partition!\n", current->comm, cmd);
return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(scsi_verify_blk_ioctl);
int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
unsigned int cmd, void __user *arg)
{
int ret;
ret = scsi_verify_blk_ioctl(bd, cmd);
if (ret < 0)
return ret;
return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
}
EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
static int __init blk_scsi_ioctl_init(void)
{
blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
return 0;
}
fs_initcall(blk_scsi_ioctl_init);
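As the sg_scsi_ioctl() documentation above notes, new code should use the SG_IO interface rather than SCSI_IOCTL_SEND_COMMAND. The hedged user-space sketch below issues a standard 6-byte INQUIRY through SG_IO; the device path, transfer length and timeout are illustrative assumptions, and the sg_io_hdr fields mirror the ones sg_io() and the CDROM_SEND_PACKET path fill in above.

/* User-space SG_IO sketch: send a 6-byte INQUIRY and print status/vendor. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(int argc, char **argv)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96 bytes */
	unsigned char resp[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open(argc > 1 ? argv[1] : "/dev/sg0", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* checked by sg_io() above */
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxfer_len = sizeof(resp);
	hdr.dxferp = resp;
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0) {
		perror("SG_IO");
		return 1;
	}
	/* Vendor identification is bytes 8..15 of the standard INQUIRY data */
	printf("status 0x%x, vendor '%.8s'\n", hdr.status, (char *)resp + 8);
	close(fd);
	return 0;
}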

197
block/t10-pi.c Normal file
View file

@ -0,0 +1,197 @@
/*
* t10_pi.c - Functions for generating and verifying T10 Protection
* Information.
*
* Copyright (C) 2007, 2008, 2014 Oracle Corporation
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*/
#include <linux/t10-pi.h>
#include <linux/blkdev.h>
#include <linux/crc-t10dif.h>
#include <net/checksum.h>
typedef __be16 (csum_fn) (void *, unsigned int);
static const __be16 APP_ESCAPE = (__force __be16) 0xffff;
static const __be32 REF_ESCAPE = (__force __be32) 0xffffffff;
static __be16 t10_pi_crc_fn(void *data, unsigned int len)
{
return cpu_to_be16(crc_t10dif(data, len));
}
static __be16 t10_pi_ip_fn(void *data, unsigned int len)
{
return (__force __be16)ip_compute_csum(data, len);
}
/*
* Type 1 and Type 2 protection use the same format: 16 bit guard tag,
* 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
* tag.
*/
static int t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn,
unsigned int type)
{
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
pi->guard_tag = fn(iter->data_buf, iter->interval);
pi->app_tag = 0;
if (type == 1)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
iter->data_buf += iter->interval;
iter->prot_buf += sizeof(struct t10_pi_tuple);
iter->seed++;
}
return 0;
}
static int t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn,
unsigned int type)
{
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
__be16 csum;
switch (type) {
case 1:
case 2:
if (pi->app_tag == APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
lower_32_bits(iter->seed)) {
pr_err("%s: ref tag error at location %llu " \
"(rcvd %u)\n", iter->disk_name,
(unsigned long long)
iter->seed, be32_to_cpu(pi->ref_tag));
return -EILSEQ;
}
break;
case 3:
if (pi->app_tag == APP_ESCAPE &&
pi->ref_tag == REF_ESCAPE)
goto next;
break;
}
csum = fn(iter->data_buf, iter->interval);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %04x, want %04x)\n", iter->disk_name,
(unsigned long long)iter->seed,
be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
return -EILSEQ;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += sizeof(struct t10_pi_tuple);
iter->seed++;
}
return 0;
}
static int t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_crc_fn, 1);
}
static int t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_ip_fn, 1);
}
static int t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_crc_fn, 1);
}
static int t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_ip_fn, 1);
}
static int t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_crc_fn, 3);
}
static int t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
{
return t10_pi_generate(iter, t10_pi_ip_fn, 3);
}
static int t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_crc_fn, 3);
}
static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
{
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
}
struct blk_integrity t10_pi_type1_crc = {
.name = "T10-DIF-TYPE1-CRC",
.generate_fn = t10_pi_type1_generate_crc,
.verify_fn = t10_pi_type1_verify_crc,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type1_crc);
struct blk_integrity t10_pi_type1_ip = {
.name = "T10-DIF-TYPE1-IP",
.generate_fn = t10_pi_type1_generate_ip,
.verify_fn = t10_pi_type1_verify_ip,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type1_ip);
struct blk_integrity t10_pi_type3_crc = {
.name = "T10-DIF-TYPE3-CRC",
.generate_fn = t10_pi_type3_generate_crc,
.verify_fn = t10_pi_type3_verify_crc,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type3_crc);
struct blk_integrity t10_pi_type3_ip = {
.name = "T10-DIF-TYPE3-IP",
.generate_fn = t10_pi_type3_generate_ip,
.verify_fn = t10_pi_type3_verify_ip,
.tuple_size = sizeof(struct t10_pi_tuple),
.tag_size = 0,
};
EXPORT_SYMBOL(t10_pi_type3_ip);
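For illustration only, the sketch below reproduces in user space what t10_pi_generate() emits for Type 1 protection: one 8-byte tuple per data interval with a big-endian CRC16 guard tag (polynomial 0x8BB7), a zero application tag, and the low 32 bits of the seed as reference tag. The bitwise CRC is an assumed stand-in for the kernel's crc_t10dif(), and the struct and helper names are hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <endian.h>

struct pi_tuple {		/* mirrors struct t10_pi_tuple: 8 bytes per interval */
	uint16_t guard_tag;	/* big-endian CRC of the interval */
	uint16_t app_tag;	/* left at zero by t10_pi_generate() */
	uint32_t ref_tag;	/* Type 1: low 32 bits of the seed (LBA) */
} __attribute__((packed));

/* Bitwise CRC-16/T10-DIF: poly 0x8BB7, init 0, no reflection
 * (assumed equivalent of the kernel's crc_t10dif()). */
static uint16_t crc16_t10dif(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;

	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)(crc << 1) ^ 0x8bb7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* Fill one Type 1 tuple for a single data interval, as the generate loop does. */
static void pi_type1_generate(struct pi_tuple *pi, const uint8_t *data,
			      size_t interval, uint64_t seed)
{
	pi->guard_tag = htobe16(crc16_t10dif(data, interval));
	pi->app_tag = 0;
	pi->ref_tag = htobe32((uint32_t)seed);	/* lower_32_bits(seed) */
}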