Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

View file

@ -0,0 +1,6 @@
#
# Makefile for the pNFS block layout driver kernel module
#
obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o

View file

@ -0,0 +1,925 @@
/*
* linux/fs/nfs/blocklayout/blocklayout.c
*
* Module for the NFSv4.1 pNFS block layout driver.
*
* Copyright (c) 2006 The Regents of the University of Michigan.
* All rights reserved.
*
* Andy Adamson <andros@citi.umich.edu>
* Fred Isaman <iisaman@umich.edu>
*
* permission is granted to use, copy, create derivative works and
* redistribute this software and such derivative works for any purpose,
* so long as the name of the university of michigan is not used in
* any advertising or publicity pertaining to the use or distribution
* of this software without specific, written prior authorization. if
* the above copyright notice or any other identification of the
* university of michigan is included in any copy of any portion of
* this software, then the disclaimer below must also be included.
*
* this software is provided as is, without representation from the
* university of michigan as to its fitness for any purpose, and without
* warranty by the university of michigan of any kind, either express
* or implied, including without limitation the implied warranties of
* merchantability and fitness for a particular purpose. the regents
* of the university of michigan shall not be liable for any damages,
* including special, indirect, incidental, or consequential damages,
* with respect to any claim arising out or in connection with the use
* of the software, even if it has been or is hereafter advised of the
* possibility of such damages.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h> /* struct bio */
#include <linux/prefetch.h>
#include <linux/pagevec.h>
#include "../pnfs.h"
#include "../nfs4session.h"
#include "../internal.h"
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
static bool is_hole(struct pnfs_block_extent *be)
{
switch (be->be_state) {
case PNFS_BLOCK_NONE_DATA:
return true;
case PNFS_BLOCK_INVALID_DATA:
return be->be_tag ? false : true;
default:
return false;
}
}
/* The data we are handed might be spread across several bios. We need
* to track when the last one is finished.
*/
struct parallel_io {
struct kref refcnt;
void (*pnfs_callback) (void *data);
void *data;
};
static inline struct parallel_io *alloc_parallel(void *data)
{
struct parallel_io *rv;
rv = kmalloc(sizeof(*rv), GFP_NOFS);
if (rv) {
rv->data = data;
kref_init(&rv->refcnt);
}
return rv;
}
static inline void get_parallel(struct parallel_io *p)
{
kref_get(&p->refcnt);
}
static void destroy_parallel(struct kref *kref)
{
struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
dprintk("%s enter\n", __func__);
p->pnfs_callback(p->data);
kfree(p);
}
static inline void put_parallel(struct parallel_io *p)
{
kref_put(&p->refcnt, destroy_parallel);
}
static struct bio *
bl_submit_bio(int rw, struct bio *bio)
{
if (bio) {
get_parallel(bio->bi_private);
dprintk("%s submitting %s bio %u@%llu\n", __func__,
rw == READ ? "read" : "write", bio->bi_iter.bi_size,
(unsigned long long)bio->bi_iter.bi_sector);
submit_bio(rw, bio);
}
return NULL;
}
static struct bio *
bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
void (*end_io)(struct bio *, int err), struct parallel_io *par)
{
struct bio *bio;
npg = min(npg, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOIO, npg);
if (!bio && (current->flags & PF_MEMALLOC)) {
while (!bio && (npg /= 2))
bio = bio_alloc(GFP_NOIO, npg);
}
if (bio) {
bio->bi_iter.bi_sector = disk_sector;
bio->bi_bdev = bdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
}
return bio;
}
static struct bio *
do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
struct page *page, struct pnfs_block_dev_map *map,
struct pnfs_block_extent *be,
void (*end_io)(struct bio *, int err),
struct parallel_io *par, unsigned int offset, int *len)
{
struct pnfs_block_dev *dev =
container_of(be->be_device, struct pnfs_block_dev, node);
u64 disk_addr, end;
dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
npg, rw, (unsigned long long)isect, offset, *len);
/* translate to device offset */
isect += be->be_v_offset;
isect -= be->be_f_offset;
/* translate to physical disk offset */
disk_addr = (u64)isect << SECTOR_SHIFT;
if (disk_addr < map->start || disk_addr >= map->start + map->len) {
if (!dev->map(dev, disk_addr, map))
return ERR_PTR(-EIO);
bio = bl_submit_bio(rw, bio);
}
disk_addr += map->disk_offset;
disk_addr -= map->start;
/* limit length to what the device mapping allows */
end = disk_addr + *len;
if (end >= map->start + map->len)
*len = map->start + map->len - disk_addr;
retry:
if (!bio) {
bio = bl_alloc_init_bio(npg, map->bdev,
disk_addr >> SECTOR_SHIFT, end_io, par);
if (!bio)
return ERR_PTR(-ENOMEM);
}
if (bio_add_page(bio, page, *len, offset) < *len) {
bio = bl_submit_bio(rw, bio);
goto retry;
}
return bio;
}
static void bl_end_io_read(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
if (err) {
struct nfs_pgio_header *header = par->data;
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
}
bio_put(bio);
put_parallel(par);
}
static void bl_read_cleanup(struct work_struct *work)
{
struct rpc_task *task;
struct nfs_pgio_header *hdr;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
hdr = container_of(task, struct nfs_pgio_header, task);
pnfs_ld_read_done(hdr);
}
static void
bl_end_par_io_read(void *data)
{
struct nfs_pgio_header *hdr = data;
hdr->task.tk_status = hdr->pnfs_error;
INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
bl_read_pagelist(struct nfs_pgio_header *header)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
unsigned int pg_offset, pg_len;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
struct blk_plug plug;
int i;
dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
header->page_array.npages, f_offset,
(unsigned int)header->args.count);
par = alloc_parallel(header);
if (!par)
return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_read;
blk_start_plug(&plug);
isect = (sector_t) (f_offset >> SECTOR_SHIFT);
/* Code assumes extents are page-aligned */
for (i = pg_index; i < header->page_array.npages; i++) {
if (extent_length <= 0) {
/* We've used up the previous extent */
bio = bl_submit_bio(READ, bio);
/* Get the next one */
if (!ext_tree_lookup(bl, isect, &be, false)) {
header->pnfs_error = -EIO;
goto out;
}
extent_length = be.be_length - (isect - be.be_f_offset);
}
pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
else
pg_len = bytes_left;
} else {
BUG_ON(pg_offset != 0);
pg_len = PAGE_CACHE_SIZE;
}
isect += (pg_offset >> SECTOR_SHIFT);
extent_length -= (pg_offset >> SECTOR_SHIFT);
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
dprintk("%s Zeroing page for hole\n", __func__);
zero_user_segment(pages[i], pg_offset, pg_len);
/* invalidate map */
map.start = NFS4_MAX_UINT64;
} else {
bio = do_add_page_to_bio(bio,
header->page_array.npages - i,
READ,
isect, pages[i], &map, &be,
bl_end_io_read, par,
pg_offset, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
}
isect += (pg_len >> SECTOR_SHIFT);
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
header->res.count = header->inode->i_size - header->args.offset;
} else {
header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
}
out:
bl_submit_bio(READ, bio);
blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
}
static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct nfs_pgio_header *header = par->data;
if (!uptodate) {
if (!header->pnfs_error)
header->pnfs_error = -EIO;
pnfs_set_lo_fail(header->lseg);
}
bio_put(bio);
put_parallel(par);
}
/* Function scheduled for call during bl_end_par_io_write,
* it marks sectors as written and extends the commitlist.
*/
static void bl_write_cleanup(struct work_struct *work)
{
struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
struct nfs_pgio_header *hdr =
container_of(task, struct nfs_pgio_header, task);
dprintk("%s enter\n", __func__);
if (likely(!hdr->pnfs_error)) {
struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
u64 end = (hdr->args.offset + hdr->args.count +
PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
(end - start) >> SECTOR_SHIFT);
}
pnfs_ld_write_done(hdr);
}
/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data)
{
struct nfs_pgio_header *hdr = data;
hdr->task.tk_status = hdr->pnfs_error;
hdr->verf.committed = NFS_FILE_SYNC;
INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
schedule_work(&hdr->task.u.tk_work);
}
static enum pnfs_try_status
bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
struct bio *bio = NULL;
struct pnfs_block_extent be;
sector_t isect, extent_length = 0;
struct parallel_io *par = NULL;
loff_t offset = header->args.offset;
size_t count = header->args.count;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
/* At this point, header->page_aray is a (sequential) list of nfs_pages.
* We want to write each, and if there is an error set pnfs_error
* to have it redone using nfs.
*/
par = alloc_parallel(header);
if (!par)
return PNFS_NOT_ATTEMPTED;
par->pnfs_callback = bl_end_par_io_write;
blk_start_plug(&plug);
/* we always write out the whole page */
offset = offset & (loff_t)PAGE_CACHE_MASK;
isect = offset >> SECTOR_SHIFT;
for (i = pg_index; i < header->page_array.npages; i++) {
if (extent_length <= 0) {
/* We've used up the previous extent */
bio = bl_submit_bio(WRITE, bio);
/* Get the next one */
if (!ext_tree_lookup(bl, isect, &be, true)) {
header->pnfs_error = -EINVAL;
goto out;
}
extent_length = be.be_length - (isect - be.be_f_offset);
}
pg_len = PAGE_CACHE_SIZE;
bio = do_add_page_to_bio(bio, header->page_array.npages - i,
WRITE, isect, pages[i], &map, &be,
bl_end_io_write, par,
0, &pg_len);
if (IS_ERR(bio)) {
header->pnfs_error = PTR_ERR(bio);
bio = NULL;
goto out;
}
offset += pg_len;
count -= pg_len;
isect += (pg_len >> SECTOR_SHIFT);
extent_length -= (pg_len >> SECTOR_SHIFT);
}
header->res.count = header->args.count;
out:
bl_submit_bio(WRITE, bio);
blk_finish_plug(&plug);
put_parallel(par);
return PNFS_ATTEMPTED;
}
static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
int err;
dprintk("%s enter\n", __func__);
err = ext_tree_remove(bl, true, 0, LLONG_MAX);
WARN_ON(err);
kfree(bl);
}
static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
gfp_t gfp_flags)
{
struct pnfs_block_layout *bl;
dprintk("%s enter\n", __func__);
bl = kzalloc(sizeof(*bl), gfp_flags);
if (!bl)
return NULL;
bl->bl_ext_rw = RB_ROOT;
bl->bl_ext_ro = RB_ROOT;
spin_lock_init(&bl->bl_ext_lock);
return &bl->bl_layout;
}
static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
dprintk("%s enter\n", __func__);
kfree(lseg);
}
/* Tracks info needed to ensure extents in layout obey constraints of spec */
struct layout_verification {
u32 mode; /* R or RW */
u64 start; /* Expected start of next non-COW extent */
u64 inval; /* Start of INVAL coverage */
u64 cowread; /* End of COW read coverage */
};
/* Verify the extent meets the layout requirements of the pnfs-block draft,
* section 2.3.1.
*/
static int verify_extent(struct pnfs_block_extent *be,
struct layout_verification *lv)
{
if (lv->mode == IOMODE_READ) {
if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
be->be_state == PNFS_BLOCK_INVALID_DATA)
return -EIO;
if (be->be_f_offset != lv->start)
return -EIO;
lv->start += be->be_length;
return 0;
}
/* lv->mode == IOMODE_RW */
if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
if (be->be_f_offset != lv->start)
return -EIO;
if (lv->cowread > lv->start)
return -EIO;
lv->start += be->be_length;
lv->inval = lv->start;
return 0;
} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
if (be->be_f_offset != lv->start)
return -EIO;
lv->start += be->be_length;
return 0;
} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
if (be->be_f_offset > lv->start)
return -EIO;
if (be->be_f_offset < lv->inval)
return -EIO;
if (be->be_f_offset < lv->cowread)
return -EIO;
/* It looks like you might want to min this with lv->start,
* but you really don't.
*/
lv->inval = lv->inval + be->be_length;
lv->cowread = be->be_f_offset + be->be_length;
return 0;
} else
return -EIO;
}
static int decode_sector_number(__be32 **rp, sector_t *sp)
{
uint64_t s;
*rp = xdr_decode_hyper(*rp, &s);
if (s & 0x1ff) {
printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
return -1;
}
*sp = s >> SECTOR_SHIFT;
return 0;
}
static int
bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
struct layout_verification *lv, struct list_head *extents,
gfp_t gfp_mask)
{
struct pnfs_block_extent *be;
struct nfs4_deviceid id;
int error;
__be32 *p;
p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
if (!p)
return -EIO;
be = kzalloc(sizeof(*be), GFP_NOFS);
if (!be)
return -ENOMEM;
memcpy(&id, p, NFS4_DEVICEID4_SIZE);
p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
error = -EIO;
be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
lo->plh_lc_cred, gfp_mask);
if (!be->be_device)
goto out_free_be;
/*
* The next three values are read in as bytes, but stored in the
* extent structure in 512-byte granularity.
*/
if (decode_sector_number(&p, &be->be_f_offset) < 0)
goto out_put_deviceid;
if (decode_sector_number(&p, &be->be_length) < 0)
goto out_put_deviceid;
if (decode_sector_number(&p, &be->be_v_offset) < 0)
goto out_put_deviceid;
be->be_state = be32_to_cpup(p++);
error = verify_extent(be, lv);
if (error) {
dprintk("%s: extent verification failed\n", __func__);
goto out_put_deviceid;
}
list_add_tail(&be->be_list, extents);
return 0;
out_put_deviceid:
nfs4_put_deviceid_node(be->be_device);
out_free_be:
kfree(be);
return error;
}
static struct pnfs_layout_segment *
bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
gfp_t gfp_mask)
{
struct layout_verification lv = {
.mode = lgr->range.iomode,
.start = lgr->range.offset >> SECTOR_SHIFT,
.inval = lgr->range.offset >> SECTOR_SHIFT,
.cowread = lgr->range.offset >> SECTOR_SHIFT,
};
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
struct pnfs_layout_segment *lseg;
struct xdr_buf buf;
struct xdr_stream xdr;
struct page *scratch;
int status, i;
uint32_t count;
__be32 *p;
LIST_HEAD(extents);
dprintk("---> %s\n", __func__);
lseg = kzalloc(sizeof(*lseg), gfp_mask);
if (!lseg)
return ERR_PTR(-ENOMEM);
status = -ENOMEM;
scratch = alloc_page(gfp_mask);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf,
lgr->layoutp->pages, lgr->layoutp->len);
xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
status = -EIO;
p = xdr_inline_decode(&xdr, 4);
if (unlikely(!p))
goto out_free_scratch;
count = be32_to_cpup(p++);
dprintk("%s: number of extents %d\n", __func__, count);
/*
* Decode individual extents, putting them in temporary staging area
* until whole layout is decoded to make error recovery easier.
*/
for (i = 0; i < count; i++) {
status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
if (status)
goto process_extents;
}
if (lgr->range.offset + lgr->range.length !=
lv.start << SECTOR_SHIFT) {
dprintk("%s Final length mismatch\n", __func__);
status = -EIO;
goto process_extents;
}
if (lv.start < lv.cowread) {
dprintk("%s Final uncovered COW extent\n", __func__);
status = -EIO;
}
process_extents:
while (!list_empty(&extents)) {
struct pnfs_block_extent *be =
list_first_entry(&extents, struct pnfs_block_extent,
be_list);
list_del(&be->be_list);
if (!status)
status = ext_tree_insert(bl, be);
if (status) {
nfs4_put_deviceid_node(be->be_device);
kfree(be);
}
}
out_free_scratch:
__free_page(scratch);
out:
dprintk("%s returns %d\n", __func__, status);
if (status) {
kfree(lseg);
return ERR_PTR(status);
}
return lseg;
}
static void
bl_return_range(struct pnfs_layout_hdr *lo,
struct pnfs_layout_range *range)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
sector_t offset = range->offset >> SECTOR_SHIFT, end;
if (range->offset % 8) {
dprintk("%s: offset %lld not block size aligned\n",
__func__, range->offset);
return;
}
if (range->length != NFS4_MAX_UINT64) {
if (range->length % 8) {
dprintk("%s: length %lld not block size aligned\n",
__func__, range->length);
return;
}
end = offset + (range->length >> SECTOR_SHIFT);
} else {
end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
}
ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
}
static int
bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
{
return ext_tree_prepare_commit(arg);
}
static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
}
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
dprintk("%s enter\n", __func__);
if (server->pnfs_blksize == 0) {
dprintk("%s Server did not return blksize\n", __func__);
return -EINVAL;
}
if (server->pnfs_blksize > PAGE_SIZE) {
printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
__func__, server->pnfs_blksize);
return -EINVAL;
}
return 0;
}
static bool
is_aligned_req(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, unsigned int alignment)
{
/*
* Always accept buffered writes, higher layers take care of the
* right alignment.
*/
if (pgio->pg_dreq == NULL)
return true;
if (!IS_ALIGNED(req->wb_offset, alignment))
return false;
if (IS_ALIGNED(req->wb_bytes, alignment))
return true;
if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
/*
* If the write goes up to the inode size, just write
* the full page. Data past the inode size is
* guaranteed to be zeroed by the higher level client
* code, and this behaviour is mandated by RFC 5663
* section 2.3.2.
*/
return true;
}
return false;
}
static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
nfs_pageio_reset_read_mds(pgio);
return;
}
pnfs_generic_pg_init_read(pgio, req);
}
/*
* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
* of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (!is_aligned_req(pgio, req, SECTOR_SIZE))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
/*
* Return the number of contiguous bytes for a given inode
* starting at page frame idx.
*/
static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t end;
/* Optimize common case that writes from 0 to end of file */
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (end != NFS_I(inode)->npages) {
rcu_read_lock();
end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
if (!end)
return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
else
return (end - idx) << PAGE_CACHE_SHIFT;
}
static void
bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 wb_size;
if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
nfs_pageio_reset_write_mds(pgio);
return;
}
if (pgio->pg_dreq == NULL)
wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
req->wb_index);
else
wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
pnfs_generic_pg_init_write(pgio, req, wb_size);
}
/*
* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
* of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (!is_aligned_req(pgio, req, PAGE_SIZE))
return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
static const struct nfs_pageio_ops bl_pg_read_ops = {
.pg_init = bl_pg_init_read,
.pg_test = bl_pg_test_read,
.pg_doio = pnfs_generic_pg_readpages,
};
static const struct nfs_pageio_ops bl_pg_write_ops = {
.pg_init = bl_pg_init_write,
.pg_test = bl_pg_test_write,
.pg_doio = pnfs_generic_pg_writepages,
};
static struct pnfs_layoutdriver_type blocklayout_type = {
.id = LAYOUT_BLOCK_VOLUME,
.name = "LAYOUT_BLOCK_VOLUME",
.owner = THIS_MODULE,
.flags = PNFS_LAYOUTRET_ON_SETATTR |
PNFS_READ_WHOLE_PAGE,
.read_pagelist = bl_read_pagelist,
.write_pagelist = bl_write_pagelist,
.alloc_layout_hdr = bl_alloc_layout_hdr,
.free_layout_hdr = bl_free_layout_hdr,
.alloc_lseg = bl_alloc_lseg,
.free_lseg = bl_free_lseg,
.return_range = bl_return_range,
.prepare_layoutcommit = bl_prepare_layoutcommit,
.cleanup_layoutcommit = bl_cleanup_layoutcommit,
.set_layoutdriver = bl_set_layoutdriver,
.alloc_deviceid_node = bl_alloc_deviceid_node,
.free_deviceid_node = bl_free_deviceid_node,
.pg_read_ops = &bl_pg_read_ops,
.pg_write_ops = &bl_pg_write_ops,
};
static int __init nfs4blocklayout_init(void)
{
int ret;
dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);
ret = pnfs_register_layoutdriver(&blocklayout_type);
if (ret)
goto out;
ret = bl_init_pipefs();
if (ret)
goto out_unregister;
return 0;
out_unregister:
pnfs_unregister_layoutdriver(&blocklayout_type);
out:
return ret;
}
static void __exit nfs4blocklayout_exit(void)
{
dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
__func__);
bl_cleanup_pipefs();
pnfs_unregister_layoutdriver(&blocklayout_type);
}
MODULE_ALIAS("nfs-layouttype4-3");
module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);

View file

@ -0,0 +1,204 @@
/*
* linux/fs/nfs/blocklayout/blocklayout.h
*
* Module for the NFSv4.1 pNFS block layout driver.
*
* Copyright (c) 2006 The Regents of the University of Michigan.
* All rights reserved.
*
* Andy Adamson <andros@citi.umich.edu>
* Fred Isaman <iisaman@umich.edu>
*
* permission is granted to use, copy, create derivative works and
* redistribute this software and such derivative works for any purpose,
* so long as the name of the university of michigan is not used in
* any advertising or publicity pertaining to the use or distribution
* of this software without specific, written prior authorization. if
* the above copyright notice or any other identification of the
* university of michigan is included in any copy of any portion of
* this software, then the disclaimer below must also be included.
*
* this software is provided as is, without representation from the
* university of michigan as to its fitness for any purpose, and without
* warranty by the university of michigan of any kind, either express
* or implied, including without limitation the implied warranties of
* merchantability and fitness for a particular purpose. the regents
* of the university of michigan shall not be liable for any damages,
* including special, indirect, incidental, or consequential damages,
* with respect to any claim arising out or in connection with the use
* of the software, even if it has been or is hereafter advised of the
* possibility of such damages.
*/
#ifndef FS_NFS_NFS4BLOCKLAYOUT_H
#define FS_NFS_NFS4BLOCKLAYOUT_H
#include <linux/device-mapper.h>
#include <linux/nfs_fs.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include "../nfs4_fs.h"
#include "../pnfs.h"
#include "../netns.h"
#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
#define SECTOR_SIZE (1 << SECTOR_SHIFT)
struct pnfs_block_dev;
enum pnfs_block_volume_type {
PNFS_BLOCK_VOLUME_SIMPLE = 0,
PNFS_BLOCK_VOLUME_SLICE = 1,
PNFS_BLOCK_VOLUME_CONCAT = 2,
PNFS_BLOCK_VOLUME_STRIPE = 3,
};
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
* Not actually limited by the protocol.
*/
#define PNFS_BLOCK_UUID_LEN 128
struct pnfs_block_volume {
enum pnfs_block_volume_type type;
union {
struct {
int len;
int nr_sigs;
struct {
u64 offset;
u32 sig_len;
u8 sig[PNFS_BLOCK_UUID_LEN];
} sigs[PNFS_BLOCK_MAX_UUIDS];
} simple;
struct {
u64 start;
u64 len;
u32 volume;
} slice;
struct {
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} concat;
struct {
u64 chunk_size;
u32 volumes_count;
u32 volumes[PNFS_BLOCK_MAX_DEVICES];
} stripe;
};
};
struct pnfs_block_dev_map {
sector_t start;
sector_t len;
sector_t disk_offset;
struct block_device *bdev;
};
struct pnfs_block_dev {
struct nfs4_deviceid_node node;
u64 start;
u64 len;
u32 nr_children;
struct pnfs_block_dev *children;
u64 chunk_size;
struct block_device *bdev;
u64 disk_offset;
bool (*map)(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map);
};
enum exstate4 {
PNFS_BLOCK_READWRITE_DATA = 0,
PNFS_BLOCK_READ_DATA = 1,
PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
};
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
struct rb_node be_node;
struct list_head be_list;
};
struct nfs4_deviceid_node *be_device;
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
enum exstate4 be_state; /* the state of this extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
/* on the wire size of the extent */
#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct rb_root bl_ext_rw;
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
};
static inline struct pnfs_block_layout *
BLK_LO2EXT(struct pnfs_layout_hdr *lo)
{
return container_of(lo, struct pnfs_block_layout, bl_layout);
}
static inline struct pnfs_block_layout *
BLK_LSEG2EXT(struct pnfs_layout_segment *lseg)
{
return BLK_LO2EXT(lseg->pls_layout);
}
struct bl_pipe_msg {
struct rpc_pipe_msg msg;
wait_queue_head_t *bl_wq;
};
struct bl_msg_hdr {
u8 type;
u16 totallen; /* length of entire message, including hdr itself */
};
#define BL_DEVICE_UMOUNT 0x0 /* Umount--delete devices */
#define BL_DEVICE_MOUNT 0x1 /* Mount--create devices*/
#define BL_DEVICE_REQUEST_INIT 0x0 /* Start request */
#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
/* dev.c */
struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_mask);
void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
/* extent_tree.c */
int ext_tree_insert(struct pnfs_block_layout *bl,
struct pnfs_block_extent *new);
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
sector_t end);
int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t len);
bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw);
int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
/* rpc_pipefs.c */
dev_t bl_resolve_deviceid(struct nfs_server *server,
struct pnfs_block_volume *b, gfp_t gfp_mask);
int __init bl_init_pipefs(void);
void __exit bl_cleanup_pipefs(void);
#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */

363
fs/nfs/blocklayout/dev.c Normal file
View file

@ -0,0 +1,363 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/sunrpc/svc.h>
#include <linux/blkdev.h>
#include <linux/nfs4.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_xdr.h>
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
static void
bl_free_device(struct pnfs_block_dev *dev)
{
if (dev->nr_children) {
int i;
for (i = 0; i < dev->nr_children; i++)
bl_free_device(&dev->children[i]);
kfree(dev->children);
} else {
if (dev->bdev)
blkdev_put(dev->bdev, FMODE_READ);
}
}
void
bl_free_deviceid_node(struct nfs4_deviceid_node *d)
{
struct pnfs_block_dev *dev =
container_of(d, struct pnfs_block_dev, node);
bl_free_device(dev);
kfree(dev);
}
static int
nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
{
__be32 *p;
int i;
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->type = be32_to_cpup(p++);
switch (b->type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++);
if (!b->simple.nr_sigs) {
dprintk("no signature\n");
return -EIO;
}
b->simple.len = 4 + 4;
for (i = 0; i < b->simple.nr_sigs; i++) {
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
return -EIO;
memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len);
b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
}
break;
case PNFS_BLOCK_VOLUME_SLICE:
p = xdr_inline_decode(xdr, 8 + 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->slice.start);
p = xdr_decode_hyper(p, &b->slice.len);
b->slice.volume = be32_to_cpup(p++);
break;
case PNFS_BLOCK_VOLUME_CONCAT:
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
b->concat.volumes_count = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p)
return -EIO;
for (i = 0; i < b->concat.volumes_count; i++)
b->concat.volumes[i] = be32_to_cpup(p++);
break;
case PNFS_BLOCK_VOLUME_STRIPE:
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++);
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p)
return -EIO;
for (i = 0; i < b->stripe.volumes_count; i++)
b->stripe.volumes[i] = be32_to_cpup(p++);
break;
default:
dprintk("unknown volume type!\n");
return -EIO;
}
return 0;
}
static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
map->start = dev->start;
map->len = dev->len;
map->disk_offset = dev->disk_offset;
map->bdev = dev->bdev;
return true;
}
static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
int i;
for (i = 0; i < dev->nr_children; i++) {
struct pnfs_block_dev *child = &dev->children[i];
if (child->start > offset ||
child->start + child->len <= offset)
continue;
child->map(child, offset - child->start, map);
return true;
}
dprintk("%s: ran off loop!\n", __func__);
return false;
}
static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev_map *map)
{
struct pnfs_block_dev *child;
u64 chunk;
u32 chunk_idx;
u64 disk_offset;
chunk = div_u64(offset, dev->chunk_size);
div_u64_rem(chunk, dev->nr_children, &chunk_idx);
if (chunk_idx > dev->nr_children) {
dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
__func__, chunk_idx, offset, dev->chunk_size);
/* error, should not happen */
return false;
}
/* truncate offset to the beginning of the stripe */
offset = chunk * dev->chunk_size;
/* disk offset of the stripe */
disk_offset = div_u64(offset, dev->nr_children);
child = &dev->children[chunk_idx];
child->map(child, disk_offset, map);
map->start += offset;
map->disk_offset += disk_offset;
map->len = dev->chunk_size;
return true;
}
static int
bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
static int
bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
return PTR_ERR(d->bdev);
}
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
printk(KERN_INFO "pNFS: using block device %s\n",
d->bdev->bd_disk->disk_name);
return 0;
}
static int
bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
int ret;
ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
if (ret)
return ret;
d->disk_offset = v->slice.start;
d->len = v->slice.len;
return 0;
}
static int
bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
u64 len = 0;
int ret, i;
d->children = kcalloc(v->concat.volumes_count,
sizeof(struct pnfs_block_dev), GFP_KERNEL);
if (!d->children)
return -ENOMEM;
for (i = 0; i < v->concat.volumes_count; i++) {
ret = bl_parse_deviceid(server, &d->children[i],
volumes, v->concat.volumes[i], gfp_mask);
if (ret)
return ret;
d->nr_children++;
d->children[i].start += len;
len += d->children[i].len;
}
d->len = len;
d->map = bl_map_concat;
return 0;
}
static int
bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
u64 len = 0;
int ret, i;
d->children = kcalloc(v->stripe.volumes_count,
sizeof(struct pnfs_block_dev), GFP_KERNEL);
if (!d->children)
return -ENOMEM;
for (i = 0; i < v->stripe.volumes_count; i++) {
ret = bl_parse_deviceid(server, &d->children[i],
volumes, v->stripe.volumes[i], gfp_mask);
if (ret)
return ret;
d->nr_children++;
len += d->children[i].len;
}
d->len = len;
d->chunk_size = v->stripe.chunk_size;
d->map = bl_map_stripe;
return 0;
}
static int
bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
switch (volumes[idx].type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
return bl_parse_simple(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_SLICE:
return bl_parse_slice(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_CONCAT:
return bl_parse_concat(server, d, volumes, idx, gfp_mask);
case PNFS_BLOCK_VOLUME_STRIPE:
return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
default:
dprintk("unsupported volume type: %d\n", volumes[idx].type);
return -EIO;
}
}
struct nfs4_deviceid_node *
bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
gfp_t gfp_mask)
{
struct nfs4_deviceid_node *node = NULL;
struct pnfs_block_volume *volumes;
struct pnfs_block_dev *top;
struct xdr_stream xdr;
struct xdr_buf buf;
struct page *scratch;
int nr_volumes, ret, i;
__be32 *p;
scratch = alloc_page(gfp_mask);
if (!scratch)
goto out;
xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
p = xdr_inline_decode(&xdr, sizeof(__be32));
if (!p)
goto out_free_scratch;
nr_volumes = be32_to_cpup(p++);
volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
gfp_mask);
if (!volumes)
goto out_free_scratch;
for (i = 0; i < nr_volumes; i++) {
ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
if (ret < 0)
goto out_free_volumes;
}
top = kzalloc(sizeof(*top), gfp_mask);
if (!top)
goto out_free_volumes;
ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
if (ret) {
bl_free_device(top);
kfree(top);
goto out_free_volumes;
}
node = &top->node;
nfs4_init_deviceid_node(node, server, &pdev->dev_id);
out_free_volumes:
kfree(volumes);
out_free_scratch:
__free_page(scratch);
out:
return node;
}

View file

@ -0,0 +1,602 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/vmalloc.h>
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
return rb_entry(node, struct pnfs_block_extent, be_node);
}
static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
struct rb_node *node = rb_first(root);
return node ? ext_node(node) : NULL;
}
static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
struct rb_node *node = rb_prev(&be->be_node);
return node ? ext_node(node) : NULL;
}
static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
struct rb_node *node = rb_next(&be->be_node);
return node ? ext_node(node) : NULL;
}
static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
return be->be_f_offset + be->be_length;
}
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
struct rb_node *node = root->rb_node;
struct pnfs_block_extent *be = NULL;
while (node) {
be = ext_node(node);
if (start < be->be_f_offset)
node = node->rb_left;
else if (start >= ext_f_end(be))
node = node->rb_right;
else
return be;
}
if (be) {
if (start < be->be_f_offset)
return be;
if (start >= ext_f_end(be))
return ext_tree_next(be);
}
return NULL;
}
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
if (be1->be_state != be2->be_state)
return false;
if (be1->be_device != be2->be_device)
return false;
if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
return false;
if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
(be1->be_v_offset + be1->be_length != be2->be_v_offset))
return false;
if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
be1->be_tag != be2->be_tag)
return false;
return true;
}
static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
struct pnfs_block_extent *left = ext_tree_prev(be);
if (left && ext_can_merge(left, be)) {
left->be_length += be->be_length;
rb_erase(&be->be_node, root);
nfs4_put_deviceid_node(be->be_device);
kfree(be);
return left;
}
return be;
}
static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
struct pnfs_block_extent *right = ext_tree_next(be);
if (right && ext_can_merge(be, right)) {
be->be_length += right->be_length;
rb_erase(&right->be_node, root);
nfs4_put_deviceid_node(right->be_device);
kfree(right);
}
return be;
}
static void
__ext_tree_insert(struct rb_root *root,
struct pnfs_block_extent *new, bool merge_ok)
{
struct rb_node **p = &root->rb_node, *parent = NULL;
struct pnfs_block_extent *be;
while (*p) {
parent = *p;
be = ext_node(parent);
if (new->be_f_offset < be->be_f_offset) {
if (merge_ok && ext_can_merge(new, be)) {
be->be_f_offset = new->be_f_offset;
if (be->be_state != PNFS_BLOCK_NONE_DATA)
be->be_v_offset = new->be_v_offset;
be->be_length += new->be_length;
be = ext_try_to_merge_left(root, be);
goto free_new;
}
p = &(*p)->rb_left;
} else if (new->be_f_offset >= ext_f_end(be)) {
if (merge_ok && ext_can_merge(be, new)) {
be->be_length += new->be_length;
be = ext_try_to_merge_right(root, be);
goto free_new;
}
p = &(*p)->rb_right;
} else {
BUG();
}
}
rb_link_node(&new->be_node, parent, p);
rb_insert_color(&new->be_node, root);
return;
free_new:
nfs4_put_deviceid_node(new->be_device);
kfree(new);
}
static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
{
struct pnfs_block_extent *be;
sector_t len1 = 0, len2 = 0;
sector_t orig_v_offset;
sector_t orig_len;
be = __ext_tree_search(root, start);
if (!be)
return 0;
if (be->be_f_offset >= end)
return 0;
orig_v_offset = be->be_v_offset;
orig_len = be->be_length;
if (start > be->be_f_offset)
len1 = start - be->be_f_offset;
if (ext_f_end(be) > end)
len2 = ext_f_end(be) - end;
if (len2 > 0) {
if (len1 > 0) {
struct pnfs_block_extent *new;
new = kzalloc(sizeof(*new), GFP_ATOMIC);
if (!new)
return -ENOMEM;
be->be_length = len1;
new->be_f_offset = end;
if (be->be_state != PNFS_BLOCK_NONE_DATA) {
new->be_v_offset =
orig_v_offset + orig_len - len2;
}
new->be_length = len2;
new->be_state = be->be_state;
new->be_tag = be->be_tag;
new->be_device = nfs4_get_deviceid(be->be_device);
__ext_tree_insert(root, new, true);
} else {
be->be_f_offset = end;
if (be->be_state != PNFS_BLOCK_NONE_DATA) {
be->be_v_offset =
orig_v_offset + orig_len - len2;
}
be->be_length = len2;
}
} else {
if (len1 > 0) {
be->be_length = len1;
be = ext_tree_next(be);
}
while (be && ext_f_end(be) <= end) {
struct pnfs_block_extent *next = ext_tree_next(be);
rb_erase(&be->be_node, root);
nfs4_put_deviceid_node(be->be_device);
kfree(be);
be = next;
}
if (be && be->be_f_offset < end) {
len1 = ext_f_end(be) - end;
be->be_f_offset = end;
if (be->be_state != PNFS_BLOCK_NONE_DATA)
be->be_v_offset += be->be_length - len1;
be->be_length = len1;
}
}
return 0;
}
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
struct pnfs_block_extent *be;
struct rb_root *root;
int err = 0;
switch (new->be_state) {
case PNFS_BLOCK_READWRITE_DATA:
case PNFS_BLOCK_INVALID_DATA:
root = &bl->bl_ext_rw;
break;
case PNFS_BLOCK_READ_DATA:
case PNFS_BLOCK_NONE_DATA:
root = &bl->bl_ext_ro;
break;
default:
dprintk("invalid extent type\n");
return -EINVAL;
}
spin_lock(&bl->bl_ext_lock);
retry:
be = __ext_tree_search(root, new->be_f_offset);
if (!be || be->be_f_offset >= ext_f_end(new)) {
__ext_tree_insert(root, new, true);
} else if (new->be_f_offset >= be->be_f_offset) {
if (ext_f_end(new) <= ext_f_end(be)) {
nfs4_put_deviceid_node(new->be_device);
kfree(new);
} else {
sector_t new_len = ext_f_end(new) - ext_f_end(be);
sector_t diff = new->be_length - new_len;
new->be_f_offset += diff;
new->be_v_offset += diff;
new->be_length = new_len;
goto retry;
}
} else if (ext_f_end(new) <= ext_f_end(be)) {
new->be_length = be->be_f_offset - new->be_f_offset;
__ext_tree_insert(root, new, true);
} else {
struct pnfs_block_extent *split;
sector_t new_len = ext_f_end(new) - ext_f_end(be);
sector_t diff = new->be_length - new_len;
split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
if (!split) {
err = -EINVAL;
goto out;
}
split->be_length = be->be_f_offset - split->be_f_offset;
split->be_device = nfs4_get_deviceid(new->be_device);
__ext_tree_insert(root, split, true);
new->be_f_offset += diff;
new->be_v_offset += diff;
new->be_length = new_len;
goto retry;
}
out:
spin_unlock(&bl->bl_ext_lock);
return err;
}
static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
struct pnfs_block_extent *ret)
{
struct rb_node *node;
struct pnfs_block_extent *be;
node = root->rb_node;
while (node) {
be = ext_node(node);
if (isect < be->be_f_offset)
node = node->rb_left;
else if (isect >= ext_f_end(be))
node = node->rb_right;
else {
*ret = *be;
return true;
}
}
return false;
}
bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw)
{
bool found = false;
spin_lock(&bl->bl_ext_lock);
if (!rw)
found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
if (!found)
found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
spin_unlock(&bl->bl_ext_lock);
return found;
}
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
sector_t start, sector_t end)
{
int err, err2;
spin_lock(&bl->bl_ext_lock);
err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
if (rw) {
err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
if (!err)
err = err2;
}
spin_unlock(&bl->bl_ext_lock);
return err;
}
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
sector_t split)
{
struct pnfs_block_extent *new;
sector_t orig_len = be->be_length;
new = kzalloc(sizeof(*new), GFP_ATOMIC);
if (!new)
return -ENOMEM;
be->be_length = split - be->be_f_offset;
new->be_f_offset = split;
if (be->be_state != PNFS_BLOCK_NONE_DATA)
new->be_v_offset = be->be_v_offset + be->be_length;
new->be_length = orig_len - be->be_length;
new->be_state = be->be_state;
new->be_tag = be->be_tag;
new->be_device = nfs4_get_deviceid(be->be_device);
__ext_tree_insert(root, new, false);
return 0;
}
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t len)
{
struct rb_root *root = &bl->bl_ext_rw;
sector_t end = start + len;
struct pnfs_block_extent *be;
int err = 0;
spin_lock(&bl->bl_ext_lock);
/*
* First remove all COW extents or holes from written to range.
*/
err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
if (err)
goto out;
/*
* Then mark all invalid extents in the range as written to.
*/
for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
if (be->be_f_offset >= end)
break;
if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
continue;
if (be->be_f_offset < start) {
struct pnfs_block_extent *left = ext_tree_prev(be);
if (left && ext_can_merge(left, be)) {
sector_t diff = start - be->be_f_offset;
left->be_length += diff;
be->be_f_offset += diff;
be->be_v_offset += diff;
be->be_length -= diff;
} else {
err = ext_tree_split(root, be, start);
if (err)
goto out;
}
}
if (ext_f_end(be) > end) {
struct pnfs_block_extent *right = ext_tree_next(be);
if (right && ext_can_merge(be, right)) {
sector_t diff = end - be->be_f_offset;
be->be_length -= diff;
right->be_f_offset -= diff;
right->be_v_offset -= diff;
right->be_length += diff;
} else {
err = ext_tree_split(root, be, end);
if (err)
goto out;
}
}
if (be->be_f_offset >= start && ext_f_end(be) <= end) {
be->be_tag = EXTENT_WRITTEN;
be = ext_try_to_merge_left(root, be);
be = ext_try_to_merge_right(root, be);
}
}
out:
spin_unlock(&bl->bl_ext_lock);
return err;
}
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
size_t buffer_size)
{
if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
for (i = 0; i < nr_pages; i++)
put_page(arg->layoutupdate_pages[i]);
kfree(arg->layoutupdate_pages);
} else {
put_page(arg->layoutupdate_page);
}
}
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
size_t buffer_size, size_t *count)
{
struct pnfs_block_extent *be;
int ret = 0;
spin_lock(&bl->bl_ext_lock);
for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
be->be_tag != EXTENT_WRITTEN)
continue;
(*count)++;
if (*count * BL_EXTENT_SIZE > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
}
p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
NFS4_DEVICEID4_SIZE);
p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
p = xdr_encode_hyper(p, 0LL);
*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
be->be_tag = EXTENT_COMMITTING;
}
spin_unlock(&bl->bl_ext_lock);
return ret;
}
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
size_t count = 0, buffer_size = PAGE_SIZE;
__be32 *start_p;
int ret;
dprintk("%s enter\n", __func__);
arg->layoutupdate_page = alloc_page(GFP_NOFS);
if (!arg->layoutupdate_page)
return -ENOMEM;
start_p = page_address(arg->layoutupdate_page);
arg->layoutupdate_pages = &arg->layoutupdate_page;
retry:
ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
count = 0;
arg->layoutupdate_pages =
kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
sizeof(struct page *), GFP_NOFS);
if (!arg->layoutupdate_pages)
return -ENOMEM;
start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
if (!start_p) {
kfree(arg->layoutupdate_pages);
return -ENOMEM;
}
goto retry;
}
*start_p = cpu_to_be32(count);
arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
__be32 *p = start_p;
int i = 0;
for (p = start_p;
p < start_p + arg->layoutupdate_len;
p += PAGE_SIZE) {
arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
}
}
dprintk("%s found %zu ranges\n", __func__, count);
return 0;
}
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
struct rb_root *root = &bl->bl_ext_rw;
struct pnfs_block_extent *be;
dprintk("%s status %d\n", __func__, status);
ext_tree_free_commitdata(arg, arg->layoutupdate_len);
spin_lock(&bl->bl_ext_lock);
for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
be->be_tag != EXTENT_COMMITTING)
continue;
if (status) {
/*
* Mark as written and try again.
*
* XXX: some real error handling here wouldn't hurt..
*/
be->be_tag = EXTENT_WRITTEN;
} else {
be->be_state = PNFS_BLOCK_READWRITE_DATA;
be->be_tag = 0;
}
be = ext_try_to_merge_left(root, be);
be = ext_try_to_merge_right(root, be);
}
spin_unlock(&bl->bl_ext_lock);
}

View file

@ -0,0 +1,288 @@
/*
* Copyright (c) 2006,2007 The Regents of the University of Michigan.
* All rights reserved.
*
* Andy Adamson <andros@citi.umich.edu>
* Fred Isaman <iisaman@umich.edu>
*
* permission is granted to use, copy, create derivative works and
* redistribute this software and such derivative works for any purpose,
* so long as the name of the university of michigan is not used in
* any advertising or publicity pertaining to the use or distribution
* of this software without specific, written prior authorization. if
* the above copyright notice or any other identification of the
* university of michigan is included in any copy of any portion of
* this software, then the disclaimer below must also be included.
*
* this software is provided as is, without representation from the
* university of michigan as to its fitness for any purpose, and without
* warranty by the university of michigan of any kind, either express
* or implied, including without limitation the implied warranties of
* merchantability and fitness for a particular purpose. the regents
* of the university of michigan shall not be liable for any damages,
* including special, indirect, incidental, or consequential damages,
* with respect to any claim arising out or in connection with the use
* of the software, even if it has been or is hereafter advised of the
* possibility of such damages.
*/
#include <linux/module.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "blocklayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
static void
nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
{
int i;
*p++ = cpu_to_be32(1);
*p++ = cpu_to_be32(b->type);
*p++ = cpu_to_be32(b->simple.nr_sigs);
for (i = 0; i < b->simple.nr_sigs; i++) {
p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
b->simple.sigs[i].sig_len);
}
}
dev_t
bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
gfp_t gfp_mask)
{
struct net *net = server->nfs_client->cl_net;
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct bl_dev_msg *reply = &nn->bl_mount_reply;
struct bl_pipe_msg bl_pipe_msg;
struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
struct bl_msg_hdr *bl_msg;
DECLARE_WAITQUEUE(wq, current);
dev_t dev = 0;
int rc;
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
mutex_lock(&nn->bl_mutex);
bl_pipe_msg.bl_wq = &nn->bl_wq;
b->simple.len += 4; /* single volume */
if (b->simple.len > PAGE_SIZE)
goto out_unlock;
memset(msg, 0, sizeof(*msg));
msg->len = sizeof(*bl_msg) + b->simple.len;
msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
goto out_free_data;
bl_msg = msg->data;
bl_msg->type = BL_DEVICE_MOUNT,
bl_msg->totallen = b->simple.len;
nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
add_wait_queue(&nn->bl_wq, &wq);
rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
if (rc < 0) {
remove_wait_queue(&nn->bl_wq, &wq);
goto out_free_data;
}
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
remove_wait_queue(&nn->bl_wq, &wq);
if (reply->status != BL_DEVICE_REQUEST_PROC) {
printk(KERN_WARNING "%s failed to decode device: %d\n",
__func__, reply->status);
goto out_free_data;
}
dev = MKDEV(reply->major, reply->minor);
out_free_data:
kfree(msg->data);
out_unlock:
mutex_unlock(&nn->bl_mutex);
return dev;
}
static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
size_t mlen)
{
struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
nfs_net_id);
if (mlen != sizeof (struct bl_dev_msg))
return -EINVAL;
if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
return -EFAULT;
wake_up(&nn->bl_wq);
return mlen;
}
static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
{
struct bl_pipe_msg *bl_pipe_msg =
container_of(msg, struct bl_pipe_msg, msg);
if (msg->errno >= 0)
return;
wake_up(bl_pipe_msg->bl_wq);
}
static const struct rpc_pipe_ops bl_upcall_ops = {
.upcall = rpc_pipe_generic_upcall,
.downcall = bl_pipe_downcall,
.destroy_msg = bl_pipe_destroy_msg,
};
static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
struct rpc_pipe *pipe)
{
struct dentry *dir, *dentry;
dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
if (dir == NULL)
return ERR_PTR(-ENOENT);
dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
dput(dir);
return dentry;
}
static void nfs4blocklayout_unregister_sb(struct super_block *sb,
struct rpc_pipe *pipe)
{
if (pipe->dentry)
rpc_unlink(pipe->dentry);
}
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
void *ptr)
{
struct super_block *sb = ptr;
struct net *net = sb->s_fs_info;
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct dentry *dentry;
int ret = 0;
if (!try_module_get(THIS_MODULE))
return 0;
if (nn->bl_device_pipe == NULL) {
module_put(THIS_MODULE);
return 0;
}
switch (event) {
case RPC_PIPEFS_MOUNT:
dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
break;
}
nn->bl_device_pipe->dentry = dentry;
break;
case RPC_PIPEFS_UMOUNT:
if (nn->bl_device_pipe->dentry)
nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
break;
default:
ret = -ENOTSUPP;
break;
}
module_put(THIS_MODULE);
return ret;
}
static struct notifier_block nfs4blocklayout_block = {
.notifier_call = rpc_pipefs_event,
};
static struct dentry *nfs4blocklayout_register_net(struct net *net,
struct rpc_pipe *pipe)
{
struct super_block *pipefs_sb;
struct dentry *dentry;
pipefs_sb = rpc_get_sb_net(net);
if (!pipefs_sb)
return NULL;
dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
rpc_put_sb_net(net);
return dentry;
}
static void nfs4blocklayout_unregister_net(struct net *net,
struct rpc_pipe *pipe)
{
struct super_block *pipefs_sb;
pipefs_sb = rpc_get_sb_net(net);
if (pipefs_sb) {
nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
rpc_put_sb_net(net);
}
}
static int nfs4blocklayout_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct dentry *dentry;
mutex_init(&nn->bl_mutex);
init_waitqueue_head(&nn->bl_wq);
nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
if (IS_ERR(nn->bl_device_pipe))
return PTR_ERR(nn->bl_device_pipe);
dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
if (IS_ERR(dentry)) {
rpc_destroy_pipe_data(nn->bl_device_pipe);
return PTR_ERR(dentry);
}
nn->bl_device_pipe->dentry = dentry;
return 0;
}
static void nfs4blocklayout_net_exit(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
rpc_destroy_pipe_data(nn->bl_device_pipe);
nn->bl_device_pipe = NULL;
}
static struct pernet_operations nfs4blocklayout_net_ops = {
.init = nfs4blocklayout_net_init,
.exit = nfs4blocklayout_net_exit,
};
int __init bl_init_pipefs(void)
{
int ret;
ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
if (ret)
goto out;
ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
if (ret)
goto out_unregister_notifier;
return 0;
out_unregister_notifier:
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
out:
return ret;
}
void __exit bl_cleanup_pipefs(void)
{
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
unregister_pernet_subsys(&nfs4blocklayout_net_ops);
}