Mirror of https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git (synced 2025-09-08 01:08:03 -04:00)

Commit f6dfaef42e: Fixed MTP to work with TWRP
50820 changed files with 20846062 additions and 0 deletions
net/sunrpc/xprtrdma/Makefile (Normal file, 8 additions)
@@ -0,0 +1,8 @@
obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o

xprtrdma-y := transport.o rpc_rdma.o verbs.o

obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o

svcrdma-y := svc_rdma.o svc_rdma_transport.o \
	svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
net/sunrpc/xprtrdma/rpc_rdma.c (Normal file, 875 additions)
@@ -0,0 +1,875 @@
/*
|
||||
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the BSD-type
|
||||
* license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials provided
|
||||
* with the distribution.
|
||||
*
|
||||
* Neither the name of the Network Appliance, Inc. nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include "xprt_rdma.h"

#include <linux/highmem.h>

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

#ifdef RPC_DEBUG
static const char transfertypes[][12] = {
	"pure inline",	/* no chunks */
	" read chunk",	/* some argument via rdma read */
	"*read chunk",	/* entire request via rdma read */
	"write chunk",	/* some result via rdma write */
	"reply chunk"	/* entire reply via rdma write */
};
#endif
|
||||
|
||||
/*
 * Chunk assembly from upper layer xdr_buf.
 *
 * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
 * elements. Segments are then coalesced when registered, if possible
 * within the selected memreg mode.
 *
 * Returns positive number of segments converted, or a negative errno.
 */

static int
rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs)
{
	int len, n = 0, p;
	int page_base;
	struct page **ppages;

	if (pos == 0 && xdrbuf->head[0].iov_len) {
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->head[0].iov_base;
		seg[n].mr_len = xdrbuf->head[0].iov_len;
		++n;
	}

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = xdrbuf->page_base & ~PAGE_MASK;
	p = 0;
	while (len && n < nsegs) {
		if (!ppages[p]) {
			/* alloc the pagelist for receiving buffer */
			ppages[p] = alloc_page(GFP_ATOMIC);
			if (!ppages[p])
				return -ENOMEM;
		}
		seg[n].mr_page = ppages[p];
		seg[n].mr_offset = (void *)(unsigned long) page_base;
		seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		if (seg[n].mr_len > PAGE_SIZE)
			return -EIO;
		len -= seg[n].mr_len;
		++n;
		++p;
		page_base = 0;	/* page offset only applies to first page */
	}

	/* Message overflows the seg array */
	if (len && n == nsegs)
		return -EIO;

	if (xdrbuf->tail[0].iov_len) {
		/* the rpcrdma protocol allows us to omit any trailing
		 * xdr pad bytes, saving the server an RDMA operation. */
		if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
			return n;
		if (n == nsegs)
			/* Tail remains, but we're out of segments */
			return -EIO;
		seg[n].mr_page = NULL;
		seg[n].mr_offset = xdrbuf->tail[0].iov_base;
		seg[n].mr_len = xdrbuf->tail[0].iov_len;
		++n;
	}

	return n;
}
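To make the head/pages/tail walk above concrete, here is a small standalone sketch (userspace C; the types, sizes, and values are assumptions chosen only for illustration) of how an xdr_buf fans out into a flat segment array. It models the logic, not the kernel code itself.

/* Illustrative model of rpcrdma_convert_iovs(): one segment for the head
 * kvec, one per touched page, one for the tail kvec. */
#include <stdio.h>
#include <stddef.h>

#define SKETCH_PAGE_SIZE 4096
#define SKETCH_MAX_SEGS  8

struct sketch_seg { size_t offset; size_t len; int is_page; };

static int sketch_convert(size_t head_len, size_t page_base, size_t page_len,
			  size_t tail_len, struct sketch_seg *seg, int nsegs)
{
	int n = 0;
	size_t len = page_len;

	if (head_len && n < nsegs) {
		seg[n].offset = 0; seg[n].len = head_len; seg[n].is_page = 0; n++;
	}
	while (len && n < nsegs) {
		size_t chunk = SKETCH_PAGE_SIZE - page_base;
		if (chunk > len)
			chunk = len;
		seg[n].offset = page_base; seg[n].len = chunk; seg[n].is_page = 1; n++;
		len -= chunk;
		page_base = 0;	/* offset applies to the first page only */
	}
	if (len)
		return -1;	/* message overflows the segment array */
	if (tail_len && n < nsegs) {
		seg[n].offset = 0; seg[n].len = tail_len; seg[n].is_page = 0; n++;
	}
	return n;
}

int main(void)
{
	struct sketch_seg seg[SKETCH_MAX_SEGS];
	int i, n = sketch_convert(120, 100, 6000, 4, seg, SKETCH_MAX_SEGS);

	for (i = 0; i < n; i++)
		printf("seg %d: %s offset=%zu len=%zu\n", i,
		       seg[i].is_page ? "page" : "kvec", seg[i].offset, seg[i].len);
	return 0;
}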
|
||||
|
||||
/*
|
||||
* Create read/write chunk lists, and reply chunks, for RDMA
|
||||
*
|
||||
* Assume check against THRESHOLD has been done, and chunks are required.
|
||||
* Assume only encoding one list entry for read|write chunks. The NFSv3
|
||||
* protocol is simple enough to allow this as it only has a single "bulk
|
||||
* result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The
|
||||
* RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.)
|
||||
*
|
||||
* When used for a single reply chunk (which is a special write
|
||||
* chunk used for the entire reply, rather than just the data), it
|
||||
* is used primarily for READDIR and READLINK which would otherwise
|
||||
* be severely size-limited by a small rdma inline read max. The server
|
||||
* response will come back as an RDMA Write, followed by a message
|
||||
* of type RDMA_NOMSG carrying the xid and length. As a result, reply
|
||||
* chunks do not provide data alignment, however they do not require
|
||||
* "fixup" (moving the response to the upper layer buffer) either.
|
||||
*
|
||||
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
|
||||
*
|
||||
* Read chunklist (a linked list):
|
||||
* N elements, position P (same P for all chunks of same arg!):
|
||||
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
|
||||
*
|
||||
* Write chunklist (a list of (one) counted array):
|
||||
* N elements:
|
||||
* 1 - N - HLOO - HLOO - ... - HLOO - 0
|
||||
*
|
||||
* Reply chunk (a counted array):
|
||||
* N elements:
|
||||
* 1 - N - HLOO - HLOO - ... - HLOO
|
||||
*
|
||||
* Returns positive RPC/RDMA header size, or negative errno.
|
||||
*/
|
||||
|
||||
static ssize_t
|
||||
rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
|
||||
struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type)
|
||||
{
|
||||
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
|
||||
int n, nsegs, nchunks = 0;
|
||||
unsigned int pos;
|
||||
struct rpcrdma_mr_seg *seg = req->rl_segments;
|
||||
struct rpcrdma_read_chunk *cur_rchunk = NULL;
|
||||
struct rpcrdma_write_array *warray = NULL;
|
||||
struct rpcrdma_write_chunk *cur_wchunk = NULL;
|
||||
__be32 *iptr = headerp->rm_body.rm_chunks;
|
||||
|
||||
if (type == rpcrdma_readch || type == rpcrdma_areadch) {
|
||||
/* a read chunk - server will RDMA Read our memory */
|
||||
cur_rchunk = (struct rpcrdma_read_chunk *) iptr;
|
||||
} else {
|
||||
/* a write or reply chunk - server will RDMA Write our memory */
|
||||
*iptr++ = xdr_zero; /* encode a NULL read chunk list */
|
||||
if (type == rpcrdma_replych)
|
||||
*iptr++ = xdr_zero; /* a NULL write chunk list */
|
||||
warray = (struct rpcrdma_write_array *) iptr;
|
||||
cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1);
|
||||
}
|
||||
|
||||
if (type == rpcrdma_replych || type == rpcrdma_areadch)
|
||||
pos = 0;
|
||||
else
|
||||
pos = target->head[0].iov_len;
|
||||
|
||||
nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS);
|
||||
if (nsegs < 0)
|
||||
return nsegs;
|
||||
|
||||
do {
|
||||
n = rpcrdma_register_external(seg, nsegs,
|
||||
cur_wchunk != NULL, r_xprt);
|
||||
if (n <= 0)
|
||||
goto out;
|
||||
if (cur_rchunk) { /* read */
|
||||
cur_rchunk->rc_discrim = xdr_one;
|
||||
/* all read chunks have the same "position" */
|
||||
cur_rchunk->rc_position = htonl(pos);
|
||||
cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey);
|
||||
cur_rchunk->rc_target.rs_length = htonl(seg->mr_len);
|
||||
xdr_encode_hyper(
|
||||
(__be32 *)&cur_rchunk->rc_target.rs_offset,
|
||||
seg->mr_base);
|
||||
dprintk("RPC: %s: read chunk "
|
||||
"elem %d@0x%llx:0x%x pos %u (%s)\n", __func__,
|
||||
seg->mr_len, (unsigned long long)seg->mr_base,
|
||||
seg->mr_rkey, pos, n < nsegs ? "more" : "last");
|
||||
cur_rchunk++;
|
||||
r_xprt->rx_stats.read_chunk_count++;
|
||||
} else { /* write/reply */
|
||||
cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey);
|
||||
cur_wchunk->wc_target.rs_length = htonl(seg->mr_len);
|
||||
xdr_encode_hyper(
|
||||
(__be32 *)&cur_wchunk->wc_target.rs_offset,
|
||||
seg->mr_base);
|
||||
dprintk("RPC: %s: %s chunk "
|
||||
"elem %d@0x%llx:0x%x (%s)\n", __func__,
|
||||
(type == rpcrdma_replych) ? "reply" : "write",
|
||||
seg->mr_len, (unsigned long long)seg->mr_base,
|
||||
seg->mr_rkey, n < nsegs ? "more" : "last");
|
||||
cur_wchunk++;
|
||||
if (type == rpcrdma_replych)
|
||||
r_xprt->rx_stats.reply_chunk_count++;
|
||||
else
|
||||
r_xprt->rx_stats.write_chunk_count++;
|
||||
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
|
||||
}
|
||||
nchunks++;
|
||||
seg += n;
|
||||
nsegs -= n;
|
||||
} while (nsegs);
|
||||
|
||||
/* success. all failures return above */
|
||||
req->rl_nchunks = nchunks;
|
||||
|
||||
/*
|
||||
* finish off header. If write, marshal discrim and nchunks.
|
||||
*/
|
||||
if (cur_rchunk) {
|
||||
iptr = (__be32 *) cur_rchunk;
|
||||
*iptr++ = xdr_zero; /* finish the read chunk list */
|
||||
*iptr++ = xdr_zero; /* encode a NULL write chunk list */
|
||||
*iptr++ = xdr_zero; /* encode a NULL reply chunk */
|
||||
} else {
|
||||
warray->wc_discrim = xdr_one;
|
||||
warray->wc_nchunks = htonl(nchunks);
|
||||
iptr = (__be32 *) cur_wchunk;
|
||||
if (type == rpcrdma_writech) {
|
||||
*iptr++ = xdr_zero; /* finish the write chunk list */
|
||||
*iptr++ = xdr_zero; /* encode a NULL reply chunk */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Return header size.
|
||||
*/
|
||||
return (unsigned char *)iptr - (unsigned char *)headerp;
|
||||
|
||||
out:
|
||||
if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) {
|
||||
for (pos = 0; nchunks--;)
|
||||
pos += rpcrdma_deregister_external(
|
||||
&req->rl_segments[pos], r_xprt);
|
||||
}
|
||||
return n;
|
||||
}
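The encoding key described in the comment before rpcrdma_create_chunks() can be illustrated with a minimal standalone sketch. The handle, length, and offset values below are arbitrary assumptions, and the helper is not part of this file.

/* Sketch of the single-list encodings: a read chunk list is
 * 1 - P - HLOO per element, terminated by 0; write/reply chunks are a
 * counted array of HLOO entries. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t *encode_hloo(uint32_t *p, uint32_t handle, uint32_t len, uint64_t off)
{
	*p++ = htonl(handle);
	*p++ = htonl(len);
	*p++ = htonl((uint32_t)(off >> 32));	/* Offset64, high word first */
	*p++ = htonl((uint32_t)off);
	return p;
}

int main(void)
{
	uint32_t buf[16], *p = buf;

	/* one-element read chunk list: 1 - P - HLOO - 0 */
	*p++ = htonl(1);	/* discriminator */
	*p++ = htonl(0);	/* position P in the XDR stream */
	p = encode_hloo(p, 0x1234, 4096, 0x10000);
	*p++ = 0;		/* end of list */

	printf("read list consumed %zu words\n", (size_t)(p - buf));
	return 0;
}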
|
||||
|
||||
/*
|
||||
* Marshal chunks. This routine returns the header length
|
||||
* consumed by marshaling.
|
||||
*
|
||||
* Returns positive RPC/RDMA header size, or negative errno.
|
||||
*/
|
||||
|
||||
ssize_t
|
||||
rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result)
|
||||
{
|
||||
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
|
||||
struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base;
|
||||
|
||||
if (req->rl_rtype != rpcrdma_noch)
|
||||
result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
|
||||
headerp, req->rl_rtype);
|
||||
else if (req->rl_wtype != rpcrdma_noch)
|
||||
result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf,
|
||||
headerp, req->rl_wtype);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy write data inline.
|
||||
* This function is used for "small" requests. Data which is passed
|
||||
* to RPC via iovecs (or page list) is copied directly into the
|
||||
* pre-registered memory buffer for this request. For small amounts
|
||||
* of data, this is efficient. The cutoff value is tunable.
|
||||
*/
|
||||
static int
|
||||
rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
|
||||
{
|
||||
int i, npages, curlen;
|
||||
int copy_len;
|
||||
unsigned char *srcp, *destp;
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
|
||||
int page_base;
|
||||
struct page **ppages;
|
||||
|
||||
destp = rqst->rq_svec[0].iov_base;
|
||||
curlen = rqst->rq_svec[0].iov_len;
|
||||
destp += curlen;
|
||||
/*
|
||||
* Do optional padding where it makes sense. Alignment of write
|
||||
* payload can help the server, if our setting is accurate.
|
||||
*/
|
||||
pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
|
||||
if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
|
||||
pad = 0; /* don't pad this request */
|
||||
|
||||
dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
|
||||
__func__, pad, destp, rqst->rq_slen, curlen);
|
||||
|
||||
copy_len = rqst->rq_snd_buf.page_len;
|
||||
|
||||
if (rqst->rq_snd_buf.tail[0].iov_len) {
|
||||
curlen = rqst->rq_snd_buf.tail[0].iov_len;
|
||||
if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
|
||||
memmove(destp + copy_len,
|
||||
rqst->rq_snd_buf.tail[0].iov_base, curlen);
|
||||
r_xprt->rx_stats.pullup_copy_count += curlen;
|
||||
}
|
||||
dprintk("RPC: %s: tail destp 0x%p len %d\n",
|
||||
__func__, destp + copy_len, curlen);
|
||||
rqst->rq_svec[0].iov_len += curlen;
|
||||
}
|
||||
r_xprt->rx_stats.pullup_copy_count += copy_len;
|
||||
|
||||
page_base = rqst->rq_snd_buf.page_base;
|
||||
ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
|
||||
page_base &= ~PAGE_MASK;
|
||||
npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
|
||||
for (i = 0; copy_len && i < npages; i++) {
|
||||
curlen = PAGE_SIZE - page_base;
|
||||
if (curlen > copy_len)
|
||||
curlen = copy_len;
|
||||
dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n",
|
||||
__func__, i, destp, copy_len, curlen);
|
||||
srcp = kmap_atomic(ppages[i]);
|
||||
memcpy(destp, srcp+page_base, curlen);
|
||||
kunmap_atomic(srcp);
|
||||
rqst->rq_svec[0].iov_len += curlen;
|
||||
destp += curlen;
|
||||
copy_len -= curlen;
|
||||
page_base = 0;
|
||||
}
|
||||
/* header now contains entire send message */
|
||||
return pad;
|
||||
}
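A minimal userspace sketch of the pullup idea, assuming toy buffers: scattered page data is copied directly behind the RPC header so that a small request can be sent as one contiguous inline buffer.

/* Model only; the kernel path copies real pages via kmap_atomic(). */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char header[64] = "RPC-HEADER";
	char page_a[8] = "AAAAAAA", page_b[8] = "BBBBBBB";
	char inline_buf[128];
	size_t off = 0;

	memcpy(inline_buf + off, header, strlen(header)); off += strlen(header);
	memcpy(inline_buf + off, page_a, strlen(page_a)); off += strlen(page_a);
	memcpy(inline_buf + off, page_b, strlen(page_b)); off += strlen(page_b);
	inline_buf[off] = '\0';

	printf("inline send buffer: %s (%zu bytes)\n", inline_buf, off);
	return 0;
}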
|
||||
|
||||
/*
|
||||
* Marshal a request: the primary job of this routine is to choose
|
||||
* the transfer modes. See comments below.
|
||||
*
|
||||
* Uses multiple RDMA IOVs for a request:
|
||||
* [0] -- RPC RDMA header, which uses memory from the *start* of the
|
||||
* preregistered buffer that already holds the RPC data in
|
||||
* its middle.
|
||||
* [1] -- the RPC header/data, marshaled by RPC and the NFS protocol.
|
||||
* [2] -- optional padding.
|
||||
* [3] -- if padded, header only in [1] and data here.
|
||||
*
|
||||
* Returns zero on success, otherwise a negative errno.
|
||||
*/
|
||||
|
||||
int
|
||||
rpcrdma_marshal_req(struct rpc_rqst *rqst)
|
||||
{
|
||||
struct rpc_xprt *xprt = rqst->rq_xprt;
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
|
||||
char *base;
|
||||
size_t rpclen, padlen;
|
||||
ssize_t hdrlen;
|
||||
struct rpcrdma_msg *headerp;
|
||||
|
||||
/*
|
||||
* rpclen gets amount of data in first buffer, which is the
|
||||
* pre-registered buffer.
|
||||
*/
|
||||
base = rqst->rq_svec[0].iov_base;
|
||||
rpclen = rqst->rq_svec[0].iov_len;
|
||||
|
||||
/* build RDMA header in private area at front */
|
||||
headerp = (struct rpcrdma_msg *) req->rl_base;
|
||||
/* don't htonl XID, it's already done in request */
|
||||
headerp->rm_xid = rqst->rq_xid;
|
||||
headerp->rm_vers = xdr_one;
|
||||
headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests);
|
||||
headerp->rm_type = htonl(RDMA_MSG);
|
||||
|
||||
/*
|
||||
* Chunks needed for results?
|
||||
*
|
||||
* o If the expected result is under the inline threshold, all ops
|
||||
* return as inline (but see later).
|
||||
* o Large non-read ops return as a single reply chunk.
|
||||
* o Large read ops return data as write chunk(s), header as inline.
|
||||
*
|
||||
* Note: the NFS code sending down multiple result segments implies
|
||||
* the op is one of read, readdir[plus], readlink or NFSv4 getacl.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This code can handle read chunks, write chunks OR reply
|
||||
* chunks -- only one type. If the request is too big to fit
|
||||
* inline, then we will choose read chunks. If the request is
|
||||
* a READ, then use write chunks to separate the file data
|
||||
* into pages; otherwise use reply chunks.
|
||||
*/
|
||||
if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
|
||||
req->rl_wtype = rpcrdma_noch;
|
||||
else if (rqst->rq_rcv_buf.page_len == 0)
|
||||
req->rl_wtype = rpcrdma_replych;
|
||||
else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
|
||||
req->rl_wtype = rpcrdma_writech;
|
||||
else
|
||||
req->rl_wtype = rpcrdma_replych;
|
||||
|
||||
/*
|
||||
* Chunks needed for arguments?
|
||||
*
|
||||
* o If the total request is under the inline threshold, all ops
|
||||
* are sent as inline.
|
||||
* o Large non-write ops are sent with the entire message as a
|
||||
* single read chunk (protocol 0-position special case).
|
||||
* o Large write ops transmit data as read chunk(s), header as
|
||||
* inline.
|
||||
*
|
||||
* Note: the NFS code sending down multiple argument segments
|
||||
* implies the op is a write.
|
||||
* TBD check NFSv4 setacl
|
||||
*/
|
||||
if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
|
||||
req->rl_rtype = rpcrdma_noch;
|
||||
else if (rqst->rq_snd_buf.page_len == 0)
|
||||
req->rl_rtype = rpcrdma_areadch;
|
||||
else
|
||||
req->rl_rtype = rpcrdma_readch;
|
||||
|
||||
/* The following simplification is not true forever */
|
||||
if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych)
|
||||
req->rl_wtype = rpcrdma_noch;
|
||||
if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) {
|
||||
dprintk("RPC: %s: cannot marshal multiple chunk lists\n",
|
||||
__func__);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
hdrlen = 28; /*sizeof *headerp;*/
|
||||
padlen = 0;
|
||||
|
||||
/*
|
||||
* Pull up any extra send data into the preregistered buffer.
|
||||
* When padding is in use and applies to the transfer, insert
|
||||
* it and change the message type.
|
||||
*/
|
||||
if (req->rl_rtype == rpcrdma_noch) {
|
||||
|
||||
padlen = rpcrdma_inline_pullup(rqst,
|
||||
RPCRDMA_INLINE_PAD_VALUE(rqst));
|
||||
|
||||
if (padlen) {
|
||||
headerp->rm_type = htonl(RDMA_MSGP);
|
||||
headerp->rm_body.rm_padded.rm_align =
|
||||
htonl(RPCRDMA_INLINE_PAD_VALUE(rqst));
|
||||
headerp->rm_body.rm_padded.rm_thresh =
|
||||
htonl(RPCRDMA_INLINE_PAD_THRESH);
|
||||
headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
|
||||
headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
|
||||
headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
|
||||
hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
|
||||
if (req->rl_wtype != rpcrdma_noch) {
|
||||
dprintk("RPC: %s: invalid chunk list\n",
|
||||
__func__);
|
||||
return -EIO;
|
||||
}
|
||||
} else {
|
||||
headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
|
||||
headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
|
||||
headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
|
||||
/* new length after pullup */
|
||||
rpclen = rqst->rq_svec[0].iov_len;
|
||||
/*
|
||||
* Currently we try to not actually use read inline.
|
||||
* Reply chunks have the desirable property that
|
||||
* they land, packed, directly in the target buffers
|
||||
* without headers, so they require no fixup. The
|
||||
* additional RDMA Write op sends the same amount
|
||||
* of data, streams on-the-wire and adds no overhead
|
||||
* on receive. Therefore, we request a reply chunk
|
||||
* for non-writes wherever feasible and efficient.
|
||||
*/
|
||||
if (req->rl_wtype == rpcrdma_noch)
|
||||
req->rl_wtype = rpcrdma_replych;
|
||||
}
|
||||
}
|
||||
|
||||
hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen);
|
||||
if (hdrlen < 0)
|
||||
return hdrlen;
|
||||
|
||||
dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
|
||||
" headerp 0x%p base 0x%p lkey 0x%x\n",
|
||||
__func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen,
|
||||
headerp, base, req->rl_iov.lkey);
|
||||
|
||||
/*
|
||||
* initialize send_iov's - normally only two: rdma chunk header and
|
||||
* single preregistered RPC header buffer, but if padding is present,
|
||||
* then use a preregistered (and zeroed) pad buffer between the RPC
|
||||
* header and any write data. In all non-rdma cases, any following
|
||||
* data has been copied into the RPC header buffer.
|
||||
*/
|
||||
req->rl_send_iov[0].addr = req->rl_iov.addr;
|
||||
req->rl_send_iov[0].length = hdrlen;
|
||||
req->rl_send_iov[0].lkey = req->rl_iov.lkey;
|
||||
|
||||
req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base);
|
||||
req->rl_send_iov[1].length = rpclen;
|
||||
req->rl_send_iov[1].lkey = req->rl_iov.lkey;
|
||||
|
||||
req->rl_niovs = 2;
|
||||
|
||||
if (padlen) {
|
||||
struct rpcrdma_ep *ep = &r_xprt->rx_ep;
|
||||
|
||||
req->rl_send_iov[2].addr = ep->rep_pad.addr;
|
||||
req->rl_send_iov[2].length = padlen;
|
||||
req->rl_send_iov[2].lkey = ep->rep_pad.lkey;
|
||||
|
||||
req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
|
||||
req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
|
||||
req->rl_send_iov[3].lkey = req->rl_iov.lkey;
|
||||
|
||||
req->rl_niovs = 4;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
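The send_iov layout this routine builds can be summarized with a short sketch. The sizes are assumed, and the struct below is illustrative rather than the kernel's own: two elements in the common case, four when the zeroed pad buffer is inserted between the header and trailing data.

#include <stdio.h>
#include <stddef.h>

struct sketch_iov { const char *what; size_t length; };

int main(void)
{
	size_t hdrlen = 28, rpclen = 400, padlen = 16, trailing = 512;
	struct sketch_iov iov[4] = {
		{ "RPC/RDMA header", hdrlen },
		{ "RPC header+data", rpclen },
		{ "zeroed pad",      padlen },
		{ "trailing data",   trailing },
	};
	int niovs = padlen ? 4 : 2, i;

	for (i = 0; i < niovs; i++)
		printf("iov[%d]: %-16s %zu bytes\n", i, iov[i].what, iov[i].length);
	return 0;
}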
|
||||
|
||||
/*
|
||||
* Chase down a received write or reply chunklist to get length
|
||||
* RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
|
||||
*/
|
||||
static int
|
||||
rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp)
|
||||
{
|
||||
unsigned int i, total_len;
|
||||
struct rpcrdma_write_chunk *cur_wchunk;
|
||||
|
||||
i = ntohl(**iptrp); /* get array count */
|
||||
if (i > max)
|
||||
return -1;
|
||||
cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
|
||||
total_len = 0;
|
||||
while (i--) {
|
||||
struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
|
||||
ifdebug(FACILITY) {
|
||||
u64 off;
|
||||
xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
|
||||
dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n",
|
||||
__func__,
|
||||
ntohl(seg->rs_length),
|
||||
(unsigned long long)off,
|
||||
ntohl(seg->rs_handle));
|
||||
}
|
||||
total_len += ntohl(seg->rs_length);
|
||||
++cur_wchunk;
|
||||
}
|
||||
/* check and adjust for properly terminated write chunk */
|
||||
if (wrchunk) {
|
||||
__be32 *w = (__be32 *) cur_wchunk;
|
||||
if (*w++ != xdr_zero)
|
||||
return -1;
|
||||
cur_wchunk = (struct rpcrdma_write_chunk *) w;
|
||||
}
|
||||
if ((char *) cur_wchunk > rep->rr_base + rep->rr_len)
|
||||
return -1;
|
||||
|
||||
*iptrp = (__be32 *) cur_wchunk;
|
||||
return total_len;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scatter inline received data back into provided iov's.
|
||||
*/
|
||||
static void
|
||||
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
|
||||
{
|
||||
int i, npages, curlen, olen;
|
||||
char *destp;
|
||||
struct page **ppages;
|
||||
int page_base;
|
||||
|
||||
curlen = rqst->rq_rcv_buf.head[0].iov_len;
|
||||
if (curlen > copy_len) { /* write chunk header fixup */
|
||||
curlen = copy_len;
|
||||
rqst->rq_rcv_buf.head[0].iov_len = curlen;
|
||||
}
|
||||
|
||||
dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n",
|
||||
__func__, srcp, copy_len, curlen);
|
||||
|
||||
/* Shift pointer for first receive segment only */
|
||||
rqst->rq_rcv_buf.head[0].iov_base = srcp;
|
||||
srcp += curlen;
|
||||
copy_len -= curlen;
|
||||
|
||||
olen = copy_len;
|
||||
i = 0;
|
||||
rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen;
|
||||
page_base = rqst->rq_rcv_buf.page_base;
|
||||
ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT);
|
||||
page_base &= ~PAGE_MASK;
|
||||
|
||||
if (copy_len && rqst->rq_rcv_buf.page_len) {
|
||||
npages = PAGE_ALIGN(page_base +
|
||||
rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT;
|
||||
for (; i < npages; i++) {
|
||||
curlen = PAGE_SIZE - page_base;
|
||||
if (curlen > copy_len)
|
||||
curlen = copy_len;
|
||||
dprintk("RPC: %s: page %d"
|
||||
" srcp 0x%p len %d curlen %d\n",
|
||||
__func__, i, srcp, copy_len, curlen);
|
||||
destp = kmap_atomic(ppages[i]);
|
||||
memcpy(destp + page_base, srcp, curlen);
|
||||
flush_dcache_page(ppages[i]);
|
||||
kunmap_atomic(destp);
|
||||
srcp += curlen;
|
||||
copy_len -= curlen;
|
||||
if (copy_len == 0)
|
||||
break;
|
||||
page_base = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
|
||||
curlen = copy_len;
|
||||
if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
|
||||
curlen = rqst->rq_rcv_buf.tail[0].iov_len;
|
||||
if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
|
||||
memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
|
||||
dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
|
||||
__func__, srcp, copy_len, curlen);
|
||||
rqst->rq_rcv_buf.tail[0].iov_len = curlen;
|
||||
copy_len -= curlen; ++i;
|
||||
} else
|
||||
rqst->rq_rcv_buf.tail[0].iov_len = 0;
|
||||
|
||||
if (pad) {
|
||||
/* implicit padding on terminal chunk */
|
||||
unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base;
|
||||
while (pad--)
|
||||
p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0;
|
||||
}
|
||||
|
||||
if (copy_len)
|
||||
dprintk("RPC: %s: %d bytes in"
|
||||
" %d extra segments (%d lost)\n",
|
||||
__func__, olen, i, copy_len);
|
||||
|
||||
/* TBD avoid a warning from call_decode() */
|
||||
rqst->rq_private_buf = rqst->rq_rcv_buf;
|
||||
}
|
||||
|
||||
void
|
||||
rpcrdma_connect_worker(struct work_struct *work)
|
||||
{
|
||||
struct rpcrdma_ep *ep =
|
||||
container_of(work, struct rpcrdma_ep, rep_connect_worker.work);
|
||||
struct rpc_xprt *xprt = ep->rep_xprt;
|
||||
|
||||
spin_lock_bh(&xprt->transport_lock);
|
||||
if (++xprt->connect_cookie == 0) /* maintain a reserved value */
|
||||
++xprt->connect_cookie;
|
||||
if (ep->rep_connected > 0) {
|
||||
if (!xprt_test_and_set_connected(xprt))
|
||||
xprt_wake_pending_tasks(xprt, 0);
|
||||
} else {
|
||||
if (xprt_test_and_clear_connected(xprt))
|
||||
xprt_wake_pending_tasks(xprt, -ENOTCONN);
|
||||
}
|
||||
spin_unlock_bh(&xprt->transport_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called when an async event is posted to
|
||||
* the connection which changes the connection state. All it
|
||||
* does at this point is mark the connection up/down, the rpc
|
||||
* timers do the rest.
|
||||
*/
|
||||
void
|
||||
rpcrdma_conn_func(struct rpcrdma_ep *ep)
|
||||
{
|
||||
schedule_delayed_work(&ep->rep_connect_worker, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called as a tasklet to do req/reply match and complete a request
|
||||
* Errors must result in the RPC task either being awakened, or
|
||||
* allowed to timeout, to discover the errors at that time.
|
||||
*/
|
||||
void
|
||||
rpcrdma_reply_handler(struct rpcrdma_rep *rep)
|
||||
{
|
||||
struct rpcrdma_msg *headerp;
|
||||
struct rpcrdma_req *req;
|
||||
struct rpc_rqst *rqst;
|
||||
struct rpc_xprt *xprt = rep->rr_xprt;
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
__be32 *iptr;
|
||||
int rdmalen, status;
|
||||
unsigned long cwnd;
|
||||
|
||||
/* Check status. If bad, signal disconnect and return rep to pool */
|
||||
if (rep->rr_len == ~0U) {
|
||||
rpcrdma_recv_buffer_put(rep);
|
||||
if (r_xprt->rx_ep.rep_connected == 1) {
|
||||
r_xprt->rx_ep.rep_connected = -EIO;
|
||||
rpcrdma_conn_func(&r_xprt->rx_ep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (rep->rr_len < 28) {
|
||||
dprintk("RPC: %s: short/invalid reply\n", __func__);
|
||||
goto repost;
|
||||
}
|
||||
headerp = (struct rpcrdma_msg *) rep->rr_base;
|
||||
if (headerp->rm_vers != xdr_one) {
|
||||
dprintk("RPC: %s: invalid version %d\n",
|
||||
__func__, ntohl(headerp->rm_vers));
|
||||
goto repost;
|
||||
}
|
||||
|
||||
/* Get XID and try for a match. */
|
||||
spin_lock(&xprt->transport_lock);
|
||||
rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
|
||||
if (rqst == NULL) {
|
||||
spin_unlock(&xprt->transport_lock);
|
||||
dprintk("RPC: %s: reply 0x%p failed "
|
||||
"to match any request xid 0x%08x len %d\n",
|
||||
__func__, rep, headerp->rm_xid, rep->rr_len);
|
||||
repost:
|
||||
r_xprt->rx_stats.bad_reply_count++;
|
||||
rep->rr_func = rpcrdma_reply_handler;
|
||||
if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
|
||||
rpcrdma_recv_buffer_put(rep);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* get request object */
|
||||
req = rpcr_to_rdmar(rqst);
|
||||
if (req->rl_reply) {
|
||||
spin_unlock(&xprt->transport_lock);
|
||||
dprintk("RPC: %s: duplicate reply 0x%p to RPC "
|
||||
"request 0x%p: xid 0x%08x\n", __func__, rep, req,
|
||||
headerp->rm_xid);
|
||||
goto repost;
|
||||
}
|
||||
|
||||
dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
|
||||
" RPC request 0x%p xid 0x%08x\n",
|
||||
__func__, rep, req, rqst, headerp->rm_xid);
|
||||
|
||||
/* from here on, the reply is no longer an orphan */
|
||||
req->rl_reply = rep;
|
||||
xprt->reestablish_timeout = 0;
|
||||
|
||||
/* check for expected message types */
|
||||
/* The order of some of these tests is important. */
|
||||
switch (headerp->rm_type) {
|
||||
case htonl(RDMA_MSG):
|
||||
/* never expect read chunks */
|
||||
/* never expect reply chunks (two ways to check) */
|
||||
/* never expect write chunks without having offered RDMA */
|
||||
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
|
||||
(headerp->rm_body.rm_chunks[1] == xdr_zero &&
|
||||
headerp->rm_body.rm_chunks[2] != xdr_zero) ||
|
||||
(headerp->rm_body.rm_chunks[1] != xdr_zero &&
|
||||
req->rl_nchunks == 0))
|
||||
goto badheader;
|
||||
if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
|
||||
/* count any expected write chunks in read reply */
|
||||
/* start at write chunk array count */
|
||||
iptr = &headerp->rm_body.rm_chunks[2];
|
||||
rdmalen = rpcrdma_count_chunks(rep,
|
||||
req->rl_nchunks, 1, &iptr);
|
||||
/* check for validity, and no reply chunk after */
|
||||
if (rdmalen < 0 || *iptr++ != xdr_zero)
|
||||
goto badheader;
|
||||
rep->rr_len -=
|
||||
((unsigned char *)iptr - (unsigned char *)headerp);
|
||||
status = rep->rr_len + rdmalen;
|
||||
r_xprt->rx_stats.total_rdma_reply += rdmalen;
|
||||
/* special case - last chunk may omit padding */
|
||||
if (rdmalen &= 3) {
|
||||
rdmalen = 4 - rdmalen;
|
||||
status += rdmalen;
|
||||
}
|
||||
} else {
|
||||
/* else ordinary inline */
|
||||
rdmalen = 0;
|
||||
iptr = (__be32 *)((unsigned char *)headerp + 28);
|
||||
rep->rr_len -= 28; /*sizeof *headerp;*/
|
||||
status = rep->rr_len;
|
||||
}
|
||||
/* Fix up the rpc results for upper layer */
|
||||
rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen);
|
||||
break;
|
||||
|
||||
case htonl(RDMA_NOMSG):
|
||||
/* never expect read or write chunks, always reply chunks */
|
||||
if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
|
||||
headerp->rm_body.rm_chunks[1] != xdr_zero ||
|
||||
headerp->rm_body.rm_chunks[2] != xdr_one ||
|
||||
req->rl_nchunks == 0)
|
||||
goto badheader;
|
||||
iptr = (__be32 *)((unsigned char *)headerp + 28);
|
||||
rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr);
|
||||
if (rdmalen < 0)
|
||||
goto badheader;
|
||||
r_xprt->rx_stats.total_rdma_reply += rdmalen;
|
||||
/* Reply chunk buffer already is the reply vector - no fixup. */
|
||||
status = rdmalen;
|
||||
break;
|
||||
|
||||
badheader:
|
||||
default:
|
||||
dprintk("%s: invalid rpcrdma reply header (type %d):"
|
||||
" chunks[012] == %d %d %d"
|
||||
" expected chunks <= %d\n",
|
||||
__func__, ntohl(headerp->rm_type),
|
||||
headerp->rm_body.rm_chunks[0],
|
||||
headerp->rm_body.rm_chunks[1],
|
||||
headerp->rm_body.rm_chunks[2],
|
||||
req->rl_nchunks);
|
||||
status = -EIO;
|
||||
r_xprt->rx_stats.bad_reply_count++;
|
||||
break;
|
||||
}
|
||||
|
||||
cwnd = xprt->cwnd;
|
||||
xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
|
||||
if (xprt->cwnd > cwnd)
|
||||
xprt_release_rqst_cong(rqst->rq_task);
|
||||
|
||||
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
|
||||
__func__, xprt, rqst, status);
|
||||
xprt_complete_rqst(rqst->rq_task, status);
|
||||
spin_unlock(&xprt->transport_lock);
|
||||
}
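As a rough model of the credit handling at the end of the reply handler (the RPC_CWNDSHIFT value below is assumed purely for illustration), the server-granted credit count is shifted up to become the RPC congestion window, and a larger window wakes congestion-blocked tasks.

#include <stdio.h>

#define SKETCH_RPC_CWNDSHIFT 8	/* assumed value for the example */

int main(void)
{
	unsigned long credits = 32;
	unsigned long old_cwnd = 16UL << SKETCH_RPC_CWNDSHIFT;
	unsigned long new_cwnd = credits << SKETCH_RPC_CWNDSHIFT;

	printf("cwnd %lu -> %lu%s\n", old_cwnd, new_cwnd,
	       new_cwnd > old_cwnd ? " (window grew, wake congested tasks)" : "");
	return 0;
}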
|
net/sunrpc/xprtrdma/svc_rdma.c (Normal file, 302 additions)
@@ -0,0 +1,302 @@
/*
|
||||
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the BSD-type
|
||||
* license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials provided
|
||||
* with the distribution.
|
||||
*
|
||||
* Neither the name of the Network Appliance, Inc. nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Tom Tucker <tom@opengridcomputing.com>
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/sunrpc/clnt.h>
|
||||
#include <linux/sunrpc/sched.h>
|
||||
#include <linux/sunrpc/svc_rdma.h>
|
||||
#include "xprt_rdma.h"
|
||||
|
||||
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
|
||||
|
||||
/* RPC/RDMA parameters */
|
||||
unsigned int svcrdma_ord = RPCRDMA_ORD;
|
||||
static unsigned int min_ord = 1;
|
||||
static unsigned int max_ord = 4096;
|
||||
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
|
||||
static unsigned int min_max_requests = 4;
|
||||
static unsigned int max_max_requests = 16384;
|
||||
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
|
||||
static unsigned int min_max_inline = 4096;
|
||||
static unsigned int max_max_inline = 65536;
|
||||
|
||||
atomic_t rdma_stat_recv;
|
||||
atomic_t rdma_stat_read;
|
||||
atomic_t rdma_stat_write;
|
||||
atomic_t rdma_stat_sq_starve;
|
||||
atomic_t rdma_stat_rq_starve;
|
||||
atomic_t rdma_stat_rq_poll;
|
||||
atomic_t rdma_stat_rq_prod;
|
||||
atomic_t rdma_stat_sq_poll;
|
||||
atomic_t rdma_stat_sq_prod;
|
||||
|
||||
/* Temporary NFS request map and context caches */
|
||||
struct kmem_cache *svc_rdma_map_cachep;
|
||||
struct kmem_cache *svc_rdma_ctxt_cachep;
|
||||
|
||||
struct workqueue_struct *svc_rdma_wq;
|
||||
|
||||
/*
 * This function implements reading and resetting an atomic_t stat
 * variable through read/write to a proc file. Any write to the file
 * resets the associated statistic to zero. Any read returns its
 * current value.
 */
static int read_reset_stat(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp,
			   loff_t *ppos)
{
	atomic_t *stat = (atomic_t *)table->data;

	if (!stat)
		return -EINVAL;

	if (write)
		atomic_set(stat, 0);
	else {
		char str_buf[32];
		char *data;
		int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat));
		if (len >= 32)
			return -EFAULT;
		len = strlen(str_buf);
		if (*ppos > len) {
			*lenp = 0;
			return 0;
		}
		data = &str_buf[*ppos];
		len -= *ppos;
		if (len > *lenp)
			len = *lenp;
		if (len && copy_to_user(buffer, str_buf, len))
			return -EFAULT;
		*lenp = len;
		*ppos += len;
	}
	return 0;
}
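From userspace, the read-and-reset behavior implemented above would look roughly like the following hypothetical example; the proc path is inferred from the sysctl tables registered later in this file (sunrpc/svc_rdma/rdma_stat_read).

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/sunrpc/svc_rdma/rdma_stat_read";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("rdma_stat_read = %s", buf);
	if (f)
		fclose(f);

	f = fopen(path, "w");	/* any write resets the statistic to zero */
	if (f) {
		fputs("0\n", f);
		fclose(f);
	}
	return 0;
}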
|
||||
|
||||
static struct ctl_table_header *svcrdma_table_header;
|
||||
static struct ctl_table svcrdma_parm_table[] = {
|
||||
{
|
||||
.procname = "max_requests",
|
||||
.data = &svcrdma_max_requests,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &min_max_requests,
|
||||
.extra2 = &max_max_requests
|
||||
},
|
||||
{
|
||||
.procname = "max_req_size",
|
||||
.data = &svcrdma_max_req_size,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &min_max_inline,
|
||||
.extra2 = &max_max_inline
|
||||
},
|
||||
{
|
||||
.procname = "max_outbound_read_requests",
|
||||
.data = &svcrdma_ord,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &min_ord,
|
||||
.extra2 = &max_ord,
|
||||
},
|
||||
|
||||
{
|
||||
.procname = "rdma_stat_read",
|
||||
.data = &rdma_stat_read,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_recv",
|
||||
.data = &rdma_stat_recv,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_write",
|
||||
.data = &rdma_stat_write,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_sq_starve",
|
||||
.data = &rdma_stat_sq_starve,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_rq_starve",
|
||||
.data = &rdma_stat_rq_starve,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_rq_poll",
|
||||
.data = &rdma_stat_rq_poll,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_rq_prod",
|
||||
.data = &rdma_stat_rq_prod,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_sq_poll",
|
||||
.data = &rdma_stat_sq_poll,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_stat_sq_prod",
|
||||
.data = &rdma_stat_sq_prod,
|
||||
.maxlen = sizeof(atomic_t),
|
||||
.mode = 0644,
|
||||
.proc_handler = read_reset_stat,
|
||||
},
|
||||
{ },
|
||||
};
|
||||
|
||||
static struct ctl_table svcrdma_table[] = {
|
||||
{
|
||||
.procname = "svc_rdma",
|
||||
.mode = 0555,
|
||||
.child = svcrdma_parm_table
|
||||
},
|
||||
{ },
|
||||
};
|
||||
|
||||
static struct ctl_table svcrdma_root_table[] = {
|
||||
{
|
||||
.procname = "sunrpc",
|
||||
.mode = 0555,
|
||||
.child = svcrdma_table
|
||||
},
|
||||
{ },
|
||||
};
|
||||
|
||||
void svc_rdma_cleanup(void)
|
||||
{
|
||||
dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
|
||||
destroy_workqueue(svc_rdma_wq);
|
||||
if (svcrdma_table_header) {
|
||||
unregister_sysctl_table(svcrdma_table_header);
|
||||
svcrdma_table_header = NULL;
|
||||
}
|
||||
svc_unreg_xprt_class(&svc_rdma_class);
|
||||
kmem_cache_destroy(svc_rdma_map_cachep);
|
||||
kmem_cache_destroy(svc_rdma_ctxt_cachep);
|
||||
}
|
||||
|
||||
int svc_rdma_init(void)
|
||||
{
|
||||
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
|
||||
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
|
||||
dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
|
||||
dprintk("\tsq_depth : %d\n",
|
||||
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
|
||||
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
|
||||
|
||||
svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
|
||||
if (!svc_rdma_wq)
|
||||
return -ENOMEM;
|
||||
|
||||
if (!svcrdma_table_header)
|
||||
svcrdma_table_header =
|
||||
register_sysctl_table(svcrdma_root_table);
|
||||
|
||||
/* Create the temporary map cache */
|
||||
svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
|
||||
sizeof(struct svc_rdma_req_map),
|
||||
0,
|
||||
SLAB_HWCACHE_ALIGN,
|
||||
NULL);
|
||||
if (!svc_rdma_map_cachep) {
|
||||
printk(KERN_INFO "Could not allocate map cache.\n");
|
||||
goto err0;
|
||||
}
|
||||
|
||||
/* Create the temporary context cache */
|
||||
svc_rdma_ctxt_cachep =
|
||||
kmem_cache_create("svc_rdma_ctxt_cache",
|
||||
sizeof(struct svc_rdma_op_ctxt),
|
||||
0,
|
||||
SLAB_HWCACHE_ALIGN,
|
||||
NULL);
|
||||
if (!svc_rdma_ctxt_cachep) {
|
||||
printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
|
||||
goto err1;
|
||||
}
|
||||
|
||||
/* Register RDMA with the SVC transport switch */
|
||||
svc_reg_xprt_class(&svc_rdma_class);
|
||||
return 0;
|
||||
err1:
|
||||
kmem_cache_destroy(svc_rdma_map_cachep);
|
||||
err0:
|
||||
unregister_sysctl_table(svcrdma_table_header);
|
||||
destroy_workqueue(svc_rdma_wq);
|
||||
return -ENOMEM;
|
||||
}
|
||||
MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
|
||||
MODULE_DESCRIPTION("SVC RDMA Transport");
|
||||
MODULE_LICENSE("Dual BSD/GPL");
|
||||
module_init(svc_rdma_init);
|
||||
module_exit(svc_rdma_cleanup);
|
net/sunrpc/xprtrdma/svc_rdma_marshal.c (Normal file, 386 additions)
@@ -0,0 +1,386 @@
/*
|
||||
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the BSD-type
|
||||
* license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials provided
|
||||
* with the distribution.
|
||||
*
|
||||
* Neither the name of the Network Appliance, Inc. nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Tom Tucker <tom@opengridcomputing.com>
|
||||
*/
|
||||
|
||||
#include <linux/sunrpc/xdr.h>
|
||||
#include <linux/sunrpc/debug.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/sunrpc/rpc_rdma.h>
|
||||
#include <linux/sunrpc/svc_rdma.h>
|
||||
|
||||
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
|
||||
|
||||
/*
 * Decodes a read chunk list. The expected format is as follows:
 *    descrim  : xdr_one
 *    position : u32 offset into XDR stream
 *    handle   : u32 RKEY
 *    . . .
 *    end-of-list: xdr_zero
 */
static u32 *decode_read_list(u32 *va, u32 *vaend)
{
	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;

	while (ch->rc_discrim != xdr_zero) {
		if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
		    (unsigned long)vaend) {
			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
			return NULL;
		}
		ch++;
	}
	return (u32 *)&ch->rc_position;
}
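A standalone sketch of the same bounds-checked walk, using a simplified chunk struct assumed for illustration: it builds a two-element read list in memory and skips past it the way decode_read_list() does.

#include <stdio.h>
#include <stdint.h>

struct sketch_read_chunk { uint32_t discrim, position, handle, length, off_hi, off_lo; };

static uint32_t *skip_read_list(struct sketch_read_chunk *ch, void *vaend)
{
	while (ch->discrim != 0) {
		if ((char *)(ch + 1) > (char *)vaend)
			return NULL;		/* list runs past the header */
		ch++;
	}
	return &ch->position;			/* word just past the terminator */
}

int main(void)
{
	struct sketch_read_chunk list[3] = {
		{ 1, 0, 0x1234, 4096, 0, 0 },
		{ 1, 0, 0x5678, 4096, 0, 0 },
		{ 0 },				/* xdr_zero terminator */
	};
	uint32_t *next = skip_read_list(list, list + 3);

	printf("read list %s\n", next ? "decoded" : "overflows header");
	return 0;
}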
|
||||
|
||||
/*
|
||||
* Determine number of chunks and total bytes in chunk list. The chunk
|
||||
* list has already been verified to fit within the RPCRDMA header.
|
||||
*/
|
||||
void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
|
||||
int *ch_count, int *byte_count)
|
||||
{
|
||||
/* compute the number of bytes represented by read chunks */
|
||||
*byte_count = 0;
|
||||
*ch_count = 0;
|
||||
for (; ch->rc_discrim != 0; ch++) {
|
||||
*byte_count = *byte_count + ntohl(ch->rc_target.rs_length);
|
||||
*ch_count = *ch_count + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Decodes a write chunk list. The expected format is as follows:
|
||||
* descrim : xdr_one
|
||||
* nchunks : <count>
|
||||
* handle : u32 RKEY ---+
|
||||
* length : u32 <len of segment> |
|
||||
* offset : remove va + <count>
|
||||
* . . . |
|
||||
* ---+
|
||||
*/
|
||||
static u32 *decode_write_list(u32 *va, u32 *vaend)
|
||||
{
|
||||
unsigned long start, end;
|
||||
int nchunks;
|
||||
|
||||
struct rpcrdma_write_array *ary =
|
||||
(struct rpcrdma_write_array *)va;
|
||||
|
||||
/* Check for not write-array */
|
||||
if (ary->wc_discrim == xdr_zero)
|
||||
return (u32 *)&ary->wc_nchunks;
|
||||
|
||||
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
|
||||
(unsigned long)vaend) {
|
||||
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
|
||||
return NULL;
|
||||
}
|
||||
nchunks = ntohl(ary->wc_nchunks);
|
||||
|
||||
start = (unsigned long)&ary->wc_array[0];
|
||||
end = (unsigned long)vaend;
|
||||
if (nchunks < 0 ||
|
||||
nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
|
||||
(start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
|
||||
dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
|
||||
ary, nchunks, vaend);
|
||||
return NULL;
|
||||
}
|
||||
/*
|
||||
* rs_length is the 2nd 4B field in wc_target and taking its
|
||||
* address skips the list terminator
|
||||
*/
|
||||
return (u32 *)&ary->wc_array[nchunks].wc_target.rs_length;
|
||||
}
|
||||
|
||||
static u32 *decode_reply_array(u32 *va, u32 *vaend)
|
||||
{
|
||||
unsigned long start, end;
|
||||
int nchunks;
|
||||
struct rpcrdma_write_array *ary =
|
||||
(struct rpcrdma_write_array *)va;
|
||||
|
||||
/* Check for no reply-array */
|
||||
if (ary->wc_discrim == xdr_zero)
|
||||
return (u32 *)&ary->wc_nchunks;
|
||||
|
||||
if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
|
||||
(unsigned long)vaend) {
|
||||
dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
|
||||
return NULL;
|
||||
}
|
||||
nchunks = ntohl(ary->wc_nchunks);
|
||||
|
||||
start = (unsigned long)&ary->wc_array[0];
|
||||
end = (unsigned long)vaend;
|
||||
if (nchunks < 0 ||
|
||||
nchunks > (SIZE_MAX - start) / sizeof(struct rpcrdma_write_chunk) ||
|
||||
(start + (sizeof(struct rpcrdma_write_chunk) * nchunks)) > end) {
|
||||
dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
|
||||
ary, nchunks, vaend);
|
||||
return NULL;
|
||||
}
|
||||
return (u32 *)&ary->wc_array[nchunks];
|
||||
}
|
||||
|
||||
int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
|
||||
struct svc_rqst *rqstp)
|
||||
{
|
||||
struct rpcrdma_msg *rmsgp = NULL;
|
||||
u32 *va;
|
||||
u32 *vaend;
|
||||
u32 hdr_len;
|
||||
|
||||
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
|
||||
|
||||
/* Verify that there's enough bytes for header + something */
|
||||
if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
|
||||
dprintk("svcrdma: header too short = %d\n",
|
||||
rqstp->rq_arg.len);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Decode the header */
|
||||
rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
|
||||
rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
|
||||
rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
|
||||
rmsgp->rm_type = ntohl(rmsgp->rm_type);
|
||||
|
||||
if (rmsgp->rm_vers != RPCRDMA_VERSION)
|
||||
return -ENOSYS;
|
||||
|
||||
/* Pull in the extra for the padded case and bump our pointer */
|
||||
if (rmsgp->rm_type == RDMA_MSGP) {
|
||||
int hdrlen;
|
||||
rmsgp->rm_body.rm_padded.rm_align =
|
||||
ntohl(rmsgp->rm_body.rm_padded.rm_align);
|
||||
rmsgp->rm_body.rm_padded.rm_thresh =
|
||||
ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
|
||||
|
||||
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
|
||||
rqstp->rq_arg.head[0].iov_base = va;
|
||||
hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
|
||||
rqstp->rq_arg.head[0].iov_len -= hdrlen;
|
||||
if (hdrlen > rqstp->rq_arg.len)
|
||||
return -EINVAL;
|
||||
return hdrlen;
|
||||
}
|
||||
|
||||
/* The chunk list may contain either a read chunk list or a write
|
||||
* chunk list and a reply chunk list.
|
||||
*/
|
||||
va = &rmsgp->rm_body.rm_chunks[0];
|
||||
vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
|
||||
va = decode_read_list(va, vaend);
|
||||
if (!va)
|
||||
return -EINVAL;
|
||||
va = decode_write_list(va, vaend);
|
||||
if (!va)
|
||||
return -EINVAL;
|
||||
va = decode_reply_array(va, vaend);
|
||||
if (!va)
|
||||
return -EINVAL;
|
||||
|
||||
rqstp->rq_arg.head[0].iov_base = va;
|
||||
hdr_len = (unsigned long)va - (unsigned long)rmsgp;
|
||||
rqstp->rq_arg.head[0].iov_len -= hdr_len;
|
||||
|
||||
*rdma_req = rmsgp;
|
||||
return hdr_len;
|
||||
}
|
||||
|
||||
int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
|
||||
{
|
||||
struct rpcrdma_msg *rmsgp = NULL;
|
||||
struct rpcrdma_read_chunk *ch;
|
||||
struct rpcrdma_write_array *ary;
|
||||
u32 *va;
|
||||
u32 hdrlen;
|
||||
|
||||
dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
|
||||
rqstp);
|
||||
rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
|
||||
|
||||
/* Pull in the extra for the padded case and bump our pointer */
|
||||
if (rmsgp->rm_type == RDMA_MSGP) {
|
||||
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
|
||||
rqstp->rq_arg.head[0].iov_base = va;
|
||||
hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
|
||||
rqstp->rq_arg.head[0].iov_len -= hdrlen;
|
||||
return hdrlen;
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip all chunks to find RPC msg. These were previously processed
|
||||
*/
|
||||
va = &rmsgp->rm_body.rm_chunks[0];
|
||||
|
||||
/* Skip read-list */
|
||||
for (ch = (struct rpcrdma_read_chunk *)va;
|
||||
ch->rc_discrim != xdr_zero; ch++);
|
||||
va = (u32 *)&ch->rc_position;
|
||||
|
||||
/* Skip write-list */
|
||||
ary = (struct rpcrdma_write_array *)va;
|
||||
if (ary->wc_discrim == xdr_zero)
|
||||
va = (u32 *)&ary->wc_nchunks;
|
||||
else
|
||||
/*
|
||||
* rs_length is the 2nd 4B field in wc_target and taking its
|
||||
* address skips the list terminator
|
||||
*/
|
||||
va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
|
||||
|
||||
/* Skip reply-array */
|
||||
ary = (struct rpcrdma_write_array *)va;
|
||||
if (ary->wc_discrim == xdr_zero)
|
||||
va = (u32 *)&ary->wc_nchunks;
|
||||
else
|
||||
va = (u32 *)&ary->wc_array[ary->wc_nchunks];
|
||||
|
||||
rqstp->rq_arg.head[0].iov_base = va;
|
||||
hdrlen = (unsigned long)va - (unsigned long)rmsgp;
|
||||
rqstp->rq_arg.head[0].iov_len -= hdrlen;
|
||||
|
||||
return hdrlen;
|
||||
}
|
||||
|
||||
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rmsgp,
|
||||
enum rpcrdma_errcode err, u32 *va)
|
||||
{
|
||||
u32 *startp = va;
|
||||
|
||||
*va++ = htonl(rmsgp->rm_xid);
|
||||
*va++ = htonl(rmsgp->rm_vers);
|
||||
*va++ = htonl(xprt->sc_max_requests);
|
||||
*va++ = htonl(RDMA_ERROR);
|
||||
*va++ = htonl(err);
|
||||
if (err == ERR_VERS) {
|
||||
*va++ = htonl(RPCRDMA_VERSION);
|
||||
*va++ = htonl(RPCRDMA_VERSION);
|
||||
}
|
||||
|
||||
return (int)((unsigned long)va - (unsigned long)startp);
|
||||
}
|
||||
|
||||
int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
|
||||
{
|
||||
struct rpcrdma_write_array *wr_ary;
|
||||
|
||||
/* There is no read-list in a reply */
|
||||
|
||||
/* skip write list */
|
||||
wr_ary = (struct rpcrdma_write_array *)
|
||||
&rmsgp->rm_body.rm_chunks[1];
|
||||
if (wr_ary->wc_discrim)
|
||||
wr_ary = (struct rpcrdma_write_array *)
|
||||
&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
|
||||
wc_target.rs_length;
|
||||
else
|
||||
wr_ary = (struct rpcrdma_write_array *)
|
||||
&wr_ary->wc_nchunks;
|
||||
|
||||
/* skip reply array */
|
||||
if (wr_ary->wc_discrim)
|
||||
wr_ary = (struct rpcrdma_write_array *)
|
||||
&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
|
||||
else
|
||||
wr_ary = (struct rpcrdma_write_array *)
|
||||
&wr_ary->wc_nchunks;
|
||||
|
||||
return (unsigned long) wr_ary - (unsigned long) rmsgp;
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
|
||||
{
|
||||
struct rpcrdma_write_array *ary;
|
||||
|
||||
/* no read-list */
|
||||
rmsgp->rm_body.rm_chunks[0] = xdr_zero;
|
||||
|
||||
/* write-array discrim */
|
||||
ary = (struct rpcrdma_write_array *)
|
||||
&rmsgp->rm_body.rm_chunks[1];
|
||||
ary->wc_discrim = xdr_one;
|
||||
ary->wc_nchunks = htonl(chunks);
|
||||
|
||||
/* write-list terminator */
|
||||
ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
|
||||
|
||||
/* reply-array discriminator */
|
||||
ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
|
||||
int chunks)
|
||||
{
|
||||
ary->wc_discrim = xdr_one;
|
||||
ary->wc_nchunks = htonl(chunks);
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
|
||||
int chunk_no,
|
||||
__be32 rs_handle,
|
||||
__be64 rs_offset,
|
||||
u32 write_len)
|
||||
{
|
||||
struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
|
||||
seg->rs_handle = rs_handle;
|
||||
seg->rs_offset = rs_offset;
|
||||
seg->rs_length = htonl(write_len);
|
||||
}
|
||||
|
||||
void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rdma_argp,
|
||||
struct rpcrdma_msg *rdma_resp,
|
||||
enum rpcrdma_proc rdma_type)
|
||||
{
|
||||
rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
|
||||
rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
|
||||
rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
|
||||
rdma_resp->rm_type = htonl(rdma_type);
|
||||
|
||||
/* Encode <nul> chunks lists */
|
||||
rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
|
||||
rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
|
||||
rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
|
||||
}
|
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c (Normal file, 614 additions)
|
@ -0,0 +1,614 @@
|
|||
/*
 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 *      Redistribution and use in source and binary forms, with or without
 *      modification, are permitted provided that the following conditions
 *      are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: Tom Tucker <tom@opengridcomputing.com>
 */

#include <linux/sunrpc/debug.h>
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/spinlock.h>
#include <linux/highmem.h>
#include <asm/unaligned.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/sunrpc/svc_rdma.h>

#define RPCDBG_FACILITY	RPCDBG_SVCXPRT

/*
 * Replace the pages in the rq_argpages array with the pages from the SGE in
 * the RDMA_RECV completion. The SGL should contain full pages up until the
 * last one.
 */
static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
			       struct svc_rdma_op_ctxt *ctxt,
			       u32 byte_count)
{
	struct page *page;
	u32 bc;
	int sge_no;

	/* Swap the page in the SGE with the page in argpages */
	page = ctxt->pages[0];
	put_page(rqstp->rq_pages[0]);
	rqstp->rq_pages[0] = page;

	/* Set up the XDR head */
	rqstp->rq_arg.head[0].iov_base = page_address(page);
	rqstp->rq_arg.head[0].iov_len =
		min_t(size_t, byte_count, ctxt->sge[0].length);
	rqstp->rq_arg.len = byte_count;
	rqstp->rq_arg.buflen = byte_count;

	/* Compute bytes past head in the SGL */
	bc = byte_count - rqstp->rq_arg.head[0].iov_len;

	/* If data remains, store it in the pagelist */
	rqstp->rq_arg.page_len = bc;
	rqstp->rq_arg.page_base = 0;
	rqstp->rq_arg.pages = &rqstp->rq_pages[1];
	sge_no = 1;
	while (bc && sge_no < ctxt->count) {
		page = ctxt->pages[sge_no];
		put_page(rqstp->rq_pages[sge_no]);
		rqstp->rq_pages[sge_no] = page;
		bc -= min_t(u32, bc, ctxt->sge[sge_no].length);
		rqstp->rq_arg.buflen += ctxt->sge[sge_no].length;
		sge_no++;
	}
	rqstp->rq_respages = &rqstp->rq_pages[sge_no];
	rqstp->rq_next_page = rqstp->rq_respages + 1;

	/* We should never run out of SGE because the limit is defined to
	 * support the max allowed RPC data length
	 */
	BUG_ON(bc && (sge_no == ctxt->count));
	BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len)
	       != byte_count);
	BUG_ON(rqstp->rq_arg.len != byte_count);

	/* If not all pages were used from the SGL, free the remaining ones */
	bc = sge_no;
	while (sge_no < ctxt->count) {
		page = ctxt->pages[sge_no++];
		put_page(page);
	}
	ctxt->count = bc;

	/* Set up tail */
	rqstp->rq_arg.tail[0].iov_base = NULL;
	rqstp->rq_arg.tail[0].iov_len = 0;
}

static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count)
{
	if (rdma_node_get_transport(xprt->sc_cm_id->device->node_type) ==
	    RDMA_TRANSPORT_IWARP)
		return 1;
	else
		return min_t(int, sge_count, xprt->sc_max_sge);
}
|
||||
|
||||
typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_op_ctxt *head,
|
||||
int *page_no,
|
||||
u32 *page_offset,
|
||||
u32 rs_handle,
|
||||
u32 rs_length,
|
||||
u64 rs_offset,
|
||||
int last);
|
||||
|
||||
/* Issue an RDMA_READ using the local lkey to map the data sink */
|
||||
static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_op_ctxt *head,
|
||||
int *page_no,
|
||||
u32 *page_offset,
|
||||
u32 rs_handle,
|
||||
u32 rs_length,
|
||||
u64 rs_offset,
|
||||
int last)
|
||||
{
|
||||
struct ib_send_wr read_wr;
|
||||
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
|
||||
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
|
||||
int ret, read, pno;
|
||||
u32 pg_off = *page_offset;
|
||||
u32 pg_no = *page_no;
|
||||
|
||||
ctxt->direction = DMA_FROM_DEVICE;
|
||||
ctxt->read_hdr = head;
|
||||
pages_needed =
|
||||
min_t(int, pages_needed, rdma_read_max_sge(xprt, pages_needed));
|
||||
read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
|
||||
|
||||
for (pno = 0; pno < pages_needed; pno++) {
|
||||
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
|
||||
|
||||
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
|
||||
head->arg.page_len += len;
|
||||
head->arg.len += len;
|
||||
if (!pg_off)
|
||||
head->count++;
|
||||
rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
|
||||
rqstp->rq_next_page = rqstp->rq_respages + 1;
|
||||
ctxt->sge[pno].addr =
|
||||
ib_dma_map_page(xprt->sc_cm_id->device,
|
||||
head->arg.pages[pg_no], pg_off,
|
||||
PAGE_SIZE - pg_off,
|
||||
DMA_FROM_DEVICE);
|
||||
ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
|
||||
ctxt->sge[pno].addr);
|
||||
if (ret)
|
||||
goto err;
|
||||
atomic_inc(&xprt->sc_dma_used);
|
||||
|
||||
/* The lkey here is either a local dma lkey or a dma_mr lkey */
|
||||
ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
|
||||
ctxt->sge[pno].length = len;
|
||||
ctxt->count++;
|
||||
|
||||
/* adjust offset and wrap to next page if needed */
|
||||
pg_off += len;
|
||||
if (pg_off == PAGE_SIZE) {
|
||||
pg_off = 0;
|
||||
pg_no++;
|
||||
}
|
||||
rs_length -= len;
|
||||
}
|
||||
|
||||
if (last && rs_length == 0)
|
||||
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
|
||||
else
|
||||
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
|
||||
|
||||
memset(&read_wr, 0, sizeof(read_wr));
|
||||
read_wr.wr_id = (unsigned long)ctxt;
|
||||
read_wr.opcode = IB_WR_RDMA_READ;
|
||||
ctxt->wr_op = read_wr.opcode;
|
||||
read_wr.send_flags = IB_SEND_SIGNALED;
|
||||
read_wr.wr.rdma.rkey = rs_handle;
|
||||
read_wr.wr.rdma.remote_addr = rs_offset;
|
||||
read_wr.sg_list = ctxt->sge;
|
||||
read_wr.num_sge = pages_needed;
|
||||
|
||||
ret = svc_rdma_send(xprt, &read_wr);
|
||||
if (ret) {
|
||||
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
|
||||
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* return current location in page array */
|
||||
*page_no = pg_no;
|
||||
*page_offset = pg_off;
|
||||
ret = read;
|
||||
atomic_inc(&rdma_stat_read);
|
||||
return ret;
|
||||
err:
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Issue an RDMA_READ using an FRMR to map the data sink */
|
||||
static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_op_ctxt *head,
|
||||
int *page_no,
|
||||
u32 *page_offset,
|
||||
u32 rs_handle,
|
||||
u32 rs_length,
|
||||
u64 rs_offset,
|
||||
int last)
|
||||
{
|
||||
struct ib_send_wr read_wr;
|
||||
struct ib_send_wr inv_wr;
|
||||
struct ib_send_wr fastreg_wr;
|
||||
u8 key;
|
||||
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
|
||||
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
|
||||
struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
|
||||
int ret, read, pno;
|
||||
u32 pg_off = *page_offset;
|
||||
u32 pg_no = *page_no;
|
||||
|
||||
if (IS_ERR(frmr))
|
||||
return -ENOMEM;
|
||||
|
||||
ctxt->direction = DMA_FROM_DEVICE;
|
||||
ctxt->frmr = frmr;
|
||||
pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
|
||||
read = min_t(int, pages_needed << PAGE_SHIFT, rs_length);
|
||||
|
||||
frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
|
||||
frmr->direction = DMA_FROM_DEVICE;
|
||||
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
|
||||
frmr->map_len = pages_needed << PAGE_SHIFT;
|
||||
frmr->page_list_len = pages_needed;
|
||||
|
||||
for (pno = 0; pno < pages_needed; pno++) {
|
||||
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
|
||||
|
||||
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
|
||||
head->arg.page_len += len;
|
||||
head->arg.len += len;
|
||||
if (!pg_off)
|
||||
head->count++;
|
||||
rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
|
||||
rqstp->rq_next_page = rqstp->rq_respages + 1;
|
||||
frmr->page_list->page_list[pno] =
|
||||
ib_dma_map_page(xprt->sc_cm_id->device,
|
||||
head->arg.pages[pg_no], 0,
|
||||
PAGE_SIZE, DMA_FROM_DEVICE);
|
||||
ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
|
||||
frmr->page_list->page_list[pno]);
|
||||
if (ret)
|
||||
goto err;
|
||||
atomic_inc(&xprt->sc_dma_used);
|
||||
|
||||
/* adjust offset and wrap to next page if needed */
|
||||
pg_off += len;
|
||||
if (pg_off == PAGE_SIZE) {
|
||||
pg_off = 0;
|
||||
pg_no++;
|
||||
}
|
||||
rs_length -= len;
|
||||
}
|
||||
|
||||
if (last && rs_length == 0)
|
||||
set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
|
||||
else
|
||||
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
|
||||
|
||||
/* Bump the key */
|
||||
key = (u8)(frmr->mr->lkey & 0x000000FF);
|
||||
ib_update_fast_reg_key(frmr->mr, ++key);
|
||||
|
||||
ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
|
||||
ctxt->sge[0].lkey = frmr->mr->lkey;
|
||||
ctxt->sge[0].length = read;
|
||||
ctxt->count = 1;
|
||||
ctxt->read_hdr = head;
|
||||
|
||||
/* Prepare FASTREG WR */
|
||||
memset(&fastreg_wr, 0, sizeof(fastreg_wr));
|
||||
fastreg_wr.opcode = IB_WR_FAST_REG_MR;
|
||||
fastreg_wr.send_flags = IB_SEND_SIGNALED;
|
||||
fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
|
||||
fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
|
||||
fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
|
||||
fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
|
||||
fastreg_wr.wr.fast_reg.length = frmr->map_len;
|
||||
fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
|
||||
fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
|
||||
fastreg_wr.next = &read_wr;
|
||||
|
||||
/* Prepare RDMA_READ */
|
||||
memset(&read_wr, 0, sizeof(read_wr));
|
||||
read_wr.send_flags = IB_SEND_SIGNALED;
|
||||
read_wr.wr.rdma.rkey = rs_handle;
|
||||
read_wr.wr.rdma.remote_addr = rs_offset;
|
||||
read_wr.sg_list = ctxt->sge;
|
||||
read_wr.num_sge = 1;
|
||||
if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
|
||||
read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
|
||||
read_wr.wr_id = (unsigned long)ctxt;
|
||||
read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
|
||||
} else {
|
||||
read_wr.opcode = IB_WR_RDMA_READ;
|
||||
read_wr.next = &inv_wr;
|
||||
/* Prepare invalidate */
|
||||
memset(&inv_wr, 0, sizeof(inv_wr));
|
||||
inv_wr.wr_id = (unsigned long)ctxt;
|
||||
inv_wr.opcode = IB_WR_LOCAL_INV;
|
||||
inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
|
||||
inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
|
||||
}
|
||||
ctxt->wr_op = read_wr.opcode;
|
||||
|
||||
/* Post the chain */
|
||||
ret = svc_rdma_send(xprt, &fastreg_wr);
|
||||
if (ret) {
|
||||
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
|
||||
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* return current location in page array */
|
||||
*page_no = pg_no;
|
||||
*page_offset = pg_off;
|
||||
ret = read;
|
||||
atomic_inc(&rdma_stat_read);
|
||||
return ret;
|
||||
err:
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
svc_rdma_put_frmr(xprt, frmr);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int rdma_read_chunks(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rmsgp,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_op_ctxt *head)
|
||||
{
|
||||
int page_no, ch_count, ret;
|
||||
struct rpcrdma_read_chunk *ch;
|
||||
u32 page_offset, byte_count;
|
||||
u64 rs_offset;
|
||||
rdma_reader_fn reader;
|
||||
|
||||
/* If no read list is present, return 0 */
|
||||
ch = svc_rdma_get_read_chunk(rmsgp);
|
||||
if (!ch)
|
||||
return 0;
|
||||
|
||||
svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count);
|
||||
if (ch_count > RPCSVC_MAXPAGES)
|
||||
return -EINVAL;
|
||||
|
||||
/* The request is completed when the RDMA_READs complete. The
|
||||
* head context keeps all the pages that comprise the
|
||||
* request.
|
||||
*/
|
||||
head->arg.head[0] = rqstp->rq_arg.head[0];
|
||||
head->arg.tail[0] = rqstp->rq_arg.tail[0];
|
||||
head->arg.pages = &head->pages[head->count];
|
||||
head->hdr_count = head->count;
|
||||
head->arg.page_base = 0;
|
||||
head->arg.page_len = 0;
|
||||
head->arg.len = rqstp->rq_arg.len;
|
||||
head->arg.buflen = rqstp->rq_arg.buflen;
|
||||
|
||||
/* Use FRMR if supported */
|
||||
if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)
|
||||
reader = rdma_read_chunk_frmr;
|
||||
else
|
||||
reader = rdma_read_chunk_lcl;
|
||||
|
||||
page_no = 0; page_offset = 0;
|
||||
for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0];
|
||||
ch->rc_discrim != 0; ch++) {
|
||||
|
||||
xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset,
|
||||
&rs_offset);
|
||||
byte_count = ntohl(ch->rc_target.rs_length);
|
||||
|
||||
while (byte_count > 0) {
|
||||
ret = reader(xprt, rqstp, head,
|
||||
&page_no, &page_offset,
|
||||
ntohl(ch->rc_target.rs_handle),
|
||||
byte_count, rs_offset,
|
||||
((ch+1)->rc_discrim == 0) /* last */
|
||||
);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
byte_count -= ret;
|
||||
rs_offset += ret;
|
||||
head->arg.buflen += ret;
|
||||
}
|
||||
}
|
||||
ret = 1;
|
||||
err:
|
||||
/* Detach arg pages. svc_recv will replenish them */
|
||||
for (page_no = 0;
|
||||
&rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++)
|
||||
rqstp->rq_pages[page_no] = NULL;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* To avoid a separate RDMA READ just for a handful of zero bytes,
|
||||
* RFC 5666 section 3.7 allows the client to omit the XDR zero pad
|
||||
* in chunk lists.
|
||||
*/
|
||||
static void
|
||||
rdma_fix_xdr_pad(struct xdr_buf *buf)
|
||||
{
|
||||
unsigned int page_len = buf->page_len;
|
||||
unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len;
|
||||
unsigned int offset, pg_no;
|
||||
char *p;
|
||||
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
pg_no = page_len >> PAGE_SHIFT;
|
||||
offset = page_len & ~PAGE_MASK;
|
||||
p = page_address(buf->pages[pg_no]);
|
||||
memset(p + offset, 0, size);
|
||||
|
||||
buf->page_len += size;
|
||||
buf->buflen += size;
|
||||
buf->len += size;
|
||||
}
|
||||
|
||||
static int rdma_read_complete(struct svc_rqst *rqstp,
|
||||
struct svc_rdma_op_ctxt *head)
|
||||
{
|
||||
int page_no;
|
||||
int ret;
|
||||
|
||||
BUG_ON(!head);
|
||||
|
||||
/* Copy RPC pages */
|
||||
for (page_no = 0; page_no < head->count; page_no++) {
|
||||
put_page(rqstp->rq_pages[page_no]);
|
||||
rqstp->rq_pages[page_no] = head->pages[page_no];
|
||||
}
|
||||
/* Point rq_arg.pages past header */
|
||||
rdma_fix_xdr_pad(&head->arg);
|
||||
rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count];
|
||||
rqstp->rq_arg.page_len = head->arg.page_len;
|
||||
rqstp->rq_arg.page_base = head->arg.page_base;
|
||||
|
||||
/* rq_respages starts after the last arg page */
|
||||
rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
|
||||
rqstp->rq_next_page = rqstp->rq_respages + 1;
|
||||
|
||||
/* Rebuild rq_arg head and tail. */
|
||||
rqstp->rq_arg.head[0] = head->arg.head[0];
|
||||
rqstp->rq_arg.tail[0] = head->arg.tail[0];
|
||||
rqstp->rq_arg.len = head->arg.len;
|
||||
rqstp->rq_arg.buflen = head->arg.buflen;
|
||||
|
||||
/* Free the context */
|
||||
svc_rdma_put_context(head, 0);
|
||||
|
||||
/* XXX: What should this be? */
|
||||
rqstp->rq_prot = IPPROTO_MAX;
|
||||
svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt);
|
||||
|
||||
ret = rqstp->rq_arg.head[0].iov_len
|
||||
+ rqstp->rq_arg.page_len
|
||||
+ rqstp->rq_arg.tail[0].iov_len;
|
||||
dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, "
|
||||
"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
|
||||
ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base,
|
||||
rqstp->rq_arg.head[0].iov_len);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Set up the rqstp thread context to point to the RQ buffer. If
|
||||
* necessary, pull additional data from the client with an RDMA_READ
|
||||
* request.
|
||||
*/
|
||||
int svc_rdma_recvfrom(struct svc_rqst *rqstp)
|
||||
{
|
||||
struct svc_xprt *xprt = rqstp->rq_xprt;
|
||||
struct svcxprt_rdma *rdma_xprt =
|
||||
container_of(xprt, struct svcxprt_rdma, sc_xprt);
|
||||
struct svc_rdma_op_ctxt *ctxt = NULL;
|
||||
struct rpcrdma_msg *rmsgp;
|
||||
int ret = 0;
|
||||
int len;
|
||||
|
||||
dprintk("svcrdma: rqstp=%p\n", rqstp);
|
||||
|
||||
spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
|
||||
if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
|
||||
ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
|
||||
struct svc_rdma_op_ctxt,
|
||||
dto_q);
|
||||
list_del_init(&ctxt->dto_q);
|
||||
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
|
||||
return rdma_read_complete(rqstp, ctxt);
|
||||
} else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
|
||||
ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
|
||||
struct svc_rdma_op_ctxt,
|
||||
dto_q);
|
||||
list_del_init(&ctxt->dto_q);
|
||||
} else {
|
||||
atomic_inc(&rdma_stat_rq_starve);
|
||||
clear_bit(XPT_DATA, &xprt->xpt_flags);
|
||||
ctxt = NULL;
|
||||
}
|
||||
spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
|
||||
if (!ctxt) {
|
||||
/* This is the EAGAIN path. The svc_recv routine will
|
||||
* return -EAGAIN, the nfsd thread will go to call into
|
||||
* svc_recv again and we shouldn't be on the active
|
||||
* transport list
|
||||
*/
|
||||
if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
|
||||
goto close_out;
|
||||
|
||||
goto out;
|
||||
}
|
||||
dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
|
||||
ctxt, rdma_xprt, rqstp, ctxt->wc_status);
|
||||
BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
|
||||
atomic_inc(&rdma_stat_recv);
|
||||
|
||||
/* Build up the XDR from the receive buffers. */
|
||||
rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
|
||||
|
||||
/* Decode the RDMA header. */
|
||||
len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
|
||||
rqstp->rq_xprt_hlen = len;
|
||||
|
||||
/* If the request is invalid, reply with an error */
|
||||
if (len < 0) {
|
||||
if (len == -ENOSYS)
|
||||
svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
|
||||
goto close_out;
|
||||
}
|
||||
|
||||
/* Read read-list data. */
|
||||
ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt);
|
||||
if (ret > 0) {
|
||||
/* read-list posted, defer until data received from client. */
|
||||
goto defer;
|
||||
} else if (ret < 0) {
|
||||
/* Post of read-list failed, free context. */
|
||||
svc_rdma_put_context(ctxt, 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = rqstp->rq_arg.head[0].iov_len
|
||||
+ rqstp->rq_arg.page_len
|
||||
+ rqstp->rq_arg.tail[0].iov_len;
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
out:
|
||||
dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
|
||||
"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
|
||||
ret, rqstp->rq_arg.len,
|
||||
rqstp->rq_arg.head[0].iov_base,
|
||||
rqstp->rq_arg.head[0].iov_len);
|
||||
rqstp->rq_prot = IPPROTO_MAX;
|
||||
svc_xprt_copy_addrs(rqstp, xprt);
|
||||
return ret;
|
||||
|
||||
close_out:
|
||||
if (ctxt)
|
||||
svc_rdma_put_context(ctxt, 1);
|
||||
dprintk("svcrdma: transport %p is closing\n", xprt);
|
||||
/*
|
||||
* Set the close bit and enqueue it. svc_recv will see the
|
||||
* close bit and call svc_xprt_delete
|
||||
*/
|
||||
set_bit(XPT_CLOSE, &xprt->xpt_flags);
|
||||
defer:
|
||||
return 0;
|
||||
}
|
554
net/sunrpc/xprtrdma/svc_rdma_sendto.c
Normal file
@@ -0,0 +1,554 @@
/*
|
||||
* Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
|
||||
* Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the BSD-type
|
||||
* license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials provided
|
||||
* with the distribution.
|
||||
*
|
||||
* Neither the name of the Network Appliance, Inc. nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* Author: Tom Tucker <tom@opengridcomputing.com>
|
||||
*/
|
||||
|
||||
#include <linux/sunrpc/debug.h>
|
||||
#include <linux/sunrpc/rpc_rdma.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <rdma/ib_verbs.h>
|
||||
#include <rdma/rdma_cm.h>
|
||||
#include <linux/sunrpc/svc_rdma.h>
|
||||
|
||||
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
|
||||
|
||||
static int map_xdr(struct svcxprt_rdma *xprt,
|
||||
struct xdr_buf *xdr,
|
||||
struct svc_rdma_req_map *vec)
|
||||
{
|
||||
int sge_no;
|
||||
u32 sge_bytes;
|
||||
u32 page_bytes;
|
||||
u32 page_off;
|
||||
int page_no;
|
||||
|
||||
BUG_ON(xdr->len !=
|
||||
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len));
|
||||
|
||||
/* Skip the first sge, this is for the RPCRDMA header */
|
||||
sge_no = 1;
|
||||
|
||||
/* Head SGE */
|
||||
vec->sge[sge_no].iov_base = xdr->head[0].iov_base;
|
||||
vec->sge[sge_no].iov_len = xdr->head[0].iov_len;
|
||||
sge_no++;
|
||||
|
||||
/* pages SGE */
|
||||
page_no = 0;
|
||||
page_bytes = xdr->page_len;
|
||||
page_off = xdr->page_base;
|
||||
while (page_bytes) {
|
||||
vec->sge[sge_no].iov_base =
|
||||
page_address(xdr->pages[page_no]) + page_off;
|
||||
sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off));
|
||||
page_bytes -= sge_bytes;
|
||||
vec->sge[sge_no].iov_len = sge_bytes;
|
||||
|
||||
sge_no++;
|
||||
page_no++;
|
||||
page_off = 0; /* reset for next time through loop */
|
||||
}
|
||||
|
||||
/* Tail SGE */
|
||||
if (xdr->tail[0].iov_len) {
|
||||
vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
|
||||
vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
|
||||
sge_no++;
|
||||
}
|
||||
|
||||
dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
|
||||
"page_base %u page_len %u head_len %zu tail_len %zu\n",
|
||||
sge_no, page_no, xdr->page_base, xdr->page_len,
|
||||
xdr->head[0].iov_len, xdr->tail[0].iov_len);
|
||||
|
||||
vec->count = sge_no;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
|
||||
struct xdr_buf *xdr,
|
||||
u32 xdr_off, size_t len, int dir)
|
||||
{
|
||||
struct page *page;
|
||||
dma_addr_t dma_addr;
|
||||
if (xdr_off < xdr->head[0].iov_len) {
|
||||
/* This offset is in the head */
|
||||
xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK;
|
||||
page = virt_to_page(xdr->head[0].iov_base);
|
||||
} else {
|
||||
xdr_off -= xdr->head[0].iov_len;
|
||||
if (xdr_off < xdr->page_len) {
|
||||
/* This offset is in the page list */
|
||||
xdr_off += xdr->page_base;
|
||||
page = xdr->pages[xdr_off >> PAGE_SHIFT];
|
||||
xdr_off &= ~PAGE_MASK;
|
||||
} else {
|
||||
/* This offset is in the tail */
|
||||
xdr_off -= xdr->page_len;
|
||||
xdr_off += (unsigned long)
|
||||
xdr->tail[0].iov_base & ~PAGE_MASK;
|
||||
page = virt_to_page(xdr->tail[0].iov_base);
|
||||
}
|
||||
}
|
||||
dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off,
|
||||
min_t(size_t, PAGE_SIZE, len), dir);
|
||||
return dma_addr;
|
||||
}
|
||||
|
||||
/* Assumptions:
|
||||
* - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
|
||||
*/
|
||||
static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
|
||||
u32 rmr, u64 to,
|
||||
u32 xdr_off, int write_len,
|
||||
struct svc_rdma_req_map *vec)
|
||||
{
|
||||
struct ib_send_wr write_wr;
|
||||
struct ib_sge *sge;
|
||||
int xdr_sge_no;
|
||||
int sge_no;
|
||||
int sge_bytes;
|
||||
int sge_off;
|
||||
int bc;
|
||||
struct svc_rdma_op_ctxt *ctxt;
|
||||
|
||||
BUG_ON(vec->count > RPCSVC_MAXPAGES);
|
||||
dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, "
|
||||
"write_len=%d, vec->sge=%p, vec->count=%lu\n",
|
||||
rmr, (unsigned long long)to, xdr_off,
|
||||
write_len, vec->sge, vec->count);
|
||||
|
||||
ctxt = svc_rdma_get_context(xprt);
|
||||
ctxt->direction = DMA_TO_DEVICE;
|
||||
sge = ctxt->sge;
|
||||
|
||||
/* Find the SGE associated with xdr_off */
|
||||
for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count;
|
||||
xdr_sge_no++) {
|
||||
if (vec->sge[xdr_sge_no].iov_len > bc)
|
||||
break;
|
||||
bc -= vec->sge[xdr_sge_no].iov_len;
|
||||
}
|
||||
|
||||
sge_off = bc;
|
||||
bc = write_len;
|
||||
sge_no = 0;
|
||||
|
||||
/* Copy the remaining SGE */
|
||||
while (bc != 0) {
|
||||
sge_bytes = min_t(size_t,
|
||||
bc, vec->sge[xdr_sge_no].iov_len-sge_off);
|
||||
sge[sge_no].length = sge_bytes;
|
||||
sge[sge_no].addr =
|
||||
dma_map_xdr(xprt, &rqstp->rq_res, xdr_off,
|
||||
sge_bytes, DMA_TO_DEVICE);
|
||||
xdr_off += sge_bytes;
|
||||
if (ib_dma_mapping_error(xprt->sc_cm_id->device,
|
||||
sge[sge_no].addr))
|
||||
goto err;
|
||||
atomic_inc(&xprt->sc_dma_used);
|
||||
sge[sge_no].lkey = xprt->sc_dma_lkey;
|
||||
ctxt->count++;
|
||||
sge_off = 0;
|
||||
sge_no++;
|
||||
xdr_sge_no++;
|
||||
BUG_ON(xdr_sge_no > vec->count);
|
||||
bc -= sge_bytes;
|
||||
if (sge_no == xprt->sc_max_sge)
|
||||
break;
|
||||
}
|
||||
|
||||
/* Prepare WRITE WR */
|
||||
memset(&write_wr, 0, sizeof write_wr);
|
||||
ctxt->wr_op = IB_WR_RDMA_WRITE;
|
||||
write_wr.wr_id = (unsigned long)ctxt;
|
||||
write_wr.sg_list = &sge[0];
|
||||
write_wr.num_sge = sge_no;
|
||||
write_wr.opcode = IB_WR_RDMA_WRITE;
|
||||
write_wr.send_flags = IB_SEND_SIGNALED;
|
||||
write_wr.wr.rdma.rkey = rmr;
|
||||
write_wr.wr.rdma.remote_addr = to;
|
||||
|
||||
/* Post It */
|
||||
atomic_inc(&rdma_stat_write);
|
||||
if (svc_rdma_send(xprt, &write_wr))
|
||||
goto err;
|
||||
return write_len - bc;
|
||||
err:
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
/* Fatal error, close transport */
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static int send_write_chunks(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rdma_argp,
|
||||
struct rpcrdma_msg *rdma_resp,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_req_map *vec)
|
||||
{
|
||||
u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
|
||||
int write_len;
|
||||
u32 xdr_off;
|
||||
int chunk_off;
|
||||
int chunk_no;
|
||||
struct rpcrdma_write_array *arg_ary;
|
||||
struct rpcrdma_write_array *res_ary;
|
||||
int ret;
|
||||
|
||||
arg_ary = svc_rdma_get_write_array(rdma_argp);
|
||||
if (!arg_ary)
|
||||
return 0;
|
||||
res_ary = (struct rpcrdma_write_array *)
|
||||
&rdma_resp->rm_body.rm_chunks[1];
|
||||
|
||||
/* Write chunks start at the pagelist */
|
||||
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
|
||||
xfer_len && chunk_no < arg_ary->wc_nchunks;
|
||||
chunk_no++) {
|
||||
struct rpcrdma_segment *arg_ch;
|
||||
u64 rs_offset;
|
||||
|
||||
arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
|
||||
write_len = min(xfer_len, ntohl(arg_ch->rs_length));
|
||||
|
||||
/* Prepare the response chunk given the length actually
|
||||
* written */
|
||||
xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset);
|
||||
svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
|
||||
arg_ch->rs_handle,
|
||||
arg_ch->rs_offset,
|
||||
write_len);
|
||||
chunk_off = 0;
|
||||
while (write_len) {
|
||||
ret = send_write(xprt, rqstp,
|
||||
ntohl(arg_ch->rs_handle),
|
||||
rs_offset + chunk_off,
|
||||
xdr_off,
|
||||
write_len,
|
||||
vec);
|
||||
if (ret <= 0) {
|
||||
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
|
||||
ret);
|
||||
return -EIO;
|
||||
}
|
||||
chunk_off += ret;
|
||||
xdr_off += ret;
|
||||
xfer_len -= ret;
|
||||
write_len -= ret;
|
||||
}
|
||||
}
|
||||
/* Update the req with the number of chunks actually used */
|
||||
svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
|
||||
|
||||
return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
|
||||
}
|
||||
|
||||
static int send_reply_chunks(struct svcxprt_rdma *xprt,
|
||||
struct rpcrdma_msg *rdma_argp,
|
||||
struct rpcrdma_msg *rdma_resp,
|
||||
struct svc_rqst *rqstp,
|
||||
struct svc_rdma_req_map *vec)
|
||||
{
|
||||
u32 xfer_len = rqstp->rq_res.len;
|
||||
int write_len;
|
||||
u32 xdr_off;
|
||||
int chunk_no;
|
||||
int chunk_off;
|
||||
int nchunks;
|
||||
struct rpcrdma_segment *ch;
|
||||
struct rpcrdma_write_array *arg_ary;
|
||||
struct rpcrdma_write_array *res_ary;
|
||||
int ret;
|
||||
|
||||
arg_ary = svc_rdma_get_reply_array(rdma_argp);
|
||||
if (!arg_ary)
|
||||
return 0;
|
||||
/* XXX: need to fix when reply lists occur with read-list and or
|
||||
* write-list */
|
||||
res_ary = (struct rpcrdma_write_array *)
|
||||
&rdma_resp->rm_body.rm_chunks[2];
|
||||
|
||||
/* xdr offset starts at RPC message */
|
||||
nchunks = ntohl(arg_ary->wc_nchunks);
|
||||
for (xdr_off = 0, chunk_no = 0;
|
||||
xfer_len && chunk_no < nchunks;
|
||||
chunk_no++) {
|
||||
u64 rs_offset;
|
||||
ch = &arg_ary->wc_array[chunk_no].wc_target;
|
||||
write_len = min(xfer_len, htonl(ch->rs_length));
|
||||
|
||||
/* Prepare the reply chunk given the length actually
|
||||
* written */
|
||||
xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset);
|
||||
svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no,
|
||||
ch->rs_handle, ch->rs_offset,
|
||||
write_len);
|
||||
chunk_off = 0;
|
||||
while (write_len) {
|
||||
ret = send_write(xprt, rqstp,
|
||||
ntohl(ch->rs_handle),
|
||||
rs_offset + chunk_off,
|
||||
xdr_off,
|
||||
write_len,
|
||||
vec);
|
||||
if (ret <= 0) {
|
||||
dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
|
||||
ret);
|
||||
return -EIO;
|
||||
}
|
||||
chunk_off += ret;
|
||||
xdr_off += ret;
|
||||
xfer_len -= ret;
|
||||
write_len -= ret;
|
||||
}
|
||||
}
|
||||
/* Update the req with the number of chunks actually used */
|
||||
svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
|
||||
|
||||
return rqstp->rq_res.len;
|
||||
}
|
||||
|
||||
/* This function prepares the portion of the RPCRDMA message to be
|
||||
* sent in the RDMA_SEND. This function is called after data sent via
|
||||
* RDMA has already been transmitted. There are three cases:
|
||||
* - The RPCRDMA header, RPC header, and payload are all sent in a
|
||||
* single RDMA_SEND. This is the "inline" case.
|
||||
* - The RPCRDMA header and some portion of the RPC header and data
|
||||
* are sent via this RDMA_SEND and another portion of the data is
|
||||
* sent via RDMA.
|
||||
* - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC
|
||||
* header and data are all transmitted via RDMA.
|
||||
* In all three cases, this function prepares the RPCRDMA header in
|
||||
* sge[0], the 'type' parameter indicates the type to place in the
|
||||
* RPCRDMA header, and the 'byte_count' field indicates how much of
|
||||
* the XDR to include in this RDMA_SEND. NB: The offset of the payload
|
||||
* to send is zero in the XDR.
|
||||
*/
|
||||
static int send_reply(struct svcxprt_rdma *rdma,
|
||||
struct svc_rqst *rqstp,
|
||||
struct page *page,
|
||||
struct rpcrdma_msg *rdma_resp,
|
||||
struct svc_rdma_op_ctxt *ctxt,
|
||||
struct svc_rdma_req_map *vec,
|
||||
int byte_count)
|
||||
{
|
||||
struct ib_send_wr send_wr;
|
||||
int sge_no;
|
||||
int sge_bytes;
|
||||
int page_no;
|
||||
int pages;
|
||||
int ret;
|
||||
|
||||
/* Post a recv buffer to handle another request. */
|
||||
ret = svc_rdma_post_recv(rdma);
|
||||
if (ret) {
|
||||
		printk(KERN_INFO
		       "svcrdma: could not post a receive buffer, err=%d. "
		       "Closing transport %p.\n", ret, rdma);
|
||||
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
return -ENOTCONN;
|
||||
}
|
||||
|
||||
/* Prepare the context */
|
||||
ctxt->pages[0] = page;
|
||||
ctxt->count = 1;
|
||||
|
||||
/* Prepare the SGE for the RPCRDMA Header */
|
||||
ctxt->sge[0].lkey = rdma->sc_dma_lkey;
|
||||
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
|
||||
ctxt->sge[0].addr =
|
||||
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
|
||||
ctxt->sge[0].length, DMA_TO_DEVICE);
|
||||
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
|
||||
goto err;
|
||||
atomic_inc(&rdma->sc_dma_used);
|
||||
|
||||
ctxt->direction = DMA_TO_DEVICE;
|
||||
|
||||
/* Map the payload indicated by 'byte_count' */
|
||||
for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) {
|
||||
int xdr_off = 0;
|
||||
sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count);
|
||||
byte_count -= sge_bytes;
|
||||
ctxt->sge[sge_no].addr =
|
||||
dma_map_xdr(rdma, &rqstp->rq_res, xdr_off,
|
||||
sge_bytes, DMA_TO_DEVICE);
|
||||
xdr_off += sge_bytes;
|
||||
if (ib_dma_mapping_error(rdma->sc_cm_id->device,
|
||||
ctxt->sge[sge_no].addr))
|
||||
goto err;
|
||||
atomic_inc(&rdma->sc_dma_used);
|
||||
ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
|
||||
ctxt->sge[sge_no].length = sge_bytes;
|
||||
}
|
||||
BUG_ON(byte_count != 0);
|
||||
|
||||
/* Save all respages in the ctxt and remove them from the
|
||||
* respages array. They are our pages until the I/O
|
||||
* completes.
|
||||
*/
|
||||
pages = rqstp->rq_next_page - rqstp->rq_respages;
|
||||
for (page_no = 0; page_no < pages; page_no++) {
|
||||
ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
|
||||
ctxt->count++;
|
||||
rqstp->rq_respages[page_no] = NULL;
|
||||
/*
|
||||
* If there are more pages than SGE, terminate SGE
|
||||
* list so that svc_rdma_unmap_dma doesn't attempt to
|
||||
* unmap garbage.
|
||||
*/
|
||||
if (page_no+1 >= sge_no)
|
||||
ctxt->sge[page_no+1].length = 0;
|
||||
}
|
||||
rqstp->rq_next_page = rqstp->rq_respages + 1;
|
||||
|
||||
BUG_ON(sge_no > rdma->sc_max_sge);
|
||||
memset(&send_wr, 0, sizeof send_wr);
|
||||
ctxt->wr_op = IB_WR_SEND;
|
||||
send_wr.wr_id = (unsigned long)ctxt;
|
||||
send_wr.sg_list = ctxt->sge;
|
||||
send_wr.num_sge = sge_no;
|
||||
send_wr.opcode = IB_WR_SEND;
|
||||
send_wr.send_flags = IB_SEND_SIGNALED;
|
||||
|
||||
ret = svc_rdma_send(rdma, &send_wr);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
svc_rdma_unmap_dma(ctxt);
|
||||
svc_rdma_put_context(ctxt, 1);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the start of an xdr buffer.
|
||||
*/
|
||||
static void *xdr_start(struct xdr_buf *xdr)
|
||||
{
|
||||
return xdr->head[0].iov_base -
|
||||
(xdr->len -
|
||||
xdr->page_len -
|
||||
xdr->tail[0].iov_len -
|
||||
xdr->head[0].iov_len);
|
||||
}
|
||||
|
||||
int svc_rdma_sendto(struct svc_rqst *rqstp)
|
||||
{
|
||||
struct svc_xprt *xprt = rqstp->rq_xprt;
|
||||
struct svcxprt_rdma *rdma =
|
||||
container_of(xprt, struct svcxprt_rdma, sc_xprt);
|
||||
struct rpcrdma_msg *rdma_argp;
|
||||
struct rpcrdma_msg *rdma_resp;
|
||||
struct rpcrdma_write_array *reply_ary;
|
||||
enum rpcrdma_proc reply_type;
|
||||
int ret;
|
||||
int inline_bytes;
|
||||
struct page *res_page;
|
||||
struct svc_rdma_op_ctxt *ctxt;
|
||||
struct svc_rdma_req_map *vec;
|
||||
|
||||
dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
|
||||
|
||||
/* Get the RDMA request header. */
|
||||
rdma_argp = xdr_start(&rqstp->rq_arg);
|
||||
|
||||
/* Build an req vec for the XDR */
|
||||
ctxt = svc_rdma_get_context(rdma);
|
||||
ctxt->direction = DMA_TO_DEVICE;
|
||||
vec = svc_rdma_get_req_map();
|
||||
ret = map_xdr(rdma, &rqstp->rq_res, vec);
|
||||
if (ret)
|
||||
goto err0;
|
||||
inline_bytes = rqstp->rq_res.len;
|
||||
|
||||
/* Create the RDMA response header */
|
||||
res_page = svc_rdma_get_page();
|
||||
rdma_resp = page_address(res_page);
|
||||
reply_ary = svc_rdma_get_reply_array(rdma_argp);
|
||||
if (reply_ary)
|
||||
reply_type = RDMA_NOMSG;
|
||||
else
|
||||
reply_type = RDMA_MSG;
|
||||
svc_rdma_xdr_encode_reply_header(rdma, rdma_argp,
|
||||
rdma_resp, reply_type);
|
||||
|
||||
/* Send any write-chunk data and build resp write-list */
|
||||
ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
|
||||
rqstp, vec);
|
||||
if (ret < 0) {
|
||||
printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
|
||||
ret);
|
||||
goto err1;
|
||||
}
|
||||
inline_bytes -= ret;
|
||||
|
||||
/* Send any reply-list data and update resp reply-list */
|
||||
ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
|
||||
rqstp, vec);
|
||||
if (ret < 0) {
|
||||
printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
|
||||
ret);
|
||||
goto err1;
|
||||
}
|
||||
inline_bytes -= ret;
|
||||
|
||||
ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
|
||||
inline_bytes);
|
||||
svc_rdma_put_req_map(vec);
|
||||
dprintk("svcrdma: send_reply returns %d\n", ret);
|
||||
return ret;
|
||||
|
||||
err1:
|
||||
put_page(res_page);
|
||||
err0:
|
||||
svc_rdma_put_req_map(vec);
|
||||
svc_rdma_put_context(ctxt, 0);
|
||||
return ret;
|
||||
}
|
1355
net/sunrpc/xprtrdma/svc_rdma_transport.c
Normal file
File diff suppressed because it is too large
747
net/sunrpc/xprtrdma/transport.c
Normal file
@@ -0,0 +1,747 @@
/*
|
||||
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
|
||||
*
|
||||
* This software is available to you under a choice of one of two
|
||||
* licenses. You may choose to be licensed under the terms of the GNU
|
||||
* General Public License (GPL) Version 2, available from the file
|
||||
* COPYING in the main directory of this source tree, or the BSD-type
|
||||
* license below:
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* Redistributions in binary form must reproduce the above
|
||||
* copyright notice, this list of conditions and the following
|
||||
* disclaimer in the documentation and/or other materials provided
|
||||
* with the distribution.
|
||||
*
|
||||
* Neither the name of the Network Appliance, Inc. nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* transport.c
|
||||
*
|
||||
* This file contains the top-level implementation of an RPC RDMA
|
||||
* transport.
|
||||
*
|
||||
* Naming convention: functions beginning with xprt_ are part of the
|
||||
* transport switch. All others are RPC RDMA internal.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/sunrpc/addr.h>
|
||||
|
||||
#include "xprt_rdma.h"
|
||||
|
||||
#ifdef RPC_DEBUG
|
||||
# define RPCDBG_FACILITY RPCDBG_TRANS
|
||||
#endif
|
||||
|
||||
MODULE_LICENSE("Dual BSD/GPL");
|
||||
|
||||
MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS");
|
||||
MODULE_AUTHOR("Network Appliance, Inc.");
|
||||
|
||||
/*
|
||||
* tunables
|
||||
*/
|
||||
|
||||
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
|
||||
static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
|
||||
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
|
||||
static unsigned int xprt_rdma_inline_write_padding;
|
||||
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
|
||||
int xprt_rdma_pad_optimize = 0;
|
||||
|
||||
#ifdef RPC_DEBUG
|
||||
|
||||
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
|
||||
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
|
||||
static unsigned int zero;
|
||||
static unsigned int max_padding = PAGE_SIZE;
|
||||
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
|
||||
static unsigned int max_memreg = RPCRDMA_LAST - 1;
|
||||
|
||||
static struct ctl_table_header *sunrpc_table_header;
|
||||
|
||||
static struct ctl_table xr_tunables_table[] = {
|
||||
{
|
||||
.procname = "rdma_slot_table_entries",
|
||||
.data = &xprt_rdma_slot_table_entries,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &min_slot_table_size,
|
||||
.extra2 = &max_slot_table_size
|
||||
},
|
||||
{
|
||||
.procname = "rdma_max_inline_read",
|
||||
.data = &xprt_rdma_max_inline_read,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_max_inline_write",
|
||||
.data = &xprt_rdma_max_inline_write,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_inline_write_padding",
|
||||
.data = &xprt_rdma_inline_write_padding,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &max_padding,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_memreg_strategy",
|
||||
.data = &xprt_rdma_memreg_strategy,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &min_memreg,
|
||||
.extra2 = &max_memreg,
|
||||
},
|
||||
{
|
||||
.procname = "rdma_pad_optimize",
|
||||
.data = &xprt_rdma_pad_optimize,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{ },
|
||||
};
|
||||
|
||||
static struct ctl_table sunrpc_table[] = {
|
||||
{
|
||||
.procname = "sunrpc",
|
||||
.mode = 0555,
|
||||
.child = xr_tunables_table
|
||||
},
|
||||
{ },
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#define RPCRDMA_BIND_TO (60U * HZ)
|
||||
#define RPCRDMA_INIT_REEST_TO (5U * HZ)
|
||||
#define RPCRDMA_MAX_REEST_TO (30U * HZ)
|
||||
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
|
||||
|
||||
static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
|
||||
|
||||
static void
|
||||
xprt_rdma_format_addresses(struct rpc_xprt *xprt)
|
||||
{
|
||||
struct sockaddr *sap = (struct sockaddr *)
|
||||
&rpcx_to_rdmad(xprt).addr;
|
||||
struct sockaddr_in *sin = (struct sockaddr_in *)sap;
|
||||
char buf[64];
|
||||
|
||||
(void)rpc_ntop(sap, buf, sizeof(buf));
|
||||
xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
|
||||
|
||||
snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
|
||||
xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
|
||||
|
||||
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
|
||||
|
||||
snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
|
||||
xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
|
||||
|
||||
snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
|
||||
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
|
||||
|
||||
/* netid */
|
||||
xprt->address_strings[RPC_DISPLAY_NETID] = "rdma";
|
||||
}
|
||||
|
||||
static void
|
||||
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < RPC_DISPLAY_MAX; i++)
|
||||
switch (i) {
|
||||
case RPC_DISPLAY_PROTO:
|
||||
case RPC_DISPLAY_NETID:
|
||||
continue;
|
||||
default:
|
||||
kfree(xprt->address_strings[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
xprt_rdma_connect_worker(struct work_struct *work)
|
||||
{
|
||||
struct rpcrdma_xprt *r_xprt =
|
||||
container_of(work, struct rpcrdma_xprt, rdma_connect.work);
|
||||
struct rpc_xprt *xprt = &r_xprt->xprt;
|
||||
int rc = 0;
|
||||
|
||||
xprt_clear_connected(xprt);
|
||||
|
||||
dprintk("RPC: %s: %sconnect\n", __func__,
|
||||
r_xprt->rx_ep.rep_connected != 0 ? "re" : "");
|
||||
rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
||||
if (rc)
|
||||
xprt_wake_pending_tasks(xprt, rc);
|
||||
|
||||
dprintk("RPC: %s: exit\n", __func__);
|
||||
xprt_clear_connecting(xprt);
|
||||
}
|
||||
|
||||
/*
|
||||
* xprt_rdma_destroy
|
||||
*
|
||||
* Destroy the xprt.
|
||||
* Free all memory associated with the object, including its own.
|
||||
* NOTE: none of the *destroy methods free memory for their top-level
|
||||
* objects, even though they may have allocated it (they do free
|
||||
* private memory). It's up to the caller to handle it. In this
|
||||
* case (RDMA transport), all structure memory is inlined with the
|
||||
* struct rpcrdma_xprt.
|
||||
*/
|
||||
static void
|
||||
xprt_rdma_destroy(struct rpc_xprt *xprt)
|
||||
{
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
|
||||
dprintk("RPC: %s: called\n", __func__);
|
||||
|
||||
cancel_delayed_work_sync(&r_xprt->rdma_connect);
|
||||
|
||||
xprt_clear_connected(xprt);
|
||||
|
||||
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
|
||||
rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
||||
rpcrdma_ia_close(&r_xprt->rx_ia);
|
||||
|
||||
xprt_rdma_free_addresses(xprt);
|
||||
|
||||
xprt_free(xprt);
|
||||
|
||||
dprintk("RPC: %s: returning\n", __func__);
|
||||
|
||||
module_put(THIS_MODULE);
|
||||
}
|
||||
|
||||
static const struct rpc_timeout xprt_rdma_default_timeout = {
|
||||
.to_initval = 60 * HZ,
|
||||
.to_maxval = 60 * HZ,
|
||||
};
|
||||
|
||||
/**
|
||||
* xprt_setup_rdma - Set up transport to use RDMA
|
||||
*
|
||||
* @args: rpc transport arguments
|
||||
*/
|
||||
static struct rpc_xprt *
|
||||
xprt_setup_rdma(struct xprt_create *args)
|
||||
{
|
||||
struct rpcrdma_create_data_internal cdata;
|
||||
struct rpc_xprt *xprt;
|
||||
struct rpcrdma_xprt *new_xprt;
|
||||
struct rpcrdma_ep *new_ep;
|
||||
struct sockaddr_in *sin;
|
||||
int rc;
|
||||
|
||||
if (args->addrlen > sizeof(xprt->addr)) {
|
||||
dprintk("RPC: %s: address too large\n", __func__);
|
||||
return ERR_PTR(-EBADF);
|
||||
}
|
||||
|
||||
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt),
|
||||
xprt_rdma_slot_table_entries,
|
||||
xprt_rdma_slot_table_entries);
|
||||
if (xprt == NULL) {
|
||||
dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n",
|
||||
__func__);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* 60 second timeout, no retries */
|
||||
xprt->timeout = &xprt_rdma_default_timeout;
|
||||
xprt->bind_timeout = RPCRDMA_BIND_TO;
|
||||
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
|
||||
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
|
||||
|
||||
xprt->resvport = 0; /* privileged port not needed */
|
||||
xprt->tsh_size = 0; /* RPC-RDMA handles framing */
|
||||
xprt->ops = &xprt_rdma_procs;
|
||||
|
||||
/*
|
||||
* Set up RDMA-specific connect data.
|
||||
*/
|
||||
|
||||
/* Put server RDMA address in local cdata */
|
||||
memcpy(&cdata.addr, args->dstaddr, args->addrlen);
|
||||
|
||||
/* Ensure xprt->addr holds valid server TCP (not RDMA)
|
||||
* address, for any side protocols which peek at it */
|
||||
xprt->prot = IPPROTO_TCP;
|
||||
xprt->addrlen = args->addrlen;
|
||||
memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
|
||||
|
||||
sin = (struct sockaddr_in *)&cdata.addr;
|
||||
if (ntohs(sin->sin_port) != 0)
|
||||
xprt_set_bound(xprt);
|
||||
|
||||
dprintk("RPC: %s: %pI4:%u\n",
|
||||
__func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
|
||||
|
||||
/* Set max requests */
|
||||
cdata.max_requests = xprt->max_reqs;
|
||||
|
||||
/* Set some length limits */
|
||||
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
|
||||
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
|
||||
|
||||
cdata.inline_wsize = xprt_rdma_max_inline_write;
|
||||
if (cdata.inline_wsize > cdata.wsize)
|
||||
cdata.inline_wsize = cdata.wsize;
|
||||
|
||||
cdata.inline_rsize = xprt_rdma_max_inline_read;
|
||||
if (cdata.inline_rsize > cdata.rsize)
|
||||
cdata.inline_rsize = cdata.rsize;
|
||||
|
||||
cdata.padding = xprt_rdma_inline_write_padding;
|
||||
|
||||
/*
|
||||
* Create new transport instance, which includes initialized
|
||||
* o ia
|
||||
* o endpoint
|
||||
* o buffers
|
||||
*/
|
||||
|
||||
new_xprt = rpcx_to_rdmax(xprt);
|
||||
|
||||
rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
|
||||
xprt_rdma_memreg_strategy);
|
||||
if (rc)
|
||||
goto out1;
|
||||
|
||||
/*
|
||||
* initialize and create ep
|
||||
*/
|
||||
new_xprt->rx_data = cdata;
|
||||
new_ep = &new_xprt->rx_ep;
|
||||
new_ep->rep_remote_addr = cdata.addr;
|
||||
|
||||
rc = rpcrdma_ep_create(&new_xprt->rx_ep,
|
||||
&new_xprt->rx_ia, &new_xprt->rx_data);
|
||||
if (rc)
|
||||
goto out2;
|
||||
|
||||
/*
|
||||
* Allocate pre-registered send and receive buffers for headers and
|
||||
* any inline data. Also specify any padding which will be provided
|
||||
* from a preregistered zero buffer.
|
||||
*/
|
||||
rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia,
|
||||
&new_xprt->rx_data);
|
||||
if (rc)
|
||||
goto out3;
|
||||
|
||||
/*
|
||||
* Register a callback for connection events. This is necessary because
|
||||
* connection loss notification is async. We also catch connection loss
|
||||
* when reaping receives.
|
||||
*/
|
||||
INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker);
|
||||
new_ep->rep_func = rpcrdma_conn_func;
|
||||
new_ep->rep_xprt = xprt;
|
||||
|
||||
xprt_rdma_format_addresses(xprt);
|
||||
xprt->max_payload = rpcrdma_max_payload(new_xprt);
|
||||
dprintk("RPC: %s: transport data payload maximum: %zu bytes\n",
|
||||
__func__, xprt->max_payload);
|
||||
|
||||
if (!try_module_get(THIS_MODULE))
|
||||
goto out4;
|
||||
|
||||
return xprt;
|
||||
|
||||
out4:
|
||||
xprt_rdma_free_addresses(xprt);
|
||||
rc = -EINVAL;
|
||||
out3:
|
||||
rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia);
|
||||
out2:
|
||||
rpcrdma_ia_close(&new_xprt->rx_ia);
|
||||
out1:
|
||||
xprt_free(xprt);
|
||||
return ERR_PTR(rc);
|
||||
}
|
||||
|
||||
/*
|
||||
* Close a connection, during shutdown or timeout/reconnect
|
||||
*/
|
||||
static void
|
||||
xprt_rdma_close(struct rpc_xprt *xprt)
|
||||
{
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
|
||||
dprintk("RPC: %s: closing\n", __func__);
|
||||
if (r_xprt->rx_ep.rep_connected > 0)
|
||||
xprt->reestablish_timeout = 0;
|
||||
xprt_disconnect_done(xprt);
|
||||
rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia);
|
||||
}
|
||||
|
||||
static void
|
||||
xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
|
||||
{
|
||||
struct sockaddr_in *sap;
|
||||
|
||||
sap = (struct sockaddr_in *)&xprt->addr;
|
||||
sap->sin_port = htons(port);
|
||||
sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr;
|
||||
sap->sin_port = htons(port);
|
||||
dprintk("RPC: %s: %u\n", __func__, port);
|
||||
}
|
||||
|
||||
static void
|
||||
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
|
||||
{
|
||||
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
|
||||
|
||||
if (r_xprt->rx_ep.rep_connected != 0) {
|
||||
/* Reconnect */
|
||||
schedule_delayed_work(&r_xprt->rdma_connect,
|
||||
xprt->reestablish_timeout);
|
||||
xprt->reestablish_timeout <<= 1;
|
||||
if (xprt->reestablish_timeout > RPCRDMA_MAX_REEST_TO)
|
||||
xprt->reestablish_timeout = RPCRDMA_MAX_REEST_TO;
|
||||
else if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
|
||||
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
|
||||
} else {
|
||||
schedule_delayed_work(&r_xprt->rdma_connect, 0);
|
||||
if (!RPC_IS_ASYNC(task))
|
||||
flush_delayed_work(&r_xprt->rdma_connect);
|
||||
}
|
||||
}

/*
 * The RDMA allocate/free functions need the task structure as a place
 * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
 * sequence. For this reason, the recv buffers are attached to send
 * buffers for portions of the RPC. Note that the RPC layer allocates
 * both send and receive buffers in the same call. We may register
 * the receive buffer portion when using reply chunks.
 */
static void *
xprt_rdma_allocate(struct rpc_task *task, size_t size)
{
	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
	struct rpcrdma_req *req, *nreq;

	req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf);
	if (req == NULL)
		return NULL;

	if (size > req->rl_size) {
		dprintk("RPC: %s: size %zd too large for buffer[%zd]: "
			"prog %d vers %d proc %d\n",
			__func__, size, req->rl_size,
			task->tk_client->cl_prog, task->tk_client->cl_vers,
			task->tk_msg.rpc_proc->p_proc);
		/*
		 * Outgoing length shortage. Our inline write max must have
		 * been configured to perform direct i/o.
		 *
		 * This is therefore a large metadata operation, and the
		 * allocate call was made on the maximum possible message,
		 * e.g. containing long filename(s) or symlink data. In
		 * fact, while these metadata operations *might* carry
		 * large outgoing payloads, they rarely *do*. However, we
		 * have to commit to the request here, so reallocate and
		 * register it now. The data path will never require this
		 * reallocation.
		 *
		 * If the allocation or registration fails, the RPC framework
		 * will (doggedly) retry.
		 */
		if (task->tk_flags & RPC_TASK_SWAPPER)
			nreq = kmalloc(sizeof *req + size, GFP_ATOMIC);
		else
			nreq = kmalloc(sizeof *req + size, GFP_NOFS);
		if (nreq == NULL)
			goto outfail;

		if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia,
				nreq->rl_base, size + sizeof(struct rpcrdma_req)
				- offsetof(struct rpcrdma_req, rl_base),
				&nreq->rl_handle, &nreq->rl_iov)) {
			kfree(nreq);
			goto outfail;
		}
		rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size;
		nreq->rl_size = size;
		nreq->rl_niovs = 0;
		nreq->rl_nchunks = 0;
		nreq->rl_buffer = (struct rpcrdma_buffer *)req;
		nreq->rl_reply = req->rl_reply;
		memcpy(nreq->rl_segments,
			req->rl_segments, sizeof nreq->rl_segments);
		/* flag the swap with an unused field */
		nreq->rl_iov.length = 0;
		req->rl_reply = NULL;
		req = nreq;
	}
	dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req);
	req->rl_connect_cookie = 0;	/* our reserved value */
	return req->rl_xdr_buf;

outfail:
	rpcrdma_buffer_put(req);
	rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++;
	return NULL;
}
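
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the pointer handed back above is req->rl_xdr_buf, i.e. the RPC buffer
 * lives at the tail of struct rpcrdma_req. The owning request can
 * therefore be recovered from the buffer address alone, which is exactly
 * what xprt_rdma_free() below and the rpcr_to_rdmar() macro do:
 */
#if 0	/* example only, never compiled */
	struct rpcrdma_req *req =
		container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
#endif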

/*
 * This function returns all RDMA resources to the pool.
 */
static void
xprt_rdma_free(void *buffer)
{
	struct rpcrdma_req *req;
	struct rpcrdma_xprt *r_xprt;
	struct rpcrdma_rep *rep;
	int i;

	if (buffer == NULL)
		return;

	req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]);
	if (req->rl_iov.length == 0) {	/* see allocate above */
		r_xprt = container_of(((struct rpcrdma_req *) req->rl_buffer)->rl_buffer,
				      struct rpcrdma_xprt, rx_buf);
	} else
		r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
	rep = req->rl_reply;

	dprintk("RPC: %s: called on 0x%p%s\n",
		__func__, rep, (rep && rep->rr_func) ? " (with waiter)" : "");

	/*
	 * Finish the deregistration. The process is considered
	 * complete when the rr_func vector becomes NULL - this
	 * was put in place during rpcrdma_reply_handler() - the wait
	 * call below will not block if the dereg is "done". If
	 * interrupted, our framework will clean up.
	 */
	for (i = 0; req->rl_nchunks;) {
		--req->rl_nchunks;
		i += rpcrdma_deregister_external(
			&req->rl_segments[i], r_xprt);
	}

	if (req->rl_iov.length == 0) {	/* see allocate above */
		struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer;
		oreq->rl_reply = req->rl_reply;
		(void) rpcrdma_deregister_internal(&r_xprt->rx_ia,
						   req->rl_handle,
						   &req->rl_iov);
		kfree(req);
		req = oreq;
	}

	/* Put back request+reply buffers */
	rpcrdma_buffer_put(req);
}

/*
 * send_request invokes the meat of RPC RDMA. It must do the following:
 *  1. Marshal the RPC request into an RPC RDMA request, which means
 *     putting a header in front of data, and creating IOVs for RDMA
 *     from those in the request.
 *  2. In marshaling, detect opportunities for RDMA, and use them.
 *  3. Post a recv message to set up asynch completion, then send
 *     the request (rpcrdma_ep_post).
 *  4. No partial sends are possible in the RPC-RDMA protocol (as in UDP).
 */

static int
xprt_rdma_send_request(struct rpc_task *task)
{
	struct rpc_rqst *rqst = task->tk_rqstp;
	struct rpc_xprt *xprt = rqst->rq_xprt;
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	int rc = 0;

	if (req->rl_niovs == 0)
		rc = rpcrdma_marshal_req(rqst);
	else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
		rc = rpcrdma_marshal_chunks(rqst, 0);
	if (rc < 0)
		goto failed_marshal;

	if (req->rl_reply == NULL)		/* e.g. reconnection */
		rpcrdma_recv_buffer_get(req);

	if (req->rl_reply) {
		req->rl_reply->rr_func = rpcrdma_reply_handler;
		/* this need only be done once, but... */
		req->rl_reply->rr_xprt = xprt;
	}

	/* Must suppress retransmit to maintain credits */
	if (req->rl_connect_cookie == xprt->connect_cookie)
		goto drop_connection;
	req->rl_connect_cookie = xprt->connect_cookie;

	if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
		goto drop_connection;

	rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
	rqst->rq_bytes_sent = 0;
	return 0;

failed_marshal:
	r_xprt->rx_stats.failed_marshal_count++;
	dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n",
		__func__, rc);
	if (rc == -EIO)
		return -EIO;
drop_connection:
	xprt_disconnect_done(xprt);
	return -ENOTCONN;	/* implies disconnect */
}

static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
{
	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
	long idle_time = 0;

	if (xprt_connected(xprt))
		idle_time = (long)(jiffies - xprt->last_used) / HZ;

	seq_printf(seq,
	  "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
	  "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",

	   0,	/* need a local port? */
	   xprt->stat.bind_count,
	   xprt->stat.connect_count,
	   xprt->stat.connect_time,
	   idle_time,
	   xprt->stat.sends,
	   xprt->stat.recvs,
	   xprt->stat.bad_xids,
	   xprt->stat.req_u,
	   xprt->stat.bklog_u,

	   r_xprt->rx_stats.read_chunk_count,
	   r_xprt->rx_stats.write_chunk_count,
	   r_xprt->rx_stats.reply_chunk_count,
	   r_xprt->rx_stats.total_rdma_request,
	   r_xprt->rx_stats.total_rdma_reply,
	   r_xprt->rx_stats.pullup_copy_count,
	   r_xprt->rx_stats.fixup_copy_count,
	   r_xprt->rx_stats.hardway_register_count,
	   r_xprt->rx_stats.failed_marshal_count,
	   r_xprt->rx_stats.bad_reply_count);
}

/*
 * Plumbing for rpc transport switch and kernel module
 */

static struct rpc_xprt_ops xprt_rdma_procs = {
	.reserve_xprt		= xprt_reserve_xprt_cong,
	.release_xprt		= xprt_release_xprt_cong,	/* sunrpc/xprt.c */
	.alloc_slot		= xprt_alloc_slot,
	.release_request	= xprt_release_rqst_cong,	/* ditto */
	.set_retrans_timeout	= xprt_set_retrans_timeout_def,	/* ditto */
	.rpcbind		= rpcb_getport_async,		/* sunrpc/rpcb_clnt.c */
	.set_port		= xprt_rdma_set_port,
	.connect		= xprt_rdma_connect,
	.buf_alloc		= xprt_rdma_allocate,
	.buf_free		= xprt_rdma_free,
	.send_request		= xprt_rdma_send_request,
	.close			= xprt_rdma_close,
	.destroy		= xprt_rdma_destroy,
	.print_stats		= xprt_rdma_print_stats
};

static struct xprt_class xprt_rdma = {
	.list			= LIST_HEAD_INIT(xprt_rdma.list),
	.name			= "rdma",
	.owner			= THIS_MODULE,
	.ident			= XPRT_TRANSPORT_RDMA,
	.setup			= xprt_setup_rdma,
};

static void __exit xprt_rdma_cleanup(void)
{
	int rc;

	dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
#ifdef RPC_DEBUG
	if (sunrpc_table_header) {
		unregister_sysctl_table(sunrpc_table_header);
		sunrpc_table_header = NULL;
	}
#endif
	rc = xprt_unregister_transport(&xprt_rdma);
	if (rc)
		dprintk("RPC: %s: xprt_unregister returned %i\n",
			__func__, rc);
}

static int __init xprt_rdma_init(void)
{
	int rc;

	rc = xprt_register_transport(&xprt_rdma);

	if (rc)
		return rc;

	dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");

	dprintk("Defaults:\n");
	dprintk("\tSlots %d\n"
		"\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
		xprt_rdma_slot_table_entries,
		xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
	dprintk("\tPadding %d\n\tMemreg %d\n",
		xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);

#ifdef RPC_DEBUG
	if (!sunrpc_table_header)
		sunrpc_table_header = register_sysctl_table(sunrpc_table);
#endif
	return 0;
}

module_init(xprt_rdma_init);
module_exit(xprt_rdma_cleanup);
2076
net/sunrpc/xprtrdma/verbs.c
Normal file
(File diff suppressed because it is too large)
402
net/sunrpc/xprtrdma/xprt_rdma.h
Normal file
@@ -0,0 +1,402 @@
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
#define _LINUX_SUNRPC_XPRT_RDMA_H

#include <linux/wait.h>			/* wait_queue_head_t, etc */
#include <linux/spinlock.h>		/* spinlock_t, etc */
#include <linux/atomic.h>		/* atomic_t, etc */
#include <linux/workqueue.h>		/* struct work_struct */

#include <rdma/rdma_cm.h>		/* RDMA connection api */
#include <rdma/ib_verbs.h>		/* RDMA verbs api */

#include <linux/sunrpc/clnt.h>		/* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */
#include <linux/sunrpc/svc.h>		/* RPCSVC_MAXPAYLOAD */

#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */

/*
 * Interface Adapter -- one per transport instance
 */
struct rpcrdma_ia {
	rwlock_t		ri_qplock;
	struct rdma_cm_id	*ri_id;
	struct ib_pd		*ri_pd;
	struct ib_mr		*ri_bind_mem;
	u32			ri_dma_lkey;
	int			ri_have_dma_lkey;
	struct completion	ri_done;
	int			ri_async_rc;
	enum rpcrdma_memreg	ri_memreg_strategy;
	unsigned int		ri_max_frmr_depth;
};

/*
 * RDMA Endpoint -- one per transport instance
 */

#define RPCRDMA_WC_BUDGET	(128)
#define RPCRDMA_POLLSIZE	(16)

struct rpcrdma_ep {
	atomic_t		rep_cqcount;
	int			rep_cqinit;
	int			rep_connected;
	struct rpcrdma_ia	*rep_ia;
	struct ib_qp_init_attr	rep_attr;
	wait_queue_head_t	rep_connect_wait;
	struct ib_sge		rep_pad;	/* holds zeroed pad */
	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
	void			(*rep_func)(struct rpcrdma_ep *);
	struct rpc_xprt		*rep_xprt;	/* for rep_func */
	struct rdma_conn_param	rep_remote_cma;
	struct sockaddr_storage	rep_remote_addr;
	struct delayed_work	rep_connect_worker;
	struct ib_wc		rep_send_wcs[RPCRDMA_POLLSIZE];
	struct ib_wc		rep_recv_wcs[RPCRDMA_POLLSIZE];
};

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)

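/*
 * Editor's note (illustrative sketch, not part of the original source):
 * rep_cqcount/rep_cqinit implement a completion-signalling budget. The
 * verbs code (its diff is suppressed above) is assumed to request a
 * signalled send completion only when the budget runs out, roughly like:
 */
#if 0	/* example only, never compiled */
	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;			/* unsignaled send */
	else {
		INIT_CQCOUNT(ep);			/* refill the budget */
		send_wr.send_flags = IB_SEND_SIGNALED;	/* reap completions */
	}
#endif
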
enum rpcrdma_chunktype {
	rpcrdma_noch = 0,
	rpcrdma_readch,
	rpcrdma_areadch,
	rpcrdma_writech,
	rpcrdma_replych
};

/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */

/* temporary static scatter/gather max */
#define RPCRDMA_MAX_DATA_SEGS	(64)	/* max scatter/gather */
#define RPCRDMA_MAX_SEGS	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
#define MAX_RPCRDMAHDR (\
	/* max supported RPC/RDMA header */ \
	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))

struct rpcrdma_buffer;

struct rpcrdma_rep {
	unsigned int		rr_len;		/* actual received reply length */
	struct rpcrdma_buffer	*rr_buffer;	/* home base for this structure */
	struct rpc_xprt		*rr_xprt;	/* needed for request/reply matching */
	void (*rr_func)(struct rpcrdma_rep *);	/* called by tasklet in softint */
	struct list_head	rr_list;	/* tasklet list */
	struct ib_sge		rr_iov;		/* for posting */
	struct ib_mr		*rr_handle;	/* handle for mem in rr_iov */
	char	rr_base[MAX_RPCRDMAHDR];	/* minimal inline receive buffer */
};

/*
 * struct rpcrdma_mw - external memory region metadata
 *
 * An external memory region is any buffer or page that is registered
 * on the fly (ie, not pre-registered).
 *
 * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
 * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
 * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
 * track of registration metadata while each RPC is pending.
 * rpcrdma_deregister_external() uses this metadata to unmap and
 * release these resources when an RPC is complete.
 */
enum rpcrdma_frmr_state {
	FRMR_IS_INVALID,	/* ready to be used */
	FRMR_IS_VALID,		/* in use */
	FRMR_IS_STALE,		/* failed completion */
};

struct rpcrdma_frmr {
	struct ib_fast_reg_page_list	*fr_pgl;
	struct ib_mr			*fr_mr;
	enum rpcrdma_frmr_state		fr_state;
};

struct rpcrdma_mw {
	union {
		struct ib_fmr		*fmr;
		struct rpcrdma_frmr	frmr;
	} r;
	struct list_head	mw_list;
	struct list_head	mw_all;
};

/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */

struct rpcrdma_mr_seg {			/* chunk descriptors */
	union {				/* chunk memory handles */
		struct ib_mr	*rl_mr;		/* if registered directly */
		struct rpcrdma_mw *rl_mw;	/* if registered from region */
	} mr_chunk;
	u64		mr_base;	/* registration result */
	u32		mr_rkey;	/* registration result */
	u32		mr_len;		/* length of chunk or segment */
	int		mr_nsegs;	/* number of segments in chunk or 0 */
	enum dma_data_direction mr_dir;	/* segment mapping direction */
	dma_addr_t	mr_dma;		/* segment mapping address */
	size_t		mr_dmalen;	/* segment mapping length */
	struct page	*mr_page;	/* owning page, if any */
	char		*mr_offset;	/* kva if no page, else offset */
};

struct rpcrdma_req {
	size_t		rl_size;	/* actual length of buffer */
	unsigned int	rl_niovs;	/* 0, 2 or 4 */
	unsigned int	rl_nchunks;	/* non-zero if chunks */
	unsigned int	rl_connect_cookie;	/* retry detection */
	enum rpcrdma_chunktype	rl_rtype, rl_wtype;
	struct rpcrdma_buffer *rl_buffer;	/* home base for this structure */
	struct rpcrdma_rep	*rl_reply;	/* holder for reply buffer */
	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];	/* chunk segments */
	struct ib_sge	rl_send_iov[4];	/* for active requests */
	struct ib_sge	rl_iov;		/* for posting */
	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
	char		rl_base[MAX_RPCRDMAHDR];	/* start of actual buffer */
	__u32		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
};
#define rpcr_to_rdmar(r) \
	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])

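/*
 * Editor's note (illustrative sketch, not part of the original source):
 * rpcr_to_rdmar() is how the transport code (e.g. xprt_rdma_send_request
 * in transport.c above) maps an rpc_rqst back onto its rpcrdma_req, since
 * rq_buffer points at rl_xdr_buf[] inside the request:
 */
#if 0	/* example only, never compiled */
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
#endif
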
/*
 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
 * inline requests/replies, and client/server credits.
 *
 * One of these is associated with a transport instance
 */
struct rpcrdma_buffer {
	spinlock_t	rb_lock;	/* protects indexes */
	atomic_t	rb_credits;	/* most recent server credits */
	int		rb_max_requests;	/* client max requests */
	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
	struct list_head rb_all;
	int		rb_send_index;
	struct rpcrdma_req	**rb_send_bufs;
	int		rb_recv_index;
	struct rpcrdma_rep	**rb_recv_bufs;
	char		*rb_pool;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)

/*
 * Internal structure for transport instance creation. This
 * exists primarily for modularity.
 *
 * This data should be set with mount options
 */
struct rpcrdma_create_data_internal {
	struct sockaddr_storage	addr;	/* RDMA server address */
	unsigned int	max_requests;	/* max requests (slots) in flight */
	unsigned int	rsize;		/* mount rsize - max read hdr+data */
	unsigned int	wsize;		/* mount wsize - max write hdr+data */
	unsigned int	inline_rsize;	/* max non-rdma read data payload */
	unsigned int	inline_wsize;	/* max non-rdma write data payload */
	unsigned int	padding;	/* non-rdma write header padding */
};

#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad(rq->rq_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad(rq->rq_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	rpcx_to_rdmad(rq->rq_xprt).padding

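/*
 * Editor's note (illustrative sketch, not part of the original source):
 * these thresholds are what the marshalling code in rpc_rdma.c compares
 * RPC buffer sizes against when choosing between an inline transfer and
 * a chunk list. Assuming that usage, the decision for the call direction
 * looks roughly like:
 */
#if 0	/* example only, never compiled */
	if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
		rtype = rpcrdma_noch;		/* whole call fits inline */
	else
		rtype = rpcrdma_readch;		/* expose data as a read chunk */
#endif
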
/*
 * Statistics for RPCRDMA
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
};

/*
 * RPCRDMA transport -- encapsulates the structures above for
 * integration with RPC.
 *
 * The contained structures are embedded, not pointers,
 * for convenience. This structure need not be visible externally.
 *
 * It is allocated and initialized during mount, and released
 * during unmount.
 */
struct rpcrdma_xprt {
	struct rpc_xprt		xprt;
	struct rpcrdma_ia	rx_ia;
	struct rpcrdma_ep	rx_ep;
	struct rpcrdma_buffer	rx_buf;
	struct rpcrdma_create_data_internal rx_data;
	struct delayed_work	rdma_connect;
	struct rpcrdma_stats	rx_stats;
};

#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)

/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;

/*
 * Interface Adapter calls - xprtrdma/verbs.c
 */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);

/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);

/*
 * Buffer calls - xprtrdma/verbs.c
 */
int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
				struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
				struct ib_mr **, struct ib_sge *);
int rpcrdma_deregister_internal(struct rpcrdma_ia *,
				struct ib_mr *, struct ib_sge *);

int rpcrdma_register_external(struct rpcrdma_mr_seg *,
				int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
				struct rpcrdma_xprt *);

/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
void rpcrdma_connect_worker(struct work_struct *);
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t);
int rpcrdma_marshal_req(struct rpc_rqst *);
size_t rpcrdma_max_payload(struct rpcrdma_xprt *);

/* Temporary NFS request map cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_map_cachep;
/* WR context cache. Created in svc_rdma.c */
extern struct kmem_cache *svc_rdma_ctxt_cachep;
/* Workqueue created in svc_rdma.c */
extern struct workqueue_struct *svc_rdma_wq;

#if RPCSVC_MAXPAYLOAD < (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
#define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD
#else
#define RPCSVC_MAXPAYLOAD_RDMA (RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT)
#endif

#endif	/* _LINUX_SUNRPC_XPRT_RDMA_H */