Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

28
net/rds/Kconfig Normal file
View file

@ -0,0 +1,28 @@
config RDS
tristate "The RDS Protocol"
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
sequenced delivery of datagrams over Infiniband, iWARP,
or TCP.
config RDS_RDMA
tristate "RDS over Infiniband and iWARP"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
Allow RDS to use Infiniband and iWARP as a transport.
This transport supports RDMA operations.
config RDS_TCP
tristate "RDS over TCP"
depends on RDS
---help---
Allow RDS to use TCP as a transport.
This transport does not support RDMA operations.
config RDS_DEBUG
bool "RDS debugging messages"
depends on RDS
default n

19
net/rds/Makefile Normal file
View file

@ -0,0 +1,19 @@
obj-$(CONFIG_RDS) += rds.o
rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
recv.o send.o stats.o sysctl.o threads.o transport.o \
loop.o page.o rdma.o
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
tcp_send.o tcp_stats.o
ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG

599
net/rds/af_rds.c Normal file
View file

@ -0,0 +1,599 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/poll.h>
#include <net/sock.h>
#include "rds.h"
char *rds_str_array(char **array, size_t elements, size_t index)
{
if ((index < elements) && array[index])
return array[index];
else
return "unknown";
}
EXPORT_SYMBOL(rds_str_array);
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
static unsigned long rds_sock_count;
static LIST_HEAD(rds_sock_list);
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
/*
* This is called as the final descriptor referencing this socket is closed.
* We have to unbind the socket so that another socket can be bound to the
* address it was using.
*
* We have to be careful about racing with the incoming path. sock_orphan()
* sets SOCK_DEAD and we use that as an indicator to the rx path that new
* messages shouldn't be queued.
*/
static int rds_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct rds_sock *rs;
if (!sk)
goto out;
rs = rds_sk_to_rs(sk);
sock_orphan(sk);
/* Note - rds_clear_recv_queue grabs rs_recv_lock, so
* that ensures the recv path has completed messing
* with the socket. */
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);
/*
* the binding lookup hash uses rcu, we need to
* make sure we synchronize_rcu before we free our
* entry
*/
rds_remove_bound(rs);
synchronize_rcu();
rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
spin_lock_bh(&rds_sock_lock);
list_del_init(&rs->rs_item);
rds_sock_count--;
spin_unlock_bh(&rds_sock_lock);
rds_trans_put(rs->rs_transport);
sock->sk = NULL;
sock_put(sk);
out:
return 0;
}
/*
* Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
* _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
* to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
* this seems more conservative.
* NB - normally, one would use sk_callback_lock for this, but we can
* get here from interrupts, whereas the network code grabs sk_callback_lock
* with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
*/
void rds_wake_sk_sleep(struct rds_sock *rs)
{
unsigned long flags;
read_lock_irqsave(&rs->rs_recv_lock, flags);
__rds_wake_sk_sleep(rds_rs_to_sk(rs));
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
int *uaddr_len, int peer)
{
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
/* racey, don't care */
if (peer) {
if (!rs->rs_conn_addr)
return -ENOTCONN;
sin->sin_port = rs->rs_conn_port;
sin->sin_addr.s_addr = rs->rs_conn_addr;
} else {
sin->sin_port = rs->rs_bound_port;
sin->sin_addr.s_addr = rs->rs_bound_addr;
}
sin->sin_family = AF_INET;
*uaddr_len = sizeof(*sin);
return 0;
}
/*
* RDS' poll is without a doubt the least intuitive part of the interface,
* as POLLIN and POLLOUT do not behave entirely as you would expect from
* a network protocol.
*
* POLLIN is asserted if
* - there is data on the receive queue.
* - to signal that a previously congested destination may have become
* uncongested
* - A notification has been queued to the socket (this can be a congestion
* update, or a RDMA completion).
*
* POLLOUT is asserted if there is room on the send queue. This does not mean
* however, that the next sendmsg() call will succeed. If the application tries
* to send to a congested destination, the system call may still fail (and
* return ENOBUFS).
*/
static unsigned int rds_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
unsigned int mask = 0;
unsigned long flags;
poll_wait(file, sk_sleep(sk), wait);
if (rs->rs_seen_congestion)
poll_wait(file, &rds_poll_waitq, wait);
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!rs->rs_cong_monitor) {
/* When a congestion map was updated, we signal POLLIN for
* "historical" reasons. Applications can also poll for
* WRBAND instead. */
if (rds_cong_updated_since(&rs->rs_cong_track))
mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
} else {
spin_lock(&rs->rs_lock);
if (rs->rs_cong_notify)
mask |= (POLLIN | POLLRDNORM);
spin_unlock(&rs->rs_lock);
}
if (!list_empty(&rs->rs_recv_queue) ||
!list_empty(&rs->rs_notify_queue))
mask |= (POLLIN | POLLRDNORM);
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (POLLOUT | POLLWRNORM);
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
/* clear state any time we wake a seen-congested socket */
if (mask)
rs->rs_seen_congestion = 0;
return mask;
}
static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return -ENOIOCTLCMD;
}
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
int len)
{
struct sockaddr_in sin;
int ret = 0;
/* racing with another thread binding seems ok here */
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
if (len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
}
if (copy_from_user(&sin, optval, sizeof(sin))) {
ret = -EFAULT;
goto out;
}
rds_send_drop_to(rs, &sin);
out:
return ret;
}
static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
int optlen)
{
int value;
if (optlen < sizeof(int))
return -EINVAL;
if (get_user(value, (int __user *) optval))
return -EFAULT;
*optvar = !!value;
return 0;
}
static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
int optlen)
{
int ret;
ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
if (ret == 0) {
if (rs->rs_cong_monitor) {
rds_cong_add_socket(rs);
} else {
rds_cong_remove_socket(rs);
rs->rs_cong_mask = 0;
rs->rs_cong_notify = 0;
}
}
return ret;
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret;
if (level != SOL_RDS) {
ret = -ENOPROTOOPT;
goto out;
}
switch (optname) {
case RDS_CANCEL_SENT_TO:
ret = rds_cancel_sent_to(rs, optval, optlen);
break;
case RDS_GET_MR:
ret = rds_get_mr(rs, optval, optlen);
break;
case RDS_GET_MR_FOR_DEST:
ret = rds_get_mr_for_dest(rs, optval, optlen);
break;
case RDS_FREE_MR:
ret = rds_free_mr(rs, optval, optlen);
break;
case RDS_RECVERR:
ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
break;
case RDS_CONG_MONITOR:
ret = rds_cong_monitor(rs, optval, optlen);
break;
default:
ret = -ENOPROTOOPT;
}
out:
return ret;
}
static int rds_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len;
if (level != SOL_RDS)
goto out;
if (get_user(len, optlen)) {
ret = -EFAULT;
goto out;
}
switch (optname) {
case RDS_INFO_FIRST ... RDS_INFO_LAST:
ret = rds_info_getsockopt(sock, optname, optval,
optlen);
break;
case RDS_RECVERR:
if (len < sizeof(int))
ret = -EINVAL;
else
if (put_user(rs->rs_recverr, (int __user *) optval) ||
put_user(sizeof(int), optlen))
ret = -EFAULT;
else
ret = 0;
break;
default:
break;
}
out:
return ret;
}
static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
int ret = 0;
lock_sock(sk);
if (addr_len != sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
}
if (sin->sin_family != AF_INET) {
ret = -EAFNOSUPPORT;
goto out;
}
if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
ret = -EDESTADDRREQ;
goto out;
}
rs->rs_conn_addr = sin->sin_addr.s_addr;
rs->rs_conn_port = sin->sin_port;
out:
release_sock(sk);
return ret;
}
static struct proto rds_proto = {
.name = "RDS",
.owner = THIS_MODULE,
.obj_size = sizeof(struct rds_sock),
};
static const struct proto_ops rds_proto_ops = {
.family = AF_RDS,
.owner = THIS_MODULE,
.release = rds_release,
.bind = rds_bind,
.connect = rds_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = rds_getname,
.poll = rds_poll,
.ioctl = rds_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = rds_setsockopt,
.getsockopt = rds_getsockopt,
.sendmsg = rds_sendmsg,
.recvmsg = rds_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
struct rds_sock *rs;
sock_init_data(sock, sk);
sock->ops = &rds_proto_ops;
sk->sk_protocol = protocol;
rs = rds_sk_to_rs(sk);
spin_lock_init(&rs->rs_lock);
rwlock_init(&rs->rs_recv_lock);
INIT_LIST_HEAD(&rs->rs_send_queue);
INIT_LIST_HEAD(&rs->rs_recv_queue);
INIT_LIST_HEAD(&rs->rs_notify_queue);
INIT_LIST_HEAD(&rs->rs_cong_list);
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
spin_lock_bh(&rds_sock_lock);
list_add_tail(&rs->rs_item, &rds_sock_list);
rds_sock_count++;
spin_unlock_bh(&rds_sock_lock);
return 0;
}
static int rds_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
if (sock->type != SOCK_SEQPACKET || protocol)
return -ESOCKTNOSUPPORT;
sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
if (!sk)
return -ENOMEM;
return __rds_create(sock, sk, protocol);
}
void rds_sock_addref(struct rds_sock *rs)
{
sock_hold(rds_rs_to_sk(rs));
}
void rds_sock_put(struct rds_sock *rs)
{
sock_put(rds_rs_to_sk(rs));
}
static const struct net_proto_family rds_family_ops = {
.family = AF_RDS,
.create = rds_create,
.owner = THIS_MODULE,
};
static void rds_sock_inc_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
struct rds_sock *rs;
struct rds_incoming *inc;
unsigned int total = 0;
len /= sizeof(struct rds_info_message);
spin_lock_bh(&rds_sock_lock);
list_for_each_entry(rs, &rds_sock_list, rs_item) {
read_lock(&rs->rs_recv_lock);
/* XXX too lazy to maintain counts.. */
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
total++;
if (total <= len)
rds_inc_info_copy(inc, iter, inc->i_saddr,
rs->rs_bound_addr, 1);
}
read_unlock(&rs->rs_recv_lock);
}
spin_unlock_bh(&rds_sock_lock);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
}
static void rds_sock_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
struct rds_info_socket sinfo;
struct rds_sock *rs;
len /= sizeof(struct rds_info_socket);
spin_lock_bh(&rds_sock_lock);
if (len < rds_sock_count)
goto out;
list_for_each_entry(rs, &rds_sock_list, rs_item) {
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
sinfo.bound_addr = rs->rs_bound_addr;
sinfo.connected_addr = rs->rs_conn_addr;
sinfo.bound_port = rs->rs_bound_port;
sinfo.connected_port = rs->rs_conn_port;
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
rds_info_copy(iter, &sinfo, sizeof(sinfo));
}
out:
lens->nr = rds_sock_count;
lens->each = sizeof(struct rds_info_socket);
spin_unlock_bh(&rds_sock_lock);
}
static void rds_exit(void)
{
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
rds_conn_exit();
rds_cong_exit();
rds_sysctl_exit();
rds_threads_exit();
rds_stats_exit();
rds_page_exit();
rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
}
module_exit(rds_exit);
static int rds_init(void)
{
int ret;
ret = rds_conn_init();
if (ret)
goto out;
ret = rds_threads_init();
if (ret)
goto out_conn;
ret = rds_sysctl_init();
if (ret)
goto out_threads;
ret = rds_stats_init();
if (ret)
goto out_sysctl;
ret = proto_register(&rds_proto, 1);
if (ret)
goto out_stats;
ret = sock_register(&rds_family_ops);
if (ret)
goto out_proto;
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
goto out;
out_proto:
proto_unregister(&rds_proto);
out_stats:
rds_stats_exit();
out_sysctl:
rds_sysctl_exit();
out_threads:
rds_threads_exit();
out_conn:
rds_conn_exit();
rds_cong_exit();
rds_page_exit();
out:
return ret;
}
module_init(rds_init);
#define DRV_VERSION "4.0"
#define DRV_RELDATE "Feb 12, 2009"
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
" v" DRV_VERSION " (" DRV_RELDATE ")");
MODULE_VERSION(DRV_VERSION);
MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS_NETPROTO(PF_RDS);

203
net/rds/bind.c Normal file
View file

@ -0,0 +1,203 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/jhash.h>
#include <linux/ratelimit.h>
#include "rds.h"
#define BIND_HASH_SIZE 1024
static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
static DEFINE_SPINLOCK(rds_bind_lock);
static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
{
return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
(BIND_HASH_SIZE - 1));
}
static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
struct rds_sock *insert)
{
struct rds_sock *rs;
struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
rcu_read_lock();
hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port);
if (cmp == needle) {
rcu_read_unlock();
return rs;
}
}
rcu_read_unlock();
if (insert) {
/*
* make sure our addr and port are set before
* we are added to the list, other people
* in rcu will find us as soon as the
* hlist_add_head_rcu is done
*/
insert->rs_bound_addr = addr;
insert->rs_bound_port = port;
rds_sock_addref(insert);
hlist_add_head_rcu(&insert->rs_bound_node, head);
}
return NULL;
}
/*
* Return the rds_sock bound at the given local address.
*
* The rx path can race with rds_release. We notice if rds_release() has
* marked this socket and don't return a rs ref to the rx path.
*/
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
struct rds_sock *rs;
rs = rds_bind_lookup(addr, port, NULL);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
return rs;
}
/* returns -ve errno or +ve port */
static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
{
unsigned long flags;
int ret = -EADDRINUSE;
u16 rover, last;
if (*port != 0) {
rover = be16_to_cpu(*port);
last = rover;
} else {
rover = max_t(u16, prandom_u32(), 2);
last = rover - 1;
}
spin_lock_irqsave(&rds_bind_lock, flags);
do {
if (rover == 0)
rover++;
if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
*port = rs->rs_bound_port;
ret = 0;
rdsdebug("rs %p binding to %pI4:%d\n",
rs, &addr, (int)ntohs(*port));
break;
}
} while (rover++ != last);
spin_unlock_irqrestore(&rds_bind_lock, flags);
return ret;
}
void rds_remove_bound(struct rds_sock *rs)
{
unsigned long flags;
spin_lock_irqsave(&rds_bind_lock, flags);
if (rs->rs_bound_addr) {
rdsdebug("rs %p unbinding from %pI4:%d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
hlist_del_init_rcu(&rs->rs_bound_node);
rds_sock_put(rs);
rs->rs_bound_addr = 0;
}
spin_unlock_irqrestore(&rds_bind_lock, flags);
}
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
struct rds_transport *trans;
int ret = 0;
lock_sock(sk);
if (addr_len != sizeof(struct sockaddr_in) ||
sin->sin_family != AF_INET ||
rs->rs_bound_addr ||
sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
ret = -EINVAL;
goto out;
}
ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
if (ret)
goto out;
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
"load rds_tcp or rds_rdma?\n");
goto out;
}
rs->rs_transport = trans;
ret = 0;
out:
release_sock(sk);
/* we might have called rds_remove_bound on error */
if (ret)
synchronize_rcu();
return ret;
}

406
net/rds/cong.c Normal file
View file

@ -0,0 +1,406 @@
/*
* Copyright (c) 2007 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/rbtree.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include "rds.h"
/*
* This file implements the receive side of the unconventional congestion
* management in RDS.
*
* Messages waiting in the receive queue on the receiving socket are accounted
* against the sockets SO_RCVBUF option value. Only the payload bytes in the
* message are accounted for. If the number of bytes queued equals or exceeds
* rcvbuf then the socket is congested. All sends attempted to this socket's
* address should return block or return -EWOULDBLOCK.
*
* Applications are expected to be reasonably tuned such that this situation
* very rarely occurs. An application encountering this "back-pressure" is
* considered a bug.
*
* This is implemented by having each node maintain bitmaps which indicate
* which ports on bound addresses are congested. As the bitmap changes it is
* sent through all the connections which terminate in the local address of the
* bitmap which changed.
*
* The bitmaps are allocated as connections are brought up. This avoids
* allocation in the interrupt handling path which queues messages on sockets.
* The dense bitmaps let transports send the entire bitmap on any bitmap change
* reasonably efficiently. This is much easier to implement than some
* finer-grained communication of per-port congestion. The sender does a very
* inexpensive bit test to test if the port it's about to send to is congested
* or not.
*/
/*
* Interaction with poll is a tad tricky. We want all processes stuck in
* poll to wake up and check whether a congested destination became uncongested.
* The really sad thing is we have no idea which destinations the application
* wants to send to - we don't even know which rds_connections are involved.
* So until we implement a more flexible rds poll interface, we have to make
* do with this:
* We maintain a global counter that is incremented each time a congestion map
* update is received. Each rds socket tracks this value, and if rds_poll
* finds that the saved generation number is smaller than the global generation
* number, it wakes up the process.
*/
static atomic_t rds_cong_generation = ATOMIC_INIT(0);
/*
* Congestion monitoring
*/
static LIST_HEAD(rds_cong_monitor);
static DEFINE_RWLOCK(rds_cong_monitor_lock);
/*
* Yes, a global lock. It's used so infrequently that it's worth keeping it
* global to simplify the locking. It's only used in the following
* circumstances:
*
* - on connection buildup to associate a conn with its maps
* - on map changes to inform conns of a new map to send
*
* It's sadly ordered under the socket callback lock and the connection lock.
* Receive paths can mark ports congested from interrupt context so the
* lock masks interrupts.
*/
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;
static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
struct rds_cong_map *insert)
{
struct rb_node **p = &rds_cong_tree.rb_node;
struct rb_node *parent = NULL;
struct rds_cong_map *map;
while (*p) {
parent = *p;
map = rb_entry(parent, struct rds_cong_map, m_rb_node);
if (addr < map->m_addr)
p = &(*p)->rb_left;
else if (addr > map->m_addr)
p = &(*p)->rb_right;
else
return map;
}
if (insert) {
rb_link_node(&insert->m_rb_node, parent, p);
rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
}
return NULL;
}
/*
* There is only ever one bitmap for any address. Connections try and allocate
* these bitmaps in the process getting pointers to them. The bitmaps are only
* ever freed as the module is removed after all connections have been freed.
*/
static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
{
struct rds_cong_map *map;
struct rds_cong_map *ret = NULL;
unsigned long zp;
unsigned long i;
unsigned long flags;
map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
if (!map)
return NULL;
map->m_addr = addr;
init_waitqueue_head(&map->m_waitq);
INIT_LIST_HEAD(&map->m_conn_list);
for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
zp = get_zeroed_page(GFP_KERNEL);
if (zp == 0)
goto out;
map->m_page_addrs[i] = zp;
}
spin_lock_irqsave(&rds_cong_lock, flags);
ret = rds_cong_tree_walk(addr, map);
spin_unlock_irqrestore(&rds_cong_lock, flags);
if (!ret) {
ret = map;
map = NULL;
}
out:
if (map) {
for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
free_page(map->m_page_addrs[i]);
kfree(map);
}
rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
return ret;
}
/*
* Put the conn on its local map's list. This is called when the conn is
* really added to the hash. It's nested under the rds_conn_lock, sadly.
*/
void rds_cong_add_conn(struct rds_connection *conn)
{
unsigned long flags;
rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
spin_lock_irqsave(&rds_cong_lock, flags);
list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
spin_unlock_irqrestore(&rds_cong_lock, flags);
}
void rds_cong_remove_conn(struct rds_connection *conn)
{
unsigned long flags;
rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
spin_lock_irqsave(&rds_cong_lock, flags);
list_del_init(&conn->c_map_item);
spin_unlock_irqrestore(&rds_cong_lock, flags);
}
int rds_cong_get_maps(struct rds_connection *conn)
{
conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM;
return 0;
}
void rds_cong_queue_updates(struct rds_cong_map *map)
{
struct rds_connection *conn;
unsigned long flags;
spin_lock_irqsave(&rds_cong_lock, flags);
list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
if (!test_and_set_bit(0, &conn->c_map_queued)) {
rds_stats_inc(s_cong_update_queued);
rds_send_xmit(conn);
}
}
spin_unlock_irqrestore(&rds_cong_lock, flags);
}
void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
{
rdsdebug("waking map %p for %pI4\n",
map, &map->m_addr);
rds_stats_inc(s_cong_update_received);
atomic_inc(&rds_cong_generation);
if (waitqueue_active(&map->m_waitq))
wake_up(&map->m_waitq);
if (waitqueue_active(&rds_poll_waitq))
wake_up_all(&rds_poll_waitq);
if (portmask && !list_empty(&rds_cong_monitor)) {
unsigned long flags;
struct rds_sock *rs;
read_lock_irqsave(&rds_cong_monitor_lock, flags);
list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
spin_lock(&rs->rs_lock);
rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
rs->rs_cong_mask &= ~portmask;
spin_unlock(&rs->rs_lock);
if (rs->rs_cong_notify)
rds_wake_sk_sleep(rs);
}
read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
}
EXPORT_SYMBOL_GPL(rds_cong_map_updated);
int rds_cong_updated_since(unsigned long *recent)
{
unsigned long gen = atomic_read(&rds_cong_generation);
if (likely(*recent == gen))
return 0;
*recent = gen;
return 1;
}
/*
* We're called under the locking that protects the sockets receive buffer
* consumption. This makes it a lot easier for the caller to only call us
* when it knows that an existing set bit needs to be cleared, and vice versa.
* We can't block and we need to deal with concurrent sockets working against
* the same per-address map.
*/
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
{
unsigned long i;
unsigned long off;
rdsdebug("setting congestion for %pI4:%u in map %p\n",
&map->m_addr, ntohs(port), map);
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
__set_bit_le(off, (void *)map->m_page_addrs[i]);
}
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
{
unsigned long i;
unsigned long off;
rdsdebug("clearing congestion for %pI4:%u in map %p\n",
&map->m_addr, ntohs(port), map);
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
__clear_bit_le(off, (void *)map->m_page_addrs[i]);
}
static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
{
unsigned long i;
unsigned long off;
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
return test_bit_le(off, (void *)map->m_page_addrs[i]);
}
void rds_cong_add_socket(struct rds_sock *rs)
{
unsigned long flags;
write_lock_irqsave(&rds_cong_monitor_lock, flags);
if (list_empty(&rs->rs_cong_list))
list_add(&rs->rs_cong_list, &rds_cong_monitor);
write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
void rds_cong_remove_socket(struct rds_sock *rs)
{
unsigned long flags;
struct rds_cong_map *map;
write_lock_irqsave(&rds_cong_monitor_lock, flags);
list_del_init(&rs->rs_cong_list);
write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
/* update congestion map for now-closed port */
spin_lock_irqsave(&rds_cong_lock, flags);
map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
spin_unlock_irqrestore(&rds_cong_lock, flags);
if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
rds_cong_clear_bit(map, rs->rs_bound_port);
rds_cong_queue_updates(map);
}
}
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
struct rds_sock *rs)
{
if (!rds_cong_test_bit(map, port))
return 0;
if (nonblock) {
if (rs && rs->rs_cong_monitor) {
unsigned long flags;
/* It would have been nice to have an atomic set_bit on
* a uint64_t. */
spin_lock_irqsave(&rs->rs_lock, flags);
rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
spin_unlock_irqrestore(&rs->rs_lock, flags);
/* Test again - a congestion update may have arrived in
* the meantime. */
if (!rds_cong_test_bit(map, port))
return 0;
}
rds_stats_inc(s_cong_send_error);
return -ENOBUFS;
}
rds_stats_inc(s_cong_send_blocked);
rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
return wait_event_interruptible(map->m_waitq,
!rds_cong_test_bit(map, port));
}
void rds_cong_exit(void)
{
struct rb_node *node;
struct rds_cong_map *map;
unsigned long i;
while ((node = rb_first(&rds_cong_tree))) {
map = rb_entry(node, struct rds_cong_map, m_rb_node);
rdsdebug("freeing map %p\n", map);
rb_erase(&map->m_rb_node, &rds_cong_tree);
for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
free_page(map->m_page_addrs[i]);
kfree(map);
}
}
/*
* Allocate a RDS message containing a congestion update.
*/
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
{
struct rds_cong_map *map = conn->c_lcong;
struct rds_message *rm;
rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
if (!IS_ERR(rm))
rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
return rm;
}

577
net/rds/connection.c Normal file
View file

@ -0,0 +1,577 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/inet_hashtables.h>
#include "rds.h"
#include "loop.h"
#define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
/* converting this to RCU is a chore for another day.. */
static DEFINE_SPINLOCK(rds_conn_lock);
static unsigned long rds_conn_count;
static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
static struct kmem_cache *rds_conn_slab;
static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
{
static u32 rds_hash_secret __read_mostly;
unsigned long hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
/* Pass NULL, don't need struct net for hash */
hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
be32_to_cpu(faddr), 0,
rds_hash_secret);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
#define rds_conn_info_set(var, test, suffix) do { \
if (test) \
var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
} while (0)
/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
__be32 laddr, __be32 faddr,
struct rds_transport *trans)
{
struct rds_connection *conn, *ret = NULL;
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
conn->c_trans == trans) {
ret = conn;
break;
}
}
rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
&laddr, &faddr);
return ret;
}
/*
* This is called by transports as they're bringing down a connection.
* It clears partial message state so that the transport can start sending
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
static void rds_conn_reset(struct rds_connection *conn)
{
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
rds_send_reset(conn);
conn->c_flags = 0;
/* Do not clear next_rx_seq here, else we cannot distinguish
* retransmitted packets from new packets, and will hand all
* of them to the application. That is not consistent with the
* reliability guarantees of RDS. */
}
/*
* There is only every one 'conn' for a given pair of addresses in the
* system at a time. They contain messages to be retransmitted and so
* span the lifetime of the actual underlying transport connections.
*
* For now they are not garbage collected once they're created. They
* are torn down as the module is removed, if ever.
*/
static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
rcu_read_lock();
conn = rds_conn_lookup(head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
!is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
* can stick the other QP. */
parent = conn;
conn = parent->c_passive;
}
rcu_read_unlock();
if (conn)
goto out;
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
goto out;
}
INIT_HLIST_NODE(&conn->c_hash_node);
conn->c_laddr = laddr;
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
init_waitqueue_head(&conn->c_waitq);
INIT_LIST_HEAD(&conn->c_send_queue);
INIT_LIST_HEAD(&conn->c_retrans);
ret = rds_cong_get_maps(conn);
if (ret) {
kmem_cache_free(rds_conn_slab, conn);
conn = ERR_PTR(ret);
goto out;
}
/*
* This is where a connection becomes loopback. If *any* RDS sockets
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
loop_trans = rds_trans_get_preferred(faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
if (is_outgoing && trans->t_prefer_loopback) {
/* "outgoing" connection - and the transport
* says it wants the connection handled by the
* loopback transport. This is what TCP does.
*/
trans = &rds_loop_transport;
}
}
conn->c_trans = trans;
ret = trans->conn_alloc(conn, gfp);
if (ret) {
kmem_cache_free(rds_conn_slab, conn);
conn = ERR_PTR(ret);
goto out;
}
atomic_set(&conn->c_state, RDS_CONN_DOWN);
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
mutex_init(&conn->c_cm_lock);
conn->c_flags = 0;
rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
conn, &laddr, &faddr,
trans->t_name ? trans->t_name : "[unknown]",
is_outgoing ? "(outgoing)" : "");
/*
* Since we ran without holding the conn lock, someone could
* have created the same conn (either normal or passive) in the
* interim. We check while holding the lock. If we won, we complete
* init and return our conn. If we lost, we rollback and return the
* other one.
*/
spin_lock_irqsave(&rds_conn_lock, flags);
if (parent) {
/* Creating passive conn */
if (parent->c_passive) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = parent->c_passive;
} else {
parent->c_passive = conn;
rds_cong_add_conn(conn);
rds_conn_count++;
}
} else {
/* Creating normal conn */
struct rds_connection *found;
found = rds_conn_lookup(head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
hlist_add_head_rcu(&conn->c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
}
spin_unlock_irqrestore(&rds_conn_lock, flags);
out:
return conn;
}
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
void rds_conn_shutdown(struct rds_connection *conn)
{
/* shut it down unless it's down already */
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
/*
* Quiesce the connection mgmt handlers before we start tearing
* things down. We don't hold the mutex for the entire
* duration of the shutdown operation, else we may be
* deadlocking with the CM handler. Instead, the CM event
* handler is supposed to check for state DISCONNECTING
*/
mutex_lock(&conn->c_cm_lock);
if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
&& !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
rds_conn_error(conn, "shutdown called in state %d\n",
atomic_read(&conn->c_state));
mutex_unlock(&conn->c_cm_lock);
return;
}
mutex_unlock(&conn->c_cm_lock);
wait_event(conn->c_waitq,
!test_bit(RDS_IN_XMIT, &conn->c_flags));
conn->c_trans->conn_shutdown(conn);
rds_conn_reset(conn);
if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
/* This can happen - eg when we're in the middle of tearing
* down the connection, and someone unloads the rds module.
* Quite reproduceable with loopback connections.
* Mostly harmless.
*/
rds_conn_error(conn,
"%s: failed to transition to state DOWN, "
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
return;
}
}
/* Then reconnect if it's still live.
* The passive side of an IB loopback connection is never added
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&conn->c_conn_w);
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
}
/*
* Stop and free a connection.
*
* This can only be used in very limited circumstances. It assumes that once
* the conn has been shutdown that no one else is referencing the connection.
* We can only ensure this in the rmmod path in the current code.
*/
void rds_conn_destroy(struct rds_connection *conn)
{
struct rds_message *rm, *rtmp;
unsigned long flags;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr);
/* Ensure conn will not be scheduled for reconnect */
spin_lock_irq(&rds_conn_lock);
hlist_del_init_rcu(&conn->c_hash_node);
spin_unlock_irq(&rds_conn_lock);
synchronize_rcu();
/* shut the connection down */
rds_conn_drop(conn);
flush_work(&conn->c_down_w);
/* make sure lingering queued work won't try to ref the conn */
cancel_delayed_work_sync(&conn->c_send_w);
cancel_delayed_work_sync(&conn->c_recv_w);
/* tear down queued messages */
list_for_each_entry_safe(rm, rtmp,
&conn->c_send_queue,
m_conn_item) {
list_del_init(&rm->m_conn_item);
BUG_ON(!list_empty(&rm->m_sock_item));
rds_message_put(rm);
}
if (conn->c_xmit_rm)
rds_message_put(conn->c_xmit_rm);
conn->c_trans->conn_free(conn->c_transport_data);
/*
* The congestion maps aren't freed up here. They're
* freed by rds_cong_exit() after all the connections
* have been freed.
*/
rds_cong_remove_conn(conn);
BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
spin_lock_irqsave(&rds_conn_lock, flags);
rds_conn_count--;
spin_unlock_irqrestore(&rds_conn_lock, flags);
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);
static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
int want_send)
{
struct hlist_head *head;
struct list_head *list;
struct rds_connection *conn;
struct rds_message *rm;
unsigned int total = 0;
unsigned long flags;
size_t i;
len /= sizeof(struct rds_info_message);
rcu_read_lock();
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
if (want_send)
list = &conn->c_send_queue;
else
list = &conn->c_retrans;
spin_lock_irqsave(&conn->c_lock, flags);
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
total++;
if (total <= len)
rds_inc_info_copy(&rm->m_inc, iter,
conn->c_laddr,
conn->c_faddr, 0);
}
spin_unlock_irqrestore(&conn->c_lock, flags);
}
}
rcu_read_unlock();
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
}
static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_conn_message_info(sock, len, iter, lens, 1);
}
static void rds_conn_message_info_retrans(struct socket *sock,
unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_conn_message_info(sock, len, iter, lens, 0);
}
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len)
{
uint64_t buffer[(item_len + 7) / 8];
struct hlist_head *head;
struct rds_connection *conn;
size_t i;
rcu_read_lock();
lens->nr = 0;
lens->each = item_len;
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
continue;
/* We copy as much as we can fit in the buffer,
* but we count all items so that the caller
* can resize the buffer. */
if (len >= item_len) {
rds_info_copy(iter, buffer, item_len);
len -= item_len;
}
lens->nr++;
}
}
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
static int rds_conn_info_visitor(struct rds_connection *conn,
void *buffer)
{
struct rds_info_connection *cinfo = buffer;
cinfo->next_tx_seq = conn->c_next_tx_seq;
cinfo->next_rx_seq = conn->c_next_rx_seq;
cinfo->laddr = conn->c_laddr;
cinfo->faddr = conn->c_faddr;
strncpy(cinfo->transport, conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
CONNECTING);
rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_UP,
CONNECTED);
return 1;
}
static void rds_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_for_each_conn_info(sock, len, iter, lens,
rds_conn_info_visitor,
sizeof(struct rds_info_connection));
}
int rds_conn_init(void)
{
rds_conn_slab = kmem_cache_create("rds_connection",
sizeof(struct rds_connection),
0, 0, NULL);
if (!rds_conn_slab)
return -ENOMEM;
rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
rds_info_register_func(RDS_INFO_SEND_MESSAGES,
rds_conn_message_info_send);
rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
return 0;
}
void rds_conn_exit(void)
{
rds_loop_exit();
WARN_ON(!hlist_empty(rds_conn_hash));
kmem_cache_destroy(rds_conn_slab);
rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
rds_conn_message_info_send);
rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
}
/*
* Force a disconnect
*/
void rds_conn_drop(struct rds_connection *conn)
{
atomic_set(&conn->c_state, RDS_CONN_ERROR);
queue_work(rds_wq, &conn->c_down_w);
}
EXPORT_SYMBOL_GPL(rds_conn_drop);
/*
* If the connection is down, trigger a connect. We may have scheduled a
* delayed reconnect however - in this case we should not interfere.
*/
void rds_conn_connect_if_down(struct rds_connection *conn)
{
if (rds_conn_state(conn) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
}
EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
/*
* An error occurred on the connection
*/
void
__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vprintk(fmt, ap);
va_end(ap);
rds_conn_drop(conn);
}

435
net/rds/ib.c Normal file
View file

@ -0,0 +1,435 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
#include "rds.h"
#include "ib.h"
static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
/*
* we have a clumsy combination of RCU and a rwsem protecting this list
* because it is used both in the get_mr fast path and while blocking in
* the FMR flushing path.
*/
DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
static void rds_ib_nodev_connect(void)
{
struct rds_ib_connection *ic;
spin_lock(&ib_nodev_conns_lock);
list_for_each_entry(ic, &ib_nodev_conns, ib_node)
rds_conn_connect_if_down(ic->conn);
spin_unlock(&ib_nodev_conns_lock);
}
static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_connection *ic;
unsigned long flags;
spin_lock_irqsave(&rds_ibdev->spinlock, flags);
list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
rds_conn_drop(ic->conn);
spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
}
/*
* rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
* from interrupt context so we push freing off into a work struct in krdsd.
*/
static void rds_ib_dev_free(struct work_struct *work)
{
struct rds_ib_ipaddr *i_ipaddr, *i_next;
struct rds_ib_device *rds_ibdev = container_of(work,
struct rds_ib_device, free_work);
if (rds_ibdev->mr_pool)
rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
if (rds_ibdev->mr)
ib_dereg_mr(rds_ibdev->mr);
if (rds_ibdev->pd)
ib_dealloc_pd(rds_ibdev->pd);
list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
list_del(&i_ipaddr->list);
kfree(i_ipaddr);
}
kfree(rds_ibdev);
}
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
{
BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
if (atomic_dec_and_test(&rds_ibdev->refcount))
queue_work(rds_wq, &rds_ibdev->free_work);
}
static void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
struct ib_device_attr *dev_attr;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
return;
dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
if (!dev_attr)
return;
if (ib_query_device(device, dev_attr)) {
rdsdebug("Query device failed for %s\n", device->name);
goto free_attr;
}
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
goto free_attr;
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
fmr_pool_size;
rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_ibdev->pd)) {
rds_ibdev->pd = NULL;
goto put_dev;
}
rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(rds_ibdev->mr)) {
rds_ibdev->mr = NULL;
goto put_dev;
}
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL;
goto put_dev;
}
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
down_write(&rds_ib_devices_lock);
list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
up_write(&rds_ib_devices_lock);
atomic_inc(&rds_ibdev->refcount);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
atomic_inc(&rds_ibdev->refcount);
rds_ib_nodev_connect();
put_dev:
rds_ib_dev_put(rds_ibdev);
free_attr:
kfree(dev_attr);
}
/*
* New connections use this to find the device to associate with the
* connection. It's not in the fast path so we're not concerned about the
* performance of the IB call. (As of this writing, it uses an interrupt
* blocking spinlock to serialize walking a per-device list of all registered
* clients.)
*
* RCU is used to handle incoming connections racing with device teardown.
* Rather than use a lock to serialize removal from the client_data and
* getting a new reference, we use an RCU grace period. The destruction
* path removes the device from client_data and then waits for all RCU
* readers to finish.
*
* A new connection can get NULL from this if its arriving on a
* device that is in the process of being removed.
*/
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
rcu_read_lock();
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (rds_ibdev)
atomic_inc(&rds_ibdev->refcount);
rcu_read_unlock();
return rds_ibdev;
}
/*
* The IB stack is letting us know that a device is going away. This can
* happen if the underlying HCA driver is removed or if PCI hotplug is removing
* the pci function, for example.
*
* This can be called at any time and can be racing with any other RDS path.
*/
static void rds_ib_remove_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (!rds_ibdev)
return;
rds_ib_dev_shutdown(rds_ibdev);
/* stop connection attempts from getting a reference to this device. */
ib_set_client_data(device, &rds_ib_client, NULL);
down_write(&rds_ib_devices_lock);
list_del_rcu(&rds_ibdev->list);
up_write(&rds_ib_devices_lock);
/*
* This synchronize rcu is waiting for readers of both the ib
* client data and the devices list to finish before we drop
* both of those references.
*/
synchronize_rcu();
rds_ib_dev_put(rds_ibdev);
rds_ib_dev_put(rds_ibdev);
}
struct ib_client rds_ib_client = {
.name = "rds_ib",
.add = rds_ib_add_one,
.remove = rds_ib_remove_one
};
static int rds_ib_conn_info_visitor(struct rds_connection *conn,
void *buffer)
{
struct rds_info_rdma_connection *iinfo = buffer;
struct rds_ib_connection *ic;
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
return 0;
iinfo->src_addr = conn->c_laddr;
iinfo->dst_addr = conn->c_faddr;
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
if (rds_conn_state(conn) == RDS_CONN_UP) {
struct rds_ib_device *rds_ibdev;
struct rdma_dev_addr *dev_addr;
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
rds_ibdev = ic->rds_ibdev;
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
rds_ib_get_mr_info(rds_ibdev, iinfo);
}
return 1;
}
static void rds_ib_ic_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_for_each_conn_info(sock, len, iter, lens,
rds_ib_conn_info_visitor,
sizeof(struct rds_info_rdma_connection));
}
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
* device with that address set.
*
* If it were me, I'd advocate for something more flexible. Sending and
* receiving should be device-agnostic. Transports would try and maintain
* connections between peers who have messages queued. Userspace would be
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
static int rds_ib_laddr_check(__be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
struct sockaddr_in sin;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = addr;
/* rdma_bind_addr will only succeed for IB & iWARP devices */
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
if (ret || !cm_id->device ||
cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
&addr, ret,
cm_id->device ? cm_id->device->node_type : -1);
rdma_destroy_id(cm_id);
return ret;
}
static void rds_ib_unregister_client(void)
{
ib_unregister_client(&rds_ib_client);
/* wait for rds_ib_dev_free() to complete */
flush_workqueue(rds_wq);
}
void rds_ib_exit(void)
{
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
}
struct rds_transport rds_ib_transport = {
.laddr_check = rds_ib_laddr_check,
.xmit_complete = rds_ib_xmit_complete,
.xmit = rds_ib_xmit,
.xmit_rdma = rds_ib_xmit_rdma,
.xmit_atomic = rds_ib_xmit_atomic,
.recv = rds_ib_recv,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
.conn_connect = rds_ib_conn_connect,
.conn_shutdown = rds_ib_conn_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
.cm_handle_connect = rds_ib_cm_handle_connect,
.cm_connect_complete = rds_ib_cm_connect_complete,
.stats_info_copy = rds_ib_stats_info_copy,
.exit = rds_ib_exit,
.get_mr = rds_ib_get_mr,
.sync_mr = rds_ib_sync_mr,
.free_mr = rds_ib_free_mr,
.flush_mrs = rds_ib_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
.t_type = RDS_TRANS_IB
};
int rds_ib_init(void)
{
int ret;
INIT_LIST_HEAD(&rds_ib_devices);
ret = ib_register_client(&rds_ib_client);
if (ret)
goto out;
ret = rds_ib_sysctl_init();
if (ret)
goto out_ibreg;
ret = rds_ib_recv_init();
if (ret)
goto out_sysctl;
ret = rds_trans_register(&rds_ib_transport);
if (ret)
goto out_recv;
rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
goto out;
out_recv:
rds_ib_recv_exit();
out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
out:
return ret;
}
MODULE_LICENSE("GPL");

373
net/rds/ib.h Normal file
View file

@ -0,0 +1,373 @@
#ifndef _RDS_IB_H
#define _RDS_IB_H
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FMR_SIZE 256
#define RDS_FMR_POOL_SIZE 8192
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
#define RDS_IB_DEFAULT_RETRY_COUNT 2
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
#define RDS_IB_RECYCLE_BATCH_COUNT 32
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
/*
* IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
* try and minimize the amount of memory tied up both the device and
* socket receive queues.
*/
struct rds_page_frag {
struct list_head f_item;
struct list_head f_cache_entry;
struct scatterlist f_sg;
};
struct rds_ib_incoming {
struct list_head ii_frags;
struct list_head ii_cache_entry;
struct rds_incoming ii_inc;
};
struct rds_ib_cache_head {
struct list_head *first;
unsigned long count;
};
struct rds_ib_refill_cache {
struct rds_ib_cache_head __percpu *percpu;
struct list_head *xfer;
struct list_head *ready;
};
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr;
__be32 dp_daddr;
u8 dp_protocol_major;
u8 dp_protocol_minor;
__be16 dp_protocol_minor_mask; /* bitmask */
__be32 dp_reserved1;
__be64 dp_ack_seq;
__be32 dp_credit; /* non-zero enables flow ctl */
};
struct rds_ib_send_work {
void *s_op;
struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
};
struct rds_ib_recv_work {
struct rds_ib_incoming *r_ibinc;
struct rds_page_frag *r_frag;
struct ib_recv_wr r_wr;
struct ib_sge r_sge[2];
};
struct rds_ib_work_ring {
u32 w_nr;
u32 w_alloc_ptr;
u32 w_alloc_ctr;
u32 w_free_ptr;
atomic_t w_free_ctr;
};
struct rds_ib_device;
struct rds_ib_connection {
struct list_head ib_node;
struct rds_ib_device *rds_ibdev;
struct rds_connection *conn;
/* alphabet soup, IBTA style */
struct rdma_cm_id *i_cm_id;
struct ib_pd *i_pd;
struct ib_mr *i_mr;
struct ib_cq *i_send_cq;
struct ib_cq *i_recv_cq;
/* tx */
struct rds_ib_work_ring i_send_ring;
struct rm_data_op *i_data_op;
struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
atomic_t i_signaled_sends;
/* rx */
struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
u32 i_recv_data_rem;
struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
u64 i_ack_recv; /* last ACK received */
struct rds_ib_refill_cache i_cache_incs;
struct rds_ib_refill_cache i_cache_frags;
/* sending acks */
unsigned long i_ack_flags;
#ifdef KERNEL_HAS_ATOMIC64
atomic64_t i_ack_next; /* next ACK to send */
#else
spinlock_t i_ack_lock; /* protect i_ack_next */
u64 i_ack_next; /* next ACK to send */
#endif
struct rds_header *i_ack;
struct ib_send_wr i_ack_wr;
struct ib_sge i_ack_sge;
u64 i_ack_dma;
unsigned long i_ack_queued;
/* Flow control related information
*
* Our algorithm uses a pair variables that we need to access
* atomically - one for the send credits, and one posted
* recv credits we need to transfer to remote.
* Rather than protect them using a slow spinlock, we put both into
* a single atomic_t and update it using cmpxchg
*/
atomic_t i_credits;
/* Protocol version specific information */
unsigned int i_flowctl:1; /* enable/disable flow ctl */
/* Batched completions */
unsigned int i_unsignaled_wrs;
};
/* This assumes that atomic_t is at least 32 bits */
#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_GET_POST_CREDITS(v) ((v) >> 16)
#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_SET_POST_CREDITS(v) ((v) << 16)
struct rds_ib_ipaddr {
struct list_head list;
__be32 ipaddr;
};
struct rds_ib_device {
struct list_head list;
struct list_head ipaddr_list;
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_ib_mr_pool *mr_pool;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */
atomic_t refcount;
struct work_struct free_work;
};
#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1
/* Magic WR_ID for ACKs */
#define RDS_IB_ACK_WR_ID (~(u64) 0)
struct rds_ib_statistics {
uint64_t s_ib_connect_raced;
uint64_t s_ib_listen_closed_stale;
uint64_t s_ib_tx_cq_call;
uint64_t s_ib_tx_cq_event;
uint64_t s_ib_tx_ring_full;
uint64_t s_ib_tx_throttle;
uint64_t s_ib_tx_sg_mapping_failure;
uint64_t s_ib_tx_stalled;
uint64_t s_ib_tx_credit_updates;
uint64_t s_ib_rx_cq_call;
uint64_t s_ib_rx_cq_event;
uint64_t s_ib_rx_ring_empty;
uint64_t s_ib_rx_refill_from_cq;
uint64_t s_ib_rx_refill_from_thread;
uint64_t s_ib_rx_alloc_limit;
uint64_t s_ib_rx_credit_updates;
uint64_t s_ib_ack_sent;
uint64_t s_ib_ack_send_failure;
uint64_t s_ib_ack_send_delayed;
uint64_t s_ib_ack_send_piggybacked;
uint64_t s_ib_ack_received;
uint64_t s_ib_rdma_mr_alloc;
uint64_t s_ib_rdma_mr_free;
uint64_t s_ib_rdma_mr_used;
uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
extern struct workqueue_struct *rds_ib_wq;
/*
* Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
* doesn't define it.
*/
static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
struct scatterlist *sg, unsigned int sg_dma_len, int direction)
{
unsigned int i;
for (i = 0; i < sg_dma_len; ++i) {
ib_dma_sync_single_for_cpu(dev,
ib_sg_dma_address(dev, &sg[i]),
ib_sg_dma_len(dev, &sg[i]),
direction);
}
}
#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
struct scatterlist *sg, unsigned int sg_dma_len, int direction)
{
unsigned int i;
for (i = 0; i < sg_dma_len; ++i) {
ib_dma_sync_single_for_device(dev,
ib_sg_dma_address(dev, &sg[i]),
ib_sg_dma_len(dev, &sg[i]),
direction);
}
}
#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
/* ib.c */
extern struct rds_transport rds_ib_transport;
struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
extern unsigned int fmr_message_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;
/* ib_cm.c */
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
void rds_ib_conn_free(void *arg);
int rds_ib_conn_connect(struct rds_connection *conn);
void rds_ib_conn_shutdown(struct rds_connection *conn);
void rds_ib_state_change(struct sock *sk);
int rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
#define rds_ib_conn_error(conn, fmt...) \
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
/* ib_rdma.c */
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
/* ib_recv.c */
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
void rds_ib_inc_free(struct rds_incoming *inc);
int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_recv_tasklet_fn(unsigned long data);
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
void rds_ib_attempt_ack(struct rds_ib_connection *ic);
void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
/* ib_ring.c */
void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
int rds_ib_ring_low(struct rds_ib_work_ring *ring);
u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
u32 *adv_credits, int need_posted, int max_posted);
int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
/* ib_sysctl.c */
int rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr;
extern unsigned long rds_ib_sysctl_max_unsig_wrs;
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;
#endif

829
net/rds/ib_cm.c Normal file
View file

@ -0,0 +1,829 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include "rds.h"
#include "ib.h"
static char *rds_ib_event_type_strings[] = {
#define RDS_IB_EVENT_STRING(foo) \
[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
RDS_IB_EVENT_STRING(CQ_ERR),
RDS_IB_EVENT_STRING(QP_FATAL),
RDS_IB_EVENT_STRING(QP_REQ_ERR),
RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
RDS_IB_EVENT_STRING(COMM_EST),
RDS_IB_EVENT_STRING(SQ_DRAINED),
RDS_IB_EVENT_STRING(PATH_MIG),
RDS_IB_EVENT_STRING(PATH_MIG_ERR),
RDS_IB_EVENT_STRING(DEVICE_FATAL),
RDS_IB_EVENT_STRING(PORT_ACTIVE),
RDS_IB_EVENT_STRING(PORT_ERR),
RDS_IB_EVENT_STRING(LID_CHANGE),
RDS_IB_EVENT_STRING(PKEY_CHANGE),
RDS_IB_EVENT_STRING(SM_CHANGE),
RDS_IB_EVENT_STRING(SRQ_ERR),
RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
#undef RDS_IB_EVENT_STRING
};
static char *rds_ib_event_str(enum ib_event_type type)
{
return rds_str_array(rds_ib_event_type_strings,
ARRAY_SIZE(rds_ib_event_type_strings), type);
};
/*
* Set the selected protocol version
*/
static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
{
conn->c_version = version;
}
/*
* Set up flow control
*/
static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
{
struct rds_ib_connection *ic = conn->c_transport_data;
if (rds_ib_sysctl_flow_control && credits != 0) {
/* We're doing flow control */
ic->i_flowctl = 1;
rds_ib_send_add_credits(conn, credits);
} else {
ic->i_flowctl = 0;
}
}
/*
* Tune RNR behavior. Without flow control, we use a rather
* low timeout, but not the absolute minimum - this should
* be tunable.
*
* We already set the RNR retry count to 7 (which is the
* smallest infinite number :-) above.
* If flow control is off, we want to change this back to 0
* so that we learn quickly when our credit accounting is
* buggy.
*
* Caller passes in a qp_attr pointer - don't waste stack spacv
* by allocation this twice.
*/
static void
rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
{
int ret;
attr->min_rnr_timer = IB_RNR_TIMER_000_32;
ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
if (ret)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
}
/*
* Connection established.
* We get here for both outgoing and incoming connection.
*/
void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
{
const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_qp_attr qp_attr;
int err;
if (event->param.conn.private_data_len >= sizeof(*dp)) {
dp = event->param.conn.private_data;
/* make sure it isn't empty data */
if (dp->dp_protocol_major) {
rds_ib_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
dp->dp_protocol_minor));
rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
}
}
if (conn->c_version < RDS_PROTOCOL(3,1)) {
printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
" no longer supported\n",
&conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
rds_conn_destroy(conn);
return;
} else {
printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
&conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
}
/*
* Init rings and fill recv. this needs to wait until protocol negotiation
* is complete, since ring layout is different from 3.0 to 3.1.
*/
rds_ib_send_init_ring(ic);
rds_ib_recv_init_ring(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
rds_ib_recv_refill(conn, 1);
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
qp_attr.qp_state = IB_QPS_RTS;
err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
/* update ib_device with this local ipaddr */
err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
if (err)
printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
err);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp && dp->dp_ack_seq)
rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
rds_connect_complete(conn);
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp,
u32 protocol_version,
u32 max_responder_resources,
u32 max_initiator_depth)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
memset(conn_param, 0, sizeof(struct rdma_conn_param));
conn_param->responder_resources =
min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
conn_param->initiator_depth =
min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
conn_param->rnr_retry_count = 7;
if (dp) {
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
/* Advertise flow control */
if (ic->i_flowctl) {
unsigned int credits;
credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
dp->dp_credit = cpu_to_be32(credits);
atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
}
conn_param->private_data = dp;
conn_param->private_data_len = sizeof(*dp);
}
}
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
rdsdebug("event %u (%s) data %p\n",
event->event, rds_ib_event_str(event->event), data);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
{
struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data;
rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
rds_ib_event_str(event->event));
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
rdsdebug("Fatal QP Event %u (%s) "
"- connection %pI4->%pI4, reconnecting\n",
event->event, rds_ib_event_str(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
break;
}
}
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
*/
static int rds_ib_setup_qp(struct rds_connection *conn)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct ib_device *dev = ic->i_cm_id->device;
struct ib_qp_init_attr attr;
struct rds_ib_device *rds_ibdev;
int ret;
/*
* It's normal to see a null device if an incoming connection races
* with device removal, so we don't print a warning.
*/
rds_ibdev = rds_ib_get_client_data(dev);
if (!rds_ibdev)
return -EOPNOTSUPP;
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
ic->i_mr = rds_ibdev->mr;
ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
rds_ib_cq_event_handler, conn,
ic->i_send_ring.w_nr + 1, 0);
if (IS_ERR(ic->i_send_cq)) {
ret = PTR_ERR(ic->i_send_cq);
ic->i_send_cq = NULL;
rdsdebug("ib_create_cq send failed: %d\n", ret);
goto out;
}
ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
rds_ib_cq_event_handler, conn,
ic->i_recv_ring.w_nr, 0);
if (IS_ERR(ic->i_recv_cq)) {
ret = PTR_ERR(ic->i_recv_cq);
ic->i_recv_cq = NULL;
rdsdebug("ib_create_cq recv failed: %d\n", ret);
goto out;
}
ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
if (ret) {
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
goto out;
}
ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
if (ret) {
rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
goto out;
}
/* XXX negotiate max send/recv with remote? */
memset(&attr, 0, sizeof(attr));
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
attr.sq_sig_type = IB_SIGNAL_REQ_WR;
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
* to back off until we get a value that the hardware can support?
*/
ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
if (ret) {
rdsdebug("rdma_create_qp failed: %d\n", ret);
goto out;
}
ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
}
ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
}
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
ibdev_to_node(dev));
if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
ibdev_to_node(dev));
if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
}
rds_ib_recv_init_ack(ic);
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
out:
rds_ib_dev_put(rds_ibdev);
return ret;
}
static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
{
const struct rds_ib_connect_private *dp = event->param.conn.private_data;
u16 common;
u32 version = 0;
/*
* rdma_cm private data is odd - when there is any private data in the
* request, we will be given a pretty large buffer without telling us the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
* from an older version. This could could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-(
*/
/* Be paranoid. RDS always has privdata */
if (!event->param.conn.private_data_len) {
printk(KERN_NOTICE "RDS incoming connection has no private data, "
"rejecting\n");
return 0;
}
/* Even if len is crap *now* I still want to check it. -ASG */
if (event->param.conn.private_data_len < sizeof (*dp) ||
dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
if (dp->dp_protocol_major == 3 && common) {
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
} else
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
&dp->dp_saddr,
dp->dp_protocol_major,
dp->dp_protocol_minor);
return version;
}
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
const struct rds_ib_connect_private *dp = event->param.conn.private_data;
struct rds_ib_connect_private dp_rep;
struct rds_connection *conn = NULL;
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
u32 version;
int err = 1, destroy = 1;
/* Check whether the remote protocol version matches ours. */
version = rds_ib_protocol_compatible(event);
if (!version)
goto out;
rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
"0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
goto out;
}
/*
* The connection request may occur while the
* previous connection exist, e.g. in case of failover.
* But as connections may be initiated simultaneously
* by both hosts, we have a random backoff mechanism -
* see the comment above rds_queue_reconnect()
*/
mutex_lock(&conn->c_cm_lock);
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
if (rds_conn_state(conn) == RDS_CONN_UP) {
rdsdebug("incoming connect while connecting\n");
rds_conn_drop(conn);
rds_ib_stats_inc(s_ib_listen_closed_stale);
} else
if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
/* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
}
goto out;
}
ic = conn->c_transport_data;
rds_ib_set_protocol(conn, version);
rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp->dp_ack_seq)
rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
BUG_ON(cm_id->context);
BUG_ON(ic->i_cm_id);
ic->i_cm_id = cm_id;
cm_id->context = conn;
/* We got halfway through setting up the ib_connection, if we
* fail now, we have to take the long route out of this mess. */
destroy = 0;
err = rds_ib_setup_qp(conn);
if (err) {
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
goto out;
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
event->param.conn.responder_resources,
event->param.conn.initiator_depth);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
if (err)
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
out:
if (conn)
mutex_unlock(&conn->c_cm_lock);
if (err)
rdma_reject(cm_id, NULL, 0);
return destroy;
}
int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
struct rds_connection *conn = cm_id->context;
struct rds_ib_connection *ic = conn->c_transport_data;
struct rdma_conn_param conn_param;
struct rds_ib_connect_private dp;
int ret;
/* If the peer doesn't do protocol negotiation, we must
* default to RDSv3.0 */
rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
ret = rds_ib_setup_qp(conn);
if (ret) {
rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
goto out;
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
UINT_MAX, UINT_MAX);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
out:
/* Beware - returning non-zero tells the rdma_cm to destroy
* the cm_id. We should certainly not do it as long as we still
* "own" the cm_id. */
if (ret) {
if (ic->i_cm_id == cm_id)
ret = 0;
}
return ret;
}
int rds_ib_conn_connect(struct rds_connection *conn)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct sockaddr_in src, dest;
int ret;
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
rdsdebug("rdma_create_id() failed: %d\n", ret);
goto out;
}
rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
src.sin_family = AF_INET;
src.sin_addr.s_addr = (__force u32)conn->c_laddr;
src.sin_port = (__force u16)htons(0);
dest.sin_family = AF_INET;
dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
dest.sin_port = (__force u16)htons(RDS_PORT);
ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
(struct sockaddr *)&dest,
RDS_RDMA_RESOLVE_TIMEOUT_MS);
if (ret) {
rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
ret);
rdma_destroy_id(ic->i_cm_id);
ic->i_cm_id = NULL;
}
out:
return ret;
}
/*
* This is so careful about only cleaning up resources that were built up
* so that it can be called at any point during startup. In fact it
* can be called multiple times for a given connection.
*/
void rds_ib_conn_shutdown(struct rds_connection *conn)
{
struct rds_ib_connection *ic = conn->c_transport_data;
int err = 0;
rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
ic->i_cm_id ? ic->i_cm_id->qp : NULL);
if (ic->i_cm_id) {
struct ib_device *dev = ic->i_cm_id->device;
rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
err = rdma_disconnect(ic->i_cm_id);
if (err) {
/* Actually this may happen quite frequently, when
* an outgoing connect raced with an incoming connect.
*/
rdsdebug("failed to disconnect, cm: %p err %d\n",
ic->i_cm_id, err);
}
/*
* We want to wait for tx and rx completion to finish
* before we tear down the connection, but we have to be
* careful not to get stuck waiting on a send ring that
* only has unsignaled sends in it. We've shutdown new
* sends before getting here so by waiting for signaled
* sends to complete we're ensured that there will be no
* more tx processing.
*/
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0));
tasklet_kill(&ic->i_recv_tasklet);
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
ic->i_send_hdrs,
ic->i_send_hdrs_dma);
if (ic->i_recv_hdrs)
ib_dma_free_coherent(dev,
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
ic->i_recv_hdrs,
ic->i_recv_hdrs_dma);
if (ic->i_ack)
ib_dma_free_coherent(dev, sizeof(struct rds_header),
ic->i_ack, ic->i_ack_dma);
if (ic->i_sends)
rds_ib_send_clear_ring(ic);
if (ic->i_recvs)
rds_ib_recv_clear_ring(ic);
if (ic->i_cm_id->qp)
rdma_destroy_qp(ic->i_cm_id);
if (ic->i_send_cq)
ib_destroy_cq(ic->i_send_cq);
if (ic->i_recv_cq)
ib_destroy_cq(ic->i_recv_cq);
rdma_destroy_id(ic->i_cm_id);
/*
* Move connection back to the nodev list.
*/
if (ic->rds_ibdev)
rds_ib_remove_conn(ic->rds_ibdev, conn);
ic->i_cm_id = NULL;
ic->i_pd = NULL;
ic->i_mr = NULL;
ic->i_send_cq = NULL;
ic->i_recv_cq = NULL;
ic->i_send_hdrs = NULL;
ic->i_recv_hdrs = NULL;
ic->i_ack = NULL;
}
BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */
if (ic->i_data_op) {
struct rds_message *rm;
rm = container_of(ic->i_data_op, struct rds_message, data);
rds_message_put(rm);
ic->i_data_op = NULL;
}
/* Clear the ACK state */
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
#ifdef KERNEL_HAS_ATOMIC64
atomic64_set(&ic->i_ack_next, 0);
#else
ic->i_ack_next = 0;
#endif
ic->i_ack_recv = 0;
/* Clear flow control state */
ic->i_flowctl = 0;
atomic_set(&ic->i_credits, 0);
rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
if (ic->i_ibinc) {
rds_inc_put(&ic->i_ibinc->ii_inc);
ic->i_ibinc = NULL;
}
vfree(ic->i_sends);
ic->i_sends = NULL;
vfree(ic->i_recvs);
ic->i_recvs = NULL;
}
int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_ib_connection *ic;
unsigned long flags;
int ret;
/* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
if (!ic)
return -ENOMEM;
ret = rds_ib_recv_alloc_caches(ic);
if (ret) {
kfree(ic);
return ret;
}
INIT_LIST_HEAD(&ic->ib_node);
tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
(unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
atomic_set(&ic->i_signaled_sends, 0);
/*
* rds_ib_conn_shutdown() waits for these to be emptied so they
* must be initialized before it can be called.
*/
rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
ic->conn = conn;
conn->c_transport_data = ic;
spin_lock_irqsave(&ib_nodev_conns_lock, flags);
list_add_tail(&ic->ib_node, &ib_nodev_conns);
spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
return 0;
}
/*
* Free a connection. Connection must be shut down and not set for reconnect.
*/
void rds_ib_conn_free(void *arg)
{
struct rds_ib_connection *ic = arg;
spinlock_t *lock_ptr;
rdsdebug("ic %p\n", ic);
/*
* Conn is either on a dev's list or on the nodev list.
* A race with shutdown() or connect() would cause problems
* (since rds_ibdev would change) but that should never happen.
*/
lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock;
spin_lock_irq(lock_ptr);
list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr);
rds_ib_recv_free_caches(ic);
kfree(ic);
}
/*
* An error occurred on the connection
*/
void
__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
va_list ap;
rds_conn_drop(conn);
va_start(ap, fmt);
vprintk(fmt, ap);
va_end(ap);
}

784
net/rds/ib_rdma.c Normal file
View file

@ -0,0 +1,784 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/llist.h>
#include "rds.h"
#include "ib.h"
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0
/*
* This is stored as mr->r_trans_private.
*/
struct rds_ib_mr {
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr;
struct llist_node llnode;
/* unmap_list is for freeing */
struct list_head unmap_list;
unsigned int remap_count;
struct scatterlist *sg;
unsigned int sg_len;
u64 *dma;
int sg_dma_len;
};
/*
* Our own little FMR pool
*/
struct rds_ib_mr_pool {
struct mutex flush_lock; /* serialize fmr invalidate */
struct delayed_work flush_worker; /* flush worker */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */
struct llist_head drop_list; /* MRs that have reached their max_maps limit */
struct llist_head free_list; /* unused MRs */
struct llist_head clean_list; /* global unused & unamapped MRs */
wait_queue_head_t flush_wait;
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
unsigned long max_items_soft;
unsigned long max_free_pinned;
struct ib_fmr_attr fmr_attr;
};
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr;
rcu_read_lock();
list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
atomic_inc(&rds_ibdev->refcount);
rcu_read_unlock();
return rds_ibdev;
}
}
}
rcu_read_unlock();
return NULL;
}
static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
struct rds_ib_ipaddr *i_ipaddr;
i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
if (!i_ipaddr)
return -ENOMEM;
i_ipaddr->ipaddr = ipaddr;
spin_lock_irq(&rds_ibdev->spinlock);
list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
spin_unlock_irq(&rds_ibdev->spinlock);
return 0;
}
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
struct rds_ib_ipaddr *i_ipaddr;
struct rds_ib_ipaddr *to_free = NULL;
spin_lock_irq(&rds_ibdev->spinlock);
list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
list_del_rcu(&i_ipaddr->list);
to_free = i_ipaddr;
break;
}
}
spin_unlock_irq(&rds_ibdev->spinlock);
if (to_free) {
synchronize_rcu();
kfree(to_free);
}
}
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
if (rds_ibdev_old) {
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
rds_ib_dev_put(rds_ibdev_old);
}
return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
struct rds_ib_connection *ic = conn->c_transport_data;
/* conn was previously on the nodev_conns_list */
spin_lock_irq(&ib_nodev_conns_lock);
BUG_ON(list_empty(&ib_nodev_conns));
BUG_ON(list_empty(&ic->ib_node));
list_del(&ic->ib_node);
spin_lock(&rds_ibdev->spinlock);
list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
spin_unlock(&rds_ibdev->spinlock);
spin_unlock_irq(&ib_nodev_conns_lock);
ic->rds_ibdev = rds_ibdev;
atomic_inc(&rds_ibdev->refcount);
}
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
{
struct rds_ib_connection *ic = conn->c_transport_data;
/* place conn on nodev_conns_list */
spin_lock(&ib_nodev_conns_lock);
spin_lock_irq(&rds_ibdev->spinlock);
BUG_ON(list_empty(&ic->ib_node));
list_del(&ic->ib_node);
spin_unlock_irq(&rds_ibdev->spinlock);
list_add_tail(&ic->ib_node, &ib_nodev_conns);
spin_unlock(&ib_nodev_conns_lock);
ic->rds_ibdev = NULL;
rds_ib_dev_put(rds_ibdev);
}
void rds_ib_destroy_nodev_conns(void)
{
struct rds_ib_connection *ic, *_ic;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(&ib_nodev_conns_lock);
list_splice(&ib_nodev_conns, &tmp_list);
spin_unlock_irq(&ib_nodev_conns_lock);
list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
rds_conn_destroy(ic->conn);
}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_mr_pool *pool;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return ERR_PTR(-ENOMEM);
init_llist_head(&pool->free_list);
init_llist_head(&pool->drop_list);
init_llist_head(&pool->clean_list);
mutex_init(&pool->flush_lock);
init_waitqueue_head(&pool->flush_wait);
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
pool->fmr_attr.page_shift = PAGE_SHIFT;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
/* We never allow more than max_items MRs to be allocated.
* When we exceed more than max_items_soft, we start freeing
* items more aggressively.
* Make sure that max_items > max_items_soft > max_items / 2
*/
pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
pool->max_items = rds_ibdev->max_fmrs;
return pool;
}
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
iinfo->rdma_mr_max = pool->max_items;
iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
}
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
cancel_delayed_work_sync(&pool->flush_worker);
rds_ib_flush_mr_pool(pool, 1, NULL);
WARN_ON(atomic_read(&pool->item_count));
WARN_ON(atomic_read(&pool->free_pinned));
kfree(pool);
}
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
struct llist_node *ret;
unsigned long *flag;
preempt_disable();
flag = this_cpu_ptr(&clean_list_grace);
set_bit(CLEAN_LIST_BUSY_BIT, flag);
ret = llist_del_first(&pool->clean_list);
if (ret)
ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
return ibmr;
}
static inline void wait_clean_list_grace(void)
{
int cpu;
unsigned long *flag;
for_each_online_cpu(cpu) {
flag = &per_cpu(clean_list_grace, cpu);
while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
cpu_relax();
}
}
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;
if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
schedule_delayed_work(&pool->flush_worker, 10);
while (1) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr)
return ibmr;
/* No clean MRs - now we have the choice of either
* allocating a fresh MR up to the limit imposed by the
* driver, or flush any dirty unused MRs.
* We try to avoid stalling in the send path if possible,
* so we allocate as long as we're allowed to.
*
* We're fussy with enforcing the FMR limit, though. If the driver
* tells us we can't use more than N fmrs, we shouldn't start
* arguing with it */
if (atomic_inc_return(&pool->item_count) <= pool->max_items)
break;
atomic_dec(&pool->item_count);
if (++iter > 2) {
rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
return ERR_PTR(-EAGAIN);
}
/* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
rds_ib_flush_mr_pool(pool, 0, &ibmr);
if (ibmr)
return ibmr;
}
ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
if (!ibmr) {
err = -ENOMEM;
goto out_no_cigar;
}
memset(ibmr, 0, sizeof(*ibmr));
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE|
IB_ACCESS_REMOTE_ATOMIC),
&pool->fmr_attr);
if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr);
ibmr->fmr = NULL;
printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
goto out_no_cigar;
}
rds_ib_stats_inc(s_ib_rdma_mr_alloc);
return ibmr;
out_no_cigar:
if (ibmr) {
if (ibmr->fmr)
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
}
atomic_dec(&pool->item_count);
return ERR_PTR(err);
}
static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
struct scatterlist *sg, unsigned int nents)
{
struct ib_device *dev = rds_ibdev->dev;
struct scatterlist *scat = sg;
u64 io_addr = 0;
u64 *dma_pages;
u32 len;
int page_cnt, sg_dma_len;
int i, j;
int ret;
sg_dma_len = ib_dma_map_sg(dev, sg, nents,
DMA_BIDIRECTIONAL);
if (unlikely(!sg_dma_len)) {
printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
return -EBUSY;
}
len = 0;
page_cnt = 0;
for (i = 0; i < sg_dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
if (dma_addr & ~PAGE_MASK) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
if ((dma_addr + dma_len) & ~PAGE_MASK) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
++page_cnt;
}
len += dma_len;
}
page_cnt += len >> PAGE_SHIFT;
if (page_cnt > fmr_message_size)
return -EINVAL;
dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
rdsibdev_to_node(rds_ibdev));
if (!dma_pages)
return -ENOMEM;
page_cnt = 0;
for (i = 0; i < sg_dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
for (j = 0; j < dma_len; j += PAGE_SIZE)
dma_pages[page_cnt++] =
(dma_addr & PAGE_MASK) + j;
}
ret = ib_map_phys_fmr(ibmr->fmr,
dma_pages, page_cnt, io_addr);
if (ret)
goto out;
/* Success - we successfully remapped the MR, so we can
* safely tear down the old mapping. */
rds_ib_teardown_mr(ibmr);
ibmr->sg = scat;
ibmr->sg_len = nents;
ibmr->sg_dma_len = sg_dma_len;
ibmr->remap_count++;
rds_ib_stats_inc(s_ib_rdma_mr_used);
ret = 0;
out:
kfree(dma_pages);
return ret;
}
void rds_ib_sync_mr(void *trans_private, int direction)
{
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
switch (direction) {
case DMA_FROM_DEVICE:
ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
break;
case DMA_TO_DEVICE:
ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
break;
}
}
static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
struct rds_ib_device *rds_ibdev = ibmr->device;
if (ibmr->sg_dma_len) {
ib_dma_unmap_sg(rds_ibdev->dev,
ibmr->sg, ibmr->sg_len,
DMA_BIDIRECTIONAL);
ibmr->sg_dma_len = 0;
}
/* Release the s/g list */
if (ibmr->sg_len) {
unsigned int i;
for (i = 0; i < ibmr->sg_len; ++i) {
struct page *page = sg_page(&ibmr->sg[i]);
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
BUG_ON(irqs_disabled());
set_page_dirty(page);
put_page(page);
}
kfree(ibmr->sg);
ibmr->sg = NULL;
ibmr->sg_len = 0;
}
}
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
unsigned int pinned = ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (pinned) {
struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
atomic_sub(pinned, &pool->free_pinned);
}
}
static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
{
unsigned int item_count;
item_count = atomic_read(&pool->item_count);
if (free_all)
return item_count;
return 0;
}
/*
* given an llist of mrs, put them all into the list_head for more processing
*/
static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
{
struct rds_ib_mr *ibmr;
struct llist_node *node;
struct llist_node *next;
node = llist_del_all(llist);
while (node) {
next = node->next;
ibmr = llist_entry(node, struct rds_ib_mr, llnode);
list_add_tail(&ibmr->unmap_list, list);
node = next;
}
}
/*
* this takes a list head of mrs and turns it into linked llist nodes
* of clusters. Each cluster has linked llist nodes of
* MR_CLUSTER_SIZE mrs that are ready for reuse.
*/
static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
struct list_head *list,
struct llist_node **nodes_head,
struct llist_node **nodes_tail)
{
struct rds_ib_mr *ibmr;
struct llist_node *cur = NULL;
struct llist_node **next = nodes_head;
list_for_each_entry(ibmr, list, unmap_list) {
cur = &ibmr->llnode;
*next = cur;
next = &cur->next;
}
*next = NULL;
*nodes_tail = cur;
}
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
int free_all, struct rds_ib_mr **ibmr_ret)
{
struct rds_ib_mr *ibmr, *next;
struct llist_node *clean_nodes;
struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
if (ibmr_ret) {
DEFINE_WAIT(wait);
while(!mutex_trylock(&pool->flush_lock)) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
goto out_nolock;
}
prepare_to_wait(&pool->flush_wait, &wait,
TASK_UNINTERRUPTIBLE);
if (llist_empty(&pool->clean_list))
schedule();
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
goto out_nolock;
}
}
finish_wait(&pool->flush_wait, &wait);
} else
mutex_lock(&pool->flush_lock);
if (ibmr_ret) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
goto out;
}
}
/* Get the list of all MRs to be dropped. Ordering matters -
* we want to put drop_list ahead of free_list.
*/
llist_append_to_list(&pool->drop_list, &unmap_list);
llist_append_to_list(&pool->free_list, &unmap_list);
if (free_all)
llist_append_to_list(&pool->clean_list, &unmap_list);
free_goal = rds_ib_flush_goal(pool, free_all);
if (list_empty(&unmap_list))
goto out;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
list_for_each_entry(ibmr, &unmap_list, unmap_list)
list_add(&ibmr->fmr->list, &fmr_list);
ret = ib_unmap_fmr(&fmr_list);
if (ret)
printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */
list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free);
list_del(&ibmr->unmap_list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
nfreed++;
}
ncleaned++;
}
if (!list_empty(&unmap_list)) {
/* we have to make sure that none of the things we're about
* to put on the clean list would race with other cpus trying
* to pull items off. The llist would explode if we managed to
* remove something from the clean list and then add it back again
* while another CPU was spinning on that same item in llist_del_first.
*
* This is pretty unlikely, but just in case wait for an llist grace period
* here before adding anything back into the clean list.
*/
wait_clean_list_grace();
list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
if (ibmr_ret)
*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
/* more than one entry in llist nodes */
if (clean_nodes->next)
llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
}
atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
atomic_sub(nfreed, &pool->item_count);
out:
mutex_unlock(&pool->flush_lock);
if (waitqueue_active(&pool->flush_wait))
wake_up(&pool->flush_wait);
out_nolock:
return ret;
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
rds_ib_flush_mr_pool(pool, 0, NULL);
}
void rds_ib_free_mr(void *trans_private, int invalidate)
{
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
llist_add(&ibmr->llnode, &pool->drop_list);
else
llist_add(&ibmr->llnode, &pool->free_list);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 10)
schedule_delayed_work(&pool->flush_worker, 10);
if (invalidate) {
if (likely(!in_interrupt())) {
rds_ib_flush_mr_pool(pool, 0, NULL);
} else {
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */
schedule_delayed_work(&pool->flush_worker, 10);
}
}
rds_ib_dev_put(rds_ibdev);
}
void rds_ib_flush_mrs(void)
{
struct rds_ib_device *rds_ibdev;
down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (pool)
rds_ib_flush_mr_pool(pool, 0, NULL);
}
up_read(&rds_ib_devices_lock);
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
int ret;
rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
if (!rds_ibdev) {
ret = -ENODEV;
goto out;
}
if (!rds_ibdev->mr_pool) {
ret = -ENODEV;
goto out;
}
ibmr = rds_ib_alloc_fmr(rds_ibdev);
if (IS_ERR(ibmr))
return ibmr;
ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
if (ret == 0)
*key_ret = ibmr->fmr->rkey;
else
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
ibmr->device = rds_ibdev;
rds_ibdev = NULL;
out:
if (ret) {
if (ibmr)
rds_ib_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret);
}
if (rds_ibdev)
rds_ib_dev_put(rds_ibdev);
return ibmr;
}

1079
net/rds/ib_recv.c Normal file

File diff suppressed because it is too large Load diff

168
net/rds/ib_ring.c Normal file
View file

@ -0,0 +1,168 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include "rds.h"
#include "ib.h"
/*
* Locking for IB rings.
* We assume that allocation is always protected by a mutex
* in the caller (this is a valid assumption for the current
* implementation).
*
* Freeing always happens in an interrupt, and hence only
* races with allocations, but not with other free()s.
*
* The interaction between allocation and freeing is that
* the alloc code has to determine the number of free entries.
* To this end, we maintain two counters; an allocation counter
* and a free counter. Both are allowed to run freely, and wrap
* around.
* The number of used entries is always (alloc_ctr - free_ctr) % NR.
*
* The current implementation makes free_ctr atomic. When the
* caller finds an allocation fails, it should set an "alloc fail"
* bit and retry the allocation. The "alloc fail" bit essentially tells
* the CQ completion handlers to wake it up after freeing some
* more entries.
*/
/*
* This only happens on shutdown.
*/
DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
{
memset(ring, 0, sizeof(*ring));
ring->w_nr = nr;
rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
}
static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
{
u32 diff;
/* This assumes that atomic_t has at least as many bits as u32 */
diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
BUG_ON(diff > ring->w_nr);
return diff;
}
void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
{
/* We only ever get called from the connection setup code,
* prior to creating the QP. */
BUG_ON(__rds_ib_ring_used(ring));
ring->w_nr = nr;
}
static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
{
return __rds_ib_ring_used(ring) == 0;
}
u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
{
u32 ret = 0, avail;
avail = ring->w_nr - __rds_ib_ring_used(ring);
rdsdebug("ring %p val %u next %u free %u\n", ring, val,
ring->w_alloc_ptr, avail);
if (val && avail) {
ret = min(val, avail);
*pos = ring->w_alloc_ptr;
ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
ring->w_alloc_ctr += ret;
}
return ret;
}
void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
{
ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
atomic_add(val, &ring->w_free_ctr);
if (__rds_ib_ring_empty(ring) &&
waitqueue_active(&rds_ib_ring_empty_wait))
wake_up(&rds_ib_ring_empty_wait);
}
void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
{
ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
ring->w_alloc_ctr -= val;
}
int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
{
return __rds_ib_ring_empty(ring);
}
int rds_ib_ring_low(struct rds_ib_work_ring *ring)
{
return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
}
/*
* returns the oldest alloced ring entry. This will be the next one
* freed. This can't be called if there are none allocated.
*/
u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
{
return ring->w_free_ptr;
}
/*
* returns the number of completed work requests.
*/
u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
{
u32 ret;
if (oldest <= (unsigned long long)wr_id)
ret = (unsigned long long)wr_id - oldest + 1;
else
ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
wr_id, oldest);
return ret;
}

1020
net/rds/ib_send.c Normal file

File diff suppressed because it is too large Load diff

97
net/rds/ib_stats.c Normal file
View file

@ -0,0 +1,97 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "rds.h"
#include "ib.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
"ib_tx_cq_event",
"ib_tx_ring_full",
"ib_tx_throttle",
"ib_tx_sg_mapping_failure",
"ib_tx_stalled",
"ib_tx_credit_updates",
"ib_rx_cq_call",
"ib_rx_cq_event",
"ib_rx_ring_empty",
"ib_rx_refill_from_cq",
"ib_rx_refill_from_thread",
"ib_rx_alloc_limit",
"ib_rx_credit_updates",
"ib_ack_sent",
"ib_ack_send_failure",
"ib_ack_send_delayed",
"ib_ack_send_piggybacked",
"ib_ack_received",
"ib_rdma_mr_alloc",
"ib_rdma_mr_free",
"ib_rdma_mr_used",
"ib_rdma_mr_pool_flush",
"ib_rdma_mr_pool_wait",
"ib_rdma_mr_pool_depleted",
"ib_atomic_cswp",
"ib_atomic_fadd",
};
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail)
{
struct rds_ib_statistics stats = {0, };
uint64_t *src;
uint64_t *sum;
size_t i;
int cpu;
if (avail < ARRAY_SIZE(rds_ib_stat_names))
goto out;
for_each_online_cpu(cpu) {
src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
sum = (uint64_t *)&stats;
for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
*(sum++) += *(src++);
}
rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
ARRAY_SIZE(rds_ib_stat_names));
out:
return ARRAY_SIZE(rds_ib_stat_names);
}

121
net/rds/ib_sysctl.c Normal file
View file

@ -0,0 +1,121 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "ib.h"
static struct ctl_table_header *rds_ib_sysctl_hdr;
unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
static unsigned long rds_ib_sysctl_max_wr_min = 1;
/* hardware will fail CQ creation long before this */
static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
/*
* This sysctl does nothing.
*
* Backwards compatibility with RDS 3.0 wire protocol
* disables initial FC credit exchange.
* If it's ever possible to drop 3.0 support,
* setting this to 1 and moving init/refill of send/recv
* rings from ib_cm_connect_complete() back into ib_setup_qp()
* will cause credits to be added before protocol negotiation.
*/
unsigned int rds_ib_sysctl_flow_control = 0;
static struct ctl_table rds_ib_sysctl_table[] = {
{
.procname = "max_send_wr",
.data = &rds_ib_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
.procname = "max_recv_wr",
.data = &rds_ib_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
.procname = "max_unsignaled_wr",
.data = &rds_ib_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_unsig_wr_min,
.extra2 = &rds_ib_sysctl_max_unsig_wr_max,
},
{
.procname = "max_recv_allocation",
.data = &rds_ib_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "flow_control",
.data = &rds_ib_sysctl_flow_control,
.maxlen = sizeof(rds_ib_sysctl_flow_control),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ }
};
void rds_ib_sysctl_exit(void)
{
if (rds_ib_sysctl_hdr)
unregister_net_sysctl_table(rds_ib_sysctl_hdr);
}
int rds_ib_sysctl_init(void)
{
rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
if (!rds_ib_sysctl_hdr)
return -ENOMEM;
return 0;
}

243
net/rds/info.c Normal file
View file

@ -0,0 +1,243 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
#include "rds.h"
/*
* This file implements a getsockopt() call which copies a set of fixed
* sized structs into a user-specified buffer as a means of providing
* read-only information about RDS.
*
* For a given information source there are a given number of fixed sized
* structs at a given time. The structs are only copied if the user-specified
* buffer is big enough. The destination pages that make up the buffer
* are pinned for the duration of the copy.
*
* This gives us the following benefits:
*
* - simple implementation, no copy "position" across multiple calls
* - consistent snapshot of an info source
* - atomic copy works well with whatever locking info source has
* - one portable tool to get rds info across implementations
* - long-lived tool can get info without allocating
*
* at the following costs:
*
* - info source copy must be pinned, may be "large"
*/
struct rds_info_iterator {
struct page **pages;
void *addr;
unsigned long offset;
};
static DEFINE_SPINLOCK(rds_info_lock);
static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
void rds_info_register_func(int optname, rds_info_func func)
{
int offset = optname - RDS_INFO_FIRST;
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock);
BUG_ON(rds_info_funcs[offset]);
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_register_func);
void rds_info_deregister_func(int optname, rds_info_func func)
{
int offset = optname - RDS_INFO_FIRST;
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock);
BUG_ON(rds_info_funcs[offset] != func);
rds_info_funcs[offset] = NULL;
spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_deregister_func);
/*
* Typically we hold an atomic kmap across multiple rds_info_copy() calls
* because the kmap is so expensive. This must be called before using blocking
* operations while holding the mapping and as the iterator is torn down.
*/
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
if (iter->addr) {
kunmap_atomic(iter->addr);
iter->addr = NULL;
}
}
/*
* get_user_pages() called flush_dcache_page() on the pages for us.
*/
void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long bytes)
{
unsigned long this;
while (bytes) {
if (!iter->addr)
iter->addr = kmap_atomic(*iter->pages);
this = min(bytes, PAGE_SIZE - iter->offset);
rdsdebug("page %p addr %p offset %lu this %lu data %p "
"bytes %lu\n", *iter->pages, iter->addr,
iter->offset, this, data, bytes);
memcpy(iter->addr + iter->offset, data, this);
data += this;
bytes -= this;
iter->offset += this;
if (iter->offset == PAGE_SIZE) {
kunmap_atomic(iter->addr);
iter->addr = NULL;
iter->offset = 0;
iter->pages++;
}
}
}
EXPORT_SYMBOL_GPL(rds_info_copy);
/*
* @optval points to the userspace buffer that the information snapshot
* will be copied into.
*
* @optlen on input is the size of the buffer in userspace. @optlen
* on output is the size of the requested snapshot in bytes.
*
* This function returns -errno if there is a failure, particularly -ENOSPC
* if the given userspace buffer was not large enough to fit the snapshot.
* On success it returns the positive number of bytes of each array element
* in the snapshot.
*/
int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
int __user *optlen)
{
struct rds_info_iterator iter;
struct rds_info_lengths lens;
unsigned long nr_pages = 0;
unsigned long start;
unsigned long i;
rds_info_func func;
struct page **pages = NULL;
int ret;
int len;
int total;
if (get_user(len, optlen)) {
ret = -EFAULT;
goto out;
}
/* check for all kinds of wrapping and the like */
start = (unsigned long)optval;
if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
ret = -EINVAL;
goto out;
}
/* a 0 len call is just trying to probe its length */
if (len == 0)
goto call_func;
nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
>> PAGE_SHIFT;
pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
ret = get_user_pages_fast(start, nr_pages, 1, pages);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
else
nr_pages = 0;
ret = -EAGAIN; /* XXX ? */
goto out;
}
rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
call_func:
func = rds_info_funcs[optname - RDS_INFO_FIRST];
if (!func) {
ret = -ENOPROTOOPT;
goto out;
}
iter.pages = pages;
iter.addr = NULL;
iter.offset = start & (PAGE_SIZE - 1);
func(sock, len, &iter, &lens);
BUG_ON(lens.each == 0);
total = lens.nr * lens.each;
rds_info_iter_unmap(&iter);
if (total > len) {
len = total;
ret = -ENOSPC;
} else {
len = total;
ret = lens.each;
}
if (put_user(len, optlen))
ret = -EFAULT;
out:
for (i = 0; pages && i < nr_pages; i++)
put_page(pages[i]);
kfree(pages);
return ret;
}

30
net/rds/info.h Normal file
View file

@ -0,0 +1,30 @@
#ifndef _RDS_INFO_H
#define _RDS_INFO_H
struct rds_info_lengths {
unsigned int nr;
unsigned int each;
};
struct rds_info_iterator;
/*
* These functions must fill in the fields of @lens to reflect the size
* of the available info source. If the snapshot fits in @len then it
* should be copied using @iter. The caller will deduce if it was copied
* or not by comparing the lengths.
*/
typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens);
void rds_info_register_func(int optname, rds_info_func func);
void rds_info_deregister_func(int optname, rds_info_func func);
int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
int __user *optlen);
void rds_info_copy(struct rds_info_iterator *iter, void *data,
unsigned long bytes);
void rds_info_iter_unmap(struct rds_info_iterator *iter);
#endif

329
net/rds/iw.c Normal file
View file

@ -0,0 +1,329 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
#include "rds.h"
#include "iw.h"
unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
module_param(fastreg_pool_size, int, 0444);
MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
module_param(fastreg_message_size, int, 0444);
MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
struct list_head rds_iw_devices;
/* NOTE: if also grabbing iwdev lock, grab this first */
DEFINE_SPINLOCK(iw_nodev_conns_lock);
LIST_HEAD(iw_nodev_conns);
static void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct ib_device_attr *dev_attr;
/* Only handle iwarp devices */
if (device->node_type != RDMA_NODE_RNIC)
return;
dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
if (!dev_attr)
return;
if (ib_query_device(device, dev_attr)) {
rdsdebug("Query device failed for %s\n", device->name);
goto free_attr;
}
rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
if (!rds_iwdev)
goto free_attr;
spin_lock_init(&rds_iwdev->spinlock);
rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
rds_iwdev->max_wrs = dev_attr->max_qp_wr;
rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
rds_iwdev->mr = NULL;
rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
if (IS_ERR(rds_iwdev->mr_pool)) {
rds_iwdev->mr_pool = NULL;
goto err_mr;
}
INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
INIT_LIST_HEAD(&rds_iwdev->conn_list);
list_add_tail(&rds_iwdev->list, &rds_iw_devices);
ib_set_client_data(device, &rds_iw_client, rds_iwdev);
goto free_attr;
err_mr:
if (rds_iwdev->mr)
ib_dereg_mr(rds_iwdev->mr);
err_pd:
ib_dealloc_pd(rds_iwdev->pd);
free_dev:
kfree(rds_iwdev);
free_attr:
kfree(dev_attr);
}
static void rds_iw_remove_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct rds_iw_cm_id *i_cm_id, *next;
rds_iwdev = ib_get_client_data(device, &rds_iw_client);
if (!rds_iwdev)
return;
spin_lock_irq(&rds_iwdev->spinlock);
list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
list_del(&i_cm_id->list);
kfree(i_cm_id);
}
spin_unlock_irq(&rds_iwdev->spinlock);
rds_iw_destroy_conns(rds_iwdev);
if (rds_iwdev->mr_pool)
rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
if (rds_iwdev->mr)
ib_dereg_mr(rds_iwdev->mr);
while (ib_dealloc_pd(rds_iwdev->pd)) {
rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
msleep(1);
}
list_del(&rds_iwdev->list);
kfree(rds_iwdev);
}
struct ib_client rds_iw_client = {
.name = "rds_iw",
.add = rds_iw_add_one,
.remove = rds_iw_remove_one
};
static int rds_iw_conn_info_visitor(struct rds_connection *conn,
void *buffer)
{
struct rds_info_rdma_connection *iinfo = buffer;
struct rds_iw_connection *ic;
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_iw_transport)
return 0;
iinfo->src_addr = conn->c_laddr;
iinfo->dst_addr = conn->c_faddr;
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
if (rds_conn_state(conn) == RDS_CONN_UP) {
struct rds_iw_device *rds_iwdev;
struct rdma_dev_addr *dev_addr;
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_iwdev->max_sge;
rds_iw_get_mr_info(rds_iwdev, iinfo);
}
return 1;
}
static void rds_iw_ic_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_for_each_conn_info(sock, len, iter, lens,
rds_iw_conn_info_visitor,
sizeof(struct rds_info_rdma_connection));
}
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
* device with that address set.
*
* If it were me, I'd advocate for something more flexible. Sending and
* receiving should be device-agnostic. Transports would try and maintain
* connections between peers who have messages queued. Userspace would be
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
static int rds_iw_laddr_check(__be32 addr)
{
int ret;
struct rdma_cm_id *cm_id;
struct sockaddr_in sin;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = addr;
/* rdma_bind_addr will only succeed for IB & iWARP devices */
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support IB devices unless we
check node_type. */
if (ret || !cm_id->device ||
cm_id->device->node_type != RDMA_NODE_RNIC)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
&addr, ret,
cm_id->device ? cm_id->device->node_type : -1);
rdma_destroy_id(cm_id);
return ret;
}
void rds_iw_exit(void)
{
rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
rds_iw_destroy_nodev_conns();
ib_unregister_client(&rds_iw_client);
rds_iw_sysctl_exit();
rds_iw_recv_exit();
rds_trans_unregister(&rds_iw_transport);
}
struct rds_transport rds_iw_transport = {
.laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit,
.xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv,
.conn_alloc = rds_iw_conn_alloc,
.conn_free = rds_iw_conn_free,
.conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown,
.inc_copy_to_user = rds_iw_inc_copy_to_user,
.inc_free = rds_iw_inc_free,
.cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect,
.cm_connect_complete = rds_iw_cm_connect_complete,
.stats_info_copy = rds_iw_stats_info_copy,
.exit = rds_iw_exit,
.get_mr = rds_iw_get_mr,
.sync_mr = rds_iw_sync_mr,
.free_mr = rds_iw_free_mr,
.flush_mrs = rds_iw_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "iwarp",
.t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
int rds_iw_init(void)
{
int ret;
INIT_LIST_HEAD(&rds_iw_devices);
ret = ib_register_client(&rds_iw_client);
if (ret)
goto out;
ret = rds_iw_sysctl_init();
if (ret)
goto out_ibreg;
ret = rds_iw_recv_init();
if (ret)
goto out_sysctl;
ret = rds_trans_register(&rds_iw_transport);
if (ret)
goto out_recv;
rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
goto out;
out_recv:
rds_iw_recv_exit();
out_sysctl:
rds_iw_sysctl_exit();
out_ibreg:
ib_unregister_client(&rds_iw_client);
out:
return ret;
}
MODULE_LICENSE("GPL");

396
net/rds/iw.h Normal file
View file

@ -0,0 +1,396 @@
#ifndef _RDS_IW_H
#define _RDS_IW_H
#include <linux/interrupt.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FASTREG_SIZE 20
#define RDS_FASTREG_POOL_SIZE 2048
#define RDS_IW_MAX_SGE 8
#define RDS_IW_RECV_SGE 2
#define RDS_IW_DEFAULT_RECV_WR 1024
#define RDS_IW_DEFAULT_SEND_WR 256
#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
extern struct list_head rds_iw_devices;
/*
* IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
* try and minimize the amount of memory tied up both the device and
* socket receive queues.
*/
/* page offset of the final full frag that fits in the page */
#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
struct rds_page_frag {
struct list_head f_item;
struct page *f_page;
unsigned long f_offset;
dma_addr_t f_mapped;
};
struct rds_iw_incoming {
struct list_head ii_frags;
struct rds_incoming ii_inc;
};
struct rds_iw_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr;
__be32 dp_daddr;
u8 dp_protocol_major;
u8 dp_protocol_minor;
__be16 dp_protocol_minor_mask; /* bitmask */
__be32 dp_reserved1;
__be64 dp_ack_seq;
__be32 dp_credit; /* non-zero enables flow ctl */
};
struct rds_iw_scatterlist {
struct scatterlist *list;
unsigned int len;
int dma_len;
unsigned int dma_npages;
unsigned int bytes;
};
struct rds_iw_mapping {
spinlock_t m_lock; /* protect the mapping struct */
struct list_head m_list;
struct rds_iw_mr *m_mr;
uint32_t m_rkey;
struct rds_iw_scatterlist m_sg;
};
struct rds_iw_send_work {
struct rds_message *s_rm;
/* We should really put these into a union: */
struct rm_rdma_op *s_op;
struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr;
struct ib_fast_reg_page_list *s_page_list;
unsigned char s_remap_count;
struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IW_MAX_SGE];
unsigned long s_queued;
};
struct rds_iw_recv_work {
struct rds_iw_incoming *r_iwinc;
struct rds_page_frag *r_frag;
struct ib_recv_wr r_wr;
struct ib_sge r_sge[2];
};
struct rds_iw_work_ring {
u32 w_nr;
u32 w_alloc_ptr;
u32 w_alloc_ctr;
u32 w_free_ptr;
atomic_t w_free_ctr;
};
struct rds_iw_device;
struct rds_iw_connection {
struct list_head iw_node;
struct rds_iw_device *rds_iwdev;
struct rds_connection *conn;
/* alphabet soup, IBTA style */
struct rdma_cm_id *i_cm_id;
struct ib_pd *i_pd;
struct ib_mr *i_mr;
struct ib_cq *i_send_cq;
struct ib_cq *i_recv_cq;
/* tx */
struct rds_iw_work_ring i_send_ring;
struct rds_message *i_rm;
struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma;
struct rds_iw_send_work *i_sends;
/* rx */
struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_iw_work_ring i_recv_ring;
struct rds_iw_incoming *i_iwinc;
u32 i_recv_data_rem;
struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma;
struct rds_iw_recv_work *i_recvs;
struct rds_page_frag i_frag;
u64 i_ack_recv; /* last ACK received */
/* sending acks */
unsigned long i_ack_flags;
#ifdef KERNEL_HAS_ATOMIC64
atomic64_t i_ack_next; /* next ACK to send */
#else
spinlock_t i_ack_lock; /* protect i_ack_next */
u64 i_ack_next; /* next ACK to send */
#endif
struct rds_header *i_ack;
struct ib_send_wr i_ack_wr;
struct ib_sge i_ack_sge;
u64 i_ack_dma;
unsigned long i_ack_queued;
/* Flow control related information
*
* Our algorithm uses a pair variables that we need to access
* atomically - one for the send credits, and one posted
* recv credits we need to transfer to remote.
* Rather than protect them using a slow spinlock, we put both into
* a single atomic_t and update it using cmpxchg
*/
atomic_t i_credits;
/* Protocol version specific information */
unsigned int i_flowctl:1; /* enable/disable flow ctl */
unsigned int i_dma_local_lkey:1;
unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
/* Batched completions */
unsigned int i_unsignaled_wrs;
long i_unsignaled_bytes;
};
/* This assumes that atomic_t is at least 32 bits */
#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_GET_POST_CREDITS(v) ((v) >> 16)
#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
#define IB_SET_POST_CREDITS(v) ((v) << 16)
struct rds_iw_cm_id {
struct list_head list;
struct rdma_cm_id *cm_id;
};
struct rds_iw_device {
struct list_head list;
struct list_head cm_id_list;
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_iw_mr_pool *mr_pool;
int max_sge;
unsigned int max_wrs;
unsigned int dma_local_lkey:1;
spinlock_t spinlock; /* protect the above */
};
/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1
/* Magic WR_ID for ACKs */
#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
struct rds_iw_statistics {
uint64_t s_iw_connect_raced;
uint64_t s_iw_listen_closed_stale;
uint64_t s_iw_tx_cq_call;
uint64_t s_iw_tx_cq_event;
uint64_t s_iw_tx_ring_full;
uint64_t s_iw_tx_throttle;
uint64_t s_iw_tx_sg_mapping_failure;
uint64_t s_iw_tx_stalled;
uint64_t s_iw_tx_credit_updates;
uint64_t s_iw_rx_cq_call;
uint64_t s_iw_rx_cq_event;
uint64_t s_iw_rx_ring_empty;
uint64_t s_iw_rx_refill_from_cq;
uint64_t s_iw_rx_refill_from_thread;
uint64_t s_iw_rx_alloc_limit;
uint64_t s_iw_rx_credit_updates;
uint64_t s_iw_ack_sent;
uint64_t s_iw_ack_send_failure;
uint64_t s_iw_ack_send_delayed;
uint64_t s_iw_ack_send_piggybacked;
uint64_t s_iw_ack_received;
uint64_t s_iw_rdma_mr_alloc;
uint64_t s_iw_rdma_mr_free;
uint64_t s_iw_rdma_mr_used;
uint64_t s_iw_rdma_mr_pool_flush;
uint64_t s_iw_rdma_mr_pool_wait;
uint64_t s_iw_rdma_mr_pool_depleted;
};
extern struct workqueue_struct *rds_iw_wq;
/*
* Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
* doesn't define it.
*/
static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
struct scatterlist *sg, unsigned int sg_dma_len, int direction)
{
unsigned int i;
for (i = 0; i < sg_dma_len; ++i) {
ib_dma_sync_single_for_cpu(dev,
ib_sg_dma_address(dev, &sg[i]),
ib_sg_dma_len(dev, &sg[i]),
direction);
}
}
#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
struct scatterlist *sg, unsigned int sg_dma_len, int direction)
{
unsigned int i;
for (i = 0; i < sg_dma_len; ++i) {
ib_dma_sync_single_for_device(dev,
ib_sg_dma_address(dev, &sg[i]),
ib_sg_dma_len(dev, &sg[i]),
direction);
}
}
#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
{
return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
}
/* ib.c */
extern struct rds_transport rds_iw_transport;
extern struct ib_client rds_iw_client;
extern unsigned int fastreg_pool_size;
extern unsigned int fastreg_message_size;
extern spinlock_t iw_nodev_conns_lock;
extern struct list_head iw_nodev_conns;
/* ib_cm.c */
int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
void rds_iw_conn_free(void *arg);
int rds_iw_conn_connect(struct rds_connection *conn);
void rds_iw_conn_shutdown(struct rds_connection *conn);
void rds_iw_state_change(struct sock *sk);
int rds_iw_listen_init(void);
void rds_iw_listen_stop(void);
void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
void rds_iw_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
#define rds_iw_conn_error(conn, fmt...) \
__rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
/* ib_rdma.c */
int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
static inline void rds_iw_destroy_nodev_conns(void)
{
__rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
}
static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
{
__rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
}
struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret);
void rds_iw_sync_mr(void *trans_private, int dir);
void rds_iw_free_mr(void *trans_private, int invalidate);
void rds_iw_flush_mrs(void);
/* ib_recv.c */
int rds_iw_recv_init(void);
void rds_iw_recv_exit(void);
int rds_iw_recv(struct rds_connection *conn);
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill);
void rds_iw_inc_free(struct rds_incoming *inc);
int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_recv_tasklet_fn(unsigned long data);
void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
void rds_iw_attempt_ack(struct rds_iw_connection *ic);
void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
/* ib_ring.c */
void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
int rds_iw_ring_low(struct rds_iw_work_ring *ring);
u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
extern wait_queue_head_t rds_iw_ring_empty_wait;
/* ib_send.c */
void rds_iw_xmit_complete(struct rds_connection *conn);
int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_send_init_ring(struct rds_iw_connection *ic);
void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
u32 *adv_credits, int need_posted, int max_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
/* ib_sysctl.c */
int rds_iw_sysctl_init(void);
void rds_iw_sysctl_exit(void);
extern unsigned long rds_iw_sysctl_max_send_wr;
extern unsigned long rds_iw_sysctl_max_recv_wr;
extern unsigned long rds_iw_sysctl_max_unsig_wrs;
extern unsigned long rds_iw_sysctl_max_unsig_bytes;
extern unsigned long rds_iw_sysctl_max_recv_allocation;
extern unsigned int rds_iw_sysctl_flow_control;
/*
* Helper functions for getting/setting the header and data SGEs in
* RDS packets (not RDMA)
*/
static inline struct ib_sge *
rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
{
return &sge[0];
}
static inline struct ib_sge *
rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
{
return &sge[1];
}
#endif

765
net/rds/iw_cm.c Normal file
View file

@ -0,0 +1,765 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
#include "rds.h"
#include "iw.h"
/*
* Set the selected protocol version
*/
static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
{
conn->c_version = version;
}
/*
* Set up flow control
*/
static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
{
struct rds_iw_connection *ic = conn->c_transport_data;
if (rds_iw_sysctl_flow_control && credits != 0) {
/* We're doing flow control */
ic->i_flowctl = 1;
rds_iw_send_add_credits(conn, credits);
} else {
ic->i_flowctl = 0;
}
}
/*
* Connection established.
* We get here for both outgoing and incoming connection.
*/
void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
{
const struct rds_iw_connect_private *dp = NULL;
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_device *rds_iwdev;
int err;
if (event->param.conn.private_data_len) {
dp = event->param.conn.private_data;
rds_iw_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
dp->dp_protocol_minor));
rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
}
/* update ib_device with this local ipaddr & conn */
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
if (err)
printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
rds_iw_add_conn(rds_iwdev, conn);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp && dp->dp_ack_seq)
rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version),
ic->i_flowctl ? ", flow control" : "");
rds_connect_complete(conn);
}
static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_iw_connect_private *dp,
u32 protocol_version)
{
struct rds_iw_connection *ic = conn->c_transport_data;
memset(conn_param, 0, sizeof(struct rdma_conn_param));
/* XXX tune these? */
conn_param->responder_resources = 1;
conn_param->initiator_depth = 1;
if (dp) {
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
/* Advertise flow control */
if (ic->i_flowctl) {
unsigned int credits;
credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
dp->dp_credit = cpu_to_be32(credits);
atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
}
conn_param->private_data = dp;
conn_param->private_data_len = sizeof(*dp);
}
}
static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
{
rdsdebug("event %u data %p\n", event->event, data);
}
static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
{
struct rds_connection *conn = data;
struct rds_iw_connection *ic = conn->c_transport_data;
rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
case IB_EVENT_QP_REQ_ERR:
case IB_EVENT_QP_FATAL:
default:
rdsdebug("Fatal QP Event %u "
"- connection %pI4->%pI4, reconnecting\n",
event->event, &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
break;
}
}
/*
* Create a QP
*/
static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
struct rds_iw_device *rds_iwdev,
struct rds_iw_work_ring *send_ring,
void (*send_cq_handler)(struct ib_cq *, void *),
struct rds_iw_work_ring *recv_ring,
void (*recv_cq_handler)(struct ib_cq *, void *),
void *context)
{
struct ib_device *dev = rds_iwdev->dev;
unsigned int send_size, recv_size;
int ret;
/* The offset of 1 is to accommodate the additional ACK WR. */
send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
rds_iw_ring_resize(send_ring, send_size - 1);
rds_iw_ring_resize(recv_ring, recv_size - 1);
memset(attr, 0, sizeof(*attr));
attr->event_handler = rds_iw_qp_event_handler;
attr->qp_context = context;
attr->cap.max_send_wr = send_size;
attr->cap.max_recv_wr = recv_size;
attr->cap.max_send_sge = rds_iwdev->max_sge;
attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
attr->sq_sig_type = IB_SIGNAL_REQ_WR;
attr->qp_type = IB_QPT_RC;
attr->send_cq = ib_create_cq(dev, send_cq_handler,
rds_iw_cq_event_handler,
context, send_size, 0);
if (IS_ERR(attr->send_cq)) {
ret = PTR_ERR(attr->send_cq);
attr->send_cq = NULL;
rdsdebug("ib_create_cq send failed: %d\n", ret);
goto out;
}
attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
rds_iw_cq_event_handler,
context, recv_size, 0);
if (IS_ERR(attr->recv_cq)) {
ret = PTR_ERR(attr->recv_cq);
attr->recv_cq = NULL;
rdsdebug("ib_create_cq send failed: %d\n", ret);
goto out;
}
ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
if (ret) {
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
goto out;
}
ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
if (ret) {
rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
goto out;
}
out:
if (ret) {
if (attr->send_cq)
ib_destroy_cq(attr->send_cq);
if (attr->recv_cq)
ib_destroy_cq(attr->recv_cq);
}
return ret;
}
/*
* This needs to be very careful to not leave IS_ERR pointers around for
* cleanup to trip over.
*/
static int rds_iw_setup_qp(struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct ib_device *dev = ic->i_cm_id->device;
struct ib_qp_init_attr attr;
struct rds_iw_device *rds_iwdev;
int ret;
/* rds_iw_add_one creates a rds_iw_device object per IB device,
* and allocates a protection domain, memory range and MR pool
* for each. If that fails for any reason, it will not register
* the rds_iwdev at all.
*/
rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
if (!rds_iwdev) {
printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
dev->name);
return -EOPNOTSUPP;
}
/* Protection domain and memory range */
ic->i_pd = rds_iwdev->pd;
ic->i_mr = rds_iwdev->mr;
ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
&ic->i_send_ring, rds_iw_send_cq_comp_handler,
&ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
conn);
if (ret < 0)
goto out;
ic->i_send_cq = attr.send_cq;
ic->i_recv_cq = attr.recv_cq;
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
* to back off until we get a value that the hardware can support?
*/
ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
if (ret) {
rdsdebug("rdma_create_qp failed: %d\n", ret);
goto out;
}
ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
if (!ic->i_send_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
}
ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
if (!ic->i_recv_hdrs) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
}
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
if (!ic->i_ack) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
if (!ic->i_sends) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
rds_iw_send_init_ring(ic);
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
if (!ic->i_recvs) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
}
rds_iw_recv_init_ring(ic);
rds_iw_recv_init_ack(ic);
/* Post receive buffers - as a side effect, this will update
* the posted credit count. */
rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
out:
return ret;
}
static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
{
u16 common;
u32 version = 0;
/* rdma_cm private data is odd - when there is any private data in the
* request, we will be given a pretty large buffer without telling us the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
* from an older version. This could could be 3.0 or 2.0 - we can't tell.
* We really should have changed this for OFED 1.3 :-( */
if (dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
if (dp->dp_protocol_major == 3 && common) {
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
}
printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
"incompatible protocol version %u.%u\n",
&dp->dp_saddr,
dp->dp_protocol_major,
dp->dp_protocol_minor);
return version;
}
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
const struct rds_iw_connect_private *dp = event->param.conn.private_data;
struct rds_iw_connect_private dp_rep;
struct rds_connection *conn = NULL;
struct rds_iw_connection *ic = NULL;
struct rdma_conn_param conn_param;
struct rds_iw_device *rds_iwdev;
u32 version;
int err, destroy = 1;
/* Check whether the remote protocol version matches ours. */
version = rds_iw_protocol_compatible(dp);
if (!version)
goto out;
rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
&dp->dp_saddr, &dp->dp_daddr,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
goto out;
}
/*
* The connection request may occur while the
* previous connection exist, e.g. in case of failover.
* But as connections may be initiated simultaneously
* by both hosts, we have a random backoff mechanism -
* see the comment above rds_queue_reconnect()
*/
mutex_lock(&conn->c_cm_lock);
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
if (rds_conn_state(conn) == RDS_CONN_UP) {
rdsdebug("incoming connect while connecting\n");
rds_conn_drop(conn);
rds_iw_stats_inc(s_iw_listen_closed_stale);
} else
if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
/* Wait and see - our connect may still be succeeding */
rds_iw_stats_inc(s_iw_connect_raced);
}
mutex_unlock(&conn->c_cm_lock);
goto out;
}
ic = conn->c_transport_data;
rds_iw_set_protocol(conn, version);
rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp->dp_ack_seq)
rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
BUG_ON(cm_id->context);
BUG_ON(ic->i_cm_id);
ic->i_cm_id = cm_id;
cm_id->context = conn;
rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
/* We got halfway through setting up the ib_connection, if we
* fail now, we have to take the long route out of this mess. */
destroy = 0;
err = rds_iw_setup_qp(conn);
if (err) {
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
mutex_unlock(&conn->c_cm_lock);
goto out;
}
rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
mutex_unlock(&conn->c_cm_lock);
if (err) {
rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
goto out;
}
return 0;
out:
rdma_reject(cm_id, NULL, 0);
return destroy;
}
int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
{
struct rds_connection *conn = cm_id->context;
struct rds_iw_connection *ic = conn->c_transport_data;
struct rdma_conn_param conn_param;
struct rds_iw_connect_private dp;
int ret;
/* If the peer doesn't do protocol negotiation, we must
* default to RDSv3.0 */
rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
ret = rds_iw_setup_qp(conn);
if (ret) {
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
goto out;
}
rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
out:
/* Beware - returning non-zero tells the rdma_cm to destroy
* the cm_id. We should certainly not do it as long as we still
* "own" the cm_id. */
if (ret) {
struct rds_iw_connection *ic = conn->c_transport_data;
if (ic->i_cm_id == cm_id)
ret = 0;
}
return ret;
}
int rds_iw_conn_connect(struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_device *rds_iwdev;
struct sockaddr_in src, dest;
int ret;
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
rdsdebug("rdma_create_id() failed: %d\n", ret);
goto out;
}
rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
src.sin_family = AF_INET;
src.sin_addr.s_addr = (__force u32)conn->c_laddr;
src.sin_port = (__force u16)htons(0);
/* First, bind to the local address and device. */
ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
if (ret) {
rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
&conn->c_laddr, ret);
rdma_destroy_id(ic->i_cm_id);
ic->i_cm_id = NULL;
goto out;
}
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
dest.sin_family = AF_INET;
dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
dest.sin_port = (__force u16)htons(RDS_PORT);
ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
(struct sockaddr *)&dest,
RDS_RDMA_RESOLVE_TIMEOUT_MS);
if (ret) {
rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
ret);
rdma_destroy_id(ic->i_cm_id);
ic->i_cm_id = NULL;
}
out:
return ret;
}
/*
* This is so careful about only cleaning up resources that were built up
* so that it can be called at any point during startup. In fact it
* can be called multiple times for a given connection.
*/
void rds_iw_conn_shutdown(struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
int err = 0;
struct ib_qp_attr qp_attr;
rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
ic->i_cm_id ? ic->i_cm_id->qp : NULL);
if (ic->i_cm_id) {
struct ib_device *dev = ic->i_cm_id->device;
rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
err = rdma_disconnect(ic->i_cm_id);
if (err) {
/* Actually this may happen quite frequently, when
* an outgoing connect raced with an incoming connect.
*/
rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
" cm: %p err %d\n", ic->i_cm_id, err);
}
if (ic->i_cm_id->qp) {
qp_attr.qp_state = IB_QPS_ERR;
ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
}
wait_event(rds_iw_ring_empty_wait,
rds_iw_ring_empty(&ic->i_send_ring) &&
rds_iw_ring_empty(&ic->i_recv_ring));
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
ic->i_send_hdrs,
ic->i_send_hdrs_dma);
if (ic->i_recv_hdrs)
ib_dma_free_coherent(dev,
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
ic->i_recv_hdrs,
ic->i_recv_hdrs_dma);
if (ic->i_ack)
ib_dma_free_coherent(dev, sizeof(struct rds_header),
ic->i_ack, ic->i_ack_dma);
if (ic->i_sends)
rds_iw_send_clear_ring(ic);
if (ic->i_recvs)
rds_iw_recv_clear_ring(ic);
if (ic->i_cm_id->qp)
rdma_destroy_qp(ic->i_cm_id);
if (ic->i_send_cq)
ib_destroy_cq(ic->i_send_cq);
if (ic->i_recv_cq)
ib_destroy_cq(ic->i_recv_cq);
/*
* If associated with an rds_iw_device:
* Move connection back to the nodev list.
* Remove cm_id from the device cm_id list.
*/
if (ic->rds_iwdev)
rds_iw_remove_conn(ic->rds_iwdev, conn);
rdma_destroy_id(ic->i_cm_id);
ic->i_cm_id = NULL;
ic->i_pd = NULL;
ic->i_mr = NULL;
ic->i_send_cq = NULL;
ic->i_recv_cq = NULL;
ic->i_send_hdrs = NULL;
ic->i_recv_hdrs = NULL;
ic->i_ack = NULL;
}
BUG_ON(ic->rds_iwdev);
/* Clear pending transmit */
if (ic->i_rm) {
rds_message_put(ic->i_rm);
ic->i_rm = NULL;
}
/* Clear the ACK state */
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
#ifdef KERNEL_HAS_ATOMIC64
atomic64_set(&ic->i_ack_next, 0);
#else
ic->i_ack_next = 0;
#endif
ic->i_ack_recv = 0;
/* Clear flow control state */
ic->i_flowctl = 0;
atomic_set(&ic->i_credits, 0);
rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
if (ic->i_iwinc) {
rds_inc_put(&ic->i_iwinc->ii_inc);
ic->i_iwinc = NULL;
}
vfree(ic->i_sends);
ic->i_sends = NULL;
vfree(ic->i_recvs);
ic->i_recvs = NULL;
rdsdebug("shutdown complete\n");
}
int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_iw_connection *ic;
unsigned long flags;
/* XXX too lazy? */
ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
if (!ic)
return -ENOMEM;
INIT_LIST_HEAD(&ic->iw_node);
tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
(unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
/*
* rds_iw_conn_shutdown() waits for these to be emptied so they
* must be initialized before it can be called.
*/
rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
ic->conn = conn;
conn->c_transport_data = ic;
spin_lock_irqsave(&iw_nodev_conns_lock, flags);
list_add_tail(&ic->iw_node, &iw_nodev_conns);
spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
return 0;
}
/*
* Free a connection. Connection must be shut down and not set for reconnect.
*/
void rds_iw_conn_free(void *arg)
{
struct rds_iw_connection *ic = arg;
spinlock_t *lock_ptr;
rdsdebug("ic %p\n", ic);
/*
* Conn is either on a dev's list or on the nodev list.
* A race with shutdown() or connect() would cause problems
* (since rds_iwdev would change) but that should never happen.
*/
lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
spin_lock_irq(lock_ptr);
list_del(&ic->iw_node);
spin_unlock_irq(lock_ptr);
kfree(ic);
}
/*
* An error occurred on the connection
*/
void
__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
{
va_list ap;
rds_conn_drop(conn);
va_start(ap, fmt);
vprintk(fmt, ap);
va_end(ap);
}

875
net/rds/iw_rdma.c Normal file
View file

@ -0,0 +1,875 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include "rds.h"
#include "iw.h"
/*
* This is stored as mr->r_trans_private.
*/
struct rds_iw_mr {
struct rds_iw_device *device;
struct rds_iw_mr_pool *pool;
struct rdma_cm_id *cm_id;
struct ib_mr *mr;
struct ib_fast_reg_page_list *page_list;
struct rds_iw_mapping mapping;
unsigned char remap_count;
};
/*
* Our own little MR pool
*/
struct rds_iw_mr_pool {
struct rds_iw_device *device; /* back ptr to the device that owns us */
struct mutex flush_lock; /* serialize fmr invalidate */
struct work_struct flush_worker; /* flush worker */
spinlock_t list_lock; /* protect variables below */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */
struct list_head dirty_list; /* dirty mappings */
struct list_head clean_list; /* unused & unamapped MRs */
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_message_size; /* in pages */
unsigned long max_items;
unsigned long max_items_soft;
unsigned long max_free_pinned;
int max_pages;
};
static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr,
struct scatterlist *sg, unsigned int nents);
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
struct list_head *kill_list,
int *unpinned);
static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
struct rds_iw_device **rds_iwdev,
struct rdma_cm_id **cm_id)
{
struct rds_iw_device *iwdev;
struct rds_iw_cm_id *i_cm_id;
*rds_iwdev = NULL;
*cm_id = NULL;
list_for_each_entry(iwdev, &rds_iw_devices, list) {
spin_lock_irq(&iwdev->spinlock);
list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
struct sockaddr_in *src_addr, *dst_addr;
src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
rdsdebug("local ipaddr = %x port %d, "
"remote ipaddr = %x port %d"
"..looking for %x port %d, "
"remote ipaddr = %x port %d\n",
src_addr->sin_addr.s_addr,
src_addr->sin_port,
dst_addr->sin_addr.s_addr,
dst_addr->sin_port,
src->sin_addr.s_addr,
src->sin_port,
dst->sin_addr.s_addr,
dst->sin_port);
#ifdef WORKING_TUPLE_DETECTION
if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
src_addr->sin_port == src->sin_port &&
dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
dst_addr->sin_port == dst->sin_port) {
#else
/* FIXME - needs to compare the local and remote
* ipaddr/port tuple, but the ipaddr is the only
* available information in the rds_sock (as the rest are
* zero'ed. It doesn't appear to be properly populated
* during connection setup...
*/
if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
#endif
spin_unlock_irq(&iwdev->spinlock);
*rds_iwdev = iwdev;
*cm_id = i_cm_id->cm_id;
return 0;
}
}
spin_unlock_irq(&iwdev->spinlock);
}
return 1;
}
static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
{
struct rds_iw_cm_id *i_cm_id;
i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
if (!i_cm_id)
return -ENOMEM;
i_cm_id->cm_id = cm_id;
spin_lock_irq(&rds_iwdev->spinlock);
list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
spin_unlock_irq(&rds_iwdev->spinlock);
return 0;
}
static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
struct rdma_cm_id *cm_id)
{
struct rds_iw_cm_id *i_cm_id;
spin_lock_irq(&rds_iwdev->spinlock);
list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
if (i_cm_id->cm_id == cm_id) {
list_del(&i_cm_id->list);
kfree(i_cm_id);
break;
}
}
spin_unlock_irq(&rds_iwdev->spinlock);
}
int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
{
struct sockaddr_in *src_addr, *dst_addr;
struct rds_iw_device *rds_iwdev_old;
struct rdma_cm_id *pcm_id;
int rc;
src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
if (rc)
rds_iw_remove_cm_id(rds_iwdev, cm_id);
return rds_iw_add_cm_id(rds_iwdev, cm_id);
}
void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
/* conn was previously on the nodev_conns_list */
spin_lock_irq(&iw_nodev_conns_lock);
BUG_ON(list_empty(&iw_nodev_conns));
BUG_ON(list_empty(&ic->iw_node));
list_del(&ic->iw_node);
spin_lock(&rds_iwdev->spinlock);
list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
spin_unlock(&rds_iwdev->spinlock);
spin_unlock_irq(&iw_nodev_conns_lock);
ic->rds_iwdev = rds_iwdev;
}
void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
/* place conn on nodev_conns_list */
spin_lock(&iw_nodev_conns_lock);
spin_lock_irq(&rds_iwdev->spinlock);
BUG_ON(list_empty(&ic->iw_node));
list_del(&ic->iw_node);
spin_unlock_irq(&rds_iwdev->spinlock);
list_add_tail(&ic->iw_node, &iw_nodev_conns);
spin_unlock(&iw_nodev_conns_lock);
rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
ic->rds_iwdev = NULL;
}
void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
{
struct rds_iw_connection *ic, *_ic;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(list_lock);
list_splice(list, &tmp_list);
INIT_LIST_HEAD(list);
spin_unlock_irq(list_lock);
list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
rds_conn_destroy(ic->conn);
}
static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
struct scatterlist *list, unsigned int sg_len)
{
sg->list = list;
sg->len = sg_len;
sg->dma_len = 0;
sg->dma_npages = 0;
sg->bytes = 0;
}
static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
struct rds_iw_scatterlist *sg)
{
struct ib_device *dev = rds_iwdev->dev;
u64 *dma_pages = NULL;
int i, j, ret;
WARN_ON(sg->dma_len);
sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
if (unlikely(!sg->dma_len)) {
printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
return ERR_PTR(-EBUSY);
}
sg->bytes = 0;
sg->dma_npages = 0;
ret = -EINVAL;
for (i = 0; i < sg->dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
u64 end_addr;
sg->bytes += dma_len;
end_addr = dma_addr + dma_len;
if (dma_addr & PAGE_MASK) {
if (i > 0)
goto out_unmap;
dma_addr &= ~PAGE_MASK;
}
if (end_addr & PAGE_MASK) {
if (i < sg->dma_len - 1)
goto out_unmap;
end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
}
sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
}
/* Now gather the dma addrs into one list */
if (sg->dma_npages > fastreg_message_size)
goto out_unmap;
dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
if (!dma_pages) {
ret = -ENOMEM;
goto out_unmap;
}
for (i = j = 0; i < sg->dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
u64 end_addr;
end_addr = dma_addr + dma_len;
dma_addr &= ~PAGE_MASK;
for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
dma_pages[j++] = dma_addr;
BUG_ON(j > sg->dma_npages);
}
return dma_pages;
out_unmap:
ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
sg->dma_len = 0;
kfree(dma_pages);
return ERR_PTR(ret);
}
struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
{
struct rds_iw_mr_pool *pool;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
return ERR_PTR(-ENOMEM);
}
pool->device = rds_iwdev;
INIT_LIST_HEAD(&pool->dirty_list);
INIT_LIST_HEAD(&pool->clean_list);
mutex_init(&pool->flush_lock);
spin_lock_init(&pool->list_lock);
INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
pool->max_message_size = fastreg_message_size;
pool->max_items = fastreg_pool_size;
pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
pool->max_pages = fastreg_message_size;
/* We never allow more than max_items MRs to be allocated.
* When we exceed more than max_items_soft, we start freeing
* items more aggressively.
* Make sure that max_items > max_items_soft > max_items / 2
*/
pool->max_items_soft = pool->max_items * 3 / 4;
return pool;
}
void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
{
struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
iinfo->rdma_mr_max = pool->max_items;
iinfo->rdma_mr_size = pool->max_pages;
}
void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
{
flush_workqueue(rds_wq);
rds_iw_flush_mr_pool(pool, 1);
BUG_ON(atomic_read(&pool->item_count));
BUG_ON(atomic_read(&pool->free_pinned));
kfree(pool);
}
static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
{
struct rds_iw_mr *ibmr = NULL;
unsigned long flags;
spin_lock_irqsave(&pool->list_lock, flags);
if (!list_empty(&pool->clean_list)) {
ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
list_del_init(&ibmr->mapping.m_list);
}
spin_unlock_irqrestore(&pool->list_lock, flags);
return ibmr;
}
static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
{
struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
struct rds_iw_mr *ibmr = NULL;
int err = 0, iter = 0;
while (1) {
ibmr = rds_iw_reuse_fmr(pool);
if (ibmr)
return ibmr;
/* No clean MRs - now we have the choice of either
* allocating a fresh MR up to the limit imposed by the
* driver, or flush any dirty unused MRs.
* We try to avoid stalling in the send path if possible,
* so we allocate as long as we're allowed to.
*
* We're fussy with enforcing the FMR limit, though. If the driver
* tells us we can't use more than N fmrs, we shouldn't start
* arguing with it */
if (atomic_inc_return(&pool->item_count) <= pool->max_items)
break;
atomic_dec(&pool->item_count);
if (++iter > 2) {
rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
return ERR_PTR(-EAGAIN);
}
/* We do have some empty MRs. Flush them out. */
rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
rds_iw_flush_mr_pool(pool, 0);
}
ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
if (!ibmr) {
err = -ENOMEM;
goto out_no_cigar;
}
spin_lock_init(&ibmr->mapping.m_lock);
INIT_LIST_HEAD(&ibmr->mapping.m_list);
ibmr->mapping.m_mr = ibmr;
err = rds_iw_init_fastreg(pool, ibmr);
if (err)
goto out_no_cigar;
rds_iw_stats_inc(s_iw_rdma_mr_alloc);
return ibmr;
out_no_cigar:
if (ibmr) {
rds_iw_destroy_fastreg(pool, ibmr);
kfree(ibmr);
}
atomic_dec(&pool->item_count);
return ERR_PTR(err);
}
void rds_iw_sync_mr(void *trans_private, int direction)
{
struct rds_iw_mr *ibmr = trans_private;
struct rds_iw_device *rds_iwdev = ibmr->device;
switch (direction) {
case DMA_FROM_DEVICE:
ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
break;
case DMA_TO_DEVICE:
ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
break;
}
}
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
{
struct rds_iw_mr *ibmr, *next;
LIST_HEAD(unmap_list);
LIST_HEAD(kill_list);
unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
int ret = 0;
rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
mutex_lock(&pool->flush_lock);
spin_lock_irqsave(&pool->list_lock, flags);
/* Get the list of all mappings to be destroyed */
list_splice_init(&pool->dirty_list, &unmap_list);
if (free_all)
list_splice_init(&pool->clean_list, &kill_list);
spin_unlock_irqrestore(&pool->list_lock, flags);
/* Batched invalidate of dirty MRs.
* For FMR based MRs, the mappings on the unmap list are
* actually members of an ibmr (ibmr->mapping). They either
* migrate to the kill_list, or have been cleaned and should be
* moved to the clean_list.
* For fastregs, they will be dynamically allocated, and
* will be destroyed by the unmap function.
*/
if (!list_empty(&unmap_list)) {
ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
&kill_list, &unpinned);
/* If we've been asked to destroy all MRs, move those
* that were simply cleaned to the kill list */
if (free_all)
list_splice_init(&unmap_list, &kill_list);
}
/* Destroy any MRs that are past their best before date */
list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
rds_iw_stats_inc(s_iw_rdma_mr_free);
list_del(&ibmr->mapping.m_list);
rds_iw_destroy_fastreg(pool, ibmr);
kfree(ibmr);
nfreed++;
}
/* Anything that remains are laundered ibmrs, which we can add
* back to the clean list. */
if (!list_empty(&unmap_list)) {
spin_lock_irqsave(&pool->list_lock, flags);
list_splice(&unmap_list, &pool->clean_list);
spin_unlock_irqrestore(&pool->list_lock, flags);
}
atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
atomic_sub(nfreed, &pool->item_count);
mutex_unlock(&pool->flush_lock);
return ret;
}
static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
{
struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
rds_iw_flush_mr_pool(pool, 0);
}
void rds_iw_free_mr(void *trans_private, int invalidate)
{
struct rds_iw_mr *ibmr = trans_private;
struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
if (!pool)
return;
/* Return it to the pool's free list */
rds_iw_free_fastreg(pool, ibmr);
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_work(rds_wq, &pool->flush_worker);
if (invalidate) {
if (likely(!in_interrupt())) {
rds_iw_flush_mr_pool(pool, 0);
} else {
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */
queue_work(rds_wq, &pool->flush_worker);
}
}
}
void rds_iw_flush_mrs(void)
{
struct rds_iw_device *rds_iwdev;
list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
if (pool)
rds_iw_flush_mr_pool(pool, 0);
}
}
void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret)
{
struct rds_iw_device *rds_iwdev;
struct rds_iw_mr *ibmr = NULL;
struct rdma_cm_id *cm_id;
struct sockaddr_in src = {
.sin_addr.s_addr = rs->rs_bound_addr,
.sin_port = rs->rs_bound_port,
};
struct sockaddr_in dst = {
.sin_addr.s_addr = rs->rs_conn_addr,
.sin_port = rs->rs_conn_port,
};
int ret;
ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
if (ret || !cm_id) {
ret = -ENODEV;
goto out;
}
if (!rds_iwdev->mr_pool) {
ret = -ENODEV;
goto out;
}
ibmr = rds_iw_alloc_mr(rds_iwdev);
if (IS_ERR(ibmr))
return ibmr;
ibmr->cm_id = cm_id;
ibmr->device = rds_iwdev;
ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
if (ret == 0)
*key_ret = ibmr->mr->rkey;
else
printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
out:
if (ret) {
if (ibmr)
rds_iw_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret);
}
return ibmr;
}
/*
* iWARP fastreg handling
*
* The life cycle of a fastreg registration is a bit different from
* FMRs.
* The idea behind fastreg is to have one MR, to which we bind different
* mappings over time. To avoid stalling on the expensive map and invalidate
* operations, these operations are pipelined on the same send queue on
* which we want to send the message containing the r_key.
*
* This creates a bit of a problem for us, as we do not have the destination
* IP in GET_MR, so the connection must be setup prior to the GET_MR call for
* RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
* will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
* before queuing the SEND. When completions for these arrive, they are
* dispatched to the MR has a bit set showing that RDMa can be performed.
*
* There is another interesting aspect that's related to invalidation.
* The application can request that a mapping is invalidated in FREE_MR.
* The expectation there is that this invalidation step includes ALL
* PREVIOUSLY FREED MRs.
*/
static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr)
{
struct rds_iw_device *rds_iwdev = pool->device;
struct ib_fast_reg_page_list *page_list = NULL;
struct ib_mr *mr;
int err;
mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
if (IS_ERR(mr)) {
err = PTR_ERR(mr);
printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
return err;
}
/* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
* is not filled in.
*/
page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
if (IS_ERR(page_list)) {
err = PTR_ERR(page_list);
printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
ib_dereg_mr(mr);
return err;
}
ibmr->page_list = page_list;
ibmr->mr = mr;
return 0;
}
static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
{
struct rds_iw_mr *ibmr = mapping->m_mr;
struct ib_send_wr f_wr, *failed_wr;
int ret;
/*
* Perform a WR for the fast_reg_mr. Each individual page
* in the sg list is added to the fast reg page list and placed
* inside the fast_reg_mr WR. The key used is a rolling 8bit
* counter, which should guarantee uniqueness.
*/
ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
mapping->m_rkey = ibmr->mr->rkey;
memset(&f_wr, 0, sizeof(f_wr));
f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
f_wr.opcode = IB_WR_FAST_REG_MR;
f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
f_wr.wr.fast_reg.rkey = mapping->m_rkey;
f_wr.wr.fast_reg.page_list = ibmr->page_list;
f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
f_wr.wr.fast_reg.iova_start = 0;
f_wr.send_flags = IB_SEND_SIGNALED;
failed_wr = &f_wr;
ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
BUG_ON(failed_wr != &f_wr);
if (ret)
printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
return ret;
}
static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
{
struct ib_send_wr s_wr, *failed_wr;
int ret = 0;
if (!ibmr->cm_id->qp || !ibmr->mr)
goto out;
memset(&s_wr, 0, sizeof(s_wr));
s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
s_wr.opcode = IB_WR_LOCAL_INV;
s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
s_wr.send_flags = IB_SEND_SIGNALED;
failed_wr = &s_wr;
ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
if (ret) {
printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
goto out;
}
out:
return ret;
}
static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr,
struct scatterlist *sg,
unsigned int sg_len)
{
struct rds_iw_device *rds_iwdev = pool->device;
struct rds_iw_mapping *mapping = &ibmr->mapping;
u64 *dma_pages;
int i, ret = 0;
rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
if (IS_ERR(dma_pages)) {
ret = PTR_ERR(dma_pages);
dma_pages = NULL;
goto out;
}
if (mapping->m_sg.dma_len > pool->max_message_size) {
ret = -EMSGSIZE;
goto out;
}
for (i = 0; i < mapping->m_sg.dma_npages; ++i)
ibmr->page_list->page_list[i] = dma_pages[i];
ret = rds_iw_rdma_build_fastreg(mapping);
if (ret)
goto out;
rds_iw_stats_inc(s_iw_rdma_mr_used);
out:
kfree(dma_pages);
return ret;
}
/*
* "Free" a fastreg MR.
*/
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr)
{
unsigned long flags;
int ret;
if (!ibmr->mapping.m_sg.dma_len)
return;
ret = rds_iw_rdma_fastreg_inv(ibmr);
if (ret)
return;
/* Try to post the LOCAL_INV WR to the queue. */
spin_lock_irqsave(&pool->list_lock, flags);
list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
spin_unlock_irqrestore(&pool->list_lock, flags);
}
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
struct list_head *kill_list,
int *unpinned)
{
struct rds_iw_mapping *mapping, *next;
unsigned int ncleaned = 0;
LIST_HEAD(laundered);
/* Batched invalidation of fastreg MRs.
* Why do we do it this way, even though we could pipeline unmap
* and remap? The reason is the application semantics - when the
* application requests an invalidation of MRs, it expects all
* previously released R_Keys to become invalid.
*
* If we implement MR reuse naively, we risk memory corruption
* (this has actually been observed). So the default behavior
* requires that a MR goes through an explicit unmap operation before
* we can reuse it again.
*
* We could probably improve on this a little, by allowing immediate
* reuse of a MR on the same socket (eg you could add small
* cache of unused MRs to strct rds_socket - GET_MR could grab one
* of these without requiring an explicit invalidate).
*/
while (!list_empty(unmap_list)) {
unsigned long flags;
spin_lock_irqsave(&pool->list_lock, flags);
list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
*unpinned += mapping->m_sg.len;
list_move(&mapping->m_list, &laundered);
ncleaned++;
}
spin_unlock_irqrestore(&pool->list_lock, flags);
}
/* Move all laundered mappings back to the unmap list.
* We do not kill any WRs right now - it doesn't seem the
* fastreg API has a max_remap limit. */
list_splice_init(&laundered, unmap_list);
return ncleaned;
}
static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
struct rds_iw_mr *ibmr)
{
if (ibmr->page_list)
ib_free_fast_reg_page_list(ibmr->page_list);
if (ibmr->mr)
ib_dereg_mr(ibmr->mr);
}

919
net/rds/iw_recv.c Normal file
View file

@ -0,0 +1,919 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
#include "rds.h"
#include "iw.h"
static struct kmem_cache *rds_iw_incoming_slab;
static struct kmem_cache *rds_iw_frag_slab;
static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
__free_page(frag->f_page);
frag->f_page = NULL;
}
static void rds_iw_frag_free(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
BUG_ON(frag->f_page);
kmem_cache_free(rds_iw_frag_slab, frag);
}
/*
* We map a page at a time. Its fragments are posted in order. This
* is called in fragment order as the fragments get send completion events.
* Only the last frag in the page performs the unmapping.
*
* It's OK for ring cleanup to call this in whatever order it likes because
* DMA is not in flight and so we can unmap while other ring entries still
* hold page references in their frags.
*/
static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
struct rds_iw_recv_work *recv)
{
struct rds_page_frag *frag = recv->r_frag;
rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
if (frag->f_mapped)
ib_dma_unmap_page(ic->i_cm_id->device,
frag->f_mapped,
RDS_FRAG_SIZE, DMA_FROM_DEVICE);
frag->f_mapped = 0;
}
void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
{
struct rds_iw_recv_work *recv;
u32 i;
for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
struct ib_sge *sge;
recv->r_iwinc = NULL;
recv->r_frag = NULL;
recv->r_wr.next = NULL;
recv->r_wr.wr_id = i;
recv->r_wr.sg_list = recv->r_sge;
recv->r_wr.num_sge = RDS_IW_RECV_SGE;
sge = rds_iw_data_sge(ic, recv->r_sge);
sge->addr = 0;
sge->length = RDS_FRAG_SIZE;
sge->lkey = 0;
sge = rds_iw_header_sge(ic, recv->r_sge);
sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = 0;
}
}
static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
struct rds_iw_recv_work *recv)
{
if (recv->r_iwinc) {
rds_inc_put(&recv->r_iwinc->ii_inc);
recv->r_iwinc = NULL;
}
if (recv->r_frag) {
rds_iw_recv_unmap_page(ic, recv);
if (recv->r_frag->f_page)
rds_iw_frag_drop_page(recv->r_frag);
rds_iw_frag_free(recv->r_frag);
recv->r_frag = NULL;
}
}
void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
{
u32 i;
for (i = 0; i < ic->i_recv_ring.w_nr; i++)
rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
if (ic->i_frag.f_page)
rds_iw_frag_drop_page(&ic->i_frag);
}
static int rds_iw_recv_refill_one(struct rds_connection *conn,
struct rds_iw_recv_work *recv,
gfp_t kptr_gfp, gfp_t page_gfp)
{
struct rds_iw_connection *ic = conn->c_transport_data;
dma_addr_t dma_addr;
struct ib_sge *sge;
int ret = -ENOMEM;
if (!recv->r_iwinc) {
if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
rds_iw_stats_inc(s_iw_rx_alloc_limit);
goto out;
}
recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
kptr_gfp);
if (!recv->r_iwinc) {
atomic_dec(&rds_iw_allocation);
goto out;
}
INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
}
if (!recv->r_frag) {
recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
if (!recv->r_frag)
goto out;
INIT_LIST_HEAD(&recv->r_frag->f_item);
recv->r_frag->f_page = NULL;
}
if (!ic->i_frag.f_page) {
ic->i_frag.f_page = alloc_page(page_gfp);
if (!ic->i_frag.f_page)
goto out;
ic->i_frag.f_offset = 0;
}
dma_addr = ib_dma_map_page(ic->i_cm_id->device,
ic->i_frag.f_page,
ic->i_frag.f_offset,
RDS_FRAG_SIZE,
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
goto out;
/*
* Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
* must be called on this recv. This happens as completions hit
* in order or on connection shutdown.
*/
recv->r_frag->f_page = ic->i_frag.f_page;
recv->r_frag->f_offset = ic->i_frag.f_offset;
recv->r_frag->f_mapped = dma_addr;
sge = rds_iw_data_sge(ic, recv->r_sge);
sge->addr = dma_addr;
sge->length = RDS_FRAG_SIZE;
sge = rds_iw_header_sge(ic, recv->r_sge);
sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
sge->length = sizeof(struct rds_header);
get_page(recv->r_frag->f_page);
if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
ic->i_frag.f_offset += RDS_FRAG_SIZE;
} else {
put_page(ic->i_frag.f_page);
ic->i_frag.f_page = NULL;
ic->i_frag.f_offset = 0;
}
ret = 0;
out:
return ret;
}
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
* pairs don't go unmatched.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_recv_work *recv;
struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
u32 pos;
while ((prefill || rds_conn_up(conn)) &&
rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
ret = -EINVAL;
break;
}
recv = &ic->i_recvs[pos];
ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
if (ret) {
ret = -1;
break;
}
/* XXX when can this fail? */
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
recv->r_iwinc, recv->r_frag->f_page,
(long) recv->r_frag->f_mapped, ret);
if (ret) {
rds_iw_conn_error(conn, "recv post on "
"%pI4 returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
ret = -1;
break;
}
posted++;
}
/* We're doing flow control - update the window. */
if (ic->i_flowctl && posted)
rds_iw_advertise_credits(conn, posted);
if (ret)
rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
return ret;
}
static void rds_iw_inc_purge(struct rds_incoming *inc)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
struct rds_page_frag *pos;
iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
list_del_init(&frag->f_item);
rds_iw_frag_drop_page(frag);
rds_iw_frag_free(frag);
}
}
void rds_iw_inc_free(struct rds_incoming *inc)
{
struct rds_iw_incoming *iwinc;
iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
rds_iw_inc_purge(inc);
rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
BUG_ON(!list_empty(&iwinc->ii_frags));
kmem_cache_free(rds_iw_incoming_slab, iwinc);
atomic_dec(&rds_iw_allocation);
BUG_ON(atomic_read(&rds_iw_allocation) < 0);
}
int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
size_t size)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
struct iovec *iov = first_iov;
unsigned long to_copy;
unsigned long frag_off = 0;
unsigned long iov_off = 0;
int copied = 0;
int ret;
u32 len;
iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
len = be32_to_cpu(inc->i_hdr.h_len);
while (copied < size && copied < len) {
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
while (iov_off == iov->iov_len) {
iov_off = 0;
iov++;
}
to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
"[%p, %lu] + %lu\n",
to_copy, iov->iov_base, iov->iov_len, iov_off,
frag->f_page, frag->f_offset, frag_off);
/* XXX needs + offset for multiple recvs per page */
ret = rds_page_copy_to_user(frag->f_page,
frag->f_offset + frag_off,
iov->iov_base + iov_off,
to_copy);
if (ret) {
copied = ret;
break;
}
iov_off += to_copy;
frag_off += to_copy;
copied += to_copy;
}
return copied;
}
/* ic starts out kzalloc()ed */
void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
{
struct ib_send_wr *wr = &ic->i_ack_wr;
struct ib_sge *sge = &ic->i_ack_sge;
sge->addr = ic->i_ack_dma;
sge->length = sizeof(struct rds_header);
sge->lkey = rds_iw_local_dma_lkey(ic);
wr->sg_list = sge;
wr->num_sge = 1;
wr->opcode = IB_WR_SEND;
wr->wr_id = RDS_IW_ACK_WR_ID;
wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
/*
* You'd think that with reliable IB connections you wouldn't need to ack
* messages that have been received. The problem is that IB hardware generates
* an ack message before it has DMAed the message into memory. This creates a
* potential message loss if the HCA is disabled for any reason between when it
* sends the ack and before the message is DMAed and processed. This is only a
* potential issue if another HCA is available for fail-over.
*
* When the remote host receives our ack they'll free the sent message from
* their send queue. To decrease the latency of this we always send an ack
* immediately after we've received messages.
*
* For simplicity, we only have one ack in flight at a time. This puts
* pressure on senders to have deep enough send queues to absorb the latency of
* a single ack frame being in flight. This might not be good enough.
*
* This is implemented by have a long-lived send_wr and sge which point to a
* statically allocated ack frame. This ack wr does not fall under the ring
* accounting that the tx and rx wrs do. The QP attribute specifically makes
* room for it beyond the ring size. Send completion notices its special
* wr_id and avoids working with the ring in that case.
*/
#ifndef KERNEL_HAS_ATOMIC64
static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
int ack_required)
{
unsigned long flags;
spin_lock_irqsave(&ic->i_ack_lock, flags);
ic->i_ack_next = seq;
if (ack_required)
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
spin_unlock_irqrestore(&ic->i_ack_lock, flags);
}
static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
{
unsigned long flags;
u64 seq;
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
spin_lock_irqsave(&ic->i_ack_lock, flags);
seq = ic->i_ack_next;
spin_unlock_irqrestore(&ic->i_ack_lock, flags);
return seq;
}
#else
static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
int ack_required)
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
smp_mb__before_atomic();
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
}
static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
{
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
smp_mb__after_atomic();
return atomic64_read(&ic->i_ack_next);
}
#endif
static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
{
struct rds_header *hdr = ic->i_ack;
struct ib_send_wr *failed_wr;
u64 seq;
int ret;
seq = rds_iw_get_ack(ic);
rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
rds_message_populate_header(hdr, 0, 0, 0);
hdr->h_ack = cpu_to_be64(seq);
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
ic->i_ack_queued = jiffies;
ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
if (unlikely(ret)) {
/* Failed to send. Release the WR, and
* force another ACK.
*/
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_iw_stats_inc(s_iw_ack_send_failure);
rds_iw_conn_error(ic->conn, "sending ack failed\n");
} else
rds_iw_stats_inc(s_iw_ack_sent);
}
/*
* There are 3 ways of getting acknowledgements to the peer:
* 1. We call rds_iw_attempt_ack from the recv completion handler
* to send an ACK-only frame.
* However, there can be only one such frame in the send queue
* at any time, so we may have to postpone it.
* 2. When another (data) packet is transmitted while there's
* an ACK in the queue, we piggyback the ACK sequence number
* on the data packet.
* 3. If the ACK WR is done sending, we get called from the
* send queue completion handler, and check whether there's
* another ACK pending (postponed because the WR was on the
* queue). If so, we transmit it.
*
* We maintain 2 variables:
* - i_ack_flags, which keeps track of whether the ACK WR
* is currently in the send queue or not (IB_ACK_IN_FLIGHT)
* - i_ack_next, which is the last sequence number we received
*
* Potentially, send queue and receive queue handlers can run concurrently.
* It would be nice to not have to use a spinlock to synchronize things,
* but the one problem that rules this out is that 64bit updates are
* not atomic on all platforms. Things would be a lot simpler if
* we had atomic64 or maybe cmpxchg64 everywhere.
*
* Reconnecting complicates this picture just slightly. When we
* reconnect, we may be seeing duplicate packets. The peer
* is retransmitting them, because it hasn't seen an ACK for
* them. It is important that we ACK these.
*
* ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
* this flag set *MUST* be acknowledged immediately.
*/
/*
* When we get here, we're called from the recv queue handler.
* Check whether we ought to transmit an ACK.
*/
void rds_iw_attempt_ack(struct rds_iw_connection *ic)
{
unsigned int adv_credits;
if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
return;
if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
rds_iw_stats_inc(s_iw_ack_send_delayed);
return;
}
/* Can we get a send credit? */
if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
rds_iw_stats_inc(s_iw_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
}
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_iw_send_ack(ic, adv_credits);
}
/*
* We get here from the send completion handler, when the
* adapter tells us the ACK frame was sent.
*/
void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
{
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
rds_iw_attempt_ack(ic);
}
/*
* This is called by the regular xmit code when it wants to piggyback
* an ACK on an outgoing frame.
*/
u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
{
if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
rds_iw_stats_inc(s_iw_ack_send_piggybacked);
return rds_iw_get_ack(ic);
}
/*
* It's kind of lame that we're copying from the posted receive pages into
* long-lived bitmaps. We could have posted the bitmaps and rdma written into
* them. But receiving new congestion bitmaps should be a *rare* event, so
* hopefully we won't need to invest that complexity in making it more
* efficient. By copying we can share a simpler core with TCP which has to
* copy.
*/
static void rds_iw_cong_recv(struct rds_connection *conn,
struct rds_iw_incoming *iwinc)
{
struct rds_cong_map *map;
unsigned int map_off;
unsigned int map_page;
struct rds_page_frag *frag;
unsigned long frag_off;
unsigned long to_copy;
unsigned long copied;
uint64_t uncongested = 0;
void *addr;
/* catch completely corrupt packets */
if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
return;
map = conn->c_fcong;
map_page = 0;
map_off = 0;
frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
frag_off = 0;
copied = 0;
while (copied < RDS_CONG_MAP_BYTES) {
uint64_t *src, *dst;
unsigned int k;
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
addr = kmap_atomic(frag->f_page);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
for (k = 0; k < to_copy; k += 8) {
/* Record ports that became uncongested, ie
* bits that changed from 0 to 1. */
uncongested |= ~(*src) & *dst;
*dst++ = *src++;
}
kunmap_atomic(addr);
copied += to_copy;
map_off += to_copy;
if (map_off == PAGE_SIZE) {
map_off = 0;
map_page++;
}
frag_off += to_copy;
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
}
/* the congestion map is in little endian order */
uncongested = le64_to_cpu(uncongested);
rds_cong_map_updated(map, uncongested);
}
/*
* Rings are posted with all the allocations they'll need to queue the
* incoming message to the receiving socket so this can't fail.
* All fragments start with a header, so we can make sure we're not receiving
* garbage, and we can tell a small 8 byte fragment from an ACK frame.
*/
struct rds_iw_ack_state {
u64 ack_next;
u64 ack_recv;
unsigned int ack_required:1;
unsigned int ack_next_valid:1;
unsigned int ack_recv_valid:1;
};
static void rds_iw_process_recv(struct rds_connection *conn,
struct rds_iw_recv_work *recv, u32 byte_len,
struct rds_iw_ack_state *state)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_incoming *iwinc = ic->i_iwinc;
struct rds_header *ihdr, *hdr;
/* XXX shut down the connection if port 0,0 are seen? */
rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
byte_len);
if (byte_len < sizeof(struct rds_header)) {
rds_iw_conn_error(conn, "incoming message "
"from %pI4 didn't include a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
return;
}
byte_len -= sizeof(struct rds_header);
ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_iw_conn_error(conn, "incoming message "
"from %pI4 has corrupted header - "
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
return;
}
/* Process the ACK sequence which comes with every packet */
state->ack_recv = be64_to_cpu(ihdr->h_ack);
state->ack_recv_valid = 1;
/* Process the credits update if there was one */
if (ihdr->h_credit)
rds_iw_send_add_credits(conn, ihdr->h_credit);
if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
/* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
*/
rds_iw_stats_inc(s_iw_ack_received);
/*
* Usually the frags make their way on to incs and are then freed as
* the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use
* of a partial page. We can leave the frag, though, it will be
* reused.
*
* FIXME: Fold this into the code path below.
*/
rds_iw_frag_drop_page(recv->r_frag);
return;
}
/*
* If we don't already have an inc on the connection then this
* fragment has a header and starts a message.. copy its header
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
if (!iwinc) {
iwinc = recv->r_iwinc;
recv->r_iwinc = NULL;
ic->i_iwinc = iwinc;
hdr = &iwinc->ii_inc.i_hdr;
memcpy(hdr, ihdr, sizeof(*hdr));
ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
ic->i_recv_data_rem, hdr->h_flags);
} else {
hdr = &iwinc->ii_inc.i_hdr;
/* We can't just use memcmp here; fragments of a
* single message may carry different ACKs */
if (hdr->h_sequence != ihdr->h_sequence ||
hdr->h_len != ihdr->h_len ||
hdr->h_sport != ihdr->h_sport ||
hdr->h_dport != ihdr->h_dport) {
rds_iw_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
return;
}
}
list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
recv->r_frag = NULL;
if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
ic->i_recv_data_rem -= RDS_FRAG_SIZE;
else {
ic->i_recv_data_rem = 0;
ic->i_iwinc = NULL;
if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_iw_cong_recv(conn, iwinc);
else {
rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
&iwinc->ii_inc, GFP_ATOMIC);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
}
/* Evaluate the ACK_REQUIRED flag *after* we received
* the complete frame, and after bumping the next_rx
* sequence. */
if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
rds_stats_inc(s_recv_ack_required);
state->ack_required = 1;
}
rds_inc_put(&iwinc->ii_inc);
}
}
/*
* Plucking the oldest entry from the ring can be done concurrently with
* the thread refilling the ring. Each ring operation is protected by
* spinlocks and the transient state of refilling doesn't change the
* recording of which entry is oldest.
*
* This relies on IB only calling one cq comp_handler for each cq so that
* there will only be one caller of rds_recv_incoming() per RDS connection.
*/
void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
{
struct rds_connection *conn = context;
struct rds_iw_connection *ic = conn->c_transport_data;
rdsdebug("conn %p cq %p\n", conn, cq);
rds_iw_stats_inc(s_iw_rx_cq_call);
tasklet_schedule(&ic->i_recv_tasklet);
}
static inline void rds_poll_cq(struct rds_iw_connection *ic,
struct rds_iw_ack_state *state)
{
struct rds_connection *conn = ic->conn;
struct ib_wc wc;
struct rds_iw_recv_work *recv;
while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_iw_stats_inc(s_iw_rx_cq_event);
recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
rds_iw_recv_unmap_page(ic, recv);
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
/* We expect errors as the qp is drained during shutdown */
if (wc.status == IB_WC_SUCCESS) {
rds_iw_process_recv(conn, recv, wc.byte_len, state);
} else {
rds_iw_conn_error(conn, "recv completion on "
"%pI4 had status %u, disconnecting and "
"reconnecting\n", &conn->c_faddr,
wc.status);
}
}
rds_iw_ring_free(&ic->i_recv_ring, 1);
}
}
void rds_iw_recv_tasklet_fn(unsigned long data)
{
struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
struct rds_connection *conn = ic->conn;
struct rds_iw_ack_state state = { 0, };
rds_poll_cq(ic, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
rds_poll_cq(ic, &state);
if (state.ack_next_valid)
rds_iw_set_ack(ic, state.ack_next, state.ack_required);
if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
rds_send_drop_acked(conn, state.ack_recv, NULL);
ic->i_ack_recv = state.ack_recv;
}
if (rds_conn_up(conn))
rds_iw_attempt_ack(ic);
/* If we ever end up with a really empty receive ring, we're
* in deep trouble, as the sender will definitely see RNR
* timeouts. */
if (rds_iw_ring_empty(&ic->i_recv_ring))
rds_iw_stats_inc(s_iw_rx_ring_empty);
/*
* If the ring is running low, then schedule the thread to refill.
*/
if (rds_iw_ring_low(&ic->i_recv_ring))
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
int rds_iw_recv(struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
int ret = 0;
rdsdebug("conn %p\n", conn);
/*
* If we get a temporary posting failure in this context then
* we're really low and we want the caller to back off for a bit.
*/
mutex_lock(&ic->i_recv_mutex);
if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
ret = -ENOMEM;
else
rds_iw_stats_inc(s_iw_rx_refill_from_thread);
mutex_unlock(&ic->i_recv_mutex);
if (rds_conn_up(conn))
rds_iw_attempt_ack(ic);
return ret;
}
int rds_iw_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
/* Default to 30% of all available RAM for recv memory */
si_meminfo(&si);
rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
sizeof(struct rds_iw_incoming),
0, 0, NULL);
if (!rds_iw_incoming_slab)
goto out;
rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
sizeof(struct rds_page_frag),
0, 0, NULL);
if (!rds_iw_frag_slab)
kmem_cache_destroy(rds_iw_incoming_slab);
else
ret = 0;
out:
return ret;
}
void rds_iw_recv_exit(void)
{
kmem_cache_destroy(rds_iw_incoming_slab);
kmem_cache_destroy(rds_iw_frag_slab);
}

169
net/rds/iw_ring.c Normal file
View file

@ -0,0 +1,169 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include "rds.h"
#include "iw.h"
/*
* Locking for IB rings.
* We assume that allocation is always protected by a mutex
* in the caller (this is a valid assumption for the current
* implementation).
*
* Freeing always happens in an interrupt, and hence only
* races with allocations, but not with other free()s.
*
* The interaction between allocation and freeing is that
* the alloc code has to determine the number of free entries.
* To this end, we maintain two counters; an allocation counter
* and a free counter. Both are allowed to run freely, and wrap
* around.
* The number of used entries is always (alloc_ctr - free_ctr) % NR.
*
* The current implementation makes free_ctr atomic. When the
* caller finds an allocation fails, it should set an "alloc fail"
* bit and retry the allocation. The "alloc fail" bit essentially tells
* the CQ completion handlers to wake it up after freeing some
* more entries.
*/
/*
* This only happens on shutdown.
*/
DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
{
memset(ring, 0, sizeof(*ring));
ring->w_nr = nr;
rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
}
static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
{
u32 diff;
/* This assumes that atomic_t has at least as many bits as u32 */
diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
BUG_ON(diff > ring->w_nr);
return diff;
}
void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
{
/* We only ever get called from the connection setup code,
* prior to creating the QP. */
BUG_ON(__rds_iw_ring_used(ring));
ring->w_nr = nr;
}
static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
{
return __rds_iw_ring_used(ring) == 0;
}
u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
{
u32 ret = 0, avail;
avail = ring->w_nr - __rds_iw_ring_used(ring);
rdsdebug("ring %p val %u next %u free %u\n", ring, val,
ring->w_alloc_ptr, avail);
if (val && avail) {
ret = min(val, avail);
*pos = ring->w_alloc_ptr;
ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
ring->w_alloc_ctr += ret;
}
return ret;
}
void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
{
ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
atomic_add(val, &ring->w_free_ctr);
if (__rds_iw_ring_empty(ring) &&
waitqueue_active(&rds_iw_ring_empty_wait))
wake_up(&rds_iw_ring_empty_wait);
}
void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
{
ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
ring->w_alloc_ctr -= val;
}
int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
{
return __rds_iw_ring_empty(ring);
}
int rds_iw_ring_low(struct rds_iw_work_ring *ring)
{
return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
}
/*
* returns the oldest alloced ring entry. This will be the next one
* freed. This can't be called if there are none allocated.
*/
u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
{
return ring->w_free_ptr;
}
/*
* returns the number of completed work requests.
*/
u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
{
u32 ret;
if (oldest <= (unsigned long long)wr_id)
ret = (unsigned long long)wr_id - oldest + 1;
else
ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
wr_id, oldest);
return ret;
}

974
net/rds/iw_send.c Normal file
View file

@ -0,0 +1,974 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/ratelimit.h>
#include "rds.h"
#include "iw.h"
static void rds_iw_send_rdma_complete(struct rds_message *rm,
int wc_status)
{
int notify_status;
switch (wc_status) {
case IB_WC_WR_FLUSH_ERR:
return;
case IB_WC_SUCCESS:
notify_status = RDS_RDMA_SUCCESS;
break;
case IB_WC_REM_ACCESS_ERR:
notify_status = RDS_RDMA_REMOTE_ERROR;
break;
default:
notify_status = RDS_RDMA_OTHER_ERROR;
break;
}
rds_rdma_send_complete(rm, notify_status);
}
static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
struct rm_rdma_op *op)
{
if (op->op_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
op->op_sg, op->op_nents,
op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
op->op_mapped = 0;
}
}
static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
struct rds_iw_send_work *send,
int wc_status)
{
struct rds_message *rm = send->s_rm;
rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
ib_dma_unmap_sg(ic->i_cm_id->device,
rm->data.op_sg, rm->data.op_nents,
DMA_TO_DEVICE);
if (rm->rdma.op_active) {
rds_iw_send_unmap_rdma(ic, &rm->rdma);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
* 1. Notify when we received the ACK on the RDS message
* that was queued with the RDMA. This provides reliable
* notification of RDMA status at the expense of a one-way
* packet delay.
* 2. Notify when the IB stack gives us the completion event for
* the RDMA operation.
* 3. Notify when the IB stack gives us the completion event for
* the accompanying RDS messages.
* Here, we implement approach #3. To implement approach #2,
* call rds_rdma_send_complete from the cq_handler. To implement #1,
* don't call rds_rdma_send_complete at all, and fall back to the notify
* handling in the ACK processing code.
*
* Note: There's no need to explicitly sync any RDMA buffers using
* ib_dma_sync_sg_for_cpu - the completion for the RDMA
* operation itself unmapped the RDMA buffers, which takes care
* of synching.
*/
rds_iw_send_rdma_complete(rm, wc_status);
if (rm->rdma.op_write)
rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
else
rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
}
/* If anyone waited for this message to get flushed out, wake
* them up now */
rds_message_unmapped(rm);
rds_message_put(rm);
send->s_rm = NULL;
}
void rds_iw_send_init_ring(struct rds_iw_connection *ic)
{
struct rds_iw_send_work *send;
u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
struct ib_sge *sge;
send->s_rm = NULL;
send->s_op = NULL;
send->s_mapping = NULL;
send->s_wr.next = NULL;
send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
send->s_wr.num_sge = 1;
send->s_wr.opcode = IB_WR_SEND;
send->s_wr.send_flags = 0;
send->s_wr.ex.imm_data = 0;
sge = rds_iw_data_sge(ic, send->s_sge);
sge->lkey = 0;
sge = rds_iw_header_sge(ic, send->s_sge);
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = 0;
send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
if (IS_ERR(send->s_mr)) {
printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
break;
}
send->s_page_list = ib_alloc_fast_reg_page_list(
ic->i_cm_id->device, fastreg_message_size);
if (IS_ERR(send->s_page_list)) {
printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
break;
}
}
}
void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
{
struct rds_iw_send_work *send;
u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
BUG_ON(!send->s_mr);
ib_dereg_mr(send->s_mr);
BUG_ON(!send->s_page_list);
ib_free_fast_reg_page_list(send->s_page_list);
if (send->s_wr.opcode == 0xdead)
continue;
if (send->s_rm)
rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
if (send->s_op)
rds_iw_send_unmap_rdma(ic, send->s_op);
}
}
/*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially
* unallocs the next free entry in the ring it doesn't alter which is
* the next to be freed, which is what this is concerned with.
*/
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
{
struct rds_connection *conn = context;
struct rds_iw_connection *ic = conn->c_transport_data;
struct ib_wc wc;
struct rds_iw_send_work *send;
u32 completed;
u32 oldest;
u32 i;
int ret;
rdsdebug("cq %p conn %p\n", cq, conn);
rds_iw_stats_inc(s_iw_tx_cq_call);
ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
if (ret)
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
while (ib_poll_cq(cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_iw_stats_inc(s_iw_tx_cq_event);
if (wc.status != IB_WC_SUCCESS) {
printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
break;
}
if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
ic->i_fastreg_posted = 0;
continue;
}
if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
ic->i_fastreg_posted = 1;
continue;
}
if (wc.wr_id == RDS_IW_ACK_WR_ID) {
if (time_after(jiffies, ic->i_ack_queued + HZ/2))
rds_iw_stats_inc(s_iw_tx_stalled);
rds_iw_ack_send_complete(ic);
continue;
}
oldest = rds_iw_ring_oldest(&ic->i_send_ring);
completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
/* In the error case, wc.opcode sometimes contains garbage */
switch (send->s_wr.opcode) {
case IB_WR_SEND:
if (send->s_rm)
rds_iw_send_unmap_rm(ic, send, wc.status);
break;
case IB_WR_FAST_REG_MR:
case IB_WR_RDMA_WRITE:
case IB_WR_RDMA_READ:
case IB_WR_RDMA_READ_WITH_INV:
/* Nothing to be done - the SG list will be unmapped
* when the SEND completes. */
break;
default:
printk_ratelimited(KERN_NOTICE
"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
__func__, send->s_wr.opcode);
break;
}
send->s_wr.opcode = 0xdead;
send->s_wr.num_sge = 1;
if (time_after(jiffies, send->s_queued + HZ/2))
rds_iw_stats_inc(s_iw_tx_stalled);
/* If a RDMA operation produced an error, signal this right
* away. If we don't, the subsequent SEND that goes with this
* RDMA will be canceled with ERR_WFLUSH, and the application
* never learn that the RDMA failed. */
if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
struct rds_message *rm;
rm = rds_send_get_message(conn, send->s_op);
if (rm)
rds_iw_send_rdma_complete(rm, wc.status);
}
oldest = (oldest + 1) % ic->i_send_ring.w_nr;
}
rds_iw_ring_free(&ic->i_send_ring, completed);
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
rds_iw_conn_error(conn,
"send completion on %pI4 "
"had status %u, disconnecting and reconnecting\n",
&conn->c_faddr, wc.status);
}
}
}
/*
* This is the main function for allocating credits when sending
* messages.
*
* Conceptually, we have two counters:
* - send credits: this tells us how many WRs we're allowed
* to submit without overruning the receiver's queue. For
* each SEND WR we post, we decrement this by one.
*
* - posted credits: this tells us how many WRs we recently
* posted to the receive queue. This value is transferred
* to the peer as a "credit update" in a RDS header field.
* Every time we transmit credits to the peer, we subtract
* the amount of transferred credits from this counter.
*
* It is essential that we avoid situations where both sides have
* exhausted their send credits, and are unable to send new credits
* to the peer. We achieve this by requiring that we send at least
* one credit update to the peer before exhausting our credits.
* When new credits arrive, we subtract one credit that is withheld
* until we've posted new buffers and are ready to transmit these
* credits (see rds_iw_send_add_credits below).
*
* The RDS send code is essentially single-threaded; rds_send_xmit
* grabs c_send_lock to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with
* message SENDs.
*
* In the send path, we need to update the counters for send credits
* and the counter of posted buffers atomically - when we use the
* last available credit, we cannot allow another thread to race us
* and grab the posted credits counter. Hence, we have to use a
* spinlock to protect the credit counter, or use atomics.
*
* Spinlocks shared between the send and the receive path are bad,
* because they create unnecessary delays. An early implementation
* using a spinlock showed a 5% degradation in throughput at some
* loads.
*
* This implementation avoids spinlocks completely, putting both
* counters into a single atomic, and updating that atomic using
* atomic_add (in the receive path, when receiving fresh credits),
* and using atomic_cmpxchg when updating the two counters.
*/
int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
{
unsigned int avail, posted, got = 0, advertise;
long oldval, newval;
*adv_credits = 0;
if (!ic->i_flowctl)
return wanted;
try_again:
advertise = 0;
oldval = newval = atomic_read(&ic->i_credits);
posted = IB_GET_POST_CREDITS(oldval);
avail = IB_GET_SEND_CREDITS(oldval);
rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
wanted, avail, posted);
/* The last credit must be used to send a credit update. */
if (avail && !posted)
avail--;
if (avail < wanted) {
struct rds_connection *conn = ic->i_cm_id->context;
/* Oops, there aren't that many credits left! */
set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
got = avail;
} else {
/* Sometimes you get what you want, lalala. */
got = wanted;
}
newval -= IB_SET_SEND_CREDITS(got);
/*
* If need_posted is non-zero, then the caller wants
* the posted regardless of whether any send credits are
* available.
*/
if (posted && (got || need_posted)) {
advertise = min_t(unsigned int, posted, max_posted);
newval -= IB_SET_POST_CREDITS(advertise);
}
/* Finally bill everything */
if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
goto try_again;
*adv_credits = advertise;
return got;
}
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
{
struct rds_iw_connection *ic = conn->c_transport_data;
if (credits == 0)
return;
rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
credits,
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
rds_iw_stats_inc(s_iw_rx_credit_updates);
}
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
{
struct rds_iw_connection *ic = conn->c_transport_data;
if (posted == 0)
return;
atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
/* Decide whether to send an update to the peer now.
* If we would send a credit update for every single buffer we
* post, we would end up with an ACK storm (ACK arrives,
* consumes buffer, we refill the ring, send ACK to remote
* advertising the newly posted buffer... ad inf)
*
* Performance pretty much depends on how often we send
* credit updates - too frequent updates mean lots of ACKs.
* Too infrequent updates, and the peer will run out of
* credits and has to throttle.
* For the time being, 16 seems to be a good compromise.
*/
if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
static inline void
rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
struct rds_iw_send_work *send, unsigned int pos,
unsigned long buffer, unsigned int length,
int send_flags)
{
struct ib_sge *sge;
WARN_ON(pos != send - ic->i_sends);
send->s_wr.send_flags = send_flags;
send->s_wr.opcode = IB_WR_SEND;
send->s_wr.num_sge = 2;
send->s_wr.next = NULL;
send->s_queued = jiffies;
send->s_op = NULL;
if (length != 0) {
sge = rds_iw_data_sge(ic, send->s_sge);
sge->addr = buffer;
sge->length = length;
sge->lkey = rds_iw_local_dma_lkey(ic);
sge = rds_iw_header_sge(ic, send->s_sge);
} else {
/* We're sending a packet with no payload. There is only
* one SGE */
send->s_wr.num_sge = 1;
sge = &send->s_sge[0];
}
sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = rds_iw_local_dma_lkey(ic);
}
/*
* This can be called multiple times for a given message. The first time
* we see a message we map its scatterlist into the IB device so that
* we can provide that mapped address to the IB scatter gather entries
* in the IB work requests. We translate the scatterlist into a series
* of work requests that fragment the message. These work requests complete
* in order so we pass ownership of the message to the completion handler
* once we send the final fragment.
*
* The RDS core uses the c_send_lock to only enter this function once
* per connection. This makes sure that the tx ring alloc/unalloc pairs
* don't get out of sync and confuse the ring.
*/
int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct ib_device *dev = ic->i_cm_id->device;
struct rds_iw_send_work *send = NULL;
struct rds_iw_send_work *first;
struct rds_iw_send_work *prev;
struct ib_send_wr *failed_wr;
struct scatterlist *scat;
u32 pos;
u32 i;
u32 work_alloc;
u32 credit_alloc;
u32 posted;
u32 adv_credits = 0;
int send_flags = 0;
int sent;
int ret;
int flow_controlled = 0;
BUG_ON(off % RDS_FRAG_SIZE);
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
/* Fastreg support */
if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
ret = -EAGAIN;
goto out;
}
/* FIXME we may overallocate here */
if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
i = 1;
else
i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc == 0) {
set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
rds_iw_stats_inc(s_iw_tx_ring_full);
ret = -ENOMEM;
goto out;
}
credit_alloc = work_alloc;
if (ic->i_flowctl) {
credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
work_alloc = credit_alloc;
flow_controlled++;
}
if (work_alloc == 0) {
set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
rds_iw_stats_inc(s_iw_tx_throttle);
ret = -ENOMEM;
goto out;
}
}
/* map the message the first time we see it */
if (!ic->i_rm) {
/*
printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
be16_to_cpu(rm->m_inc.i_hdr.h_dport),
rm->m_inc.i_hdr.h_flags,
be32_to_cpu(rm->m_inc.i_hdr.h_len));
*/
if (rm->data.op_nents) {
rm->data.op_count = ib_dma_map_sg(dev,
rm->data.op_sg,
rm->data.op_nents,
DMA_TO_DEVICE);
rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
if (rm->data.op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
rm->data.op_count = 0;
}
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
rds_message_addref(rm);
ic->i_rm = rm;
/* Finalize the header */
if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
struct rds_ext_header_rdma ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
if (rm->m_rdma_cookie) {
rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
rds_rdma_cookie_key(rm->m_rdma_cookie),
rds_rdma_cookie_offset(rm->m_rdma_cookie));
}
/* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
* we should not do this unless we have a chance of at least
* sticking the header into the send ring. Which is why we
* should call rds_iw_ring_alloc first. */
rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
rds_message_make_checksum(&rm->m_inc.i_hdr);
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
adv_credits += posted;
BUG_ON(adv_credits > 255);
}
send = &ic->i_sends[pos];
first = send;
prev = NULL;
scat = &rm->data.op_sg[sg];
sent = 0;
i = 0;
/* Sometimes you want to put a fence between an RDMA
* READ and the following SEND.
* We could either do this all the time
* or when requested by the user. Right now, we let
* the application choose.
*/
if (rm->rdma.op_active && rm->rdma.op_fence)
send_flags = IB_SEND_FENCE;
/*
* We could be copying the header into the unused tail of the page.
* That would need to be changed in the future when those pages might
* be mapped userspace pages or page cache pages. So instead we always
* use a second sge and our long-lived ring of mapped headers. We send
* the header after the data so that the data payload can be aligned on
* the receiver.
*/
/* handle a 0-len message */
if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
goto add_header;
}
/* if there's data reference it with a chain of work reqs */
for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
unsigned int len;
send = &ic->i_sends[pos];
len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
rds_iw_xmit_populate_wr(ic, send, pos,
ib_sg_dma_address(dev, scat) + off, len,
send_flags);
/*
* We want to delay signaling completions just enough to get
* the batching benefits but not so much that we create dead time
* on the wire.
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
ic->i_unsignaled_bytes -= len;
if (ic->i_unsignaled_bytes <= 0) {
ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
/*
* Always signal the last one if we're stopping due to flow control.
*/
if (flow_controlled && i == (work_alloc-1))
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
sent += len;
off += len;
if (off == ib_sg_dma_len(dev, scat)) {
scat++;
off = 0;
}
add_header:
/* Tack on the header after the data. The header SGE should already
* have been set up to point to the right header buffer. */
memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
if (0) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
be16_to_cpu(hdr->h_dport),
hdr->h_flags,
be32_to_cpu(hdr->h_len));
}
if (adv_credits) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
hdr->h_credit = adv_credits;
rds_message_make_checksum(hdr);
adv_credits = 0;
rds_iw_stats_inc(s_iw_tx_credit_updates);
}
if (prev)
prev->s_wr.next = &send->s_wr;
prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr;
}
/* Account the RDS header in the number of bytes we sent, but just once.
* The caller has no concept of fragmentation. */
if (hdr_off == 0)
sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
if (scat == &rm->data.op_sg[rm->data.op_count]) {
prev->s_rm = ic->i_rm;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL;
}
if (i < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
}
if (ic->i_flowctl && i < credit_alloc)
rds_iw_send_add_credits(conn, credit_alloc - i);
/* XXX need to worry about failed_wr and partial sends. */
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
first, &first->s_wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_wr);
if (ret) {
printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
if (prev->s_rm) {
ic->i_rm = prev->s_rm;
prev->s_rm = NULL;
}
goto out;
}
ret = sent;
out:
BUG_ON(adv_credits);
return ret;
}
static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
{
BUG_ON(nent > send->s_page_list->max_page_list_len);
/*
* Perform a WR for the fast_reg_mr. Each individual page
* in the sg list is added to the fast reg page list and placed
* inside the fast_reg_mr WR.
*/
send->s_wr.opcode = IB_WR_FAST_REG_MR;
send->s_wr.wr.fast_reg.length = len;
send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
send->s_wr.wr.fast_reg.page_list = send->s_page_list;
send->s_wr.wr.fast_reg.page_list_len = nent;
send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
send->s_wr.wr.fast_reg.iova_start = sg_addr;
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
}
int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_send_work *send = NULL;
struct rds_iw_send_work *first;
struct rds_iw_send_work *prev;
struct ib_send_wr *failed_wr;
struct rds_iw_device *rds_iwdev;
struct scatterlist *scat;
unsigned long len;
u64 remote_addr = op->op_remote_addr;
u32 pos, fr_pos;
u32 work_alloc;
u32 i;
u32 j;
int sent;
int ret;
int num_sge;
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
/* map the message the first time we see it */
if (!op->op_mapped) {
op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
op->op_sg, op->op_nents, (op->op_write) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE);
rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
if (op->op_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
op->op_mapped = 1;
}
if (!op->op_write) {
/* Alloc space on the send queue for the fastreg */
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
if (work_alloc != 1) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_iw_stats_inc(s_iw_tx_ring_full);
ret = -ENOMEM;
goto out;
}
}
/*
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
i = ceil(op->op_count, rds_iwdev->max_sge);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_iw_stats_inc(s_iw_tx_ring_full);
ret = -ENOMEM;
goto out;
}
send = &ic->i_sends[pos];
if (!op->op_write) {
first = prev = &ic->i_sends[fr_pos];
} else {
first = send;
prev = NULL;
}
scat = &op->op_sg[0];
sent = 0;
num_sge = op->op_count;
for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
/*
* We want to delay signaling completions just enough to get
* the batching benefits but not so much that we create dead time on the wire.
*/
if (ic->i_unsignaled_wrs-- == 0) {
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
send->s_wr.send_flags = IB_SEND_SIGNALED;
}
/* To avoid the need to have the plumbing to invalidate the fastreg_mr used
* for local access after RDS is finished with it, using
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/
if (op->op_write)
send->s_wr.opcode = IB_WR_RDMA_WRITE;
else
send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
send->s_wr.wr.rdma.remote_addr = remote_addr;
send->s_wr.wr.rdma.rkey = op->op_rkey;
send->s_op = op;
if (num_sge > rds_iwdev->max_sge) {
send->s_wr.num_sge = rds_iwdev->max_sge;
num_sge -= rds_iwdev->max_sge;
} else
send->s_wr.num_sge = num_sge;
send->s_wr.next = NULL;
if (prev)
prev->s_wr.next = &send->s_wr;
for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
else {
send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
send->s_sge[j].length = len;
send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
}
sent += len;
rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
remote_addr += len;
scat++;
}
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
send->s_wr.num_sge = 1;
send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
}
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
prev = send;
if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
send = ic->i_sends;
}
/* if we finished the message then send completion owns it */
if (scat == &op->op_sg[op->op_count])
first->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
}
/* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
* recommended. Putting the lkey on the wire is a security hole, as it can
* allow for memory access to all of memory on the remote system. Some
* adapters do not allow using the lkey for this at all. To bypass this use a
* fastreg_mr (or possibly a dma_mr)
*/
if (!op->op_write) {
rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
work_alloc++;
}
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
first, &first->s_wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_wr);
if (ret) {
printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
goto out;
}
out:
return ret;
}
void rds_iw_xmit_complete(struct rds_connection *conn)
{
struct rds_iw_connection *ic = conn->c_transport_data;
/* We may have a pending ACK or window update we were unable
* to send previously (due to flow control). Try again. */
rds_iw_attempt_ack(ic);
}

95
net/rds/iw_stats.c Normal file
View file

@ -0,0 +1,95 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "rds.h"
#include "iw.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
static const char *const rds_iw_stat_names[] = {
"iw_connect_raced",
"iw_listen_closed_stale",
"iw_tx_cq_call",
"iw_tx_cq_event",
"iw_tx_ring_full",
"iw_tx_throttle",
"iw_tx_sg_mapping_failure",
"iw_tx_stalled",
"iw_tx_credit_updates",
"iw_rx_cq_call",
"iw_rx_cq_event",
"iw_rx_ring_empty",
"iw_rx_refill_from_cq",
"iw_rx_refill_from_thread",
"iw_rx_alloc_limit",
"iw_rx_credit_updates",
"iw_ack_sent",
"iw_ack_send_failure",
"iw_ack_send_delayed",
"iw_ack_send_piggybacked",
"iw_ack_received",
"iw_rdma_mr_alloc",
"iw_rdma_mr_free",
"iw_rdma_mr_used",
"iw_rdma_mr_pool_flush",
"iw_rdma_mr_pool_wait",
"iw_rdma_mr_pool_depleted",
};
unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail)
{
struct rds_iw_statistics stats = {0, };
uint64_t *src;
uint64_t *sum;
size_t i;
int cpu;
if (avail < ARRAY_SIZE(rds_iw_stat_names))
goto out;
for_each_online_cpu(cpu) {
src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
sum = (uint64_t *)&stats;
for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
*(sum++) += *(src++);
}
rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
ARRAY_SIZE(rds_iw_stat_names));
out:
return ARRAY_SIZE(rds_iw_stat_names);
}

123
net/rds/iw_sysctl.c Normal file
View file

@ -0,0 +1,123 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "iw.h"
static struct ctl_table_header *rds_iw_sysctl_hdr;
unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
static unsigned long rds_iw_sysctl_max_wr_min = 1;
/* hardware will fail CQ creation long before this */
static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
unsigned int rds_iw_sysctl_flow_control = 1;
static struct ctl_table rds_iw_sysctl_table[] = {
{
.procname = "max_send_wr",
.data = &rds_iw_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
.procname = "max_recv_wr",
.data = &rds_iw_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
.procname = "max_unsignaled_wr",
.data = &rds_iw_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_wr_min,
.extra2 = &rds_iw_sysctl_max_unsig_wr_max,
},
{
.procname = "max_unsignaled_bytes",
.data = &rds_iw_sysctl_max_unsig_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
.extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
},
{
.procname = "max_recv_allocation",
.data = &rds_iw_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
.procname = "flow_control",
.data = &rds_iw_sysctl_flow_control,
.maxlen = sizeof(rds_iw_sysctl_flow_control),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ }
};
void rds_iw_sysctl_exit(void)
{
unregister_net_sysctl_table(rds_iw_sysctl_hdr);
}
int rds_iw_sysctl_init(void)
{
rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
if (!rds_iw_sysctl_hdr)
return -ENOMEM;
return 0;
}

194
net/rds/loop.c Normal file
View file

@ -0,0 +1,194 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/in.h>
#include "rds.h"
#include "loop.h"
static DEFINE_SPINLOCK(loop_conns_lock);
static LIST_HEAD(loop_conns);
/*
* This 'loopback' transport is a special case for flows that originate
* and terminate on the same machine.
*
* Connection build-up notices if the destination address is thought of
* as a local address by a transport. At that time it decides to use the
* loopback transport instead of the bound transport of the sending socket.
*
* The loopback transport's sending path just hands the sent rds_message
* straight to the receiving path via an embedded rds_incoming.
*/
/*
* Usually a message transits both the sender and receiver's conns as it
* flows to the receiver. In the loopback case, though, the receive path
* is handed the sending conn so the sense of the addresses is reversed.
*/
static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg,
unsigned int off)
{
struct scatterlist *sgp = &rm->data.op_sg[sg];
int ret = sizeof(struct rds_header) +
be32_to_cpu(rm->m_inc.i_hdr.h_len);
/* Do not send cong updates to loopback */
if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
goto out;
}
BUG_ON(hdr_off || sg || off);
rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
/* For the embedded inc. Matching put is in loop_inc_free() */
rds_message_addref(rm);
rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
GFP_KERNEL);
rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
NULL);
rds_inc_put(&rm->m_inc);
out:
return ret;
}
/*
* See rds_loop_xmit(). Since our inc is embedded in the rm, we
* make sure the rm lives at least until the inc is done.
*/
static void rds_loop_inc_free(struct rds_incoming *inc)
{
struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
rds_message_put(rm);
}
/* we need to at least give the thread something to succeed */
static int rds_loop_recv(struct rds_connection *conn)
{
return 0;
}
struct rds_loop_connection {
struct list_head loop_node;
struct rds_connection *conn;
};
/*
* Even the loopback transport needs to keep track of its connections,
* so it can call rds_conn_destroy() on them on exit. N.B. there are
* 1+ loopback addresses (127.*.*.*) so it's not a bug to have
* multiple loopback conns allocated, although rather useless.
*/
static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_loop_connection *lc;
unsigned long flags;
lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
if (!lc)
return -ENOMEM;
INIT_LIST_HEAD(&lc->loop_node);
lc->conn = conn;
conn->c_transport_data = lc;
spin_lock_irqsave(&loop_conns_lock, flags);
list_add_tail(&lc->loop_node, &loop_conns);
spin_unlock_irqrestore(&loop_conns_lock, flags);
return 0;
}
static void rds_loop_conn_free(void *arg)
{
struct rds_loop_connection *lc = arg;
unsigned long flags;
rdsdebug("lc %p\n", lc);
spin_lock_irqsave(&loop_conns_lock, flags);
list_del(&lc->loop_node);
spin_unlock_irqrestore(&loop_conns_lock, flags);
kfree(lc);
}
static int rds_loop_conn_connect(struct rds_connection *conn)
{
rds_connect_complete(conn);
return 0;
}
static void rds_loop_conn_shutdown(struct rds_connection *conn)
{
}
void rds_loop_exit(void)
{
struct rds_loop_connection *lc, *_lc;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(&loop_conns_lock);
list_splice(&loop_conns, &tmp_list);
INIT_LIST_HEAD(&loop_conns);
spin_unlock_irq(&loop_conns_lock);
list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
WARN_ON(lc->conn->c_passive);
rds_conn_destroy(lc->conn);
}
}
/*
* This is missing .xmit_* because loop doesn't go through generic
* rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
* .laddr_check are missing because transport.c doesn't iterate over
* rds_loop_transport.
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
.recv = rds_loop_recv,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
.conn_connect = rds_loop_conn_connect,
.conn_shutdown = rds_loop_conn_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
.inc_free = rds_loop_inc_free,
.t_name = "loopback",
};

9
net/rds/loop.h Normal file
View file

@ -0,0 +1,9 @@
#ifndef _RDS_LOOP_H
#define _RDS_LOOP_H
/* loop.c */
extern struct rds_transport rds_loop_transport;
void rds_loop_exit(void);
#endif

402
net/rds/message.c Normal file
View file

@ -0,0 +1,402 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/export.h>
#include "rds.h"
static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE] = 0,
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
};
void rds_message_addref(struct rds_message *rm)
{
rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
atomic_inc(&rm->m_refcount);
}
EXPORT_SYMBOL_GPL(rds_message_addref);
/*
* This relies on dma_map_sg() not touching sg[].page during merging.
*/
static void rds_message_purge(struct rds_message *rm)
{
unsigned long i;
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
for (i = 0; i < rm->data.op_nents; i++) {
rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
/* XXX will have to put_page for page refs */
__free_page(sg_page(&rm->data.op_sg[i]));
}
rm->data.op_nents = 0;
if (rm->rdma.op_active)
rds_rdma_free_op(&rm->rdma);
if (rm->rdma.op_rdma_mr)
rds_mr_put(rm->rdma.op_rdma_mr);
if (rm->atomic.op_active)
rds_atomic_free_op(&rm->atomic);
if (rm->atomic.op_rdma_mr)
rds_mr_put(rm->atomic.op_rdma_mr);
}
void rds_message_put(struct rds_message *rm)
{
rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
if (atomic_dec_and_test(&rm->m_refcount)) {
BUG_ON(!list_empty(&rm->m_sock_item));
BUG_ON(!list_empty(&rm->m_conn_item));
rds_message_purge(rm);
kfree(rm);
}
}
EXPORT_SYMBOL_GPL(rds_message_put);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq)
{
hdr->h_flags = 0;
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
const void *data, unsigned int len)
{
unsigned int ext_len = sizeof(u8) + len;
unsigned char *dst;
/* For now, refuse to add more than one extension header */
if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
return 0;
if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
return 0;
if (ext_len >= RDS_HEADER_EXT_SPACE)
return 0;
dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
/*
* If a message has extension headers, retrieve them here.
* Call like this:
*
* unsigned int pos = 0;
*
* while (1) {
* buflen = sizeof(buffer);
* type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
* if (type == RDS_EXTHDR_NONE)
* break;
* ...
* }
*/
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen)
{
unsigned int offset, ext_type, ext_len;
u8 *src = hdr->h_exthdr;
offset = *pos;
if (offset >= RDS_HEADER_EXT_SPACE)
goto none;
/* Get the extension type and length. For now, the
* length is implied by the extension type. */
ext_type = src[offset++];
if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
goto none;
ext_len = rds_exthdr_size[ext_type];
if (offset + ext_len > RDS_HEADER_EXT_SPACE)
goto none;
*pos = offset + ext_len;
if (ext_len < *buflen)
*buflen = ext_len;
memcpy(buf, src + offset, *buflen);
return ext_type;
none:
*pos = RDS_HEADER_EXT_SPACE;
*buflen = 0;
return RDS_EXTHDR_NONE;
}
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
{
struct rds_ext_header_rdma_dest ext_hdr;
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
/*
* Each rds_message is allocated with extra space for the scatterlist entries
* rds ops will need. This is to minimize memory allocation count. Then, each rds op
* can grab SGs when initializing its part of the rds_message.
*/
struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
{
struct rds_message *rm;
if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
return NULL;
rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
if (!rm)
goto out;
rm->m_used_sgs = 0;
rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
atomic_set(&rm->m_refcount, 1);
INIT_LIST_HEAD(&rm->m_sock_item);
INIT_LIST_HEAD(&rm->m_conn_item);
spin_lock_init(&rm->m_rs_lock);
init_waitqueue_head(&rm->m_flush_wait);
out:
return rm;
}
/*
* RDS ops use this to grab SG entries from the rm's sg pool.
*/
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
{
struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
struct scatterlist *sg_ret;
WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
WARN_ON(!nents);
if (rm->m_used_sgs + nents > rm->m_total_sgs)
return NULL;
sg_ret = &sg_first[rm->m_used_sgs];
sg_init_table(sg_ret, nents);
rm->m_used_sgs += nents;
return sg_ret;
}
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{
struct rds_message *rm;
unsigned int i;
int num_sgs = ceil(total_len, PAGE_SIZE);
int extra_bytes = num_sgs * sizeof(struct scatterlist);
rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
if (!rm)
return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
rm->data.op_nents = ceil(total_len, PAGE_SIZE);
rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
if (!rm->data.op_sg) {
rds_message_put(rm);
return ERR_PTR(-ENOMEM);
}
for (i = 0; i < rm->data.op_nents; ++i) {
sg_set_page(&rm->data.op_sg[i],
virt_to_page(page_addrs[i]),
PAGE_SIZE, 0);
}
return rm;
}
int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len)
{
unsigned long to_copy;
unsigned long iov_off;
unsigned long sg_off;
struct iovec *iov;
struct scatterlist *sg;
int ret = 0;
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
/*
* now allocate and copy in the data payload.
*/
sg = rm->data.op_sg;
iov = first_iov;
iov_off = 0;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
while (total_len) {
if (!sg_page(sg)) {
ret = rds_page_remainder_alloc(sg, total_len,
GFP_HIGHUSER);
if (ret)
goto out;
rm->data.op_nents++;
sg_off = 0;
}
while (iov_off == iov->iov_len) {
iov_off = 0;
iov++;
}
to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
to_copy = min_t(size_t, to_copy, total_len);
rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
"sg [%p, %u, %u] + %lu\n",
to_copy, iov->iov_base, iov->iov_len, iov_off,
(void *)sg_page(sg), sg->offset, sg->length, sg_off);
ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
iov->iov_base + iov_off,
to_copy);
if (ret)
goto out;
iov_off += to_copy;
total_len -= to_copy;
sg_off += to_copy;
if (sg_off == sg->length)
sg++;
}
out:
return ret;
}
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
struct iovec *first_iov, size_t size)
{
struct rds_message *rm;
struct iovec *iov;
struct scatterlist *sg;
unsigned long to_copy;
unsigned long iov_off;
unsigned long vec_off;
int copied;
int ret;
u32 len;
rm = container_of(inc, struct rds_message, m_inc);
len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
iov = first_iov;
iov_off = 0;
sg = rm->data.op_sg;
vec_off = 0;
copied = 0;
while (copied < size && copied < len) {
while (iov_off == iov->iov_len) {
iov_off = 0;
iov++;
}
to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
"sg [%p, %u, %u] + %lu\n",
to_copy, iov->iov_base, iov->iov_len, iov_off,
sg_page(sg), sg->offset, sg->length, vec_off);
ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
iov->iov_base + iov_off,
to_copy);
if (ret) {
copied = ret;
break;
}
iov_off += to_copy;
vec_off += to_copy;
copied += to_copy;
if (vec_off == sg->length) {
vec_off = 0;
sg++;
}
}
return copied;
}
/*
* If the message is still on the send queue, wait until the transport
* is done with it. This is particularly important for RDMA operations.
*/
void rds_message_wait(struct rds_message *rm)
{
wait_event_interruptible(rm->m_flush_wait,
!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}
void rds_message_unmapped(struct rds_message *rm)
{
clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);

215
net/rds/page.c Normal file
View file

@ -0,0 +1,215 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/cpu.h>
#include <linux/export.h>
#include "rds.h"
struct rds_page_remainder {
struct page *r_page;
unsigned long r_offset;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
*
* We don't have to worry about flush_dcache_page() as this only works
* with private pages. If, say, we were to do directed receive to pinned
* user pages we'd have to worry more about cache coherence. (Though
* the flush_dcache_page() in get_user_pages() would probably be enough).
*/
int rds_page_copy_user(struct page *page, unsigned long offset,
void __user *ptr, unsigned long bytes,
int to_user)
{
unsigned long ret;
void *addr;
addr = kmap(page);
if (to_user) {
rds_stats_add(s_copy_to_user, bytes);
ret = copy_to_user(ptr, addr + offset, bytes);
} else {
rds_stats_add(s_copy_from_user, bytes);
ret = copy_from_user(addr + offset, ptr, bytes);
}
kunmap(page);
return ret ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(rds_page_copy_user);
/**
* rds_page_remainder_alloc - build up regions of a message.
*
* @scat: Scatter list for message
* @bytes: the number of bytes needed.
* @gfp: the waiting behaviour of the allocation
*
* @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
* kmap the pages, etc.
*
* If @bytes is at least a full page then this just returns a page from
* alloc_page().
*
* If @bytes is a partial page then this stores the unused region of the
* page in a per-cpu structure. Future partial-page allocations may be
* satisfied from that cached region. This lets us waste less memory on
* small allocations with minimal complexity. It works because the transmit
* path passes read-only page regions down to devices. They hold a page
* reference until they are done with the region.
*/
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
gfp_t gfp)
{
struct rds_page_remainder *rem;
unsigned long flags;
struct page *page;
int ret;
gfp |= __GFP_HIGHMEM;
/* jump straight to allocation if we're trying for a huge page */
if (bytes >= PAGE_SIZE) {
page = alloc_page(gfp);
if (!page) {
ret = -ENOMEM;
} else {
sg_set_page(scat, page, PAGE_SIZE, 0);
ret = 0;
}
goto out;
}
rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags);
while (1) {
/* avoid a tiny region getting stuck by tossing it */
if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
rds_stats_inc(s_page_remainder_miss);
__free_page(rem->r_page);
rem->r_page = NULL;
}
/* hand out a fragment from the cached page */
if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
get_page(sg_page(scat));
if (rem->r_offset != 0)
rds_stats_inc(s_page_remainder_hit);
rem->r_offset += bytes;
if (rem->r_offset == PAGE_SIZE) {
__free_page(rem->r_page);
rem->r_page = NULL;
}
ret = 0;
break;
}
/* alloc if there is nothing for us to use */
local_irq_restore(flags);
put_cpu();
page = alloc_page(gfp);
rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags);
if (!page) {
ret = -ENOMEM;
break;
}
/* did someone race to fill the remainder before us? */
if (rem->r_page) {
__free_page(page);
continue;
}
/* otherwise install our page and loop around to alloc */
rem->r_page = page;
rem->r_offset = 0;
}
local_irq_restore(flags);
put_cpu();
out:
rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
ret ? 0 : scat->length);
return ret;
}
EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
static int rds_page_remainder_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
struct rds_page_remainder *rem;
long cpu = (long)hcpu;
rem = &per_cpu(rds_page_remainders, cpu);
rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
switch (action) {
case CPU_DEAD:
if (rem->r_page)
__free_page(rem->r_page);
rem->r_page = NULL;
break;
}
return 0;
}
static struct notifier_block rds_page_remainder_nb = {
.notifier_call = rds_page_remainder_cpu_notify,
};
void rds_page_exit(void)
{
int i;
for_each_possible_cpu(i)
rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
(unsigned long)CPU_DEAD,
(void *)(long)i);
}

859
net/rds/rdma.c Normal file
View file

@ -0,0 +1,859 @@
/*
* Copyright (c) 2007 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
#include "rds.h"
/*
* XXX
* - build with sparse
* - should we limit the size of a mr region? let transport return failure?
* - should we detect duplicate keys on a socket? hmm.
* - an rdma is an mlock, apply rlimit?
*/
/*
* get the number of pages by looking at the page indices that the start and
* end addresses fall in.
*
* Returns 0 if the vec is invalid. It is invalid if the number of bytes
* causes the address to wrap or overflows an unsigned int. This comes
* from being stored in the 'length' member of 'struct scatterlist'.
*/
static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
{
if ((vec->addr + vec->bytes <= vec->addr) ||
(vec->bytes > (u64)UINT_MAX))
return 0;
return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
(vec->addr >> PAGE_SHIFT);
}
static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
struct rds_mr *insert)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct rds_mr *mr;
while (*p) {
parent = *p;
mr = rb_entry(parent, struct rds_mr, r_rb_node);
if (key < mr->r_key)
p = &(*p)->rb_left;
else if (key > mr->r_key)
p = &(*p)->rb_right;
else
return mr;
}
if (insert) {
rb_link_node(&insert->r_rb_node, parent, p);
rb_insert_color(&insert->r_rb_node, root);
atomic_inc(&insert->r_refcount);
}
return NULL;
}
/*
* Destroy the transport-specific part of a MR.
*/
static void rds_destroy_mr(struct rds_mr *mr)
{
struct rds_sock *rs = mr->r_sock;
void *trans_private = NULL;
unsigned long flags;
rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
mr->r_key, atomic_read(&mr->r_refcount));
if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
return;
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
if (!RB_EMPTY_NODE(&mr->r_rb_node))
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
trans_private = mr->r_trans_private;
mr->r_trans_private = NULL;
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (trans_private)
mr->r_trans->free_mr(trans_private, mr->r_invalidate);
}
void __rds_put_mr_final(struct rds_mr *mr)
{
rds_destroy_mr(mr);
kfree(mr);
}
/*
* By the time this is called we can't have any more ioctls called on
* the socket so we don't need to worry about racing with others.
*/
void rds_rdma_drop_keys(struct rds_sock *rs)
{
struct rds_mr *mr;
struct rb_node *node;
unsigned long flags;
/* Release any MRs associated with this socket */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = container_of(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
rds_destroy_mr(mr);
rds_mr_put(mr);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
}
/*
* Helper function to pin user pages.
*/
static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
struct page **pages, int write)
{
int ret;
ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
if (ret >= 0 && ret < nr_pages) {
while (ret--)
put_page(pages[ret]);
ret = -EFAULT;
}
return ret;
}
static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
u64 *cookie_ret, struct rds_mr **mr_ret)
{
struct rds_mr *mr = NULL, *found;
unsigned int nr_pages;
struct page **pages = NULL;
struct scatterlist *sg;
void *trans_private;
unsigned long flags;
rds_rdma_cookie_t cookie;
unsigned int nents;
long i;
int ret;
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
if (!rs->rs_transport->get_mr) {
ret = -EOPNOTSUPP;
goto out;
}
nr_pages = rds_pages_in_vec(&args->vec);
if (nr_pages == 0) {
ret = -EINVAL;
goto out;
}
rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
args->vec.addr, args->vec.bytes, nr_pages);
/* XXX clamp nr_pages to limit the size of this alloc? */
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
if (!mr) {
ret = -ENOMEM;
goto out;
}
atomic_set(&mr->r_refcount, 1);
RB_CLEAR_NODE(&mr->r_rb_node);
mr->r_trans = rs->rs_transport;
mr->r_sock = rs;
if (args->flags & RDS_RDMA_USE_ONCE)
mr->r_use_once = 1;
if (args->flags & RDS_RDMA_INVALIDATE)
mr->r_invalidate = 1;
if (args->flags & RDS_RDMA_READWRITE)
mr->r_write = 1;
/*
* Pin the pages that make up the user buffer and transfer the page
* pointers to the mr's sg array. We check to see if we've mapped
* the whole region after transferring the partial page references
* to the sg array so that we can have one page ref cleanup path.
*
* For now we have no flag that tells us whether the mapping is
* r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
* the zero page.
*/
ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
if (ret < 0)
goto out;
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
if (!sg) {
ret = -ENOMEM;
goto out;
}
WARN_ON(!nents);
sg_init_table(sg, nents);
/* Stick all pages into the scatterlist */
for (i = 0 ; i < nents; i++)
sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
rdsdebug("RDS: trans_private nents is %u\n", nents);
/* Obtain a transport specific MR. If this succeeds, the
* s/g list is now owned by the MR.
* Note that dma_map() implies that pending writes are
* flushed to RAM, so no dma_sync is needed here. */
trans_private = rs->rs_transport->get_mr(sg, nents, rs,
&mr->r_key);
if (IS_ERR(trans_private)) {
for (i = 0 ; i < nents; i++)
put_page(sg_page(&sg[i]));
kfree(sg);
ret = PTR_ERR(trans_private);
goto out;
}
mr->r_trans_private = trans_private;
rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
mr->r_key, (void *)(unsigned long) args->cookie_addr);
/* The user may pass us an unaligned address, but we can only
* map page aligned regions. So we keep the offset, and build
* a 64bit cookie containing <R_Key, offset> and pass that
* around. */
cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
if (cookie_ret)
*cookie_ret = cookie;
if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
ret = -EFAULT;
goto out;
}
/* Inserting the new MR into the rbtree bumps its
* reference count. */
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
BUG_ON(found && found != mr);
rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
if (mr_ret) {
atomic_inc(&mr->r_refcount);
*mr_ret = mr;
}
ret = 0;
out:
kfree(pages);
if (mr)
rds_mr_put(mr);
return ret;
}
int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
{
struct rds_get_mr_args args;
if (optlen != sizeof(struct rds_get_mr_args))
return -EINVAL;
if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
sizeof(struct rds_get_mr_args)))
return -EFAULT;
return __rds_rdma_map(rs, &args, NULL, NULL);
}
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
{
struct rds_get_mr_for_dest_args args;
struct rds_get_mr_args new_args;
if (optlen != sizeof(struct rds_get_mr_for_dest_args))
return -EINVAL;
if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
sizeof(struct rds_get_mr_for_dest_args)))
return -EFAULT;
/*
* Initially, just behave like get_mr().
* TODO: Implement get_mr as wrapper around this
* and deprecate it.
*/
new_args.vec = args.vec;
new_args.cookie_addr = args.cookie_addr;
new_args.flags = args.flags;
return __rds_rdma_map(rs, &new_args, NULL, NULL);
}
/*
* Free the MR indicated by the given R_Key
*/
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
{
struct rds_free_mr_args args;
struct rds_mr *mr;
unsigned long flags;
if (optlen != sizeof(struct rds_free_mr_args))
return -EINVAL;
if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
sizeof(struct rds_free_mr_args)))
return -EFAULT;
/* Special case - a null cookie means flush all unused MRs */
if (args.cookie == 0) {
if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
return -EINVAL;
rs->rs_transport->flush_mrs();
return 0;
}
/* Look up the MR given its R_key and remove it from the rbtree
* so nobody else finds it.
* This should also prevent races with rds_rdma_unuse.
*/
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
if (mr) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
if (args.flags & RDS_RDMA_INVALIDATE)
mr->r_invalidate = 1;
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (!mr)
return -EINVAL;
/*
* call rds_destroy_mr() ourselves so that we're sure it's done by the time
* we return. If we let rds_mr_put() do it it might not happen until
* someone else drops their ref.
*/
rds_destroy_mr(mr);
rds_mr_put(mr);
return 0;
}
/*
* This is called when we receive an extension header that
* tells us this MR was used. It allows us to implement
* use_once semantics
*/
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
{
struct rds_mr *mr;
unsigned long flags;
int zot_me = 0;
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
if (!mr) {
printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
return;
}
if (mr->r_use_once || force) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
zot_me = 1;
}
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
/* May have to issue a dma_sync on this memory region.
* Note we could avoid this if the operation was a RDMA READ,
* but at this point we can't tell. */
if (mr->r_trans->sync_mr)
mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
/* If the MR was marked as invalidate, this will
* trigger an async flush. */
if (zot_me)
rds_destroy_mr(mr);
rds_mr_put(mr);
}
void rds_rdma_free_op(struct rm_rdma_op *ro)
{
unsigned int i;
for (i = 0; i < ro->op_nents; i++) {
struct page *page = sg_page(&ro->op_sg[i]);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
if (!ro->op_write) {
BUG_ON(irqs_disabled());
set_page_dirty(page);
}
put_page(page);
}
kfree(ro->op_notifier);
ro->op_notifier = NULL;
ro->op_active = 0;
}
void rds_atomic_free_op(struct rm_atomic_op *ao)
{
struct page *page = sg_page(ao->op_sg);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
set_page_dirty(page);
put_page(page);
kfree(ao->op_notifier);
ao->op_notifier = NULL;
ao->op_active = 0;
}
/*
* Count the number of pages needed to describe an incoming iovec array.
*/
static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
{
int tot_pages = 0;
unsigned int nr_pages;
unsigned int i;
/* figure out the number of pages in the vector */
for (i = 0; i < nr_iovecs; i++) {
nr_pages = rds_pages_in_vec(&iov[i]);
if (nr_pages == 0)
return -EINVAL;
tot_pages += nr_pages;
/*
* nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
* so tot_pages cannot overflow without first going negative.
*/
if (tot_pages < 0)
return -EINVAL;
}
return tot_pages;
}
int rds_rdma_extra_size(struct rds_rdma_args *args)
{
struct rds_iovec vec;
struct rds_iovec __user *local_vec;
int tot_pages = 0;
unsigned int nr_pages;
unsigned int i;
local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
/* figure out the number of pages in the vector */
for (i = 0; i < args->nr_local; i++) {
if (copy_from_user(&vec, &local_vec[i],
sizeof(struct rds_iovec)))
return -EFAULT;
nr_pages = rds_pages_in_vec(&vec);
if (nr_pages == 0)
return -EINVAL;
tot_pages += nr_pages;
/*
* nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
* so tot_pages cannot overflow without first going negative.
*/
if (tot_pages < 0)
return -EINVAL;
}
return tot_pages * sizeof(struct scatterlist);
}
/*
* The application asks for a RDMA transfer.
* Extract all arguments and set up the rdma_op
*/
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
struct rds_rdma_args *args;
struct rm_rdma_op *op = &rm->rdma;
int nr_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
int iov_size;
unsigned int i, j;
int ret = 0;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
|| rm->rdma.op_active)
return -EINVAL;
args = CMSG_DATA(cmsg);
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out_ret;
}
if (args->nr_local > UIO_MAXIOV) {
ret = -EMSGSIZE;
goto out_ret;
}
/* Check whether to allocate the iovec area */
iov_size = args->nr_local * sizeof(struct rds_iovec);
if (args->nr_local > UIO_FASTIOV) {
iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
if (!iovs) {
ret = -ENOMEM;
goto out_ret;
}
}
if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
ret = -EFAULT;
goto out;
}
nr_pages = rds_rdma_pages(iovs, args->nr_local);
if (nr_pages < 0) {
ret = -EINVAL;
goto out;
}
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out;
}
op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
op->op_active = 1;
op->op_recverr = rs->rs_recverr;
WARN_ON(!nr_pages);
op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
if (!op->op_sg) {
ret = -ENOMEM;
goto out;
}
if (op->op_notify || op->op_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
if (!op->op_notifier) {
ret = -ENOMEM;
goto out;
}
op->op_notifier->n_user_token = args->user_token;
op->op_notifier->n_status = RDS_RDMA_SUCCESS;
}
/* The cookie contains the R_Key of the remote memory region, and
* optionally an offset into it. This is how we implement RDMA into
* unaligned memory.
* When setting up the RDMA, we need to add that offset to the
* destination address (which is really an offset into the MR)
* FIXME: We may want to move this into ib_rdma.c
*/
op->op_rkey = rds_rdma_cookie_key(args->cookie);
op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
nr_bytes = 0;
rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
(unsigned long long)args->nr_local,
(unsigned long long)args->remote_vec.addr,
op->op_rkey);
for (i = 0; i < args->nr_local; i++) {
struct rds_iovec *iov = &iovs[i];
/* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
unsigned int nr = rds_pages_in_vec(iov);
rs->rs_user_addr = iov->addr;
rs->rs_user_bytes = iov->bytes;
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
if (ret < 0)
goto out;
rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
nr_bytes, nr, iov->bytes, iov->addr);
nr_bytes += iov->bytes;
for (j = 0; j < nr; j++) {
unsigned int offset = iov->addr & ~PAGE_MASK;
struct scatterlist *sg;
sg = &op->op_sg[op->op_nents + j];
sg_set_page(sg, pages[j],
min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
offset);
rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
sg->offset, sg->length, iov->addr, iov->bytes);
iov->addr += sg->length;
iov->bytes -= sg->length;
}
op->op_nents += nr;
}
if (nr_bytes > args->remote_vec.bytes) {
rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
nr_bytes,
(unsigned int) args->remote_vec.bytes);
ret = -EINVAL;
goto out;
}
op->op_bytes = nr_bytes;
out:
if (iovs != iovstack)
sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
kfree(pages);
out_ret:
if (ret)
rds_rdma_free_op(op);
else
rds_stats_inc(s_send_rdma);
return ret;
}
/*
* The application wants us to pass an RDMA destination (aka MR)
* to the remote
*/
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
unsigned long flags;
struct rds_mr *mr;
u32 r_key;
int err = 0;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
rm->m_rdma_cookie != 0)
return -EINVAL;
memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
/* We are reusing a previously mapped MR here. Most likely, the
* application has written to the buffer, so we need to explicitly
* flush those writes to RAM. Otherwise the HCA may not see them
* when doing a DMA from that buffer.
*/
r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
if (!mr)
err = -EINVAL; /* invalid r_key */
else
atomic_inc(&mr->r_refcount);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
rm->rdma.op_rdma_mr = mr;
}
return err;
}
/*
* The application passes us an address range it wants to enable RDMA
* to/from. We map the area, and save the <R_Key,offset> pair
* in rm->m_rdma_cookie. This causes it to be sent along to the peer
* in an extension header.
*/
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
rm->m_rdma_cookie != 0)
return -EINVAL;
return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
}
/*
* Fill in rds_message for an atomic request.
*/
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
struct page *page = NULL;
struct rds_atomic_args *args;
int ret = 0;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
|| rm->atomic.op_active)
return -EINVAL;
args = CMSG_DATA(cmsg);
/* Nonmasked & masked cmsg ops converted to masked hw ops */
switch (cmsg->cmsg_type) {
case RDS_CMSG_ATOMIC_FADD:
rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
rm->atomic.op_m_fadd.add = args->fadd.add;
rm->atomic.op_m_fadd.nocarry_mask = 0;
break;
case RDS_CMSG_MASKED_ATOMIC_FADD:
rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
rm->atomic.op_m_fadd.add = args->m_fadd.add;
rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
break;
case RDS_CMSG_ATOMIC_CSWP:
rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
rm->atomic.op_m_cswp.compare = args->cswp.compare;
rm->atomic.op_m_cswp.swap = args->cswp.swap;
rm->atomic.op_m_cswp.compare_mask = ~0;
rm->atomic.op_m_cswp.swap_mask = ~0;
break;
case RDS_CMSG_MASKED_ATOMIC_CSWP:
rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
break;
default:
BUG(); /* should never happen */
}
rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
rm->atomic.op_active = 1;
rm->atomic.op_recverr = rs->rs_recverr;
rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
if (!rm->atomic.op_sg) {
ret = -ENOMEM;
goto err;
}
/* verify 8 byte-aligned */
if (args->local_addr & 0x7) {
ret = -EFAULT;
goto err;
}
ret = rds_pin_pages(args->local_addr, 1, &page, 1);
if (ret != 1)
goto err;
ret = 0;
sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
if (rm->atomic.op_notify || rm->atomic.op_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
if (!rm->atomic.op_notifier) {
ret = -ENOMEM;
goto err;
}
rm->atomic.op_notifier->n_user_token = args->user_token;
rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
}
rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
return ret;
err:
if (page)
put_page(page);
kfree(rm->atomic.op_notifier);
return ret;
}

251
net/rds/rdma_transport.c Normal file
View file

@ -0,0 +1,251 @@
/*
* Copyright (c) 2009 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/module.h>
#include <rdma/rdma_cm.h>
#include "rdma_transport.h"
static struct rdma_cm_id *rds_rdma_listen_id;
static char *rds_cm_event_strings[] = {
#define RDS_CM_EVENT_STRING(foo) \
[RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
RDS_CM_EVENT_STRING(ADDR_RESOLVED),
RDS_CM_EVENT_STRING(ADDR_ERROR),
RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
RDS_CM_EVENT_STRING(ROUTE_ERROR),
RDS_CM_EVENT_STRING(CONNECT_REQUEST),
RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
RDS_CM_EVENT_STRING(CONNECT_ERROR),
RDS_CM_EVENT_STRING(UNREACHABLE),
RDS_CM_EVENT_STRING(REJECTED),
RDS_CM_EVENT_STRING(ESTABLISHED),
RDS_CM_EVENT_STRING(DISCONNECTED),
RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
RDS_CM_EVENT_STRING(MULTICAST_JOIN),
RDS_CM_EVENT_STRING(MULTICAST_ERROR),
RDS_CM_EVENT_STRING(ADDR_CHANGE),
RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
#undef RDS_CM_EVENT_STRING
};
static char *rds_cm_event_str(enum rdma_cm_event_type type)
{
return rds_str_array(rds_cm_event_strings,
ARRAY_SIZE(rds_cm_event_strings), type);
};
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
/* this can be null in the listening path */
struct rds_connection *conn = cm_id->context;
struct rds_transport *trans;
int ret = 0;
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rds_cm_event_str(event->event));
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
else
trans = &rds_ib_transport;
/* Prevent shutdown from tearing down the connection
* while we're executing. */
if (conn) {
mutex_lock(&conn->c_cm_lock);
/* If the connection is being shut down, bail out
* right away. We return 0 so cm_id doesn't get
* destroyed prematurely */
if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
/* Reject incoming connections while we're tearing
* down an existing one. */
if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
ret = 1;
goto out;
}
}
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
ret = trans->cm_handle_connect(cm_id, event);
break;
case RDMA_CM_EVENT_ADDR_RESOLVED:
/* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id,
RDS_RDMA_RESOLVE_TIMEOUT_MS);
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
/* XXX worry about racing with listen acceptance */
ret = trans->cm_initiate_connect(cm_id);
break;
case RDMA_CM_EVENT_ESTABLISHED:
trans->cm_connect_complete(conn, event);
break;
case RDMA_CM_EVENT_ADDR_ERROR:
case RDMA_CM_EVENT_ROUTE_ERROR:
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_ADDR_CHANGE:
if (conn)
rds_conn_drop(conn);
break;
case RDMA_CM_EVENT_DISCONNECTED:
rdsdebug("DISCONNECT event - dropping connection "
"%pI4->%pI4\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
break;
default:
/* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
event->event, rds_cm_event_str(event->event));
break;
}
out:
if (conn)
mutex_unlock(&conn->c_cm_lock);
rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
rds_cm_event_str(event->event), ret);
return ret;
}
static int rds_rdma_listen_init(void)
{
struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
int ret;
cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_create_id() returned %d\n", ret);
return ret;
}
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
sin.sin_port = (__force u16)htons(RDS_PORT);
/*
* XXX I bet this binds the cm_id to a device. If we want to support
* fail-over we'll have to take this into consideration.
*/
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
if (ret) {
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
goto out;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_listen() returned %d\n", ret);
goto out;
}
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
rds_rdma_listen_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
rdma_destroy_id(cm_id);
return ret;
}
static void rds_rdma_listen_stop(void)
{
if (rds_rdma_listen_id) {
rdsdebug("cm %p\n", rds_rdma_listen_id);
rdma_destroy_id(rds_rdma_listen_id);
rds_rdma_listen_id = NULL;
}
}
static int rds_rdma_init(void)
{
int ret;
ret = rds_rdma_listen_init();
if (ret)
goto out;
ret = rds_iw_init();
if (ret)
goto err_iw_init;
ret = rds_ib_init();
if (ret)
goto err_ib_init;
goto out;
err_ib_init:
rds_iw_exit();
err_iw_init:
rds_rdma_listen_stop();
out:
return ret;
}
module_init(rds_rdma_init);
static void rds_rdma_exit(void)
{
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
rds_iw_exit();
}
module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: IB/iWARP transport");
MODULE_LICENSE("Dual BSD/GPL");

24
net/rds/rdma_transport.h Normal file
View file

@ -0,0 +1,24 @@
#ifndef _RDMA_TRANSPORT_H
#define _RDMA_TRANSPORT_H
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include "rds.h"
#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
/* from ib.c */
extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
void rds_ib_exit(void);
/* from iw.c */
extern struct rds_transport rds_iw_transport;
int rds_iw_init(void);
void rds_iw_exit(void);
#endif

812
net/rds/rds.h Normal file
View file

@ -0,0 +1,812 @@
#ifndef _RDS_RDS_H
#define _RDS_RDS_H
#include <net/sock.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <rdma/rdma_cm.h>
#include <linux/mutex.h>
#include <linux/rds.h>
#include "info.h"
/*
* RDS Network protocol version
*/
#define RDS_PROTOCOL_3_0 0x0300
#define RDS_PROTOCOL_3_1 0x0301
#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
/*
* XXX randomly chosen, but at least seems to be unused:
* # 18464-18768 Unassigned
* We should do better. We want a reserved port to discourage unpriv'ed
* userspace from listening.
*/
#define RDS_PORT 18634
#ifdef ATOMIC64_INIT
#define KERNEL_HAS_ATOMIC64
#endif
#ifdef DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
static inline __printf(1, 2)
void rdsdebug(char *fmt, ...)
{
}
#endif
/* XXX is there one of these somewhere? */
#define ceil(x, y) \
({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
#define RDS_FRAG_SHIFT 12
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
#define RDS_CONG_MAP_BYTES (65536 / 8)
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
struct rds_cong_map {
struct rb_node m_rb_node;
__be32 m_addr;
wait_queue_head_t m_waitq;
struct list_head m_conn_list;
unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
};
/*
* This is how we will track the connection state:
* A connection is always in one of the following
* states. Updates to the state are atomic and imply
* a memory barrier.
*/
enum {
RDS_CONN_DOWN = 0,
RDS_CONN_CONNECTING,
RDS_CONN_DISCONNECTING,
RDS_CONN_UP,
RDS_CONN_ERROR,
};
/* Bits for c_flags */
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
#define RDS_IN_XMIT 2
struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
__be32 c_faddr;
unsigned int c_loopback:1;
struct rds_connection *c_passive;
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
struct rds_message *c_xmit_rm;
unsigned long c_xmit_sg;
unsigned int c_xmit_hdr_off;
unsigned int c_xmit_data_off;
unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent;
unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq;
struct list_head c_send_queue;
struct list_head c_retrans;
u64 c_next_rx_seq;
struct rds_transport *c_trans;
void *c_transport_data;
atomic_t c_state;
unsigned long c_flags;
unsigned long c_reconnect_jiffies;
struct delayed_work c_send_w;
struct delayed_work c_recv_w;
struct delayed_work c_conn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
wait_queue_head_t c_waitq;
struct list_head c_map_item;
unsigned long c_map_queued;
unsigned int c_unacked_packets;
unsigned int c_unacked_bytes;
/* Protocol version */
unsigned int c_version;
};
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
#define RDS_MAX_ADV_CREDIT 255
/*
* Maximum space available for extension headers.
*/
#define RDS_HEADER_EXT_SPACE 16
struct rds_header {
__be64 h_sequence;
__be64 h_ack;
__be32 h_len;
__be16 h_sport;
__be16 h_dport;
u8 h_flags;
u8 h_credit;
u8 h_padding[4];
__sum16 h_csum;
u8 h_exthdr[RDS_HEADER_EXT_SPACE];
};
/*
* Reserved - indicates end of extensions
*/
#define RDS_EXTHDR_NONE 0
/*
* This extension header is included in the very
* first message that is sent on a new connection,
* and identifies the protocol level. This will help
* rolling updates if a future change requires breaking
* the protocol.
* NB: This is no longer true for IB, where we do a version
* negotiation during the connection setup phase (protocol
* version information is included in the RDMA CM private data).
*/
#define RDS_EXTHDR_VERSION 1
struct rds_ext_header_version {
__be32 h_version;
};
/*
* This extension header is included in the RDS message
* chasing an RDMA operation.
*/
#define RDS_EXTHDR_RDMA 2
struct rds_ext_header_rdma {
__be32 h_rdma_rkey;
};
/*
* This extension header tells the peer about the
* destination <R_Key,offset> of the requested RDMA
* operation.
*/
#define RDS_EXTHDR_RDMA_DEST 3
struct rds_ext_header_rdma_dest {
__be32 h_rdma_rkey;
__be32 h_rdma_offset;
};
#define __RDS_EXTHDR_MAX 16 /* for now */
struct rds_incoming {
atomic_t i_refcount;
struct list_head i_item;
struct rds_connection *i_conn;
struct rds_header i_hdr;
unsigned long i_rx_jiffies;
__be32 i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
};
struct rds_mr {
struct rb_node r_rb_node;
atomic_t r_refcount;
u32 r_key;
/* A copy of the creation flags */
unsigned int r_use_once:1;
unsigned int r_invalidate:1;
unsigned int r_write:1;
/* This is for RDS_MR_DEAD.
* It would be nice & consistent to make this part of the above
* bit field here, but we need to use test_and_set_bit.
*/
unsigned long r_state;
struct rds_sock *r_sock; /* back pointer to the socket that owns us */
struct rds_transport *r_trans;
void *r_trans_private;
};
/* Flags for mr->r_state */
#define RDS_MR_DEAD 0
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
{
return r_key | (((u64) offset) << 32);
}
static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
return cookie;
}
static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
return cookie >> 32;
}
/* atomic operation types */
#define RDS_ATOMIC_TYPE_CSWP 0
#define RDS_ATOMIC_TYPE_FADD 1
/*
* m_sock_item and m_conn_item are on lists that are serialized under
* conn->c_lock. m_sock_item has additional meaning in that once it is empty
* the message will not be put back on the retransmit list after being sent.
* messages that are canceled while being sent rely on this.
*
* m_inc is used by loopback so that it can pass an incoming message straight
* back up into the rx path. It embeds a wire header which is also used by
* the send path, which is kind of awkward.
*
* m_sock_item indicates the message's presence on a socket's send or receive
* queue. m_rs will point to that socket.
*
* m_daddr is used by cancellation to prune messages to a given destination.
*
* The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
* nesting. As paths iterate over messages on a sock, or conn, they must
* also lock the conn, or sock, to remove the message from those lists too.
* Testing the flag to determine if the message is still on the lists lets
* us avoid testing the list_head directly. That means each path can use
* the message's list_head to keep it on a local list while juggling locks
* without confusing the other path.
*
* m_ack_seq is an optional field set by transports who need a different
* sequence number range to invalidate. They can use this in a callback
* that they pass to rds_send_drop_acked() to see if each message has been
* acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
* had ack_seq set yet.
*/
#define RDS_MSG_ON_SOCK 1
#define RDS_MSG_ON_CONN 2
#define RDS_MSG_HAS_ACK_SEQ 3
#define RDS_MSG_ACK_REQUIRED 4
#define RDS_MSG_RETRANSMITTED 5
#define RDS_MSG_MAPPED 6
#define RDS_MSG_PAGEVEC 7
struct rds_message {
atomic_t m_refcount;
struct list_head m_sock_item;
struct list_head m_conn_item;
struct rds_incoming m_inc;
u64 m_ack_seq;
__be32 m_daddr;
unsigned long m_flags;
/* Never access m_rs without holding m_rs_lock.
* Lock nesting is
* rm->m_rs_lock
* -> rs->rs_lock
*/
spinlock_t m_rs_lock;
wait_queue_head_t m_flush_wait;
struct rds_sock *m_rs;
/* cookie to send to remote, in rds header */
rds_rdma_cookie_t m_rdma_cookie;
unsigned int m_used_sgs;
unsigned int m_total_sgs;
void *m_final_op;
struct {
struct rm_atomic_op {
int op_type;
union {
struct {
uint64_t compare;
uint64_t swap;
uint64_t compare_mask;
uint64_t swap_mask;
} op_m_cswp;
struct {
uint64_t add;
uint64_t nocarry_mask;
} op_m_fadd;
};
u32 op_rkey;
u64 op_remote_addr;
unsigned int op_notify:1;
unsigned int op_recverr:1;
unsigned int op_mapped:1;
unsigned int op_silent:1;
unsigned int op_active:1;
struct scatterlist *op_sg;
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
} atomic;
struct rm_rdma_op {
u32 op_rkey;
u64 op_remote_addr;
unsigned int op_write:1;
unsigned int op_fence:1;
unsigned int op_notify:1;
unsigned int op_recverr:1;
unsigned int op_mapped:1;
unsigned int op_silent:1;
unsigned int op_active:1;
unsigned int op_bytes;
unsigned int op_nents;
unsigned int op_count;
struct scatterlist *op_sg;
struct rds_notifier *op_notifier;
struct rds_mr *op_rdma_mr;
} rdma;
struct rm_data_op {
unsigned int op_active:1;
unsigned int op_nents;
unsigned int op_count;
struct scatterlist *op_sg;
} data;
};
};
/*
* The RDS notifier is used (optionally) to tell the application about
* completed RDMA operations. Rather than keeping the whole rds message
* around on the queue, we allocate a small notifier that is put on the
* socket's notifier_list. Notifications are delivered to the application
* through control messages.
*/
struct rds_notifier {
struct list_head n_list;
uint64_t n_user_token;
int n_status;
};
/**
* struct rds_transport - transport specific behavioural hooks
*
* @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
* part of a message. The caller serializes on the send_sem so this
* doesn't need to be reentrant for a given conn. The header must be
* sent before the data payload. .xmit must be prepared to send a
* message with no data payload. .xmit should return the number of
* bytes that were sent down the connection, including header bytes.
* Returning 0 tells the caller that it doesn't need to perform any
* additional work now. This is usually the case when the transport has
* filled the sending queue for its connection and will handle
* triggering the rds thread to continue the send when space becomes
* available. Returning -EAGAIN tells the caller to retry the send
* immediately. Returning -ENOMEM tells the caller to retry the send at
* some point in the future.
*
* @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
* it returns the connection can not call rds_recv_incoming().
* This will only be called once after conn_connect returns
* non-zero success and will The caller serializes this with
* the send and connecting paths (xmit_* and conn_*). The
* transport is responsible for other serialization, including
* rds_recv_incoming(). This is called in process context but
* should try hard not to block.
*/
#define RDS_TRANS_IB 0
#define RDS_TRANS_IWARP 1
#define RDS_TRANS_TCP 2
#define RDS_TRANS_COUNT 3
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
unsigned int t_prefer_loopback:1;
unsigned int t_type;
int (*laddr_check)(__be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn);
void (*conn_shutdown)(struct rds_connection *conn);
void (*xmit_prepare)(struct rds_connection *conn);
void (*xmit_complete)(struct rds_connection *conn);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
int (*recv)(struct rds_connection *conn);
int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
size_t size);
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
void (*cm_connect_complete)(struct rds_connection *conn,
struct rdma_cm_event *event);
unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
unsigned int avail);
void (*exit)(void);
void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
struct rds_sock *rs, u32 *key_ret);
void (*sync_mr)(void *trans_private, int direction);
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
};
struct rds_sock {
struct sock rs_sk;
u64 rs_user_addr;
u64 rs_user_bytes;
/*
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
struct hlist_node rs_bound_node;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
__be16 rs_conn_port;
struct rds_transport *rs_transport;
/*
* rds_sendmsg caches the conn it used the last time around.
* This helps avoid costly lookups.
*/
struct rds_connection *rs_conn;
/* flag indicating we were congested or not */
int rs_congested;
/* seen congestion (ENOBUFS) when sending? */
int rs_seen_congestion;
/* rs_lock protects all these adjacent members before the newline */
spinlock_t rs_lock;
struct list_head rs_send_queue;
u32 rs_snd_bytes;
int rs_rcv_bytes;
struct list_head rs_notify_queue; /* currently used for failed RDMAs */
/* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
* to decide whether the application should be woken up.
* If not set, we use rs_cong_track to find out whether a cong map
* update arrived.
*/
uint64_t rs_cong_mask;
uint64_t rs_cong_notify;
struct list_head rs_cong_list;
unsigned long rs_cong_track;
/*
* rs_recv_lock protects the receive queue, and is
* used to serialize with rds_release.
*/
rwlock_t rs_recv_lock;
struct list_head rs_recv_queue;
/* just for stats reporting */
struct list_head rs_item;
/* these have their own lock */
spinlock_t rs_rdma_lock;
struct rb_root rs_rdma_keys;
/* Socket options - in case there will be more */
unsigned char rs_recverr,
rs_cong_monitor;
};
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
{
return container_of(sk, struct rds_sock, rs_sk);
}
static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
{
return &rs->rs_sk;
}
/*
* The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
* to account for overhead. We don't account for overhead, we just apply
* the number of payload bytes to the specified value.
*/
static inline int rds_sk_sndbuf(struct rds_sock *rs)
{
return rds_rs_to_sk(rs)->sk_sndbuf / 2;
}
static inline int rds_sk_rcvbuf(struct rds_sock *rs)
{
return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
}
struct rds_statistics {
uint64_t s_conn_reset;
uint64_t s_recv_drop_bad_checksum;
uint64_t s_recv_drop_old_seq;
uint64_t s_recv_drop_no_sock;
uint64_t s_recv_drop_dead_sock;
uint64_t s_recv_deliver_raced;
uint64_t s_recv_delivered;
uint64_t s_recv_queued;
uint64_t s_recv_immediate_retry;
uint64_t s_recv_delayed_retry;
uint64_t s_recv_ack_required;
uint64_t s_recv_rdma_bytes;
uint64_t s_recv_ping;
uint64_t s_send_queue_empty;
uint64_t s_send_queue_full;
uint64_t s_send_lock_contention;
uint64_t s_send_lock_queue_raced;
uint64_t s_send_immediate_retry;
uint64_t s_send_delayed_retry;
uint64_t s_send_drop_acked;
uint64_t s_send_ack_required;
uint64_t s_send_queued;
uint64_t s_send_rdma;
uint64_t s_send_rdma_bytes;
uint64_t s_send_pong;
uint64_t s_page_remainder_hit;
uint64_t s_page_remainder_miss;
uint64_t s_copy_to_user;
uint64_t s_copy_from_user;
uint64_t s_cong_update_queued;
uint64_t s_cong_update_received;
uint64_t s_cong_send_error;
uint64_t s_cong_send_blocked;
};
/* af_rds.c */
char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
static inline void __rds_wake_sk_sleep(struct sock *sk)
{
wait_queue_head_t *waitq = sk_sleep(sk);
if (!sock_flag(sk, SOCK_DEAD) && waitq)
wake_up(waitq);
}
extern wait_queue_head_t rds_poll_waitq;
/* bind.c */
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
/* cong.c */
int rds_cong_get_maps(struct rds_connection *conn);
void rds_cong_add_conn(struct rds_connection *conn);
void rds_cong_remove_conn(struct rds_connection *conn);
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
void rds_cong_queue_updates(struct rds_cong_map *map);
void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
int rds_cong_updated_since(unsigned long *recent);
void rds_cong_add_socket(struct rds_sock *);
void rds_cong_remove_socket(struct rds_sock *);
void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len);
__printf(2, 3)
void __rds_conn_error(struct rds_connection *conn, const char *, ...);
#define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
static inline int
rds_conn_transition(struct rds_connection *conn, int old, int new)
{
return atomic_cmpxchg(&conn->c_state, old, new) == old;
}
static inline int
rds_conn_state(struct rds_connection *conn)
{
return atomic_read(&conn->c_state);
}
static inline int
rds_conn_up(struct rds_connection *conn)
{
return atomic_read(&conn->c_state) == RDS_CONN_UP;
}
static inline int
rds_conn_connecting(struct rds_connection *conn)
{
return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
}
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
unsigned int type, const void *data, unsigned int len);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
struct iovec *first_iov, size_t size);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
void rds_message_wait(struct rds_message *rm);
void rds_message_unmapped(struct rds_message *rm);
static inline void rds_message_make_checksum(struct rds_header *hdr)
{
hdr->h_csum = 0;
hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
}
static inline int rds_message_verify_checksum(const struct rds_header *hdr)
{
return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
}
/* page.c */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
gfp_t gfp);
int rds_page_copy_user(struct page *page, unsigned long offset,
void __user *ptr, unsigned long bytes,
int to_user);
#define rds_page_copy_to_user(page, offset, ptr, bytes) \
rds_page_copy_user(page, offset, ptr, bytes, 1)
#define rds_page_copy_from_user(page, offset, ptr, bytes) \
rds_page_copy_user(page, offset, ptr, bytes, 0)
void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp);
int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size, int msg_flags);
void rds_clear_recv_queue(struct rds_sock *rs);
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
struct rds_info_iterator *iter,
__be32 saddr, __be32 daddr, int flip);
/* send.c */
int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t payload_len);
void rds_send_reset(struct rds_connection *conn);
int rds_send_xmit(struct rds_connection *conn);
struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
struct rm_rdma_op *);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
void rds_rdma_drop_keys(struct rds_sock *rs);
int rds_rdma_extra_size(struct rds_rdma_args *args);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
void rds_rdma_free_op(struct rm_rdma_op *ro);
void rds_atomic_free_op(struct rm_atomic_op *ao);
void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg);
void __rds_put_mr_final(struct rds_mr *mr);
static inline void rds_mr_put(struct rds_mr *mr)
{
if (atomic_dec_and_test(&mr->r_refcount))
__rds_put_mr_final(mr);
}
/* stats.c */
DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \
per_cpu(which, get_cpu()).member++; \
put_cpu(); \
} while (0)
#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
#define rds_stats_add_which(which, member, count) do { \
per_cpu(which, get_cpu()).member += count; \
put_cpu(); \
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
int rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
uint64_t *values, const char *const *names,
size_t nr);
/* sysctl.c */
int rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
extern unsigned long rds_sysctl_sndbuf_max;
extern unsigned long rds_sysctl_reconnect_min_jiffies;
extern unsigned long rds_sysctl_reconnect_max_jiffies;
extern unsigned int rds_sysctl_max_unacked_packets;
extern unsigned int rds_sysctl_max_unacked_bytes;
extern unsigned int rds_sysctl_ping_enable;
extern unsigned long rds_sysctl_trace_flags;
extern unsigned int rds_sysctl_trace_level;
/* threads.c */
int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_complete(struct rds_connection *conn);
/* transport.c */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
int rds_trans_init(void);
void rds_trans_exit(void);
#endif

549
net/rds/recv.c Normal file
View file

@ -0,0 +1,549 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
#include "rds.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
{
atomic_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
static void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
atomic_inc(&inc->i_refcount);
}
void rds_inc_put(struct rds_incoming *inc)
{
rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
if (atomic_dec_and_test(&inc->i_refcount)) {
BUG_ON(!list_empty(&inc->i_item));
inc->i_conn->c_trans->inc_free(inc);
}
}
EXPORT_SYMBOL_GPL(rds_inc_put);
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
struct rds_cong_map *map,
int delta, __be16 port)
{
int now_congested;
if (delta == 0)
return;
rs->rs_rcv_bytes += delta;
now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
"now_cong %d delta %d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
rds_sk_rcvbuf(rs), now_congested, delta);
/* wasn't -> am congested */
if (!rs->rs_congested && now_congested) {
rs->rs_congested = 1;
rds_cong_set_bit(map, port);
rds_cong_queue_updates(map);
}
/* was -> aren't congested */
/* Require more free space before reporting uncongested to prevent
bouncing cong/uncong state too often */
else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
rs->rs_congested = 0;
rds_cong_clear_bit(map, port);
rds_cong_queue_updates(map);
}
/* do nothing if no change in cong state */
}
/*
* Process all extension headers that come with this message.
*/
static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
{
struct rds_header *hdr = &inc->i_hdr;
unsigned int pos = 0, type, len;
union {
struct rds_ext_header_version version;
struct rds_ext_header_rdma rdma;
struct rds_ext_header_rdma_dest rdma_dest;
} buffer;
while (1) {
len = sizeof(buffer);
type = rds_message_next_extension(hdr, &pos, &buffer, &len);
if (type == RDS_EXTHDR_NONE)
break;
/* Process extension header here */
switch (type) {
case RDS_EXTHDR_RDMA:
rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
break;
case RDS_EXTHDR_RDMA_DEST:
/* We ignore the size for now. We could stash it
* somewhere and use it for error checking. */
inc->i_rdma_cookie = rds_rdma_make_cookie(
be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
break;
}
}
}
/*
* The transport must make sure that this is serialized against other
* rx and conn reset on this specific conn.
*
* We currently assert that only one fragmented message will be sent
* down a connection at a time. This lets us reassemble in the conn
* instead of per-flow which means that we don't have to go digging through
* flows to tear down partial reassembly progress on conn failure and
* we save flow lookup and locking for each frag arrival. It does mean
* that small messages will wait behind large ones. Fragmenting at all
* is only to reduce the memory consumption of pre-posted buffers.
*
* The caller passes in saddr and daddr instead of us getting it from the
* conn. This lets loopback, who only has one conn for both directions,
* tell us which roles the addrs in the conn are playing for this message.
*/
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp)
{
struct rds_sock *rs = NULL;
struct sock *sk;
unsigned long flags;
inc->i_conn = conn;
inc->i_rx_jiffies = jiffies;
rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
"flags 0x%x rx_jiffies %lu\n", conn,
(unsigned long long)conn->c_next_rx_seq,
inc,
(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
be32_to_cpu(inc->i_hdr.h_len),
be16_to_cpu(inc->i_hdr.h_sport),
be16_to_cpu(inc->i_hdr.h_dport),
inc->i_hdr.h_flags,
inc->i_rx_jiffies);
/*
* Sequence numbers should only increase. Messages get their
* sequence number as they're queued in a sending conn. They
* can be dropped, though, if the sending socket is closed before
* they hit the wire. So sequence numbers can skip forward
* under normal operation. They can also drop back in the conn
* failover case as previously sent messages are resent down the
* new instance of a conn. We drop those, otherwise we have
* to assume that the next valid seq does not come after a
* hole in the fragment stream.
*
* The headers don't give us a way to realize if fragments of
* a message have been dropped. We assume that frags that arrive
* to a flow are part of the current message on the flow that is
* being reassembled. This means that senders can't drop messages
* from the sending conn until all their frags are sent.
*
* XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption.
*/
if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
(inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq);
goto out;
}
conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
rds_stats_inc(s_recv_ping);
rds_send_pong(conn, inc->i_hdr.h_sport);
goto out;
}
rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
if (!rs) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
}
/* Process extension headers */
rds_recv_incoming_exthdrs(inc, rs);
/* We can be racing with rds_release() which marks the socket dead. */
sk = rds_rs_to_sk(rs);
/* serialize with rds_release -> sock_orphan */
write_lock_irqsave(&rs->rs_recv_lock, flags);
if (!sock_flag(sk, SOCK_DEAD)) {
rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
rds_stats_inc(s_recv_queued);
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
rds_inc_addref(inc);
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
} else {
rds_stats_inc(s_recv_drop_dead_sock);
}
write_unlock_irqrestore(&rs->rs_recv_lock, flags);
out:
if (rs)
rds_sock_put(rs);
}
EXPORT_SYMBOL_GPL(rds_recv_incoming);
/*
* be very careful here. This is being called as the condition in
* wait_event_*() needs to cope with being called many times.
*/
static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
{
unsigned long flags;
if (!*inc) {
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next,
struct rds_incoming,
i_item);
rds_inc_addref(*inc);
}
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
return *inc != NULL;
}
static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
int drop)
{
struct sock *sk = rds_rs_to_sk(rs);
int ret = 0;
unsigned long flags;
write_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&inc->i_item)) {
ret = 1;
if (drop) {
/* XXX make sure this i_conn is reliable */
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
-be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
list_del_init(&inc->i_item);
rds_inc_put(inc);
}
}
write_unlock_irqrestore(&rs->rs_recv_lock, flags);
rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
return ret;
}
/*
* Pull errors off the error queue.
* If msghdr is NULL, we will just purge the error queue.
*/
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
struct rds_notifier *notifier;
struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
unsigned int count = 0, max_messages = ~0U;
unsigned long flags;
LIST_HEAD(copy);
int err = 0;
/* put_cmsg copies to user space and thus may sleep. We can't do this
* with rs_lock held, so first grab as many notifications as we can stuff
* in the user provided cmsg buffer. We don't try to copy more, to avoid
* losing notifications - except when the buffer is so small that it wouldn't
* even hold a single notification. Then we give him as much of this single
* msg as we can squeeze in, and set MSG_CTRUNC.
*/
if (msghdr) {
max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
if (!max_messages)
max_messages = 1;
}
spin_lock_irqsave(&rs->rs_lock, flags);
while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
notifier = list_entry(rs->rs_notify_queue.next,
struct rds_notifier, n_list);
list_move(&notifier->n_list, &copy);
count++;
}
spin_unlock_irqrestore(&rs->rs_lock, flags);
if (!count)
return 0;
while (!list_empty(&copy)) {
notifier = list_entry(copy.next, struct rds_notifier, n_list);
if (msghdr) {
cmsg.user_token = notifier->n_user_token;
cmsg.status = notifier->n_status;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
sizeof(cmsg), &cmsg);
if (err)
break;
}
list_del_init(&notifier->n_list);
kfree(notifier);
}
/* If we bailed out because of an error in put_cmsg,
* we may be left with one or more notifications that we
* didn't process. Return them to the head of the list. */
if (!list_empty(&copy)) {
spin_lock_irqsave(&rs->rs_lock, flags);
list_splice(&copy, &rs->rs_notify_queue);
spin_unlock_irqrestore(&rs->rs_lock, flags);
}
return err;
}
/*
* Queue a congestion notification
*/
static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
{
uint64_t notify = rs->rs_cong_notify;
unsigned long flags;
int err;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
sizeof(notify), &notify);
if (err)
return err;
spin_lock_irqsave(&rs->rs_lock, flags);
rs->rs_cong_notify &= ~notify;
spin_unlock_irqrestore(&rs->rs_lock, flags);
return 0;
}
/*
* Receive any control messages.
*/
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
{
int ret = 0;
if (inc->i_rdma_cookie) {
ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
if (ret)
return ret;
}
return 0;
}
int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size, int msg_flags)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
long timeo;
int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct rds_incoming *inc = NULL;
/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
timeo = sock_rcvtimeo(sk, nonblock);
rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
if (msg_flags & MSG_OOB)
goto out;
while (1) {
/* If there are pending notifications, do those - and nothing else */
if (!list_empty(&rs->rs_notify_queue)) {
ret = rds_notify_queue_get(rs, msg);
break;
}
if (rs->rs_cong_notify) {
ret = rds_notify_cong(rs, msg);
break;
}
if (!rds_next_incoming(rs, &inc)) {
if (nonblock) {
ret = -EAGAIN;
break;
}
timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
(!list_empty(&rs->rs_notify_queue) ||
rs->rs_cong_notify ||
rds_next_incoming(rs, &inc)), timeo);
rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
timeo);
if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
continue;
ret = timeo;
if (ret == 0)
ret = -ETIMEDOUT;
break;
}
rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
size);
if (ret < 0)
break;
/*
* if the message we just copied isn't at the head of the
* recv queue then someone else raced us to return it, try
* to get the next message.
*/
if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
rds_inc_put(inc);
inc = NULL;
rds_stats_inc(s_recv_deliver_raced);
continue;
}
if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
if (msg_flags & MSG_TRUNC)
ret = be32_to_cpu(inc->i_hdr.h_len);
msg->msg_flags |= MSG_TRUNC;
}
if (rds_cmsg_recv(inc, msg)) {
ret = -EFAULT;
goto out;
}
rds_stats_inc(s_recv_delivered);
if (sin) {
sin->sin_family = AF_INET;
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr = inc->i_saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
msg->msg_namelen = sizeof(*sin);
}
break;
}
if (inc)
rds_inc_put(inc);
out:
return ret;
}
/*
* The socket is being shut down and we're asked to drop messages that were
* queued for recvmsg. The caller has unbound the socket so the receive path
* won't queue any more incoming fragments or messages on the socket.
*/
void rds_clear_recv_queue(struct rds_sock *rs)
{
struct sock *sk = rds_rs_to_sk(rs);
struct rds_incoming *inc, *tmp;
unsigned long flags;
write_lock_irqsave(&rs->rs_recv_lock, flags);
list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
-be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
list_del_init(&inc->i_item);
rds_inc_put(inc);
}
write_unlock_irqrestore(&rs->rs_recv_lock, flags);
}
/*
* inc->i_saddr isn't used here because it is only set in the receive
* path.
*/
void rds_inc_info_copy(struct rds_incoming *inc,
struct rds_info_iterator *iter,
__be32 saddr, __be32 daddr, int flip)
{
struct rds_info_message minfo;
minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
minfo.len = be32_to_cpu(inc->i_hdr.h_len);
if (flip) {
minfo.laddr = daddr;
minfo.faddr = saddr;
minfo.lport = inc->i_hdr.h_dport;
minfo.fport = inc->i_hdr.h_sport;
} else {
minfo.laddr = saddr;
minfo.faddr = daddr;
minfo.lport = inc->i_hdr.h_sport;
minfo.fport = inc->i_hdr.h_dport;
}
minfo.flags = 0;
rds_info_copy(iter, &minfo, sizeof(minfo));
}

1139
net/rds/send.c Normal file

File diff suppressed because it is too large Load diff

152
net/rds/stats.c Normal file
View file

@ -0,0 +1,152 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
#include "rds.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
static const char *const rds_stat_names[] = {
"conn_reset",
"recv_drop_bad_checksum",
"recv_drop_old_seq",
"recv_drop_no_sock",
"recv_drop_dead_sock",
"recv_deliver_raced",
"recv_delivered",
"recv_queued",
"recv_immediate_retry",
"recv_delayed_retry",
"recv_ack_required",
"recv_rdma_bytes",
"recv_ping",
"send_queue_empty",
"send_queue_full",
"send_lock_contention",
"send_lock_queue_raced",
"send_immediate_retry",
"send_delayed_retry",
"send_drop_acked",
"send_ack_required",
"send_queued",
"send_rdma",
"send_rdma_bytes",
"send_pong",
"page_remainder_hit",
"page_remainder_miss",
"copy_to_user",
"copy_from_user",
"cong_update_queued",
"cong_update_received",
"cong_send_error",
"cong_send_blocked",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
uint64_t *values, const char *const *names, size_t nr)
{
struct rds_info_counter ctr;
size_t i;
for (i = 0; i < nr; i++) {
BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
ctr.name[sizeof(ctr.name) - 1] = '\0';
ctr.value = values[i];
rds_info_copy(iter, &ctr, sizeof(ctr));
}
}
EXPORT_SYMBOL_GPL(rds_stats_info_copy);
/*
* This gives global counters across all the transports. The strings
* are copied in so that the tool doesn't need knowledge of the specific
* stats that we're exporting. Some are pretty implementation dependent
* and may change over time. That doesn't stop them from being useful.
*
* This is the only function in the chain that knows about the byte granular
* length in userspace. It converts it to number of stat entries that the
* rest of the functions operate in.
*/
static void rds_stats_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
struct rds_statistics stats = {0, };
uint64_t *src;
uint64_t *sum;
size_t i;
int cpu;
unsigned int avail;
avail = len / sizeof(struct rds_info_counter);
if (avail < ARRAY_SIZE(rds_stat_names)) {
avail = 0;
goto trans;
}
for_each_online_cpu(cpu) {
src = (uint64_t *)&(per_cpu(rds_stats, cpu));
sum = (uint64_t *)&stats;
for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
*(sum++) += *(src++);
}
rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
ARRAY_SIZE(rds_stat_names));
avail -= ARRAY_SIZE(rds_stat_names);
trans:
lens->each = sizeof(struct rds_info_counter);
lens->nr = rds_trans_stats_info_copy(iter, avail) +
ARRAY_SIZE(rds_stat_names);
}
void rds_stats_exit(void)
{
rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}
int rds_stats_init(void)
{
rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
return 0;
}

109
net/rds/sysctl.c Normal file
View file

@ -0,0 +1,109 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "rds.h"
static struct ctl_table_header *rds_sysctl_reg_table;
static unsigned long rds_sysctl_reconnect_min = 1;
static unsigned long rds_sysctl_reconnect_max = ~0UL;
unsigned long rds_sysctl_reconnect_min_jiffies;
unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
unsigned int rds_sysctl_max_unacked_packets = 8;
unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
unsigned int rds_sysctl_ping_enable = 1;
static struct ctl_table rds_sysctl_rds_table[] = {
{
.procname = "reconnect_min_delay_ms",
.data = &rds_sysctl_reconnect_min_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min,
.extra2 = &rds_sysctl_reconnect_max_jiffies,
},
{
.procname = "reconnect_max_delay_ms",
.data = &rds_sysctl_reconnect_max_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min_jiffies,
.extra2 = &rds_sysctl_reconnect_max,
},
{
.procname = "max_unacked_packets",
.data = &rds_sysctl_max_unacked_packets,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "max_unacked_bytes",
.data = &rds_sysctl_max_unacked_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "ping_enable",
.data = &rds_sysctl_ping_enable,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ }
};
void rds_sysctl_exit(void)
{
unregister_net_sysctl_table(rds_sysctl_reg_table);
}
int rds_sysctl_init(void)
{
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
if (!rds_sysctl_reg_table)
return -ENOMEM;
return 0;
}

326
net/rds/tcp.c Normal file
View file

@ -0,0 +1,326 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/module.h>
#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);
static unsigned int rds_tcp_tc_count;
/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
static LIST_HEAD(rds_tcp_conn_list);
static struct kmem_cache *rds_tcp_conn_slab;
#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
/* doing it this way avoids calling tcp_sk() */
void rds_tcp_nonagle(struct socket *sock)
{
mm_segment_t oldfs = get_fs();
int val = 1;
set_fs(KERNEL_DS);
sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
sizeof(val));
set_fs(oldfs);
}
void rds_tcp_tune(struct socket *sock)
{
struct sock *sk = sock->sk;
rds_tcp_nonagle(sock);
/*
* We're trying to saturate gigabit with the default,
* see svc_sock_setbufsize().
*/
lock_sock(sk);
sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
release_sock(sk);
}
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
{
return tcp_sk(tc->t_sock->sk)->snd_nxt;
}
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
{
return tcp_sk(tc->t_sock->sk)->snd_una;
}
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc)
{
rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
write_lock_bh(&sock->sk->sk_callback_lock);
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_del_init(&tc->t_list_item);
rds_tcp_tc_count--;
spin_unlock(&rds_tcp_tc_list_lock);
tc->t_sock = NULL;
sock->sk->sk_write_space = tc->t_orig_write_space;
sock->sk->sk_data_ready = tc->t_orig_data_ready;
sock->sk->sk_state_change = tc->t_orig_state_change;
sock->sk->sk_user_data = NULL;
write_unlock_bh(&sock->sk->sk_callback_lock);
}
/*
* This is the only path that sets tc->t_sock. Send and receive trust that
* it is set. The RDS_CONN_CONNECTED bit protects those paths from being
* called while it isn't set.
*/
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
write_lock_bh(&sock->sk->sk_callback_lock);
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
rds_tcp_tc_count++;
spin_unlock(&rds_tcp_tc_list_lock);
/* accepted sockets need our listen data ready undone */
if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
tc->t_sock = sock;
tc->conn = conn;
tc->t_orig_data_ready = sock->sk->sk_data_ready;
tc->t_orig_write_space = sock->sk->sk_write_space;
tc->t_orig_state_change = sock->sk->sk_state_change;
sock->sk->sk_user_data = conn;
sock->sk->sk_data_ready = rds_tcp_data_ready;
sock->sk->sk_write_space = rds_tcp_write_space;
sock->sk->sk_state_change = rds_tcp_state_change;
write_unlock_bh(&sock->sk->sk_callback_lock);
}
static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
struct rds_info_tcp_socket tsinfo;
struct rds_tcp_connection *tc;
unsigned long flags;
struct sockaddr_in sin;
int sinlen;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
if (len / sizeof(tsinfo) < rds_tcp_tc_count)
goto out;
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
tsinfo.local_addr = sin.sin_addr.s_addr;
tsinfo.local_port = sin.sin_port;
sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
tsinfo.peer_addr = sin.sin_addr.s_addr;
tsinfo.peer_port = sin.sin_port;
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
tsinfo.last_expected_una = tc->t_last_expected_una;
tsinfo.last_seen_una = tc->t_last_seen_una;
rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
}
out:
lens->nr = rds_tcp_tc_count;
lens->each = sizeof(tsinfo);
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
static int rds_tcp_laddr_check(__be32 addr)
{
if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
return 0;
return -EADDRNOTAVAIL;
}
static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
{
struct rds_tcp_connection *tc;
tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
if (!tc)
return -ENOMEM;
tc->t_sock = NULL;
tc->t_tinc = NULL;
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
conn->c_transport_data = tc;
spin_lock_irq(&rds_tcp_conn_lock);
list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
spin_unlock_irq(&rds_tcp_conn_lock);
rdsdebug("alloced tc %p\n", conn->c_transport_data);
return 0;
}
static void rds_tcp_conn_free(void *arg)
{
struct rds_tcp_connection *tc = arg;
unsigned long flags;
rdsdebug("freeing tc %p\n", tc);
spin_lock_irqsave(&rds_tcp_conn_lock, flags);
list_del(&tc->t_tcp_node);
spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
kmem_cache_free(rds_tcp_conn_slab, tc);
}
static void rds_tcp_destroy_conns(void)
{
struct rds_tcp_connection *tc, *_tc;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
spin_lock_irq(&rds_tcp_conn_lock);
list_splice(&rds_tcp_conn_list, &tmp_list);
INIT_LIST_HEAD(&rds_tcp_conn_list);
spin_unlock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
if (tc->conn->c_passive)
rds_conn_destroy(tc->conn->c_passive);
rds_conn_destroy(tc->conn);
}
}
static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
rds_tcp_listen_stop();
rds_tcp_destroy_conns();
rds_trans_unregister(&rds_tcp_transport);
rds_tcp_recv_exit();
kmem_cache_destroy(rds_tcp_conn_slab);
}
module_exit(rds_tcp_exit);
struct rds_transport rds_tcp_transport = {
.laddr_check = rds_tcp_laddr_check,
.xmit_prepare = rds_tcp_xmit_prepare,
.xmit_complete = rds_tcp_xmit_complete,
.xmit = rds_tcp_xmit,
.recv = rds_tcp_recv,
.conn_alloc = rds_tcp_conn_alloc,
.conn_free = rds_tcp_conn_free,
.conn_connect = rds_tcp_conn_connect,
.conn_shutdown = rds_tcp_conn_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
.inc_free = rds_tcp_inc_free,
.stats_info_copy = rds_tcp_stats_info_copy,
.exit = rds_tcp_exit,
.t_owner = THIS_MODULE,
.t_name = "tcp",
.t_type = RDS_TRANS_TCP,
.t_prefer_loopback = 1,
};
static int rds_tcp_init(void)
{
int ret;
rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
sizeof(struct rds_tcp_connection),
0, 0, NULL);
if (!rds_tcp_conn_slab) {
ret = -ENOMEM;
goto out;
}
ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
ret = rds_trans_register(&rds_tcp_transport);
if (ret)
goto out_recv;
ret = rds_tcp_listen_init();
if (ret)
goto out_register;
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
goto out;
out_register:
rds_trans_unregister(&rds_tcp_transport);
out_recv:
rds_tcp_recv_exit();
out_slab:
kmem_cache_destroy(rds_tcp_conn_slab);
out:
return ret;
}
module_init(rds_tcp_init);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: TCP transport");
MODULE_LICENSE("Dual BSD/GPL");

88
net/rds/tcp.h Normal file
View file

@ -0,0 +1,88 @@
#ifndef _RDS_TCP_H
#define _RDS_TCP_H
#define RDS_TCP_PORT 16385
struct rds_tcp_incoming {
struct rds_incoming ti_inc;
struct sk_buff_head ti_skb_list;
};
struct rds_tcp_connection {
struct list_head t_tcp_node;
struct rds_connection *conn;
struct socket *t_sock;
void *t_orig_write_space;
void *t_orig_data_ready;
void *t_orig_state_change;
struct rds_tcp_incoming *t_tinc;
size_t t_tinc_hdr_rem;
size_t t_tinc_data_rem;
/* XXX error report? */
struct work_struct t_conn_w;
struct work_struct t_send_w;
struct work_struct t_down_w;
struct work_struct t_recv_w;
/* for info exporting only */
struct list_head t_list_item;
u32 t_last_sent_nxt;
u32 t_last_expected_una;
u32 t_last_seen_una;
};
struct rds_tcp_statistics {
uint64_t s_tcp_data_ready_calls;
uint64_t s_tcp_write_space_calls;
uint64_t s_tcp_sndbuf_full;
uint64_t s_tcp_connect_raced;
uint64_t s_tcp_listen_closed_stale;
};
/* tcp.c */
void rds_tcp_tune(struct socket *sock);
void rds_tcp_nonagle(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
void rds_tcp_restore_callbacks(struct socket *sock,
struct rds_tcp_connection *tc);
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
extern struct rds_transport rds_tcp_transport;
/* tcp_connect.c */
int rds_tcp_conn_connect(struct rds_connection *conn);
void rds_tcp_conn_shutdown(struct rds_connection *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
int rds_tcp_listen_init(void);
void rds_tcp_listen_stop(void);
void rds_tcp_listen_data_ready(struct sock *sk);
/* tcp_recv.c */
int rds_tcp_recv_init(void);
void rds_tcp_recv_exit(void);
void rds_tcp_data_ready(struct sock *sk);
int rds_tcp_recv(struct rds_connection *conn);
void rds_tcp_inc_free(struct rds_incoming *inc);
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
size_t size);
/* tcp_send.c */
void rds_tcp_xmit_prepare(struct rds_connection *conn);
void rds_tcp_xmit_complete(struct rds_connection *conn);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
#endif

155
net/rds/tcp_connect.c Normal file
View file

@ -0,0 +1,155 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
void rds_tcp_state_change(struct sock *sk)
{
void (*state_change)(struct sock *sk);
struct rds_connection *conn;
struct rds_tcp_connection *tc;
read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
state_change = sk->sk_state_change;
goto out;
}
tc = conn->c_transport_data;
state_change = tc->t_orig_state_change;
rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
switch(sk->sk_state) {
/* ignore connecting sockets as they make progress */
case TCP_SYN_SENT:
case TCP_SYN_RECV:
break;
case TCP_ESTABLISHED:
rds_connect_complete(conn);
break;
case TCP_CLOSE:
rds_conn_drop(conn);
default:
break;
}
out:
read_unlock(&sk->sk_callback_lock);
state_change(sk);
}
int rds_tcp_conn_connect(struct rds_connection *conn)
{
struct socket *sock = NULL;
struct sockaddr_in src, dest;
int ret;
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
rds_tcp_tune(sock);
src.sin_family = AF_INET;
src.sin_addr.s_addr = (__force u32)conn->c_laddr;
src.sin_port = (__force u16)htons(0);
ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
if (ret) {
rdsdebug("bind failed with %d at address %pI4\n",
ret, &conn->c_laddr);
goto out;
}
dest.sin_family = AF_INET;
dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
/*
* once we call connect() we can start getting callbacks and they
* own the socket
*/
rds_tcp_set_callbacks(sock, conn);
ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
O_NONBLOCK);
rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
if (ret == 0)
sock = NULL;
else
rds_tcp_restore_callbacks(sock, conn->c_transport_data);
out:
if (sock)
sock_release(sock);
return ret;
}
/*
* Before killing the tcp socket this needs to serialize with callbacks. The
* caller has already grabbed the sending sem so we're serialized with other
* senders.
*
* TCP calls the callbacks with the sock lock so we hold it while we reset the
* callbacks to those set by TCP. Our callbacks won't execute again once we
* hold the sock lock.
*/
void rds_tcp_conn_shutdown(struct rds_connection *conn)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
struct socket *sock = tc->t_sock;
rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
if (sock) {
sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
lock_sock(sock->sk);
rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
release_sock(sock->sk);
sock_release(sock);
}
if (tc->t_tinc) {
rds_inc_put(&tc->t_tinc->ti_inc);
tc->t_tinc = NULL;
}
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
}

200
net/rds/tcp_listen.c Normal file
View file

@ -0,0 +1,200 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
/*
* cheesy, but simple..
*/
static void rds_tcp_accept_worker(struct work_struct *work);
static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
static struct socket *rds_tcp_listen_sock;
static int rds_tcp_accept_one(struct socket *sock)
{
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
goto out;
new_sock->type = sock->type;
new_sock->ops = sock->ops;
ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
if (ret < 0)
goto out;
rds_tcp_tune(new_sock);
inet = inet_sk(new_sock->sk);
rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport));
conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
&rds_tcp_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
}
/*
* see the comment above rds_queue_delayed_reconnect()
*/
if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
if (rds_conn_state(conn) == RDS_CONN_UP)
rds_tcp_stats_inc(s_tcp_listen_closed_stale);
else
rds_tcp_stats_inc(s_tcp_connect_raced);
rds_conn_drop(conn);
ret = 0;
goto out;
}
rds_tcp_set_callbacks(new_sock, conn);
rds_connect_complete(conn);
new_sock = NULL;
ret = 0;
out:
if (new_sock)
sock_release(new_sock);
return ret;
}
static void rds_tcp_accept_worker(struct work_struct *work)
{
while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
cond_resched();
}
void rds_tcp_listen_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
rdsdebug("listen data ready sk %p\n", sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
if (!ready) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
/*
* ->sk_data_ready is also called for a newly established child socket
* before it has been accepted and the accepter has set up their
* data_ready.. we only want to queue listen work for our listening
* socket
*/
if (sk->sk_state == TCP_LISTEN)
queue_work(rds_wq, &rds_tcp_listen_work);
out:
read_unlock(&sk->sk_callback_lock);
ready(sk);
}
int rds_tcp_listen_init(void)
{
struct sockaddr_in sin;
struct socket *sock = NULL;
int ret;
ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
goto out;
sock->sk->sk_reuse = SK_CAN_REUSE;
rds_tcp_nonagle(sock);
write_lock_bh(&sock->sk->sk_callback_lock);
sock->sk->sk_user_data = sock->sk->sk_data_ready;
sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
write_unlock_bh(&sock->sk->sk_callback_lock);
sin.sin_family = PF_INET;
sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
if (ret < 0)
goto out;
ret = sock->ops->listen(sock, 64);
if (ret < 0)
goto out;
rds_tcp_listen_sock = sock;
sock = NULL;
out:
if (sock)
sock_release(sock);
return ret;
}
void rds_tcp_listen_stop(void)
{
struct socket *sock = rds_tcp_listen_sock;
struct sock *sk;
if (!sock)
return;
sk = sock->sk;
/* serialize with and prevent further callbacks */
lock_sock(sk);
write_lock_bh(&sk->sk_callback_lock);
if (sk->sk_user_data) {
sk->sk_data_ready = sk->sk_user_data;
sk->sk_user_data = NULL;
}
write_unlock_bh(&sk->sk_callback_lock);
release_sock(sk);
/* wait for accepts to stop and close the socket */
flush_workqueue(rds_wq);
sock_release(sock);
rds_tcp_listen_sock = NULL;
}

356
net/rds/tcp_recv.c Normal file
View file

@ -0,0 +1,356 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
static struct kmem_cache *rds_tcp_incoming_slab;
static void rds_tcp_inc_purge(struct rds_incoming *inc)
{
struct rds_tcp_incoming *tinc;
tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
rdsdebug("purging tinc %p inc %p\n", tinc, inc);
skb_queue_purge(&tinc->ti_skb_list);
}
void rds_tcp_inc_free(struct rds_incoming *inc)
{
struct rds_tcp_incoming *tinc;
tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
rds_tcp_inc_purge(inc);
rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
kmem_cache_free(rds_tcp_incoming_slab, tinc);
}
/*
* this is pretty lame, but, whatever.
*/
int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
size_t size)
{
struct rds_tcp_incoming *tinc;
struct iovec *iov, tmp;
struct sk_buff *skb;
unsigned long to_copy, skb_off;
int ret = 0;
if (size == 0)
goto out;
tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
iov = first_iov;
tmp = *iov;
skb_queue_walk(&tinc->ti_skb_list, skb) {
skb_off = 0;
while (skb_off < skb->len) {
while (tmp.iov_len == 0) {
iov++;
tmp = *iov;
}
to_copy = min(tmp.iov_len, size);
to_copy = min(to_copy, skb->len - skb_off);
rdsdebug("ret %d size %zu skb %p skb_off %lu "
"skblen %d iov_base %p iov_len %zu cpy %lu\n",
ret, size, skb, skb_off, skb->len,
tmp.iov_base, tmp.iov_len, to_copy);
/* modifies tmp as it copies */
if (skb_copy_datagram_iovec(skb, skb_off, &tmp,
to_copy)) {
ret = -EFAULT;
goto out;
}
rds_stats_add(s_copy_to_user, to_copy);
size -= to_copy;
ret += to_copy;
skb_off += to_copy;
if (size == 0)
goto out;
}
}
out:
return ret;
}
/*
* We have a series of skbs that have fragmented pieces of the congestion
* bitmap. They must add up to the exact size of the congestion bitmap. We
* use the skb helpers to copy those into the pages that make up the in-memory
* congestion bitmap for the remote address of this connection. We then tell
* the congestion core that the bitmap has been changed so that it can wake up
* sleepers.
*
* This is racing with sending paths which are using test_bit to see if the
* bitmap indicates that their recipient is congested.
*/
static void rds_tcp_cong_recv(struct rds_connection *conn,
struct rds_tcp_incoming *tinc)
{
struct sk_buff *skb;
unsigned int to_copy, skb_off;
unsigned int map_off;
unsigned int map_page;
struct rds_cong_map *map;
int ret;
/* catch completely corrupt packets */
if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
return;
map_page = 0;
map_off = 0;
map = conn->c_fcong;
skb_queue_walk(&tinc->ti_skb_list, skb) {
skb_off = 0;
while (skb_off < skb->len) {
to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
skb->len - skb_off);
BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
/* only returns 0 or -error */
ret = skb_copy_bits(skb, skb_off,
(void *)map->m_page_addrs[map_page] + map_off,
to_copy);
BUG_ON(ret != 0);
skb_off += to_copy;
map_off += to_copy;
if (map_off == PAGE_SIZE) {
map_off = 0;
map_page++;
}
}
}
rds_cong_map_updated(map, ~(u64) 0);
}
struct rds_tcp_desc_arg {
struct rds_connection *conn;
gfp_t gfp;
};
static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t len)
{
struct rds_tcp_desc_arg *arg = desc->arg.data;
struct rds_connection *conn = arg->conn;
struct rds_tcp_connection *tc = conn->c_transport_data;
struct rds_tcp_incoming *tinc = tc->t_tinc;
struct sk_buff *clone;
size_t left = len, to_copy;
rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
len);
/*
* tcp_read_sock() interprets partial progress as an indication to stop
* processing.
*/
while (left) {
if (!tinc) {
tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
arg->gfp);
if (!tinc) {
desc->error = -ENOMEM;
goto out;
}
tc->t_tinc = tinc;
rdsdebug("alloced tinc %p\n", tinc);
rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
/*
* XXX * we might be able to use the __ variants when
* we've already serialized at a higher level.
*/
skb_queue_head_init(&tinc->ti_skb_list);
}
if (left && tc->t_tinc_hdr_rem) {
to_copy = min(tc->t_tinc_hdr_rem, left);
rdsdebug("copying %zu header from skb %p\n", to_copy,
skb);
skb_copy_bits(skb, offset,
(char *)&tinc->ti_inc.i_hdr +
sizeof(struct rds_header) -
tc->t_tinc_hdr_rem,
to_copy);
tc->t_tinc_hdr_rem -= to_copy;
left -= to_copy;
offset += to_copy;
if (tc->t_tinc_hdr_rem == 0) {
/* could be 0 for a 0 len message */
tc->t_tinc_data_rem =
be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
}
}
if (left && tc->t_tinc_data_rem) {
clone = skb_clone(skb, arg->gfp);
if (!clone) {
desc->error = -ENOMEM;
goto out;
}
to_copy = min(tc->t_tinc_data_rem, left);
pskb_pull(clone, offset);
pskb_trim(clone, to_copy);
skb_queue_tail(&tinc->ti_skb_list, clone);
rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
"clone %p data %p len %d\n",
skb, skb->data, skb->len, offset, to_copy,
clone, clone->data, clone->len);
tc->t_tinc_data_rem -= to_copy;
left -= to_copy;
offset += to_copy;
}
if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_tcp_cong_recv(conn, tinc);
else
rds_recv_incoming(conn, conn->c_faddr,
conn->c_laddr, &tinc->ti_inc,
arg->gfp);
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
tc->t_tinc = NULL;
rds_inc_put(&tinc->ti_inc);
tinc = NULL;
}
}
out:
rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
len, left, skb->len,
skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
return len - left;
}
/* the caller has to hold the sock lock */
static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
struct socket *sock = tc->t_sock;
read_descriptor_t desc;
struct rds_tcp_desc_arg arg;
/* It's like glib in the kernel! */
arg.conn = conn;
arg.gfp = gfp;
desc.arg.data = &arg;
desc.error = 0;
desc.count = 1; /* give more than one skb per call */
tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
desc.error);
return desc.error;
}
/*
* We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
* data_ready.
*
* if we fail to allocate we're in trouble.. blindly wait some time before
* trying again to see if the VM can free up something for us.
*/
int rds_tcp_recv(struct rds_connection *conn)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
struct socket *sock = tc->t_sock;
int ret = 0;
rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
lock_sock(sock->sk);
ret = rds_tcp_read_sock(conn, GFP_KERNEL);
release_sock(sock->sk);
return ret;
}
void rds_tcp_data_ready(struct sock *sk)
{
void (*ready)(struct sock *sk);
struct rds_connection *conn;
struct rds_tcp_connection *tc;
rdsdebug("data ready sk %p\n", sk);
read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) { /* check for teardown race */
ready = sk->sk_data_ready;
goto out;
}
tc = conn->c_transport_data;
ready = tc->t_orig_data_ready;
rds_tcp_stats_inc(s_tcp_data_ready_calls);
if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
out:
read_unlock(&sk->sk_callback_lock);
ready(sk);
}
int rds_tcp_recv_init(void)
{
rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
sizeof(struct rds_tcp_incoming),
0, 0, NULL);
if (!rds_tcp_incoming_slab)
return -ENOMEM;
return 0;
}
void rds_tcp_recv_exit(void)
{
kmem_cache_destroy(rds_tcp_incoming_slab);
}

215
net/rds/tcp_send.c Normal file
View file

@ -0,0 +1,215 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
static void rds_tcp_cork(struct socket *sock, int val)
{
mm_segment_t oldfs;
oldfs = get_fs();
set_fs(KERNEL_DS);
sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
sizeof(val));
set_fs(oldfs);
}
void rds_tcp_xmit_prepare(struct rds_connection *conn)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
rds_tcp_cork(tc->t_sock, 1);
}
void rds_tcp_xmit_complete(struct rds_connection *conn)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
rds_tcp_cork(tc->t_sock, 0);
}
/* the core send_sem serializes this with other xmit and shutdown */
static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
{
struct kvec vec = {
.iov_base = data,
.iov_len = len,
};
struct msghdr msg = {
.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
};
return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
}
/* the core send_sem serializes this with other xmit and shutdown */
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off)
{
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
int ret = 0;
if (hdr_off == 0) {
/*
* m_ack_seq is set to the sequence number of the last byte of
* header and data. see rds_tcp_is_acked().
*/
tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
rm->m_ack_seq = tc->t_last_sent_nxt +
sizeof(struct rds_header) +
be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
smp_mb__before_atomic();
set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
tc->t_last_expected_una = rm->m_ack_seq + 1;
rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
rm, rds_tcp_snd_nxt(tc),
(unsigned long long)rm->m_ack_seq);
}
if (hdr_off < sizeof(struct rds_header)) {
/* see rds_tcp_write_space() */
set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
ret = rds_tcp_sendmsg(tc->t_sock,
(void *)&rm->m_inc.i_hdr + hdr_off,
sizeof(rm->m_inc.i_hdr) - hdr_off);
if (ret < 0)
goto out;
done += ret;
if (hdr_off + done != sizeof(struct rds_header))
goto out;
}
while (sg < rm->data.op_nents) {
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
MSG_DONTWAIT|MSG_NOSIGNAL);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
ret);
if (ret <= 0)
break;
off += ret;
done += ret;
if (off == rm->data.op_sg[sg].length) {
off = 0;
sg++;
}
}
out:
if (ret <= 0) {
/* write_space will hit after EAGAIN, all else fatal */
if (ret == -EAGAIN) {
rds_tcp_stats_inc(s_tcp_sndbuf_full);
ret = 0;
} else {
printk(KERN_WARNING "RDS/tcp: send to %pI4 "
"returned %d, disconnecting and reconnecting\n",
&conn->c_faddr, ret);
rds_conn_drop(conn);
}
}
if (done == 0)
done = ret;
return done;
}
/*
* rm->m_ack_seq is set to the tcp sequence number that corresponds to the
* last byte of the message, including the header. This means that the
* entire message has been received if rm->m_ack_seq is "before" the next
* unacked byte of the TCP sequence space. We have to do very careful
* wrapping 32bit comparisons here.
*/
static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
{
if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
return 0;
return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
}
void rds_tcp_write_space(struct sock *sk)
{
void (*write_space)(struct sock *sk);
struct rds_connection *conn;
struct rds_tcp_connection *tc;
read_lock(&sk->sk_callback_lock);
conn = sk->sk_user_data;
if (!conn) {
write_space = sk->sk_write_space;
goto out;
}
tc = conn->c_transport_data;
rdsdebug("write_space for tc %p\n", tc);
write_space = tc->t_orig_write_space;
rds_tcp_stats_inc(s_tcp_write_space_calls);
rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
tc->t_last_seen_una = rds_tcp_snd_una(tc);
rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
out:
read_unlock(&sk->sk_callback_lock);
/*
* write_space is only called when data leaves tcp's send queue if
* SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
* data in tcp's send queue because we use write_space to parse the
* sequence numbers and notice that rds messages have been fully
* received.
*
* tcp's write_space clears SOCK_NOSPACE if the send queue has more
* than a certain amount of space. So we need to set it again *after*
* we call tcp's write_space or else we might only get called on the
* first of a series of incoming tcp acks.
*/
write_space(sk);
if (sk->sk_socket)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
}

74
net/rds/tcp_stats.c Normal file
View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "rds.h"
#include "tcp.h"
DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
____cacheline_aligned;
static const char * const rds_tcp_stat_names[] = {
"tcp_data_ready_calls",
"tcp_write_space_calls",
"tcp_sndbuf_full",
"tcp_connect_raced",
"tcp_listen_closed_stale",
};
unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail)
{
struct rds_tcp_statistics stats = {0, };
uint64_t *src;
uint64_t *sum;
size_t i;
int cpu;
if (avail < ARRAY_SIZE(rds_tcp_stat_names))
goto out;
for_each_online_cpu(cpu) {
src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
sum = (uint64_t *)&stats;
for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
*(sum++) += *(src++);
}
rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
ARRAY_SIZE(rds_tcp_stat_names));
out:
return ARRAY_SIZE(rds_tcp_stat_names);
}

222
net/rds/threads.c Normal file
View file

@ -0,0 +1,222 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/export.h>
#include "rds.h"
/*
* All of connection management is simplified by serializing it through
* work queues that execute in a connection managing thread.
*
* TCP wants to send acks through sendpage() in response to data_ready(),
* but it needs a process context to do so.
*
* The receive paths need to allocate but can't drop packets (!) so we have
* a thread around to block allocating if the receive fast path sees an
* allocation failure.
*/
/* Grand Unified Theory of connection life cycle:
* At any point in time, the connection can be in one of these states:
* DOWN, CONNECTING, UP, DISCONNECTING, ERROR
*
* The following transitions are possible:
* ANY -> ERROR
* UP -> DISCONNECTING
* ERROR -> DISCONNECTING
* DISCONNECTING -> DOWN
* DOWN -> CONNECTING
* CONNECTING -> UP
*
* Transition to state DISCONNECTING/DOWN:
* - Inside the shutdown worker; synchronizes with xmit path
* through RDS_IN_XMIT, and with connection management callbacks
* via c_cm_lock.
*
* For receive callbacks, we rely on the underlying transport
* (TCP, IB/RDMA) to provide the necessary synchronisation.
*/
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
void rds_connect_complete(struct rds_connection *conn)
{
if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
rds_conn_drop(conn);
return;
}
rdsdebug("conn %p for %pI4 to %pI4 complete\n",
conn, &conn->c_laddr, &conn->c_faddr);
conn->c_reconnect_jiffies = 0;
set_bit(0, &conn->c_map_queued);
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
* This random exponential backoff is relied on to eventually resolve racing
* connects.
*
* If connect attempts race then both parties drop both connections and come
* here to wait for a random amount of time before trying again. Eventually
* the backoff range will be so much greater than the time it takes to
* establish a connection that one of the pair will establish the connection
* before the other's random delay fires.
*
* Connection attempts that arrive while a connection is already established
* are also considered to be racing connects. This lets a connection from
* a rebooted machine replace an existing stale connection before the transport
* notices that the connection has failed.
*
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
void rds_queue_reconnect(struct rds_connection *conn)
{
unsigned long rand;
rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
conn, &conn->c_laddr, &conn->c_faddr,
conn->c_reconnect_jiffies);
set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
if (conn->c_reconnect_jiffies == 0) {
conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
return;
}
get_random_bytes(&rand, sizeof(rand));
rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr);
queue_delayed_work(rds_wq, &conn->c_conn_w,
rand % conn->c_reconnect_jiffies);
conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
rds_sysctl_reconnect_max_jiffies);
}
void rds_connect_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
int ret;
clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
ret = conn->c_trans->conn_connect(conn);
rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) {
if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
rds_queue_reconnect(conn);
else
rds_conn_error(conn, "RDS: connect failed\n");
}
}
}
void rds_send_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
int ret;
if (rds_conn_state(conn) == RDS_CONN_UP) {
ret = rds_send_xmit(conn);
rdsdebug("conn %p ret %d\n", conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
queue_delayed_work(rds_wq, &conn->c_send_w, 2);
default:
break;
}
}
}
void rds_recv_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
int ret;
if (rds_conn_state(conn) == RDS_CONN_UP) {
ret = conn->c_trans->recv(conn);
rdsdebug("conn %p ret %d\n", conn, ret);
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
default:
break;
}
}
}
void rds_shutdown_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
rds_conn_shutdown(conn);
}
void rds_threads_exit(void)
{
destroy_workqueue(rds_wq);
}
int rds_threads_init(void)
{
rds_wq = create_singlethread_workqueue("krdsd");
if (!rds_wq)
return -ENOMEM;
return 0;
}

137
net/rds/transport.c Normal file
View file

@ -0,0 +1,137 @@
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/in.h>
#include "rds.h"
#include "loop.h"
static struct rds_transport *transports[RDS_TRANS_COUNT];
static DECLARE_RWSEM(rds_trans_sem);
int rds_trans_register(struct rds_transport *trans)
{
BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
down_write(&rds_trans_sem);
if (transports[trans->t_type])
printk(KERN_ERR "RDS Transport type %d already registered\n",
trans->t_type);
else {
transports[trans->t_type] = trans;
printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
}
up_write(&rds_trans_sem);
return 0;
}
EXPORT_SYMBOL_GPL(rds_trans_register);
void rds_trans_unregister(struct rds_transport *trans)
{
down_write(&rds_trans_sem);
transports[trans->t_type] = NULL;
printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
}
EXPORT_SYMBOL_GPL(rds_trans_unregister);
void rds_trans_put(struct rds_transport *trans)
{
if (trans && trans->t_owner)
module_put(trans->t_owner);
}
struct rds_transport *rds_trans_get_preferred(__be32 addr)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
unsigned int i;
if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport;
down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
if (trans && (trans->laddr_check(addr) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
}
}
up_read(&rds_trans_sem);
return ret;
}
/*
* This returns the number of stats entries in the snapshot and only
* copies them using the iter if there is enough space for them. The
* caller passes in the global stats so that we can size and copy while
* holding the lock.
*/
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail)
{
struct rds_transport *trans;
unsigned int total = 0;
unsigned int part;
int i;
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++)
{
trans = transports[i];
if (!trans || !trans->stats_info_copy)
continue;
part = trans->stats_info_copy(iter, avail);
avail -= min(avail, part);
total += part;
}
up_read(&rds_trans_sem);
return total;
}