mirror of
https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git
synced 2025-09-08 01:08:03 -04:00
Fixed MTP to work with TWRP
This commit is contained in:
commit
f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
25
net/core/Makefile
Normal file
25
net/core/Makefile
Normal file
|
@ -0,0 +1,25 @@
|
|||
#
|
||||
# Makefile for the Linux networking core.
|
||||
#
|
||||
|
||||
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
|
||||
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
|
||||
|
||||
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
|
||||
|
||||
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
|
||||
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
|
||||
sock_diag.o dev_ioctl.o tso.o
|
||||
|
||||
obj-$(CONFIG_XFRM) += flow.o
|
||||
obj-y += net-sysfs.o
|
||||
obj-$(CONFIG_PROC_FS) += net-procfs.o
|
||||
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
|
||||
obj-$(CONFIG_NETPOLL) += netpoll.o
|
||||
obj-$(CONFIG_FIB_RULES) += fib_rules.o
|
||||
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
|
||||
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
|
||||
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
|
||||
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
|
||||
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
|
||||
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
|
888
net/core/datagram.c
Normal file
888
net/core/datagram.c
Normal file
|
@ -0,0 +1,888 @@
|
|||
/*
|
||||
* SUCS NET3:
|
||||
*
|
||||
* Generic datagram handling routines. These are generic for all
|
||||
* protocols. Possibly a generic IP version on top of these would
|
||||
* make sense. Not tonight however 8-).
|
||||
* This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
|
||||
* NetROM layer all have identical poll code and mostly
|
||||
* identical recvmsg() code. So we share it here. The poll was
|
||||
* shared before but buried in udp.c so I moved it.
|
||||
*
|
||||
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
|
||||
* udp.c code)
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : NULL return from skb_peek_copy()
|
||||
* understood
|
||||
* Alan Cox : Rewrote skb_read_datagram to avoid the
|
||||
* skb_peek_copy stuff.
|
||||
* Alan Cox : Added support for SOCK_SEQPACKET.
|
||||
* IPX can no longer use the SO_TYPE hack
|
||||
* but AX.25 now works right, and SPX is
|
||||
* feasible.
|
||||
* Alan Cox : Fixed write poll of non IP protocol
|
||||
* crash.
|
||||
* Florian La Roche: Changed for my new skbuff handling.
|
||||
* Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
|
||||
* Linus Torvalds : BSD semantic fixes.
|
||||
* Alan Cox : Datagram iovec handling
|
||||
* Darryl Miles : Fixed non-blocking SOCK_STREAM.
|
||||
* Alan Cox : POSIXisms
|
||||
* Pete Wyckoff : Unconnected accept() fix.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/poll.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
#include <net/protocol.h>
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
#include <net/checksum.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/tcp_states.h>
|
||||
#include <trace/events/skb.h>
|
||||
#include <net/busy_poll.h>
|
||||
|
||||
/*
|
||||
* Is a socket 'connection oriented' ?
|
||||
*/
|
||||
static inline int connection_based(struct sock *sk)
|
||||
{
|
||||
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
|
||||
}
|
||||
|
||||
static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
|
||||
void *key)
|
||||
{
|
||||
unsigned long bits = (unsigned long)key;
|
||||
|
||||
/*
|
||||
* Avoid a wakeup if event not interesting for us
|
||||
*/
|
||||
if (bits && !(bits & (POLLIN | POLLERR)))
|
||||
return 0;
|
||||
return autoremove_wake_function(wait, mode, sync, key);
|
||||
}
|
||||
/*
|
||||
* Wait for the last received packet to be different from skb
|
||||
*/
|
||||
static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
int error;
|
||||
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
|
||||
|
||||
prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
|
||||
|
||||
/* Socket errors? */
|
||||
error = sock_error(sk);
|
||||
if (error)
|
||||
goto out_err;
|
||||
|
||||
if (sk->sk_receive_queue.prev != skb)
|
||||
goto out;
|
||||
|
||||
/* Socket shut down? */
|
||||
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
||||
goto out_noerr;
|
||||
|
||||
/* Sequenced packets can come disconnected.
|
||||
* If so we report the problem
|
||||
*/
|
||||
error = -ENOTCONN;
|
||||
if (connection_based(sk) &&
|
||||
!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
|
||||
goto out_err;
|
||||
|
||||
/* handle signals */
|
||||
if (signal_pending(current))
|
||||
goto interrupted;
|
||||
|
||||
error = 0;
|
||||
*timeo_p = schedule_timeout(*timeo_p);
|
||||
out:
|
||||
finish_wait(sk_sleep(sk), &wait);
|
||||
return error;
|
||||
interrupted:
|
||||
error = sock_intr_errno(*timeo_p);
|
||||
out_err:
|
||||
*err = error;
|
||||
goto out;
|
||||
out_noerr:
|
||||
*err = 0;
|
||||
error = 1;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/**
|
||||
* __skb_recv_datagram - Receive a datagram skbuff
|
||||
* @sk: socket
|
||||
* @flags: MSG_ flags
|
||||
* @peeked: returns non-zero if this packet has been seen before
|
||||
* @off: an offset in bytes to peek skb from. Returns an offset
|
||||
* within an skb where data actually starts
|
||||
* @err: error code returned
|
||||
*
|
||||
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
|
||||
* and possible races. This replaces identical code in packet, raw and
|
||||
* udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
|
||||
* the long standing peek and read race for datagram sockets. If you
|
||||
* alter this routine remember it must be re-entrant.
|
||||
*
|
||||
* This function will lock the socket if a skb is returned, so the caller
|
||||
* needs to unlock the socket in that case (usually by calling
|
||||
* skb_free_datagram)
|
||||
*
|
||||
* * It does not lock socket since today. This function is
|
||||
* * free of race conditions. This measure should/can improve
|
||||
* * significantly datagram socket latencies at high loads,
|
||||
* * when data copying to user space takes lots of time.
|
||||
* * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
|
||||
* * 8) Great win.)
|
||||
* * --ANK (980729)
|
||||
*
|
||||
* The order of the tests when we find no data waiting are specified
|
||||
* quite explicitly by POSIX 1003.1g, don't change them without having
|
||||
* the standard around please.
|
||||
*/
|
||||
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
|
||||
int *peeked, int *off, int *err)
|
||||
{
|
||||
struct sk_buff *skb, *last;
|
||||
long timeo;
|
||||
/*
|
||||
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
|
||||
*/
|
||||
int error = sock_error(sk);
|
||||
|
||||
if (error)
|
||||
goto no_packet;
|
||||
|
||||
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
|
||||
|
||||
do {
|
||||
/* Again only user level code calls this function, so nothing
|
||||
* interrupt level will suddenly eat the receive_queue.
|
||||
*
|
||||
* Look at current nfs client by the way...
|
||||
* However, this function was correct in any case. 8)
|
||||
*/
|
||||
unsigned long cpu_flags;
|
||||
struct sk_buff_head *queue = &sk->sk_receive_queue;
|
||||
int _off = *off;
|
||||
|
||||
last = (struct sk_buff *)queue;
|
||||
spin_lock_irqsave(&queue->lock, cpu_flags);
|
||||
skb_queue_walk(queue, skb) {
|
||||
last = skb;
|
||||
*peeked = skb->peeked;
|
||||
if (flags & MSG_PEEK) {
|
||||
if (_off >= skb->len && (skb->len || _off ||
|
||||
skb->peeked)) {
|
||||
_off -= skb->len;
|
||||
continue;
|
||||
}
|
||||
skb->peeked = 1;
|
||||
atomic_inc(&skb->users);
|
||||
} else
|
||||
__skb_unlink(skb, queue);
|
||||
|
||||
spin_unlock_irqrestore(&queue->lock, cpu_flags);
|
||||
*off = _off;
|
||||
return skb;
|
||||
}
|
||||
spin_unlock_irqrestore(&queue->lock, cpu_flags);
|
||||
|
||||
if (sk_can_busy_loop(sk) &&
|
||||
sk_busy_loop(sk, flags & MSG_DONTWAIT))
|
||||
continue;
|
||||
|
||||
/* User doesn't want to wait */
|
||||
error = -EAGAIN;
|
||||
if (!timeo)
|
||||
goto no_packet;
|
||||
|
||||
} while (!wait_for_more_packets(sk, err, &timeo, last));
|
||||
|
||||
return NULL;
|
||||
|
||||
no_packet:
|
||||
*err = error;
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_recv_datagram);
|
||||
|
||||
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
|
||||
int noblock, int *err)
|
||||
{
|
||||
int peeked, off = 0;
|
||||
|
||||
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
|
||||
&peeked, &off, err);
|
||||
}
|
||||
EXPORT_SYMBOL(skb_recv_datagram);
|
||||
|
||||
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
consume_skb(skb);
|
||||
sk_mem_reclaim_partial(sk);
|
||||
}
|
||||
EXPORT_SYMBOL(skb_free_datagram);
|
||||
|
||||
void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
bool slow;
|
||||
|
||||
if (likely(atomic_read(&skb->users) == 1))
|
||||
smp_rmb();
|
||||
else if (likely(!atomic_dec_and_test(&skb->users)))
|
||||
return;
|
||||
|
||||
slow = lock_sock_fast(sk);
|
||||
skb_orphan(skb);
|
||||
sk_mem_reclaim_partial(sk);
|
||||
unlock_sock_fast(sk, slow);
|
||||
|
||||
/* skb is now orphaned, can be freed outside of locked section */
|
||||
__kfree_skb(skb);
|
||||
}
|
||||
EXPORT_SYMBOL(skb_free_datagram_locked);
|
||||
|
||||
/**
|
||||
* skb_kill_datagram - Free a datagram skbuff forcibly
|
||||
* @sk: socket
|
||||
* @skb: datagram skbuff
|
||||
* @flags: MSG_ flags
|
||||
*
|
||||
* This function frees a datagram skbuff that was received by
|
||||
* skb_recv_datagram. The flags argument must match the one
|
||||
* used for skb_recv_datagram.
|
||||
*
|
||||
* If the MSG_PEEK flag is set, and the packet is still on the
|
||||
* receive queue of the socket, it will be taken off the queue
|
||||
* before it is freed.
|
||||
*
|
||||
* This function currently only disables BH when acquiring the
|
||||
* sk_receive_queue lock. Therefore it must not be used in a
|
||||
* context where that lock is acquired in an IRQ context.
|
||||
*
|
||||
* It returns 0 if the packet was removed by us.
|
||||
*/
|
||||
|
||||
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (flags & MSG_PEEK) {
|
||||
err = -ENOENT;
|
||||
spin_lock_bh(&sk->sk_receive_queue.lock);
|
||||
if (skb == skb_peek(&sk->sk_receive_queue)) {
|
||||
__skb_unlink(skb, &sk->sk_receive_queue);
|
||||
atomic_dec(&skb->users);
|
||||
err = 0;
|
||||
}
|
||||
spin_unlock_bh(&sk->sk_receive_queue.lock);
|
||||
}
|
||||
|
||||
kfree_skb(skb);
|
||||
atomic_inc(&sk->sk_drops);
|
||||
sk_mem_reclaim_partial(sk);
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(skb_kill_datagram);
|
||||
|
||||
/**
|
||||
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
|
||||
* @skb: buffer to copy
|
||||
* @offset: offset in the buffer to start copying from
|
||||
* @to: io vector to copy to
|
||||
* @len: amount of data to copy from buffer to iovec
|
||||
*
|
||||
* Note: the iovec is modified during the copy.
|
||||
*/
|
||||
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
|
||||
struct iovec *to, int len)
|
||||
{
|
||||
int start = skb_headlen(skb);
|
||||
int i, copy = start - offset;
|
||||
struct sk_buff *frag_iter;
|
||||
|
||||
trace_skb_copy_datagram_iovec(skb, len);
|
||||
|
||||
/* Copy header. */
|
||||
if (copy > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (memcpy_toiovec(to, skb->data + offset, copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
}
|
||||
|
||||
/* Copy paged appendix. Hmm... why does this look so complicated? */
|
||||
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
||||
int end;
|
||||
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + skb_frag_size(frag);
|
||||
if ((copy = end - offset) > 0) {
|
||||
int err;
|
||||
u8 *vaddr;
|
||||
struct page *page = skb_frag_page(frag);
|
||||
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
vaddr = kmap(page);
|
||||
err = memcpy_toiovec(to, vaddr + frag->page_offset +
|
||||
offset - start, copy);
|
||||
kunmap(page);
|
||||
if (err)
|
||||
goto fault;
|
||||
if (!(len -= copy))
|
||||
return 0;
|
||||
offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
|
||||
skb_walk_frags(skb, frag_iter) {
|
||||
int end;
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + frag_iter->len;
|
||||
if ((copy = end - offset) > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (skb_copy_datagram_iovec(frag_iter,
|
||||
offset - start,
|
||||
to, copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
EXPORT_SYMBOL(skb_copy_datagram_iovec);
|
||||
|
||||
/**
|
||||
* skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
|
||||
* @skb: buffer to copy
|
||||
* @offset: offset in the buffer to start copying from
|
||||
* @to: io vector to copy to
|
||||
* @to_offset: offset in the io vector to start copying to
|
||||
* @len: amount of data to copy from buffer to iovec
|
||||
*
|
||||
* Returns 0 or -EFAULT.
|
||||
* Note: the iovec is not modified during the copy.
|
||||
*/
|
||||
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
|
||||
const struct iovec *to, int to_offset,
|
||||
int len)
|
||||
{
|
||||
int start = skb_headlen(skb);
|
||||
int i, copy = start - offset;
|
||||
struct sk_buff *frag_iter;
|
||||
|
||||
/* Copy header. */
|
||||
if (copy > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
to_offset += copy;
|
||||
}
|
||||
|
||||
/* Copy paged appendix. Hmm... why does this look so complicated? */
|
||||
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
||||
int end;
|
||||
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + skb_frag_size(frag);
|
||||
if ((copy = end - offset) > 0) {
|
||||
int err;
|
||||
u8 *vaddr;
|
||||
struct page *page = skb_frag_page(frag);
|
||||
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
vaddr = kmap(page);
|
||||
err = memcpy_toiovecend(to, vaddr + frag->page_offset +
|
||||
offset - start, to_offset, copy);
|
||||
kunmap(page);
|
||||
if (err)
|
||||
goto fault;
|
||||
if (!(len -= copy))
|
||||
return 0;
|
||||
offset += copy;
|
||||
to_offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
|
||||
skb_walk_frags(skb, frag_iter) {
|
||||
int end;
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + frag_iter->len;
|
||||
if ((copy = end - offset) > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (skb_copy_datagram_const_iovec(frag_iter,
|
||||
offset - start,
|
||||
to, to_offset,
|
||||
copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
to_offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
|
||||
|
||||
/**
|
||||
* skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
|
||||
* @skb: buffer to copy
|
||||
* @offset: offset in the buffer to start copying to
|
||||
* @from: io vector to copy to
|
||||
* @from_offset: offset in the io vector to start copying from
|
||||
* @len: amount of data to copy to buffer from iovec
|
||||
*
|
||||
* Returns 0 or -EFAULT.
|
||||
* Note: the iovec is not modified during the copy.
|
||||
*/
|
||||
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
|
||||
const struct iovec *from, int from_offset,
|
||||
int len)
|
||||
{
|
||||
int start = skb_headlen(skb);
|
||||
int i, copy = start - offset;
|
||||
struct sk_buff *frag_iter;
|
||||
|
||||
/* Copy header. */
|
||||
if (copy > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
|
||||
copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
from_offset += copy;
|
||||
}
|
||||
|
||||
/* Copy paged appendix. Hmm... why does this look so complicated? */
|
||||
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
||||
int end;
|
||||
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + skb_frag_size(frag);
|
||||
if ((copy = end - offset) > 0) {
|
||||
int err;
|
||||
u8 *vaddr;
|
||||
struct page *page = skb_frag_page(frag);
|
||||
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
vaddr = kmap(page);
|
||||
err = memcpy_fromiovecend(vaddr + frag->page_offset +
|
||||
offset - start,
|
||||
from, from_offset, copy);
|
||||
kunmap(page);
|
||||
if (err)
|
||||
goto fault;
|
||||
|
||||
if (!(len -= copy))
|
||||
return 0;
|
||||
offset += copy;
|
||||
from_offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
|
||||
skb_walk_frags(skb, frag_iter) {
|
||||
int end;
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + frag_iter->len;
|
||||
if ((copy = end - offset) > 0) {
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (skb_copy_datagram_from_iovec(frag_iter,
|
||||
offset - start,
|
||||
from,
|
||||
from_offset,
|
||||
copy))
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
from_offset += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
|
||||
|
||||
/**
|
||||
* zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
|
||||
* @skb: buffer to copy
|
||||
* @from: io vector to copy from
|
||||
* @offset: offset in the io vector to start copying from
|
||||
* @count: amount of vectors to copy to buffer from
|
||||
*
|
||||
* The function will first copy up to headlen, and then pin the userspace
|
||||
* pages and build frags through them.
|
||||
*
|
||||
* Returns 0, -EFAULT or -EMSGSIZE.
|
||||
* Note: the iovec is not modified during the copy
|
||||
*/
|
||||
int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
|
||||
int offset, size_t count)
|
||||
{
|
||||
int len = iov_length(from, count) - offset;
|
||||
int copy = min_t(int, skb_headlen(skb), len);
|
||||
int size;
|
||||
int i = 0;
|
||||
|
||||
/* copy up to skb headlen */
|
||||
if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
|
||||
return -EFAULT;
|
||||
|
||||
if (len == copy)
|
||||
return 0;
|
||||
|
||||
offset += copy;
|
||||
while (count--) {
|
||||
struct page *page[MAX_SKB_FRAGS];
|
||||
int num_pages;
|
||||
unsigned long base;
|
||||
unsigned long truesize;
|
||||
|
||||
/* Skip over from offset and copied */
|
||||
if (offset >= from->iov_len) {
|
||||
offset -= from->iov_len;
|
||||
++from;
|
||||
continue;
|
||||
}
|
||||
len = from->iov_len - offset;
|
||||
base = (unsigned long)from->iov_base + offset;
|
||||
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
|
||||
if (i + size > MAX_SKB_FRAGS)
|
||||
return -EMSGSIZE;
|
||||
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
|
||||
if (num_pages != size) {
|
||||
release_pages(&page[i], num_pages, 0);
|
||||
return -EFAULT;
|
||||
}
|
||||
truesize = size * PAGE_SIZE;
|
||||
skb->data_len += len;
|
||||
skb->len += len;
|
||||
skb->truesize += truesize;
|
||||
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
|
||||
while (len) {
|
||||
int off = base & ~PAGE_MASK;
|
||||
int size = min_t(int, len, PAGE_SIZE - off);
|
||||
skb_fill_page_desc(skb, i, page[i], off, size);
|
||||
base += size;
|
||||
len -= size;
|
||||
i++;
|
||||
}
|
||||
offset = 0;
|
||||
++from;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(zerocopy_sg_from_iovec);
|
||||
|
||||
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
|
||||
u8 __user *to, int len,
|
||||
__wsum *csump)
|
||||
{
|
||||
int start = skb_headlen(skb);
|
||||
int i, copy = start - offset;
|
||||
struct sk_buff *frag_iter;
|
||||
int pos = 0;
|
||||
|
||||
/* Copy header. */
|
||||
if (copy > 0) {
|
||||
int err = 0;
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
|
||||
*csump, &err);
|
||||
if (err)
|
||||
goto fault;
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
to += copy;
|
||||
pos = copy;
|
||||
}
|
||||
|
||||
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
|
||||
int end;
|
||||
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + skb_frag_size(frag);
|
||||
if ((copy = end - offset) > 0) {
|
||||
__wsum csum2;
|
||||
int err = 0;
|
||||
u8 *vaddr;
|
||||
struct page *page = skb_frag_page(frag);
|
||||
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
vaddr = kmap(page);
|
||||
csum2 = csum_and_copy_to_user(vaddr +
|
||||
frag->page_offset +
|
||||
offset - start,
|
||||
to, copy, 0, &err);
|
||||
kunmap(page);
|
||||
if (err)
|
||||
goto fault;
|
||||
*csump = csum_block_add(*csump, csum2, pos);
|
||||
if (!(len -= copy))
|
||||
return 0;
|
||||
offset += copy;
|
||||
to += copy;
|
||||
pos += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
|
||||
skb_walk_frags(skb, frag_iter) {
|
||||
int end;
|
||||
|
||||
WARN_ON(start > offset + len);
|
||||
|
||||
end = start + frag_iter->len;
|
||||
if ((copy = end - offset) > 0) {
|
||||
__wsum csum2 = 0;
|
||||
if (copy > len)
|
||||
copy = len;
|
||||
if (skb_copy_and_csum_datagram(frag_iter,
|
||||
offset - start,
|
||||
to, copy,
|
||||
&csum2))
|
||||
goto fault;
|
||||
*csump = csum_block_add(*csump, csum2, pos);
|
||||
if ((len -= copy) == 0)
|
||||
return 0;
|
||||
offset += copy;
|
||||
to += copy;
|
||||
pos += copy;
|
||||
}
|
||||
start = end;
|
||||
}
|
||||
if (!len)
|
||||
return 0;
|
||||
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
|
||||
{
|
||||
__sum16 sum;
|
||||
|
||||
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
|
||||
if (likely(!sum)) {
|
||||
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
|
||||
!skb->csum_complete_sw)
|
||||
netdev_rx_csum_fault(skb->dev);
|
||||
}
|
||||
skb->csum_valid = !sum;
|
||||
return sum;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_checksum_complete_head);
|
||||
|
||||
__sum16 __skb_checksum_complete(struct sk_buff *skb)
|
||||
{
|
||||
__wsum csum;
|
||||
__sum16 sum;
|
||||
|
||||
csum = skb_checksum(skb, 0, skb->len, 0);
|
||||
|
||||
/* skb->csum holds pseudo checksum */
|
||||
sum = csum_fold(csum_add(skb->csum, csum));
|
||||
if (likely(!sum)) {
|
||||
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
|
||||
!skb->csum_complete_sw)
|
||||
netdev_rx_csum_fault(skb->dev);
|
||||
}
|
||||
|
||||
/* Save full packet checksum */
|
||||
skb->csum = csum;
|
||||
skb->ip_summed = CHECKSUM_COMPLETE;
|
||||
skb->csum_complete_sw = 1;
|
||||
skb->csum_valid = !sum;
|
||||
|
||||
return sum;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_checksum_complete);
|
||||
|
||||
/**
|
||||
* skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
|
||||
* @skb: skbuff
|
||||
* @hlen: hardware length
|
||||
* @iov: io vector
|
||||
* @len: amount of data to copy from skb to iov
|
||||
*
|
||||
* Caller _must_ check that skb will fit to this iovec.
|
||||
*
|
||||
* Returns: 0 - success.
|
||||
* -EINVAL - checksum failure.
|
||||
* -EFAULT - fault during copy. Beware, in this case iovec
|
||||
* can be modified!
|
||||
*/
|
||||
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
|
||||
int hlen, struct iovec *iov, int len)
|
||||
{
|
||||
__wsum csum;
|
||||
int chunk = skb->len - hlen;
|
||||
|
||||
if (chunk > len)
|
||||
chunk = len;
|
||||
|
||||
if (!chunk)
|
||||
return 0;
|
||||
|
||||
/* Skip filled elements.
|
||||
* Pretty silly, look at memcpy_toiovec, though 8)
|
||||
*/
|
||||
while (!iov->iov_len)
|
||||
iov++;
|
||||
|
||||
if (iov->iov_len < chunk) {
|
||||
if (__skb_checksum_complete(skb))
|
||||
goto csum_error;
|
||||
if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
|
||||
goto fault;
|
||||
} else {
|
||||
csum = csum_partial(skb->data, hlen, skb->csum);
|
||||
if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
|
||||
chunk, &csum))
|
||||
goto fault;
|
||||
if (csum_fold(csum))
|
||||
goto csum_error;
|
||||
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
|
||||
netdev_rx_csum_fault(skb->dev);
|
||||
iov->iov_len -= chunk;
|
||||
iov->iov_base += chunk;
|
||||
}
|
||||
return 0;
|
||||
csum_error:
|
||||
return -EINVAL;
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
|
||||
|
||||
/**
|
||||
* datagram_poll - generic datagram poll
|
||||
* @file: file struct
|
||||
* @sock: socket
|
||||
* @wait: poll table
|
||||
*
|
||||
* Datagram poll: Again totally generic. This also handles
|
||||
* sequenced packet sockets providing the socket receive queue
|
||||
* is only ever holding data ready to receive.
|
||||
*
|
||||
* Note: when you _don't_ use this routine for this protocol,
|
||||
* and you use a different write policy from sock_writeable()
|
||||
* then please supply your own write_space callback.
|
||||
*/
|
||||
unsigned int datagram_poll(struct file *file, struct socket *sock,
|
||||
poll_table *wait)
|
||||
{
|
||||
struct sock *sk = sock->sk;
|
||||
unsigned int mask;
|
||||
|
||||
sock_poll_wait(file, sk_sleep(sk), wait);
|
||||
mask = 0;
|
||||
|
||||
/* exceptional events? */
|
||||
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
|
||||
mask |= POLLERR |
|
||||
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
|
||||
|
||||
if (sk->sk_shutdown & RCV_SHUTDOWN)
|
||||
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
|
||||
if (sk->sk_shutdown == SHUTDOWN_MASK)
|
||||
mask |= POLLHUP;
|
||||
|
||||
/* readable? */
|
||||
if (!skb_queue_empty(&sk->sk_receive_queue))
|
||||
mask |= POLLIN | POLLRDNORM;
|
||||
|
||||
/* Connection-based need to check for termination and startup */
|
||||
if (connection_based(sk)) {
|
||||
if (sk->sk_state == TCP_CLOSE)
|
||||
mask |= POLLHUP;
|
||||
/* connection hasn't started yet? */
|
||||
if (sk->sk_state == TCP_SYN_SENT)
|
||||
return mask;
|
||||
}
|
||||
|
||||
/* writable? */
|
||||
if (sock_writeable(sk))
|
||||
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
|
||||
else
|
||||
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
|
||||
|
||||
return mask;
|
||||
}
|
||||
EXPORT_SYMBOL(datagram_poll);
|
7361
net/core/dev.c
Normal file
7361
net/core/dev.c
Normal file
File diff suppressed because it is too large
Load diff
851
net/core/dev_addr_lists.c
Normal file
851
net/core/dev_addr_lists.c
Normal file
|
@ -0,0 +1,851 @@
|
|||
/*
|
||||
* net/core/dev_addr_lists.c - Functions for handling net device lists
|
||||
* Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
|
||||
*
|
||||
* This file contains functions for working with unicast, multicast and device
|
||||
* addresses lists.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/list.h>
|
||||
|
||||
/*
|
||||
* General list handling functions
|
||||
*/
|
||||
|
||||
static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
|
||||
const unsigned char *addr, int addr_len,
|
||||
unsigned char addr_type, bool global,
|
||||
bool sync)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
int alloc_size;
|
||||
|
||||
alloc_size = sizeof(*ha);
|
||||
if (alloc_size < L1_CACHE_BYTES)
|
||||
alloc_size = L1_CACHE_BYTES;
|
||||
ha = kmalloc(alloc_size, GFP_ATOMIC);
|
||||
if (!ha)
|
||||
return -ENOMEM;
|
||||
memcpy(ha->addr, addr, addr_len);
|
||||
ha->type = addr_type;
|
||||
ha->refcount = 1;
|
||||
ha->global_use = global;
|
||||
ha->synced = sync ? 1 : 0;
|
||||
ha->sync_cnt = 0;
|
||||
list_add_tail_rcu(&ha->list, &list->list);
|
||||
list->count++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
|
||||
const unsigned char *addr, int addr_len,
|
||||
unsigned char addr_type, bool global, bool sync,
|
||||
int sync_count)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
|
||||
if (addr_len > MAX_ADDR_LEN)
|
||||
return -EINVAL;
|
||||
|
||||
list_for_each_entry(ha, &list->list, list) {
|
||||
if (!memcmp(ha->addr, addr, addr_len) &&
|
||||
ha->type == addr_type) {
|
||||
if (global) {
|
||||
/* check if addr is already used as global */
|
||||
if (ha->global_use)
|
||||
return 0;
|
||||
else
|
||||
ha->global_use = true;
|
||||
}
|
||||
if (sync) {
|
||||
if (ha->synced && sync_count)
|
||||
return -EEXIST;
|
||||
else
|
||||
ha->synced++;
|
||||
}
|
||||
ha->refcount++;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
|
||||
sync);
|
||||
}
|
||||
|
||||
static int __hw_addr_add(struct netdev_hw_addr_list *list,
|
||||
const unsigned char *addr, int addr_len,
|
||||
unsigned char addr_type)
|
||||
{
|
||||
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
|
||||
0);
|
||||
}
|
||||
|
||||
static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
|
||||
struct netdev_hw_addr *ha, bool global,
|
||||
bool sync)
|
||||
{
|
||||
if (global && !ha->global_use)
|
||||
return -ENOENT;
|
||||
|
||||
if (sync && !ha->synced)
|
||||
return -ENOENT;
|
||||
|
||||
if (global)
|
||||
ha->global_use = false;
|
||||
|
||||
if (sync)
|
||||
ha->synced--;
|
||||
|
||||
if (--ha->refcount)
|
||||
return 0;
|
||||
list_del_rcu(&ha->list);
|
||||
kfree_rcu(ha, rcu_head);
|
||||
list->count--;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
|
||||
const unsigned char *addr, int addr_len,
|
||||
unsigned char addr_type, bool global, bool sync)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
|
||||
list_for_each_entry(ha, &list->list, list) {
|
||||
if (!memcmp(ha->addr, addr, addr_len) &&
|
||||
(ha->type == addr_type || !addr_type))
|
||||
return __hw_addr_del_entry(list, ha, global, sync);
|
||||
}
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static int __hw_addr_del(struct netdev_hw_addr_list *list,
|
||||
const unsigned char *addr, int addr_len,
|
||||
unsigned char addr_type)
|
||||
{
|
||||
return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
|
||||
}
|
||||
|
||||
static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
|
||||
struct netdev_hw_addr *ha,
|
||||
int addr_len)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
|
||||
false, true, ha->sync_cnt);
|
||||
if (err && err != -EEXIST)
|
||||
return err;
|
||||
|
||||
if (!err) {
|
||||
ha->sync_cnt++;
|
||||
ha->refcount++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
|
||||
struct netdev_hw_addr_list *from_list,
|
||||
struct netdev_hw_addr *ha,
|
||||
int addr_len)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
|
||||
false, true);
|
||||
if (err)
|
||||
return;
|
||||
ha->sync_cnt--;
|
||||
/* address on from list is not marked synced */
|
||||
__hw_addr_del_entry(from_list, ha, false, false);
|
||||
}
|
||||
|
||||
static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
|
||||
struct netdev_hw_addr_list *from_list,
|
||||
int addr_len)
|
||||
{
|
||||
int err = 0;
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
|
||||
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
|
||||
if (ha->sync_cnt == ha->refcount) {
|
||||
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
|
||||
} else {
|
||||
err = __hw_addr_sync_one(to_list, ha, addr_len);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/* This function only works where there is a strict 1-1 relationship
|
||||
* between source and destionation of they synch. If you ever need to
|
||||
* sync addresses to more then 1 destination, you need to use
|
||||
* __hw_addr_sync_multiple().
|
||||
*/
|
||||
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
|
||||
struct netdev_hw_addr_list *from_list,
|
||||
int addr_len)
|
||||
{
|
||||
int err = 0;
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
|
||||
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
|
||||
if (!ha->sync_cnt) {
|
||||
err = __hw_addr_sync_one(to_list, ha, addr_len);
|
||||
if (err)
|
||||
break;
|
||||
} else if (ha->refcount == 1)
|
||||
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(__hw_addr_sync);
|
||||
|
||||
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
|
||||
struct netdev_hw_addr_list *from_list,
|
||||
int addr_len)
|
||||
{
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
|
||||
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
|
||||
if (ha->sync_cnt)
|
||||
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__hw_addr_unsync);
|
||||
|
||||
/**
|
||||
* __hw_addr_sync_dev - Synchonize device's multicast list
|
||||
* @list: address list to syncronize
|
||||
* @dev: device to sync
|
||||
* @sync: function to call if address should be added
|
||||
* @unsync: function to call if address should be removed
|
||||
*
|
||||
* This funciton is intended to be called from the ndo_set_rx_mode
|
||||
* function of devices that require explicit address add/remove
|
||||
* notifications. The unsync function may be NULL in which case
|
||||
* the addresses requiring removal will simply be removed without
|
||||
* any notification to the device.
|
||||
**/
|
||||
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
|
||||
struct net_device *dev,
|
||||
int (*sync)(struct net_device *, const unsigned char *),
|
||||
int (*unsync)(struct net_device *,
|
||||
const unsigned char *))
|
||||
{
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
int err;
|
||||
|
||||
/* first go through and flush out any stale entries */
|
||||
list_for_each_entry_safe(ha, tmp, &list->list, list) {
|
||||
if (!ha->sync_cnt || ha->refcount != 1)
|
||||
continue;
|
||||
|
||||
/* if unsync is defined and fails defer unsyncing address */
|
||||
if (unsync && unsync(dev, ha->addr))
|
||||
continue;
|
||||
|
||||
ha->sync_cnt--;
|
||||
__hw_addr_del_entry(list, ha, false, false);
|
||||
}
|
||||
|
||||
/* go through and sync new entries to the list */
|
||||
list_for_each_entry_safe(ha, tmp, &list->list, list) {
|
||||
if (ha->sync_cnt)
|
||||
continue;
|
||||
|
||||
err = sync(dev, ha->addr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ha->sync_cnt++;
|
||||
ha->refcount++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__hw_addr_sync_dev);
|
||||
|
||||
/**
|
||||
* __hw_addr_unsync_dev - Remove synchonized addresses from device
|
||||
* @list: address list to remove syncronized addresses from
|
||||
* @dev: device to sync
|
||||
* @unsync: function to call if address should be removed
|
||||
*
|
||||
* Remove all addresses that were added to the device by __hw_addr_sync_dev().
|
||||
* This function is intended to be called from the ndo_stop or ndo_open
|
||||
* functions on devices that require explicit address add/remove
|
||||
* notifications. If the unsync function pointer is NULL then this function
|
||||
* can be used to just reset the sync_cnt for the addresses in the list.
|
||||
**/
|
||||
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
|
||||
struct net_device *dev,
|
||||
int (*unsync)(struct net_device *,
|
||||
const unsigned char *))
|
||||
{
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
|
||||
list_for_each_entry_safe(ha, tmp, &list->list, list) {
|
||||
if (!ha->sync_cnt)
|
||||
continue;
|
||||
|
||||
/* if unsync is defined and fails defer unsyncing address */
|
||||
if (unsync && unsync(dev, ha->addr))
|
||||
continue;
|
||||
|
||||
ha->sync_cnt--;
|
||||
__hw_addr_del_entry(list, ha, false, false);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__hw_addr_unsync_dev);
|
||||
|
||||
static void __hw_addr_flush(struct netdev_hw_addr_list *list)
|
||||
{
|
||||
struct netdev_hw_addr *ha, *tmp;
|
||||
|
||||
list_for_each_entry_safe(ha, tmp, &list->list, list) {
|
||||
list_del_rcu(&ha->list);
|
||||
kfree_rcu(ha, rcu_head);
|
||||
}
|
||||
list->count = 0;
|
||||
}
|
||||
|
||||
void __hw_addr_init(struct netdev_hw_addr_list *list)
|
||||
{
|
||||
INIT_LIST_HEAD(&list->list);
|
||||
list->count = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__hw_addr_init);
|
||||
|
||||
/*
|
||||
* Device addresses handling functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* dev_addr_flush - Flush device address list
|
||||
* @dev: device
|
||||
*
|
||||
* Flush device address list and reset ->dev_addr.
|
||||
*
|
||||
* The caller must hold the rtnl_mutex.
|
||||
*/
|
||||
void dev_addr_flush(struct net_device *dev)
|
||||
{
|
||||
/* rtnl_mutex must be held here */
|
||||
|
||||
__hw_addr_flush(&dev->dev_addrs);
|
||||
dev->dev_addr = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_addr_flush);
|
||||
|
||||
/**
|
||||
* dev_addr_init - Init device address list
|
||||
* @dev: device
|
||||
*
|
||||
* Init device address list and create the first element,
|
||||
* used by ->dev_addr.
|
||||
*
|
||||
* The caller must hold the rtnl_mutex.
|
||||
*/
|
||||
int dev_addr_init(struct net_device *dev)
|
||||
{
|
||||
unsigned char addr[MAX_ADDR_LEN];
|
||||
struct netdev_hw_addr *ha;
|
||||
int err;
|
||||
|
||||
/* rtnl_mutex must be held here */
|
||||
|
||||
__hw_addr_init(&dev->dev_addrs);
|
||||
memset(addr, 0, sizeof(addr));
|
||||
err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
|
||||
NETDEV_HW_ADDR_T_LAN);
|
||||
if (!err) {
|
||||
/*
|
||||
* Get the first (previously created) address from the list
|
||||
* and set dev_addr pointer to this location.
|
||||
*/
|
||||
ha = list_first_entry(&dev->dev_addrs.list,
|
||||
struct netdev_hw_addr, list);
|
||||
dev->dev_addr = ha->addr;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_addr_init);
|
||||
|
||||
/**
|
||||
* dev_addr_add - Add a device address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
* @addr_type: address type
|
||||
*
|
||||
* Add a device address to the device or increase the reference count if
|
||||
* it already exists.
|
||||
*
|
||||
* The caller must hold the rtnl_mutex.
|
||||
*/
|
||||
int dev_addr_add(struct net_device *dev, const unsigned char *addr,
|
||||
unsigned char addr_type)
|
||||
{
|
||||
int err;
|
||||
|
||||
ASSERT_RTNL();
|
||||
|
||||
err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
|
||||
if (!err)
|
||||
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_addr_add);
|
||||
|
||||
/**
|
||||
* dev_addr_del - Release a device address.
|
||||
* @dev: device
|
||||
* @addr: address to delete
|
||||
* @addr_type: address type
|
||||
*
|
||||
* Release reference to a device address and remove it from the device
|
||||
* if the reference count drops to zero.
|
||||
*
|
||||
* The caller must hold the rtnl_mutex.
|
||||
*/
|
||||
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
|
||||
unsigned char addr_type)
|
||||
{
|
||||
int err;
|
||||
struct netdev_hw_addr *ha;
|
||||
|
||||
ASSERT_RTNL();
|
||||
|
||||
/*
|
||||
* We can not remove the first address from the list because
|
||||
* dev->dev_addr points to that.
|
||||
*/
|
||||
ha = list_first_entry(&dev->dev_addrs.list,
|
||||
struct netdev_hw_addr, list);
|
||||
if (!memcmp(ha->addr, addr, dev->addr_len) &&
|
||||
ha->type == addr_type && ha->refcount == 1)
|
||||
return -ENOENT;
|
||||
|
||||
err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
|
||||
addr_type);
|
||||
if (!err)
|
||||
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_addr_del);
|
||||
|
||||
/*
|
||||
* Unicast list handling functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* dev_uc_add_excl - Add a global secondary unicast address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
*/
|
||||
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
list_for_each_entry(ha, &dev->uc.list, list) {
|
||||
if (!memcmp(ha->addr, addr, dev->addr_len) &&
|
||||
ha->type == NETDEV_HW_ADDR_T_UNICAST) {
|
||||
err = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_UNICAST, true, false);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
out:
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_add_excl);
|
||||
|
||||
/**
|
||||
* dev_uc_add - Add a secondary unicast address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
*
|
||||
* Add a secondary unicast address to the device or increase
|
||||
* the reference count if it already exists.
|
||||
*/
|
||||
int dev_uc_add(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_UNICAST);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_add);
|
||||
|
||||
/**
|
||||
* dev_uc_del - Release secondary unicast address.
|
||||
* @dev: device
|
||||
* @addr: address to delete
|
||||
*
|
||||
* Release reference to a secondary unicast address and remove it
|
||||
* from the device if the reference count drops to zero.
|
||||
*/
|
||||
int dev_uc_del(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_UNICAST);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_del);
|
||||
|
||||
/**
|
||||
* dev_uc_sync - Synchronize device's unicast list to another device
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Add newly added addresses to the destination device and release
|
||||
* addresses that have no users left. The source device must be
|
||||
* locked by netif_addr_lock_bh.
|
||||
*
|
||||
* This function is intended to be called from the dev->set_rx_mode
|
||||
* function of layered software devices. This function assumes that
|
||||
* addresses will only ever be synced to the @to devices and no other.
|
||||
*/
|
||||
int dev_uc_sync(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (to->addr_len != from->addr_len)
|
||||
return -EINVAL;
|
||||
|
||||
netif_addr_lock_nested(to);
|
||||
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_sync);
|
||||
|
||||
/**
|
||||
* dev_uc_sync_multiple - Synchronize device's unicast list to another
|
||||
* device, but allow for multiple calls to sync to multiple devices.
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Add newly added addresses to the destination device and release
|
||||
* addresses that have been deleted from the source. The source device
|
||||
* must be locked by netif_addr_lock_bh.
|
||||
*
|
||||
* This function is intended to be called from the dev->set_rx_mode
|
||||
* function of layered software devices. It allows for a single source
|
||||
* device to be synced to multiple destination devices.
|
||||
*/
|
||||
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (to->addr_len != from->addr_len)
|
||||
return -EINVAL;
|
||||
|
||||
netif_addr_lock_nested(to);
|
||||
err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_sync_multiple);
|
||||
|
||||
/**
|
||||
* dev_uc_unsync - Remove synchronized addresses from the destination device
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Remove all addresses that were added to the destination device by
|
||||
* dev_uc_sync(). This function is intended to be called from the
|
||||
* dev->stop function of layered software devices.
|
||||
*/
|
||||
void dev_uc_unsync(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
if (to->addr_len != from->addr_len)
|
||||
return;
|
||||
|
||||
netif_addr_lock_bh(from);
|
||||
netif_addr_lock_nested(to);
|
||||
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
netif_addr_unlock_bh(from);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_unsync);
|
||||
|
||||
/**
|
||||
* dev_uc_flush - Flush unicast addresses
|
||||
* @dev: device
|
||||
*
|
||||
* Flush unicast addresses.
|
||||
*/
|
||||
void dev_uc_flush(struct net_device *dev)
|
||||
{
|
||||
netif_addr_lock_bh(dev);
|
||||
__hw_addr_flush(&dev->uc);
|
||||
netif_addr_unlock_bh(dev);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_flush);
|
||||
|
||||
/**
|
||||
* dev_uc_flush - Init unicast address list
|
||||
* @dev: device
|
||||
*
|
||||
* Init unicast address list.
|
||||
*/
|
||||
void dev_uc_init(struct net_device *dev)
|
||||
{
|
||||
__hw_addr_init(&dev->uc);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_uc_init);
|
||||
|
||||
/*
|
||||
* Multicast list handling functions
|
||||
*/
|
||||
|
||||
/**
|
||||
* dev_mc_add_excl - Add a global secondary multicast address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
*/
|
||||
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
list_for_each_entry(ha, &dev->mc.list, list) {
|
||||
if (!memcmp(ha->addr, addr, dev->addr_len) &&
|
||||
ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
|
||||
err = -EEXIST;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_MULTICAST, true, false);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
out:
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_add_excl);
|
||||
|
||||
static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
|
||||
bool global)
|
||||
{
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
/**
|
||||
* dev_mc_add - Add a multicast address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
*
|
||||
* Add a multicast address to the device or increase
|
||||
* the reference count if it already exists.
|
||||
*/
|
||||
int dev_mc_add(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
return __dev_mc_add(dev, addr, false);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_add);
|
||||
|
||||
/**
|
||||
* dev_mc_add_global - Add a global multicast address
|
||||
* @dev: device
|
||||
* @addr: address to add
|
||||
*
|
||||
* Add a global multicast address to the device.
|
||||
*/
|
||||
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
return __dev_mc_add(dev, addr, true);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_add_global);
|
||||
|
||||
static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
|
||||
bool global)
|
||||
{
|
||||
int err;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
|
||||
NETDEV_HW_ADDR_T_MULTICAST, global, false);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(dev);
|
||||
netif_addr_unlock_bh(dev);
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* dev_mc_del - Delete a multicast address.
|
||||
* @dev: device
|
||||
* @addr: address to delete
|
||||
*
|
||||
* Release reference to a multicast address and remove it
|
||||
* from the device if the reference count drops to zero.
|
||||
*/
|
||||
int dev_mc_del(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
return __dev_mc_del(dev, addr, false);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_del);
|
||||
|
||||
/**
|
||||
* dev_mc_del_global - Delete a global multicast address.
|
||||
* @dev: device
|
||||
* @addr: address to delete
|
||||
*
|
||||
* Release reference to a multicast address and remove it
|
||||
* from the device if the reference count drops to zero.
|
||||
*/
|
||||
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
|
||||
{
|
||||
return __dev_mc_del(dev, addr, true);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_del_global);
|
||||
|
||||
/**
|
||||
* dev_mc_sync - Synchronize device's multicast list to another device
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Add newly added addresses to the destination device and release
|
||||
* addresses that have no users left. The source device must be
|
||||
* locked by netif_addr_lock_bh.
|
||||
*
|
||||
* This function is intended to be called from the ndo_set_rx_mode
|
||||
* function of layered software devices.
|
||||
*/
|
||||
int dev_mc_sync(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (to->addr_len != from->addr_len)
|
||||
return -EINVAL;
|
||||
|
||||
netif_addr_lock_nested(to);
|
||||
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_sync);
|
||||
|
||||
/**
|
||||
* dev_mc_sync_multiple - Synchronize device's multicast list to another
|
||||
* device, but allow for multiple calls to sync to multiple devices.
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Add newly added addresses to the destination device and release
|
||||
* addresses that have no users left. The source device must be
|
||||
* locked by netif_addr_lock_bh.
|
||||
*
|
||||
* This function is intended to be called from the ndo_set_rx_mode
|
||||
* function of layered software devices. It allows for a single
|
||||
* source device to be synced to multiple destination devices.
|
||||
*/
|
||||
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (to->addr_len != from->addr_len)
|
||||
return -EINVAL;
|
||||
|
||||
netif_addr_lock_nested(to);
|
||||
err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
|
||||
if (!err)
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_sync_multiple);
|
||||
|
||||
/**
|
||||
* dev_mc_unsync - Remove synchronized addresses from the destination device
|
||||
* @to: destination device
|
||||
* @from: source device
|
||||
*
|
||||
* Remove all addresses that were added to the destination device by
|
||||
* dev_mc_sync(). This function is intended to be called from the
|
||||
* dev->stop function of layered software devices.
|
||||
*/
|
||||
void dev_mc_unsync(struct net_device *to, struct net_device *from)
|
||||
{
|
||||
if (to->addr_len != from->addr_len)
|
||||
return;
|
||||
|
||||
netif_addr_lock_bh(from);
|
||||
netif_addr_lock_nested(to);
|
||||
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
|
||||
__dev_set_rx_mode(to);
|
||||
netif_addr_unlock(to);
|
||||
netif_addr_unlock_bh(from);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_unsync);
|
||||
|
||||
/**
|
||||
* dev_mc_flush - Flush multicast addresses
|
||||
* @dev: device
|
||||
*
|
||||
* Flush multicast addresses.
|
||||
*/
|
||||
void dev_mc_flush(struct net_device *dev)
|
||||
{
|
||||
netif_addr_lock_bh(dev);
|
||||
__hw_addr_flush(&dev->mc);
|
||||
netif_addr_unlock_bh(dev);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_flush);
|
||||
|
||||
/**
|
||||
* dev_mc_flush - Init multicast address list
|
||||
* @dev: device
|
||||
*
|
||||
* Init multicast address list.
|
||||
*/
|
||||
void dev_mc_init(struct net_device *dev)
|
||||
{
|
||||
__hw_addr_init(&dev->mc);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_mc_init);
|
564
net/core/dev_ioctl.c
Normal file
564
net/core/dev_ioctl.c
Normal file
|
@ -0,0 +1,564 @@
|
|||
#include <linux/kmod.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/net_tstamp.h>
|
||||
#include <linux/wireless.h>
|
||||
#include <net/wext.h>
|
||||
|
||||
/*
|
||||
* Map an interface index to its name (SIOCGIFNAME)
|
||||
*/
|
||||
|
||||
/*
|
||||
* We need this ioctl for efficient implementation of the
|
||||
* if_indextoname() function required by the IPv6 API. Without
|
||||
* it, we would have to search all the interfaces to find a
|
||||
* match. --pb
|
||||
*/
|
||||
|
||||
static int dev_ifname(struct net *net, struct ifreq __user *arg)
|
||||
{
|
||||
struct ifreq ifr;
|
||||
int error;
|
||||
|
||||
/*
|
||||
* Fetch the caller's info block.
|
||||
*/
|
||||
|
||||
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
|
||||
return -EFAULT;
|
||||
|
||||
error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static gifconf_func_t *gifconf_list[NPROTO];
|
||||
|
||||
/**
|
||||
* register_gifconf - register a SIOCGIF handler
|
||||
* @family: Address family
|
||||
* @gifconf: Function handler
|
||||
*
|
||||
* Register protocol dependent address dumping routines. The handler
|
||||
* that is passed must not be freed or reused until it has been replaced
|
||||
* by another handler.
|
||||
*/
|
||||
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
|
||||
{
|
||||
if (family >= NPROTO)
|
||||
return -EINVAL;
|
||||
gifconf_list[family] = gifconf;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(register_gifconf);
|
||||
|
||||
/*
|
||||
* Perform a SIOCGIFCONF call. This structure will change
|
||||
* size eventually, and there is nothing I can do about it.
|
||||
* Thus we will need a 'compatibility mode'.
|
||||
*/
|
||||
|
||||
static int dev_ifconf(struct net *net, char __user *arg)
|
||||
{
|
||||
struct ifconf ifc;
|
||||
struct net_device *dev;
|
||||
char __user *pos;
|
||||
int len;
|
||||
int total;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Fetch the caller's info block.
|
||||
*/
|
||||
|
||||
if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
|
||||
return -EFAULT;
|
||||
|
||||
pos = ifc.ifc_buf;
|
||||
len = ifc.ifc_len;
|
||||
|
||||
/*
|
||||
* Loop over the interfaces, and write an info block for each.
|
||||
*/
|
||||
|
||||
total = 0;
|
||||
for_each_netdev(net, dev) {
|
||||
for (i = 0; i < NPROTO; i++) {
|
||||
if (gifconf_list[i]) {
|
||||
int done;
|
||||
if (!pos)
|
||||
done = gifconf_list[i](dev, NULL, 0);
|
||||
else
|
||||
done = gifconf_list[i](dev, pos + total,
|
||||
len - total);
|
||||
if (done < 0)
|
||||
return -EFAULT;
|
||||
total += done;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* All done. Write the updated control block back to the caller.
|
||||
*/
|
||||
ifc.ifc_len = total;
|
||||
|
||||
/*
|
||||
* Both BSD and Solaris return 0 here, so we do too.
|
||||
*/
|
||||
return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
|
||||
}
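/*
 * Illustrative userspace sketch (assumption, not part of this file): a
 * typical SIOCGIFCONF caller matching dev_ifconf() above.  Passing a NULL
 * ifc_buf first asks only for the required length, which corresponds to
 * the "!pos" branch in the loop above.  Note the AF_INET handler only
 * reports interfaces that have an IPv4 address.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static void example_list_interfaces(void)
{
	struct ifconf ifc;
	struct ifreq *ifr;
	int fd, i, n;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0)
		return;

	memset(&ifc, 0, sizeof(ifc));
	ifc.ifc_buf = NULL;			/* size probe only */
	if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)
		goto out;

	ifc.ifc_buf = malloc(ifc.ifc_len);
	if (!ifc.ifc_buf)
		goto out;
	if (ioctl(fd, SIOCGIFCONF, &ifc) < 0)	/* fill the buffer */
		goto out_free;

	ifr = ifc.ifc_req;
	n = ifc.ifc_len / sizeof(struct ifreq);
	for (i = 0; i < n; i++)
		printf("%s\n", ifr[i].ifr_name);

out_free:
	free(ifc.ifc_buf);
out:
	close(fd);
}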
|
||||
|
||||
/*
|
||||
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
|
||||
*/
|
||||
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
|
||||
{
|
||||
int err;
|
||||
struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
|
||||
|
||||
if (!dev)
|
||||
return -ENODEV;
|
||||
|
||||
switch (cmd) {
|
||||
case SIOCGIFFLAGS: /* Get interface flags */
|
||||
ifr->ifr_flags = (short) dev_get_flags(dev);
|
||||
return 0;
|
||||
|
||||
case SIOCGIFMETRIC: /* Get the metric on the interface
|
||||
(currently unused) */
|
||||
ifr->ifr_metric = 0;
|
||||
return 0;
|
||||
|
||||
case SIOCGIFMTU: /* Get the MTU of a device */
|
||||
ifr->ifr_mtu = dev->mtu;
|
||||
return 0;
|
||||
|
||||
case SIOCGIFHWADDR:
|
||||
if (!dev->addr_len)
|
||||
memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
|
||||
else
|
||||
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
|
||||
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
|
||||
ifr->ifr_hwaddr.sa_family = dev->type;
|
||||
return 0;
|
||||
|
||||
case SIOCGIFSLAVE:
|
||||
err = -EINVAL;
|
||||
break;
|
||||
|
||||
case SIOCGIFMAP:
|
||||
ifr->ifr_map.mem_start = dev->mem_start;
|
||||
ifr->ifr_map.mem_end = dev->mem_end;
|
||||
ifr->ifr_map.base_addr = dev->base_addr;
|
||||
ifr->ifr_map.irq = dev->irq;
|
||||
ifr->ifr_map.dma = dev->dma;
|
||||
ifr->ifr_map.port = dev->if_port;
|
||||
return 0;
|
||||
|
||||
case SIOCGIFINDEX:
|
||||
ifr->ifr_ifindex = dev->ifindex;
|
||||
return 0;
|
||||
|
||||
case SIOCGIFTXQLEN:
|
||||
ifr->ifr_qlen = dev->tx_queue_len;
|
||||
return 0;
|
||||
|
||||
default:
|
||||
/* dev_ioctl() should ensure this case
|
||||
* is never reached
|
||||
*/
|
||||
WARN_ON(1);
|
||||
err = -ENOTTY;
|
||||
break;
|
||||
|
||||
}
|
||||
return err;
|
||||
}
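/*
 * Illustrative userspace sketch (assumption, not part of this file):
 * reading the hardware address and MTU of one interface through the
 * read-only ioctls served by dev_ifsioc_locked() above.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static void example_show_dev(const char *name)
{
	struct ifreq ifr;
	unsigned char *mac;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0) {
		mac = (unsigned char *)ifr.ifr_hwaddr.sa_data;
		printf("%s hwaddr %02x:%02x:%02x:%02x:%02x:%02x\n",
		       name, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	}
	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
		printf("%s mtu %d\n", name, ifr.ifr_mtu);

	close(fd);
}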
|
||||
|
||||
static int net_hwtstamp_validate(struct ifreq *ifr)
|
||||
{
|
||||
struct hwtstamp_config cfg;
|
||||
enum hwtstamp_tx_types tx_type;
|
||||
enum hwtstamp_rx_filters rx_filter;
|
||||
int tx_type_valid = 0;
|
||||
int rx_filter_valid = 0;
|
||||
|
||||
if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
|
||||
return -EFAULT;
|
||||
|
||||
if (cfg.flags) /* reserved for future extensions */
|
||||
return -EINVAL;
|
||||
|
||||
tx_type = cfg.tx_type;
|
||||
rx_filter = cfg.rx_filter;
|
||||
|
||||
switch (tx_type) {
|
||||
case HWTSTAMP_TX_OFF:
|
||||
case HWTSTAMP_TX_ON:
|
||||
case HWTSTAMP_TX_ONESTEP_SYNC:
|
||||
tx_type_valid = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
switch (rx_filter) {
|
||||
case HWTSTAMP_FILTER_NONE:
|
||||
case HWTSTAMP_FILTER_ALL:
|
||||
case HWTSTAMP_FILTER_SOME:
|
||||
case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
|
||||
case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
|
||||
case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
|
||||
case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
|
||||
case HWTSTAMP_FILTER_PTP_V2_EVENT:
|
||||
case HWTSTAMP_FILTER_PTP_V2_SYNC:
|
||||
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
|
||||
rx_filter_valid = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!tx_type_valid || !rx_filter_valid)
|
||||
return -ERANGE;
|
||||
|
||||
return 0;
|
||||
}
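/*
 * Illustrative userspace sketch (assumption, not part of this file): the
 * hwtstamp_config layout that net_hwtstamp_validate() above checks before
 * the request reaches the driver.  The driver still has to support
 * hardware timestamping for SIOCSHWTSTAMP to succeed.
 */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/net_tstamp.h>
#include <linux/sockios.h>

static int example_enable_rx_timestamps(const char *name)
{
	struct hwtstamp_config cfg;
	struct ifreq ifr;
	int ret, fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&cfg, 0, sizeof(cfg));
	cfg.flags = 0;				/* must be 0, see check above */
	cfg.tx_type = HWTSTAMP_TX_OFF;
	cfg.rx_filter = HWTSTAMP_FILTER_ALL;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
	ifr.ifr_data = (void *)&cfg;		/* copied in by the kernel */

	ret = ioctl(fd, SIOCSHWTSTAMP, &ifr);
	close(fd);
	return ret;
}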
|
||||
|
||||
/*
|
||||
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
|
||||
*/
|
||||
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
|
||||
{
|
||||
int err;
|
||||
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
|
||||
const struct net_device_ops *ops;
|
||||
|
||||
if (!dev)
|
||||
return -ENODEV;
|
||||
|
||||
ops = dev->netdev_ops;
|
||||
|
||||
switch (cmd) {
|
||||
case SIOCSIFFLAGS: /* Set interface flags */
|
||||
return dev_change_flags(dev, ifr->ifr_flags);
|
||||
|
||||
case SIOCSIFMETRIC: /* Set the metric on the interface
|
||||
(currently unused) */
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
case SIOCSIFMTU: /* Set the MTU of a device */
|
||||
return dev_set_mtu(dev, ifr->ifr_mtu);
|
||||
|
||||
case SIOCSIFHWADDR:
|
||||
return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
|
||||
|
||||
case SIOCSIFHWBROADCAST:
|
||||
if (ifr->ifr_hwaddr.sa_family != dev->type)
|
||||
return -EINVAL;
|
||||
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
|
||||
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
|
||||
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
|
||||
return 0;
|
||||
|
||||
case SIOCSIFMAP:
|
||||
if (ops->ndo_set_config) {
|
||||
if (!netif_device_present(dev))
|
||||
return -ENODEV;
|
||||
return ops->ndo_set_config(dev, &ifr->ifr_map);
|
||||
}
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
case SIOCADDMULTI:
|
||||
if (!ops->ndo_set_rx_mode ||
|
||||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
|
||||
return -EINVAL;
|
||||
if (!netif_device_present(dev))
|
||||
return -ENODEV;
|
||||
return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
|
||||
|
||||
case SIOCDELMULTI:
|
||||
if (!ops->ndo_set_rx_mode ||
|
||||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
|
||||
return -EINVAL;
|
||||
if (!netif_device_present(dev))
|
||||
return -ENODEV;
|
||||
return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
|
||||
|
||||
case SIOCSIFTXQLEN:
|
||||
if (ifr->ifr_qlen < 0)
|
||||
return -EINVAL;
|
||||
dev->tx_queue_len = ifr->ifr_qlen;
|
||||
return 0;
|
||||
|
||||
case SIOCSIFNAME:
|
||||
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
|
||||
return dev_change_name(dev, ifr->ifr_newname);
|
||||
|
||||
case SIOCSHWTSTAMP:
|
||||
err = net_hwtstamp_validate(ifr);
|
||||
if (err)
|
||||
return err;
|
||||
/* fall through */
|
||||
|
||||
/*
|
||||
* Unknown or private ioctl
|
||||
*/
|
||||
default:
|
||||
if ((cmd >= SIOCDEVPRIVATE &&
|
||||
cmd <= SIOCDEVPRIVATE + 15) ||
|
||||
cmd == SIOCBONDENSLAVE ||
|
||||
cmd == SIOCBONDRELEASE ||
|
||||
cmd == SIOCBONDSETHWADDR ||
|
||||
cmd == SIOCBONDSLAVEINFOQUERY ||
|
||||
cmd == SIOCBONDINFOQUERY ||
|
||||
cmd == SIOCBONDCHANGEACTIVE ||
|
||||
cmd == SIOCGMIIPHY ||
|
||||
cmd == SIOCGMIIREG ||
|
||||
cmd == SIOCSMIIREG ||
|
||||
cmd == SIOCBRADDIF ||
|
||||
cmd == SIOCBRDELIF ||
|
||||
cmd == SIOCSHWTSTAMP ||
|
||||
cmd == SIOCGHWTSTAMP ||
|
||||
cmd == SIOCWANDEV) {
|
||||
err = -EOPNOTSUPP;
|
||||
if (ops->ndo_do_ioctl) {
|
||||
if (netif_device_present(dev))
|
||||
err = ops->ndo_do_ioctl(dev, ifr, cmd);
|
||||
else
|
||||
err = -ENODEV;
|
||||
}
|
||||
} else
|
||||
err = -EINVAL;
|
||||
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
|
||||
* dev_load - load a network module
|
||||
* @net: the applicable net namespace
|
||||
* @name: name of interface
|
||||
*
|
||||
* If a network interface is not present and the process has suitable
|
||||
* privileges this function loads the module. If module loading is not
|
||||
* available in this kernel then it becomes a nop.
|
||||
*/
|
||||
|
||||
void dev_load(struct net *net, const char *name)
|
||||
{
|
||||
struct net_device *dev;
|
||||
int no_module;
|
||||
|
||||
rcu_read_lock();
|
||||
dev = dev_get_by_name_rcu(net, name);
|
||||
rcu_read_unlock();
|
||||
|
||||
no_module = !dev;
|
||||
if (no_module && capable(CAP_NET_ADMIN))
|
||||
no_module = request_module("netdev-%s", name);
|
||||
if (no_module && capable(CAP_SYS_MODULE))
|
||||
request_module("%s", name);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_load);
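/*
 * Illustrative sketch (assumption, not part of this file): the
 * request_module("netdev-%s", name) above matches module aliases of the
 * form "netdev-<ifname>".  A driver that owns a well-known interface name
 * can advertise such an alias, conventionally with MODULE_ALIAS_NETDEV()
 * from <linux/netdevice.h>; "tunl0" here only mirrors what the IPv4
 * tunnel driver does and is just an example.
 */
#include <linux/module.h>
#include <linux/netdevice.h>

MODULE_ALIAS_NETDEV("tunl0");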
|
||||
|
||||
/*
|
||||
* This function handles all "interface"-type I/O control requests. The actual
|
||||
* 'doing' part of this is dev_ifsioc above.
|
||||
*/
|
||||
|
||||
/**
|
||||
* dev_ioctl - network device ioctl
|
||||
* @net: the applicable net namespace
|
||||
* @cmd: command to issue
|
||||
* @arg: pointer to a struct ifreq in user space
|
||||
*
|
||||
* Issue ioctl functions to devices. This is normally called by the
|
||||
* user space syscall interfaces but can sometimes be useful for
|
||||
* other purposes. The return value is the return from the syscall if
|
||||
* positive or a negative errno code on error.
|
||||
*/
|
||||
|
||||
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
|
||||
{
|
||||
struct ifreq ifr;
|
||||
int ret;
|
||||
char *colon;
|
||||
|
||||
/* One special case: SIOCGIFCONF takes ifconf argument
|
||||
and requires shared lock, because it sleeps writing
|
||||
to user space.
|
||||
*/
|
||||
|
||||
if (cmd == SIOCGIFCONF) {
|
||||
rtnl_lock();
|
||||
ret = dev_ifconf(net, (char __user *) arg);
|
||||
rtnl_unlock();
|
||||
return ret;
|
||||
}
|
||||
if (cmd == SIOCGIFNAME)
|
||||
return dev_ifname(net, (struct ifreq __user *)arg);
|
||||
|
||||
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
|
||||
return -EFAULT;
|
||||
|
||||
ifr.ifr_name[IFNAMSIZ-1] = 0;
|
||||
|
||||
colon = strchr(ifr.ifr_name, ':');
|
||||
if (colon)
|
||||
*colon = 0;
|
||||
|
||||
/*
|
||||
* See which interface the caller is talking about.
|
||||
*/
|
||||
|
||||
switch (cmd) {
|
||||
/*
|
||||
* These ioctl calls:
|
||||
* - can be done by all.
|
||||
* - atomic and do not require locking.
|
||||
* - return a value
|
||||
*/
|
||||
case SIOCGIFFLAGS:
|
||||
case SIOCGIFMETRIC:
|
||||
case SIOCGIFMTU:
|
||||
case SIOCGIFHWADDR:
|
||||
case SIOCGIFSLAVE:
|
||||
case SIOCGIFMAP:
|
||||
case SIOCGIFINDEX:
|
||||
case SIOCGIFTXQLEN:
|
||||
dev_load(net, ifr.ifr_name);
|
||||
rcu_read_lock();
|
||||
ret = dev_ifsioc_locked(net, &ifr, cmd);
|
||||
rcu_read_unlock();
|
||||
if (!ret) {
|
||||
if (colon)
|
||||
*colon = ':';
|
||||
if (copy_to_user(arg, &ifr,
|
||||
sizeof(struct ifreq)))
|
||||
ret = -EFAULT;
|
||||
}
|
||||
return ret;
|
||||
|
||||
case SIOCETHTOOL:
|
||||
dev_load(net, ifr.ifr_name);
|
||||
rtnl_lock();
|
||||
ret = dev_ethtool(net, &ifr);
|
||||
rtnl_unlock();
|
||||
if (!ret) {
|
||||
if (colon)
|
||||
*colon = ':';
|
||||
if (copy_to_user(arg, &ifr,
|
||||
sizeof(struct ifreq)))
|
||||
ret = -EFAULT;
|
||||
}
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* These ioctl calls:
|
||||
* - require superuser power.
|
||||
* - require strict serialization.
|
||||
* - return a value
|
||||
*/
|
||||
case SIOCGMIIPHY:
|
||||
case SIOCGMIIREG:
|
||||
case SIOCSIFNAME:
|
||||
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
|
||||
return -EPERM;
|
||||
dev_load(net, ifr.ifr_name);
|
||||
rtnl_lock();
|
||||
ret = dev_ifsioc(net, &ifr, cmd);
|
||||
rtnl_unlock();
|
||||
if (!ret) {
|
||||
if (colon)
|
||||
*colon = ':';
|
||||
if (copy_to_user(arg, &ifr,
|
||||
sizeof(struct ifreq)))
|
||||
ret = -EFAULT;
|
||||
}
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* These ioctl calls:
|
||||
* - require superuser power.
|
||||
* - require strict serialization.
|
||||
* - do not return a value
|
||||
*/
|
||||
case SIOCSIFMAP:
|
||||
case SIOCSIFTXQLEN:
|
||||
if (!capable(CAP_NET_ADMIN))
|
||||
return -EPERM;
|
||||
/* fall through */
|
||||
/*
|
||||
* These ioctl calls:
|
||||
* - require local superuser power.
|
||||
* - require strict serialization.
|
||||
* - do not return a value
|
||||
*/
|
||||
case SIOCSIFFLAGS:
|
||||
case SIOCSIFMETRIC:
|
||||
case SIOCSIFMTU:
|
||||
case SIOCSIFHWADDR:
|
||||
case SIOCSIFSLAVE:
|
||||
case SIOCADDMULTI:
|
||||
case SIOCDELMULTI:
|
||||
case SIOCSIFHWBROADCAST:
|
||||
case SIOCSMIIREG:
|
||||
case SIOCBONDENSLAVE:
|
||||
case SIOCBONDRELEASE:
|
||||
case SIOCBONDSETHWADDR:
|
||||
case SIOCBONDCHANGEACTIVE:
|
||||
case SIOCBRADDIF:
|
||||
case SIOCBRDELIF:
|
||||
case SIOCSHWTSTAMP:
|
||||
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
|
||||
return -EPERM;
|
||||
/* fall through */
|
||||
case SIOCBONDSLAVEINFOQUERY:
|
||||
case SIOCBONDINFOQUERY:
|
||||
dev_load(net, ifr.ifr_name);
|
||||
rtnl_lock();
|
||||
ret = dev_ifsioc(net, &ifr, cmd);
|
||||
rtnl_unlock();
|
||||
return ret;
|
||||
|
||||
case SIOCGIFMEM:
|
||||
/* Get the per device memory space. We can add this but
|
||||
* currently do not support it */
|
||||
case SIOCSIFMEM:
|
||||
/* Set the per device memory buffer space.
|
||||
* Not applicable in our case */
|
||||
case SIOCSIFLINK:
|
||||
return -ENOTTY;
|
||||
|
||||
/*
|
||||
* Unknown or private ioctl.
|
||||
*/
|
||||
default:
|
||||
if (cmd == SIOCWANDEV ||
|
||||
cmd == SIOCGHWTSTAMP ||
|
||||
(cmd >= SIOCDEVPRIVATE &&
|
||||
cmd <= SIOCDEVPRIVATE + 15)) {
|
||||
dev_load(net, ifr.ifr_name);
|
||||
rtnl_lock();
|
||||
ret = dev_ifsioc(net, &ifr, cmd);
|
||||
rtnl_unlock();
|
||||
if (!ret && copy_to_user(arg, &ifr,
|
||||
sizeof(struct ifreq)))
|
||||
ret = -EFAULT;
|
||||
return ret;
|
||||
}
|
||||
/* Take care of Wireless Extensions */
|
||||
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
|
||||
return wext_handle_ioctl(net, &ifr, cmd, arg);
|
||||
return -ENOTTY;
|
||||
}
|
||||
}
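/*
 * Illustrative userspace sketch (assumption, not part of this file): the
 * classic get/modify/set flag sequence that ends up in dev_ioctl() above,
 * here used to set IFF_UP on an interface (requires CAP_NET_ADMIN).
 */
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int example_bring_up(const char *name)
{
	struct ifreq ifr;
	int ret, fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);

	ret = ioctl(fd, SIOCGIFFLAGS, &ifr);	/* read current flags */
	if (ret == 0) {
		ifr.ifr_flags |= IFF_UP;	/* modify */
		ret = ioctl(fd, SIOCSIFFLAGS, &ifr); /* write back */
	}
	close(fd);
	return ret;
}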
|
437
net/core/drop_monitor.c
Normal file
|
@ -0,0 +1,437 @@
|
|||
/*
|
||||
* Monitoring code for network dropped packet alerts
|
||||
*
|
||||
* Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/netpoll.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/net_dropmon.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/genetlink.h>
|
||||
#include <net/netevent.h>
|
||||
|
||||
#include <trace/events/skb.h>
|
||||
#include <trace/events/napi.h>
|
||||
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#define TRACE_ON 1
|
||||
#define TRACE_OFF 0
|
||||
|
||||
/*
|
||||
* Globals, our netlink socket pointer
|
||||
* and the work handle that will send up
|
||||
* netlink alerts
|
||||
*/
|
||||
static int trace_state = TRACE_OFF;
|
||||
static DEFINE_MUTEX(trace_state_mutex);
|
||||
|
||||
struct per_cpu_dm_data {
|
||||
spinlock_t lock;
|
||||
struct sk_buff *skb;
|
||||
struct work_struct dm_alert_work;
|
||||
struct timer_list send_timer;
|
||||
};
|
||||
|
||||
struct dm_hw_stat_delta {
|
||||
struct net_device *dev;
|
||||
unsigned long last_rx;
|
||||
struct list_head list;
|
||||
struct rcu_head rcu;
|
||||
unsigned long last_drop_val;
|
||||
};
|
||||
|
||||
static struct genl_family net_drop_monitor_family = {
|
||||
.id = GENL_ID_GENERATE,
|
||||
.hdrsize = 0,
|
||||
.name = "NET_DM",
|
||||
.version = 2,
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
|
||||
|
||||
static int dm_hit_limit = 64;
|
||||
static int dm_delay = 1;
|
||||
static unsigned long dm_hw_check_delta = 2*HZ;
|
||||
static LIST_HEAD(hw_stats_list);
|
||||
|
||||
static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
|
||||
{
|
||||
size_t al;
|
||||
struct net_dm_alert_msg *msg;
|
||||
struct nlattr *nla;
|
||||
struct sk_buff *skb;
|
||||
unsigned long flags;
|
||||
|
||||
al = sizeof(struct net_dm_alert_msg);
|
||||
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
|
||||
al += sizeof(struct nlattr);
|
||||
|
||||
skb = genlmsg_new(al, GFP_KERNEL);
|
||||
|
||||
if (skb) {
|
||||
genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
|
||||
0, NET_DM_CMD_ALERT);
|
||||
nla = nla_reserve(skb, NLA_UNSPEC,
|
||||
sizeof(struct net_dm_alert_msg));
|
||||
msg = nla_data(nla);
|
||||
memset(msg, 0, al);
|
||||
} else {
|
||||
mod_timer(&data->send_timer, jiffies + HZ / 10);
|
||||
}
|
||||
|
||||
spin_lock_irqsave(&data->lock, flags);
|
||||
swap(data->skb, skb);
|
||||
spin_unlock_irqrestore(&data->lock, flags);
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct genl_multicast_group dropmon_mcgrps[] = {
|
||||
{ .name = "events", },
|
||||
};
|
||||
|
||||
static void send_dm_alert(struct work_struct *work)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct per_cpu_dm_data *data;
|
||||
|
||||
data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
|
||||
|
||||
skb = reset_per_cpu_data(data);
|
||||
|
||||
if (skb)
|
||||
genlmsg_multicast(&net_drop_monitor_family, skb, 0,
|
||||
0, GFP_KERNEL);
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the timer function to delay the sending of an alert
|
||||
* in the event that more drops will arrive during the
|
||||
* hysteresis period.
|
||||
*/
|
||||
static void sched_send_work(unsigned long _data)
|
||||
{
|
||||
struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
|
||||
|
||||
schedule_work(&data->dm_alert_work);
|
||||
}
|
||||
|
||||
static void trace_drop_common(struct sk_buff *skb, void *location)
|
||||
{
|
||||
struct net_dm_alert_msg *msg;
|
||||
struct nlmsghdr *nlh;
|
||||
struct nlattr *nla;
|
||||
int i;
|
||||
struct sk_buff *dskb;
|
||||
struct per_cpu_dm_data *data;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
data = this_cpu_ptr(&dm_cpu_data);
|
||||
spin_lock(&data->lock);
|
||||
dskb = data->skb;
|
||||
|
||||
if (!dskb)
|
||||
goto out;
|
||||
|
||||
nlh = (struct nlmsghdr *)dskb->data;
|
||||
nla = genlmsg_data(nlmsg_data(nlh));
|
||||
msg = nla_data(nla);
|
||||
for (i = 0; i < msg->entries; i++) {
|
||||
if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
|
||||
msg->points[i].count++;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (msg->entries == dm_hit_limit)
|
||||
goto out;
|
||||
/*
|
||||
* We need to create a new entry
|
||||
*/
|
||||
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
|
||||
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
|
||||
memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
|
||||
msg->points[msg->entries].count = 1;
|
||||
msg->entries++;
|
||||
|
||||
if (!timer_pending(&data->send_timer)) {
|
||||
data->send_timer.expires = jiffies + dm_delay * HZ;
|
||||
add_timer(&data->send_timer);
|
||||
}
|
||||
|
||||
out:
|
||||
spin_unlock_irqrestore(&data->lock, flags);
|
||||
}
|
||||
|
||||
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
|
||||
{
|
||||
trace_drop_common(skb, location);
|
||||
}
|
||||
|
||||
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
|
||||
{
|
||||
struct dm_hw_stat_delta *new_stat;
|
||||
|
||||
/*
|
||||
* Don't check napi structures with no associated device
|
||||
*/
|
||||
if (!napi->dev)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
|
||||
/*
|
||||
* only add a note to our monitor buffer if:
|
||||
* 1) this is the dev we received on
|
||||
* 2) its after the last_rx delta
|
||||
* 3) our rx_dropped count has gone up
|
||||
*/
|
||||
if ((new_stat->dev == napi->dev) &&
|
||||
(time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
|
||||
(napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
|
||||
trace_drop_common(NULL, NULL);
|
||||
new_stat->last_drop_val = napi->dev->stats.rx_dropped;
|
||||
new_stat->last_rx = jiffies;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int set_all_monitor_traces(int state)
|
||||
{
|
||||
int rc = 0;
|
||||
struct dm_hw_stat_delta *new_stat = NULL;
|
||||
struct dm_hw_stat_delta *temp;
|
||||
|
||||
mutex_lock(&trace_state_mutex);
|
||||
|
||||
if (state == trace_state) {
|
||||
rc = -EAGAIN;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
switch (state) {
|
||||
case TRACE_ON:
|
||||
if (!try_module_get(THIS_MODULE)) {
|
||||
rc = -ENODEV;
|
||||
break;
|
||||
}
|
||||
|
||||
rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
|
||||
rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
|
||||
break;
|
||||
|
||||
case TRACE_OFF:
|
||||
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
|
||||
rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
|
||||
|
||||
tracepoint_synchronize_unregister();
|
||||
|
||||
/*
|
||||
* Clean the device list
|
||||
*/
|
||||
list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
|
||||
if (new_stat->dev == NULL) {
|
||||
list_del_rcu(&new_stat->list);
|
||||
kfree_rcu(new_stat, rcu);
|
||||
}
|
||||
}
|
||||
|
||||
module_put(THIS_MODULE);
|
||||
|
||||
break;
|
||||
default:
|
||||
rc = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!rc)
|
||||
trace_state = state;
|
||||
else
|
||||
rc = -EINPROGRESS;
|
||||
|
||||
out_unlock:
|
||||
mutex_unlock(&trace_state_mutex);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
static int net_dm_cmd_config(struct sk_buff *skb,
|
||||
struct genl_info *info)
|
||||
{
|
||||
return -ENOTSUPP;
|
||||
}
|
||||
|
||||
static int net_dm_cmd_trace(struct sk_buff *skb,
|
||||
struct genl_info *info)
|
||||
{
|
||||
switch (info->genlhdr->cmd) {
|
||||
case NET_DM_CMD_START:
|
||||
return set_all_monitor_traces(TRACE_ON);
|
||||
case NET_DM_CMD_STOP:
|
||||
return set_all_monitor_traces(TRACE_OFF);
|
||||
}
|
||||
|
||||
return -ENOTSUPP;
|
||||
}
|
||||
|
||||
static int dropmon_net_event(struct notifier_block *ev_block,
|
||||
unsigned long event, void *ptr)
|
||||
{
|
||||
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||
struct dm_hw_stat_delta *new_stat = NULL;
|
||||
struct dm_hw_stat_delta *tmp;
|
||||
|
||||
switch (event) {
|
||||
case NETDEV_REGISTER:
|
||||
new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
|
||||
|
||||
if (!new_stat)
|
||||
goto out;
|
||||
|
||||
new_stat->dev = dev;
|
||||
new_stat->last_rx = jiffies;
|
||||
mutex_lock(&trace_state_mutex);
|
||||
list_add_rcu(&new_stat->list, &hw_stats_list);
|
||||
mutex_unlock(&trace_state_mutex);
|
||||
break;
|
||||
case NETDEV_UNREGISTER:
|
||||
mutex_lock(&trace_state_mutex);
|
||||
list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
|
||||
if (new_stat->dev == dev) {
|
||||
new_stat->dev = NULL;
|
||||
if (trace_state == TRACE_OFF) {
|
||||
list_del_rcu(&new_stat->list);
|
||||
kfree_rcu(new_stat, rcu);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
mutex_unlock(&trace_state_mutex);
|
||||
break;
|
||||
}
|
||||
out:
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static const struct genl_ops dropmon_ops[] = {
|
||||
{
|
||||
.cmd = NET_DM_CMD_CONFIG,
|
||||
.doit = net_dm_cmd_config,
|
||||
},
|
||||
{
|
||||
.cmd = NET_DM_CMD_START,
|
||||
.doit = net_dm_cmd_trace,
|
||||
},
|
||||
{
|
||||
.cmd = NET_DM_CMD_STOP,
|
||||
.doit = net_dm_cmd_trace,
|
||||
},
|
||||
};
|
||||
|
||||
static struct notifier_block dropmon_net_notifier = {
|
||||
.notifier_call = dropmon_net_event
|
||||
};
|
||||
|
||||
static int __init init_net_drop_monitor(void)
|
||||
{
|
||||
struct per_cpu_dm_data *data;
|
||||
int cpu, rc;
|
||||
|
||||
pr_info("Initializing network drop monitor service\n");
|
||||
|
||||
if (sizeof(void *) > 8) {
|
||||
pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
|
||||
dropmon_ops, dropmon_mcgrps);
|
||||
if (rc) {
|
||||
pr_err("Could not create drop monitor netlink family\n");
|
||||
return rc;
|
||||
}
|
||||
WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
|
||||
|
||||
rc = register_netdevice_notifier(&dropmon_net_notifier);
|
||||
if (rc < 0) {
|
||||
pr_crit("Failed to register netdevice notifier\n");
|
||||
goto out_unreg;
|
||||
}
|
||||
|
||||
rc = 0;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
data = &per_cpu(dm_cpu_data, cpu);
|
||||
INIT_WORK(&data->dm_alert_work, send_dm_alert);
|
||||
init_timer(&data->send_timer);
|
||||
data->send_timer.data = (unsigned long)data;
|
||||
data->send_timer.function = sched_send_work;
|
||||
spin_lock_init(&data->lock);
|
||||
reset_per_cpu_data(data);
|
||||
}
|
||||
|
||||
|
||||
goto out;
|
||||
|
||||
out_unreg:
|
||||
genl_unregister_family(&net_drop_monitor_family);
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void exit_net_drop_monitor(void)
|
||||
{
|
||||
struct per_cpu_dm_data *data;
|
||||
int cpu;
|
||||
|
||||
BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
|
||||
|
||||
	/*
	 * Because of the module_get/put we do in the trace state change path
	 * we are guaranteed not to have any current users when we get here
	 * all we need to do is make sure that we don't have any running timers
	 * or pending schedule calls
	 */
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
data = &per_cpu(dm_cpu_data, cpu);
|
||||
del_timer_sync(&data->send_timer);
|
||||
cancel_work_sync(&data->dm_alert_work);
|
||||
/*
|
||||
* At this point, we should have exclusive access
|
||||
* to this struct and can free the skb inside it
|
||||
*/
|
||||
kfree_skb(data->skb);
|
||||
}
|
||||
|
||||
BUG_ON(genl_unregister_family(&net_drop_monitor_family));
|
||||
}
|
||||
|
||||
module_init(init_net_drop_monitor);
|
||||
module_exit(exit_net_drop_monitor);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
|
||||
MODULE_ALIAS_GENL_FAMILY("NET_DM");
|
421
net/core/dst.c
Normal file
|
@ -0,0 +1,421 @@
|
|||
/*
|
||||
* net/core/dst.c Protocol independent destination cache.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/types.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/prefetch.h>
|
||||
|
||||
#include <net/dst.h>
|
||||
|
||||
/*
 * Theory of operations:
 * 1) We use a list, protected by a spinlock, to add
 *    new entries from both BH and non-BH context.
 * 2) In order to keep the spinlock held only for a short time,
 *    long lived entries are moved to a second list, handled by a
 *    garbage collect task fired from a workqueue.
 * 3) This second list is guarded by a mutex,
 *    so that the gc_task and dst_dev_event() can be synchronized.
 */

/*
 * We want to keep lock & list close together
 * to dirty as few cache lines as possible in __dst_free().
 * As this is not a very strong hint, we don't force an alignment on SMP.
 */
|
||||
static struct {
|
||||
spinlock_t lock;
|
||||
struct dst_entry *list;
|
||||
unsigned long timer_inc;
|
||||
unsigned long timer_expires;
|
||||
} dst_garbage = {
|
||||
.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
|
||||
.timer_inc = DST_GC_MAX,
|
||||
};
|
||||
static void dst_gc_task(struct work_struct *work);
|
||||
static void ___dst_free(struct dst_entry *dst);
|
||||
|
||||
static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
|
||||
|
||||
static DEFINE_MUTEX(dst_gc_mutex);
|
||||
/*
|
||||
* long lived entries are maintained in this list, guarded by dst_gc_mutex
|
||||
*/
|
||||
static struct dst_entry *dst_busy_list;
|
||||
|
||||
static void dst_gc_task(struct work_struct *work)
|
||||
{
|
||||
int delayed = 0;
|
||||
int work_performed = 0;
|
||||
unsigned long expires = ~0L;
|
||||
struct dst_entry *dst, *next, head;
|
||||
struct dst_entry *last = &head;
|
||||
|
||||
mutex_lock(&dst_gc_mutex);
|
||||
next = dst_busy_list;
|
||||
|
||||
loop:
|
||||
while ((dst = next) != NULL) {
|
||||
next = dst->next;
|
||||
prefetch(&next->next);
|
||||
cond_resched();
|
||||
if (likely(atomic_read(&dst->__refcnt))) {
|
||||
last->next = dst;
|
||||
last = dst;
|
||||
delayed++;
|
||||
continue;
|
||||
}
|
||||
work_performed++;
|
||||
|
||||
dst = dst_destroy(dst);
|
||||
if (dst) {
|
||||
/* NOHASH and still referenced. Unless it is already
|
||||
* on gc list, invalidate it and add to gc list.
|
||||
*
|
||||
* Note: this is temporary. Actually, NOHASH dst's
|
||||
* must be obsoleted when parent is obsoleted.
|
||||
* But we do not have state "obsoleted, but
|
||||
* referenced by parent", so it is right.
|
||||
*/
|
||||
if (dst->obsolete > 0)
|
||||
continue;
|
||||
|
||||
___dst_free(dst);
|
||||
dst->next = next;
|
||||
next = dst;
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock_bh(&dst_garbage.lock);
|
||||
next = dst_garbage.list;
|
||||
if (next) {
|
||||
dst_garbage.list = NULL;
|
||||
spin_unlock_bh(&dst_garbage.lock);
|
||||
goto loop;
|
||||
}
|
||||
last->next = NULL;
|
||||
dst_busy_list = head.next;
|
||||
if (!dst_busy_list)
|
||||
dst_garbage.timer_inc = DST_GC_MAX;
|
||||
else {
|
||||
/*
|
||||
* if we freed less than 1/10 of delayed entries,
|
||||
* we can sleep longer.
|
||||
*/
|
||||
if (work_performed <= delayed/10) {
|
||||
dst_garbage.timer_expires += dst_garbage.timer_inc;
|
||||
if (dst_garbage.timer_expires > DST_GC_MAX)
|
||||
dst_garbage.timer_expires = DST_GC_MAX;
|
||||
dst_garbage.timer_inc += DST_GC_INC;
|
||||
} else {
|
||||
dst_garbage.timer_inc = DST_GC_INC;
|
||||
dst_garbage.timer_expires = DST_GC_MIN;
|
||||
}
|
||||
expires = dst_garbage.timer_expires;
|
||||
/*
|
||||
* if the next desired timer is more than 4 seconds in the
|
||||
* future then round the timer to whole seconds
|
||||
*/
|
||||
if (expires > 4*HZ)
|
||||
expires = round_jiffies_relative(expires);
|
||||
schedule_delayed_work(&dst_gc_work, expires);
|
||||
}
|
||||
|
||||
spin_unlock_bh(&dst_garbage.lock);
|
||||
mutex_unlock(&dst_gc_mutex);
|
||||
}
|
||||
|
||||
int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
kfree_skb(skb);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(dst_discard_sk);
|
||||
|
||||
const u32 dst_default_metrics[RTAX_MAX + 1] = {
|
||||
/* This initializer is needed to force linker to place this variable
|
||||
* into const section. Otherwise it might end into bss section.
|
||||
* We really want to avoid false sharing on this variable, and catch
|
||||
* any writes on it.
|
||||
*/
|
||||
[RTAX_MAX] = 0xdeadbeef,
|
||||
};
|
||||
|
||||
|
||||
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
|
||||
int initial_ref, int initial_obsolete, unsigned short flags)
|
||||
{
|
||||
struct dst_entry *dst;
|
||||
|
||||
if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
|
||||
if (ops->gc(ops))
|
||||
return NULL;
|
||||
}
|
||||
dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
|
||||
if (!dst)
|
||||
return NULL;
|
||||
dst->child = NULL;
|
||||
dst->dev = dev;
|
||||
if (dev)
|
||||
dev_hold(dev);
|
||||
dst->ops = ops;
|
||||
dst_init_metrics(dst, dst_default_metrics, true);
|
||||
dst->expires = 0UL;
|
||||
dst->path = dst;
|
||||
dst->from = NULL;
|
||||
#ifdef CONFIG_XFRM
|
||||
dst->xfrm = NULL;
|
||||
#endif
|
||||
dst->input = dst_discard;
|
||||
dst->output = dst_discard_sk;
|
||||
dst->error = 0;
|
||||
dst->obsolete = initial_obsolete;
|
||||
dst->header_len = 0;
|
||||
dst->trailer_len = 0;
|
||||
#ifdef CONFIG_IP_ROUTE_CLASSID
|
||||
dst->tclassid = 0;
|
||||
#endif
|
||||
atomic_set(&dst->__refcnt, initial_ref);
|
||||
dst->__use = 0;
|
||||
dst->lastuse = jiffies;
|
||||
dst->flags = flags;
|
||||
dst->pending_confirm = 0;
|
||||
dst->next = NULL;
|
||||
if (!(flags & DST_NOCOUNT))
|
||||
dst_entries_add(ops, 1);
|
||||
return dst;
|
||||
}
|
||||
EXPORT_SYMBOL(dst_alloc);
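/*
 * Illustrative sketch (assumption, not part of this file): how a protocol
 * typically wraps dst_alloc() for its own routing entry type, in the
 * spirit of IPv4's rtable allocation.  "struct my_route" and "my_dst_ops"
 * are made-up names; my_dst_ops.kmem_cachep is assumed to be sized for
 * struct my_route.
 */
struct my_route {
	struct dst_entry	dst;	/* must be the first member */
	__be32			gateway;
};

static struct dst_ops my_dst_ops;

static struct my_route *my_route_alloc(struct net_device *dev)
{
	struct my_route *rt;

	/* one reference for the caller, force ->check() on every use */
	rt = dst_alloc(&my_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 0);
	if (rt)
		rt->gateway = 0;
	return rt;
}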
|
||||
|
||||
static void ___dst_free(struct dst_entry *dst)
|
||||
{
|
||||
/* The first case (dev==NULL) is required, when
|
||||
protocol module is unloaded.
|
||||
*/
|
||||
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
|
||||
dst->input = dst_discard;
|
||||
dst->output = dst_discard_sk;
|
||||
}
|
||||
dst->obsolete = DST_OBSOLETE_DEAD;
|
||||
}
|
||||
|
||||
void __dst_free(struct dst_entry *dst)
|
||||
{
|
||||
spin_lock_bh(&dst_garbage.lock);
|
||||
___dst_free(dst);
|
||||
dst->next = dst_garbage.list;
|
||||
dst_garbage.list = dst;
|
||||
if (dst_garbage.timer_inc > DST_GC_INC) {
|
||||
dst_garbage.timer_inc = DST_GC_INC;
|
||||
dst_garbage.timer_expires = DST_GC_MIN;
|
||||
mod_delayed_work(system_wq, &dst_gc_work,
|
||||
dst_garbage.timer_expires);
|
||||
}
|
||||
spin_unlock_bh(&dst_garbage.lock);
|
||||
}
|
||||
EXPORT_SYMBOL(__dst_free);
|
||||
|
||||
struct dst_entry *dst_destroy(struct dst_entry * dst)
|
||||
{
|
||||
struct dst_entry *child;
|
||||
|
||||
smp_rmb();
|
||||
|
||||
again:
|
||||
child = dst->child;
|
||||
|
||||
if (!(dst->flags & DST_NOCOUNT))
|
||||
dst_entries_add(dst->ops, -1);
|
||||
|
||||
if (dst->ops->destroy)
|
||||
dst->ops->destroy(dst);
|
||||
if (dst->dev)
|
||||
dev_put(dst->dev);
|
||||
kmem_cache_free(dst->ops->kmem_cachep, dst);
|
||||
|
||||
dst = child;
|
||||
if (dst) {
|
||||
int nohash = dst->flags & DST_NOHASH;
|
||||
|
||||
if (atomic_dec_and_test(&dst->__refcnt)) {
|
||||
/* We were real parent of this dst, so kill child. */
|
||||
if (nohash)
|
||||
goto again;
|
||||
} else {
|
||||
/* Child is still referenced, return it for freeing. */
|
||||
if (nohash)
|
||||
return dst;
|
||||
/* Child is still in his hash table */
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(dst_destroy);
|
||||
|
||||
static void dst_destroy_rcu(struct rcu_head *head)
|
||||
{
|
||||
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
|
||||
|
||||
dst = dst_destroy(dst);
|
||||
if (dst)
|
||||
__dst_free(dst);
|
||||
}
|
||||
|
||||
void dst_release(struct dst_entry *dst)
|
||||
{
|
||||
if (dst) {
|
||||
int newrefcnt;
|
||||
|
||||
newrefcnt = atomic_dec_return(&dst->__refcnt);
|
||||
WARN_ON(newrefcnt < 0);
|
||||
if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
|
||||
call_rcu(&dst->rcu_head, dst_destroy_rcu);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(dst_release);
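/*
 * Illustrative sketch (assumption, not part of this file): the usual
 * pairing around dst_release().  Whoever takes a reference with
 * dst_hold(), or receives one from a route lookup, hands it back here.
 */
static void example_use_dst(struct dst_entry *dst)
{
	dst_hold(dst);			/* refcount++ while we use it */
	/* ... use dst->dev, dst_metric(dst, ...), etc. ... */
	dst_release(dst);		/* may trigger RCU destruction */
}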
|
||||
|
||||
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
|
||||
{
|
||||
u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
|
||||
|
||||
if (p) {
|
||||
u32 *old_p = __DST_METRICS_PTR(old);
|
||||
unsigned long prev, new;
|
||||
|
||||
memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
|
||||
|
||||
new = (unsigned long) p;
|
||||
prev = cmpxchg(&dst->_metrics, old, new);
|
||||
|
||||
if (prev != old) {
|
||||
kfree(p);
|
||||
p = __DST_METRICS_PTR(prev);
|
||||
if (prev & DST_METRICS_READ_ONLY)
|
||||
p = NULL;
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
EXPORT_SYMBOL(dst_cow_metrics_generic);
|
||||
|
||||
/* Caller asserts that dst_metrics_read_only(dst) is false. */
|
||||
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
|
||||
{
|
||||
unsigned long prev, new;
|
||||
|
||||
new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
|
||||
prev = cmpxchg(&dst->_metrics, old, new);
|
||||
if (prev == old)
|
||||
kfree(__DST_METRICS_PTR(old));
|
||||
}
|
||||
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
|
||||
|
||||
/**
|
||||
* __skb_dst_set_noref - sets skb dst, without a reference
|
||||
* @skb: buffer
|
||||
* @dst: dst entry
|
||||
* @force: if force is set, use noref version even for DST_NOCACHE entries
|
||||
*
|
||||
* Sets skb dst, assuming a reference was not taken on dst
|
||||
* skb_dst_drop() should not dst_release() this dst
|
||||
*/
|
||||
void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
|
||||
{
|
||||
WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
|
||||
/* If dst not in cache, we must take a reference, because
|
||||
* dst_release() will destroy dst as soon as its refcount becomes zero
|
||||
*/
|
||||
if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
|
||||
dst_hold(dst);
|
||||
skb_dst_set(skb, dst);
|
||||
} else {
|
||||
skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_dst_set_noref);
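/*
 * Sketch of the usual call sites (assumption, not part of this file):
 * callers normally go through the inline wrappers in <linux/skbuff.h>
 * rather than calling __skb_dst_set_noref() directly, roughly:
 *
 *	static inline void skb_dst_set_noref(struct sk_buff *skb,
 *					     struct dst_entry *dst)
 *	{
 *		__skb_dst_set_noref(skb, dst, false);
 *	}
 *
 *	static inline void skb_dst_set_noref_force(struct sk_buff *skb,
 *						   struct dst_entry *dst)
 *	{
 *		__skb_dst_set_noref(skb, dst, true);
 *	}
 */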
|
||||
|
||||
/* Dirty hack. We did it in 2.2 (in __dst_free),
|
||||
* we have _very_ good reasons not to repeat
|
||||
* this mistake in 2.3, but we have no choice
|
||||
* now. _It_ _is_ _explicit_ _deliberate_
|
||||
* _race_ _condition_.
|
||||
*
|
||||
* Commented and originally written by Alexey.
|
||||
*/
|
||||
static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
|
||||
int unregister)
|
||||
{
|
||||
if (dst->ops->ifdown)
|
||||
dst->ops->ifdown(dst, dev, unregister);
|
||||
|
||||
if (dev != dst->dev)
|
||||
return;
|
||||
|
||||
if (!unregister) {
|
||||
dst->input = dst_discard;
|
||||
dst->output = dst_discard_sk;
|
||||
} else {
|
||||
dst->dev = dev_net(dst->dev)->loopback_dev;
|
||||
dev_hold(dst->dev);
|
||||
dev_put(dev);
|
||||
}
|
||||
}
|
||||
|
||||
static int dst_dev_event(struct notifier_block *this, unsigned long event,
|
||||
void *ptr)
|
||||
{
|
||||
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||
struct dst_entry *dst, *last = NULL;
|
||||
|
||||
switch (event) {
|
||||
case NETDEV_UNREGISTER_FINAL:
|
||||
case NETDEV_DOWN:
|
||||
mutex_lock(&dst_gc_mutex);
|
||||
for (dst = dst_busy_list; dst; dst = dst->next) {
|
||||
last = dst;
|
||||
dst_ifdown(dst, dev, event != NETDEV_DOWN);
|
||||
}
|
||||
|
||||
spin_lock_bh(&dst_garbage.lock);
|
||||
dst = dst_garbage.list;
|
||||
dst_garbage.list = NULL;
|
||||
spin_unlock_bh(&dst_garbage.lock);
|
||||
|
||||
if (last)
|
||||
last->next = dst;
|
||||
else
|
||||
dst_busy_list = dst;
|
||||
for (; dst; dst = dst->next)
|
||||
dst_ifdown(dst, dev, event != NETDEV_DOWN);
|
||||
mutex_unlock(&dst_gc_mutex);
|
||||
break;
|
||||
}
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static struct notifier_block dst_dev_notifier = {
|
||||
.notifier_call = dst_dev_event,
|
||||
.priority = -10, /* must be called after other network notifiers */
|
||||
};
|
||||
|
||||
void __init dst_init(void)
|
||||
{
|
||||
register_netdevice_notifier(&dst_dev_notifier);
|
||||
}
|
1957
net/core/ethtool.c
Normal file
File diff suppressed because it is too large
854
net/core/fib_rules.c
Normal file
|
@ -0,0 +1,854 @@
|
|||
/*
|
||||
* net/core/fib_rules.c Generic Routing Rules
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License as
|
||||
* published by the Free Software Foundation, version 2.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/fib_rules.h>
|
||||
|
||||
int fib_default_rule_add(struct fib_rules_ops *ops,
|
||||
u32 pref, u32 table, u32 flags)
|
||||
{
|
||||
struct fib_rule *r;
|
||||
|
||||
r = kzalloc(ops->rule_size, GFP_KERNEL);
|
||||
if (r == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
atomic_set(&r->refcnt, 1);
|
||||
r->action = FR_ACT_TO_TBL;
|
||||
r->pref = pref;
|
||||
r->table = table;
|
||||
r->flags = flags;
|
||||
r->uid_start = INVALID_UID;
|
||||
r->uid_end = INVALID_UID;
|
||||
r->fr_net = hold_net(ops->fro_net);
|
||||
|
||||
r->suppress_prefixlen = -1;
|
||||
r->suppress_ifgroup = -1;
|
||||
|
||||
	/* The lock is not required here, the list is unreachable
	 * at the moment this function is called */
|
||||
list_add_tail(&r->list, &ops->rules_list);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fib_default_rule_add);
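/*
 * Illustrative sketch (assumption, not part of this file): how an address
 * family typically seeds its rule list at init time, in the spirit of
 * IPv4's fib_default_rules_init().  The preference values and the
 * local/main/default table layout follow the usual IPv4 convention.
 */
static int example_default_rules_init(struct fib_rules_ops *ops)
{
	int err;

	err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
	if (err < 0)
		return err;
	err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
	if (err < 0)
		return err;
	err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
	if (err < 0)
		return err;
	return 0;
}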
|
||||
|
||||
u32 fib_default_rule_pref(struct fib_rules_ops *ops)
|
||||
{
|
||||
struct list_head *pos;
|
||||
struct fib_rule *rule;
|
||||
|
||||
if (!list_empty(&ops->rules_list)) {
|
||||
pos = ops->rules_list.next;
|
||||
if (pos->next != &ops->rules_list) {
|
||||
rule = list_entry(pos->next, struct fib_rule, list);
|
||||
if (rule->pref)
|
||||
return rule->pref - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(fib_default_rule_pref);
|
||||
|
||||
static void notify_rule_change(int event, struct fib_rule *rule,
|
||||
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
|
||||
u32 pid);
|
||||
|
||||
static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
|
||||
{
|
||||
struct fib_rules_ops *ops;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
|
||||
if (ops->family == family) {
|
||||
if (!try_module_get(ops->owner))
|
||||
ops = NULL;
|
||||
rcu_read_unlock();
|
||||
return ops;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void rules_ops_put(struct fib_rules_ops *ops)
|
||||
{
|
||||
if (ops)
|
||||
module_put(ops->owner);
|
||||
}
|
||||
|
||||
static void flush_route_cache(struct fib_rules_ops *ops)
|
||||
{
|
||||
if (ops->flush_cache)
|
||||
ops->flush_cache(ops);
|
||||
}
|
||||
|
||||
static int __fib_rules_register(struct fib_rules_ops *ops)
|
||||
{
|
||||
int err = -EEXIST;
|
||||
struct fib_rules_ops *o;
|
||||
struct net *net;
|
||||
|
||||
net = ops->fro_net;
|
||||
|
||||
if (ops->rule_size < sizeof(struct fib_rule))
|
||||
return -EINVAL;
|
||||
|
||||
if (ops->match == NULL || ops->configure == NULL ||
|
||||
ops->compare == NULL || ops->fill == NULL ||
|
||||
ops->action == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
spin_lock(&net->rules_mod_lock);
|
||||
list_for_each_entry(o, &net->rules_ops, list)
|
||||
if (ops->family == o->family)
|
||||
goto errout;
|
||||
|
||||
hold_net(net);
|
||||
list_add_tail_rcu(&ops->list, &net->rules_ops);
|
||||
err = 0;
|
||||
errout:
|
||||
spin_unlock(&net->rules_mod_lock);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
struct fib_rules_ops *
|
||||
fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
|
||||
{
|
||||
struct fib_rules_ops *ops;
|
||||
int err;
|
||||
|
||||
ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
|
||||
if (ops == NULL)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
INIT_LIST_HEAD(&ops->rules_list);
|
||||
ops->fro_net = net;
|
||||
|
||||
err = __fib_rules_register(ops);
|
||||
if (err) {
|
||||
kfree(ops);
|
||||
ops = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return ops;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(fib_rules_register);
|
||||
|
||||
static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
|
||||
{
|
||||
struct fib_rule *rule, *tmp;
|
||||
|
||||
list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
|
||||
list_del_rcu(&rule->list);
|
||||
if (ops->delete)
|
||||
ops->delete(rule);
|
||||
fib_rule_put(rule);
|
||||
}
|
||||
}
|
||||
|
||||
static void fib_rules_put_rcu(struct rcu_head *head)
|
||||
{
|
||||
struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
|
||||
struct net *net = ops->fro_net;
|
||||
|
||||
release_net(net);
|
||||
kfree(ops);
|
||||
}
|
||||
|
||||
void fib_rules_unregister(struct fib_rules_ops *ops)
|
||||
{
|
||||
struct net *net = ops->fro_net;
|
||||
|
||||
spin_lock(&net->rules_mod_lock);
|
||||
list_del_rcu(&ops->list);
|
||||
fib_rules_cleanup_ops(ops);
|
||||
spin_unlock(&net->rules_mod_lock);
|
||||
|
||||
call_rcu(&ops->rcu, fib_rules_put_rcu);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(fib_rules_unregister);
|
||||
|
||||
static inline kuid_t fib_nl_uid(struct nlattr *nla)
|
||||
{
|
||||
return make_kuid(current_user_ns(), nla_get_u32(nla));
|
||||
}
|
||||
|
||||
static int nla_put_uid(struct sk_buff *skb, int idx, kuid_t uid)
|
||||
{
|
||||
return nla_put_u32(skb, idx, from_kuid_munged(current_user_ns(), uid));
|
||||
}
|
||||
|
||||
static int fib_uid_range_match(struct flowi *fl, struct fib_rule *rule)
|
||||
{
|
||||
return (!uid_valid(rule->uid_start) && !uid_valid(rule->uid_end)) ||
|
||||
(uid_gte(fl->flowi_uid, rule->uid_start) &&
|
||||
uid_lte(fl->flowi_uid, rule->uid_end));
|
||||
}
|
||||
|
||||
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
|
||||
struct flowi *fl, int flags)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
|
||||
goto out;
|
||||
|
||||
if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
|
||||
goto out;
|
||||
|
||||
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
|
||||
goto out;
|
||||
|
||||
if (!fib_uid_range_match(fl, rule))
|
||||
goto out;
|
||||
|
||||
ret = ops->match(rule, fl, flags);
|
||||
out:
|
||||
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
|
||||
}
|
||||
|
||||
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
|
||||
int flags, struct fib_lookup_arg *arg)
|
||||
{
|
||||
struct fib_rule *rule;
|
||||
int err;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
|
||||
jumped:
|
||||
if (!fib_rule_match(rule, ops, fl, flags))
|
||||
continue;
|
||||
|
||||
if (rule->action == FR_ACT_GOTO) {
|
||||
struct fib_rule *target;
|
||||
|
||||
target = rcu_dereference(rule->ctarget);
|
||||
if (target == NULL) {
|
||||
continue;
|
||||
} else {
|
||||
rule = target;
|
||||
goto jumped;
|
||||
}
|
||||
} else if (rule->action == FR_ACT_NOP)
|
||||
continue;
|
||||
else
|
||||
err = ops->action(rule, fl, flags, arg);
|
||||
|
||||
if (!err && ops->suppress && ops->suppress(rule, arg))
|
||||
continue;
|
||||
|
||||
if (err != -EAGAIN) {
|
||||
if ((arg->flags & FIB_LOOKUP_NOREF) ||
|
||||
likely(atomic_inc_not_zero(&rule->refcnt))) {
|
||||
arg->rule = rule;
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
err = -ESRCH;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(fib_rules_lookup);
|
||||
|
||||
static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
|
||||
struct fib_rules_ops *ops)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
|
||||
if (frh->src_len)
|
||||
if (tb[FRA_SRC] == NULL ||
|
||||
frh->src_len > (ops->addr_size * 8) ||
|
||||
nla_len(tb[FRA_SRC]) != ops->addr_size)
|
||||
goto errout;
|
||||
|
||||
if (frh->dst_len)
|
||||
if (tb[FRA_DST] == NULL ||
|
||||
frh->dst_len > (ops->addr_size * 8) ||
|
||||
nla_len(tb[FRA_DST]) != ops->addr_size)
|
||||
goto errout;
|
||||
|
||||
err = 0;
|
||||
errout:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
|
||||
{
|
||||
struct net *net = sock_net(skb->sk);
|
||||
struct fib_rule_hdr *frh = nlmsg_data(nlh);
|
||||
struct fib_rules_ops *ops = NULL;
|
||||
struct fib_rule *rule, *r, *last = NULL;
|
||||
struct nlattr *tb[FRA_MAX+1];
|
||||
int err = -EINVAL, unresolved = 0;
|
||||
|
||||
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
|
||||
goto errout;
|
||||
|
||||
ops = lookup_rules_ops(net, frh->family);
|
||||
if (ops == NULL) {
|
||||
err = -EAFNOSUPPORT;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
err = validate_rulemsg(frh, tb, ops);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
rule = kzalloc(ops->rule_size, GFP_KERNEL);
|
||||
if (rule == NULL) {
|
||||
err = -ENOMEM;
|
||||
goto errout;
|
||||
}
|
||||
rule->fr_net = hold_net(net);
|
||||
|
||||
if (tb[FRA_PRIORITY])
|
||||
rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
|
||||
|
||||
if (tb[FRA_IIFNAME]) {
|
||||
struct net_device *dev;
|
||||
|
||||
rule->iifindex = -1;
|
||||
nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
|
||||
dev = __dev_get_by_name(net, rule->iifname);
|
||||
if (dev)
|
||||
rule->iifindex = dev->ifindex;
|
||||
}
|
||||
|
||||
if (tb[FRA_OIFNAME]) {
|
||||
struct net_device *dev;
|
||||
|
||||
rule->oifindex = -1;
|
||||
nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
|
||||
dev = __dev_get_by_name(net, rule->oifname);
|
||||
if (dev)
|
||||
rule->oifindex = dev->ifindex;
|
||||
}
|
||||
|
||||
if (tb[FRA_FWMARK]) {
|
||||
rule->mark = nla_get_u32(tb[FRA_FWMARK]);
|
||||
if (rule->mark)
|
||||
/* compatibility: if the mark value is non-zero all bits
|
||||
* are compared unless a mask is explicitly specified.
|
||||
*/
|
||||
rule->mark_mask = 0xFFFFFFFF;
|
||||
}
|
||||
|
||||
if (tb[FRA_FWMASK])
|
||||
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
|
||||
|
||||
rule->action = frh->action;
|
||||
rule->flags = frh->flags;
|
||||
rule->table = frh_get_table(frh, tb);
|
||||
if (tb[FRA_SUPPRESS_PREFIXLEN])
|
||||
rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
|
||||
else
|
||||
rule->suppress_prefixlen = -1;
|
||||
|
||||
if (tb[FRA_SUPPRESS_IFGROUP])
|
||||
rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
|
||||
else
|
||||
rule->suppress_ifgroup = -1;
|
||||
|
||||
if (!tb[FRA_PRIORITY] && ops->default_pref)
|
||||
rule->pref = ops->default_pref(ops);
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[FRA_GOTO]) {
|
||||
if (rule->action != FR_ACT_GOTO)
|
||||
goto errout_free;
|
||||
|
||||
rule->target = nla_get_u32(tb[FRA_GOTO]);
|
||||
/* Backward jumps are prohibited to avoid endless loops */
|
||||
if (rule->target <= rule->pref)
|
||||
goto errout_free;
|
||||
|
||||
list_for_each_entry(r, &ops->rules_list, list) {
|
||||
if (r->pref == rule->target) {
|
||||
RCU_INIT_POINTER(rule->ctarget, r);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
|
||||
unresolved = 1;
|
||||
} else if (rule->action == FR_ACT_GOTO)
|
||||
goto errout_free;
|
||||
|
||||
/* UID start and end must either both be valid or both unspecified. */
|
||||
rule->uid_start = rule->uid_end = INVALID_UID;
|
||||
if (tb[FRA_UID_START] || tb[FRA_UID_END]) {
|
||||
if (tb[FRA_UID_START] && tb[FRA_UID_END]) {
|
||||
rule->uid_start = fib_nl_uid(tb[FRA_UID_START]);
|
||||
rule->uid_end = fib_nl_uid(tb[FRA_UID_END]);
|
||||
}
|
||||
if (!uid_valid(rule->uid_start) ||
|
||||
!uid_valid(rule->uid_end) ||
|
||||
!uid_lte(rule->uid_start, rule->uid_end))
|
||||
goto errout_free;
|
||||
}
|
||||
|
||||
err = ops->configure(rule, skb, frh, tb);
|
||||
if (err < 0)
|
||||
goto errout_free;
|
||||
|
||||
list_for_each_entry(r, &ops->rules_list, list) {
|
||||
if (r->pref > rule->pref)
|
||||
break;
|
||||
last = r;
|
||||
}
|
||||
|
||||
fib_rule_get(rule);
|
||||
|
||||
if (last)
|
||||
list_add_rcu(&rule->list, &last->list);
|
||||
else
|
||||
list_add_rcu(&rule->list, &ops->rules_list);
|
||||
|
||||
if (ops->unresolved_rules) {
|
||||
/*
|
||||
* There are unresolved goto rules in the list, check if
|
||||
* any of them are pointing to this new rule.
|
||||
*/
|
||||
list_for_each_entry(r, &ops->rules_list, list) {
|
||||
if (r->action == FR_ACT_GOTO &&
|
||||
r->target == rule->pref &&
|
||||
rtnl_dereference(r->ctarget) == NULL) {
|
||||
rcu_assign_pointer(r->ctarget, rule);
|
||||
if (--ops->unresolved_rules == 0)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (rule->action == FR_ACT_GOTO)
|
||||
ops->nr_goto_rules++;
|
||||
|
||||
if (unresolved)
|
||||
ops->unresolved_rules++;
|
||||
|
||||
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
|
||||
flush_route_cache(ops);
|
||||
rules_ops_put(ops);
|
||||
return 0;
|
||||
|
||||
errout_free:
|
||||
release_net(rule->fr_net);
|
||||
kfree(rule);
|
||||
errout:
|
||||
rules_ops_put(ops);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
|
||||
{
|
||||
struct net *net = sock_net(skb->sk);
|
||||
struct fib_rule_hdr *frh = nlmsg_data(nlh);
|
||||
struct fib_rules_ops *ops = NULL;
|
||||
struct fib_rule *rule, *tmp;
|
||||
struct nlattr *tb[FRA_MAX+1];
|
||||
int err = -EINVAL;
|
||||
|
||||
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
|
||||
goto errout;
|
||||
|
||||
ops = lookup_rules_ops(net, frh->family);
|
||||
if (ops == NULL) {
|
||||
err = -EAFNOSUPPORT;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
err = validate_rulemsg(frh, tb, ops);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
list_for_each_entry(rule, &ops->rules_list, list) {
|
||||
if (frh->action && (frh->action != rule->action))
|
||||
continue;
|
||||
|
||||
if (frh_get_table(frh, tb) &&
|
||||
(frh_get_table(frh, tb) != rule->table))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_PRIORITY] &&
|
||||
(rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_IIFNAME] &&
|
||||
nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_OIFNAME] &&
|
||||
nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_FWMARK] &&
|
||||
(rule->mark != nla_get_u32(tb[FRA_FWMARK])))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_FWMASK] &&
|
||||
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_UID_START] &&
|
||||
!uid_eq(rule->uid_start, fib_nl_uid(tb[FRA_UID_START])))
|
||||
continue;
|
||||
|
||||
if (tb[FRA_UID_END] &&
|
||||
!uid_eq(rule->uid_end, fib_nl_uid(tb[FRA_UID_END])))
|
||||
continue;
|
||||
|
||||
if (!ops->compare(rule, frh, tb))
|
||||
continue;
|
||||
|
||||
if (rule->flags & FIB_RULE_PERMANENT) {
|
||||
err = -EPERM;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
list_del_rcu(&rule->list);
|
||||
|
||||
if (rule->action == FR_ACT_GOTO) {
|
||||
ops->nr_goto_rules--;
|
||||
if (rtnl_dereference(rule->ctarget) == NULL)
|
||||
ops->unresolved_rules--;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if this rule is a target to any of them. If so,
|
||||
* disable them. As this operation is eventually very
|
||||
* expensive, it is only performed if goto rules have
|
||||
* actually been added.
|
||||
*/
|
||||
if (ops->nr_goto_rules > 0) {
|
||||
list_for_each_entry(tmp, &ops->rules_list, list) {
|
||||
if (rtnl_dereference(tmp->ctarget) == rule) {
|
||||
RCU_INIT_POINTER(tmp->ctarget, NULL);
|
||||
ops->unresolved_rules++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
|
||||
NETLINK_CB(skb).portid);
|
||||
if (ops->delete)
|
||||
ops->delete(rule);
|
||||
fib_rule_put(rule);
|
||||
flush_route_cache(ops);
|
||||
rules_ops_put(ops);
|
||||
return 0;
|
||||
}
|
||||
|
||||
err = -ENOENT;
|
||||
errout:
|
||||
rules_ops_put(ops);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
|
||||
struct fib_rule *rule)
|
||||
{
|
||||
size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
|
||||
+ nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
|
||||
+ nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
|
||||
+ nla_total_size(4) /* FRA_PRIORITY */
|
||||
+ nla_total_size(4) /* FRA_TABLE */
|
||||
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
|
||||
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
|
||||
+ nla_total_size(4) /* FRA_FWMARK */
|
||||
+ nla_total_size(4) /* FRA_FWMASK */
|
||||
+ nla_total_size(4) /* FRA_UID_START */
|
||||
+ nla_total_size(4); /* FRA_UID_END */
|
||||
|
||||
if (ops->nlmsg_payload)
|
||||
payload += ops->nlmsg_payload(rule);
|
||||
|
||||
return payload;
|
||||
}
|
||||
|
||||
static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
|
||||
u32 pid, u32 seq, int type, int flags,
|
||||
struct fib_rules_ops *ops)
|
||||
{
|
||||
struct nlmsghdr *nlh;
|
||||
struct fib_rule_hdr *frh;
|
||||
|
||||
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
|
||||
if (nlh == NULL)
|
||||
return -EMSGSIZE;
|
||||
|
||||
frh = nlmsg_data(nlh);
|
||||
frh->family = ops->family;
|
||||
frh->table = rule->table;
|
||||
if (nla_put_u32(skb, FRA_TABLE, rule->table))
|
||||
goto nla_put_failure;
|
||||
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
|
||||
goto nla_put_failure;
|
||||
frh->res1 = 0;
|
||||
frh->res2 = 0;
|
||||
frh->action = rule->action;
|
||||
frh->flags = rule->flags;
|
||||
|
||||
if (rule->action == FR_ACT_GOTO &&
|
||||
rcu_access_pointer(rule->ctarget) == NULL)
|
||||
frh->flags |= FIB_RULE_UNRESOLVED;
|
||||
|
||||
if (rule->iifname[0]) {
|
||||
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
|
||||
goto nla_put_failure;
|
||||
if (rule->iifindex == -1)
|
||||
frh->flags |= FIB_RULE_IIF_DETACHED;
|
||||
}
|
||||
|
||||
if (rule->oifname[0]) {
|
||||
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
|
||||
goto nla_put_failure;
|
||||
if (rule->oifindex == -1)
|
||||
frh->flags |= FIB_RULE_OIF_DETACHED;
|
||||
}
|
||||
|
||||
if ((rule->pref &&
|
||||
nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
|
||||
(rule->mark &&
|
||||
nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
|
||||
((rule->mark_mask || rule->mark) &&
|
||||
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
|
||||
(rule->target &&
|
||||
nla_put_u32(skb, FRA_GOTO, rule->target)) ||
|
||||
(uid_valid(rule->uid_start) &&
|
||||
nla_put_uid(skb, FRA_UID_START, rule->uid_start)) ||
|
||||
(uid_valid(rule->uid_end) &&
|
||||
nla_put_uid(skb, FRA_UID_END, rule->uid_end)))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (rule->suppress_ifgroup != -1) {
|
||||
if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
if (ops->fill(rule, skb, frh) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return nlmsg_end(skb, nlh);
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_cancel(skb, nlh);
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
|
||||
struct fib_rules_ops *ops)
|
||||
{
|
||||
int idx = 0;
|
||||
struct fib_rule *rule;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
|
||||
if (idx < cb->args[1])
|
||||
goto skip;
|
||||
|
||||
if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
|
||||
cb->nlh->nlmsg_seq, RTM_NEWRULE,
|
||||
NLM_F_MULTI, ops) < 0)
|
||||
break;
|
||||
skip:
|
||||
idx++;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
cb->args[1] = idx;
|
||||
rules_ops_put(ops);
|
||||
|
||||
return skb->len;
|
||||
}
|
||||
|
||||
static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
|
||||
{
|
||||
struct net *net = sock_net(skb->sk);
|
||||
struct fib_rules_ops *ops;
|
||||
int idx = 0, family;
|
||||
|
||||
family = rtnl_msg_family(cb->nlh);
|
||||
if (family != AF_UNSPEC) {
|
||||
/* Protocol specific dump request */
|
||||
ops = lookup_rules_ops(net, family);
|
||||
if (ops == NULL)
|
||||
return -EAFNOSUPPORT;
|
||||
|
||||
return dump_rules(skb, cb, ops);
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
|
||||
if (idx < cb->args[0] || !try_module_get(ops->owner))
|
||||
goto skip;
|
||||
|
||||
if (dump_rules(skb, cb, ops) < 0)
|
||||
break;
|
||||
|
||||
cb->args[1] = 0;
|
||||
skip:
|
||||
idx++;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
cb->args[0] = idx;
|
||||
|
||||
return skb->len;
|
||||
}
|
||||
|
||||
static void notify_rule_change(int event, struct fib_rule *rule,
|
||||
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
|
||||
u32 pid)
|
||||
{
|
||||
struct net *net;
|
||||
struct sk_buff *skb;
|
||||
int err = -ENOBUFS;
|
||||
|
||||
net = ops->fro_net;
|
||||
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
|
||||
if (skb == NULL)
|
||||
goto errout;
|
||||
|
||||
err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
|
||||
if (err < 0) {
|
||||
/* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
|
||||
WARN_ON(err == -EMSGSIZE);
|
||||
kfree_skb(skb);
|
||||
goto errout;
|
||||
}
|
||||
|
||||
rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
|
||||
return;
|
||||
errout:
|
||||
if (err < 0)
|
||||
rtnl_set_sk_err(net, ops->nlgroup, err);
|
||||
}
|
||||
|
||||
static void attach_rules(struct list_head *rules, struct net_device *dev)
|
||||
{
|
||||
struct fib_rule *rule;
|
||||
|
||||
list_for_each_entry(rule, rules, list) {
|
||||
if (rule->iifindex == -1 &&
|
||||
strcmp(dev->name, rule->iifname) == 0)
|
||||
rule->iifindex = dev->ifindex;
|
||||
if (rule->oifindex == -1 &&
|
||||
strcmp(dev->name, rule->oifname) == 0)
|
||||
rule->oifindex = dev->ifindex;
|
||||
}
|
||||
}
|
||||
|
||||
static void detach_rules(struct list_head *rules, struct net_device *dev)
|
||||
{
|
||||
struct fib_rule *rule;
|
||||
|
||||
list_for_each_entry(rule, rules, list) {
|
||||
if (rule->iifindex == dev->ifindex)
|
||||
rule->iifindex = -1;
|
||||
if (rule->oifindex == dev->ifindex)
|
||||
rule->oifindex = -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int fib_rules_event(struct notifier_block *this, unsigned long event,
|
||||
void *ptr)
|
||||
{
|
||||
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||
struct net *net = dev_net(dev);
|
||||
struct fib_rules_ops *ops;
|
||||
|
||||
ASSERT_RTNL();
|
||||
|
||||
switch (event) {
|
||||
case NETDEV_REGISTER:
|
||||
list_for_each_entry(ops, &net->rules_ops, list)
|
||||
attach_rules(&ops->rules_list, dev);
|
||||
break;
|
||||
|
||||
case NETDEV_CHANGENAME:
|
||||
list_for_each_entry(ops, &net->rules_ops, list) {
|
||||
detach_rules(&ops->rules_list, dev);
|
||||
attach_rules(&ops->rules_list, dev);
|
||||
}
|
||||
break;
|
||||
|
||||
case NETDEV_UNREGISTER:
|
||||
list_for_each_entry(ops, &net->rules_ops, list)
|
||||
detach_rules(&ops->rules_list, dev);
|
||||
break;
|
||||
}
|
||||
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static struct notifier_block fib_rules_notifier = {
|
||||
.notifier_call = fib_rules_event,
|
||||
};
|
||||
|
||||
static int __net_init fib_rules_net_init(struct net *net)
|
||||
{
|
||||
INIT_LIST_HEAD(&net->rules_ops);
|
||||
spin_lock_init(&net->rules_mod_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct pernet_operations fib_rules_net_ops = {
|
||||
.init = fib_rules_net_init,
|
||||
};
|
||||
|
||||
static int __init fib_rules_init(void)
|
||||
{
|
||||
int err;
|
||||
rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
|
||||
rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
|
||||
rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
|
||||
|
||||
err = register_pernet_subsys(&fib_rules_net_ops);
|
||||
if (err < 0)
|
||||
goto fail;
|
||||
|
||||
err = register_netdevice_notifier(&fib_rules_notifier);
|
||||
if (err < 0)
|
||||
goto fail_unregister;
|
||||
|
||||
return 0;
|
||||
|
||||
fail_unregister:
|
||||
unregister_pernet_subsys(&fib_rules_net_ops);
|
||||
fail:
|
||||
rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
|
||||
rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
|
||||
rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
|
||||
return err;
|
||||
}
|
||||
|
||||
subsys_initcall(fib_rules_init);
|
1149
net/core/filter.c
Normal file
File diff suppressed because it is too large
511
net/core/flow.c
Normal file
@@ -0,0 +1,511 @@
/* flow.c: Generic flow cache.
|
||||
*
|
||||
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
|
||||
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/completion.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <net/flow.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/security.h>
|
||||
#include <net/net_namespace.h>
|
||||
|
||||
struct flow_cache_entry {
|
||||
union {
|
||||
struct hlist_node hlist;
|
||||
struct list_head gc_list;
|
||||
} u;
|
||||
struct net *net;
|
||||
u16 family;
|
||||
u8 dir;
|
||||
u32 genid;
|
||||
struct flowi key;
|
||||
struct flow_cache_object *object;
|
||||
};
|
||||
|
||||
struct flow_flush_info {
|
||||
struct flow_cache *cache;
|
||||
atomic_t cpuleft;
|
||||
struct completion completion;
|
||||
};
|
||||
|
||||
static struct kmem_cache *flow_cachep __read_mostly;
|
||||
|
||||
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
|
||||
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
|
||||
|
||||
static void flow_cache_new_hashrnd(unsigned long arg)
|
||||
{
|
||||
struct flow_cache *fc = (void *) arg;
|
||||
int i;
|
||||
|
||||
for_each_possible_cpu(i)
|
||||
per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
|
||||
|
||||
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
|
||||
add_timer(&fc->rnd_timer);
|
||||
}
|
||||
|
||||
static int flow_entry_valid(struct flow_cache_entry *fle,
|
||||
struct netns_xfrm *xfrm)
|
||||
{
|
||||
if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
|
||||
return 0;
|
||||
if (fle->object && !fle->object->ops->check(fle->object))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void flow_entry_kill(struct flow_cache_entry *fle,
|
||||
struct netns_xfrm *xfrm)
|
||||
{
|
||||
if (fle->object)
|
||||
fle->object->ops->delete(fle->object);
|
||||
kmem_cache_free(flow_cachep, fle);
|
||||
}
|
||||
|
||||
static void flow_cache_gc_task(struct work_struct *work)
|
||||
{
|
||||
struct list_head gc_list;
|
||||
struct flow_cache_entry *fce, *n;
|
||||
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
|
||||
flow_cache_gc_work);
|
||||
|
||||
INIT_LIST_HEAD(&gc_list);
|
||||
spin_lock_bh(&xfrm->flow_cache_gc_lock);
|
||||
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
|
||||
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
|
||||
|
||||
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
|
||||
flow_entry_kill(fce, xfrm);
|
||||
}
|
||||
|
||||
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
|
||||
int deleted, struct list_head *gc_list,
|
||||
struct netns_xfrm *xfrm)
|
||||
{
|
||||
if (deleted) {
|
||||
fcp->hash_count -= deleted;
|
||||
spin_lock_bh(&xfrm->flow_cache_gc_lock);
|
||||
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
|
||||
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
|
||||
schedule_work(&xfrm->flow_cache_gc_work);
|
||||
}
|
||||
}
|
||||
|
||||
static void __flow_cache_shrink(struct flow_cache *fc,
|
||||
struct flow_cache_percpu *fcp,
|
||||
int shrink_to)
|
||||
{
|
||||
struct flow_cache_entry *fle;
|
||||
struct hlist_node *tmp;
|
||||
LIST_HEAD(gc_list);
|
||||
int i, deleted = 0;
|
||||
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
|
||||
flow_cache_global);
|
||||
|
||||
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
||||
int saved = 0;
|
||||
|
||||
hlist_for_each_entry_safe(fle, tmp,
|
||||
&fcp->hash_table[i], u.hlist) {
|
||||
if (saved < shrink_to &&
|
||||
flow_entry_valid(fle, xfrm)) {
|
||||
saved++;
|
||||
} else {
|
||||
deleted++;
|
||||
hlist_del(&fle->u.hlist);
|
||||
list_add_tail(&fle->u.gc_list, &gc_list);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
|
||||
}
|
||||
|
||||
static void flow_cache_shrink(struct flow_cache *fc,
|
||||
struct flow_cache_percpu *fcp)
|
||||
{
|
||||
int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
|
||||
|
||||
__flow_cache_shrink(fc, fcp, shrink_to);
|
||||
}
|
||||
|
||||
static void flow_new_hash_rnd(struct flow_cache *fc,
|
||||
struct flow_cache_percpu *fcp)
|
||||
{
|
||||
get_random_bytes(&fcp->hash_rnd, sizeof(u32));
|
||||
fcp->hash_rnd_recalc = 0;
|
||||
__flow_cache_shrink(fc, fcp, 0);
|
||||
}
|
||||
|
||||
static u32 flow_hash_code(struct flow_cache *fc,
|
||||
struct flow_cache_percpu *fcp,
|
||||
const struct flowi *key,
|
||||
size_t keysize)
|
||||
{
|
||||
const u32 *k = (const u32 *) key;
|
||||
const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
|
||||
|
||||
return jhash2(k, length, fcp->hash_rnd)
|
||||
& (flow_cache_hash_size(fc) - 1);
|
||||
}
|
||||
|
||||
/* I hear what you're saying, use memcmp. But memcmp cannot make
|
||||
* important assumptions that we can here, such as alignment.
|
||||
*/
|
||||
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
|
||||
size_t keysize)
|
||||
{
|
||||
const flow_compare_t *k1, *k1_lim, *k2;
|
||||
|
||||
k1 = (const flow_compare_t *) key1;
|
||||
k1_lim = k1 + keysize;
|
||||
|
||||
k2 = (const flow_compare_t *) key2;
|
||||
|
||||
do {
|
||||
if (*k1++ != *k2++)
|
||||
return 1;
|
||||
} while (k1 < k1_lim);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct flow_cache_object *
|
||||
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
|
||||
flow_resolve_t resolver, void *ctx)
|
||||
{
|
||||
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
||||
struct flow_cache_percpu *fcp;
|
||||
struct flow_cache_entry *fle, *tfle;
|
||||
struct flow_cache_object *flo;
|
||||
size_t keysize;
|
||||
unsigned int hash;
|
||||
|
||||
local_bh_disable();
|
||||
fcp = this_cpu_ptr(fc->percpu);
|
||||
|
||||
fle = NULL;
|
||||
flo = NULL;
|
||||
|
||||
keysize = flow_key_size(family);
|
||||
if (!keysize)
|
||||
goto nocache;
|
||||
|
||||
/* Packet really early in init? Making flow_cache_init a
|
||||
* pre-smp initcall would solve this. --RR */
|
||||
if (!fcp->hash_table)
|
||||
goto nocache;
|
||||
|
||||
if (fcp->hash_rnd_recalc)
|
||||
flow_new_hash_rnd(fc, fcp);
|
||||
|
||||
hash = flow_hash_code(fc, fcp, key, keysize);
|
||||
hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
|
||||
if (tfle->net == net &&
|
||||
tfle->family == family &&
|
||||
tfle->dir == dir &&
|
||||
flow_key_compare(key, &tfle->key, keysize) == 0) {
|
||||
fle = tfle;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(!fle)) {
|
||||
if (fcp->hash_count > fc->high_watermark)
|
||||
flow_cache_shrink(fc, fcp);
|
||||
|
||||
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
|
||||
if (fle) {
|
||||
fle->net = net;
|
||||
fle->family = family;
|
||||
fle->dir = dir;
|
||||
memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
|
||||
fle->object = NULL;
|
||||
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
|
||||
fcp->hash_count++;
|
||||
}
|
||||
} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
|
||||
flo = fle->object;
|
||||
if (!flo)
|
||||
goto ret_object;
|
||||
flo = flo->ops->get(flo);
|
||||
if (flo)
|
||||
goto ret_object;
|
||||
} else if (fle->object) {
|
||||
flo = fle->object;
|
||||
flo->ops->delete(flo);
|
||||
fle->object = NULL;
|
||||
}
|
||||
|
||||
nocache:
|
||||
flo = NULL;
|
||||
if (fle) {
|
||||
flo = fle->object;
|
||||
fle->object = NULL;
|
||||
}
|
||||
flo = resolver(net, key, family, dir, flo, ctx);
|
||||
if (fle) {
|
||||
fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
|
||||
if (!IS_ERR(flo))
|
||||
fle->object = flo;
|
||||
else
|
||||
fle->genid--;
|
||||
} else {
|
||||
if (!IS_ERR_OR_NULL(flo))
|
||||
flo->ops->delete(flo);
|
||||
}
|
||||
ret_object:
|
||||
local_bh_enable();
|
||||
return flo;
|
||||
}
|
||||
EXPORT_SYMBOL(flow_cache_lookup);
|
||||
|
||||
static void flow_cache_flush_tasklet(unsigned long data)
|
||||
{
|
||||
struct flow_flush_info *info = (void *)data;
|
||||
struct flow_cache *fc = info->cache;
|
||||
struct flow_cache_percpu *fcp;
|
||||
struct flow_cache_entry *fle;
|
||||
struct hlist_node *tmp;
|
||||
LIST_HEAD(gc_list);
|
||||
int i, deleted = 0;
|
||||
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
|
||||
flow_cache_global);
|
||||
|
||||
fcp = this_cpu_ptr(fc->percpu);
|
||||
for (i = 0; i < flow_cache_hash_size(fc); i++) {
|
||||
hlist_for_each_entry_safe(fle, tmp,
|
||||
&fcp->hash_table[i], u.hlist) {
|
||||
if (flow_entry_valid(fle, xfrm))
|
||||
continue;
|
||||
|
||||
deleted++;
|
||||
hlist_del(&fle->u.hlist);
|
||||
list_add_tail(&fle->u.gc_list, &gc_list);
|
||||
}
|
||||
}
|
||||
|
||||
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
|
||||
|
||||
if (atomic_dec_and_test(&info->cpuleft))
|
||||
complete(&info->completion);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return whether a cpu needs flushing. Conservatively, we assume
|
||||
* the presence of any entries means the core may require flushing,
|
||||
* since the flow_cache_ops.check() function may assume it's running
|
||||
* on the same core as the per-cpu cache component.
|
||||
*/
|
||||
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
|
||||
{
|
||||
struct flow_cache_percpu *fcp;
|
||||
int i;
|
||||
|
||||
fcp = per_cpu_ptr(fc->percpu, cpu);
|
||||
for (i = 0; i < flow_cache_hash_size(fc); i++)
|
||||
if (!hlist_empty(&fcp->hash_table[i]))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void flow_cache_flush_per_cpu(void *data)
|
||||
{
|
||||
struct flow_flush_info *info = data;
|
||||
struct tasklet_struct *tasklet;
|
||||
|
||||
tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
|
||||
tasklet->data = (unsigned long)info;
|
||||
tasklet_schedule(tasklet);
|
||||
}
|
||||
|
||||
void flow_cache_flush(struct net *net)
|
||||
{
|
||||
struct flow_flush_info info;
|
||||
cpumask_var_t mask;
|
||||
int i, self;
|
||||
|
||||
/* Track which cpus need flushing to avoid disturbing all cores. */
|
||||
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
|
||||
return;
|
||||
cpumask_clear(mask);
|
||||
|
||||
/* Don't want cpus going down or up during this. */
|
||||
get_online_cpus();
|
||||
mutex_lock(&net->xfrm.flow_flush_sem);
|
||||
info.cache = &net->xfrm.flow_cache_global;
|
||||
for_each_online_cpu(i)
|
||||
if (!flow_cache_percpu_empty(info.cache, i))
|
||||
cpumask_set_cpu(i, mask);
|
||||
atomic_set(&info.cpuleft, cpumask_weight(mask));
|
||||
if (atomic_read(&info.cpuleft) == 0)
|
||||
goto done;
|
||||
|
||||
init_completion(&info.completion);
|
||||
|
||||
local_bh_disable();
|
||||
self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
|
||||
on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
|
||||
if (self)
|
||||
flow_cache_flush_tasklet((unsigned long)&info);
|
||||
local_bh_enable();
|
||||
|
||||
wait_for_completion(&info.completion);
|
||||
|
||||
done:
|
||||
mutex_unlock(&net->xfrm.flow_flush_sem);
|
||||
put_online_cpus();
|
||||
free_cpumask_var(mask);
|
||||
}
|
||||
|
||||
static void flow_cache_flush_task(struct work_struct *work)
|
||||
{
|
||||
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
|
||||
flow_cache_flush_work);
|
||||
struct net *net = container_of(xfrm, struct net, xfrm);
|
||||
|
||||
flow_cache_flush(net);
|
||||
}
|
||||
|
||||
void flow_cache_flush_deferred(struct net *net)
|
||||
{
|
||||
schedule_work(&net->xfrm.flow_cache_flush_work);
|
||||
}
|
||||
|
||||
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
|
||||
{
|
||||
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
|
||||
size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
|
||||
|
||||
if (!fcp->hash_table) {
|
||||
fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
|
||||
if (!fcp->hash_table) {
|
||||
pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
|
||||
return -ENOMEM;
|
||||
}
|
||||
fcp->hash_rnd_recalc = 1;
|
||||
fcp->hash_count = 0;
|
||||
tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int flow_cache_cpu(struct notifier_block *nfb,
|
||||
unsigned long action,
|
||||
void *hcpu)
|
||||
{
|
||||
struct flow_cache *fc = container_of(nfb, struct flow_cache,
|
||||
hotcpu_notifier);
|
||||
int res, cpu = (unsigned long) hcpu;
|
||||
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
|
||||
|
||||
switch (action) {
|
||||
case CPU_UP_PREPARE:
|
||||
case CPU_UP_PREPARE_FROZEN:
|
||||
res = flow_cache_cpu_prepare(fc, cpu);
|
||||
if (res)
|
||||
return notifier_from_errno(res);
|
||||
break;
|
||||
case CPU_DEAD:
|
||||
case CPU_DEAD_FROZEN:
|
||||
__flow_cache_shrink(fc, fcp, 0);
|
||||
break;
|
||||
}
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
int flow_cache_init(struct net *net)
|
||||
{
|
||||
int i;
|
||||
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
||||
|
||||
if (!flow_cachep)
|
||||
flow_cachep = kmem_cache_create("flow_cache",
|
||||
sizeof(struct flow_cache_entry),
|
||||
0, SLAB_PANIC, NULL);
|
||||
spin_lock_init(&net->xfrm.flow_cache_gc_lock);
|
||||
INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
|
||||
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
|
||||
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
|
||||
mutex_init(&net->xfrm.flow_flush_sem);
|
||||
|
||||
fc->hash_shift = 10;
|
||||
fc->low_watermark = 2 * flow_cache_hash_size(fc);
|
||||
fc->high_watermark = 4 * flow_cache_hash_size(fc);
|
||||
|
||||
fc->percpu = alloc_percpu(struct flow_cache_percpu);
|
||||
if (!fc->percpu)
|
||||
return -ENOMEM;
|
||||
|
||||
cpu_notifier_register_begin();
|
||||
|
||||
for_each_online_cpu(i) {
|
||||
if (flow_cache_cpu_prepare(fc, i))
|
||||
goto err;
|
||||
}
|
||||
fc->hotcpu_notifier = (struct notifier_block){
|
||||
.notifier_call = flow_cache_cpu,
|
||||
};
|
||||
__register_hotcpu_notifier(&fc->hotcpu_notifier);
|
||||
|
||||
cpu_notifier_register_done();
|
||||
|
||||
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
|
||||
(unsigned long) fc);
|
||||
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
|
||||
add_timer(&fc->rnd_timer);
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
for_each_possible_cpu(i) {
|
||||
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
|
||||
kfree(fcp->hash_table);
|
||||
fcp->hash_table = NULL;
|
||||
}
|
||||
|
||||
cpu_notifier_register_done();
|
||||
|
||||
free_percpu(fc->percpu);
|
||||
fc->percpu = NULL;
|
||||
|
||||
return -ENOMEM;
|
||||
}
|
||||
EXPORT_SYMBOL(flow_cache_init);
|
||||
|
||||
void flow_cache_fini(struct net *net)
|
||||
{
|
||||
int i;
|
||||
struct flow_cache *fc = &net->xfrm.flow_cache_global;
|
||||
|
||||
del_timer_sync(&fc->rnd_timer);
|
||||
unregister_hotcpu_notifier(&fc->hotcpu_notifier);
|
||||
|
||||
for_each_possible_cpu(i) {
|
||||
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
|
||||
kfree(fcp->hash_table);
|
||||
fcp->hash_table = NULL;
|
||||
}
|
||||
|
||||
free_percpu(fc->percpu);
|
||||
fc->percpu = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(flow_cache_fini);
|
471
net/core/flow_dissector.c
Normal file
@@ -0,0 +1,471 @@
#include <linux/skbuff.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/ipv6.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/sctp.h>
|
||||
#include <linux/dccp.h>
|
||||
#include <linux/if_tunnel.h>
|
||||
#include <linux/if_pppox.h>
|
||||
#include <linux/ppp_defs.h>
|
||||
#include <net/flow_keys.h>
|
||||
#include <scsi/fc/fc_fcoe.h>
|
||||
|
||||
/* copy saddr & daddr, possibly using 64bit load/store
|
||||
* Equivalent to : flow->src = iph->saddr;
|
||||
* flow->dst = iph->daddr;
|
||||
*/
|
||||
static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
|
||||
{
|
||||
BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
|
||||
offsetof(typeof(*flow), src) + sizeof(flow->src));
|
||||
memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
|
||||
}
|
||||
|
||||
/**
|
||||
* __skb_flow_get_ports - extract the upper layer ports and return them
|
||||
* @skb: sk_buff to extract the ports from
|
||||
* @thoff: transport header offset
|
||||
* @ip_proto: protocol for which to get port offset
|
||||
* @data: raw buffer pointer to the packet, if NULL use skb->data
|
||||
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
|
||||
*
|
||||
* The function will try to retrieve the ports at offset thoff + poff where poff
|
||||
* is the protocol port offset returned from proto_ports_offset
|
||||
*/
|
||||
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
|
||||
void *data, int hlen)
|
||||
{
|
||||
int poff = proto_ports_offset(ip_proto);
|
||||
|
||||
if (!data) {
|
||||
data = skb->data;
|
||||
hlen = skb_headlen(skb);
|
||||
}
|
||||
|
||||
if (poff >= 0) {
|
||||
__be32 *ports, _ports;
|
||||
|
||||
ports = __skb_header_pointer(skb, thoff + poff,
|
||||
sizeof(_ports), data, hlen, &_ports);
|
||||
if (ports)
|
||||
return *ports;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_flow_get_ports);
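/*
 * A minimal usage sketch (the caller and helper name are illustrative,
 * assuming a linear skb whose transport header offset is already known):
 * passing data == NULL makes __skb_flow_get_ports() fall back to
 * skb->data and skb_headlen(skb), and the returned __be32 carries the
 * source port in its first two bytes and the destination port in the
 * last two.
 */
static inline void example_get_tcp_ports(const struct sk_buff *skb, int thoff,
					 __be16 *sport, __be16 *dport)
{
	__be32 ports = __skb_flow_get_ports(skb, thoff, IPPROTO_TCP, NULL, 0);
	__be16 p[2];

	memcpy(p, &ports, sizeof(p));	/* source port first, then destination */
	*sport = p[0];
	*dport = p[1];
}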
|
||||
|
||||
/**
|
||||
* __skb_flow_dissect - extract the flow_keys struct and return it
|
||||
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
|
||||
* @data: raw buffer pointer to the packet, if NULL use skb->data
|
||||
* @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
|
||||
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
|
||||
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
|
||||
*
|
||||
* The function will try to retrieve the struct flow_keys from either the skbuff
|
||||
* or a raw buffer specified by the rest parameters
|
||||
*/
|
||||
bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
|
||||
void *data, __be16 proto, int nhoff, int hlen)
|
||||
{
|
||||
u8 ip_proto;
|
||||
|
||||
if (!data) {
|
||||
data = skb->data;
|
||||
proto = skb->protocol;
|
||||
nhoff = skb_network_offset(skb);
|
||||
hlen = skb_headlen(skb);
|
||||
}
|
||||
|
||||
memset(flow, 0, sizeof(*flow));
|
||||
|
||||
again:
|
||||
switch (proto) {
|
||||
case htons(ETH_P_IP): {
|
||||
const struct iphdr *iph;
|
||||
struct iphdr _iph;
|
||||
ip:
|
||||
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
|
||||
if (!iph || iph->ihl < 5)
|
||||
return false;
|
||||
nhoff += iph->ihl * 4;
|
||||
|
||||
ip_proto = iph->protocol;
|
||||
if (ip_is_fragment(iph))
|
||||
ip_proto = 0;
|
||||
|
||||
/* skip the address processing if skb is NULL. The assumption
|
||||
* here is that if there is no skb we are not looking for flow
|
||||
* info but lengths and protocols.
|
||||
*/
|
||||
if (!skb)
|
||||
break;
|
||||
|
||||
iph_to_flow_copy_addrs(flow, iph);
|
||||
break;
|
||||
}
|
||||
case htons(ETH_P_IPV6): {
|
||||
const struct ipv6hdr *iph;
|
||||
struct ipv6hdr _iph;
|
||||
__be32 flow_label;
|
||||
|
||||
ipv6:
|
||||
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
|
||||
if (!iph)
|
||||
return false;
|
||||
|
||||
ip_proto = iph->nexthdr;
|
||||
nhoff += sizeof(struct ipv6hdr);
|
||||
|
||||
/* see comment above in IPv4 section */
|
||||
if (!skb)
|
||||
break;
|
||||
|
||||
flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
|
||||
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
|
||||
|
||||
flow_label = ip6_flowlabel(iph);
|
||||
if (flow_label) {
|
||||
/* Awesome, IPv6 packet has a flow label so we can
|
||||
* use that to represent the ports without any
|
||||
* further dissection.
|
||||
*/
|
||||
flow->n_proto = proto;
|
||||
flow->ip_proto = ip_proto;
|
||||
flow->ports = flow_label;
|
||||
flow->thoff = (u16)nhoff;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case htons(ETH_P_8021AD):
|
||||
case htons(ETH_P_8021Q): {
|
||||
const struct vlan_hdr *vlan;
|
||||
struct vlan_hdr _vlan;
|
||||
|
||||
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
|
||||
if (!vlan)
|
||||
return false;
|
||||
|
||||
proto = vlan->h_vlan_encapsulated_proto;
|
||||
nhoff += sizeof(*vlan);
|
||||
goto again;
|
||||
}
|
||||
case htons(ETH_P_PPP_SES): {
|
||||
struct {
|
||||
struct pppoe_hdr hdr;
|
||||
__be16 proto;
|
||||
} *hdr, _hdr;
|
||||
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
|
||||
if (!hdr)
|
||||
return false;
|
||||
proto = hdr->proto;
|
||||
nhoff += PPPOE_SES_HLEN;
|
||||
switch (proto) {
|
||||
case htons(PPP_IP):
|
||||
goto ip;
|
||||
case htons(PPP_IPV6):
|
||||
goto ipv6;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case htons(ETH_P_FCOE):
|
||||
flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
|
||||
/* fall through */
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (ip_proto) {
|
||||
case IPPROTO_GRE: {
|
||||
struct gre_hdr {
|
||||
__be16 flags;
|
||||
__be16 proto;
|
||||
} *hdr, _hdr;
|
||||
|
||||
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
|
||||
if (!hdr)
|
||||
return false;
|
||||
/*
|
||||
* Only look inside GRE if version zero and no
|
||||
* routing
|
||||
*/
|
||||
if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
|
||||
proto = hdr->proto;
|
||||
nhoff += 4;
|
||||
if (hdr->flags & GRE_CSUM)
|
||||
nhoff += 4;
|
||||
if (hdr->flags & GRE_KEY)
|
||||
nhoff += 4;
|
||||
if (hdr->flags & GRE_SEQ)
|
||||
nhoff += 4;
|
||||
if (proto == htons(ETH_P_TEB)) {
|
||||
const struct ethhdr *eth;
|
||||
struct ethhdr _eth;
|
||||
|
||||
eth = __skb_header_pointer(skb, nhoff,
|
||||
sizeof(_eth),
|
||||
data, hlen, &_eth);
|
||||
if (!eth)
|
||||
return false;
|
||||
proto = eth->h_proto;
|
||||
nhoff += sizeof(*eth);
|
||||
}
|
||||
goto again;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case IPPROTO_IPIP:
|
||||
proto = htons(ETH_P_IP);
|
||||
goto ip;
|
||||
case IPPROTO_IPV6:
|
||||
proto = htons(ETH_P_IPV6);
|
||||
goto ipv6;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
flow->n_proto = proto;
|
||||
flow->ip_proto = ip_proto;
|
||||
flow->thoff = (u16) nhoff;
|
||||
|
||||
/* unless skb is set we don't need to record port info */
|
||||
if (skb)
|
||||
flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
|
||||
data, hlen);
|
||||
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_flow_dissect);
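/*
 * A minimal usage sketch (illustrative caller, assuming the
 * skb_flow_dissect() wrapper that passes data == NULL so skb->data and
 * skb_headlen(skb) are used): dissect an skb and report the recovered
 * 5-tuple fields.
 */
static inline bool example_dissect(const struct sk_buff *skb)
{
	struct flow_keys keys;

	if (!skb_flow_dissect(skb, &keys))
		return false;	/* unsupported protocol or truncated headers */

	pr_debug("proto %04x l4 %u src %pI4 dst %pI4 sport %u dport %u\n",
		 ntohs(keys.n_proto), keys.ip_proto, &keys.src, &keys.dst,
		 ntohs(keys.port16[0]), ntohs(keys.port16[1]));
	return true;
}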
|
||||
|
||||
static u32 hashrnd __read_mostly;
|
||||
static __always_inline void __flow_hash_secret_init(void)
|
||||
{
|
||||
net_get_random_once(&hashrnd, sizeof(hashrnd));
|
||||
}
|
||||
|
||||
static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
|
||||
{
|
||||
__flow_hash_secret_init();
|
||||
return jhash_3words(a, b, c, hashrnd);
|
||||
}
|
||||
|
||||
static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
|
||||
{
|
||||
u32 hash;
|
||||
|
||||
/* get a consistent hash (same value on both flow directions) */
|
||||
if (((__force u32)keys->dst < (__force u32)keys->src) ||
|
||||
(((__force u32)keys->dst == (__force u32)keys->src) &&
|
||||
((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
|
||||
swap(keys->dst, keys->src);
|
||||
swap(keys->port16[0], keys->port16[1]);
|
||||
}
|
||||
|
||||
hash = __flow_hash_3words((__force u32)keys->dst,
|
||||
(__force u32)keys->src,
|
||||
(__force u32)keys->ports);
|
||||
if (!hash)
|
||||
hash = 1;
|
||||
|
||||
return hash;
|
||||
}
|
||||
|
||||
u32 flow_hash_from_keys(struct flow_keys *keys)
|
||||
{
|
||||
return __flow_hash_from_keys(keys);
|
||||
}
|
||||
EXPORT_SYMBOL(flow_hash_from_keys);
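/*
 * An illustrative sketch of the symmetry provided by the canonical
 * ordering above (the helper name and parameters are assumptions): hashing
 * a flow and its reversed direction yields the same value, which keeps
 * both directions of a connection on the same queue or CPU.
 */
static inline bool example_hash_is_symmetric(__be32 saddr, __be32 daddr,
					     __be16 sport, __be16 dport)
{
	struct flow_keys a = {}, b = {};

	a.src = saddr;
	a.dst = daddr;
	a.port16[0] = sport;
	a.port16[1] = dport;

	/* Same flow seen from the opposite direction. */
	b.src = daddr;
	b.dst = saddr;
	b.port16[0] = dport;
	b.port16[1] = sport;

	return flow_hash_from_keys(&a) == flow_hash_from_keys(&b);
}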
|
||||
|
||||
/*
|
||||
* __skb_get_hash: calculate a flow hash based on src/dst addresses
|
||||
* and src/dst port numbers. Sets hash in skb to non-zero hash value
|
||||
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
|
||||
* if hash is a canonical 4-tuple hash over transport ports.
|
||||
*/
|
||||
void __skb_get_hash(struct sk_buff *skb)
|
||||
{
|
||||
struct flow_keys keys;
|
||||
|
||||
if (!skb_flow_dissect(skb, &keys))
|
||||
return;
|
||||
|
||||
if (keys.ports)
|
||||
skb->l4_hash = 1;
|
||||
|
||||
skb->sw_hash = 1;
|
||||
|
||||
skb->hash = __flow_hash_from_keys(&keys);
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_get_hash);
|
||||
|
||||
/*
|
||||
* Returns a Tx hash based on the given packet descriptor and the number of
* Tx queues to be used as a distribution range.
|
||||
*/
|
||||
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
|
||||
unsigned int num_tx_queues)
|
||||
{
|
||||
u32 hash;
|
||||
u16 qoffset = 0;
|
||||
u16 qcount = num_tx_queues;
|
||||
|
||||
if (skb_rx_queue_recorded(skb)) {
|
||||
hash = skb_get_rx_queue(skb);
|
||||
while (unlikely(hash >= num_tx_queues))
|
||||
hash -= num_tx_queues;
|
||||
return hash;
|
||||
}
|
||||
|
||||
if (dev->num_tc) {
|
||||
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
|
||||
qoffset = dev->tc_to_txq[tc].offset;
|
||||
qcount = dev->tc_to_txq[tc].count;
|
||||
}
|
||||
|
||||
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
|
||||
}
|
||||
EXPORT_SYMBOL(__skb_tx_hash);
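/*
 * An illustrative sketch of the distribution step used above (the helper
 * name is an assumption): reciprocal_scale() maps a 32-bit hash onto
 * [0, qcount) with a multiply and a shift instead of a modulo, and the
 * traffic-class offset is added afterwards.
 */
static inline u16 example_pick_queue(u32 hash, u16 qoffset, u16 qcount)
{
	return (u16)reciprocal_scale(hash, qcount) + qoffset;
}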
|
||||
|
||||
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
|
||||
const struct flow_keys *keys, int hlen)
|
||||
{
|
||||
u32 poff = keys->thoff;
|
||||
|
||||
switch (keys->ip_proto) {
|
||||
case IPPROTO_TCP: {
|
||||
/* access doff as u8 to avoid unaligned access */
|
||||
const u8 *doff;
|
||||
u8 _doff;
|
||||
|
||||
doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
|
||||
data, hlen, &_doff);
|
||||
if (!doff)
|
||||
return poff;
|
||||
|
||||
poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
|
||||
break;
|
||||
}
|
||||
case IPPROTO_UDP:
|
||||
case IPPROTO_UDPLITE:
|
||||
poff += sizeof(struct udphdr);
|
||||
break;
|
||||
/* For the rest, we do not really care about header
|
||||
* extensions at this point for now.
|
||||
*/
|
||||
case IPPROTO_ICMP:
|
||||
poff += sizeof(struct icmphdr);
|
||||
break;
|
||||
case IPPROTO_ICMPV6:
|
||||
poff += sizeof(struct icmp6hdr);
|
||||
break;
|
||||
case IPPROTO_IGMP:
|
||||
poff += sizeof(struct igmphdr);
|
||||
break;
|
||||
case IPPROTO_DCCP:
|
||||
poff += sizeof(struct dccp_hdr);
|
||||
break;
|
||||
case IPPROTO_SCTP:
|
||||
poff += sizeof(struct sctphdr);
|
||||
break;
|
||||
}
|
||||
|
||||
return poff;
|
||||
}
|
||||
|
||||
/* skb_get_poff() returns the offset to the payload as far as it could
|
||||
* be dissected. The main user is currently BPF, so that we can dynamically
|
||||
* truncate packets without needing to push actual payload to the user
|
||||
* space and can analyze headers only, instead.
|
||||
*/
|
||||
u32 skb_get_poff(const struct sk_buff *skb)
|
||||
{
|
||||
struct flow_keys keys;
|
||||
|
||||
if (!skb_flow_dissect(skb, &keys))
|
||||
return 0;
|
||||
|
||||
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
|
||||
}
|
||||
|
||||
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
|
||||
{
|
||||
#ifdef CONFIG_XPS
|
||||
struct xps_dev_maps *dev_maps;
|
||||
struct xps_map *map;
|
||||
int queue_index = -1;
|
||||
|
||||
rcu_read_lock();
|
||||
dev_maps = rcu_dereference(dev->xps_maps);
|
||||
if (dev_maps) {
|
||||
map = rcu_dereference(
|
||||
dev_maps->cpu_map[raw_smp_processor_id()]);
|
||||
if (map) {
|
||||
if (map->len == 1)
|
||||
queue_index = map->queues[0];
|
||||
else
|
||||
queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
|
||||
map->len)];
|
||||
if (unlikely(queue_index >= dev->real_num_tx_queues))
|
||||
queue_index = -1;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return queue_index;
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
|
||||
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
|
||||
{
|
||||
struct sock *sk = skb->sk;
|
||||
int queue_index = sk_tx_queue_get(sk);
|
||||
|
||||
if (queue_index < 0 || skb->ooo_okay ||
|
||||
queue_index >= dev->real_num_tx_queues) {
|
||||
int new_index = get_xps_queue(dev, skb);
|
||||
if (new_index < 0)
|
||||
new_index = skb_tx_hash(dev, skb);
|
||||
|
||||
if (queue_index != new_index && sk &&
|
||||
rcu_access_pointer(sk->sk_dst_cache))
|
||||
sk_tx_queue_set(sk, new_index);
|
||||
|
||||
queue_index = new_index;
|
||||
}
|
||||
|
||||
return queue_index;
|
||||
}
|
||||
|
||||
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
|
||||
struct sk_buff *skb,
|
||||
void *accel_priv)
|
||||
{
|
||||
int queue_index = 0;
|
||||
|
||||
if (dev->real_num_tx_queues != 1) {
|
||||
const struct net_device_ops *ops = dev->netdev_ops;
|
||||
if (ops->ndo_select_queue)
|
||||
queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
|
||||
__netdev_pick_tx);
|
||||
else
|
||||
queue_index = __netdev_pick_tx(dev, skb);
|
||||
|
||||
if (!accel_priv)
|
||||
queue_index = netdev_cap_txqueue(dev, queue_index);
|
||||
}
|
||||
|
||||
skb_set_queue_mapping(skb, queue_index);
|
||||
return netdev_get_tx_queue(dev, queue_index);
|
||||
}
|
328
net/core/gen_estimator.c
Normal file
@@ -0,0 +1,328 @@
/*
|
||||
* net/sched/gen_estimator.c Simple rate estimator.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* Changes:
|
||||
* Jamal Hadi Salim - moved it to net/core and reshuffled
|
||||
* names to make it usable in general net subsystem.
|
||||
*/
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/sockios.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/gen_stats.h>
|
||||
|
||||
/*
|
||||
This code is NOT intended to be used for statistics collection,
|
||||
its purpose is to provide a base for statistical multiplexing
|
||||
for controlled load service.
|
||||
If you need only statistics, run a user level daemon which
|
||||
periodically reads byte counters.
|
||||
|
||||
Unfortunately, rate estimation is not a very easy task.
|
||||
F.e. I did not find a simple way to estimate the current peak rate
|
||||
and even failed to formulate the problem 8)8)
|
||||
|
||||
So I preferred not to build an estimator into the scheduler,
|
||||
but run this task separately.
|
||||
Ideally, it should be kernel thread(s), but for now it runs
|
||||
from timers, which puts an apparent upper bound on the number of rated
flows and has minimal overhead on small sets, but is enough
to handle controlled load service and sets of aggregates.
|
||||
|
||||
We measure rate over A=(1<<interval) seconds and evaluate EWMA:
|
||||
|
||||
avrate = avrate*(1-W) + rate*W
|
||||
|
||||
where W is chosen as negative power of 2: W = 2^(-ewma_log)
|
||||
|
||||
The resulting time constant is:
|
||||
|
||||
T = A/(-ln(1-W))
|
||||
|
||||
|
||||
NOTES.
|
||||
|
||||
* avbps is scaled by 2^5, avpps is scaled by 2^10.
|
||||
* both values are reported as 32 bit unsigned values. bps can
|
||||
overflow for fast links : max speed being 34360Mbit/sec
|
||||
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
|
||||
for HZ=100 and HZ=1024 8)), maximal interval
|
||||
is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
|
||||
are too expensive, longer ones can be implemented
|
||||
at user level painlessly.
|
||||
*/
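/*
 * An illustrative sketch of the EWMA step performed in est_timer() below
 * (the helper name is an assumption): with W = 2^(-ewma_log) the update
 * avrate += (rate >> ewma_log) - (avrate >> ewma_log) is the fixed-point
 * form of avrate = avrate*(1-W) + rate*W, and the exported bps is the
 * scaled average shifted back down by 5 bits with rounding.
 */
static inline u64 example_ewma_step(u64 avrate, u64 rate, int ewma_log)
{
	avrate += (rate >> ewma_log) - (avrate >> ewma_log);
	return avrate;
}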
|
||||
|
||||
#define EST_MAX_INTERVAL 5
|
||||
|
||||
struct gen_estimator
|
||||
{
|
||||
struct list_head list;
|
||||
struct gnet_stats_basic_packed *bstats;
|
||||
struct gnet_stats_rate_est64 *rate_est;
|
||||
spinlock_t *stats_lock;
|
||||
int ewma_log;
|
||||
u64 last_bytes;
|
||||
u64 avbps;
|
||||
u32 last_packets;
|
||||
u32 avpps;
|
||||
struct rcu_head e_rcu;
|
||||
struct rb_node node;
|
||||
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
|
||||
struct rcu_head head;
|
||||
};
|
||||
|
||||
struct gen_estimator_head
|
||||
{
|
||||
struct timer_list timer;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
|
||||
|
||||
/* Protects against NULL dereference */
|
||||
static DEFINE_RWLOCK(est_lock);
|
||||
|
||||
/* Protects against soft lockup during large deletion */
|
||||
static struct rb_root est_root = RB_ROOT;
|
||||
static DEFINE_SPINLOCK(est_tree_lock);
|
||||
|
||||
static void est_timer(unsigned long arg)
|
||||
{
|
||||
int idx = (int)arg;
|
||||
struct gen_estimator *e;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(e, &elist[idx].list, list) {
|
||||
struct gnet_stats_basic_packed b = {0};
|
||||
u64 brate;
|
||||
u32 rate;
|
||||
|
||||
spin_lock(e->stats_lock);
|
||||
read_lock(&est_lock);
|
||||
if (e->bstats == NULL)
|
||||
goto skip;
|
||||
|
||||
__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
|
||||
|
||||
brate = (b.bytes - e->last_bytes)<<(7 - idx);
|
||||
e->last_bytes = b.bytes;
|
||||
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
|
||||
e->rate_est->bps = (e->avbps+0xF)>>5;
|
||||
|
||||
rate = (b.packets - e->last_packets)<<(12 - idx);
|
||||
e->last_packets = b.packets;
|
||||
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
|
||||
e->rate_est->pps = (e->avpps+0x1FF)>>10;
|
||||
skip:
|
||||
read_unlock(&est_lock);
|
||||
spin_unlock(e->stats_lock);
|
||||
}
|
||||
|
||||
if (!list_empty(&elist[idx].list))
|
||||
mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void gen_add_node(struct gen_estimator *est)
|
||||
{
|
||||
struct rb_node **p = &est_root.rb_node, *parent = NULL;
|
||||
|
||||
while (*p) {
|
||||
struct gen_estimator *e;
|
||||
|
||||
parent = *p;
|
||||
e = rb_entry(parent, struct gen_estimator, node);
|
||||
|
||||
if (est->bstats > e->bstats)
|
||||
p = &parent->rb_right;
|
||||
else
|
||||
p = &parent->rb_left;
|
||||
}
|
||||
rb_link_node(&est->node, parent, p);
|
||||
rb_insert_color(&est->node, &est_root);
|
||||
}
|
||||
|
||||
static
|
||||
struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
|
||||
const struct gnet_stats_rate_est64 *rate_est)
|
||||
{
|
||||
struct rb_node *p = est_root.rb_node;
|
||||
|
||||
while (p) {
|
||||
struct gen_estimator *e;
|
||||
|
||||
e = rb_entry(p, struct gen_estimator, node);
|
||||
|
||||
if (bstats > e->bstats)
|
||||
p = p->rb_right;
|
||||
else if (bstats < e->bstats || rate_est != e->rate_est)
|
||||
p = p->rb_left;
|
||||
else
|
||||
return e;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* gen_new_estimator - create a new rate estimator
|
||||
* @bstats: basic statistics
|
||||
* @rate_est: rate estimator statistics
|
||||
* @stats_lock: statistics lock
|
||||
* @opt: rate estimator configuration TLV
|
||||
*
|
||||
* Creates a new rate estimator with &bstats as source and &rate_est
|
||||
* as destination. A new timer with the interval specified in the
|
||||
* configuration TLV is created. Upon each interval, the latest statistics
|
||||
* will be read from &bstats and the estimated rate will be stored in
|
||||
* &rate_est with the statistics lock grabbed during this period.
|
||||
*
|
||||
* Returns 0 on success or a negative error code.
|
||||
*
|
||||
*/
|
||||
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
|
||||
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
|
||||
struct gnet_stats_rate_est64 *rate_est,
|
||||
spinlock_t *stats_lock,
|
||||
struct nlattr *opt)
|
||||
{
|
||||
struct gen_estimator *est;
|
||||
struct gnet_estimator *parm = nla_data(opt);
|
||||
struct gnet_stats_basic_packed b = {0};
|
||||
int idx;
|
||||
|
||||
if (nla_len(opt) < sizeof(*parm))
|
||||
return -EINVAL;
|
||||
|
||||
if (parm->interval < -2 || parm->interval > 3)
|
||||
return -EINVAL;
|
||||
|
||||
est = kzalloc(sizeof(*est), GFP_KERNEL);
|
||||
if (est == NULL)
|
||||
return -ENOBUFS;
|
||||
|
||||
__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
|
||||
|
||||
idx = parm->interval + 2;
|
||||
est->bstats = bstats;
|
||||
est->rate_est = rate_est;
|
||||
est->stats_lock = stats_lock;
|
||||
est->ewma_log = parm->ewma_log;
|
||||
est->last_bytes = b.bytes;
|
||||
est->avbps = rate_est->bps<<5;
|
||||
est->last_packets = b.packets;
|
||||
est->avpps = rate_est->pps<<10;
|
||||
est->cpu_bstats = cpu_bstats;
|
||||
|
||||
spin_lock_bh(&est_tree_lock);
|
||||
if (!elist[idx].timer.function) {
|
||||
INIT_LIST_HEAD(&elist[idx].list);
|
||||
setup_timer(&elist[idx].timer, est_timer, idx);
|
||||
}
|
||||
|
||||
if (list_empty(&elist[idx].list))
|
||||
mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
|
||||
|
||||
list_add_rcu(&est->list, &elist[idx].list);
|
||||
gen_add_node(est);
|
||||
spin_unlock_bh(&est_tree_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(gen_new_estimator);
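/*
 * A minimal usage sketch (the caller, helper name and TLV variable are
 * assumptions): a qdisc-like user hands over its byte/packet counters, the
 * rate-estimator result block, the lock protecting the counters and a
 * TCA_RATE attribute carrying a struct gnet_estimator.
 */
static inline int example_attach_estimator(struct gnet_stats_basic_packed *bstats,
					   struct gnet_stats_rate_est64 *rate_est,
					   spinlock_t *stats_lock,
					   struct nlattr *tca_rate)
{
	/* NULL cpu_bstats: the counters in this sketch are not per-CPU. */
	return gen_new_estimator(bstats, NULL, rate_est, stats_lock, tca_rate);
}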
|
||||
|
||||
/**
|
||||
* gen_kill_estimator - remove a rate estimator
|
||||
* @bstats: basic statistics
|
||||
* @rate_est: rate estimator statistics
|
||||
*
|
||||
* Removes the rate estimator specified by &bstats and &rate_est.
|
||||
*
|
||||
* Note : Caller should respect an RCU grace period before freeing stats_lock
|
||||
*/
|
||||
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
|
||||
struct gnet_stats_rate_est64 *rate_est)
|
||||
{
|
||||
struct gen_estimator *e;
|
||||
|
||||
spin_lock_bh(&est_tree_lock);
|
||||
while ((e = gen_find_node(bstats, rate_est))) {
|
||||
rb_erase(&e->node, &est_root);
|
||||
|
||||
write_lock(&est_lock);
|
||||
e->bstats = NULL;
|
||||
write_unlock(&est_lock);
|
||||
|
||||
list_del_rcu(&e->list);
|
||||
kfree_rcu(e, e_rcu);
|
||||
}
|
||||
spin_unlock_bh(&est_tree_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(gen_kill_estimator);
|
||||
|
||||
/**
|
||||
* gen_replace_estimator - replace rate estimator configuration
|
||||
* @bstats: basic statistics
|
||||
* @rate_est: rate estimator statistics
|
||||
* @stats_lock: statistics lock
|
||||
* @opt: rate estimator configuration TLV
|
||||
*
|
||||
* Replaces the configuration of a rate estimator by calling
|
||||
* gen_kill_estimator() and gen_new_estimator().
|
||||
*
|
||||
* Returns 0 on success or a negative error code.
|
||||
*/
|
||||
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
|
||||
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
|
||||
struct gnet_stats_rate_est64 *rate_est,
|
||||
spinlock_t *stats_lock, struct nlattr *opt)
|
||||
{
|
||||
gen_kill_estimator(bstats, rate_est);
|
||||
return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
|
||||
}
|
||||
EXPORT_SYMBOL(gen_replace_estimator);
|
||||
|
||||
/**
|
||||
* gen_estimator_active - test if estimator is currently in use
|
||||
* @bstats: basic statistics
|
||||
* @rate_est: rate estimator statistics
|
||||
*
|
||||
* Returns true if estimator is active, and false if not.
|
||||
*/
|
||||
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
|
||||
const struct gnet_stats_rate_est64 *rate_est)
|
||||
{
|
||||
bool res;
|
||||
|
||||
ASSERT_RTNL();
|
||||
|
||||
spin_lock_bh(&est_tree_lock);
|
||||
res = gen_find_node(bstats, rate_est) != NULL;
|
||||
spin_unlock_bh(&est_tree_lock);
|
||||
|
||||
return res;
|
||||
}
|
||||
EXPORT_SYMBOL(gen_estimator_active);
|
364
net/core/gen_stats.c
Normal file
@@ -0,0 +1,364 @@
/*
|
||||
* net/core/gen_stats.c
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
* Jamal Hadi Salim
|
||||
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* See Documentation/networking/gen_stats.txt
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/gen_stats.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/gen_stats.h>
|
||||
|
||||
|
||||
static inline int
|
||||
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
|
||||
{
|
||||
if (nla_put(d->skb, type, size, buf))
|
||||
goto nla_put_failure;
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
kfree(d->xstats);
|
||||
d->xstats = NULL;
|
||||
d->xstats_len = 0;
|
||||
spin_unlock_bh(d->lock);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
|
||||
* @skb: socket buffer to put statistics TLVs into
|
||||
* @type: TLV type for top level statistic TLV
|
||||
* @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
|
||||
* @xstats_type: TLV type for backward compatibility xstats TLV
|
||||
* @lock: statistics lock
|
||||
* @d: dumping handle
|
||||
*
|
||||
* Initializes the dumping handle, grabs the statistic lock and appends
|
||||
* an empty TLV header to the socket buffer for use as a container for all
* other statistic TLVs.
|
||||
*
|
||||
* The dumping handle is marked to be in backward compatibility mode telling
|
||||
* all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
|
||||
*
|
||||
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
|
||||
*/
|
||||
int
|
||||
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
|
||||
int xstats_type, spinlock_t *lock, struct gnet_dump *d)
|
||||
__acquires(lock)
|
||||
{
|
||||
memset(d, 0, sizeof(*d));
|
||||
|
||||
spin_lock_bh(lock);
|
||||
d->lock = lock;
|
||||
if (type)
|
||||
d->tail = (struct nlattr *)skb_tail_pointer(skb);
|
||||
d->skb = skb;
|
||||
d->compat_tc_stats = tc_stats_type;
|
||||
d->compat_xstats = xstats_type;
|
||||
|
||||
if (d->tail)
|
||||
return gnet_stats_copy(d, type, NULL, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(gnet_stats_start_copy_compat);
|
||||
|
||||
/**
|
||||
* gnet_stats_start_copy - start dumping procedure
|
||||
* @skb: socket buffer to put statistics TLVs into
|
||||
* @type: TLV type for top level statistic TLV
|
||||
* @lock: statistics lock
|
||||
* @d: dumping handle
|
||||
*
|
||||
* Initializes the dumping handle, grabs the statistic lock and appends
|
||||
* an empty TLV header to the socket buffer for use as a container for all
* other statistic TLVs.
|
||||
*
|
||||
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
|
||||
*/
|
||||
int
|
||||
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
|
||||
}
|
||||
EXPORT_SYMBOL(gnet_stats_start_copy);
|
||||
|
||||
static void
|
||||
__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
|
||||
struct gnet_stats_basic_cpu __percpu *cpu)
|
||||
{
|
||||
int i;
|
||||
|
||||
for_each_possible_cpu(i) {
|
||||
struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
|
||||
unsigned int start;
|
||||
u64 bytes;
|
||||
u32 packets;
|
||||
|
||||
do {
|
||||
start = u64_stats_fetch_begin_irq(&bcpu->syncp);
|
||||
bytes = bcpu->bstats.bytes;
|
||||
packets = bcpu->bstats.packets;
|
||||
} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
|
||||
|
||||
bstats->bytes += bytes;
|
||||
bstats->packets += packets;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
|
||||
struct gnet_stats_basic_cpu __percpu *cpu,
|
||||
struct gnet_stats_basic_packed *b)
|
||||
{
|
||||
if (cpu) {
|
||||
__gnet_stats_copy_basic_cpu(bstats, cpu);
|
||||
} else {
|
||||
bstats->bytes = b->bytes;
|
||||
bstats->packets = b->packets;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__gnet_stats_copy_basic);
|
||||
|
||||
/**
|
||||
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
|
||||
* @d: dumping handle
|
||||
* @b: basic statistics
|
||||
*
|
||||
* Appends the basic statistics to the top level TLV created by
|
||||
* gnet_stats_start_copy().
|
||||
*
|
||||
* Returns 0 on success or -1 with the statistic lock released
|
||||
* if the room in the socket buffer was not sufficient.
|
||||
*/
|
||||
int
|
||||
gnet_stats_copy_basic(struct gnet_dump *d,
|
||||
struct gnet_stats_basic_cpu __percpu *cpu,
|
||||
struct gnet_stats_basic_packed *b)
|
||||
{
|
||||
struct gnet_stats_basic_packed bstats = {0};
|
||||
|
||||
__gnet_stats_copy_basic(&bstats, cpu, b);
|
||||
|
||||
if (d->compat_tc_stats) {
|
||||
d->tc_stats.bytes = bstats.bytes;
|
||||
d->tc_stats.packets = bstats.packets;
|
||||
}
|
||||
|
||||
if (d->tail) {
|
||||
struct gnet_stats_basic sb;
|
||||
|
||||
memset(&sb, 0, sizeof(sb));
|
||||
sb.bytes = bstats.bytes;
|
||||
sb.packets = bstats.packets;
|
||||
return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(gnet_stats_copy_basic);
|
||||
|
||||
/**
|
||||
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
|
||||
* @d: dumping handle
|
||||
* @b: basic statistics
|
||||
* @r: rate estimator statistics
|
||||
*
|
||||
* Appends the rate estimator statistics to the top level TLV created by
|
||||
* gnet_stats_start_copy().
|
||||
*
|
||||
* Returns 0 on success or -1 with the statistic lock released
|
||||
* if the room in the socket buffer was not sufficient.
|
||||
*/
|
||||
int
|
||||
gnet_stats_copy_rate_est(struct gnet_dump *d,
|
||||
const struct gnet_stats_basic_packed *b,
|
||||
struct gnet_stats_rate_est64 *r)
|
||||
{
|
||||
struct gnet_stats_rate_est est;
|
||||
int res;
|
||||
|
||||
if (b && !gen_estimator_active(b, r))
|
||||
return 0;
|
||||
|
||||
est.bps = min_t(u64, UINT_MAX, r->bps);
|
||||
	/* we have some time before reaching 2^32 packets per second */
	est.pps = r->pps;

	if (d->compat_tc_stats) {
		d->tc_stats.bps = est.bps;
		d->tc_stats.pps = est.pps;
	}

	if (d->tail) {
		res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
		if (res < 0 || est.bps == r->bps)
			return res;
		/* emit 64bit stats only if needed */
		return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
	}

	return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);

static void
__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
			    const struct gnet_stats_queue __percpu *q)
{
	int i;

	for_each_possible_cpu(i) {
		const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);

		qstats->qlen = 0;
		qstats->backlog += qcpu->backlog;
		qstats->drops += qcpu->drops;
		qstats->requeues += qcpu->requeues;
		qstats->overlimits += qcpu->overlimits;
	}
}

static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
				    const struct gnet_stats_queue __percpu *cpu,
				    const struct gnet_stats_queue *q,
				    __u32 qlen)
{
	if (cpu) {
		__gnet_stats_copy_queue_cpu(qstats, cpu);
	} else {
		qstats->qlen = q->qlen;
		qstats->backlog = q->backlog;
		qstats->drops = q->drops;
		qstats->requeues = q->requeues;
		qstats->overlimits = q->overlimits;
	}

	qstats->qlen = qlen;
}

/**
 * gnet_stats_copy_queue - copy queue statistics into statistics TLV
 * @d: dumping handle
 * @cpu_q: per cpu queue statistics
 * @q: queue statistics
 * @qlen: queue length statistics
 *
 * Appends the queue statistics to the top level TLV created by
 * gnet_stats_start_copy(). Using per cpu queue statistics if
 * they are available.
 *
 * Returns 0 on success or -1 with the statistic lock released
 * if the room in the socket buffer was not sufficient.
 */
int
gnet_stats_copy_queue(struct gnet_dump *d,
		      struct gnet_stats_queue __percpu *cpu_q,
		      struct gnet_stats_queue *q, __u32 qlen)
{
	struct gnet_stats_queue qstats = {0};

	__gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);

	if (d->compat_tc_stats) {
		d->tc_stats.drops = qstats.drops;
		d->tc_stats.qlen = qstats.qlen;
		d->tc_stats.backlog = qstats.backlog;
		d->tc_stats.overlimits = qstats.overlimits;
	}

	if (d->tail)
		return gnet_stats_copy(d, TCA_STATS_QUEUE,
				       &qstats, sizeof(qstats));

	return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_queue);

/**
 * gnet_stats_copy_app - copy application specific statistics into statistics TLV
 * @d: dumping handle
 * @st: application specific statistics data
 * @len: length of data
 *
 * Appends the application specific statistics to the top level TLV created by
 * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
 * handle is in backward compatibility mode.
 *
 * Returns 0 on success or -1 with the statistic lock released
 * if the room in the socket buffer was not sufficient.
 */
int
gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
{
	if (d->compat_xstats) {
		d->xstats = kmemdup(st, len, GFP_ATOMIC);
		if (!d->xstats)
			goto err_out;
		d->xstats_len = len;
	}

	if (d->tail)
		return gnet_stats_copy(d, TCA_STATS_APP, st, len);

	return 0;

err_out:
	d->xstats_len = 0;
	spin_unlock_bh(d->lock);
	return -1;
}
EXPORT_SYMBOL(gnet_stats_copy_app);

/**
 * gnet_stats_finish_copy - finish dumping procedure
 * @d: dumping handle
 *
 * Corrects the length of the top level TLV to include all TLVs added
 * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
 * if gnet_stats_start_copy_compat() was used and releases the statistics
 * lock.
 *
 * Returns 0 on success or -1 with the statistic lock released
 * if the room in the socket buffer was not sufficient.
 */
int
gnet_stats_finish_copy(struct gnet_dump *d)
{
	if (d->tail)
		d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;

	if (d->compat_tc_stats)
		if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
				    sizeof(d->tc_stats)) < 0)
			return -1;

	if (d->compat_xstats && d->xstats) {
		if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
				    d->xstats_len) < 0)
			return -1;
	}

	kfree(d->xstats);
	d->xstats = NULL;
	d->xstats_len = 0;
	spin_unlock_bh(d->lock);
	return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
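The queue/app/finish helpers above are normally chained from one dump path. Below is a minimal sketch of that ordering, assuming the gnet_dump handle was opened with gnet_stats_start_copy() earlier in this file; the qstats/xstats arguments and the function name are placeholders, not code from this commit.

/* Illustrative only: typical ordering of the helpers above. */
static int example_stats_dump(struct gnet_dump *d,
			      struct gnet_stats_queue *qstats, __u32 qlen,
			      void *xstats, int xstats_len)
{
	/* No per-cpu counters in this sketch, so cpu_q is NULL. */
	if (gnet_stats_copy_queue(d, NULL, qstats, qlen) < 0)
		return -1;

	if (gnet_stats_copy_app(d, xstats, xstats_len) < 0)
		return -1;

	/* Fixes up the outer TLV length and releases the statistics lock. */
	return gnet_stats_finish_copy(d);
}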
184
net/core/iovec.c
Normal file
@@ -0,0 +1,184 @@
/*
|
||||
* iovec manipulation routines.
|
||||
*
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Fixes:
|
||||
* Andrew Lunn : Errors in iovec copying.
|
||||
* Pedro Roque : Added memcpy_fromiovecend and
|
||||
* csum_..._fromiovecend.
|
||||
* Andi Kleen : fixed error handling for 2.1
|
||||
* Alexey Kuznetsov: 2.1 optimisations
|
||||
* Andi Kleen : Fix csum*fromiovecend for IPv6.
|
||||
*/
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/in6.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/byteorder.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/*
|
||||
* Verify iovec. The caller must ensure that the iovec is big enough
|
||||
* to hold the message iovec.
|
||||
*
|
||||
* Save time not doing access_ok. copy_*_user will make this work
|
||||
* in any case.
|
||||
*/
|
||||
|
||||
int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
|
||||
{
|
||||
int size, ct, err;
|
||||
|
||||
if (m->msg_name && m->msg_namelen) {
|
||||
if (mode == VERIFY_READ) {
|
||||
void __user *namep;
|
||||
namep = (void __user __force *) m->msg_name;
|
||||
err = move_addr_to_kernel(namep, m->msg_namelen,
|
||||
address);
|
||||
if (err < 0)
|
||||
return err;
|
||||
}
|
||||
m->msg_name = address;
|
||||
} else {
|
||||
m->msg_name = NULL;
|
||||
m->msg_namelen = 0;
|
||||
}
|
||||
|
||||
size = m->msg_iovlen * sizeof(struct iovec);
|
||||
if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
|
||||
return -EFAULT;
|
||||
|
||||
m->msg_iov = iov;
|
||||
err = 0;
|
||||
|
||||
for (ct = 0; ct < m->msg_iovlen; ct++) {
|
||||
size_t len = iov[ct].iov_len;
|
||||
|
||||
if (len > INT_MAX - err) {
|
||||
len = INT_MAX - err;
|
||||
iov[ct].iov_len = len;
|
||||
}
|
||||
err += len;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* And now for the all-in-one: copy and checksum from a user iovec
|
||||
* directly to a datagram
|
||||
* Calls to csum_partial but the last must be in 32 bit chunks
|
||||
*
|
||||
* ip_build_xmit must ensure that when fragmenting only the last
|
||||
* call to this function will be unaligned also.
|
||||
*/
|
||||
int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
|
||||
int offset, unsigned int len, __wsum *csump)
|
||||
{
|
||||
__wsum csum = *csump;
|
||||
int partial_cnt = 0, err = 0;
|
||||
|
||||
/* Skip over the finished iovecs */
|
||||
while (offset >= iov->iov_len) {
|
||||
offset -= iov->iov_len;
|
||||
iov++;
|
||||
}
|
||||
|
||||
while (len > 0) {
|
||||
u8 __user *base = iov->iov_base + offset;
|
||||
int copy = min_t(unsigned int, len, iov->iov_len - offset);
|
||||
|
||||
offset = 0;
|
||||
|
||||
/* There is a remnant from previous iov. */
|
||||
if (partial_cnt) {
|
||||
int par_len = 4 - partial_cnt;
|
||||
|
||||
/* iov component is too short ... */
|
||||
if (par_len > copy) {
|
||||
if (copy_from_user(kdata, base, copy))
|
||||
goto out_fault;
|
||||
kdata += copy;
|
||||
base += copy;
|
||||
partial_cnt += copy;
|
||||
len -= copy;
|
||||
iov++;
|
||||
if (len)
|
||||
continue;
|
||||
*csump = csum_partial(kdata - partial_cnt,
|
||||
partial_cnt, csum);
|
||||
goto out;
|
||||
}
|
||||
if (copy_from_user(kdata, base, par_len))
|
||||
goto out_fault;
|
||||
csum = csum_partial(kdata - partial_cnt, 4, csum);
|
||||
kdata += par_len;
|
||||
base += par_len;
|
||||
copy -= par_len;
|
||||
len -= par_len;
|
||||
partial_cnt = 0;
|
||||
}
|
||||
|
||||
if (len > copy) {
|
||||
partial_cnt = copy % 4;
|
||||
if (partial_cnt) {
|
||||
copy -= partial_cnt;
|
||||
if (copy_from_user(kdata + copy, base + copy,
|
||||
partial_cnt))
|
||||
goto out_fault;
|
||||
}
|
||||
}
|
||||
|
||||
if (copy) {
|
||||
csum = csum_and_copy_from_user(base, kdata, copy,
|
||||
csum, &err);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
len -= copy + partial_cnt;
|
||||
kdata += copy + partial_cnt;
|
||||
iov++;
|
||||
}
|
||||
*csump = csum;
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_fault:
|
||||
err = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
|
||||
|
||||
unsigned long iov_pages(const struct iovec *iov, int offset,
|
||||
unsigned long nr_segs)
|
||||
{
|
||||
unsigned long seg, base;
|
||||
int pages = 0, len, size;
|
||||
|
||||
while (nr_segs && (offset >= iov->iov_len)) {
|
||||
offset -= iov->iov_len;
|
||||
++iov;
|
||||
--nr_segs;
|
||||
}
|
||||
|
||||
for (seg = 0; seg < nr_segs; seg++) {
|
||||
base = (unsigned long)iov[seg].iov_base + offset;
|
||||
len = iov[seg].iov_len - offset;
|
||||
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
|
||||
pages += size;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
return pages;
|
||||
}
|
||||
EXPORT_SYMBOL(iov_pages);
|
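For orientation, csum_partial_copy_fromiovecend() above is usually driven from a getfrag-style callback during UDP/RAW transmit. A hedged sketch of such a caller follows, simplified from what ip_generic_getfrag() does; the non-checksumming path and the function name are omitted or invented here.

/* Sketch only: copy one fragment from a user iovec while folding it into
 * the skb checksum, in the style of ip_generic_getfrag(). */
static int example_getfrag(void *from, char *to, int offset, int len,
			   int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;
	__wsum csum = 0;

	if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
		return -EFAULT;
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}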
253
net/core/link_watch.c
Normal file
@@ -0,0 +1,253 @@
/*
|
||||
* Linux network device link state notification
|
||||
*
|
||||
* Author:
|
||||
* Stefan Rompf <sux@loplof.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/if.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <asm/types.h>
|
||||
|
||||
|
||||
enum lw_bits {
|
||||
LW_URGENT = 0,
|
||||
};
|
||||
|
||||
static unsigned long linkwatch_flags;
|
||||
static unsigned long linkwatch_nextevent;
|
||||
|
||||
static void linkwatch_event(struct work_struct *dummy);
|
||||
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
|
||||
|
||||
static LIST_HEAD(lweventlist);
|
||||
static DEFINE_SPINLOCK(lweventlist_lock);
|
||||
|
||||
static unsigned char default_operstate(const struct net_device *dev)
|
||||
{
|
||||
if (!netif_carrier_ok(dev))
|
||||
return (dev->ifindex != dev->iflink ?
|
||||
IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
|
||||
|
||||
if (netif_dormant(dev))
|
||||
return IF_OPER_DORMANT;
|
||||
|
||||
return IF_OPER_UP;
|
||||
}
|
||||
|
||||
|
||||
static void rfc2863_policy(struct net_device *dev)
|
||||
{
|
||||
unsigned char operstate = default_operstate(dev);
|
||||
|
||||
if (operstate == dev->operstate)
|
||||
return;
|
||||
|
||||
write_lock_bh(&dev_base_lock);
|
||||
|
||||
switch(dev->link_mode) {
|
||||
case IF_LINK_MODE_DORMANT:
|
||||
if (operstate == IF_OPER_UP)
|
||||
operstate = IF_OPER_DORMANT;
|
||||
break;
|
||||
|
||||
case IF_LINK_MODE_DEFAULT:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
dev->operstate = operstate;
|
||||
|
||||
write_unlock_bh(&dev_base_lock);
|
||||
}
|
||||
|
||||
|
||||
void linkwatch_init_dev(struct net_device *dev)
|
||||
{
|
||||
/* Handle pre-registration link state changes */
|
||||
if (!netif_carrier_ok(dev) || netif_dormant(dev))
|
||||
rfc2863_policy(dev);
|
||||
}
|
||||
|
||||
|
||||
static bool linkwatch_urgent_event(struct net_device *dev)
|
||||
{
|
||||
if (!netif_running(dev))
|
||||
return false;
|
||||
|
||||
if (dev->ifindex != dev->iflink)
|
||||
return true;
|
||||
|
||||
if (dev->priv_flags & IFF_TEAM_PORT)
|
||||
return true;
|
||||
|
||||
return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
|
||||
}
|
||||
|
||||
|
||||
static void linkwatch_add_event(struct net_device *dev)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&lweventlist_lock, flags);
|
||||
if (list_empty(&dev->link_watch_list)) {
|
||||
list_add_tail(&dev->link_watch_list, &lweventlist);
|
||||
dev_hold(dev);
|
||||
}
|
||||
spin_unlock_irqrestore(&lweventlist_lock, flags);
|
||||
}
|
||||
|
||||
|
||||
static void linkwatch_schedule_work(int urgent)
|
||||
{
|
||||
unsigned long delay = linkwatch_nextevent - jiffies;
|
||||
|
||||
if (test_bit(LW_URGENT, &linkwatch_flags))
|
||||
return;
|
||||
|
||||
/* Minimise down-time: drop delay for up event. */
|
||||
if (urgent) {
|
||||
if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
|
||||
return;
|
||||
delay = 0;
|
||||
}
|
||||
|
||||
/* If we wrap around we'll delay it by at most HZ. */
|
||||
if (delay > HZ)
|
||||
delay = 0;
|
||||
|
||||
/*
|
||||
* If urgent, schedule immediate execution; otherwise, don't
|
||||
* override the existing timer.
|
||||
*/
|
||||
if (test_bit(LW_URGENT, &linkwatch_flags))
|
||||
mod_delayed_work(system_wq, &linkwatch_work, 0);
|
||||
else
|
||||
schedule_delayed_work(&linkwatch_work, delay);
|
||||
}
|
||||
|
||||
|
||||
static void linkwatch_do_dev(struct net_device *dev)
|
||||
{
|
||||
/*
|
||||
* Make sure the above read is complete since it can be
|
||||
* rewritten as soon as we clear the bit below.
|
||||
*/
|
||||
smp_mb__before_atomic();
|
||||
|
||||
/* We are about to handle this device,
|
||||
* so new events can be accepted
|
||||
*/
|
||||
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
|
||||
|
||||
rfc2863_policy(dev);
|
||||
if (dev->flags & IFF_UP) {
|
||||
if (netif_carrier_ok(dev))
|
||||
dev_activate(dev);
|
||||
else
|
||||
dev_deactivate(dev);
|
||||
|
||||
netdev_state_change(dev);
|
||||
}
|
||||
dev_put(dev);
|
||||
}
|
||||
|
||||
static void __linkwatch_run_queue(int urgent_only)
|
||||
{
|
||||
struct net_device *dev;
|
||||
LIST_HEAD(wrk);
|
||||
|
||||
/*
|
||||
* Limit the number of linkwatch events to one
|
||||
* per second so that a runaway driver does not
|
||||
* cause a storm of messages on the netlink
|
||||
* socket. This limit does not apply to up events
|
||||
* while the device qdisc is down.
|
||||
*/
|
||||
if (!urgent_only)
|
||||
linkwatch_nextevent = jiffies + HZ;
|
||||
/* Limit wrap-around effect on delay. */
|
||||
else if (time_after(linkwatch_nextevent, jiffies + HZ))
|
||||
linkwatch_nextevent = jiffies;
|
||||
|
||||
clear_bit(LW_URGENT, &linkwatch_flags);
|
||||
|
||||
spin_lock_irq(&lweventlist_lock);
|
||||
list_splice_init(&lweventlist, &wrk);
|
||||
|
||||
while (!list_empty(&wrk)) {
|
||||
|
||||
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
|
||||
list_del_init(&dev->link_watch_list);
|
||||
|
||||
if (urgent_only && !linkwatch_urgent_event(dev)) {
|
||||
list_add_tail(&dev->link_watch_list, &lweventlist);
|
||||
continue;
|
||||
}
|
||||
spin_unlock_irq(&lweventlist_lock);
|
||||
linkwatch_do_dev(dev);
|
||||
spin_lock_irq(&lweventlist_lock);
|
||||
}
|
||||
|
||||
if (!list_empty(&lweventlist))
|
||||
linkwatch_schedule_work(0);
|
||||
spin_unlock_irq(&lweventlist_lock);
|
||||
}
|
||||
|
||||
void linkwatch_forget_dev(struct net_device *dev)
|
||||
{
|
||||
unsigned long flags;
|
||||
int clean = 0;
|
||||
|
||||
spin_lock_irqsave(&lweventlist_lock, flags);
|
||||
if (!list_empty(&dev->link_watch_list)) {
|
||||
list_del_init(&dev->link_watch_list);
|
||||
clean = 1;
|
||||
}
|
||||
spin_unlock_irqrestore(&lweventlist_lock, flags);
|
||||
if (clean)
|
||||
linkwatch_do_dev(dev);
|
||||
}
|
||||
|
||||
|
||||
/* Must be called with the rtnl semaphore held */
|
||||
void linkwatch_run_queue(void)
|
||||
{
|
||||
__linkwatch_run_queue(0);
|
||||
}
|
||||
|
||||
|
||||
static void linkwatch_event(struct work_struct *dummy)
|
||||
{
|
||||
rtnl_lock();
|
||||
__linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
|
||||
rtnl_unlock();
|
||||
}
|
||||
|
||||
|
||||
void linkwatch_fire_event(struct net_device *dev)
|
||||
{
|
||||
bool urgent = linkwatch_urgent_event(dev);
|
||||
|
||||
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
|
||||
linkwatch_add_event(dev);
|
||||
} else if (!urgent)
|
||||
return;
|
||||
|
||||
linkwatch_schedule_work(urgent);
|
||||
}
|
||||
EXPORT_SYMBOL(linkwatch_fire_event);
|
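As a hedged pointer for readers: drivers normally never call linkwatch_fire_event() directly; they report carrier changes and the netif_carrier_*() helpers (in net/sched/sch_generic.c) forward the event into this file. A minimal sketch with a made-up driver helper name:

/* Sketch only: how a driver's link-change path ends up in this file. */
static void example_link_change(struct net_device *dev, bool link_up)
{
	if (link_up)
		netif_carrier_on(dev);	/* ends up in linkwatch_fire_event(dev) */
	else
		netif_carrier_off(dev);	/* likewise, after setting __LINK_STATE_NOCARRIER */
}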
3143
net/core/neighbour.c
Normal file
File diff suppressed because it is too large
423
net/core/net-procfs.c
Normal file
@@ -0,0 +1,423 @@
#include <linux/netdevice.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <net/wext.h>
|
||||
|
||||
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
|
||||
|
||||
#define get_bucket(x) ((x) >> BUCKET_SPACE)
|
||||
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
|
||||
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
|
||||
|
||||
extern struct list_head ptype_all __read_mostly;
|
||||
extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
|
||||
|
||||
static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct net_device *dev;
|
||||
struct hlist_head *h;
|
||||
unsigned int count = 0, offset = get_offset(*pos);
|
||||
|
||||
h = &net->dev_name_head[get_bucket(*pos)];
|
||||
hlist_for_each_entry_rcu(dev, h, name_hlist) {
|
||||
if (++count == offset)
|
||||
return dev;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
|
||||
{
|
||||
struct net_device *dev;
|
||||
unsigned int bucket;
|
||||
|
||||
do {
|
||||
dev = dev_from_same_bucket(seq, pos);
|
||||
if (dev)
|
||||
return dev;
|
||||
|
||||
bucket = get_bucket(*pos) + 1;
|
||||
*pos = set_bucket_offset(bucket, 1);
|
||||
} while (bucket < NETDEV_HASHENTRIES);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is invoked by the /proc filesystem handler to display a device
|
||||
* in detail.
|
||||
*/
|
||||
static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
__acquires(RCU)
|
||||
{
|
||||
rcu_read_lock();
|
||||
if (!*pos)
|
||||
return SEQ_START_TOKEN;
|
||||
|
||||
if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
|
||||
return NULL;
|
||||
|
||||
return dev_from_bucket(seq, pos);
|
||||
}
|
||||
|
||||
static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
++*pos;
|
||||
return dev_from_bucket(seq, pos);
|
||||
}
|
||||
|
||||
static void dev_seq_stop(struct seq_file *seq, void *v)
|
||||
__releases(RCU)
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
|
||||
{
|
||||
struct rtnl_link_stats64 temp;
|
||||
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
|
||||
|
||||
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
|
||||
"%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
|
||||
dev->name, stats->rx_bytes, stats->rx_packets,
|
||||
stats->rx_errors,
|
||||
stats->rx_dropped + stats->rx_missed_errors,
|
||||
stats->rx_fifo_errors,
|
||||
stats->rx_length_errors + stats->rx_over_errors +
|
||||
stats->rx_crc_errors + stats->rx_frame_errors,
|
||||
stats->rx_compressed, stats->multicast,
|
||||
stats->tx_bytes, stats->tx_packets,
|
||||
stats->tx_errors, stats->tx_dropped,
|
||||
stats->tx_fifo_errors, stats->collisions,
|
||||
stats->tx_carrier_errors +
|
||||
stats->tx_aborted_errors +
|
||||
stats->tx_window_errors +
|
||||
stats->tx_heartbeat_errors,
|
||||
stats->tx_compressed);
|
||||
}
|
||||
|
||||
/*
|
||||
* Called from the PROCfs module. This now uses the new arbitrary sized
|
||||
* /proc/net interface to create /proc/net/dev
|
||||
*/
|
||||
static int dev_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
if (v == SEQ_START_TOKEN)
|
||||
seq_puts(seq, "Inter-| Receive "
|
||||
" | Transmit\n"
|
||||
" face |bytes packets errs drop fifo frame "
|
||||
"compressed multicast|bytes packets errs "
|
||||
"drop fifo colls carrier compressed\n");
|
||||
else
|
||||
dev_seq_printf_stats(seq, v);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct softnet_data *softnet_get_online(loff_t *pos)
|
||||
{
|
||||
struct softnet_data *sd = NULL;
|
||||
|
||||
while (*pos < nr_cpu_ids)
|
||||
if (cpu_online(*pos)) {
|
||||
sd = &per_cpu(softnet_data, *pos);
|
||||
break;
|
||||
} else
|
||||
++*pos;
|
||||
return sd;
|
||||
}
|
||||
|
||||
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
{
|
||||
return softnet_get_online(pos);
|
||||
}
|
||||
|
||||
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
++*pos;
|
||||
return softnet_get_online(pos);
|
||||
}
|
||||
|
||||
static void softnet_seq_stop(struct seq_file *seq, void *v)
|
||||
{
|
||||
}
|
||||
|
||||
static int softnet_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct softnet_data *sd = v;
|
||||
unsigned int flow_limit_count = 0;
|
||||
|
||||
#ifdef CONFIG_NET_FLOW_LIMIT
|
||||
struct sd_flow_limit *fl;
|
||||
|
||||
rcu_read_lock();
|
||||
fl = rcu_dereference(sd->flow_limit);
|
||||
if (fl)
|
||||
flow_limit_count = fl->count;
|
||||
rcu_read_unlock();
|
||||
#endif
|
||||
|
||||
seq_printf(seq,
|
||||
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
|
||||
sd->processed, sd->dropped, sd->time_squeeze, 0,
|
||||
0, 0, 0, 0, /* was fastroute */
|
||||
sd->cpu_collision, sd->received_rps, flow_limit_count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations dev_seq_ops = {
|
||||
.start = dev_seq_start,
|
||||
.next = dev_seq_next,
|
||||
.stop = dev_seq_stop,
|
||||
.show = dev_seq_show,
|
||||
};
|
||||
|
||||
static int dev_seq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &dev_seq_ops,
|
||||
sizeof(struct seq_net_private));
|
||||
}
|
||||
|
||||
static const struct file_operations dev_seq_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = dev_seq_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
static const struct seq_operations softnet_seq_ops = {
|
||||
.start = softnet_seq_start,
|
||||
.next = softnet_seq_next,
|
||||
.stop = softnet_seq_stop,
|
||||
.show = softnet_seq_show,
|
||||
};
|
||||
|
||||
static int softnet_seq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open(file, &softnet_seq_ops);
|
||||
}
|
||||
|
||||
static const struct file_operations softnet_seq_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = softnet_seq_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static void *ptype_get_idx(loff_t pos)
|
||||
{
|
||||
struct packet_type *pt = NULL;
|
||||
loff_t i = 0;
|
||||
int t;
|
||||
|
||||
list_for_each_entry_rcu(pt, &ptype_all, list) {
|
||||
if (i == pos)
|
||||
return pt;
|
||||
++i;
|
||||
}
|
||||
|
||||
for (t = 0; t < PTYPE_HASH_SIZE; t++) {
|
||||
list_for_each_entry_rcu(pt, &ptype_base[t], list) {
|
||||
if (i == pos)
|
||||
return pt;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
__acquires(RCU)
|
||||
{
|
||||
rcu_read_lock();
|
||||
return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
|
||||
}
|
||||
|
||||
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
struct packet_type *pt;
|
||||
struct list_head *nxt;
|
||||
int hash;
|
||||
|
||||
++*pos;
|
||||
if (v == SEQ_START_TOKEN)
|
||||
return ptype_get_idx(0);
|
||||
|
||||
pt = v;
|
||||
nxt = pt->list.next;
|
||||
if (pt->type == htons(ETH_P_ALL)) {
|
||||
if (nxt != &ptype_all)
|
||||
goto found;
|
||||
hash = 0;
|
||||
nxt = ptype_base[0].next;
|
||||
} else
|
||||
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
|
||||
|
||||
while (nxt == &ptype_base[hash]) {
|
||||
if (++hash >= PTYPE_HASH_SIZE)
|
||||
return NULL;
|
||||
nxt = ptype_base[hash].next;
|
||||
}
|
||||
found:
|
||||
return list_entry(nxt, struct packet_type, list);
|
||||
}
|
||||
|
||||
static void ptype_seq_stop(struct seq_file *seq, void *v)
|
||||
__releases(RCU)
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int ptype_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct packet_type *pt = v;
|
||||
|
||||
if (v == SEQ_START_TOKEN)
|
||||
seq_puts(seq, "Type Device Function\n");
|
||||
else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
|
||||
if (pt->type == htons(ETH_P_ALL))
|
||||
seq_puts(seq, "ALL ");
|
||||
else
|
||||
seq_printf(seq, "%04x", ntohs(pt->type));
|
||||
|
||||
seq_printf(seq, " %-8s %pf\n",
|
||||
pt->dev ? pt->dev->name : "", pt->func);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations ptype_seq_ops = {
|
||||
.start = ptype_seq_start,
|
||||
.next = ptype_seq_next,
|
||||
.stop = ptype_seq_stop,
|
||||
.show = ptype_seq_show,
|
||||
};
|
||||
|
||||
static int ptype_seq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &ptype_seq_ops,
|
||||
sizeof(struct seq_net_private));
|
||||
}
|
||||
|
||||
static const struct file_operations ptype_seq_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = ptype_seq_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
|
||||
static int __net_init dev_proc_net_init(struct net *net)
|
||||
{
|
||||
int rc = -ENOMEM;
|
||||
|
||||
if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
|
||||
goto out;
|
||||
if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
|
||||
&softnet_seq_fops))
|
||||
goto out_dev;
|
||||
if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
|
||||
goto out_softnet;
|
||||
|
||||
if (wext_proc_init(net))
|
||||
goto out_ptype;
|
||||
rc = 0;
|
||||
out:
|
||||
return rc;
|
||||
out_ptype:
|
||||
remove_proc_entry("ptype", net->proc_net);
|
||||
out_softnet:
|
||||
remove_proc_entry("softnet_stat", net->proc_net);
|
||||
out_dev:
|
||||
remove_proc_entry("dev", net->proc_net);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static void __net_exit dev_proc_net_exit(struct net *net)
|
||||
{
|
||||
wext_proc_exit(net);
|
||||
|
||||
remove_proc_entry("ptype", net->proc_net);
|
||||
remove_proc_entry("softnet_stat", net->proc_net);
|
||||
remove_proc_entry("dev", net->proc_net);
|
||||
}
|
||||
|
||||
static struct pernet_operations __net_initdata dev_proc_ops = {
|
||||
.init = dev_proc_net_init,
|
||||
.exit = dev_proc_net_exit,
|
||||
};
|
||||
|
||||
static int dev_mc_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct netdev_hw_addr *ha;
|
||||
struct net_device *dev = v;
|
||||
|
||||
if (v == SEQ_START_TOKEN)
|
||||
return 0;
|
||||
|
||||
netif_addr_lock_bh(dev);
|
||||
netdev_for_each_mc_addr(ha, dev) {
|
||||
int i;
|
||||
|
||||
seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
|
||||
dev->name, ha->refcount, ha->global_use);
|
||||
|
||||
for (i = 0; i < dev->addr_len; i++)
|
||||
seq_printf(seq, "%02x", ha->addr[i]);
|
||||
|
||||
seq_putc(seq, '\n');
|
||||
}
|
||||
netif_addr_unlock_bh(dev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations dev_mc_seq_ops = {
|
||||
.start = dev_seq_start,
|
||||
.next = dev_seq_next,
|
||||
.stop = dev_seq_stop,
|
||||
.show = dev_mc_seq_show,
|
||||
};
|
||||
|
||||
static int dev_mc_seq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &dev_mc_seq_ops,
|
||||
sizeof(struct seq_net_private));
|
||||
}
|
||||
|
||||
static const struct file_operations dev_mc_seq_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = dev_mc_seq_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
static int __net_init dev_mc_net_init(struct net *net)
|
||||
{
|
||||
if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __net_exit dev_mc_net_exit(struct net *net)
|
||||
{
|
||||
remove_proc_entry("dev_mcast", net->proc_net);
|
||||
}
|
||||
|
||||
static struct pernet_operations __net_initdata dev_mc_net_ops = {
|
||||
.init = dev_mc_net_init,
|
||||
.exit = dev_mc_net_exit,
|
||||
};
|
||||
|
||||
int __init dev_proc_init(void)
|
||||
{
|
||||
int ret = register_pernet_subsys(&dev_proc_ops);
|
||||
if (!ret)
|
||||
return register_pernet_subsys(&dev_mc_net_ops);
|
||||
return ret;
|
||||
}
|
1429
net/core/net-sysfs.c
Normal file
File diff suppressed because it is too large
11
net/core/net-sysfs.h
Normal file
@@ -0,0 +1,11 @@
#ifndef __NET_SYSFS_H__
#define __NET_SYSFS_H__

int __init netdev_kobject_init(void);
int netdev_register_kobject(struct net_device *);
void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
				 int old_num, int new_num);

#endif
37
net/core/net-traces.c
Normal file
@@ -0,0 +1,37 @@
/*
 * consolidates trace point definitions
 *
 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
 */

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/slab.h>

#include <asm/unaligned.h>
#include <asm/bitops.h>

#define CREATE_TRACE_POINTS
#include <trace/events/skb.h>
#include <trace/events/net.h>
#include <trace/events/napi.h>
#include <trace/events/sock.h>
#include <trace/events/udp.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);

EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
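This file only instantiates the tracepoints: CREATE_TRACE_POINTS must be defined in exactly one translation unit per event header. The firing sites live elsewhere; as an illustration (the real kfree_skb call site is in net/core/skbuff.c, the function name below is invented), a caller looks roughly like:

/* Sketch only: shape of a tracepoint call site for the events exported here. */
static void example_drop(struct sk_buff *skb)
{
	trace_kfree_skb(skb, __builtin_return_address(0));
	__kfree_skb(skb);
}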
678
net/core/net_namespace.c
Normal file
@@ -0,0 +1,678 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/cache.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/rculist.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/user_namespace.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
|
||||
/*
|
||||
* Our network namespace constructor/destructor lists
|
||||
*/
|
||||
|
||||
static LIST_HEAD(pernet_list);
|
||||
static struct list_head *first_device = &pernet_list;
|
||||
DEFINE_MUTEX(net_mutex);
|
||||
|
||||
LIST_HEAD(net_namespace_list);
|
||||
EXPORT_SYMBOL_GPL(net_namespace_list);
|
||||
|
||||
struct net init_net = {
|
||||
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
|
||||
};
|
||||
EXPORT_SYMBOL(init_net);
|
||||
|
||||
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
|
||||
|
||||
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
|
||||
|
||||
static struct net_generic *net_alloc_generic(void)
|
||||
{
|
||||
struct net_generic *ng;
|
||||
size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
|
||||
|
||||
ng = kzalloc(generic_size, GFP_KERNEL);
|
||||
if (ng)
|
||||
ng->len = max_gen_ptrs;
|
||||
|
||||
return ng;
|
||||
}
|
||||
|
||||
static int net_assign_generic(struct net *net, int id, void *data)
|
||||
{
|
||||
struct net_generic *ng, *old_ng;
|
||||
|
||||
BUG_ON(!mutex_is_locked(&net_mutex));
|
||||
BUG_ON(id == 0);
|
||||
|
||||
old_ng = rcu_dereference_protected(net->gen,
|
||||
lockdep_is_held(&net_mutex));
|
||||
ng = old_ng;
|
||||
if (old_ng->len >= id)
|
||||
goto assign;
|
||||
|
||||
ng = net_alloc_generic();
|
||||
if (ng == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Some synchronisation notes:
|
||||
*
|
||||
* The net_generic explores the net->gen array inside rcu
|
||||
* read section. Besides once set the net->gen->ptr[x]
|
||||
* pointer never changes (see rules in netns/generic.h).
|
||||
*
|
||||
* That said, we simply duplicate this array and schedule
|
||||
* the old copy for kfree after a grace period.
|
||||
*/
|
||||
|
||||
memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
|
||||
|
||||
rcu_assign_pointer(net->gen, ng);
|
||||
kfree_rcu(old_ng, rcu);
|
||||
assign:
|
||||
ng->ptr[id - 1] = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ops_init(const struct pernet_operations *ops, struct net *net)
|
||||
{
|
||||
int err = -ENOMEM;
|
||||
void *data = NULL;
|
||||
|
||||
if (ops->id && ops->size) {
|
||||
data = kzalloc(ops->size, GFP_KERNEL);
|
||||
if (!data)
|
||||
goto out;
|
||||
|
||||
err = net_assign_generic(net, *ops->id, data);
|
||||
if (err)
|
||||
goto cleanup;
|
||||
}
|
||||
err = 0;
|
||||
if (ops->init)
|
||||
err = ops->init(net);
|
||||
if (!err)
|
||||
return 0;
|
||||
|
||||
cleanup:
|
||||
kfree(data);
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ops_free(const struct pernet_operations *ops, struct net *net)
|
||||
{
|
||||
if (ops->id && ops->size) {
|
||||
int id = *ops->id;
|
||||
kfree(net_generic(net, id));
|
||||
}
|
||||
}
|
||||
|
||||
static void ops_exit_list(const struct pernet_operations *ops,
|
||||
struct list_head *net_exit_list)
|
||||
{
|
||||
struct net *net;
|
||||
if (ops->exit) {
|
||||
list_for_each_entry(net, net_exit_list, exit_list)
|
||||
ops->exit(net);
|
||||
}
|
||||
if (ops->exit_batch)
|
||||
ops->exit_batch(net_exit_list);
|
||||
}
|
||||
|
||||
static void ops_free_list(const struct pernet_operations *ops,
|
||||
struct list_head *net_exit_list)
|
||||
{
|
||||
struct net *net;
|
||||
if (ops->size && ops->id) {
|
||||
list_for_each_entry(net, net_exit_list, exit_list)
|
||||
ops_free(ops, net);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* setup_net runs the initializers for the network namespace object.
|
||||
*/
|
||||
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
|
||||
{
|
||||
/* Must be called with net_mutex held */
|
||||
const struct pernet_operations *ops, *saved_ops;
|
||||
int error = 0;
|
||||
LIST_HEAD(net_exit_list);
|
||||
|
||||
atomic_set(&net->count, 1);
|
||||
atomic_set(&net->passive, 1);
|
||||
net->dev_base_seq = 1;
|
||||
net->user_ns = user_ns;
|
||||
|
||||
#ifdef NETNS_REFCNT_DEBUG
|
||||
atomic_set(&net->use_count, 0);
|
||||
#endif
|
||||
|
||||
list_for_each_entry(ops, &pernet_list, list) {
|
||||
error = ops_init(ops, net);
|
||||
if (error < 0)
|
||||
goto out_undo;
|
||||
}
|
||||
out:
|
||||
return error;
|
||||
|
||||
out_undo:
|
||||
/* Walk through the list backwards calling the exit functions
|
||||
* for the pernet modules whose init functions did not fail.
|
||||
*/
|
||||
list_add(&net->exit_list, &net_exit_list);
|
||||
saved_ops = ops;
|
||||
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
|
||||
ops_exit_list(ops, &net_exit_list);
|
||||
|
||||
ops = saved_ops;
|
||||
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
|
||||
ops_free_list(ops, &net_exit_list);
|
||||
|
||||
rcu_barrier();
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_NET_NS
|
||||
static struct kmem_cache *net_cachep;
|
||||
static struct workqueue_struct *netns_wq;
|
||||
|
||||
static struct net *net_alloc(void)
|
||||
{
|
||||
struct net *net = NULL;
|
||||
struct net_generic *ng;
|
||||
|
||||
ng = net_alloc_generic();
|
||||
if (!ng)
|
||||
goto out;
|
||||
|
||||
net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
|
||||
if (!net)
|
||||
goto out_free;
|
||||
|
||||
rcu_assign_pointer(net->gen, ng);
|
||||
out:
|
||||
return net;
|
||||
|
||||
out_free:
|
||||
kfree(ng);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static void net_free(struct net *net)
|
||||
{
|
||||
#ifdef NETNS_REFCNT_DEBUG
|
||||
if (unlikely(atomic_read(&net->use_count) != 0)) {
|
||||
pr_emerg("network namespace not free! Usage: %d\n",
|
||||
atomic_read(&net->use_count));
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
kfree(rcu_access_pointer(net->gen));
|
||||
kmem_cache_free(net_cachep, net);
|
||||
}
|
||||
|
||||
void net_drop_ns(void *p)
|
||||
{
|
||||
struct net *ns = p;
|
||||
if (ns && atomic_dec_and_test(&ns->passive))
|
||||
net_free(ns);
|
||||
}
|
||||
|
||||
struct net *copy_net_ns(unsigned long flags,
|
||||
struct user_namespace *user_ns, struct net *old_net)
|
||||
{
|
||||
struct net *net;
|
||||
int rv;
|
||||
|
||||
if (!(flags & CLONE_NEWNET))
|
||||
return get_net(old_net);
|
||||
|
||||
net = net_alloc();
|
||||
if (!net)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
get_user_ns(user_ns);
|
||||
|
||||
mutex_lock(&net_mutex);
|
||||
rv = setup_net(net, user_ns);
|
||||
if (rv == 0) {
|
||||
rtnl_lock();
|
||||
list_add_tail_rcu(&net->list, &net_namespace_list);
|
||||
rtnl_unlock();
|
||||
}
|
||||
mutex_unlock(&net_mutex);
|
||||
if (rv < 0) {
|
||||
put_user_ns(user_ns);
|
||||
net_drop_ns(net);
|
||||
return ERR_PTR(rv);
|
||||
}
|
||||
return net;
|
||||
}
|
||||
|
||||
static DEFINE_SPINLOCK(cleanup_list_lock);
|
||||
static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
|
||||
|
||||
static void cleanup_net(struct work_struct *work)
|
||||
{
|
||||
const struct pernet_operations *ops;
|
||||
struct net *net, *tmp;
|
||||
struct list_head net_kill_list;
|
||||
LIST_HEAD(net_exit_list);
|
||||
|
||||
/* Atomically snapshot the list of namespaces to cleanup */
|
||||
spin_lock_irq(&cleanup_list_lock);
|
||||
list_replace_init(&cleanup_list, &net_kill_list);
|
||||
spin_unlock_irq(&cleanup_list_lock);
|
||||
|
||||
mutex_lock(&net_mutex);
|
||||
|
||||
/* Don't let anyone else find us. */
|
||||
rtnl_lock();
|
||||
list_for_each_entry(net, &net_kill_list, cleanup_list) {
|
||||
list_del_rcu(&net->list);
|
||||
list_add_tail(&net->exit_list, &net_exit_list);
|
||||
}
|
||||
rtnl_unlock();
|
||||
|
||||
/*
|
||||
* Another CPU might be rcu-iterating the list, wait for it.
|
||||
* This needs to be before calling the exit() notifiers, so
|
||||
* the rcu_barrier() below isn't sufficient alone.
|
||||
*/
|
||||
synchronize_rcu();
|
||||
|
||||
/* Run all of the network namespace exit methods */
|
||||
list_for_each_entry_reverse(ops, &pernet_list, list)
|
||||
ops_exit_list(ops, &net_exit_list);
|
||||
|
||||
/* Free the net generic variables */
|
||||
list_for_each_entry_reverse(ops, &pernet_list, list)
|
||||
ops_free_list(ops, &net_exit_list);
|
||||
|
||||
mutex_unlock(&net_mutex);
|
||||
|
||||
/* Ensure there are no outstanding rcu callbacks using this
|
||||
* network namespace.
|
||||
*/
|
||||
rcu_barrier();
|
||||
|
||||
/* Finally it is safe to free my network namespace structure */
|
||||
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
|
||||
list_del_init(&net->exit_list);
|
||||
put_user_ns(net->user_ns);
|
||||
net_drop_ns(net);
|
||||
}
|
||||
}
|
||||
static DECLARE_WORK(net_cleanup_work, cleanup_net);
|
||||
|
||||
void __put_net(struct net *net)
|
||||
{
|
||||
/* Cleanup the network namespace in process context */
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&cleanup_list_lock, flags);
|
||||
list_add(&net->cleanup_list, &cleanup_list);
|
||||
spin_unlock_irqrestore(&cleanup_list_lock, flags);
|
||||
|
||||
queue_work(netns_wq, &net_cleanup_work);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__put_net);
|
||||
|
||||
struct net *get_net_ns_by_fd(int fd)
|
||||
{
|
||||
struct proc_ns *ei;
|
||||
struct file *file;
|
||||
struct net *net;
|
||||
|
||||
file = proc_ns_fget(fd);
|
||||
if (IS_ERR(file))
|
||||
return ERR_CAST(file);
|
||||
|
||||
ei = get_proc_ns(file_inode(file));
|
||||
if (ei->ns_ops == &netns_operations)
|
||||
net = get_net(ei->ns);
|
||||
else
|
||||
net = ERR_PTR(-EINVAL);
|
||||
|
||||
fput(file);
|
||||
return net;
|
||||
}
|
||||
|
||||
#else
|
||||
struct net *get_net_ns_by_fd(int fd)
|
||||
{
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct net *get_net_ns_by_pid(pid_t pid)
|
||||
{
|
||||
struct task_struct *tsk;
|
||||
struct net *net;
|
||||
|
||||
/* Lookup the network namespace */
|
||||
net = ERR_PTR(-ESRCH);
|
||||
rcu_read_lock();
|
||||
tsk = find_task_by_vpid(pid);
|
||||
if (tsk) {
|
||||
struct nsproxy *nsproxy;
|
||||
task_lock(tsk);
|
||||
nsproxy = tsk->nsproxy;
|
||||
if (nsproxy)
|
||||
net = get_net(nsproxy->net_ns);
|
||||
task_unlock(tsk);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return net;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
|
||||
|
||||
static __net_init int net_ns_net_init(struct net *net)
|
||||
{
|
||||
return proc_alloc_inum(&net->proc_inum);
|
||||
}
|
||||
|
||||
static __net_exit void net_ns_net_exit(struct net *net)
|
||||
{
|
||||
proc_free_inum(net->proc_inum);
|
||||
}
|
||||
|
||||
static struct pernet_operations __net_initdata net_ns_ops = {
|
||||
.init = net_ns_net_init,
|
||||
.exit = net_ns_net_exit,
|
||||
};
|
||||
|
||||
static int __init net_ns_init(void)
|
||||
{
|
||||
struct net_generic *ng;
|
||||
|
||||
#ifdef CONFIG_NET_NS
|
||||
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
|
||||
SMP_CACHE_BYTES,
|
||||
SLAB_PANIC, NULL);
|
||||
|
||||
/* Create workqueue for cleanup */
|
||||
netns_wq = create_singlethread_workqueue("netns");
|
||||
if (!netns_wq)
|
||||
panic("Could not create netns workq");
|
||||
#endif
|
||||
|
||||
ng = net_alloc_generic();
|
||||
if (!ng)
|
||||
panic("Could not allocate generic netns");
|
||||
|
||||
rcu_assign_pointer(init_net.gen, ng);
|
||||
|
||||
mutex_lock(&net_mutex);
|
||||
if (setup_net(&init_net, &init_user_ns))
|
||||
panic("Could not setup the initial network namespace");
|
||||
|
||||
rtnl_lock();
|
||||
list_add_tail_rcu(&init_net.list, &net_namespace_list);
|
||||
rtnl_unlock();
|
||||
|
||||
mutex_unlock(&net_mutex);
|
||||
|
||||
register_pernet_subsys(&net_ns_ops);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
pure_initcall(net_ns_init);
|
||||
|
||||
#ifdef CONFIG_NET_NS
|
||||
static int __register_pernet_operations(struct list_head *list,
|
||||
struct pernet_operations *ops)
|
||||
{
|
||||
struct net *net;
|
||||
int error;
|
||||
LIST_HEAD(net_exit_list);
|
||||
|
||||
list_add_tail(&ops->list, list);
|
||||
if (ops->init || (ops->id && ops->size)) {
|
||||
for_each_net(net) {
|
||||
error = ops_init(ops, net);
|
||||
if (error)
|
||||
goto out_undo;
|
||||
list_add_tail(&net->exit_list, &net_exit_list);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
out_undo:
|
||||
/* If I have an error cleanup all namespaces I initialized */
|
||||
list_del(&ops->list);
|
||||
ops_exit_list(ops, &net_exit_list);
|
||||
ops_free_list(ops, &net_exit_list);
|
||||
return error;
|
||||
}
|
||||
|
||||
static void __unregister_pernet_operations(struct pernet_operations *ops)
|
||||
{
|
||||
struct net *net;
|
||||
LIST_HEAD(net_exit_list);
|
||||
|
||||
list_del(&ops->list);
|
||||
for_each_net(net)
|
||||
list_add_tail(&net->exit_list, &net_exit_list);
|
||||
ops_exit_list(ops, &net_exit_list);
|
||||
ops_free_list(ops, &net_exit_list);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static int __register_pernet_operations(struct list_head *list,
|
||||
struct pernet_operations *ops)
|
||||
{
|
||||
return ops_init(ops, &init_net);
|
||||
}
|
||||
|
||||
static void __unregister_pernet_operations(struct pernet_operations *ops)
|
||||
{
|
||||
LIST_HEAD(net_exit_list);
|
||||
list_add(&init_net.exit_list, &net_exit_list);
|
||||
ops_exit_list(ops, &net_exit_list);
|
||||
ops_free_list(ops, &net_exit_list);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_NET_NS */
|
||||
|
||||
static DEFINE_IDA(net_generic_ids);
|
||||
|
||||
static int register_pernet_operations(struct list_head *list,
|
||||
struct pernet_operations *ops)
|
||||
{
|
||||
int error;
|
||||
|
||||
if (ops->id) {
|
||||
again:
|
||||
error = ida_get_new_above(&net_generic_ids, 1, ops->id);
|
||||
if (error < 0) {
|
||||
if (error == -EAGAIN) {
|
||||
ida_pre_get(&net_generic_ids, GFP_KERNEL);
|
||||
goto again;
|
||||
}
|
||||
return error;
|
||||
}
|
||||
max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
|
||||
}
|
||||
error = __register_pernet_operations(list, ops);
|
||||
if (error) {
|
||||
rcu_barrier();
|
||||
if (ops->id)
|
||||
ida_remove(&net_generic_ids, *ops->id);
|
||||
}
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static void unregister_pernet_operations(struct pernet_operations *ops)
|
||||
{
|
||||
|
||||
__unregister_pernet_operations(ops);
|
||||
rcu_barrier();
|
||||
if (ops->id)
|
||||
ida_remove(&net_generic_ids, *ops->id);
|
||||
}
|
||||
|
||||
/**
|
||||
* register_pernet_subsys - register a network namespace subsystem
|
||||
* @ops: pernet operations structure for the subsystem
|
||||
*
|
||||
* Register a subsystem which has init and exit functions
|
||||
* that are called when network namespaces are created and
|
||||
* destroyed respectively.
|
||||
*
|
||||
* When registered all network namespace init functions are
|
||||
* called for every existing network namespace. Allowing kernel
|
||||
* modules to have a race free view of the set of network namespaces.
|
||||
*
|
||||
* When a new network namespace is created all of the init
|
||||
* methods are called in the order in which they were registered.
|
||||
*
|
||||
* When a network namespace is destroyed all of the exit methods
|
||||
* are called in the reverse of the order with which they were
|
||||
* registered.
|
||||
*/
|
||||
int register_pernet_subsys(struct pernet_operations *ops)
|
||||
{
|
||||
int error;
|
||||
mutex_lock(&net_mutex);
|
||||
error = register_pernet_operations(first_device, ops);
|
||||
mutex_unlock(&net_mutex);
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_pernet_subsys);
|
||||
|
||||
/**
|
||||
* unregister_pernet_subsys - unregister a network namespace subsystem
|
||||
* @ops: pernet operations structure to manipulate
|
||||
*
|
||||
* Remove the pernet operations structure from the list to be
|
||||
* used when network namespaces are created or destroyed. In
|
||||
* addition run the exit method for all existing network
|
||||
* namespaces.
|
||||
*/
|
||||
void unregister_pernet_subsys(struct pernet_operations *ops)
|
||||
{
|
||||
mutex_lock(&net_mutex);
|
||||
unregister_pernet_operations(ops);
|
||||
mutex_unlock(&net_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
|
||||
|
||||
/**
|
||||
* register_pernet_device - register a network namespace device
|
||||
* @ops: pernet operations structure for the subsystem
|
||||
*
|
||||
* Register a device which has init and exit functions
|
||||
* that are called when network namespaces are created and
|
||||
* destroyed respectively.
|
||||
*
|
||||
* When registered all network namespace init functions are
|
||||
* called for every existing network namespace. Allowing kernel
|
||||
* modules to have a race free view of the set of network namespaces.
|
||||
*
|
||||
* When a new network namespace is created all of the init
|
||||
* methods are called in the order in which they were registered.
|
||||
*
|
||||
* When a network namespace is destroyed all of the exit methods
|
||||
* are called in the reverse of the order with which they were
|
||||
* registered.
|
||||
*/
|
||||
int register_pernet_device(struct pernet_operations *ops)
|
||||
{
|
||||
int error;
|
||||
mutex_lock(&net_mutex);
|
||||
error = register_pernet_operations(&pernet_list, ops);
|
||||
if (!error && (first_device == &pernet_list))
|
||||
first_device = &ops->list;
|
||||
mutex_unlock(&net_mutex);
|
||||
return error;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_pernet_device);
|
||||
|
||||
/**
|
||||
* unregister_pernet_device - unregister a network namespace netdevice
|
||||
* @ops: pernet operations structure to manipulate
|
||||
*
|
||||
* Remove the pernet operations structure from the list to be
|
||||
* used when network namespaces are created or destroyed. In
|
||||
* addition run the exit method for all existing network
|
||||
* namespaces.
|
||||
*/
|
||||
void unregister_pernet_device(struct pernet_operations *ops)
|
||||
{
|
||||
mutex_lock(&net_mutex);
|
||||
if (&ops->list == first_device)
|
||||
first_device = first_device->next;
|
||||
unregister_pernet_operations(ops);
|
||||
mutex_unlock(&net_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_pernet_device);
|
||||
|
||||
#ifdef CONFIG_NET_NS
|
||||
static void *netns_get(struct task_struct *task)
|
||||
{
|
||||
struct net *net = NULL;
|
||||
struct nsproxy *nsproxy;
|
||||
|
||||
task_lock(task);
|
||||
nsproxy = task->nsproxy;
|
||||
if (nsproxy)
|
||||
net = get_net(nsproxy->net_ns);
|
||||
task_unlock(task);
|
||||
|
||||
return net;
|
||||
}
|
||||
|
||||
static void netns_put(void *ns)
|
||||
{
|
||||
put_net(ns);
|
||||
}
|
||||
|
||||
static int netns_install(struct nsproxy *nsproxy, void *ns)
|
||||
{
|
||||
struct net *net = ns;
|
||||
|
||||
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
|
||||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
put_net(nsproxy->net_ns);
|
||||
nsproxy->net_ns = get_net(net);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int netns_inum(void *ns)
|
||||
{
|
||||
struct net *net = ns;
|
||||
return net->proc_inum;
|
||||
}
|
||||
|
||||
const struct proc_ns_operations netns_operations = {
|
||||
.name = "net",
|
||||
.type = CLONE_NEWNET,
|
||||
.get = netns_get,
|
||||
.put = netns_put,
|
||||
.install = netns_install,
|
||||
.inum = netns_inum,
|
||||
};
|
||||
#endif
|
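To make the register_pernet_subsys() contract above concrete, here is a hedged sketch of a per-net subsystem using the ops->id/ops->size path handled by ops_init(); every name is a placeholder, not part of this commit.

/* Sketch only: a per-net subsystem storing private data via net_generic(). */
struct example_pernet {
	int packets_seen;
};

static int example_net_id;

static __net_init int example_net_init(struct net *net)
{
	struct example_pernet *ep = net_generic(net, example_net_id);

	ep->packets_seen = 0;	/* already zeroed by ops_init(), shown for clarity */
	return 0;
}

static __net_exit void example_net_exit(struct net *net)
{
	/* the per-net block itself is freed by ops_free() */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,
	.size = sizeof(struct example_pernet),
};

/* register_pernet_subsys(&example_net_ops) at module init,
 * unregister_pernet_subsys(&example_net_ops) on exit. */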
111
net/core/netclassid_cgroup.c
Normal file
@@ -0,0 +1,111 @@
/*
|
||||
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <net/cls_cgroup.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
|
||||
}
|
||||
|
||||
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
|
||||
{
|
||||
return css_cls_state(task_css(p, net_cls_cgrp_id));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(task_cls_state);
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct cgroup_cls_state *cs;
|
||||
|
||||
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
|
||||
if (!cs)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
return &cs->css;
|
||||
}
|
||||
|
||||
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct cgroup_cls_state *cs = css_cls_state(css);
|
||||
struct cgroup_cls_state *parent = css_cls_state(css->parent);
|
||||
|
||||
if (parent)
|
||||
cs->classid = parent->classid;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cgrp_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
kfree(css_cls_state(css));
|
||||
}
|
||||
|
||||
static int update_classid(const void *v, struct file *file, unsigned n)
|
||||
{
|
||||
int err;
|
||||
struct socket *sock = sock_from_file(file, &err);
|
||||
|
||||
if (sock)
|
||||
sock->sk->sk_classid = (u32)(unsigned long)v;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cgrp_attach(struct cgroup_subsys_state *css,
|
||||
struct cgroup_taskset *tset)
|
||||
{
|
||||
struct cgroup_cls_state *cs = css_cls_state(css);
|
||||
void *v = (void *)(unsigned long)cs->classid;
|
||||
struct task_struct *p;
|
||||
|
||||
cgroup_taskset_for_each(p, tset) {
|
||||
task_lock(p);
|
||||
iterate_fd(p->files, 0, update_classid, v);
|
||||
task_unlock(p);
|
||||
}
|
||||
}
|
||||
|
||||
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
return css_cls_state(css)->classid;
|
||||
}
|
||||
|
||||
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
|
||||
u64 value)
|
||||
{
|
||||
css_cls_state(css)->classid = (u32) value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cftype ss_files[] = {
|
||||
{
|
||||
.name = "classid",
|
||||
.read_u64 = read_classid,
|
||||
.write_u64 = write_classid,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
struct cgroup_subsys net_cls_cgrp_subsys = {
|
||||
.css_alloc = cgrp_css_alloc,
|
||||
.css_online = cgrp_css_online,
|
||||
.css_free = cgrp_css_free,
|
||||
.attach = cgrp_attach,
|
||||
.legacy_cftypes = ss_files,
|
||||
};
|
70
net/core/netevent.c
Normal file
@@ -0,0 +1,70 @@
/*
 *	Network event notifiers
 *
 *	Authors:
 *	Tom Tucker <tom@opengridcomputing.com>
 *	Steve Wise <swise@opengridcomputing.com>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 */

#include <linux/rtnetlink.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <net/netevent.h>

static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);

/**
 * register_netevent_notifier - register a netevent notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when a netevent occurs.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 */
int register_netevent_notifier(struct notifier_block *nb)
{
	int err;

	err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
	return err;
}
EXPORT_SYMBOL_GPL(register_netevent_notifier);

/**
 * netevent_unregister_notifier - unregister a netevent notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_neigh_notifier(). The notifier is unlinked into the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 */

int unregister_netevent_notifier(struct notifier_block *nb)
{
	return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_netevent_notifier);

/**
 * call_netevent_notifiers - call all netevent notifier blocks
 * @val: value passed unmodified to notifier function
 * @v: pointer passed unmodified to notifier function
 *
 * Call all neighbour notifier blocks. Parameters and return value
 * are as for notifier_call_chain().
 */

int call_netevent_notifiers(unsigned long val, void *v)
{
	return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
}
EXPORT_SYMBOL_GPL(call_netevent_notifiers);
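A consumer of this chain just embeds a notifier_block and registers it. A minimal sketch, assuming the NETEVENT_NEIGH_UPDATE event defined in net/netevent.h and made-up handler names:

/* Sketch only: receiving netevents published via call_netevent_notifiers(). */
static int example_netevent_cb(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	if (event == NETEVENT_NEIGH_UPDATE) {
		/* ptr is the struct neighbour that was updated */
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netevent_nb = {
	.notifier_call = example_netevent_cb,
};

/* register_netevent_notifier(&example_netevent_nb) at init,
 * unregister_netevent_notifier(&example_netevent_nb) on teardown. */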
852
net/core/netpoll.c
Normal file
@@ -0,0 +1,852 @@
/*
|
||||
* Common framework for low-level network console, dump, and debugger code
|
||||
*
|
||||
* Sep 8 2003 Matt Mackall <mpm@selenic.com>
|
||||
*
|
||||
* based on the netconsole code from:
|
||||
*
|
||||
* Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
|
||||
* Copyright (C) 2002 Red Hat, Inc.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/netpoll.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/addrconf.h>
|
||||
#include <net/ndisc.h>
|
||||
#include <net/ip6_checksum.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <trace/events/napi.h>
|
||||
|
||||
/*
|
||||
* We maintain a small pool of fully-sized skbs, to make sure the
|
||||
* message gets out even in extreme OOM situations.
|
||||
*/
|
||||
|
||||
#define MAX_UDP_CHUNK 1460
|
||||
#define MAX_SKBS 32
|
||||
|
||||
static struct sk_buff_head skb_pool;
|
||||
|
||||
DEFINE_STATIC_SRCU(netpoll_srcu);
|
||||
|
||||
#define USEC_PER_POLL 50
|
||||
|
||||
#define MAX_SKB_SIZE \
|
||||
(sizeof(struct ethhdr) + \
|
||||
sizeof(struct iphdr) + \
|
||||
sizeof(struct udphdr) + \
|
||||
MAX_UDP_CHUNK)
|
||||
|
||||
static void zap_completion_queue(void);
|
||||
static void netpoll_async_cleanup(struct work_struct *work);
|
||||
|
||||
static unsigned int carrier_timeout = 4;
|
||||
module_param(carrier_timeout, uint, 0644);
|
||||
|
||||
#define np_info(np, fmt, ...) \
|
||||
pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
|
||||
#define np_err(np, fmt, ...) \
|
||||
pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
|
||||
#define np_notice(np, fmt, ...) \
|
||||
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
|
||||
|
||||
static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
|
||||
struct netdev_queue *txq)
|
||||
{
|
||||
int status = NETDEV_TX_OK;
|
||||
netdev_features_t features;
|
||||
|
||||
features = netif_skb_features(skb);
|
||||
|
||||
if (vlan_tx_tag_present(skb) &&
|
||||
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
|
||||
skb = __vlan_hwaccel_push_inside(skb);
|
||||
if (unlikely(!skb)) {
|
||||
/* This is actually a packet drop, but we
|
||||
* don't want the code that calls this
|
||||
* function to try and operate on a NULL skb.
|
||||
*/
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
status = netdev_start_xmit(skb, dev, txq, false);
|
||||
|
||||
out:
|
||||
return status;
|
||||
}
|
||||
|
||||
static void queue_process(struct work_struct *work)
|
||||
{
|
||||
struct netpoll_info *npinfo =
|
||||
container_of(work, struct netpoll_info, tx_work.work);
|
||||
struct sk_buff *skb;
|
||||
unsigned long flags;
|
||||
|
||||
while ((skb = skb_dequeue(&npinfo->txq))) {
|
||||
struct net_device *dev = skb->dev;
|
||||
struct netdev_queue *txq;
|
||||
|
||||
if (!netif_device_present(dev) || !netif_running(dev)) {
|
||||
kfree_skb(skb);
|
||||
continue;
|
||||
}
|
||||
|
||||
txq = skb_get_tx_queue(dev, skb);
|
||||
|
||||
local_irq_save(flags);
|
||||
HARD_TX_LOCK(dev, txq, smp_processor_id());
|
||||
if (netif_xmit_frozen_or_stopped(txq) ||
|
||||
netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
|
||||
skb_queue_head(&npinfo->txq, skb);
|
||||
HARD_TX_UNLOCK(dev, txq);
|
||||
local_irq_restore(flags);
|
||||
|
||||
schedule_delayed_work(&npinfo->tx_work, HZ/10);
|
||||
return;
|
||||
}
|
||||
HARD_TX_UNLOCK(dev, txq);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether delayed processing was scheduled for our NIC. If so,
|
||||
* we attempt to grab the poll lock and use ->poll() to pump the card.
|
||||
* If this fails, either we've recursed in ->poll() or it's already
|
||||
* running on another CPU.
|
||||
*
|
||||
* Note: we don't mask interrupts with this lock because we're using
|
||||
* trylock here and interrupts are already disabled in the softirq
|
||||
* case. Further, we test the poll_owner to avoid recursion on UP
|
||||
* systems where the lock doesn't exist.
|
||||
*/
|
||||
static int poll_one_napi(struct napi_struct *napi, int budget)
|
||||
{
|
||||
int work;
|
||||
|
||||
/* net_rx_action's ->poll() invocations and our's are
|
||||
* synchronized by this test which is only made while
|
||||
* holding the napi->poll_lock.
|
||||
*/
|
||||
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
|
||||
return budget;
|
||||
|
||||
set_bit(NAPI_STATE_NPSVC, &napi->state);
|
||||
|
||||
work = napi->poll(napi, budget);
|
||||
WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
|
||||
trace_napi_poll(napi);
|
||||
|
||||
clear_bit(NAPI_STATE_NPSVC, &napi->state);
|
||||
|
||||
return budget - work;
|
||||
}
|
||||
|
||||
static void poll_napi(struct net_device *dev, int budget)
|
||||
{
|
||||
struct napi_struct *napi;
|
||||
|
||||
list_for_each_entry(napi, &dev->napi_list, dev_list) {
|
||||
if (napi->poll_owner != smp_processor_id() &&
|
||||
spin_trylock(&napi->poll_lock)) {
|
||||
budget = poll_one_napi(napi, budget);
|
||||
spin_unlock(&napi->poll_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void netpoll_poll_dev(struct net_device *dev)
|
||||
{
|
||||
const struct net_device_ops *ops;
|
||||
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
|
||||
int budget = 0;
|
||||
|
||||
/* Don't do any rx activity if the dev_lock mutex is held
|
||||
* the dev_open/close paths use this to block netpoll activity
|
||||
* while changing device state
|
||||
*/
|
||||
if (down_trylock(&ni->dev_lock))
|
||||
return;
|
||||
|
||||
if (!netif_running(dev)) {
|
||||
up(&ni->dev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
ops = dev->netdev_ops;
|
||||
if (!ops->ndo_poll_controller) {
|
||||
up(&ni->dev_lock);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Process pending work on NIC */
|
||||
ops->ndo_poll_controller(dev);
|
||||
|
||||
poll_napi(dev, budget);
|
||||
|
||||
up(&ni->dev_lock);
|
||||
|
||||
zap_completion_queue();
|
||||
}
|
||||
|
||||
void netpoll_poll_disable(struct net_device *dev)
|
||||
{
|
||||
struct netpoll_info *ni;
|
||||
int idx;
|
||||
might_sleep();
|
||||
idx = srcu_read_lock(&netpoll_srcu);
|
||||
ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
|
||||
if (ni)
|
||||
down(&ni->dev_lock);
|
||||
srcu_read_unlock(&netpoll_srcu, idx);
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_poll_disable);
|
||||
|
||||
void netpoll_poll_enable(struct net_device *dev)
|
||||
{
|
||||
struct netpoll_info *ni;
|
||||
rcu_read_lock();
|
||||
ni = rcu_dereference(dev->npinfo);
|
||||
if (ni)
|
||||
up(&ni->dev_lock);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_poll_enable);
|
||||
|
||||
static void refill_skbs(void)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&skb_pool.lock, flags);
|
||||
while (skb_pool.qlen < MAX_SKBS) {
|
||||
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
|
||||
if (!skb)
|
||||
break;
|
||||
|
||||
__skb_queue_tail(&skb_pool, skb);
|
||||
}
|
||||
spin_unlock_irqrestore(&skb_pool.lock, flags);
|
||||
}
|
||||
|
||||
static void zap_completion_queue(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct softnet_data *sd = &get_cpu_var(softnet_data);
|
||||
|
||||
if (sd->completion_queue) {
|
||||
struct sk_buff *clist;
|
||||
|
||||
local_irq_save(flags);
|
||||
clist = sd->completion_queue;
|
||||
sd->completion_queue = NULL;
|
||||
local_irq_restore(flags);
|
||||
|
||||
while (clist != NULL) {
|
||||
struct sk_buff *skb = clist;
|
||||
clist = clist->next;
|
||||
if (!skb_irq_freeable(skb)) {
|
||||
atomic_inc(&skb->users);
|
||||
dev_kfree_skb_any(skb); /* put this one back */
|
||||
} else {
|
||||
__kfree_skb(skb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
put_cpu_var(softnet_data);
|
||||
}
|
||||
|
||||
static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
|
||||
{
|
||||
int count = 0;
|
||||
struct sk_buff *skb;
|
||||
|
||||
zap_completion_queue();
|
||||
refill_skbs();
|
||||
repeat:
|
||||
|
||||
skb = alloc_skb(len, GFP_ATOMIC);
|
||||
if (!skb)
|
||||
skb = skb_dequeue(&skb_pool);
|
||||
|
||||
if (!skb) {
|
||||
if (++count < 10) {
|
||||
netpoll_poll_dev(np->dev);
|
||||
goto repeat;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
atomic_set(&skb->users, 1);
|
||||
skb_reserve(skb, reserve);
|
||||
return skb;
|
||||
}
|
||||
|
||||
static int netpoll_owner_active(struct net_device *dev)
|
||||
{
|
||||
struct napi_struct *napi;
|
||||
|
||||
list_for_each_entry(napi, &dev->napi_list, dev_list) {
|
||||
if (napi->poll_owner == smp_processor_id())
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* call with IRQ disabled */
|
||||
void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
|
||||
struct net_device *dev)
|
||||
{
|
||||
int status = NETDEV_TX_BUSY;
|
||||
unsigned long tries;
|
||||
/* It is up to the caller to keep npinfo alive. */
|
||||
struct netpoll_info *npinfo;
|
||||
|
||||
WARN_ON_ONCE(!irqs_disabled());
|
||||
|
||||
npinfo = rcu_dereference_bh(np->dev->npinfo);
|
||||
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
|
||||
dev_kfree_skb_irq(skb);
|
||||
return;
|
||||
}
|
||||
|
||||
/* don't get messages out of order, and no recursion */
|
||||
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
|
||||
struct netdev_queue *txq;
|
||||
|
||||
txq = netdev_pick_tx(dev, skb, NULL);
|
||||
|
||||
/* try until next clock tick */
|
||||
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
|
||||
tries > 0; --tries) {
|
||||
if (HARD_TX_TRYLOCK(dev, txq)) {
|
||||
if (!netif_xmit_stopped(txq))
|
||||
status = netpoll_start_xmit(skb, dev, txq);
|
||||
|
||||
HARD_TX_UNLOCK(dev, txq);
|
||||
|
||||
if (status == NETDEV_TX_OK)
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
/* tickle device maybe there is some cleanup */
|
||||
netpoll_poll_dev(np->dev);
|
||||
|
||||
udelay(USEC_PER_POLL);
|
||||
}
|
||||
|
||||
WARN_ONCE(!irqs_disabled(),
|
||||
"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
|
||||
dev->name, dev->netdev_ops->ndo_start_xmit);
|
||||
|
||||
}
|
||||
|
||||
if (status != NETDEV_TX_OK) {
|
||||
skb_queue_tail(&npinfo->txq, skb);
|
||||
schedule_delayed_work(&npinfo->tx_work,0);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_send_skb_on_dev);
|
||||
|
||||
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
|
||||
{
|
||||
int total_len, ip_len, udp_len;
|
||||
struct sk_buff *skb;
|
||||
struct udphdr *udph;
|
||||
struct iphdr *iph;
|
||||
struct ethhdr *eth;
|
||||
static atomic_t ip_ident;
|
||||
struct ipv6hdr *ip6h;
|
||||
|
||||
udp_len = len + sizeof(*udph);
|
||||
if (np->ipv6)
|
||||
ip_len = udp_len + sizeof(*ip6h);
|
||||
else
|
||||
ip_len = udp_len + sizeof(*iph);
|
||||
|
||||
total_len = ip_len + LL_RESERVED_SPACE(np->dev);
|
||||
|
||||
skb = find_skb(np, total_len + np->dev->needed_tailroom,
|
||||
total_len - len);
|
||||
if (!skb)
|
||||
return;
|
||||
|
||||
skb_copy_to_linear_data(skb, msg, len);
|
||||
skb_put(skb, len);
|
||||
|
||||
skb_push(skb, sizeof(*udph));
|
||||
skb_reset_transport_header(skb);
|
||||
udph = udp_hdr(skb);
|
||||
udph->source = htons(np->local_port);
|
||||
udph->dest = htons(np->remote_port);
|
||||
udph->len = htons(udp_len);
|
||||
|
||||
if (np->ipv6) {
|
||||
udph->check = 0;
|
||||
udph->check = csum_ipv6_magic(&np->local_ip.in6,
|
||||
&np->remote_ip.in6,
|
||||
udp_len, IPPROTO_UDP,
|
||||
csum_partial(udph, udp_len, 0));
|
||||
if (udph->check == 0)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
|
||||
skb_push(skb, sizeof(*ip6h));
|
||||
skb_reset_network_header(skb);
|
||||
ip6h = ipv6_hdr(skb);
|
||||
|
||||
/* ip6h->version = 6; ip6h->priority = 0; */
|
||||
put_unaligned(0x60, (unsigned char *)ip6h);
|
||||
ip6h->flow_lbl[0] = 0;
|
||||
ip6h->flow_lbl[1] = 0;
|
||||
ip6h->flow_lbl[2] = 0;
|
||||
|
||||
ip6h->payload_len = htons(sizeof(struct udphdr) + len);
|
||||
ip6h->nexthdr = IPPROTO_UDP;
|
||||
ip6h->hop_limit = 32;
|
||||
ip6h->saddr = np->local_ip.in6;
|
||||
ip6h->daddr = np->remote_ip.in6;
|
||||
|
||||
eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
|
||||
skb_reset_mac_header(skb);
|
||||
skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
|
||||
} else {
|
||||
udph->check = 0;
|
||||
udph->check = csum_tcpudp_magic(np->local_ip.ip,
|
||||
np->remote_ip.ip,
|
||||
udp_len, IPPROTO_UDP,
|
||||
csum_partial(udph, udp_len, 0));
|
||||
if (udph->check == 0)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
|
||||
skb_push(skb, sizeof(*iph));
|
||||
skb_reset_network_header(skb);
|
||||
iph = ip_hdr(skb);
|
||||
|
||||
/* iph->version = 4; iph->ihl = 5; */
|
||||
put_unaligned(0x45, (unsigned char *)iph);
|
||||
iph->tos = 0;
|
||||
put_unaligned(htons(ip_len), &(iph->tot_len));
|
||||
iph->id = htons(atomic_inc_return(&ip_ident));
|
||||
iph->frag_off = 0;
|
||||
iph->ttl = 64;
|
||||
iph->protocol = IPPROTO_UDP;
|
||||
iph->check = 0;
|
||||
put_unaligned(np->local_ip.ip, &(iph->saddr));
|
||||
put_unaligned(np->remote_ip.ip, &(iph->daddr));
|
||||
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
|
||||
|
||||
eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
|
||||
skb_reset_mac_header(skb);
|
||||
skb->protocol = eth->h_proto = htons(ETH_P_IP);
|
||||
}
|
||||
|
||||
ether_addr_copy(eth->h_source, np->dev->dev_addr);
|
||||
ether_addr_copy(eth->h_dest, np->remote_mac);
|
||||
|
||||
skb->dev = np->dev;
|
||||
|
||||
netpoll_send_skb(np, skb);
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_send_udp);
|
||||
|
||||
void netpoll_print_options(struct netpoll *np)
|
||||
{
|
||||
np_info(np, "local port %d\n", np->local_port);
|
||||
if (np->ipv6)
|
||||
np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
|
||||
else
|
||||
np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
|
||||
np_info(np, "interface '%s'\n", np->dev_name);
|
||||
np_info(np, "remote port %d\n", np->remote_port);
|
||||
if (np->ipv6)
|
||||
np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
|
||||
else
|
||||
np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
|
||||
np_info(np, "remote ethernet address %pM\n", np->remote_mac);
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_print_options);
|
||||
|
||||
static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
|
||||
{
|
||||
const char *end;
|
||||
|
||||
if (!strchr(str, ':') &&
|
||||
in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
|
||||
if (!*end)
|
||||
return 0;
|
||||
}
|
||||
if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
if (!*end)
|
||||
return 1;
|
||||
#else
|
||||
return -1;
|
||||
#endif
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int netpoll_parse_options(struct netpoll *np, char *opt)
|
||||
{
|
||||
char *cur=opt, *delim;
|
||||
int ipv6;
|
||||
bool ipversion_set = false;
|
||||
|
||||
if (*cur != '@') {
|
||||
if ((delim = strchr(cur, '@')) == NULL)
|
||||
goto parse_failed;
|
||||
*delim = 0;
|
||||
if (kstrtou16(cur, 10, &np->local_port))
|
||||
goto parse_failed;
|
||||
cur = delim;
|
||||
}
|
||||
cur++;
|
||||
|
||||
if (*cur != '/') {
|
||||
ipversion_set = true;
|
||||
if ((delim = strchr(cur, '/')) == NULL)
|
||||
goto parse_failed;
|
||||
*delim = 0;
|
||||
ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
|
||||
if (ipv6 < 0)
|
||||
goto parse_failed;
|
||||
else
|
||||
np->ipv6 = (bool)ipv6;
|
||||
cur = delim;
|
||||
}
|
||||
cur++;
|
||||
|
||||
if (*cur != ',') {
|
||||
/* parse out dev name */
|
||||
if ((delim = strchr(cur, ',')) == NULL)
|
||||
goto parse_failed;
|
||||
*delim = 0;
|
||||
strlcpy(np->dev_name, cur, sizeof(np->dev_name));
|
||||
cur = delim;
|
||||
}
|
||||
cur++;
|
||||
|
||||
if (*cur != '@') {
|
||||
/* dst port */
|
||||
if ((delim = strchr(cur, '@')) == NULL)
|
||||
goto parse_failed;
|
||||
*delim = 0;
|
||||
if (*cur == ' ' || *cur == '\t')
|
||||
np_info(np, "warning: whitespace is not allowed\n");
|
||||
if (kstrtou16(cur, 10, &np->remote_port))
|
||||
goto parse_failed;
|
||||
cur = delim;
|
||||
}
|
||||
cur++;
|
||||
|
||||
/* dst ip */
|
||||
if ((delim = strchr(cur, '/')) == NULL)
|
||||
goto parse_failed;
|
||||
*delim = 0;
|
||||
ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
|
||||
if (ipv6 < 0)
|
||||
goto parse_failed;
|
||||
else if (ipversion_set && np->ipv6 != (bool)ipv6)
|
||||
goto parse_failed;
|
||||
else
|
||||
np->ipv6 = (bool)ipv6;
|
||||
cur = delim + 1;
|
||||
|
||||
if (*cur != 0) {
|
||||
/* MAC address */
|
||||
if (!mac_pton(cur, np->remote_mac))
|
||||
goto parse_failed;
|
||||
}
|
||||
|
||||
netpoll_print_options(np);
|
||||
|
||||
return 0;
|
||||
|
||||
parse_failed:
|
||||
np_info(np, "couldn't parse config at '%s'!\n", cur);
|
||||
return -1;
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_parse_options);
|
||||
|
||||
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
|
||||
{
|
||||
struct netpoll_info *npinfo;
|
||||
const struct net_device_ops *ops;
|
||||
int err;
|
||||
|
||||
np->dev = ndev;
|
||||
strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
|
||||
INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
|
||||
|
||||
if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
|
||||
!ndev->netdev_ops->ndo_poll_controller) {
|
||||
np_err(np, "%s doesn't support polling, aborting\n",
|
||||
np->dev_name);
|
||||
err = -ENOTSUPP;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!ndev->npinfo) {
|
||||
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
|
||||
if (!npinfo) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
sema_init(&npinfo->dev_lock, 1);
|
||||
skb_queue_head_init(&npinfo->txq);
|
||||
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
|
||||
|
||||
atomic_set(&npinfo->refcnt, 1);
|
||||
|
||||
ops = np->dev->netdev_ops;
|
||||
if (ops->ndo_netpoll_setup) {
|
||||
err = ops->ndo_netpoll_setup(ndev, npinfo);
|
||||
if (err)
|
||||
goto free_npinfo;
|
||||
}
|
||||
} else {
|
||||
npinfo = rtnl_dereference(ndev->npinfo);
|
||||
atomic_inc(&npinfo->refcnt);
|
||||
}
|
||||
|
||||
npinfo->netpoll = np;
|
||||
|
||||
/* last thing to do is link it to the net device structure */
|
||||
rcu_assign_pointer(ndev->npinfo, npinfo);
|
||||
|
||||
return 0;
|
||||
|
||||
free_npinfo:
|
||||
kfree(npinfo);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__netpoll_setup);
|
||||
|
||||
int netpoll_setup(struct netpoll *np)
|
||||
{
|
||||
struct net_device *ndev = NULL;
|
||||
struct in_device *in_dev;
|
||||
int err;
|
||||
|
||||
rtnl_lock();
|
||||
if (np->dev_name) {
|
||||
struct net *net = current->nsproxy->net_ns;
|
||||
ndev = __dev_get_by_name(net, np->dev_name);
|
||||
}
|
||||
if (!ndev) {
|
||||
np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
|
||||
err = -ENODEV;
|
||||
goto unlock;
|
||||
}
|
||||
dev_hold(ndev);
|
||||
|
||||
if (netdev_master_upper_dev_get(ndev)) {
|
||||
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
|
||||
err = -EBUSY;
|
||||
goto put;
|
||||
}
|
||||
|
||||
if (!netif_running(ndev)) {
|
||||
unsigned long atmost, atleast;
|
||||
|
||||
np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
|
||||
|
||||
err = dev_open(ndev);
|
||||
|
||||
if (err) {
|
||||
np_err(np, "failed to open %s\n", ndev->name);
|
||||
goto put;
|
||||
}
|
||||
|
||||
rtnl_unlock();
|
||||
atleast = jiffies + HZ/10;
|
||||
atmost = jiffies + carrier_timeout * HZ;
|
||||
while (!netif_carrier_ok(ndev)) {
|
||||
if (time_after(jiffies, atmost)) {
|
||||
np_notice(np, "timeout waiting for carrier\n");
|
||||
break;
|
||||
}
|
||||
msleep(1);
|
||||
}
|
||||
|
||||
/* If carrier appears to come up instantly, we don't
|
||||
* trust it and pause so that we don't pump all our
|
||||
* queued console messages into the bitbucket.
|
||||
*/
|
||||
|
||||
if (time_before(jiffies, atleast)) {
|
||||
np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
|
||||
msleep(4000);
|
||||
}
|
||||
rtnl_lock();
|
||||
}
|
||||
|
||||
if (!np->local_ip.ip) {
|
||||
if (!np->ipv6) {
|
||||
in_dev = __in_dev_get_rtnl(ndev);
|
||||
|
||||
if (!in_dev || !in_dev->ifa_list) {
|
||||
np_err(np, "no IP address for %s, aborting\n",
|
||||
np->dev_name);
|
||||
err = -EDESTADDRREQ;
|
||||
goto put;
|
||||
}
|
||||
|
||||
np->local_ip.ip = in_dev->ifa_list->ifa_local;
|
||||
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
|
||||
} else {
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
struct inet6_dev *idev;
|
||||
|
||||
err = -EDESTADDRREQ;
|
||||
idev = __in6_dev_get(ndev);
|
||||
if (idev) {
|
||||
struct inet6_ifaddr *ifp;
|
||||
|
||||
read_lock_bh(&idev->lock);
|
||||
list_for_each_entry(ifp, &idev->addr_list, if_list) {
|
||||
if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
|
||||
continue;
|
||||
np->local_ip.in6 = ifp->addr;
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
read_unlock_bh(&idev->lock);
|
||||
}
|
||||
if (err) {
|
||||
np_err(np, "no IPv6 address for %s, aborting\n",
|
||||
np->dev_name);
|
||||
goto put;
|
||||
} else
|
||||
np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
|
||||
#else
|
||||
np_err(np, "IPv6 is not supported %s, aborting\n",
|
||||
np->dev_name);
|
||||
err = -EINVAL;
|
||||
goto put;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/* fill up the skb queue */
|
||||
refill_skbs();
|
||||
|
||||
err = __netpoll_setup(np, ndev);
|
||||
if (err)
|
||||
goto put;
|
||||
|
||||
rtnl_unlock();
|
||||
return 0;
|
||||
|
||||
put:
|
||||
dev_put(ndev);
|
||||
unlock:
|
||||
rtnl_unlock();
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_setup);
|
||||
|
||||
static int __init netpoll_init(void)
|
||||
{
|
||||
skb_queue_head_init(&skb_pool);
|
||||
return 0;
|
||||
}
|
||||
core_initcall(netpoll_init);
|
||||
|
||||
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
|
||||
{
|
||||
struct netpoll_info *npinfo =
|
||||
container_of(rcu_head, struct netpoll_info, rcu);
|
||||
|
||||
skb_queue_purge(&npinfo->txq);
|
||||
|
||||
/* we can't call cancel_delayed_work_sync here, as we are in softirq */
|
||||
cancel_delayed_work(&npinfo->tx_work);
|
||||
|
||||
/* clean after last, unfinished work */
|
||||
__skb_queue_purge(&npinfo->txq);
|
||||
/* now cancel it again */
|
||||
cancel_delayed_work(&npinfo->tx_work);
|
||||
kfree(npinfo);
|
||||
}
|
||||
|
||||
void __netpoll_cleanup(struct netpoll *np)
|
||||
{
|
||||
struct netpoll_info *npinfo;
|
||||
|
||||
/* rtnl_dereference would be preferable here but
|
||||
* rcu_cleanup_netpoll path can put us in here safely without
|
||||
* holding the rtnl, so plain rcu_dereference it is
|
||||
*/
|
||||
npinfo = rtnl_dereference(np->dev->npinfo);
|
||||
if (!npinfo)
|
||||
return;
|
||||
|
||||
synchronize_srcu(&netpoll_srcu);
|
||||
|
||||
if (atomic_dec_and_test(&npinfo->refcnt)) {
|
||||
const struct net_device_ops *ops;
|
||||
|
||||
ops = np->dev->netdev_ops;
|
||||
if (ops->ndo_netpoll_cleanup)
|
||||
ops->ndo_netpoll_cleanup(np->dev);
|
||||
|
||||
RCU_INIT_POINTER(np->dev->npinfo, NULL);
|
||||
call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
|
||||
} else
|
||||
RCU_INIT_POINTER(np->dev->npinfo, NULL);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
|
||||
|
||||
static void netpoll_async_cleanup(struct work_struct *work)
|
||||
{
|
||||
struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
|
||||
|
||||
rtnl_lock();
|
||||
__netpoll_cleanup(np);
|
||||
rtnl_unlock();
|
||||
kfree(np);
|
||||
}
|
||||
|
||||
void __netpoll_free_async(struct netpoll *np)
|
||||
{
|
||||
schedule_work(&np->cleanup_work);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__netpoll_free_async);
|
||||
|
||||
void netpoll_cleanup(struct netpoll *np)
|
||||
{
|
||||
rtnl_lock();
|
||||
if (!np->dev)
|
||||
goto out;
|
||||
__netpoll_cleanup(np);
|
||||
dev_put(np->dev);
|
||||
np->dev = NULL;
|
||||
out:
|
||||
rtnl_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL(netpoll_cleanup);
net/core/netprio_cgroup.c (new file, 288 lines)
@@ -0,0 +1,288 @@
|
|||
/*
|
||||
* net/core/netprio_cgroup.c Priority Control Group
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Neil Horman <nhorman@tuxdriver.com>
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <net/rtnetlink.h>
|
||||
#include <net/pkt_cls.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/netprio_cgroup.h>
|
||||
|
||||
#include <linux/fdtable.h>
|
||||
|
||||
#define PRIOMAP_MIN_SZ 128
|
||||
|
||||
/*
|
||||
* Extend @dev->priomap so that it's large enough to accommodate
|
||||
* @target_idx. @dev->priomap.priomap_len > @target_idx after successful
|
||||
* return. Must be called under rtnl lock.
|
||||
*/
|
||||
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
|
||||
{
|
||||
struct netprio_map *old, *new;
|
||||
size_t new_sz, new_len;
|
||||
|
||||
/* is the existing priomap large enough? */
|
||||
old = rtnl_dereference(dev->priomap);
|
||||
if (old && old->priomap_len > target_idx)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Determine the new size. Let's keep it power-of-two. We start
|
||||
* from PRIOMAP_MIN_SZ and double it until it's large enough to
|
||||
* accommodate @target_idx.
|
||||
*/
|
||||
new_sz = PRIOMAP_MIN_SZ;
|
||||
while (true) {
|
||||
new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
|
||||
sizeof(new->priomap[0]);
|
||||
if (new_len > target_idx)
|
||||
break;
|
||||
new_sz *= 2;
|
||||
/* overflowed? */
|
||||
if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
/* allocate & copy */
|
||||
new = kzalloc(new_sz, GFP_KERNEL);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
if (old)
|
||||
memcpy(new->priomap, old->priomap,
|
||||
old->priomap_len * sizeof(old->priomap[0]));
|
||||
|
||||
new->priomap_len = new_len;
|
||||
|
||||
/* install the new priomap */
|
||||
rcu_assign_pointer(dev->priomap, new);
|
||||
if (old)
|
||||
kfree_rcu(old, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* netprio_prio - return the effective netprio of a cgroup-net_device pair
|
||||
* @css: css part of the target pair
|
||||
* @dev: net_device part of the target pair
|
||||
*
|
||||
* Should be called under RCU read or rtnl lock.
|
||||
*/
|
||||
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
|
||||
{
|
||||
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
|
||||
int id = css->cgroup->id;
|
||||
|
||||
if (map && id < map->priomap_len)
|
||||
return map->priomap[id];
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* netprio_set_prio - set netprio on a cgroup-net_device pair
|
||||
* @css: css part of the target pair
|
||||
* @dev: net_device part of the target pair
|
||||
* @prio: prio to set
|
||||
*
|
||||
* Set netprio to @prio on @css-@dev pair. Should be called under rtnl
|
||||
* lock and may fail under memory pressure for non-zero @prio.
|
||||
*/
|
||||
static int netprio_set_prio(struct cgroup_subsys_state *css,
|
||||
struct net_device *dev, u32 prio)
|
||||
{
|
||||
struct netprio_map *map;
|
||||
int id = css->cgroup->id;
|
||||
int ret;
|
||||
|
||||
/* avoid extending priomap for zero writes */
|
||||
map = rtnl_dereference(dev->priomap);
|
||||
if (!prio && (!map || map->priomap_len <= id))
|
||||
return 0;
|
||||
|
||||
ret = extend_netdev_table(dev, id);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
map = rtnl_dereference(dev->priomap);
|
||||
map->priomap[id] = prio;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
css = kzalloc(sizeof(*css), GFP_KERNEL);
|
||||
if (!css)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
return css;
|
||||
}
|
||||
|
||||
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct cgroup_subsys_state *parent_css = css->parent;
|
||||
struct net_device *dev;
|
||||
int ret = 0;
|
||||
|
||||
if (!parent_css)
|
||||
return 0;
|
||||
|
||||
rtnl_lock();
|
||||
/*
|
||||
* Inherit prios from the parent. As all prios are set during
|
||||
* onlining, there is no need to clear them on offline.
|
||||
*/
|
||||
for_each_netdev(&init_net, dev) {
|
||||
u32 prio = netprio_prio(parent_css, dev);
|
||||
|
||||
ret = netprio_set_prio(css, dev, prio);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
rtnl_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cgrp_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
kfree(css);
|
||||
}
|
||||
|
||||
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
|
||||
{
|
||||
return css->cgroup->id;
|
||||
}
|
||||
|
||||
static int read_priomap(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct net_device *dev;
|
||||
|
||||
rcu_read_lock();
|
||||
for_each_netdev_rcu(&init_net, dev)
|
||||
seq_printf(sf, "%s %u\n", dev->name,
|
||||
netprio_prio(seq_css(sf), dev));
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t write_priomap(struct kernfs_open_file *of,
|
||||
char *buf, size_t nbytes, loff_t off)
|
||||
{
|
||||
char devname[IFNAMSIZ + 1];
|
||||
struct net_device *dev;
|
||||
u32 prio;
|
||||
int ret;
|
||||
|
||||
if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
|
||||
return -EINVAL;
|
||||
|
||||
dev = dev_get_by_name(&init_net, devname);
|
||||
if (!dev)
|
||||
return -ENODEV;
|
||||
|
||||
rtnl_lock();
|
||||
|
||||
ret = netprio_set_prio(of_css(of), dev, prio);
|
||||
|
||||
rtnl_unlock();
|
||||
dev_put(dev);
|
||||
return ret ?: nbytes;
|
||||
}
|
||||
|
||||
static int update_netprio(const void *v, struct file *file, unsigned n)
|
||||
{
|
||||
int err;
|
||||
struct socket *sock = sock_from_file(file, &err);
|
||||
if (sock)
|
||||
sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void net_prio_attach(struct cgroup_subsys_state *css,
|
||||
struct cgroup_taskset *tset)
|
||||
{
|
||||
struct task_struct *p;
|
||||
void *v = (void *)(unsigned long)css->cgroup->id;
|
||||
|
||||
cgroup_taskset_for_each(p, tset) {
|
||||
task_lock(p);
|
||||
iterate_fd(p->files, 0, update_netprio, v);
|
||||
task_unlock(p);
|
||||
}
|
||||
}
|
||||
|
||||
static struct cftype ss_files[] = {
|
||||
{
|
||||
.name = "prioidx",
|
||||
.read_u64 = read_prioidx,
|
||||
},
|
||||
{
|
||||
.name = "ifpriomap",
|
||||
.seq_show = read_priomap,
|
||||
.write = write_priomap,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
struct cgroup_subsys net_prio_cgrp_subsys = {
|
||||
.css_alloc = cgrp_css_alloc,
|
||||
.css_online = cgrp_css_online,
|
||||
.css_free = cgrp_css_free,
|
||||
.attach = net_prio_attach,
|
||||
.legacy_cftypes = ss_files,
|
||||
};
|
||||
|
||||
static int netprio_device_event(struct notifier_block *unused,
|
||||
unsigned long event, void *ptr)
|
||||
{
|
||||
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
||||
struct netprio_map *old;
|
||||
|
||||
/*
|
||||
* Note this is called with rtnl_lock held so we have update side
|
||||
* protection on our rcu assignments
|
||||
*/
|
||||
|
||||
switch (event) {
|
||||
case NETDEV_UNREGISTER:
|
||||
old = rtnl_dereference(dev->priomap);
|
||||
RCU_INIT_POINTER(dev->priomap, NULL);
|
||||
if (old)
|
||||
kfree_rcu(old, rcu);
|
||||
break;
|
||||
}
|
||||
return NOTIFY_DONE;
|
||||
}
|
||||
|
||||
static struct notifier_block netprio_device_notifier = {
|
||||
.notifier_call = netprio_device_event
|
||||
};
|
||||
|
||||
static int __init init_cgroup_netprio(void)
|
||||
{
|
||||
register_netdevice_notifier(&netprio_device_notifier);
|
||||
return 0;
|
||||
}
|
||||
|
||||
subsys_initcall(init_cgroup_netprio);
|
||||
MODULE_LICENSE("GPL v2");
net/core/pktgen.c (new file, 3863 lines)
File diff suppressed because it is too large
net/core/ptp_classifier.c (new file, 193 lines)
@@ -0,0 +1,193 @@
|
|||
/* PTP classifier
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of version 2 of the GNU General Public
|
||||
* License as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
/* The below program is the bpf_asm (tools/net/) representation of
|
||||
* the opcode array in the ptp_filter structure.
|
||||
*
|
||||
* For convenience, this can easily be altered and reviewed with
|
||||
* bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
|
||||
* simple file containing the below program:
|
||||
*
|
||||
* ldh [12] ; load ethertype
|
||||
*
|
||||
* ; PTP over UDP over IPv4 over Ethernet
|
||||
* test_ipv4:
|
||||
* jneq #0x800, test_ipv6 ; ETH_P_IP ?
|
||||
* ldb [23] ; load proto
|
||||
* jneq #17, drop_ipv4 ; IPPROTO_UDP ?
|
||||
* ldh [20] ; load frag offset field
|
||||
* jset #0x1fff, drop_ipv4 ; don't allow fragments
|
||||
* ldxb 4*([14]&0xf) ; load IP header len
|
||||
* ldh [x + 16] ; load UDP dst port
|
||||
* jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
|
||||
* ldh [x + 22] ; load payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x10 ; PTP_CLASS_IPV4
|
||||
* ret a ; return PTP class
|
||||
* drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
|
||||
*
|
||||
* ; PTP over UDP over IPv6 over Ethernet
|
||||
* test_ipv6:
|
||||
* jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
|
||||
* ldb [20] ; load proto
|
||||
* jneq #17, drop_ipv6 ; IPPROTO_UDP ?
|
||||
* ldh [56] ; load UDP dst port
|
||||
* jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
|
||||
* ldh [62] ; load payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x20 ; PTP_CLASS_IPV6
|
||||
* ret a ; return PTP class
|
||||
* drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
|
||||
*
|
||||
* ; PTP over 802.1Q over Ethernet
|
||||
* test_8021q:
|
||||
* jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
|
||||
* ldh [16] ; load inner type
|
||||
* jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
|
||||
* ldb [18] ; load payload
|
||||
* and #0x8 ; as we don't have ports here, test
|
||||
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
|
||||
* ldh [18] ; reload payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
|
||||
* ret a ; return PTP class
|
||||
*
|
||||
* ; PTP over UDP over IPv4 over 802.1Q over Ethernet
|
||||
* test_8021q_ipv4:
|
||||
* jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
|
||||
* ldb [27] ; load proto
|
||||
* jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
|
||||
* ldh [24] ; load frag offset field
|
||||
* jset #0x1fff, drop_8021q_ipv4; don't allow fragments
|
||||
* ldxb 4*([18]&0xf) ; load IP header len
|
||||
* ldh [x + 20] ; load UDP dst port
|
||||
* jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
|
||||
* ldh [x + 26] ; load payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
|
||||
* ret a ; return PTP class
|
||||
* drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
|
||||
*
|
||||
* ; PTP over UDP over IPv6 over 802.1Q over Ethernet
|
||||
* test_8021q_ipv6:
|
||||
* jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
|
||||
* ldb [24] ; load proto
|
||||
* jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
|
||||
* ldh [60] ; load UDP dst port
|
||||
* jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
|
||||
* ldh [66] ; load payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
|
||||
* ret a ; return PTP class
|
||||
* drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
|
||||
*
|
||||
* ; PTP over Ethernet
|
||||
* test_ieee1588:
|
||||
* jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
|
||||
* ldb [14] ; load payload
|
||||
* and #0x8 ; as we don't have ports here, test
|
||||
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
|
||||
* ldh [14] ; reload payload
|
||||
* and #0xf ; mask PTP_CLASS_VMASK
|
||||
* or #0x30 ; PTP_CLASS_L2
|
||||
* ret a ; return PTP class
|
||||
* drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
|
||||
*/
|
||||
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/ptp_classify.h>
|
||||
|
||||
static struct bpf_prog *ptp_insns __read_mostly;
|
||||
|
||||
unsigned int ptp_classify_raw(const struct sk_buff *skb)
|
||||
{
|
||||
return BPF_PROG_RUN(ptp_insns, skb);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ptp_classify_raw);
|
||||
|
||||
void __init ptp_classifier_init(void)
|
||||
{
|
||||
static struct sock_filter ptp_filter[] __initdata = {
|
||||
{ 0x28, 0, 0, 0x0000000c },
|
||||
{ 0x15, 0, 12, 0x00000800 },
|
||||
{ 0x30, 0, 0, 0x00000017 },
|
||||
{ 0x15, 0, 9, 0x00000011 },
|
||||
{ 0x28, 0, 0, 0x00000014 },
|
||||
{ 0x45, 7, 0, 0x00001fff },
|
||||
{ 0xb1, 0, 0, 0x0000000e },
|
||||
{ 0x48, 0, 0, 0x00000010 },
|
||||
{ 0x15, 0, 4, 0x0000013f },
|
||||
{ 0x48, 0, 0, 0x00000016 },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000010 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x06, 0, 0, 0x00000000 },
|
||||
{ 0x15, 0, 9, 0x000086dd },
|
||||
{ 0x30, 0, 0, 0x00000014 },
|
||||
{ 0x15, 0, 6, 0x00000011 },
|
||||
{ 0x28, 0, 0, 0x00000038 },
|
||||
{ 0x15, 0, 4, 0x0000013f },
|
||||
{ 0x28, 0, 0, 0x0000003e },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000020 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x06, 0, 0, 0x00000000 },
|
||||
{ 0x15, 0, 32, 0x00008100 },
|
||||
{ 0x28, 0, 0, 0x00000010 },
|
||||
{ 0x15, 0, 7, 0x000088f7 },
|
||||
{ 0x30, 0, 0, 0x00000012 },
|
||||
{ 0x54, 0, 0, 0x00000008 },
|
||||
{ 0x15, 0, 35, 0x00000000 },
|
||||
{ 0x28, 0, 0, 0x00000012 },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000070 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x15, 0, 12, 0x00000800 },
|
||||
{ 0x30, 0, 0, 0x0000001b },
|
||||
{ 0x15, 0, 9, 0x00000011 },
|
||||
{ 0x28, 0, 0, 0x00000018 },
|
||||
{ 0x45, 7, 0, 0x00001fff },
|
||||
{ 0xb1, 0, 0, 0x00000012 },
|
||||
{ 0x48, 0, 0, 0x00000014 },
|
||||
{ 0x15, 0, 4, 0x0000013f },
|
||||
{ 0x48, 0, 0, 0x0000001a },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000050 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x06, 0, 0, 0x00000000 },
|
||||
{ 0x15, 0, 8, 0x000086dd },
|
||||
{ 0x30, 0, 0, 0x00000018 },
|
||||
{ 0x15, 0, 6, 0x00000011 },
|
||||
{ 0x28, 0, 0, 0x0000003c },
|
||||
{ 0x15, 0, 4, 0x0000013f },
|
||||
{ 0x28, 0, 0, 0x00000042 },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000060 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x06, 0, 0, 0x00000000 },
|
||||
{ 0x15, 0, 7, 0x000088f7 },
|
||||
{ 0x30, 0, 0, 0x0000000e },
|
||||
{ 0x54, 0, 0, 0x00000008 },
|
||||
{ 0x15, 0, 4, 0x00000000 },
|
||||
{ 0x28, 0, 0, 0x0000000e },
|
||||
{ 0x54, 0, 0, 0x0000000f },
|
||||
{ 0x44, 0, 0, 0x00000030 },
|
||||
{ 0x16, 0, 0, 0x00000000 },
|
||||
{ 0x06, 0, 0, 0x00000000 },
|
||||
};
|
||||
struct sock_fprog_kern ptp_prog = {
|
||||
.len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
|
||||
};
|
||||
|
||||
BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
|
||||
}
net/core/request_sock.c (new file, 205 lines)
@@ -0,0 +1,205 @@
|
|||
/*
|
||||
* NET Generic infrastructure for Network protocols.
|
||||
*
|
||||
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
|
||||
*
|
||||
* From code originally in include/net/tcp.h
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/vmalloc.h>
|
||||
|
||||
#include <net/request_sock.h>
|
||||
|
||||
/*
|
||||
* Maximum number of SYN_RECV sockets in queue per LISTEN socket.
|
||||
* One SYN_RECV socket costs about 80bytes on a 32bit machine.
|
||||
* It would be better to replace it with a global counter for all sockets
|
||||
* but then some measure against one socket starving all other sockets
|
||||
* would be needed.
|
||||
*
|
||||
* The minimum value of it is 128. Experiments with real servers show that
|
||||
* it is absolutely not enough even at 100conn/sec. 256 cures most
|
||||
* of problems.
|
||||
* This value is adjusted to 128 for low memory machines,
|
||||
* and it will increase in proportion to the memory of machine.
|
||||
* Note : Dont forget somaxconn that may limit backlog too.
|
||||
*/
|
||||
int sysctl_max_syn_backlog = 256;
|
||||
EXPORT_SYMBOL(sysctl_max_syn_backlog);
|
||||
|
||||
int reqsk_queue_alloc(struct request_sock_queue *queue,
|
||||
unsigned int nr_table_entries)
|
||||
{
|
||||
size_t lopt_size = sizeof(struct listen_sock);
|
||||
struct listen_sock *lopt = NULL;
|
||||
|
||||
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
|
||||
nr_table_entries = max_t(u32, nr_table_entries, 8);
|
||||
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
|
||||
lopt_size += nr_table_entries * sizeof(struct request_sock *);
|
||||
|
||||
if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
|
||||
lopt = kzalloc(lopt_size, GFP_KERNEL |
|
||||
__GFP_NOWARN |
|
||||
__GFP_NORETRY);
|
||||
if (!lopt)
|
||||
lopt = vzalloc(lopt_size);
|
||||
if (!lopt)
|
||||
return -ENOMEM;
|
||||
|
||||
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
|
||||
rwlock_init(&queue->syn_wait_lock);
|
||||
queue->rskq_accept_head = NULL;
|
||||
lopt->nr_table_entries = nr_table_entries;
|
||||
lopt->max_qlen_log = ilog2(nr_table_entries);
|
||||
|
||||
write_lock_bh(&queue->syn_wait_lock);
|
||||
queue->listen_opt = lopt;
|
||||
write_unlock_bh(&queue->syn_wait_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void __reqsk_queue_destroy(struct request_sock_queue *queue)
|
||||
{
|
||||
/* This is an error recovery path only, no locking needed */
|
||||
kvfree(queue->listen_opt);
|
||||
}
|
||||
|
||||
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
|
||||
struct request_sock_queue *queue)
|
||||
{
|
||||
struct listen_sock *lopt;
|
||||
|
||||
write_lock_bh(&queue->syn_wait_lock);
|
||||
lopt = queue->listen_opt;
|
||||
queue->listen_opt = NULL;
|
||||
write_unlock_bh(&queue->syn_wait_lock);
|
||||
|
||||
return lopt;
|
||||
}
|
||||
|
||||
void reqsk_queue_destroy(struct request_sock_queue *queue)
|
||||
{
|
||||
/* make all the listen_opt local to us */
|
||||
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
|
||||
|
||||
if (lopt->qlen != 0) {
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < lopt->nr_table_entries; i++) {
|
||||
struct request_sock *req;
|
||||
|
||||
while ((req = lopt->syn_table[i]) != NULL) {
|
||||
lopt->syn_table[i] = req->dl_next;
|
||||
lopt->qlen--;
|
||||
reqsk_free(req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
WARN_ON(lopt->qlen != 0);
|
||||
kvfree(lopt);
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called to set a Fast Open socket's "fastopen_rsk" field
|
||||
* to NULL when a TFO socket no longer needs to access the request_sock.
|
||||
* This happens only after 3WHS has been either completed or aborted (e.g.,
|
||||
* RST is received).
|
||||
*
|
||||
* Before TFO, a child socket is created only after 3WHS is completed,
|
||||
* hence it never needs to access the request_sock. things get a lot more
|
||||
* complex with TFO. A child socket, accepted or not, has to access its
|
||||
* request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
|
||||
* until 3WHS is either completed or aborted. Afterwards the req will stay
|
||||
* until either the child socket is accepted, or in the rare case when the
|
||||
* listener is closed before the child is accepted.
|
||||
*
|
||||
* In short, a request socket is only freed after BOTH 3WHS has completed
|
||||
* (or aborted) and the child socket has been accepted (or listener closed).
|
||||
* When a child socket is accepted, its corresponding req->sk is set to
|
||||
* NULL since it's no longer needed. More importantly, "req->sk == NULL"
|
||||
* will be used by the code below to determine if a child socket has been
|
||||
* accepted or not, and the check is protected by the fastopenq->lock
|
||||
* described below.
|
||||
*
|
||||
* Note that fastopen_rsk is only accessed from the child socket's context
|
||||
* with its socket lock held. But a request_sock (req) can be accessed by
|
||||
* both its child socket through fastopen_rsk, and a listener socket through
|
||||
* icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
|
||||
* lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
|
||||
* only in the rare case when both the listener and the child locks are held,
|
||||
* e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
|
||||
* The lock also protects other fields such as fastopenq->qlen, which is
|
||||
* decremented by this function when fastopen_rsk is no longer needed.
|
||||
*
|
||||
* Note that another solution was to simply use the existing socket lock
|
||||
* from the listener. But first socket lock is difficult to use. It is not
|
||||
* a simple spin lock - one must consider sock_owned_by_user() and arrange
|
||||
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
|
||||
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
|
||||
* acquire a child's lock while holding listener's socket lock. A corner
|
||||
* case might also exist in tcp_v4_hnd_req() that will trigger this locking
|
||||
* order.
|
||||
*
|
||||
* When a TFO req is created, it needs to sock_hold its listener to prevent
|
||||
* the latter data structure from going away.
|
||||
*
|
||||
* This function also sets "treq->listener" to NULL and unreference listener
|
||||
* socket. treq->listener is used by the listener so it is protected by the
|
||||
* fastopenq->lock in this function.
|
||||
*/
|
||||
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
|
||||
bool reset)
|
||||
{
|
||||
struct sock *lsk = tcp_rsk(req)->listener;
|
||||
struct fastopen_queue *fastopenq =
|
||||
inet_csk(lsk)->icsk_accept_queue.fastopenq;
|
||||
|
||||
tcp_sk(sk)->fastopen_rsk = NULL;
|
||||
spin_lock_bh(&fastopenq->lock);
|
||||
fastopenq->qlen--;
|
||||
tcp_rsk(req)->listener = NULL;
|
||||
if (req->sk) /* the child socket hasn't been accepted yet */
|
||||
goto out;
|
||||
|
||||
if (!reset || lsk->sk_state != TCP_LISTEN) {
|
||||
/* If the listener has been closed don't bother with the
|
||||
* special RST handling below.
|
||||
*/
|
||||
spin_unlock_bh(&fastopenq->lock);
|
||||
sock_put(lsk);
|
||||
reqsk_free(req);
|
||||
return;
|
||||
}
|
||||
/* Wait for 60secs before removing a req that has triggered RST.
|
||||
* This is a simple defense against TFO spoofing attack - by
|
||||
* counting the req against fastopen.max_qlen, and disabling
|
||||
* TFO when the qlen exceeds max_qlen.
|
||||
*
|
||||
* For more details see CoNext'11 "TCP Fast Open" paper.
|
||||
*/
|
||||
req->expires = jiffies + 60*HZ;
|
||||
if (fastopenq->rskq_rst_head == NULL)
|
||||
fastopenq->rskq_rst_head = req;
|
||||
else
|
||||
fastopenq->rskq_rst_tail->dl_next = req;
|
||||
|
||||
req->dl_next = NULL;
|
||||
fastopenq->rskq_rst_tail = req;
|
||||
fastopenq->qlen++;
|
||||
out:
|
||||
spin_unlock_bh(&fastopenq->lock);
|
||||
sock_put(lsk);
|
||||
}
net/core/rtnetlink.c (new file, 3082 lines)
File diff suppressed because it is too large
net/core/scm.c (new file, 341 lines)
@@ -0,0 +1,341 @@
|
|||
/* scm.c - Socket level control messages processing.
|
||||
*
|
||||
* Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* Alignment and value checking mods by Craig Metz
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/signal.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/stat.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/fcntl.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
#include <linux/pid.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
|
||||
#include <net/protocol.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/compat.h>
|
||||
#include <net/scm.h>
|
||||
#include <net/cls_cgroup.h>
|
||||
|
||||
|
||||
/*
|
||||
* Only allow a user to send credentials, that they could set with
|
||||
* setu(g)id.
|
||||
*/
|
||||
|
||||
static __inline__ int scm_check_creds(struct ucred *creds)
|
||||
{
|
||||
const struct cred *cred = current_cred();
|
||||
kuid_t uid = make_kuid(cred->user_ns, creds->uid);
|
||||
kgid_t gid = make_kgid(cred->user_ns, creds->gid);
|
||||
|
||||
if (!uid_valid(uid) || !gid_valid(gid))
|
||||
return -EINVAL;
|
||||
|
||||
if ((creds->pid == task_tgid_vnr(current) ||
|
||||
ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
|
||||
((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
|
||||
uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
|
||||
((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
|
||||
gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
|
||||
return 0;
|
||||
}
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
|
||||
{
|
||||
int *fdp = (int*)CMSG_DATA(cmsg);
|
||||
struct scm_fp_list *fpl = *fplp;
|
||||
struct file **fpp;
|
||||
int i, num;
|
||||
|
||||
num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
|
||||
|
||||
if (num <= 0)
|
||||
return 0;
|
||||
|
||||
if (num > SCM_MAX_FD)
|
||||
return -EINVAL;
|
||||
|
||||
if (!fpl)
|
||||
{
|
||||
fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
|
||||
if (!fpl)
|
||||
return -ENOMEM;
|
||||
*fplp = fpl;
|
||||
fpl->count = 0;
|
||||
fpl->max = SCM_MAX_FD;
|
||||
}
|
||||
fpp = &fpl->fp[fpl->count];
|
||||
|
||||
if (fpl->count + num > fpl->max)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Verify the descriptors and increment the usage count.
|
||||
*/
|
||||
|
||||
for (i=0; i< num; i++)
|
||||
{
|
||||
int fd = fdp[i];
|
||||
struct file *file;
|
||||
|
||||
if (fd < 0 || !(file = fget_raw(fd)))
|
||||
return -EBADF;
|
||||
*fpp++ = file;
|
||||
fpl->count++;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
void __scm_destroy(struct scm_cookie *scm)
|
||||
{
|
||||
struct scm_fp_list *fpl = scm->fp;
|
||||
int i;
|
||||
|
||||
if (fpl) {
|
||||
scm->fp = NULL;
|
||||
for (i=fpl->count-1; i>=0; i--)
|
||||
fput(fpl->fp[i]);
|
||||
kfree(fpl);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__scm_destroy);
|
||||
|
||||
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
|
||||
{
|
||||
struct cmsghdr *cmsg;
|
||||
int err;
|
||||
|
||||
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
|
||||
{
|
||||
err = -EINVAL;
|
||||
|
||||
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
|
||||
/* The first check was omitted in <= 2.2.5. The reasoning was
|
||||
that parser checks cmsg_len in any case, so that
|
||||
additional check would be work duplication.
|
||||
But if cmsg_level is not SOL_SOCKET, we do not check
|
||||
for too short ancillary data object at all! Oops.
|
||||
OK, let's add it...
|
||||
*/
|
||||
if (!CMSG_OK(msg, cmsg))
|
||||
goto error;
|
||||
|
||||
if (cmsg->cmsg_level != SOL_SOCKET)
|
||||
continue;
|
||||
|
||||
switch (cmsg->cmsg_type)
|
||||
{
|
||||
case SCM_RIGHTS:
|
||||
if (!sock->ops || sock->ops->family != PF_UNIX)
|
||||
goto error;
|
||||
err=scm_fp_copy(cmsg, &p->fp);
|
||||
if (err<0)
|
||||
goto error;
|
||||
break;
|
||||
case SCM_CREDENTIALS:
|
||||
{
|
||||
struct ucred creds;
|
||||
kuid_t uid;
|
||||
kgid_t gid;
|
||||
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
|
||||
goto error;
|
||||
memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
|
||||
err = scm_check_creds(&creds);
|
||||
if (err)
|
||||
goto error;
|
||||
|
||||
p->creds.pid = creds.pid;
|
||||
if (!p->pid || pid_vnr(p->pid) != creds.pid) {
|
||||
struct pid *pid;
|
||||
err = -ESRCH;
|
||||
pid = find_get_pid(creds.pid);
|
||||
if (!pid)
|
||||
goto error;
|
||||
put_pid(p->pid);
|
||||
p->pid = pid;
|
||||
}
|
||||
|
||||
err = -EINVAL;
|
||||
uid = make_kuid(current_user_ns(), creds.uid);
|
||||
gid = make_kgid(current_user_ns(), creds.gid);
|
||||
if (!uid_valid(uid) || !gid_valid(gid))
|
||||
goto error;
|
||||
|
||||
p->creds.uid = uid;
|
||||
p->creds.gid = gid;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
if (p->fp && !p->fp->count)
|
||||
{
|
||||
kfree(p->fp);
|
||||
p->fp = NULL;
|
||||
}
|
||||
return 0;
|
||||
|
||||
error:
|
||||
scm_destroy(p);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(__scm_send);
|
||||
|
||||
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
|
||||
{
|
||||
struct cmsghdr __user *cm
|
||||
= (__force struct cmsghdr __user *)msg->msg_control;
|
||||
struct cmsghdr cmhdr;
|
||||
int cmlen = CMSG_LEN(len);
|
||||
int err;
|
||||
|
||||
if (MSG_CMSG_COMPAT & msg->msg_flags)
|
||||
return put_cmsg_compat(msg, level, type, len, data);
|
||||
|
||||
if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
|
||||
msg->msg_flags |= MSG_CTRUNC;
|
||||
return 0; /* XXX: return error? check spec. */
|
||||
}
|
||||
if (msg->msg_controllen < cmlen) {
|
||||
msg->msg_flags |= MSG_CTRUNC;
|
||||
cmlen = msg->msg_controllen;
|
||||
}
|
||||
cmhdr.cmsg_level = level;
|
||||
cmhdr.cmsg_type = type;
|
||||
cmhdr.cmsg_len = cmlen;
|
||||
|
||||
err = -EFAULT;
|
||||
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
|
||||
goto out;
|
||||
if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
|
||||
goto out;
|
||||
cmlen = CMSG_SPACE(len);
|
||||
if (msg->msg_controllen < cmlen)
|
||||
cmlen = msg->msg_controllen;
|
||||
msg->msg_control += cmlen;
|
||||
msg->msg_controllen -= cmlen;
|
||||
err = 0;
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(put_cmsg);
|
||||
|
||||
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
|
||||
{
|
||||
struct cmsghdr __user *cm
|
||||
= (__force struct cmsghdr __user*)msg->msg_control;
|
||||
|
||||
int fdmax = 0;
|
||||
int fdnum = scm->fp->count;
|
||||
struct file **fp = scm->fp->fp;
|
||||
int __user *cmfptr;
|
||||
int err = 0, i;
|
||||
|
||||
if (MSG_CMSG_COMPAT & msg->msg_flags) {
|
||||
scm_detach_fds_compat(msg, scm);
|
||||
return;
|
||||
}
|
||||
|
||||
if (msg->msg_controllen > sizeof(struct cmsghdr))
|
||||
fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
|
||||
/ sizeof(int));
|
||||
|
||||
if (fdnum < fdmax)
|
||||
fdmax = fdnum;
|
||||
|
||||
for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
|
||||
i++, cmfptr++)
|
||||
{
|
||||
struct socket *sock;
|
||||
int new_fd;
|
||||
err = security_file_receive(fp[i]);
|
||||
if (err)
|
||||
break;
|
||||
err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
|
||||
? O_CLOEXEC : 0);
|
||||
if (err < 0)
|
||||
break;
|
||||
new_fd = err;
|
||||
err = put_user(new_fd, cmfptr);
|
||||
if (err) {
|
||||
put_unused_fd(new_fd);
|
||||
break;
|
||||
}
|
||||
/* Bump the usage count and install the file. */
|
||||
sock = sock_from_file(fp[i], &err);
|
||||
if (sock) {
|
||||
sock_update_netprioidx(sock->sk);
|
||||
sock_update_classid(sock->sk);
|
||||
}
|
||||
fd_install(new_fd, get_file(fp[i]));
|
||||
}
|
||||
|
||||
if (i > 0)
|
||||
{
|
||||
int cmlen = CMSG_LEN(i*sizeof(int));
|
||||
err = put_user(SOL_SOCKET, &cm->cmsg_level);
|
||||
if (!err)
|
||||
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
|
||||
if (!err)
|
||||
err = put_user(cmlen, &cm->cmsg_len);
|
||||
if (!err) {
|
||||
cmlen = CMSG_SPACE(i*sizeof(int));
|
||||
msg->msg_control += cmlen;
|
||||
msg->msg_controllen -= cmlen;
|
||||
}
|
||||
}
|
||||
if (i < fdnum || (fdnum && fdmax <= 0))
|
||||
msg->msg_flags |= MSG_CTRUNC;
|
||||
|
||||
/*
|
||||
* All of the files that fit in the message have had their
|
||||
* usage counts incremented, so we just free the list.
|
||||
*/
|
||||
__scm_destroy(scm);
|
||||
}
|
||||
EXPORT_SYMBOL(scm_detach_fds);
|
||||
|
||||
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
|
||||
{
|
||||
struct scm_fp_list *new_fpl;
|
||||
int i;
|
||||
|
||||
if (!fpl)
|
||||
return NULL;
|
||||
|
||||
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
|
||||
GFP_KERNEL);
|
||||
if (new_fpl) {
|
||||
for (i = 0; i < fpl->count; i++)
|
||||
get_file(fpl->fp[i]);
|
||||
new_fpl->max = new_fpl->count;
|
||||
}
|
||||
return new_fpl;
|
||||
}
|
||||
EXPORT_SYMBOL(scm_fp_dup);
net/core/secure_seq.c (new file, 173 lines)
@@ -0,0 +1,173 @@
|
|||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/cryptohash.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/cache.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/net.h>
|
||||
|
||||
#include <net/secure_seq.h>
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
|
||||
#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
|
||||
|
||||
static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
|
||||
|
||||
static __always_inline void net_secret_init(void)
|
||||
{
|
||||
net_get_random_once(net_secret, sizeof(net_secret));
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_INET
|
||||
static u32 seq_scale(u32 seq)
|
||||
{
|
||||
/*
|
||||
* As close as possible to RFC 793, which
|
||||
* suggests using a 250 kHz clock.
|
||||
* Further reading shows this assumes 2 Mb/s networks.
|
||||
* For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
|
||||
* For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
|
||||
* we also need to limit the resolution so that the u32 seq
|
||||
* overlaps less than one time per MSL (2 minutes).
|
||||
* Choosing a clock of 64 ns period is OK. (period of 274 s)
|
||||
*/
|
||||
return seq + (ktime_get_real_ns() >> 6);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
|
||||
__be16 sport, __be16 dport)
|
||||
{
|
||||
u32 secret[MD5_MESSAGE_BYTES / 4];
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
u32 i;
|
||||
|
||||
net_secret_init();
|
||||
memcpy(hash, saddr, 16);
|
||||
for (i = 0; i < 4; i++)
|
||||
secret[i] = net_secret[i] + (__force u32)daddr[i];
|
||||
secret[4] = net_secret[4] +
|
||||
(((__force u16)sport << 16) + (__force u16)dport);
|
||||
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
|
||||
secret[i] = net_secret[i];
|
||||
|
||||
md5_transform(hash, secret);
|
||||
|
||||
return seq_scale(hash[0]);
|
||||
}
|
||||
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
|
||||
|
||||
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
|
||||
__be16 dport)
|
||||
{
|
||||
u32 secret[MD5_MESSAGE_BYTES / 4];
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
u32 i;
|
||||
|
||||
net_secret_init();
|
||||
memcpy(hash, saddr, 16);
|
||||
for (i = 0; i < 4; i++)
|
||||
secret[i] = net_secret[i] + (__force u32) daddr[i];
|
||||
secret[4] = net_secret[4] + (__force u32)dport;
|
||||
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
|
||||
secret[i] = net_secret[i];
|
||||
|
||||
md5_transform(hash, secret);
|
||||
|
||||
return hash[0];
|
||||
}
|
||||
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_INET
|
||||
|
||||
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
|
||||
__be16 sport, __be16 dport)
|
||||
{
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
|
||||
net_secret_init();
|
||||
hash[0] = (__force u32)saddr;
|
||||
hash[1] = (__force u32)daddr;
|
||||
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
|
||||
hash[3] = net_secret[15];
|
||||
|
||||
md5_transform(hash, net_secret);
|
||||
|
||||
return seq_scale(hash[0]);
|
||||
}
|
||||
|
||||
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
|
||||
{
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
|
||||
net_secret_init();
|
||||
hash[0] = (__force u32)saddr;
|
||||
hash[1] = (__force u32)daddr;
|
||||
hash[2] = (__force u32)dport ^ net_secret[14];
|
||||
hash[3] = net_secret[15];
|
||||
|
||||
md5_transform(hash, net_secret);
|
||||
|
||||
return hash[0];
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_IP_DCCP)
|
||||
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
|
||||
__be16 sport, __be16 dport)
|
||||
{
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
u64 seq;
|
||||
|
||||
net_secret_init();
|
||||
hash[0] = (__force u32)saddr;
|
||||
hash[1] = (__force u32)daddr;
|
||||
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
|
||||
hash[3] = net_secret[15];
|
||||
|
||||
md5_transform(hash, net_secret);
|
||||
|
||||
seq = hash[0] | (((u64)hash[1]) << 32);
|
||||
seq += ktime_get_real_ns();
|
||||
seq &= (1ull << 48) - 1;
|
||||
|
||||
return seq;
|
||||
}
|
||||
EXPORT_SYMBOL(secure_dccp_sequence_number);
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
|
||||
__be16 sport, __be16 dport)
|
||||
{
|
||||
u32 secret[MD5_MESSAGE_BYTES / 4];
|
||||
u32 hash[MD5_DIGEST_WORDS];
|
||||
u64 seq;
|
||||
u32 i;
|
||||
|
||||
net_secret_init();
|
||||
memcpy(hash, saddr, 16);
|
||||
for (i = 0; i < 4; i++)
|
||||
secret[i] = net_secret[i] + daddr[i];
|
||||
secret[4] = net_secret[4] +
|
||||
(((__force u16)sport << 16) + (__force u16)dport);
|
||||
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
|
||||
secret[i] = net_secret[i];
|
||||
|
||||
md5_transform(hash, secret);
|
||||
|
||||
seq = hash[0] | (((u64)hash[1]) << 32);
|
||||
seq += ktime_get_real_ns();
|
||||
seq &= (1ull << 48) - 1;
|
||||
|
||||
return seq;
|
||||
}
|
||||
EXPORT_SYMBOL(secure_dccpv6_sequence_number);
|
||||
#endif
|
||||
#endif
|
4231
net/core/skbuff.c
Normal file
File diff suppressed because it is too large
2956
net/core/sock.c
Normal file
File diff suppressed because it is too large
248
net/core/sock_diag.c
Normal file
@ -0,0 +1,248 @@
#include <linux/mutex.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <linux/module.h>
#include <net/sock.h>

#include <linux/inet_diag.h>
#include <linux/sock_diag.h>

static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);

int sock_diag_check_cookie(void *sk, __u32 *cookie)
{
	if ((cookie[0] != INET_DIAG_NOCOOKIE ||
	     cookie[1] != INET_DIAG_NOCOOKIE) &&
	    ((u32)(unsigned long)sk != cookie[0] ||
	     (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
		return -ESTALE;
	else
		return 0;
}
EXPORT_SYMBOL_GPL(sock_diag_check_cookie);

void sock_diag_save_cookie(void *sk, __u32 *cookie)
{
	cookie[0] = (u32)(unsigned long)sk;
	cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
}
EXPORT_SYMBOL_GPL(sock_diag_save_cookie);

int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
{
	u32 mem[SK_MEMINFO_VARS];

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;

	return nla_put(skb, attrtype, sizeof(mem), &mem);
}
EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);

int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
			     struct sk_buff *skb, int attrtype)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	struct nlattr *attr;
	unsigned int flen;
	int err = 0;

	if (!may_report_filterinfo) {
		nla_reserve(skb, attrtype, 0);
		return 0;
	}

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (!filter)
		goto out;

	fprog = filter->prog->orig_prog;
	flen = bpf_classic_proglen(fprog);

	attr = nla_reserve(skb, attrtype, flen);
	if (attr == NULL) {
		err = -EMSGSIZE;
		goto out;
	}

	memcpy(nla_data(attr), fprog->filter, flen);
out:
	rcu_read_unlock();
	return err;
}
EXPORT_SYMBOL(sock_diag_put_filterinfo);

void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
	mutex_lock(&sock_diag_table_mutex);
	inet_rcv_compat = fn;
	mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);

void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
	mutex_lock(&sock_diag_table_mutex);
	inet_rcv_compat = NULL;
	mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);

int sock_diag_register(const struct sock_diag_handler *hndl)
{
	int err = 0;

	if (hndl->family >= AF_MAX)
		return -EINVAL;

	mutex_lock(&sock_diag_table_mutex);
	if (sock_diag_handlers[hndl->family])
		err = -EBUSY;
	else
		sock_diag_handlers[hndl->family] = hndl;
	mutex_unlock(&sock_diag_table_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(sock_diag_register);

void sock_diag_unregister(const struct sock_diag_handler *hnld)
{
	int family = hnld->family;

	if (family >= AF_MAX)
		return;

	mutex_lock(&sock_diag_table_mutex);
	BUG_ON(sock_diag_handlers[family] != hnld);
	sock_diag_handlers[family] = NULL;
	mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);

static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	int err;
	struct sock_diag_req *req = nlmsg_data(nlh);
	const struct sock_diag_handler *hndl;

	if (nlmsg_len(nlh) < sizeof(*req))
		return -EINVAL;

	if (req->sdiag_family >= AF_MAX)
		return -EINVAL;

	if (sock_diag_handlers[req->sdiag_family] == NULL)
		request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				NETLINK_SOCK_DIAG, req->sdiag_family);

	mutex_lock(&sock_diag_table_mutex);
	hndl = sock_diag_handlers[req->sdiag_family];
	if (hndl == NULL)
		err = -ENOENT;
	else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
		err = hndl->dump(skb, nlh);
	else if (nlh->nlmsg_type == SOCK_DESTROY_BACKPORT && hndl->destroy)
		err = hndl->destroy(skb, nlh);
	else
		err = -EOPNOTSUPP;
	mutex_unlock(&sock_diag_table_mutex);

	return err;
}

static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	int ret;

	switch (nlh->nlmsg_type) {
	case TCPDIAG_GETSOCK:
	case DCCPDIAG_GETSOCK:
		if (inet_rcv_compat == NULL)
			request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
					NETLINK_SOCK_DIAG, AF_INET);

		mutex_lock(&sock_diag_table_mutex);
		if (inet_rcv_compat != NULL)
			ret = inet_rcv_compat(skb, nlh);
		else
			ret = -EOPNOTSUPP;
		mutex_unlock(&sock_diag_table_mutex);

		return ret;
	case SOCK_DIAG_BY_FAMILY:
	case SOCK_DESTROY_BACKPORT:
		return __sock_diag_cmd(skb, nlh);
	default:
		return -EINVAL;
	}
}

static DEFINE_MUTEX(sock_diag_mutex);

static void sock_diag_rcv(struct sk_buff *skb)
{
	mutex_lock(&sock_diag_mutex);
	netlink_rcv_skb(skb, &sock_diag_rcv_msg);
	mutex_unlock(&sock_diag_mutex);
}

int sock_diag_destroy(struct sock *sk, int err)
{
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!sk->sk_prot->diag_destroy)
		return -EOPNOTSUPP;

	return sk->sk_prot->diag_destroy(sk, err);
}
EXPORT_SYMBOL_GPL(sock_diag_destroy);

static int __net_init diag_net_init(struct net *net)
{
	struct netlink_kernel_cfg cfg = {
		.input = sock_diag_rcv,
	};

	net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
	return net->diag_nlsk == NULL ? -ENOMEM : 0;
}

static void __net_exit diag_net_exit(struct net *net)
{
	netlink_kernel_release(net->diag_nlsk);
	net->diag_nlsk = NULL;
}

static struct pernet_operations diag_net_ops = {
	.init = diag_net_init,
	.exit = diag_net_exit,
};

static int __init sock_diag_init(void)
{
	return register_pernet_subsys(&diag_net_ops);
}

static void __exit sock_diag_exit(void)
{
	unregister_pernet_subsys(&diag_net_ops);
}

module_init(sock_diag_init);
module_exit(sock_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
208
net/core/stream.c
Normal file
@ -0,0 +1,208 @@
/*
 * SUCS NET3:
 *
 * Generic stream handling routines. These are generic for most
 * protocols. Even IP. Tonight 8-).
 * This is used because TCP, LLC (others too) layer all have mostly
 * identical sendmsg() and recvmsg() code.
 * So we (will) share it here.
 *
 * Authors:	Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *		(from old tcp.c code)
 *		Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
 */

#include <linux/module.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
#include <linux/wait.h>
#include <net/sock.h>

/**
 * sk_stream_write_space - stream socket write_space callback.
 * @sk: socket
 *
 * FIXME: write proper description
 */
void sk_stream_write_space(struct sock *sk)
{
	struct socket *sock = sk->sk_socket;
	struct socket_wq *wq;

	if (sk_stream_is_writeable(sk) && sock) {
		clear_bit(SOCK_NOSPACE, &sock->flags);

		rcu_read_lock();
		wq = rcu_dereference(sk->sk_wq);
		if (wq_has_sleeper(wq))
			wake_up_interruptible_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);
		if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
			sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sk_stream_write_space);

/**
 * sk_stream_wait_connect - Wait for a socket to get into the connected state
 * @sk: sock to wait on
 * @timeo_p: for how long to wait
 *
 * Must be called with the socket locked.
 */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
	struct task_struct *tsk = current;
	DEFINE_WAIT(wait);
	int done;

	do {
		int err = sock_error(sk);
		if (err)
			return err;
		if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
			return -EPIPE;
		if (!*timeo_p)
			return -EAGAIN;
		if (signal_pending(tsk))
			return sock_intr_errno(*timeo_p);

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		sk->sk_write_pending++;
		done = sk_wait_event(sk, timeo_p,
				     !sk->sk_err &&
				     !((1 << sk->sk_state) &
				       ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
		finish_wait(sk_sleep(sk), &wait);
		sk->sk_write_pending--;
	} while (!done);
	return 0;
}
EXPORT_SYMBOL(sk_stream_wait_connect);

/**
 * sk_stream_closing - Return 1 if we still have things to send in our buffers.
 * @sk: socket to verify
 */
static inline int sk_stream_closing(struct sock *sk)
{
	return (1 << sk->sk_state) &
	       (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
}

void sk_stream_wait_close(struct sock *sk, long timeout)
{
	if (timeout) {
		DEFINE_WAIT(wait);

		do {
			prepare_to_wait(sk_sleep(sk), &wait,
					TASK_INTERRUPTIBLE);
			if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
				break;
		} while (!signal_pending(current) && timeout);

		finish_wait(sk_sleep(sk), &wait);
	}
}
EXPORT_SYMBOL(sk_stream_wait_close);

/**
 * sk_stream_wait_memory - Wait for more memory for a socket
 * @sk: socket to wait for memory
 * @timeo_p: for how long
 */
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
	int err = 0;
	long vm_wait = 0;
	long current_timeo = *timeo_p;
	DEFINE_WAIT(wait);

	if (sk_stream_memory_free(sk))
		current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;

	while (1) {
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
			goto do_error;
		if (!*timeo_p)
			goto do_nonblock;
		if (signal_pending(current))
			goto do_interrupted;
		clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		if (sk_stream_memory_free(sk) && !vm_wait)
			break;

		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		sk->sk_write_pending++;
		sk_wait_event(sk, &current_timeo, sk->sk_err ||
						  (sk->sk_shutdown & SEND_SHUTDOWN) ||
						  (sk_stream_memory_free(sk) &&
						  !vm_wait));
		sk->sk_write_pending--;

		if (vm_wait) {
			vm_wait -= current_timeo;
			current_timeo = *timeo_p;
			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
			    (current_timeo -= vm_wait) < 0)
				current_timeo = 0;
			vm_wait = 0;
		}
		*timeo_p = current_timeo;
	}
out:
	finish_wait(sk_sleep(sk), &wait);
	return err;

do_error:
	err = -EPIPE;
	goto out;
do_nonblock:
	err = -EAGAIN;
	goto out;
do_interrupted:
	err = sock_intr_errno(*timeo_p);
	goto out;
}
EXPORT_SYMBOL(sk_stream_wait_memory);

int sk_stream_error(struct sock *sk, int flags, int err)
{
	if (err == -EPIPE)
		err = sock_error(sk) ? : -EPIPE;
	if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
		send_sig(SIGPIPE, current, 0);
	return err;
}
EXPORT_SYMBOL(sk_stream_error);

void sk_stream_kill_queues(struct sock *sk)
{
	/* First the read buffer. */
	__skb_queue_purge(&sk->sk_receive_queue);

	/* Next, the error queue. */
	__skb_queue_purge(&sk->sk_error_queue);

	/* Next, the write queue. */
	WARN_ON(!skb_queue_empty(&sk->sk_write_queue));

	/* Account for returned memory. */
	sk_mem_reclaim(sk);

	WARN_ON(sk->sk_wmem_queued);
	WARN_ON(sk->sk_forward_alloc);

	/* It is _impossible_ for the backlog to contain anything
	 * when we get here. All user references to this socket
	 * have gone away, only the net layer knows can touch it.
	 */
}
EXPORT_SYMBOL(sk_stream_kill_queues);
435
net/core/sysctl_net_core.c
Normal file
@ -0,0 +1,435 @@
/* -*- linux-c -*-
 * sysctl_net_core.c: sysctl interface to net core subsystem.
 *
 * Begun April 1, 1996, Mike Shaver.
 * Added /proc/sys/net/core directory entry (empty =) ). [MS]
 */

#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>

#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>

static int zero = 0;
static int ushort_max = USHRT_MAX;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;

#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp, loff_t *ppos)
{
	unsigned int orig_size, size;
	int ret, i;
	struct ctl_table tmp = {
		.data = &size,
		.maxlen = sizeof(size),
		.mode = table->mode
	};
	struct rps_sock_flow_table *orig_sock_table, *sock_table;
	static DEFINE_MUTEX(sock_flow_mutex);

	mutex_lock(&sock_flow_mutex);

	orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
					lockdep_is_held(&sock_flow_mutex));
	size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;

	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);

	if (write) {
		if (size) {
			if (size > 1<<30) {
				/* Enforce limit to prevent overflow */
				mutex_unlock(&sock_flow_mutex);
				return -EINVAL;
			}
			size = roundup_pow_of_two(size);
			if (size != orig_size) {
				sock_table =
				    vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
				if (!sock_table) {
					mutex_unlock(&sock_flow_mutex);
					return -ENOMEM;
				}

				sock_table->mask = size - 1;
			} else
				sock_table = orig_sock_table;

			for (i = 0; i < size; i++)
				sock_table->ents[i] = RPS_NO_CPU;
		} else
			sock_table = NULL;

		if (sock_table != orig_sock_table) {
			rcu_assign_pointer(rps_sock_flow_table, sock_table);
			if (sock_table)
				static_key_slow_inc(&rps_needed);
			if (orig_sock_table) {
				static_key_slow_dec(&rps_needed);
				synchronize_rcu();
				vfree(orig_sock_table);
			}
		}
	}

	mutex_unlock(&sock_flow_mutex);

	return ret;
}
#endif /* CONFIG_RPS */

#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);

static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
				 void __user *buffer, size_t *lenp,
				 loff_t *ppos)
{
	struct sd_flow_limit *cur;
	struct softnet_data *sd;
	cpumask_var_t mask;
	int i, len, ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	if (write) {
		ret = cpumask_parse_user(buffer, *lenp, mask);
		if (ret)
			goto done;

		mutex_lock(&flow_limit_update_mutex);
		len = sizeof(*cur) + netdev_flow_limit_table_len;
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			cur = rcu_dereference_protected(sd->flow_limit,
				lockdep_is_held(&flow_limit_update_mutex));
			if (cur && !cpumask_test_cpu(i, mask)) {
				RCU_INIT_POINTER(sd->flow_limit, NULL);
				synchronize_rcu();
				kfree(cur);
			} else if (!cur && cpumask_test_cpu(i, mask)) {
				cur = kzalloc_node(len, GFP_KERNEL,
						   cpu_to_node(i));
				if (!cur) {
					/* not unwinding previous changes */
					ret = -ENOMEM;
					goto write_unlock;
				}
				cur->num_buckets = netdev_flow_limit_table_len;
				rcu_assign_pointer(sd->flow_limit, cur);
			}
		}
write_unlock:
		mutex_unlock(&flow_limit_update_mutex);
	} else {
		char kbuf[128];

		if (*ppos || !*lenp) {
			*lenp = 0;
			goto done;
		}

		cpumask_clear(mask);
		rcu_read_lock();
		for_each_possible_cpu(i) {
			sd = &per_cpu(softnet_data, i);
			if (rcu_dereference(sd->flow_limit))
				cpumask_set_cpu(i, mask);
		}
		rcu_read_unlock();

		len = min(sizeof(kbuf) - 1, *lenp);
		len = cpumask_scnprintf(kbuf, len, mask);
		if (!len) {
			*lenp = 0;
			goto done;
		}
		if (len < *lenp)
			kbuf[len++] = '\n';
		if (copy_to_user(buffer, kbuf, len)) {
			ret = -EFAULT;
			goto done;
		}
		*lenp = len;
		*ppos += len;
	}

done:
	free_cpumask_var(mask);
	return ret;
}

static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
				       void __user *buffer, size_t *lenp,
				       loff_t *ppos)
{
	unsigned int old, *ptr;
	int ret;

	mutex_lock(&flow_limit_update_mutex);

	ptr = table->data;
	old = *ptr;
	ret = proc_dointvec(table, write, buffer, lenp, ppos);
	if (!ret && write && !is_power_of_2(*ptr)) {
		*ptr = old;
		ret = -EINVAL;
	}

	mutex_unlock(&flow_limit_update_mutex);
	return ret;
}
#endif /* CONFIG_NET_FLOW_LIMIT */

#ifdef CONFIG_NET_SCHED
static int set_default_qdisc(struct ctl_table *table, int write,
			     void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char id[IFNAMSIZ];
	struct ctl_table tbl = {
		.data = id,
		.maxlen = IFNAMSIZ,
	};
	int ret;

	qdisc_get_default(id, IFNAMSIZ);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = qdisc_set_default(id);
	return ret;
}
#endif

static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
	{
		.procname	= "wmem_max",
		.data		= &sysctl_wmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_max",
		.data		= &sysctl_rmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "wmem_default",
		.data		= &sysctl_wmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_sndbuf,
	},
	{
		.procname	= "rmem_default",
		.data		= &sysctl_rmem_default,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &min_rcvbuf,
	},
	{
		.procname	= "dev_weight",
		.data		= &weight_p,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "netdev_max_backlog",
		.data		= &netdev_max_backlog,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
#ifdef CONFIG_BPF_JIT
	{
		.procname	= "bpf_jit_enable",
		.data		= &bpf_jit_enable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
#endif
	{
		.procname	= "netdev_tstamp_prequeue",
		.data		= &netdev_tstamp_prequeue,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "message_cost",
		.data		= &net_ratelimit_state.interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "message_burst",
		.data		= &net_ratelimit_state.burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "optmem_max",
		.data		= &sysctl_optmem_max,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
#ifdef CONFIG_RPS
	{
		.procname	= "rps_sock_flow_entries",
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= rps_sock_flow_sysctl
	},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
	{
		.procname	= "flow_limit_cpu_bitmap",
		.mode		= 0644,
		.proc_handler	= flow_limit_cpu_sysctl
	},
	{
		.procname	= "flow_limit_table_len",
		.data		= &netdev_flow_limit_table_len,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= flow_limit_table_len_sysctl
	},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
	{
		.procname	= "busy_poll",
		.data		= &sysctl_net_busy_poll,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "busy_read",
		.data		= &sysctl_net_busy_read,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
#endif
#ifdef CONFIG_NET_SCHED
	{
		.procname	= "default_qdisc",
		.mode		= 0644,
		.maxlen		= IFNAMSIZ,
		.proc_handler	= set_default_qdisc
	},
#endif
#endif /* CONFIG_NET */
	{
		.procname	= "netdev_budget",
		.data		= &netdev_budget,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "warnings",
		.data		= &net_msg_warn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{ }
};

static struct ctl_table netns_core_table[] = {
	{
		.procname	= "somaxconn",
		.data		= &init_net.core.sysctl_somaxconn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.extra1		= &zero,
		.extra2		= &ushort_max,
		.proc_handler	= proc_dointvec_minmax
	},
	{ }
};

static __net_init int sysctl_core_net_init(struct net *net)
{
	struct ctl_table *tbl;

	net->core.sysctl_somaxconn = SOMAXCONN;

	tbl = netns_core_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;

		tbl[0].data = &net->core.sysctl_somaxconn;

		/* Don't export any sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			tbl[0].procname = NULL;
		}
	}

	net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
	if (net->core.sysctl_hdr == NULL)
		goto err_reg;

	return 0;

err_reg:
	if (tbl != netns_core_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_core_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->core.sysctl_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->core.sysctl_hdr);
	BUG_ON(tbl == netns_core_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_core_ops = {
	.init = sysctl_core_net_init,
	.exit = sysctl_core_net_exit,
};

static __init int sysctl_core_init(void)
{
	register_net_sysctl(&init_net, "net/core", net_core_table);
	return register_pernet_subsys(&sysctl_core_ops);
}

fs_initcall(sysctl_core_init);
80
net/core/timestamping.c
Normal file
@ -0,0 +1,80 @@
/*
 * PTP 1588 clock support - support for timestamping in PHY devices
 *
 * Copyright (C) 2010 OMICRON electronics GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/errqueue.h>
#include <linux/phy.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>

static unsigned int classify(const struct sk_buff *skb)
{
	if (likely(skb->dev && skb->dev->phydev &&
		   skb->dev->phydev->drv))
		return ptp_classify_raw(skb);
	else
		return PTP_CLASS_NONE;
}

void skb_clone_tx_timestamp(struct sk_buff *skb)
{
	struct phy_device *phydev;
	struct sk_buff *clone;
	unsigned int type;

	if (!skb->sk)
		return;

	type = classify(skb);
	if (type == PTP_CLASS_NONE)
		return;

	phydev = skb->dev->phydev;
	if (likely(phydev->drv->txtstamp)) {
		clone = skb_clone_sk(skb);
		if (!clone)
			return;
		phydev->drv->txtstamp(phydev, clone, type);
	}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);

bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
	struct phy_device *phydev;
	unsigned int type;

	if (skb_headroom(skb) < ETH_HLEN)
		return false;
	__skb_push(skb, ETH_HLEN);

	type = classify(skb);

	__skb_pull(skb, ETH_HLEN);

	if (type == PTP_CLASS_NONE)
		return false;

	phydev = skb->dev->phydev;
	if (likely(phydev->drv->rxtstamp))
		return phydev->drv->rxtstamp(phydev, skb, type);

	return false;
}
EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
78
net/core/tso.c
Normal file
@ -0,0 +1,78 @@
#include <linux/export.h>
#include <net/ip.h>
#include <net/tso.h>
#include <asm/unaligned.h>

/* Calculate expected number of TX descriptors */
int tso_count_descs(struct sk_buff *skb)
{
	/* The Marvell Way */
	return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
}
EXPORT_SYMBOL(tso_count_descs);

void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
		   int size, bool is_last)
{
	struct iphdr *iph;
	struct tcphdr *tcph;
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int mac_hdr_len = skb_network_offset(skb);

	memcpy(hdr, skb->data, hdr_len);
	iph = (struct iphdr *)(hdr + mac_hdr_len);
	iph->id = htons(tso->ip_id);
	iph->tot_len = htons(size + hdr_len - mac_hdr_len);
	tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
	put_unaligned_be32(tso->tcp_seq, &tcph->seq);
	tso->ip_id++;

	if (!is_last) {
		/* Clear all special flags for not last packet */
		tcph->psh = 0;
		tcph->fin = 0;
		tcph->rst = 0;
	}
}
EXPORT_SYMBOL(tso_build_hdr);

void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
{
	tso->tcp_seq += size;
	tso->size -= size;
	tso->data += size;

	if ((tso->size == 0) &&
	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];

		/* Move to next segment */
		tso->size = frag->size;
		tso->data = page_address(frag->page.p) + frag->page_offset;
		tso->next_frag_idx++;
	}
}
EXPORT_SYMBOL(tso_build_data);

void tso_start(struct sk_buff *skb, struct tso_t *tso)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);

	tso->ip_id = ntohs(ip_hdr(skb)->id);
	tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
	tso->next_frag_idx = 0;

	/* Build first data */
	tso->size = skb_headlen(skb) - hdr_len;
	tso->data = skb->data + hdr_len;
	if ((tso->size == 0) &&
	    (tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];

		/* Move to next segment */
		tso->size = frag->size;
		tso->data = page_address(frag->page.p) + frag->page_offset;
		tso->next_frag_idx++;
	}
}
EXPORT_SYMBOL(tso_start);
387
net/core/utils.c
Normal file
@ -0,0 +1,387 @@
/*
 *	Generic address resolution entity
 *
 *	Authors:
 *	net_random	Alan Cox
 *	net_ratelimit	Andi Kleen
 *	in{4,6}_pton	YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
 *
 *	Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/ratelimit.h>

#include <net/sock.h>
#include <net/net_ratelimit.h>

#include <asm/byteorder.h>
#include <asm/uaccess.h>

int net_msg_warn __read_mostly = 1;
EXPORT_SYMBOL(net_msg_warn);

DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
 * All net warning printk()s should be guarded by this function.
 */
int net_ratelimit(void)
{
	return __ratelimit(&net_ratelimit_state);
}
EXPORT_SYMBOL(net_ratelimit);

/*
 * Convert an ASCII string to binary IP.
 * This is outside of net/ipv4/ because various code that uses IP addresses
 * is otherwise not dependent on the TCP/IP stack.
 */

__be32 in_aton(const char *str)
{
	unsigned long l;
	unsigned int val;
	int i;

	l = 0;
	for (i = 0; i < 4; i++) {
		l <<= 8;
		if (*str != '\0') {
			val = 0;
			while (*str != '\0' && *str != '.' && *str != '\n') {
				val *= 10;
				val += *str - '0';
				str++;
			}
			l |= val;
			if (*str != '\0')
				str++;
		}
	}
	return htonl(l);
}
EXPORT_SYMBOL(in_aton);

#define IN6PTON_XDIGIT		0x00010000
#define IN6PTON_DIGIT		0x00020000
#define IN6PTON_COLON_MASK	0x00700000
#define IN6PTON_COLON_1		0x00100000	/* single : requested */
#define IN6PTON_COLON_2		0x00200000	/* second : requested */
#define IN6PTON_COLON_1_2	0x00400000	/* :: requested */
#define IN6PTON_DOT		0x00800000	/* . */
#define IN6PTON_DELIM		0x10000000
#define IN6PTON_NULL		0x20000000	/* first/tail */
#define IN6PTON_UNKNOWN		0x40000000

static inline int xdigit2bin(char c, int delim)
{
	int val;

	if (c == delim || c == '\0')
		return IN6PTON_DELIM;
	if (c == ':')
		return IN6PTON_COLON_MASK;
	if (c == '.')
		return IN6PTON_DOT;

	val = hex_to_bin(c);
	if (val >= 0)
		return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);

	if (delim == -1)
		return IN6PTON_DELIM;
	return IN6PTON_UNKNOWN;
}

/**
 * in4_pton - convert an IPv4 address from literal to binary representation
 * @src: the start of the IPv4 address string
 * @srclen: the length of the string, -1 means strlen(src)
 * @dst: the binary (u8[4] array) representation of the IPv4 address
 * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
 * @end: A pointer to the end of the parsed string will be placed here
 *
 * Return one on success, return zero when any error occurs
 * and @end will point to the end of the parsed string.
 *
 */
int in4_pton(const char *src, int srclen,
	     u8 *dst,
	     int delim, const char **end)
{
	const char *s;
	u8 *d;
	u8 dbuf[4];
	int ret = 0;
	int i;
	int w = 0;

	if (srclen < 0)
		srclen = strlen(src);
	s = src;
	d = dbuf;
	i = 0;
	while(1) {
		int c;
		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
		if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
			goto out;
		}
		if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
			if (w == 0)
				goto out;
			*d++ = w & 0xff;
			w = 0;
			i++;
			if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
				if (i != 4)
					goto out;
				break;
			}
			goto cont;
		}
		w = (w * 10) + c;
		if ((w & 0xffff) > 255) {
			goto out;
		}
cont:
		if (i >= 4)
			goto out;
		s++;
		srclen--;
	}
	ret = 1;
	memcpy(dst, dbuf, sizeof(dbuf));
out:
	if (end)
		*end = s;
	return ret;
}
EXPORT_SYMBOL(in4_pton);

/**
 * in6_pton - convert an IPv6 address from literal to binary representation
 * @src: the start of the IPv6 address string
 * @srclen: the length of the string, -1 means strlen(src)
 * @dst: the binary (u8[16] array) representation of the IPv6 address
 * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
 * @end: A pointer to the end of the parsed string will be placed here
 *
 * Return one on success, return zero when any error occurs
 * and @end will point to the end of the parsed string.
 *
 */
int in6_pton(const char *src, int srclen,
	     u8 *dst,
	     int delim, const char **end)
{
	const char *s, *tok = NULL;
	u8 *d, *dc = NULL;
	u8 dbuf[16];
	int ret = 0;
	int i;
	int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
	int w = 0;

	memset(dbuf, 0, sizeof(dbuf));

	s = src;
	d = dbuf;
	if (srclen < 0)
		srclen = strlen(src);

	while (1) {
		int c;

		c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
		if (!(c & state))
			goto out;
		if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
			/* process one 16-bit word */
			if (!(state & IN6PTON_NULL)) {
				*d++ = (w >> 8) & 0xff;
				*d++ = w & 0xff;
			}
			w = 0;
			if (c & IN6PTON_DELIM) {
				/* We've processed last word */
				break;
			}
			/*
			 * COLON_1 => XDIGIT
			 * COLON_2 => XDIGIT|DELIM
			 * COLON_1_2 => COLON_2
			 */
			switch (state & IN6PTON_COLON_MASK) {
			case IN6PTON_COLON_2:
				dc = d;
				state = IN6PTON_XDIGIT | IN6PTON_DELIM;
				if (dc - dbuf >= sizeof(dbuf))
					state |= IN6PTON_NULL;
				break;
			case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
				state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
				break;
			case IN6PTON_COLON_1:
				state = IN6PTON_XDIGIT;
				break;
			case IN6PTON_COLON_1_2:
				state = IN6PTON_COLON_2;
				break;
			default:
				state = 0;
			}
			tok = s + 1;
			goto cont;
		}

		if (c & IN6PTON_DOT) {
			ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
			if (ret > 0) {
				d += 4;
				break;
			}
			goto out;
		}

		w = (w << 4) | (0xff & c);
		state = IN6PTON_COLON_1 | IN6PTON_DELIM;
		if (!(w & 0xf000)) {
			state |= IN6PTON_XDIGIT;
		}
		if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
			state |= IN6PTON_COLON_1_2;
			state &= ~IN6PTON_DELIM;
		}
		if (d + 2 >= dbuf + sizeof(dbuf)) {
			state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
		}
cont:
		if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
		    d + 4 == dbuf + sizeof(dbuf)) {
			state |= IN6PTON_DOT;
		}
		if (d >= dbuf + sizeof(dbuf)) {
			state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
		}
		s++;
		srclen--;
	}

	i = 15; d--;

	if (dc) {
		while(d >= dc)
			dst[i--] = *d--;
		while(i >= dc - dbuf)
			dst[i--] = 0;
		while(i >= 0)
			dst[i--] = *d--;
	} else
		memcpy(dst, dbuf, sizeof(dbuf));

	ret = 1;
out:
	if (end)
		*end = s;
	return ret;
}
EXPORT_SYMBOL(in6_pton);

void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
			      __be32 from, __be32 to, int pseudohdr)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		*sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
				 to));
		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
			skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
	} else if (pseudohdr)
		*sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
				  to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);

void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
			       const __be32 *from, const __be32 *to,
			       int pseudohdr)
{
	__be32 diff[] = {
		~from[0], ~from[1], ~from[2], ~from[3],
		to[0], to[1], to[2], to[3],
	};
	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		*sum = csum_fold(csum_partial(diff, sizeof(diff),
				 ~csum_unfold(*sum)));
		if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
			skb->csum = ~csum_partial(diff, sizeof(diff),
						  ~skb->csum);
	} else if (pseudohdr)
		*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
				  csum_unfold(*sum)));
}
EXPORT_SYMBOL(inet_proto_csum_replace16);

struct __net_random_once_work {
	struct work_struct work;
	struct static_key *key;
};

static void __net_random_once_deferred(struct work_struct *w)
{
	struct __net_random_once_work *work =
		container_of(w, struct __net_random_once_work, work);
	BUG_ON(!static_key_enabled(work->key));
	static_key_slow_dec(work->key);
	kfree(work);
}

static void __net_random_once_disable_jump(struct static_key *key)
{
	struct __net_random_once_work *w;

	w = kmalloc(sizeof(*w), GFP_ATOMIC);
	if (!w)
		return;

	INIT_WORK(&w->work, __net_random_once_deferred);
	w->key = key;
	schedule_work(&w->work);
}

bool __net_get_random_once(void *buf, int nbytes, bool *done,
			   struct static_key *once_key)
{
	static DEFINE_SPINLOCK(lock);
	unsigned long flags;

	spin_lock_irqsave(&lock, flags);
	if (*done) {
		spin_unlock_irqrestore(&lock, flags);
		return false;
	}

	get_random_bytes(buf, nbytes);
	*done = true;
	spin_unlock_irqrestore(&lock, flags);

	__net_random_once_disable_jump(once_key);

	return true;
}
EXPORT_SYMBOL(__net_get_random_once);