Fixed MTP to work with TWRP

commit f6dfaef42e
awab228, 2018-06-19 23:16:04 +02:00
50820 changed files with 20846062 additions and 0 deletions

25 net/core/Makefile Normal file

@@ -0,0 +1,25 @@
#
# Makefile for the Linux networking core.
#
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
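# Illustration only (not part of the original Makefile): a new core object
# would be wired in the same way, keyed on its Kconfig symbol, e.g. a
# hypothetical
#   obj-$(CONFIG_NET_FOO) += foo.o
# so it is only built when that option is enabled.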

888 net/core/datagram.c Normal file

@@ -0,0 +1,888 @@
/*
* SUCS NET3:
*
* Generic datagram handling routines. These are generic for all
* protocols. Possibly a generic IP version on top of these would
* make sense. Not tonight however 8-).
* This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
* NetROM layer all have identical poll code and mostly
* identical recvmsg() code. So we share it here. The poll was
* shared before but buried in udp.c so I moved it.
*
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
* udp.c code)
*
* Fixes:
* Alan Cox : NULL return from skb_peek_copy()
* understood
* Alan Cox : Rewrote skb_read_datagram to avoid the
* skb_peek_copy stuff.
* Alan Cox : Added support for SOCK_SEQPACKET.
* IPX can no longer use the SO_TYPE hack
* but AX.25 now works right, and SPX is
* feasible.
* Alan Cox : Fixed write poll of non IP protocol
* crash.
* Florian La Roche: Changed for my new skbuff handling.
* Darryl Miles : Fixed non-blocking SOCK_SEQPACKET.
* Linus Torvalds : BSD semantic fixes.
* Alan Cox : Datagram iovec handling
* Darryl Miles : Fixed non-blocking SOCK_STREAM.
* Alan Cox : POSIXisms
* Pete Wyckoff : Unconnected accept() fix.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
/*
* Is a socket 'connection oriented' ?
*/
static inline int connection_based(struct sock *sk)
{
return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
}
static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int sync,
void *key)
{
unsigned long bits = (unsigned long)key;
/*
* Avoid a wakeup if event not interesting for us
*/
if (bits && !(bits & (POLLIN | POLLERR)))
return 0;
return autoremove_wake_function(wait, mode, sync, key);
}
/*
* Wait for the last received packet to be different from skb
*/
static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
const struct sk_buff *skb)
{
int error;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
error = sock_error(sk);
if (error)
goto out_err;
if (sk->sk_receive_queue.prev != skb)
goto out;
/* Socket shut down? */
if (sk->sk_shutdown & RCV_SHUTDOWN)
goto out_noerr;
/* Sequenced packets can come disconnected.
* If so we report the problem
*/
error = -ENOTCONN;
if (connection_based(sk) &&
!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
goto out_err;
/* handle signals */
if (signal_pending(current))
goto interrupted;
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
finish_wait(sk_sleep(sk), &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
out_err:
*err = error;
goto out;
out_noerr:
*err = 0;
error = 1;
goto out;
}
/**
* __skb_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
* udp, as well as the IPX, AX.25 and AppleTalk. It also finally fixes
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
* This function will lock the socket if a skb is returned, so the caller
* needs to unlock the socket in that case (usually by calling
* skb_free_datagram)
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
* * significantly datagram socket latencies at high loads,
* * when data copying to user space takes lots of time.
* * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
* * 8) Great win.)
* * --ANK (980729)
*
* The order of the tests when we find no data waiting are specified
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
int *peeked, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
int error = sock_error(sk);
if (error)
goto no_packet;
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
* However, this function was correct in any case. 8)
*/
unsigned long cpu_flags;
struct sk_buff_head *queue = &sk->sk_receive_queue;
int _off = *off;
last = (struct sk_buff *)queue;
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
last = skb;
*peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
skb->peeked)) {
_off -= skb->len;
continue;
}
skb->peeked = 1;
atomic_inc(&skb->users);
} else
__skb_unlink(skb, queue);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
*off = _off;
return skb;
}
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (sk_can_busy_loop(sk) &&
sk_busy_loop(sk, flags & MSG_DONTWAIT))
continue;
/* User doesn't want to wait */
error = -EAGAIN;
if (!timeo)
goto no_packet;
} while (!wait_for_more_packets(sk, err, &timeo, last));
return NULL;
no_packet:
*err = error;
return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
int noblock, int *err)
{
int peeked, off = 0;
return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);
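/*
 * Illustration only (not part of this file): a minimal sketch of how a
 * datagram protocol's recvmsg() path is expected to pair skb_recv_datagram()
 * with skb_free_datagram(). The function name and the reduced error handling
 * are hypothetical; real callers also deal with addresses, cmsg, etc.
 */
#if 0
static int my_proto_recvmsg_sketch(struct sock *sk, struct msghdr *msg,
				   size_t len, int noblock, int flags)
{
	struct sk_buff *skb;
	int err;
	int copied;

	/* Wait (or return immediately with MSG_DONTWAIT) for the next datagram. */
	skb = skb_recv_datagram(sk, flags, noblock, &err);
	if (!skb)
		return err;

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	/* Copy the payload into the caller's iovec (helper defined below). */
	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);

	/* Drop our reference; frees the skb unless it is still being peeked. */
	skb_free_datagram(sk, skb);

	return err ? err : copied;
}
#endif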
void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb)
{
bool slow;
if (likely(atomic_read(&skb->users) == 1))
smp_rmb();
else if (likely(!atomic_dec_and_test(&skb->users)))
return;
slow = lock_sock_fast(sk);
skb_orphan(skb);
sk_mem_reclaim_partial(sk);
unlock_sock_fast(sk, slow);
/* skb is now orphaned, can be freed outside of locked section */
__kfree_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram_locked);
/**
* skb_kill_datagram - Free a datagram skbuff forcibly
* @sk: socket
* @skb: datagram skbuff
* @flags: MSG_ flags
*
* This function frees a datagram skbuff that was received by
* skb_recv_datagram. The flags argument must match the one
* used for skb_recv_datagram.
*
* If the MSG_PEEK flag is set, and the packet is still on the
* receive queue of the socket, it will be taken off the queue
* before it is freed.
*
* This function currently only disables BH when acquiring the
* sk_receive_queue lock. Therefore it must not be used in a
* context where that lock is acquired in an IRQ context.
*
* It returns 0 if the packet was removed by us.
*/
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
int err = 0;
if (flags & MSG_PEEK) {
err = -ENOENT;
spin_lock_bh(&sk->sk_receive_queue.lock);
if (skb == skb_peek(&sk->sk_receive_queue)) {
__skb_unlink(skb, &sk->sk_receive_queue);
atomic_dec(&skb->users);
err = 0;
}
spin_unlock_bh(&sk->sk_receive_queue.lock);
}
kfree_skb(skb);
atomic_inc(&sk->sk_drops);
sk_mem_reclaim_partial(sk);
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
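/*
 * Illustration only (not part of this file): skb_kill_datagram() exists for
 * receive paths that peeked a datagram and then found it unusable, e.g. on a
 * checksum failure, as in this hypothetical sketch.
 */
#if 0
static int my_proto_peek_check_sketch(struct sock *sk, unsigned int flags,
				      int noblock)
{
	struct sk_buff *skb;
	int err;

	skb = skb_recv_datagram(sk, flags | MSG_PEEK, noblock, &err);
	if (!skb)
		return err;

	if (skb_checksum_complete(skb)) {
		/* Bad packet: unlink it from the receive queue and free it. */
		skb_kill_datagram(sk, skb, flags | MSG_PEEK);
		return -EAGAIN;
	}

	skb_free_datagram(sk, skb);
	return 0;
}
#endif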
/**
* skb_copy_datagram_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: io vector to copy to
* @len: amount of data to copy from buffer to iovec
*
* Note: the iovec is modified during the copy.
*/
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
struct iovec *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
trace_skb_copy_datagram_iovec(skb, len);
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
if (memcpy_toiovec(to, skb->data + offset, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
vaddr = kmap(page);
err = memcpy_toiovec(to, vaddr + frag->page_offset +
offset - start, copy);
kunmap(page);
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_copy_datagram_iovec(frag_iter,
offset - start,
to, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_iovec);
/**
* skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: io vector to copy to
* @to_offset: offset in the io vector to start copying to
* @len: amount of data to copy from buffer to iovec
*
* Returns 0 or -EFAULT.
* Note: the iovec is not modified during the copy.
*/
int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
const struct iovec *to, int to_offset,
int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
to_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
vaddr = kmap(page);
err = memcpy_toiovecend(to, vaddr + frag->page_offset +
offset - start, to_offset, copy);
kunmap(page);
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
to_offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_copy_datagram_const_iovec(frag_iter,
offset - start,
to, to_offset,
copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
to_offset += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
/**
* skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying to
* @from: io vector to copy to
* @from_offset: offset in the io vector to start copying from
* @len: amount of data to copy to buffer from iovec
*
* Returns 0 or -EFAULT.
* Note: the iovec is not modified during the copy.
*/
int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
const struct iovec *from, int from_offset,
int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
if (memcpy_fromiovecend(skb->data + offset, from, from_offset,
copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
from_offset += copy;
}
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
vaddr = kmap(page);
err = memcpy_fromiovecend(vaddr + frag->page_offset +
offset - start,
from, from_offset, copy);
kunmap(page);
if (err)
goto fault;
if (!(len -= copy))
return 0;
offset += copy;
from_offset += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
if (skb_copy_datagram_from_iovec(frag_iter,
offset - start,
from,
from_offset,
copy))
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
from_offset += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iovec);
/**
* zerocopy_sg_from_iovec - Build a zerocopy datagram from an iovec
* @skb: buffer to copy
* @from: io vector to copy from
* @offset: offset in the io vector to start copying from
* @count: amount of vectors to copy to buffer from
*
* The function will first copy up to headlen, and then pin the userspace
* pages and build frags through them.
*
* Returns 0, -EFAULT or -EMSGSIZE.
* Note: the iovec is not modified during the copy
*/
int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
int offset, size_t count)
{
int len = iov_length(from, count) - offset;
int copy = min_t(int, skb_headlen(skb), len);
int size;
int i = 0;
/* copy up to skb headlen */
if (skb_copy_datagram_from_iovec(skb, 0, from, offset, copy))
return -EFAULT;
if (len == copy)
return 0;
offset += copy;
while (count--) {
struct page *page[MAX_SKB_FRAGS];
int num_pages;
unsigned long base;
unsigned long truesize;
/* Skip over from offset and copied */
if (offset >= from->iov_len) {
offset -= from->iov_len;
++from;
continue;
}
len = from->iov_len - offset;
base = (unsigned long)from->iov_base + offset;
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
if (i + size > MAX_SKB_FRAGS)
return -EMSGSIZE;
num_pages = get_user_pages_fast(base, size, 0, &page[i]);
if (num_pages != size) {
release_pages(&page[i], num_pages, 0);
return -EFAULT;
}
truesize = size * PAGE_SIZE;
skb->data_len += len;
skb->len += len;
skb->truesize += truesize;
atomic_add(truesize, &skb->sk->sk_wmem_alloc);
while (len) {
int off = base & ~PAGE_MASK;
int size = min_t(int, len, PAGE_SIZE - off);
skb_fill_page_desc(skb, i, page[i], off, size);
base += size;
len -= size;
i++;
}
offset = 0;
++from;
}
return 0;
}
EXPORT_SYMBOL(zerocopy_sg_from_iovec);
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
u8 __user *to, int len,
__wsum *csump)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
/* Copy header. */
if (copy > 0) {
int err = 0;
if (copy > len)
copy = len;
*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
*csump, &err);
if (err)
goto fault;
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
pos = copy;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
int err = 0;
u8 *vaddr;
struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
vaddr = kmap(page);
csum2 = csum_and_copy_to_user(vaddr +
frag->page_offset +
offset - start,
to, copy, 0, &err);
kunmap(page);
if (err)
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if (!(len -= copy))
return 0;
offset += copy;
to += copy;
pos += copy;
}
start = end;
}
skb_walk_frags(skb, frag_iter) {
int end;
WARN_ON(start > offset + len);
end = start + frag_iter->len;
if ((copy = end - offset) > 0) {
__wsum csum2 = 0;
if (copy > len)
copy = len;
if (skb_copy_and_csum_datagram(frag_iter,
offset - start,
to, copy,
&csum2))
goto fault;
*csump = csum_block_add(*csump, csum2, pos);
if ((len -= copy) == 0)
return 0;
offset += copy;
to += copy;
pos += copy;
}
start = end;
}
if (!len)
return 0;
fault:
return -EFAULT;
}
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
}
skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
__wsum csum;
__sum16 sum;
csum = skb_checksum(skb, 0, skb->len, 0);
/* skb->csum holds pseudo checksum */
sum = csum_fold(csum_add(skb->csum, csum));
if (likely(!sum)) {
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
!skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
}
/* Save full packet checksum */
skb->csum = csum;
skb->ip_summed = CHECKSUM_COMPLETE;
skb->csum_complete_sw = 1;
skb->csum_valid = !sum;
return sum;
}
EXPORT_SYMBOL(__skb_checksum_complete);
/**
* skb_copy_and_csum_datagram_iovec - Copy and checksum skb to user iovec.
* @skb: skbuff
* @hlen: hardware length
* @iov: io vector
* @len: amount of data to copy from skb to iov
*
* Caller _must_ check that skb will fit to this iovec.
*
* Returns: 0 - success.
* -EINVAL - checksum failure.
* -EFAULT - fault during copy. Beware, in this case iovec
* can be modified!
*/
int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb,
int hlen, struct iovec *iov, int len)
{
__wsum csum;
int chunk = skb->len - hlen;
if (chunk > len)
chunk = len;
if (!chunk)
return 0;
/* Skip filled elements.
* Pretty silly, look at memcpy_toiovec, though 8)
*/
while (!iov->iov_len)
iov++;
if (iov->iov_len < chunk) {
if (__skb_checksum_complete(skb))
goto csum_error;
if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
goto fault;
} else {
csum = csum_partial(skb->data, hlen, skb->csum);
if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base,
chunk, &csum))
goto fault;
if (csum_fold(csum))
goto csum_error;
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
netdev_rx_csum_fault(skb->dev);
iov->iov_len -= chunk;
iov->iov_base += chunk;
}
return 0;
csum_error:
return -EINVAL;
fault:
return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec);
/**
* datagram_poll - generic datagram poll
* @file: file struct
* @sock: socket
* @wait: poll table
*
* Datagram poll: Again totally generic. This also handles
* sequenced packet sockets providing the socket receive queue
* is only ever holding data ready to receive.
*
* Note: when you _don't_ use this routine for this protocol,
* and you use a different write policy from sock_writeable()
* then please supply your own write_space callback.
*/
unsigned int datagram_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
struct sock *sk = sock->sk;
unsigned int mask;
sock_poll_wait(file, sk_sleep(sk), wait);
mask = 0;
/* exceptional events? */
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
mask |= POLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLRDHUP | POLLIN | POLLRDNORM;
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= POLLHUP;
/* readable? */
if (!skb_queue_empty(&sk->sk_receive_queue))
mask |= POLLIN | POLLRDNORM;
/* Connection-based need to check for termination and startup */
if (connection_based(sk)) {
if (sk->sk_state == TCP_CLOSE)
mask |= POLLHUP;
/* connection hasn't started yet? */
if (sk->sk_state == TCP_SYN_SENT)
return mask;
}
/* writable? */
if (sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
return mask;
}
EXPORT_SYMBOL(datagram_poll);
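/*
 * Illustration only (not part of this file): datagram_poll() is meant to be
 * plugged straight into a protocol's proto_ops. A hypothetical minimal ops
 * table (the family and the omitted handlers are placeholders) could look
 * like this:
 */
#if 0
static const struct proto_ops my_dgram_ops_sketch = {
	.family		= PF_UNSPEC,		/* placeholder family */
	.owner		= THIS_MODULE,
	.poll		= datagram_poll,	/* the generic poll above */
	/* bind, sendmsg, recvmsg, ... are protocol specific and omitted */
};
#endif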

7361 net/core/dev.c Normal file

File diff suppressed because it is too large

851 net/core/dev_addr_lists.c Normal file

@@ -0,0 +1,851 @@
/*
* net/core/dev_addr_lists.c - Functions for handling net device lists
* Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
*
* This file contains functions for working with unicast, multicast and device
* addresses lists.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/export.h>
#include <linux/list.h>
/*
* General list handling functions
*/
static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global,
bool sync)
{
struct netdev_hw_addr *ha;
int alloc_size;
alloc_size = sizeof(*ha);
if (alloc_size < L1_CACHE_BYTES)
alloc_size = L1_CACHE_BYTES;
ha = kmalloc(alloc_size, GFP_ATOMIC);
if (!ha)
return -ENOMEM;
memcpy(ha->addr, addr, addr_len);
ha->type = addr_type;
ha->refcount = 1;
ha->global_use = global;
ha->synced = sync ? 1 : 0;
ha->sync_cnt = 0;
list_add_tail_rcu(&ha->list, &list->list);
list->count++;
return 0;
}
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync,
int sync_count)
{
struct netdev_hw_addr *ha;
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
list_for_each_entry(ha, &list->list, list) {
if (!memcmp(ha->addr, addr, addr_len) &&
ha->type == addr_type) {
if (global) {
/* check if addr is already used as global */
if (ha->global_use)
return 0;
else
ha->global_use = true;
}
if (sync) {
if (ha->synced && sync_count)
return -EEXIST;
else
ha->synced++;
}
ha->refcount++;
return 0;
}
}
return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
sync);
}
static int __hw_addr_add(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type)
{
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
0);
}
static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
struct netdev_hw_addr *ha, bool global,
bool sync)
{
if (global && !ha->global_use)
return -ENOENT;
if (sync && !ha->synced)
return -ENOENT;
if (global)
ha->global_use = false;
if (sync)
ha->synced--;
if (--ha->refcount)
return 0;
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
list->count--;
return 0;
}
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
list_for_each_entry(ha, &list->list, list) {
if (!memcmp(ha->addr, addr, addr_len) &&
(ha->type == addr_type || !addr_type))
return __hw_addr_del_entry(list, ha, global, sync);
}
return -ENOENT;
}
static int __hw_addr_del(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type)
{
return __hw_addr_del_ex(list, addr, addr_len, addr_type, false, false);
}
static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr *ha,
int addr_len)
{
int err;
err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
false, true, ha->sync_cnt);
if (err && err != -EEXIST)
return err;
if (!err) {
ha->sync_cnt++;
ha->refcount++;
}
return 0;
}
static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list,
struct netdev_hw_addr *ha,
int addr_len)
{
int err;
err = __hw_addr_del_ex(to_list, ha->addr, addr_len, ha->type,
false, true);
if (err)
return;
ha->sync_cnt--;
/* address on from list is not marked synced */
__hw_addr_del_entry(from_list, ha, false, false);
}
static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list,
int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
if (ha->sync_cnt == ha->refcount) {
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
} else {
err = __hw_addr_sync_one(to_list, ha, addr_len);
if (err)
break;
}
}
return err;
}
/* This function only works where there is a strict 1-1 relationship
* between the source and destination of the sync. If you ever need to
* sync addresses to more than one destination, you need to use
* __hw_addr_sync_multiple().
*/
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list,
int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
if (!ha->sync_cnt) {
err = __hw_addr_sync_one(to_list, ha, addr_len);
if (err)
break;
} else if (ha->refcount == 1)
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
}
return err;
}
EXPORT_SYMBOL(__hw_addr_sync);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
struct netdev_hw_addr_list *from_list,
int addr_len)
{
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
if (ha->sync_cnt)
__hw_addr_unsync_one(to_list, from_list, ha, addr_len);
}
}
EXPORT_SYMBOL(__hw_addr_unsync);
/**
* __hw_addr_sync_dev - Synchronize device's address list
* @list: address list to synchronize
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
* This function is intended to be called from the ndo_set_rx_mode
* function of devices that require explicit address add/remove
* notifications. The unsync function may be NULL in which case
* the addresses requiring removal will simply be removed without
* any notification to the device.
**/
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*sync)(struct net_device *, const unsigned char *),
int (*unsync)(struct net_device *,
const unsigned char *))
{
struct netdev_hw_addr *ha, *tmp;
int err;
/* first go through and flush out any stale entries */
list_for_each_entry_safe(ha, tmp, &list->list, list) {
if (!ha->sync_cnt || ha->refcount != 1)
continue;
/* if unsync is defined and fails defer unsyncing address */
if (unsync && unsync(dev, ha->addr))
continue;
ha->sync_cnt--;
__hw_addr_del_entry(list, ha, false, false);
}
/* go through and sync new entries to the list */
list_for_each_entry_safe(ha, tmp, &list->list, list) {
if (ha->sync_cnt)
continue;
err = sync(dev, ha->addr);
if (err)
return err;
ha->sync_cnt++;
ha->refcount++;
}
return 0;
}
EXPORT_SYMBOL(__hw_addr_sync_dev);
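/*
 * Illustration only (not part of this file): a hypothetical driver whose
 * hardware wants explicit add/remove notifications could call
 * __hw_addr_sync_dev() from its ndo_set_rx_mode handler roughly like this;
 * my_hw_mc_add()/my_hw_mc_del() stand in for device-specific programming.
 */
#if 0
static int my_hw_mc_add(struct net_device *dev, const unsigned char *addr)
{
	/* program one multicast filter entry into the hardware */
	return 0;
}

static int my_hw_mc_del(struct net_device *dev, const unsigned char *addr)
{
	/* remove one multicast filter entry from the hardware */
	return 0;
}

static void my_drv_set_rx_mode_sketch(struct net_device *dev)
{
	/* add newly requested addresses, notify removal of stale ones */
	if (__hw_addr_sync_dev(&dev->mc, dev, my_hw_mc_add, my_hw_mc_del))
		netdev_warn(dev, "multicast sync failed\n");
}
#endif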
/**
* __hw_addr_unsync_dev - Remove synchronized addresses from device
* @list: address list to remove synchronized addresses from
* @dev: device to sync
* @unsync: function to call if address should be removed
*
* Remove all addresses that were added to the device by __hw_addr_sync_dev().
* This function is intended to be called from the ndo_stop or ndo_open
* functions on devices that require explicit address add/remove
* notifications. If the unsync function pointer is NULL then this function
* can be used to just reset the sync_cnt for the addresses in the list.
**/
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
struct net_device *dev,
int (*unsync)(struct net_device *,
const unsigned char *))
{
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
if (!ha->sync_cnt)
continue;
/* if unsync is defined and fails defer unsyncing address */
if (unsync && unsync(dev, ha->addr))
continue;
ha->sync_cnt--;
__hw_addr_del_entry(list, ha, false, false);
}
}
EXPORT_SYMBOL(__hw_addr_unsync_dev);
static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
}
list->count = 0;
}
void __hw_addr_init(struct netdev_hw_addr_list *list)
{
INIT_LIST_HEAD(&list->list);
list->count = 0;
}
EXPORT_SYMBOL(__hw_addr_init);
/*
* Device addresses handling functions
*/
/**
* dev_addr_flush - Flush device address list
* @dev: device
*
* Flush device address list and reset ->dev_addr.
*
* The caller must hold the rtnl_mutex.
*/
void dev_addr_flush(struct net_device *dev)
{
/* rtnl_mutex must be held here */
__hw_addr_flush(&dev->dev_addrs);
dev->dev_addr = NULL;
}
EXPORT_SYMBOL(dev_addr_flush);
/**
* dev_addr_init - Init device address list
* @dev: device
*
* Init device address list and create the first element,
* used by ->dev_addr.
*
* The caller must hold the rtnl_mutex.
*/
int dev_addr_init(struct net_device *dev)
{
unsigned char addr[MAX_ADDR_LEN];
struct netdev_hw_addr *ha;
int err;
/* rtnl_mutex must be held here */
__hw_addr_init(&dev->dev_addrs);
memset(addr, 0, sizeof(addr));
err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
NETDEV_HW_ADDR_T_LAN);
if (!err) {
/*
* Get the first (previously created) address from the list
* and set dev_addr pointer to this location.
*/
ha = list_first_entry(&dev->dev_addrs.list,
struct netdev_hw_addr, list);
dev->dev_addr = ha->addr;
}
return err;
}
EXPORT_SYMBOL(dev_addr_init);
/**
* dev_addr_add - Add a device address
* @dev: device
* @addr: address to add
* @addr_type: address type
*
* Add a device address to the device or increase the reference count if
* it already exists.
*
* The caller must hold the rtnl_mutex.
*/
int dev_addr_add(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
ASSERT_RTNL();
err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
if (!err)
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return err;
}
EXPORT_SYMBOL(dev_addr_add);
/**
* dev_addr_del - Release a device address.
* @dev: device
* @addr: address to delete
* @addr_type: address type
*
* Release reference to a device address and remove it from the device
* if the reference count drops to zero.
*
* The caller must hold the rtnl_mutex.
*/
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
unsigned char addr_type)
{
int err;
struct netdev_hw_addr *ha;
ASSERT_RTNL();
/*
* We can not remove the first address from the list because
* dev->dev_addr points to that.
*/
ha = list_first_entry(&dev->dev_addrs.list,
struct netdev_hw_addr, list);
if (!memcmp(ha->addr, addr, dev->addr_len) &&
ha->type == addr_type && ha->refcount == 1)
return -ENOENT;
err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
addr_type);
if (!err)
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return err;
}
EXPORT_SYMBOL(dev_addr_del);
/*
* Unicast list handling functions
*/
/**
* dev_uc_add_excl - Add a global secondary unicast address
* @dev: device
* @addr: address to add
*/
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
list_for_each_entry(ha, &dev->uc.list, list) {
if (!memcmp(ha->addr, addr, dev->addr_len) &&
ha->type == NETDEV_HW_ADDR_T_UNICAST) {
err = -EEXIST;
goto out;
}
}
err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_UNICAST, true, false);
if (!err)
__dev_set_rx_mode(dev);
out:
netif_addr_unlock_bh(dev);
return err;
}
EXPORT_SYMBOL(dev_uc_add_excl);
/**
* dev_uc_add - Add a secondary unicast address
* @dev: device
* @addr: address to add
*
* Add a secondary unicast address to the device or increase
* the reference count if it already exists.
*/
int dev_uc_add(struct net_device *dev, const unsigned char *addr)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_UNICAST);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
return err;
}
EXPORT_SYMBOL(dev_uc_add);
/**
* dev_uc_del - Release secondary unicast address.
* @dev: device
* @addr: address to delete
*
* Release reference to a secondary unicast address and remove it
* from the device if the reference count drops to zero.
*/
int dev_uc_del(struct net_device *dev, const unsigned char *addr)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_UNICAST);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
return err;
}
EXPORT_SYMBOL(dev_uc_del);
/**
* dev_uc_sync - Synchronize device's unicast list to another device
* @to: destination device
* @from: source device
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
* locked by netif_addr_lock_bh.
*
* This function is intended to be called from the dev->set_rx_mode
* function of layered software devices. This function assumes that
* addresses will only ever be synced to the @to devices and no other.
*/
int dev_uc_sync(struct net_device *to, struct net_device *from)
{
int err = 0;
if (to->addr_len != from->addr_len)
return -EINVAL;
netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_uc_sync);
/**
* dev_uc_sync_multiple - Synchronize device's unicast list to another
* device, but allow for multiple calls to sync to multiple devices.
* @to: destination device
* @from: source device
*
* Add newly added addresses to the destination device and release
* addresses that have been deleted from the source. The source device
* must be locked by netif_addr_lock_bh.
*
* This function is intended to be called from the dev->set_rx_mode
* function of layered software devices. It allows for a single source
* device to be synced to multiple destination devices.
*/
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
{
int err = 0;
if (to->addr_len != from->addr_len)
return -EINVAL;
netif_addr_lock_nested(to);
err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_uc_sync_multiple);
/**
* dev_uc_unsync - Remove synchronized addresses from the destination device
* @to: destination device
* @from: source device
*
* Remove all addresses that were added to the destination device by
* dev_uc_sync(). This function is intended to be called from the
* dev->stop function of layered software devices.
*/
void dev_uc_unsync(struct net_device *to, struct net_device *from)
{
if (to->addr_len != from->addr_len)
return;
netif_addr_lock_bh(from);
netif_addr_lock_nested(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_uc_unsync);
/**
* dev_uc_flush - Flush unicast addresses
* @dev: device
*
* Flush unicast addresses.
*/
void dev_uc_flush(struct net_device *dev)
{
netif_addr_lock_bh(dev);
__hw_addr_flush(&dev->uc);
netif_addr_unlock_bh(dev);
}
EXPORT_SYMBOL(dev_uc_flush);
/**
* dev_uc_init - Init unicast address list
* @dev: device
*
* Init unicast address list.
*/
void dev_uc_init(struct net_device *dev)
{
__hw_addr_init(&dev->uc);
}
EXPORT_SYMBOL(dev_uc_init);
/*
* Multicast list handling functions
*/
/**
* dev_mc_add_excl - Add a global secondary multicast address
* @dev: device
* @addr: address to add
*/
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
list_for_each_entry(ha, &dev->mc.list, list) {
if (!memcmp(ha->addr, addr, dev->addr_len) &&
ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
err = -EEXIST;
goto out;
}
}
err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_MULTICAST, true, false);
if (!err)
__dev_set_rx_mode(dev);
out:
netif_addr_unlock_bh(dev);
return err;
}
EXPORT_SYMBOL(dev_mc_add_excl);
static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
return err;
}
/**
* dev_mc_add - Add a multicast address
* @dev: device
* @addr: address to add
*
* Add a multicast address to the device or increase
* the reference count if it already exists.
*/
int dev_mc_add(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, false);
}
EXPORT_SYMBOL(dev_mc_add);
/**
* dev_mc_add_global - Add a global multicast address
* @dev: device
* @addr: address to add
*
* Add a global multicast address to the device.
*/
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_add(dev, addr, true);
}
EXPORT_SYMBOL(dev_mc_add_global);
static int __dev_mc_del(struct net_device *dev, const unsigned char *addr,
bool global)
{
int err;
netif_addr_lock_bh(dev);
err = __hw_addr_del_ex(&dev->mc, addr, dev->addr_len,
NETDEV_HW_ADDR_T_MULTICAST, global, false);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
return err;
}
/**
* dev_mc_del - Delete a multicast address.
* @dev: device
* @addr: address to delete
*
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
int dev_mc_del(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, false);
}
EXPORT_SYMBOL(dev_mc_del);
/**
* dev_mc_del_global - Delete a global multicast address.
* @dev: device
* @addr: address to delete
*
* Release reference to a multicast address and remove it
* from the device if the reference count drops to zero.
*/
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr)
{
return __dev_mc_del(dev, addr, true);
}
EXPORT_SYMBOL(dev_mc_del_global);
/**
* dev_mc_sync - Synchronize device's multicast list to another device
* @to: destination device
* @from: source device
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
* locked by netif_addr_lock_bh.
*
* This function is intended to be called from the ndo_set_rx_mode
* function of layered software devices.
*/
int dev_mc_sync(struct net_device *to, struct net_device *from)
{
int err = 0;
if (to->addr_len != from->addr_len)
return -EINVAL;
netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_mc_sync);
/**
* dev_mc_sync_multiple - Synchronize device's multicast list to another
* device, but allow for multiple calls to sync to multiple devices.
* @to: destination device
* @from: source device
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
* locked by netif_addr_lock_bh.
*
* This function is intended to be called from the ndo_set_rx_mode
* function of layered software devices. It allows for a single
* source device to be synced to multiple destination devices.
*/
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
{
int err = 0;
if (to->addr_len != from->addr_len)
return -EINVAL;
netif_addr_lock_nested(to);
err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_mc_sync_multiple);
/**
* dev_mc_unsync - Remove synchronized addresses from the destination device
* @to: destination device
* @from: source device
*
* Remove all addresses that were added to the destination device by
* dev_mc_sync(). This function is intended to be called from the
* dev->stop function of layered software devices.
*/
void dev_mc_unsync(struct net_device *to, struct net_device *from)
{
if (to->addr_len != from->addr_len)
return;
netif_addr_lock_bh(from);
netif_addr_lock_nested(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
netif_addr_unlock_bh(from);
}
EXPORT_SYMBOL(dev_mc_unsync);
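/*
 * Illustration only (not part of this file): a layered software device (a
 * VLAN-style upper device) typically propagates its address lists to the
 * lower device with dev_uc_sync()/dev_mc_sync() from ndo_set_rx_mode and
 * removes them again with the _unsync variants when it stops. The
 * my_upper_priv structure is hypothetical.
 */
#if 0
struct my_upper_priv {
	struct net_device *lowerdev;	/* hypothetical: the real device below */
};

static void my_upper_set_rx_mode_sketch(struct net_device *dev)
{
	struct my_upper_priv *priv = netdev_priv(dev);

	dev_uc_sync(priv->lowerdev, dev);
	dev_mc_sync(priv->lowerdev, dev);
}

static int my_upper_stop_sketch(struct net_device *dev)
{
	struct my_upper_priv *priv = netdev_priv(dev);

	dev_uc_unsync(priv->lowerdev, dev);
	dev_mc_unsync(priv->lowerdev, dev);
	return 0;
}
#endif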
/**
* dev_mc_flush - Flush multicast addresses
* @dev: device
*
* Flush multicast addresses.
*/
void dev_mc_flush(struct net_device *dev)
{
netif_addr_lock_bh(dev);
__hw_addr_flush(&dev->mc);
netif_addr_unlock_bh(dev);
}
EXPORT_SYMBOL(dev_mc_flush);
/**
* dev_mc_init - Init multicast address list
* @dev: device
*
* Init multicast address list.
*/
void dev_mc_init(struct net_device *dev)
{
__hw_addr_init(&dev->mc);
}
EXPORT_SYMBOL(dev_mc_init);

564 net/core/dev_ioctl.c Normal file

@@ -0,0 +1,564 @@
#include <linux/kmod.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/wireless.h>
#include <net/wext.h>
/*
* Map an interface index to its name (SIOCGIFNAME)
*/
/*
* We need this ioctl for efficient implementation of the
* if_indextoname() function required by the IPv6 API. Without
* it, we would have to search all the interfaces to find a
* match. --pb
*/
static int dev_ifname(struct net *net, struct ifreq __user *arg)
{
struct ifreq ifr;
int error;
/*
* Fetch the caller's info block.
*/
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex);
if (error)
return error;
if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
return -EFAULT;
return 0;
}
static gifconf_func_t *gifconf_list[NPROTO];
/**
* register_gifconf - register a SIOCGIF handler
* @family: Address family
* @gifconf: Function handler
*
* Register protocol dependent address dumping routines. The handler
* that is passed must not be freed or reused until it has been replaced
* by another handler.
*/
int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
{
if (family >= NPROTO)
return -EINVAL;
gifconf_list[family] = gifconf;
return 0;
}
EXPORT_SYMBOL(register_gifconf);
/*
* Perform a SIOCGIFCONF call. This structure will change
* size eventually, and there is nothing I can do about it.
* Thus we will need a 'compatibility mode'.
*/
static int dev_ifconf(struct net *net, char __user *arg)
{
struct ifconf ifc;
struct net_device *dev;
char __user *pos;
int len;
int total;
int i;
/*
* Fetch the caller's info block.
*/
if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
return -EFAULT;
pos = ifc.ifc_buf;
len = ifc.ifc_len;
/*
* Loop over the interfaces, and write an info block for each.
*/
total = 0;
for_each_netdev(net, dev) {
for (i = 0; i < NPROTO; i++) {
if (gifconf_list[i]) {
int done;
if (!pos)
done = gifconf_list[i](dev, NULL, 0);
else
done = gifconf_list[i](dev, pos + total,
len - total);
if (done < 0)
return -EFAULT;
total += done;
}
}
}
/*
* All done. Write the updated control block back to the caller.
*/
ifc.ifc_len = total;
/*
* Both BSD and Solaris return 0 here, so we do too.
*/
return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
}
/*
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
*/
static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
int err;
struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
if (!dev)
return -ENODEV;
switch (cmd) {
case SIOCGIFFLAGS: /* Get interface flags */
ifr->ifr_flags = (short) dev_get_flags(dev);
return 0;
case SIOCGIFMETRIC: /* Get the metric on the interface
(currently unused) */
ifr->ifr_metric = 0;
return 0;
case SIOCGIFMTU: /* Get the MTU of a device */
ifr->ifr_mtu = dev->mtu;
return 0;
case SIOCGIFHWADDR:
if (!dev->addr_len)
memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
else
memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
ifr->ifr_hwaddr.sa_family = dev->type;
return 0;
case SIOCGIFSLAVE:
err = -EINVAL;
break;
case SIOCGIFMAP:
ifr->ifr_map.mem_start = dev->mem_start;
ifr->ifr_map.mem_end = dev->mem_end;
ifr->ifr_map.base_addr = dev->base_addr;
ifr->ifr_map.irq = dev->irq;
ifr->ifr_map.dma = dev->dma;
ifr->ifr_map.port = dev->if_port;
return 0;
case SIOCGIFINDEX:
ifr->ifr_ifindex = dev->ifindex;
return 0;
case SIOCGIFTXQLEN:
ifr->ifr_qlen = dev->tx_queue_len;
return 0;
default:
/* dev_ioctl() should ensure this case
* is never reached
*/
WARN_ON(1);
err = -ENOTTY;
break;
}
return err;
}
static int net_hwtstamp_validate(struct ifreq *ifr)
{
struct hwtstamp_config cfg;
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
int tx_type_valid = 0;
int rx_filter_valid = 0;
if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
return -EFAULT;
if (cfg.flags) /* reserved for future extensions */
return -EINVAL;
tx_type = cfg.tx_type;
rx_filter = cfg.rx_filter;
switch (tx_type) {
case HWTSTAMP_TX_OFF:
case HWTSTAMP_TX_ON:
case HWTSTAMP_TX_ONESTEP_SYNC:
tx_type_valid = 1;
break;
}
switch (rx_filter) {
case HWTSTAMP_FILTER_NONE:
case HWTSTAMP_FILTER_ALL:
case HWTSTAMP_FILTER_SOME:
case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
case HWTSTAMP_FILTER_PTP_V2_EVENT:
case HWTSTAMP_FILTER_PTP_V2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
rx_filter_valid = 1;
break;
}
if (!tx_type_valid || !rx_filter_valid)
return -ERANGE;
return 0;
}
/*
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
if (!dev)
return -ENODEV;
ops = dev->netdev_ops;
switch (cmd) {
case SIOCSIFFLAGS: /* Set interface flags */
return dev_change_flags(dev, ifr->ifr_flags);
case SIOCSIFMETRIC: /* Set the metric on the interface
(currently unused) */
return -EOPNOTSUPP;
case SIOCSIFMTU: /* Set the MTU of a device */
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
return -EINVAL;
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return 0;
case SIOCSIFMAP:
if (ops->ndo_set_config) {
if (!netif_device_present(dev))
return -ENODEV;
return ops->ndo_set_config(dev, &ifr->ifr_map);
}
return -EOPNOTSUPP;
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
case SIOCDELMULTI:
if (!ops->ndo_set_rx_mode ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
return -EINVAL;
dev->tx_queue_len = ifr->ifr_qlen;
return 0;
case SIOCSIFNAME:
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
return dev_change_name(dev, ifr->ifr_newname);
case SIOCSHWTSTAMP:
err = net_hwtstamp_validate(ifr);
if (err)
return err;
/* fall through */
/*
* Unknown or private ioctl
*/
default:
if ((cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15) ||
cmd == SIOCBONDENSLAVE ||
cmd == SIOCBONDRELEASE ||
cmd == SIOCBONDSETHWADDR ||
cmd == SIOCBONDSLAVEINFOQUERY ||
cmd == SIOCBONDINFOQUERY ||
cmd == SIOCBONDCHANGEACTIVE ||
cmd == SIOCGMIIPHY ||
cmd == SIOCGMIIREG ||
cmd == SIOCSMIIREG ||
cmd == SIOCBRADDIF ||
cmd == SIOCBRDELIF ||
cmd == SIOCSHWTSTAMP ||
cmd == SIOCGHWTSTAMP ||
cmd == SIOCWANDEV) {
err = -EOPNOTSUPP;
if (ops->ndo_do_ioctl) {
if (netif_device_present(dev))
err = ops->ndo_do_ioctl(dev, ifr, cmd);
else
err = -ENODEV;
}
} else
err = -EINVAL;
}
return err;
}
/**
* dev_load - load a network module
* @net: the applicable net namespace
* @name: name of interface
*
* If a network interface is not present and the process has suitable
* privileges this function loads the module. If module loading is not
* available in this kernel then it becomes a nop.
*/
void dev_load(struct net *net, const char *name)
{
struct net_device *dev;
int no_module;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
rcu_read_unlock();
no_module = !dev;
if (no_module && capable(CAP_NET_ADMIN))
no_module = request_module("netdev-%s", name);
if (no_module && capable(CAP_SYS_MODULE))
request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);
/*
* This function handles all "interface"-type I/O control requests. The actual
* 'doing' part of this is dev_ifsioc above.
*/
/**
* dev_ioctl - network device ioctl
* @net: the applicable net namespace
* @cmd: command to issue
* @arg: pointer to a struct ifreq in user space
*
* Issue ioctl functions to devices. This is normally called by the
* user space syscall interfaces but can sometimes be useful for
* other purposes. The return value is the return from the syscall if
* positive or a negative errno code on error.
*/
int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
struct ifreq ifr;
int ret;
char *colon;
/* One special case: SIOCGIFCONF takes ifconf argument
and requires shared lock, because it sleeps writing
to user space.
*/
if (cmd == SIOCGIFCONF) {
rtnl_lock();
ret = dev_ifconf(net, (char __user *) arg);
rtnl_unlock();
return ret;
}
if (cmd == SIOCGIFNAME)
return dev_ifname(net, (struct ifreq __user *)arg);
if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
return -EFAULT;
ifr.ifr_name[IFNAMSIZ-1] = 0;
colon = strchr(ifr.ifr_name, ':');
if (colon)
*colon = 0;
/*
* See which interface the caller is talking about.
*/
switch (cmd) {
/*
* These ioctl calls:
* - can be done by all.
* - atomic and do not require locking.
* - return a value
*/
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFHWADDR:
case SIOCGIFSLAVE:
case SIOCGIFMAP:
case SIOCGIFINDEX:
case SIOCGIFTXQLEN:
dev_load(net, ifr.ifr_name);
rcu_read_lock();
ret = dev_ifsioc_locked(net, &ifr, cmd);
rcu_read_unlock();
if (!ret) {
if (colon)
*colon = ':';
if (copy_to_user(arg, &ifr,
sizeof(struct ifreq)))
ret = -EFAULT;
}
return ret;
case SIOCETHTOOL:
dev_load(net, ifr.ifr_name);
rtnl_lock();
ret = dev_ethtool(net, &ifr);
rtnl_unlock();
if (!ret) {
if (colon)
*colon = ':';
if (copy_to_user(arg, &ifr,
sizeof(struct ifreq)))
ret = -EFAULT;
}
return ret;
/*
* These ioctl calls:
* - require superuser power.
* - require strict serialization.
* - return a value
*/
case SIOCGMIIPHY:
case SIOCGMIIREG:
case SIOCSIFNAME:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
dev_load(net, ifr.ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, &ifr, cmd);
rtnl_unlock();
if (!ret) {
if (colon)
*colon = ':';
if (copy_to_user(arg, &ifr,
sizeof(struct ifreq)))
ret = -EFAULT;
}
return ret;
/*
* These ioctl calls:
* - require superuser power.
* - require strict serialization.
* - do not return a value
*/
case SIOCSIFMAP:
case SIOCSIFTXQLEN:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
/* fall through */
/*
* These ioctl calls:
* - require local superuser power.
* - require strict serialization.
* - do not return a value
*/
case SIOCSIFFLAGS:
case SIOCSIFMETRIC:
case SIOCSIFMTU:
case SIOCSIFHWADDR:
case SIOCSIFSLAVE:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCSIFHWBROADCAST:
case SIOCSMIIREG:
case SIOCBONDENSLAVE:
case SIOCBONDRELEASE:
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
case SIOCBRADDIF:
case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
/* fall through */
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr.ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, &ifr, cmd);
rtnl_unlock();
return ret;
case SIOCGIFMEM:
/* Get the per device memory space. We can add this but
* currently do not support it */
case SIOCSIFMEM:
/* Set the per device memory buffer space.
* Not applicable in our case */
case SIOCSIFLINK:
return -ENOTTY;
/*
* Unknown or private ioctl.
*/
default:
if (cmd == SIOCWANDEV ||
cmd == SIOCGHWTSTAMP ||
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr.ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, &ifr, cmd);
rtnl_unlock();
if (!ret && copy_to_user(arg, &ifr,
sizeof(struct ifreq)))
ret = -EFAULT;
return ret;
}
/* Take care of Wireless Extensions */
if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
return wext_handle_ioctl(net, &ifr, cmd, arg);
return -ENOTTY;
}
}
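/*
 * Illustration only (not part of this file): the SIOCGIFxxx requests handled
 * by dev_ifsioc_locked() above are what user space reaches through ioctl()
 * on any socket. A hypothetical minimal query of an interface's hardware
 * address (user-space code; "eth0" is just an example name):
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFHWADDR, &ifr) == 0) {
		const unsigned char *a =
			(const unsigned char *)ifr.ifr_hwaddr.sa_data;
		printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
		       a[0], a[1], a[2], a[3], a[4], a[5]);
	}

	close(fd);
	return 0;
}
#endif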

437 net/core/drop_monitor.c Normal file

@@ -0,0 +1,437 @@
/*
* Monitoring code for network dropped packet alerts
*
* Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/genetlink.h>
#include <net/netevent.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
#include <asm/unaligned.h>
#define TRACE_ON 1
#define TRACE_OFF 0
/*
* Globals, our netlink socket pointer
* and the work handle that will send up
* netlink alerts
*/
static int trace_state = TRACE_OFF;
static DEFINE_MUTEX(trace_state_mutex);
struct per_cpu_dm_data {
spinlock_t lock;
struct sk_buff *skb;
struct work_struct dm_alert_work;
struct timer_list send_timer;
};
struct dm_hw_stat_delta {
struct net_device *dev;
unsigned long last_rx;
struct list_head list;
struct rcu_head rcu;
unsigned long last_drop_val;
};
static struct genl_family net_drop_monitor_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
.name = "NET_DM",
.version = 2,
};
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
static LIST_HEAD(hw_stats_list);
static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
{
size_t al;
struct net_dm_alert_msg *msg;
struct nlattr *nla;
struct sk_buff *skb;
unsigned long flags;
al = sizeof(struct net_dm_alert_msg);
al += dm_hit_limit * sizeof(struct net_dm_drop_point);
al += sizeof(struct nlattr);
skb = genlmsg_new(al, GFP_KERNEL);
if (skb) {
genlmsg_put(skb, 0, 0, &net_drop_monitor_family,
0, NET_DM_CMD_ALERT);
nla = nla_reserve(skb, NLA_UNSPEC,
sizeof(struct net_dm_alert_msg));
msg = nla_data(nla);
memset(msg, 0, al);
} else {
mod_timer(&data->send_timer, jiffies + HZ / 10);
}
spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
spin_unlock_irqrestore(&data->lock, flags);
return skb;
}
static struct genl_multicast_group dropmon_mcgrps[] = {
{ .name = "events", },
};
static void send_dm_alert(struct work_struct *work)
{
struct sk_buff *skb;
struct per_cpu_dm_data *data;
data = container_of(work, struct per_cpu_dm_data, dm_alert_work);
skb = reset_per_cpu_data(data);
if (skb)
genlmsg_multicast(&net_drop_monitor_family, skb, 0,
0, GFP_KERNEL);
}
/*
* This is the timer function to delay the sending of an alert
* in the event that more drops will arrive during the
* hysteresis period.
*/
static void sched_send_work(unsigned long _data)
{
struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
schedule_work(&data->dm_alert_work);
}
static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
struct sk_buff *dskb;
struct per_cpu_dm_data *data;
unsigned long flags;
local_irq_save(flags);
data = this_cpu_ptr(&dm_cpu_data);
spin_lock(&data->lock);
dskb = data->skb;
if (!dskb)
goto out;
nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
for (i = 0; i < msg->entries; i++) {
if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
msg->points[i].count++;
goto out;
}
}
if (msg->entries == dm_hit_limit)
goto out;
/*
* We need to create a new entry
*/
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
msg->points[msg->entries].count = 1;
msg->entries++;
if (!timer_pending(&data->send_timer)) {
data->send_timer.expires = jiffies + dm_delay * HZ;
add_timer(&data->send_timer);
}
out:
spin_unlock_irqrestore(&data->lock, flags);
}
static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
{
trace_drop_common(skb, location);
}
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi)
{
struct dm_hw_stat_delta *new_stat;
/*
* Don't check napi structures with no associated device
*/
if (!napi->dev)
return;
rcu_read_lock();
list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
/*
* only add a note to our monitor buffer if:
* 1) this is the dev we received on
* 2) it's after the last_rx delta
* 3) our rx_dropped count has gone up
*/
if ((new_stat->dev == napi->dev) &&
(time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
(napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
trace_drop_common(NULL, NULL);
new_stat->last_drop_val = napi->dev->stats.rx_dropped;
new_stat->last_rx = jiffies;
break;
}
}
rcu_read_unlock();
}
static int set_all_monitor_traces(int state)
{
int rc = 0;
struct dm_hw_stat_delta *new_stat = NULL;
struct dm_hw_stat_delta *temp;
mutex_lock(&trace_state_mutex);
if (state == trace_state) {
rc = -EAGAIN;
goto out_unlock;
}
switch (state) {
case TRACE_ON:
if (!try_module_get(THIS_MODULE)) {
rc = -ENODEV;
break;
}
rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
break;
case TRACE_OFF:
rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
tracepoint_synchronize_unregister();
/*
* Clean the device list
*/
list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
if (new_stat->dev == NULL) {
list_del_rcu(&new_stat->list);
kfree_rcu(new_stat, rcu);
}
}
module_put(THIS_MODULE);
break;
default:
rc = 1;
break;
}
if (!rc)
trace_state = state;
else
rc = -EINPROGRESS;
out_unlock:
mutex_unlock(&trace_state_mutex);
return rc;
}
static int net_dm_cmd_config(struct sk_buff *skb,
struct genl_info *info)
{
return -ENOTSUPP;
}
static int net_dm_cmd_trace(struct sk_buff *skb,
struct genl_info *info)
{
switch (info->genlhdr->cmd) {
case NET_DM_CMD_START:
return set_all_monitor_traces(TRACE_ON);
case NET_DM_CMD_STOP:
return set_all_monitor_traces(TRACE_OFF);
}
return -ENOTSUPP;
}
static int dropmon_net_event(struct notifier_block *ev_block,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct dm_hw_stat_delta *new_stat = NULL;
struct dm_hw_stat_delta *tmp;
switch (event) {
case NETDEV_REGISTER:
new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
if (!new_stat)
goto out;
new_stat->dev = dev;
new_stat->last_rx = jiffies;
mutex_lock(&trace_state_mutex);
list_add_rcu(&new_stat->list, &hw_stats_list);
mutex_unlock(&trace_state_mutex);
break;
case NETDEV_UNREGISTER:
mutex_lock(&trace_state_mutex);
list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
if (new_stat->dev == dev) {
new_stat->dev = NULL;
if (trace_state == TRACE_OFF) {
list_del_rcu(&new_stat->list);
kfree_rcu(new_stat, rcu);
break;
}
}
}
mutex_unlock(&trace_state_mutex);
break;
}
out:
return NOTIFY_DONE;
}
static const struct genl_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
.doit = net_dm_cmd_config,
},
{
.cmd = NET_DM_CMD_START,
.doit = net_dm_cmd_trace,
},
{
.cmd = NET_DM_CMD_STOP,
.doit = net_dm_cmd_trace,
},
};
static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
static int __init init_net_drop_monitor(void)
{
struct per_cpu_dm_data *data;
int cpu, rc;
pr_info("Initializing network drop monitor service\n");
if (sizeof(void *) > 8) {
pr_err("Unable to store program counters on this arch, Drop monitor failed\n");
return -ENOSPC;
}
rc = genl_register_family_with_ops_groups(&net_drop_monitor_family,
dropmon_ops, dropmon_mcgrps);
if (rc) {
pr_err("Could not create drop monitor netlink family\n");
return rc;
}
WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
pr_crit("Failed to register netdevice notifier\n");
goto out_unreg;
}
rc = 0;
for_each_possible_cpu(cpu) {
data = &per_cpu(dm_cpu_data, cpu);
INIT_WORK(&data->dm_alert_work, send_dm_alert);
init_timer(&data->send_timer);
data->send_timer.data = (unsigned long)data;
data->send_timer.function = sched_send_work;
spin_lock_init(&data->lock);
reset_per_cpu_data(data);
}
goto out;
out_unreg:
genl_unregister_family(&net_drop_monitor_family);
out:
return rc;
}
static void exit_net_drop_monitor(void)
{
struct per_cpu_dm_data *data;
int cpu;
BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
/*
* Because of the module_get/put we do in the trace state change path,
* we are guaranteed not to have any current users when we get here.
* All we need to do is make sure that we don't have any running timers
* or pending schedule calls.
*/
for_each_possible_cpu(cpu) {
data = &per_cpu(dm_cpu_data, cpu);
del_timer_sync(&data->send_timer);
cancel_work_sync(&data->dm_alert_work);
/*
* At this point, we should have exclusive access
* to this struct and can free the skb inside it
*/
kfree_skb(data->skb);
}
BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}
module_init(init_net_drop_monitor);
module_exit(exit_net_drop_monitor);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
MODULE_ALIAS_GENL_FAMILY("NET_DM");
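/*
 * Illustrative sketch: the aggregation in trace_drop_common() boils down to
 * "find this drop location in the pending alert, bump its count, or append
 * a new entry until the hit limit is reached". The structure and helper
 * below are hypothetical and restate only that bookkeeping, outside of the
 * netlink framing and locking above (the real code stores the pointer bytes
 * in a fixed-size pc[] field and compares them with memcmp()).
 */
struct drop_point_sketch {
	void *pc;		/* instruction pointer where the drop happened */
	unsigned int count;	/* drops seen at that location since last alert */
};

static int record_drop_sketch(struct drop_point_sketch *points,
			      unsigned int *entries, unsigned int limit,
			      void *location)
{
	unsigned int i;

	for (i = 0; i < *entries; i++) {
		if (points[i].pc == location) {
			points[i].count++;	/* already tracked: just bump */
			return 0;
		}
	}
	if (*entries >= limit)
		return -1;	/* full; further drops wait for the next alert */
	points[*entries].pc = location;
	points[*entries].count = 1;
	(*entries)++;
	return 0;
}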

421
net/core/dst.c Normal file

@ -0,0 +1,421 @@
/*
* net/core/dst.c Protocol independent destination cache.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
*/
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <linux/sched.h>
#include <linux/prefetch.h>
#include <net/dst.h>
/*
* Theory of operations:
* 1) We use a list, protected by a spinlock, to add
* new entries from both BH and non-BH context.
* 2) In order to keep spinlock held for a small delay,
* we use a second list where are stored long lived
* entries, that are handled by the garbage collect thread
* fired by a workqueue.
* 3) This list is guarded by a mutex,
* so that the gc_task and dst_dev_event() can be synchronized.
*/
/*
* We want to keep lock & list close together
* to dirty as few cache lines as possible in __dst_free().
* As this is not a very strong hint, we don't force an alignment on SMP.
*/
static struct {
spinlock_t lock;
struct dst_entry *list;
unsigned long timer_inc;
unsigned long timer_expires;
} dst_garbage = {
.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
.timer_inc = DST_GC_MAX,
};
static void dst_gc_task(struct work_struct *work);
static void ___dst_free(struct dst_entry *dst);
static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
static DEFINE_MUTEX(dst_gc_mutex);
/*
* long lived entries are maintained in this list, guarded by dst_gc_mutex
*/
static struct dst_entry *dst_busy_list;
static void dst_gc_task(struct work_struct *work)
{
int delayed = 0;
int work_performed = 0;
unsigned long expires = ~0L;
struct dst_entry *dst, *next, head;
struct dst_entry *last = &head;
mutex_lock(&dst_gc_mutex);
next = dst_busy_list;
loop:
while ((dst = next) != NULL) {
next = dst->next;
prefetch(&next->next);
cond_resched();
if (likely(atomic_read(&dst->__refcnt))) {
last->next = dst;
last = dst;
delayed++;
continue;
}
work_performed++;
dst = dst_destroy(dst);
if (dst) {
/* NOHASH and still referenced. Unless it is already
* on gc list, invalidate it and add to gc list.
*
* Note: this is temporary. Actually, NOHASH dst's
* must be obsoleted when parent is obsoleted.
* But we do not have state "obsoleted, but
* referenced by parent", so it is right.
*/
if (dst->obsolete > 0)
continue;
___dst_free(dst);
dst->next = next;
next = dst;
}
}
spin_lock_bh(&dst_garbage.lock);
next = dst_garbage.list;
if (next) {
dst_garbage.list = NULL;
spin_unlock_bh(&dst_garbage.lock);
goto loop;
}
last->next = NULL;
dst_busy_list = head.next;
if (!dst_busy_list)
dst_garbage.timer_inc = DST_GC_MAX;
else {
/*
* if we freed less than 1/10 of delayed entries,
* we can sleep longer.
*/
if (work_performed <= delayed/10) {
dst_garbage.timer_expires += dst_garbage.timer_inc;
if (dst_garbage.timer_expires > DST_GC_MAX)
dst_garbage.timer_expires = DST_GC_MAX;
dst_garbage.timer_inc += DST_GC_INC;
} else {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
}
expires = dst_garbage.timer_expires;
/*
* if the next desired timer is more than 4 seconds in the
* future then round the timer to whole seconds
*/
if (expires > 4*HZ)
expires = round_jiffies_relative(expires);
schedule_delayed_work(&dst_gc_work, expires);
}
spin_unlock_bh(&dst_garbage.lock);
mutex_unlock(&dst_gc_mutex);
}
int dst_discard_sk(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL(dst_discard_sk);
const u32 dst_default_metrics[RTAX_MAX + 1] = {
/* This initializer is needed to force the linker to place this variable
* into the const section. Otherwise it might end up in the bss section.
* We really want to avoid false sharing on this variable, and catch
* any writes on it.
*/
[RTAX_MAX] = 0xdeadbeef,
};
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
int initial_ref, int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
if (ops->gc(ops))
return NULL;
}
dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
dst->child = NULL;
dst->dev = dev;
if (dev)
dev_hold(dev);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics, true);
dst->expires = 0UL;
dst->path = dst;
dst->from = NULL;
#ifdef CONFIG_XFRM
dst->xfrm = NULL;
#endif
dst->input = dst_discard;
dst->output = dst_discard_sk;
dst->error = 0;
dst->obsolete = initial_obsolete;
dst->header_len = 0;
dst->trailer_len = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
dst->tclassid = 0;
#endif
atomic_set(&dst->__refcnt, initial_ref);
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
dst->pending_confirm = 0;
dst->next = NULL;
if (!(flags & DST_NOCOUNT))
dst_entries_add(ops, 1);
return dst;
}
EXPORT_SYMBOL(dst_alloc);
static void ___dst_free(struct dst_entry *dst)
{
/* The first case (dev == NULL) is required when a
protocol module is unloaded.
*/
if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
dst->input = dst_discard;
dst->output = dst_discard_sk;
}
dst->obsolete = DST_OBSOLETE_DEAD;
}
void __dst_free(struct dst_entry *dst)
{
spin_lock_bh(&dst_garbage.lock);
___dst_free(dst);
dst->next = dst_garbage.list;
dst_garbage.list = dst;
if (dst_garbage.timer_inc > DST_GC_INC) {
dst_garbage.timer_inc = DST_GC_INC;
dst_garbage.timer_expires = DST_GC_MIN;
mod_delayed_work(system_wq, &dst_gc_work,
dst_garbage.timer_expires);
}
spin_unlock_bh(&dst_garbage.lock);
}
EXPORT_SYMBOL(__dst_free);
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
struct dst_entry *child;
smp_rmb();
again:
child = dst->child;
if (!(dst->flags & DST_NOCOUNT))
dst_entries_add(dst->ops, -1);
if (dst->ops->destroy)
dst->ops->destroy(dst);
if (dst->dev)
dev_put(dst->dev);
kmem_cache_free(dst->ops->kmem_cachep, dst);
dst = child;
if (dst) {
int nohash = dst->flags & DST_NOHASH;
if (atomic_dec_and_test(&dst->__refcnt)) {
/* We were real parent of this dst, so kill child. */
if (nohash)
goto again;
} else {
/* Child is still referenced, return it for freeing. */
if (nohash)
return dst;
/* Child is still in its hash table */
}
}
return NULL;
}
EXPORT_SYMBOL(dst_destroy);
static void dst_destroy_rcu(struct rcu_head *head)
{
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
dst = dst_destroy(dst);
if (dst)
__dst_free(dst);
}
void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
newrefcnt = atomic_dec_return(&dst->__refcnt);
WARN_ON(newrefcnt < 0);
if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old)
{
u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC);
if (p) {
u32 *old_p = __DST_METRICS_PTR(old);
unsigned long prev, new;
memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
new = (unsigned long) p;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev != old) {
kfree(p);
p = __DST_METRICS_PTR(prev);
if (prev & DST_METRICS_READ_ONLY)
p = NULL;
}
}
return p;
}
EXPORT_SYMBOL(dst_cow_metrics_generic);
/* Caller asserts that dst_metrics_read_only(dst) is false. */
void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
{
unsigned long prev, new;
new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY;
prev = cmpxchg(&dst->_metrics, old, new);
if (prev == old)
kfree(__DST_METRICS_PTR(old));
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
/**
* __skb_dst_set_noref - sets skb dst, without a reference
* @skb: buffer
* @dst: dst entry
* @force: if force is set, use noref version even for DST_NOCACHE entries
*
* Sets skb dst, assuming a reference was not taken on dst
* skb_dst_drop() should not dst_release() this dst
*/
void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force)
{
WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
/* If dst not in cache, we must take a reference, because
* dst_release() will destroy dst as soon as its refcount becomes zero
*/
if (unlikely((dst->flags & DST_NOCACHE) && !force)) {
dst_hold(dst);
skb_dst_set(skb, dst);
} else {
skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
}
}
EXPORT_SYMBOL(__skb_dst_set_noref);
/* Dirty hack. We did it in 2.2 (in __dst_free),
* we have _very_ good reasons not to repeat
* this mistake in 2.3, but we have no choice
* now. _It_ _is_ _explicit_ _deliberate_
* _race_ _condition_.
*
* Commented and originally written by Alexey.
*/
static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int unregister)
{
if (dst->ops->ifdown)
dst->ops->ifdown(dst, dev, unregister);
if (dev != dst->dev)
return;
if (!unregister) {
dst->input = dst_discard;
dst->output = dst_discard_sk;
} else {
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
}
}
static int dst_dev_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct dst_entry *dst, *last = NULL;
switch (event) {
case NETDEV_UNREGISTER_FINAL:
case NETDEV_DOWN:
mutex_lock(&dst_gc_mutex);
for (dst = dst_busy_list; dst; dst = dst->next) {
last = dst;
dst_ifdown(dst, dev, event != NETDEV_DOWN);
}
spin_lock_bh(&dst_garbage.lock);
dst = dst_garbage.list;
dst_garbage.list = NULL;
spin_unlock_bh(&dst_garbage.lock);
if (last)
last->next = dst;
else
dst_busy_list = dst;
for (; dst; dst = dst->next)
dst_ifdown(dst, dev, event != NETDEV_DOWN);
mutex_unlock(&dst_gc_mutex);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block dst_dev_notifier = {
.notifier_call = dst_dev_event,
.priority = -10, /* must be called after other network notifiers */
};
void __init dst_init(void)
{
register_netdevice_notifier(&dst_dev_notifier);
}
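/*
 * Illustrative sketch: dst_gc_task() adapts its re-arm delay. When a pass
 * frees less than a tenth of the delayed entries it backs off, bounded by
 * DST_GC_MAX; otherwise it drops back to the minimum period. The helper
 * below is hypothetical and only restates that policy, using the
 * DST_GC_MIN/DST_GC_INC/DST_GC_MAX constants already used above.
 */
static unsigned long dst_gc_next_delay_sketch(unsigned long *timer_inc,
					      unsigned long *timer_expires,
					      int work_performed, int delayed)
{
	if (work_performed <= delayed / 10) {
		/* Little progress: lengthen the sleep, up to DST_GC_MAX. */
		*timer_expires += *timer_inc;
		if (*timer_expires > DST_GC_MAX)
			*timer_expires = DST_GC_MAX;
		*timer_inc += DST_GC_INC;
	} else {
		/* Entries are being freed: poll again at the minimum rate. */
		*timer_inc = DST_GC_INC;
		*timer_expires = DST_GC_MIN;
	}
	return *timer_expires;
}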

1957
net/core/ethtool.c Normal file

File diff suppressed because it is too large

854
net/core/fib_rules.c Normal file

@ -0,0 +1,854 @@
/*
* net/core/fib_rules.c Generic Routing Rules
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation, version 2.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
int fib_default_rule_add(struct fib_rules_ops *ops,
u32 pref, u32 table, u32 flags)
{
struct fib_rule *r;
r = kzalloc(ops->rule_size, GFP_KERNEL);
if (r == NULL)
return -ENOMEM;
atomic_set(&r->refcnt, 1);
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
r->flags = flags;
r->uid_start = INVALID_UID;
r->uid_end = INVALID_UID;
r->fr_net = hold_net(ops->fro_net);
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
/* The lock is not required here, the list is unreachable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
return 0;
}
EXPORT_SYMBOL(fib_default_rule_add);
u32 fib_default_rule_pref(struct fib_rules_ops *ops)
{
struct list_head *pos;
struct fib_rule *rule;
if (!list_empty(&ops->rules_list)) {
pos = ops->rules_list.next;
if (pos->next != &ops->rules_list) {
rule = list_entry(pos->next, struct fib_rule, list);
if (rule->pref)
return rule->pref - 1;
}
}
return 0;
}
EXPORT_SYMBOL(fib_default_rule_pref);
static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid);
static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
{
struct fib_rules_ops *ops;
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (ops->family == family) {
if (!try_module_get(ops->owner))
ops = NULL;
rcu_read_unlock();
return ops;
}
}
rcu_read_unlock();
return NULL;
}
static void rules_ops_put(struct fib_rules_ops *ops)
{
if (ops)
module_put(ops->owner);
}
static void flush_route_cache(struct fib_rules_ops *ops)
{
if (ops->flush_cache)
ops->flush_cache(ops);
}
static int __fib_rules_register(struct fib_rules_ops *ops)
{
int err = -EEXIST;
struct fib_rules_ops *o;
struct net *net;
net = ops->fro_net;
if (ops->rule_size < sizeof(struct fib_rule))
return -EINVAL;
if (ops->match == NULL || ops->configure == NULL ||
ops->compare == NULL || ops->fill == NULL ||
ops->action == NULL)
return -EINVAL;
spin_lock(&net->rules_mod_lock);
list_for_each_entry(o, &net->rules_ops, list)
if (ops->family == o->family)
goto errout;
hold_net(net);
list_add_tail_rcu(&ops->list, &net->rules_ops);
err = 0;
errout:
spin_unlock(&net->rules_mod_lock);
return err;
}
struct fib_rules_ops *
fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
{
struct fib_rules_ops *ops;
int err;
ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
if (ops == NULL)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&ops->rules_list);
ops->fro_net = net;
err = __fib_rules_register(ops);
if (err) {
kfree(ops);
ops = ERR_PTR(err);
}
return ops;
}
EXPORT_SYMBOL_GPL(fib_rules_register);
static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
{
struct fib_rule *rule, *tmp;
list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) {
list_del_rcu(&rule->list);
if (ops->delete)
ops->delete(rule);
fib_rule_put(rule);
}
}
static void fib_rules_put_rcu(struct rcu_head *head)
{
struct fib_rules_ops *ops = container_of(head, struct fib_rules_ops, rcu);
struct net *net = ops->fro_net;
release_net(net);
kfree(ops);
}
void fib_rules_unregister(struct fib_rules_ops *ops)
{
struct net *net = ops->fro_net;
spin_lock(&net->rules_mod_lock);
list_del_rcu(&ops->list);
fib_rules_cleanup_ops(ops);
spin_unlock(&net->rules_mod_lock);
call_rcu(&ops->rcu, fib_rules_put_rcu);
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);
static inline kuid_t fib_nl_uid(struct nlattr *nla)
{
return make_kuid(current_user_ns(), nla_get_u32(nla));
}
static int nla_put_uid(struct sk_buff *skb, int idx, kuid_t uid)
{
return nla_put_u32(skb, idx, from_kuid_munged(current_user_ns(), uid));
}
static int fib_uid_range_match(struct flowi *fl, struct fib_rule *rule)
{
return (!uid_valid(rule->uid_start) && !uid_valid(rule->uid_end)) ||
(uid_gte(fl->flowi_uid, rule->uid_start) &&
uid_lte(fl->flowi_uid, rule->uid_end));
}
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags)
{
int ret = 0;
if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
goto out;
if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
goto out;
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
if (!fib_uid_range_match(fl, rule))
goto out;
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
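/*
 * Illustrative sketch: fib_rule_match() computes a plain predicate and then
 * applies FIB_RULE_INVERT, so an inverted rule selects traffic exactly when
 * the predicate fails. The helper below is hypothetical and isolates only
 * that final step.
 */
static inline int fib_rule_apply_invert_sketch(int matched, u32 flags)
{
	return (flags & FIB_RULE_INVERT) ? !matched : matched;
}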
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
int flags, struct fib_lookup_arg *arg)
{
struct fib_rule *rule;
int err;
rcu_read_lock();
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
jumped:
if (!fib_rule_match(rule, ops, fl, flags))
continue;
if (rule->action == FR_ACT_GOTO) {
struct fib_rule *target;
target = rcu_dereference(rule->ctarget);
if (target == NULL) {
continue;
} else {
rule = target;
goto jumped;
}
} else if (rule->action == FR_ACT_NOP)
continue;
else
err = ops->action(rule, fl, flags, arg);
if (!err && ops->suppress && ops->suppress(rule, arg))
continue;
if (err != -EAGAIN) {
if ((arg->flags & FIB_LOOKUP_NOREF) ||
likely(atomic_inc_not_zero(&rule->refcnt))) {
arg->rule = rule;
goto out;
}
break;
}
}
err = -ESRCH;
out:
rcu_read_unlock();
return err;
}
EXPORT_SYMBOL_GPL(fib_rules_lookup);
static int validate_rulemsg(struct fib_rule_hdr *frh, struct nlattr **tb,
struct fib_rules_ops *ops)
{
int err = -EINVAL;
if (frh->src_len)
if (tb[FRA_SRC] == NULL ||
frh->src_len > (ops->addr_size * 8) ||
nla_len(tb[FRA_SRC]) != ops->addr_size)
goto errout;
if (frh->dst_len)
if (tb[FRA_DST] == NULL ||
frh->dst_len > (ops->addr_size * 8) ||
nla_len(tb[FRA_DST]) != ops->addr_size)
goto errout;
err = 0;
errout:
return err;
}
static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *r, *last = NULL;
struct nlattr *tb[FRA_MAX+1];
int err = -EINVAL, unresolved = 0;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
goto errout;
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
if (err < 0)
goto errout;
err = validate_rulemsg(frh, tb, ops);
if (err < 0)
goto errout;
rule = kzalloc(ops->rule_size, GFP_KERNEL);
if (rule == NULL) {
err = -ENOMEM;
goto errout;
}
rule->fr_net = hold_net(net);
if (tb[FRA_PRIORITY])
rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
if (tb[FRA_IIFNAME]) {
struct net_device *dev;
rule->iifindex = -1;
nla_strlcpy(rule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, rule->iifname);
if (dev)
rule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
struct net_device *dev;
rule->oifindex = -1;
nla_strlcpy(rule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, rule->oifname);
if (dev)
rule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
rule->mark = nla_get_u32(tb[FRA_FWMARK]);
if (rule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
rule->mark_mask = 0xFFFFFFFF;
}
if (tb[FRA_FWMASK])
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
rule->action = frh->action;
rule->flags = frh->flags;
rule->table = frh_get_table(frh, tb);
if (tb[FRA_SUPPRESS_PREFIXLEN])
rule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);
else
rule->suppress_prefixlen = -1;
if (tb[FRA_SUPPRESS_IFGROUP])
rule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);
else
rule->suppress_ifgroup = -1;
if (!tb[FRA_PRIORITY] && ops->default_pref)
rule->pref = ops->default_pref(ops);
err = -EINVAL;
if (tb[FRA_GOTO]) {
if (rule->action != FR_ACT_GOTO)
goto errout_free;
rule->target = nla_get_u32(tb[FRA_GOTO]);
/* Backward jumps are prohibited to avoid endless loops */
if (rule->target <= rule->pref)
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref == rule->target) {
RCU_INIT_POINTER(rule->ctarget, r);
break;
}
}
if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
unresolved = 1;
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
/* UID start and end must either both be valid or both unspecified. */
rule->uid_start = rule->uid_end = INVALID_UID;
if (tb[FRA_UID_START] || tb[FRA_UID_END]) {
if (tb[FRA_UID_START] && tb[FRA_UID_END]) {
rule->uid_start = fib_nl_uid(tb[FRA_UID_START]);
rule->uid_end = fib_nl_uid(tb[FRA_UID_END]);
}
if (!uid_valid(rule->uid_start) ||
!uid_valid(rule->uid_end) ||
!uid_lte(rule->uid_start, rule->uid_end))
goto errout_free;
}
err = ops->configure(rule, skb, frh, tb);
if (err < 0)
goto errout_free;
list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
}
fib_rule_get(rule);
if (last)
list_add_rcu(&rule->list, &last->list);
else
list_add_rcu(&rule->list, &ops->rules_list);
if (ops->unresolved_rules) {
/*
* There are unresolved goto rules in the list, check if
* any of them are pointing to this new rule.
*/
list_for_each_entry(r, &ops->rules_list, list) {
if (r->action == FR_ACT_GOTO &&
r->target == rule->pref &&
rtnl_dereference(r->ctarget) == NULL) {
rcu_assign_pointer(r->ctarget, rule);
if (--ops->unresolved_rules == 0)
break;
}
}
}
if (rule->action == FR_ACT_GOTO)
ops->nr_goto_rules++;
if (unresolved)
ops->unresolved_rules++;
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
errout_free:
release_net(rule->fr_net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
int err = -EINVAL;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
goto errout;
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
if (err < 0)
goto errout;
err = validate_rulemsg(frh, tb, ops);
if (err < 0)
goto errout;
list_for_each_entry(rule, &ops->rules_list, list) {
if (frh->action && (frh->action != rule->action))
continue;
if (frh_get_table(frh, tb) &&
(frh_get_table(frh, tb) != rule->table))
continue;
if (tb[FRA_PRIORITY] &&
(rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
continue;
if (tb[FRA_IIFNAME] &&
nla_strcmp(tb[FRA_IIFNAME], rule->iifname))
continue;
if (tb[FRA_OIFNAME] &&
nla_strcmp(tb[FRA_OIFNAME], rule->oifname))
continue;
if (tb[FRA_FWMARK] &&
(rule->mark != nla_get_u32(tb[FRA_FWMARK])))
continue;
if (tb[FRA_FWMASK] &&
(rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
if (tb[FRA_UID_START] &&
!uid_eq(rule->uid_start, fib_nl_uid(tb[FRA_UID_START])))
continue;
if (tb[FRA_UID_END] &&
!uid_eq(rule->uid_end, fib_nl_uid(tb[FRA_UID_END])))
continue;
if (!ops->compare(rule, frh, tb))
continue;
if (rule->flags & FIB_RULE_PERMANENT) {
err = -EPERM;
goto errout;
}
list_del_rcu(&rule->list);
if (rule->action == FR_ACT_GOTO) {
ops->nr_goto_rules--;
if (rtnl_dereference(rule->ctarget) == NULL)
ops->unresolved_rules--;
}
/*
* Check if this rule is the target of any goto rules. If so,
* disable them. As this operation is eventually very
* expensive, it is only performed if goto rules have
* actually been added.
*/
if (ops->nr_goto_rules > 0) {
list_for_each_entry(tmp, &ops->rules_list, list) {
if (rtnl_dereference(tmp->ctarget) == rule) {
RCU_INIT_POINTER(tmp->ctarget, NULL);
ops->unresolved_rules++;
}
}
}
notify_rule_change(RTM_DELRULE, rule, ops, nlh,
NETLINK_CB(skb).portid);
if (ops->delete)
ops->delete(rule);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
}
err = -ENOENT;
errout:
rules_ops_put(ops);
return err;
}
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
{
size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
+ nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
+ nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
+ nla_total_size(4) /* FRA_PRIORITY */
+ nla_total_size(4) /* FRA_TABLE */
+ nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
+ nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
+ nla_total_size(4) /* FRA_FWMARK */
+ nla_total_size(4) /* FRA_FWMASK */
+ nla_total_size(4) /* FRA_UID_START */
+ nla_total_size(4); /* FRA_UID_END */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
return payload;
}
static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
u32 pid, u32 seq, int type, int flags,
struct fib_rules_ops *ops)
{
struct nlmsghdr *nlh;
struct fib_rule_hdr *frh;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*frh), flags);
if (nlh == NULL)
return -EMSGSIZE;
frh = nlmsg_data(nlh);
frh->family = ops->family;
frh->table = rule->table;
if (nla_put_u32(skb, FRA_TABLE, rule->table))
goto nla_put_failure;
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
goto nla_put_failure;
frh->res1 = 0;
frh->res2 = 0;
frh->action = rule->action;
frh->flags = rule->flags;
if (rule->action == FR_ACT_GOTO &&
rcu_access_pointer(rule->ctarget) == NULL)
frh->flags |= FIB_RULE_UNRESOLVED;
if (rule->iifname[0]) {
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
goto nla_put_failure;
if (rule->iifindex == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
goto nla_put_failure;
if (rule->oifindex == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
if ((rule->pref &&
nla_put_u32(skb, FRA_PRIORITY, rule->pref)) ||
(rule->mark &&
nla_put_u32(skb, FRA_FWMARK, rule->mark)) ||
((rule->mark_mask || rule->mark) &&
nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) ||
(rule->target &&
nla_put_u32(skb, FRA_GOTO, rule->target)) ||
(uid_valid(rule->uid_start) &&
nla_put_uid(skb, FRA_UID_START, rule->uid_start)) ||
(uid_valid(rule->uid_end) &&
nla_put_uid(skb, FRA_UID_END, rule->uid_end)))
goto nla_put_failure;
if (rule->suppress_ifgroup != -1) {
if (nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
goto nla_put_failure;
}
if (ops->fill(rule, skb, frh) < 0)
goto nla_put_failure;
return nlmsg_end(skb, nlh);
nla_put_failure:
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
struct fib_rules_ops *ops)
{
int idx = 0;
struct fib_rule *rule;
rcu_read_lock();
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
if (idx < cb->args[1])
goto skip;
if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, RTM_NEWRULE,
NLM_F_MULTI, ops) < 0)
break;
skip:
idx++;
}
rcu_read_unlock();
cb->args[1] = idx;
rules_ops_put(ops);
return skb->len;
}
static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
int idx = 0, family;
family = rtnl_msg_family(cb->nlh);
if (family != AF_UNSPEC) {
/* Protocol specific dump request */
ops = lookup_rules_ops(net, family);
if (ops == NULL)
return -EAFNOSUPPORT;
return dump_rules(skb, cb, ops);
}
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (idx < cb->args[0] || !try_module_get(ops->owner))
goto skip;
if (dump_rules(skb, cb, ops) < 0)
break;
cb->args[1] = 0;
skip:
idx++;
}
rcu_read_unlock();
cb->args[0] = idx;
return skb->len;
}
static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid)
{
struct net *net;
struct sk_buff *skb;
int err = -ENOBUFS;
net = ops->fro_net;
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
if (skb == NULL)
goto errout;
err = fib_nl_fill_rule(skb, rule, pid, nlh->nlmsg_seq, event, 0, ops);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout;
}
rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
return;
errout:
if (err < 0)
rtnl_set_sk_err(net, ops->nlgroup, err);
}
static void attach_rules(struct list_head *rules, struct net_device *dev)
{
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == -1 &&
strcmp(dev->name, rule->iifname) == 0)
rule->iifindex = dev->ifindex;
if (rule->oifindex == -1 &&
strcmp(dev->name, rule->oifname) == 0)
rule->oifindex = dev->ifindex;
}
}
static void detach_rules(struct list_head *rules, struct net_device *dev)
{
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == dev->ifindex)
rule->iifindex = -1;
if (rule->oifindex == dev->ifindex)
rule->oifindex = -1;
}
}
static int fib_rules_event(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
struct fib_rules_ops *ops;
ASSERT_RTNL();
switch (event) {
case NETDEV_REGISTER:
list_for_each_entry(ops, &net->rules_ops, list)
attach_rules(&ops->rules_list, dev);
break;
case NETDEV_CHANGENAME:
list_for_each_entry(ops, &net->rules_ops, list) {
detach_rules(&ops->rules_list, dev);
attach_rules(&ops->rules_list, dev);
}
break;
case NETDEV_UNREGISTER:
list_for_each_entry(ops, &net->rules_ops, list)
detach_rules(&ops->rules_list, dev);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block fib_rules_notifier = {
.notifier_call = fib_rules_event,
};
static int __net_init fib_rules_net_init(struct net *net)
{
INIT_LIST_HEAD(&net->rules_ops);
spin_lock_init(&net->rules_mod_lock);
return 0;
}
static struct pernet_operations fib_rules_net_ops = {
.init = fib_rules_net_init,
};
static int __init fib_rules_init(void)
{
int err;
rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, NULL);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
goto fail;
err = register_netdevice_notifier(&fib_rules_notifier);
if (err < 0)
goto fail_unregister;
return 0;
fail_unregister:
unregister_pernet_subsys(&fib_rules_net_ops);
fail:
rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
return err;
}
subsys_initcall(fib_rules_init);
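/*
 * Illustrative sketch: fib_nl_newrule() keeps ops->rules_list sorted by
 * preference and links a new rule after the last existing rule whose pref
 * is not larger. The helper below is hypothetical and restates that
 * ordering decision over a plain array of preferences.
 */
static int fib_rule_insert_pos_sketch(const u32 *prefs, int n, u32 new_pref)
{
	int i, last = -1;

	for (i = 0; i < n; i++) {
		if (prefs[i] > new_pref)
			break;		/* first rule with a strictly larger pref */
		last = i;
	}
	return last;	/* insert after this index, or at the head if -1 */
}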

1149
net/core/filter.c Normal file

File diff suppressed because it is too large

511
net/core/flow.c Normal file

@ -0,0 +1,511 @@
/* flow.c: Generic flow cache.
*
* Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru)
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/completion.h>
#include <linux/percpu.h>
#include <linux/bitops.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/mutex.h>
#include <net/flow.h>
#include <linux/atomic.h>
#include <linux/security.h>
#include <net/net_namespace.h>
struct flow_cache_entry {
union {
struct hlist_node hlist;
struct list_head gc_list;
} u;
struct net *net;
u16 family;
u8 dir;
u32 genid;
struct flowi key;
struct flow_cache_object *object;
};
struct flow_flush_info {
struct flow_cache *cache;
atomic_t cpuleft;
struct completion completion;
};
static struct kmem_cache *flow_cachep __read_mostly;
#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift)
#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ)
static void flow_cache_new_hashrnd(unsigned long arg)
{
struct flow_cache *fc = (void *) arg;
int i;
for_each_possible_cpu(i)
per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&fc->rnd_timer);
}
static int flow_entry_valid(struct flow_cache_entry *fle,
struct netns_xfrm *xfrm)
{
if (atomic_read(&xfrm->flow_cache_genid) != fle->genid)
return 0;
if (fle->object && !fle->object->ops->check(fle->object))
return 0;
return 1;
}
static void flow_entry_kill(struct flow_cache_entry *fle,
struct netns_xfrm *xfrm)
{
if (fle->object)
fle->object->ops->delete(fle->object);
kmem_cache_free(flow_cachep, fle);
}
static void flow_cache_gc_task(struct work_struct *work)
{
struct list_head gc_list;
struct flow_cache_entry *fce, *n;
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
flow_cache_gc_work);
INIT_LIST_HEAD(&gc_list);
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail_init(&xfrm->flow_cache_gc_list, &gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
list_for_each_entry_safe(fce, n, &gc_list, u.gc_list)
flow_entry_kill(fce, xfrm);
}
static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp,
int deleted, struct list_head *gc_list,
struct netns_xfrm *xfrm)
{
if (deleted) {
fcp->hash_count -= deleted;
spin_lock_bh(&xfrm->flow_cache_gc_lock);
list_splice_tail(gc_list, &xfrm->flow_cache_gc_list);
spin_unlock_bh(&xfrm->flow_cache_gc_lock);
schedule_work(&xfrm->flow_cache_gc_work);
}
}
static void __flow_cache_shrink(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
int shrink_to)
{
struct flow_cache_entry *fle;
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
flow_cache_global);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
int saved = 0;
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (saved < shrink_to &&
flow_entry_valid(fle, xfrm)) {
saved++;
} else {
deleted++;
hlist_del(&fle->u.hlist);
list_add_tail(&fle->u.gc_list, &gc_list);
}
}
}
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
}
static void flow_cache_shrink(struct flow_cache *fc,
struct flow_cache_percpu *fcp)
{
int shrink_to = fc->low_watermark / flow_cache_hash_size(fc);
__flow_cache_shrink(fc, fcp, shrink_to);
}
static void flow_new_hash_rnd(struct flow_cache *fc,
struct flow_cache_percpu *fcp)
{
get_random_bytes(&fcp->hash_rnd, sizeof(u32));
fcp->hash_rnd_recalc = 0;
__flow_cache_shrink(fc, fcp, 0);
}
static u32 flow_hash_code(struct flow_cache *fc,
struct flow_cache_percpu *fcp,
const struct flowi *key,
size_t keysize)
{
const u32 *k = (const u32 *) key;
const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32);
return jhash2(k, length, fcp->hash_rnd)
& (flow_cache_hash_size(fc) - 1);
}
/* I hear what you're saying, use memcmp. But memcmp cannot make
* important assumptions that we can here, such as alignment.
*/
static int flow_key_compare(const struct flowi *key1, const struct flowi *key2,
size_t keysize)
{
const flow_compare_t *k1, *k1_lim, *k2;
k1 = (const flow_compare_t *) key1;
k1_lim = k1 + keysize;
k2 = (const flow_compare_t *) key2;
do {
if (*k1++ != *k2++)
return 1;
} while (k1 < k1_lim);
return 0;
}
struct flow_cache_object *
flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
flow_resolve_t resolver, void *ctx)
{
struct flow_cache *fc = &net->xfrm.flow_cache_global;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle, *tfle;
struct flow_cache_object *flo;
size_t keysize;
unsigned int hash;
local_bh_disable();
fcp = this_cpu_ptr(fc->percpu);
fle = NULL;
flo = NULL;
keysize = flow_key_size(family);
if (!keysize)
goto nocache;
/* Packet really early in init? Making flow_cache_init a
* pre-smp initcall would solve this. --RR */
if (!fcp->hash_table)
goto nocache;
if (fcp->hash_rnd_recalc)
flow_new_hash_rnd(fc, fcp);
hash = flow_hash_code(fc, fcp, key, keysize);
hlist_for_each_entry(tfle, &fcp->hash_table[hash], u.hlist) {
if (tfle->net == net &&
tfle->family == family &&
tfle->dir == dir &&
flow_key_compare(key, &tfle->key, keysize) == 0) {
fle = tfle;
break;
}
}
if (unlikely(!fle)) {
if (fcp->hash_count > fc->high_watermark)
flow_cache_shrink(fc, fcp);
fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC);
if (fle) {
fle->net = net;
fle->family = family;
fle->dir = dir;
memcpy(&fle->key, key, keysize * sizeof(flow_compare_t));
fle->object = NULL;
hlist_add_head(&fle->u.hlist, &fcp->hash_table[hash]);
fcp->hash_count++;
}
} else if (likely(fle->genid == atomic_read(&net->xfrm.flow_cache_genid))) {
flo = fle->object;
if (!flo)
goto ret_object;
flo = flo->ops->get(flo);
if (flo)
goto ret_object;
} else if (fle->object) {
flo = fle->object;
flo->ops->delete(flo);
fle->object = NULL;
}
nocache:
flo = NULL;
if (fle) {
flo = fle->object;
fle->object = NULL;
}
flo = resolver(net, key, family, dir, flo, ctx);
if (fle) {
fle->genid = atomic_read(&net->xfrm.flow_cache_genid);
if (!IS_ERR(flo))
fle->object = flo;
else
fle->genid--;
} else {
if (!IS_ERR_OR_NULL(flo))
flo->ops->delete(flo);
}
ret_object:
local_bh_enable();
return flo;
}
EXPORT_SYMBOL(flow_cache_lookup);
static void flow_cache_flush_tasklet(unsigned long data)
{
struct flow_flush_info *info = (void *)data;
struct flow_cache *fc = info->cache;
struct flow_cache_percpu *fcp;
struct flow_cache_entry *fle;
struct hlist_node *tmp;
LIST_HEAD(gc_list);
int i, deleted = 0;
struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm,
flow_cache_global);
fcp = this_cpu_ptr(fc->percpu);
for (i = 0; i < flow_cache_hash_size(fc); i++) {
hlist_for_each_entry_safe(fle, tmp,
&fcp->hash_table[i], u.hlist) {
if (flow_entry_valid(fle, xfrm))
continue;
deleted++;
hlist_del(&fle->u.hlist);
list_add_tail(&fle->u.gc_list, &gc_list);
}
}
flow_cache_queue_garbage(fcp, deleted, &gc_list, xfrm);
if (atomic_dec_and_test(&info->cpuleft))
complete(&info->completion);
}
/*
* Return whether a cpu needs flushing. Conservatively, we assume
* the presence of any entries means the core may require flushing,
* since the flow_cache_ops.check() function may assume it's running
* on the same core as the per-cpu cache component.
*/
static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp;
int i;
fcp = per_cpu_ptr(fc->percpu, cpu);
for (i = 0; i < flow_cache_hash_size(fc); i++)
if (!hlist_empty(&fcp->hash_table[i]))
return 0;
return 1;
}
static void flow_cache_flush_per_cpu(void *data)
{
struct flow_flush_info *info = data;
struct tasklet_struct *tasklet;
tasklet = &this_cpu_ptr(info->cache->percpu)->flush_tasklet;
tasklet->data = (unsigned long)info;
tasklet_schedule(tasklet);
}
void flow_cache_flush(struct net *net)
{
struct flow_flush_info info;
cpumask_var_t mask;
int i, self;
/* Track which cpus need flushing to avoid disturbing all cores. */
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return;
cpumask_clear(mask);
/* Don't want cpus going down or up during this. */
get_online_cpus();
mutex_lock(&net->xfrm.flow_flush_sem);
info.cache = &net->xfrm.flow_cache_global;
for_each_online_cpu(i)
if (!flow_cache_percpu_empty(info.cache, i))
cpumask_set_cpu(i, mask);
atomic_set(&info.cpuleft, cpumask_weight(mask));
if (atomic_read(&info.cpuleft) == 0)
goto done;
init_completion(&info.completion);
local_bh_disable();
self = cpumask_test_and_clear_cpu(smp_processor_id(), mask);
on_each_cpu_mask(mask, flow_cache_flush_per_cpu, &info, 0);
if (self)
flow_cache_flush_tasklet((unsigned long)&info);
local_bh_enable();
wait_for_completion(&info.completion);
done:
mutex_unlock(&net->xfrm.flow_flush_sem);
put_online_cpus();
free_cpumask_var(mask);
}
static void flow_cache_flush_task(struct work_struct *work)
{
struct netns_xfrm *xfrm = container_of(work, struct netns_xfrm,
flow_cache_flush_work);
struct net *net = container_of(xfrm, struct net, xfrm);
flow_cache_flush(net);
}
void flow_cache_flush_deferred(struct net *net)
{
schedule_work(&net->xfrm.flow_cache_flush_work);
}
static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc);
if (!fcp->hash_table) {
fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
if (!fcp->hash_table) {
pr_err("NET: failed to allocate flow cache sz %zu\n", sz);
return -ENOMEM;
}
fcp->hash_rnd_recalc = 1;
fcp->hash_count = 0;
tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0);
}
return 0;
}
static int flow_cache_cpu(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
struct flow_cache *fc = container_of(nfb, struct flow_cache,
hotcpu_notifier);
int res, cpu = (unsigned long) hcpu;
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
res = flow_cache_cpu_prepare(fc, cpu);
if (res)
return notifier_from_errno(res);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
__flow_cache_shrink(fc, fcp, 0);
break;
}
return NOTIFY_OK;
}
int flow_cache_init(struct net *net)
{
int i;
struct flow_cache *fc = &net->xfrm.flow_cache_global;
if (!flow_cachep)
flow_cachep = kmem_cache_create("flow_cache",
sizeof(struct flow_cache_entry),
0, SLAB_PANIC, NULL);
spin_lock_init(&net->xfrm.flow_cache_gc_lock);
INIT_LIST_HEAD(&net->xfrm.flow_cache_gc_list);
INIT_WORK(&net->xfrm.flow_cache_gc_work, flow_cache_gc_task);
INIT_WORK(&net->xfrm.flow_cache_flush_work, flow_cache_flush_task);
mutex_init(&net->xfrm.flow_flush_sem);
fc->hash_shift = 10;
fc->low_watermark = 2 * flow_cache_hash_size(fc);
fc->high_watermark = 4 * flow_cache_hash_size(fc);
fc->percpu = alloc_percpu(struct flow_cache_percpu);
if (!fc->percpu)
return -ENOMEM;
cpu_notifier_register_begin();
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
goto err;
}
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
};
__register_hotcpu_notifier(&fc->hotcpu_notifier);
cpu_notifier_register_done();
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
add_timer(&fc->rnd_timer);
return 0;
err:
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
kfree(fcp->hash_table);
fcp->hash_table = NULL;
}
cpu_notifier_register_done();
free_percpu(fc->percpu);
fc->percpu = NULL;
return -ENOMEM;
}
EXPORT_SYMBOL(flow_cache_init);
void flow_cache_fini(struct net *net)
{
int i;
struct flow_cache *fc = &net->xfrm.flow_cache_global;
del_timer_sync(&fc->rnd_timer);
unregister_hotcpu_notifier(&fc->hotcpu_notifier);
for_each_possible_cpu(i) {
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
kfree(fcp->hash_table);
fcp->hash_table = NULL;
}
free_percpu(fc->percpu);
fc->percpu = NULL;
}
EXPORT_SYMBOL(flow_cache_fini);
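/*
 * Illustrative sketch: flow_key_compare() walks the key word by word because
 * flow keys are sized in units of flow_compare_t and properly aligned,
 * assumptions memcmp() cannot make. The helper below is hypothetical and
 * restates the same loop over plain unsigned longs, returning non-zero when
 * the keys differ (matching the convention above).
 */
static int flow_key_differs_sketch(const unsigned long *k1,
				   const unsigned long *k2, size_t nwords)
{
	size_t i;

	for (i = 0; i < nwords; i++)
		if (k1[i] != k2[i])
			return 1;	/* differ */
	return 0;			/* equal */
}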

471
net/core/flow_dissector.c Normal file

@ -0,0 +1,471 @@
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>
#include <scsi/fc/fc_fcoe.h>
/* copy saddr & daddr, possibly using 64bit load/store
* Equivalent to : flow->src = iph->saddr;
* flow->dst = iph->daddr;
*/
static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
{
BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
offsetof(typeof(*flow), src) + sizeof(flow->src));
memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
}
/**
* __skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
* @data: raw buffer pointer to the packet, if NULL use skb->data
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
if (!data) {
data = skb->data;
hlen = skb_headlen(skb);
}
if (poff >= 0) {
__be32 *ports, _ports;
ports = __skb_header_pointer(skb, thoff + poff,
sizeof(_ports), data, hlen, &_ports);
if (ports)
return *ports;
}
return 0;
}
EXPORT_SYMBOL(__skb_flow_get_ports);
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
* @data: raw buffer pointer to the packet, if NULL use skb->data
* @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
*
* The function will try to retrieve the struct flow_keys from either the skbuff
* or a raw buffer specified by the remaining parameters
*/
bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
void *data, __be16 proto, int nhoff, int hlen)
{
u8 ip_proto;
if (!data) {
data = skb->data;
proto = skb->protocol;
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
}
memset(flow, 0, sizeof(*flow));
again:
switch (proto) {
case htons(ETH_P_IP): {
const struct iphdr *iph;
struct iphdr _iph;
ip:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph || iph->ihl < 5)
return false;
nhoff += iph->ihl * 4;
ip_proto = iph->protocol;
if (ip_is_fragment(iph))
ip_proto = 0;
/* skip the address processing if skb is NULL. The assumption
* here is that if there is no skb we are not looking for flow
* info but lengths and protocols.
*/
if (!skb)
break;
iph_to_flow_copy_addrs(flow, iph);
break;
}
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
__be32 flow_label;
ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
if (!iph)
return false;
ip_proto = iph->nexthdr;
nhoff += sizeof(struct ipv6hdr);
/* see comment above in IPv4 section */
if (!skb)
break;
flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);
flow_label = ip6_flowlabel(iph);
if (flow_label) {
/* Awesome, IPv6 packet has a flow label so we can
* use that to represent the ports without any
* further dissection.
*/
flow->n_proto = proto;
flow->ip_proto = ip_proto;
flow->ports = flow_label;
flow->thoff = (u16)nhoff;
return true;
}
break;
}
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
const struct vlan_hdr *vlan;
struct vlan_hdr _vlan;
vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
if (!vlan)
return false;
proto = vlan->h_vlan_encapsulated_proto;
nhoff += sizeof(*vlan);
goto again;
}
case htons(ETH_P_PPP_SES): {
struct {
struct pppoe_hdr hdr;
__be16 proto;
} *hdr, _hdr;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
proto = hdr->proto;
nhoff += PPPOE_SES_HLEN;
switch (proto) {
case htons(PPP_IP):
goto ip;
case htons(PPP_IPV6):
goto ipv6;
default:
return false;
}
}
case htons(ETH_P_FCOE):
flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
/* fall through */
default:
return false;
}
switch (ip_proto) {
case IPPROTO_GRE: {
struct gre_hdr {
__be16 flags;
__be16 proto;
} *hdr, _hdr;
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr)
return false;
/*
* Only look inside GRE if version zero and no
* routing
*/
if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
proto = hdr->proto;
nhoff += 4;
if (hdr->flags & GRE_CSUM)
nhoff += 4;
if (hdr->flags & GRE_KEY)
nhoff += 4;
if (hdr->flags & GRE_SEQ)
nhoff += 4;
if (proto == htons(ETH_P_TEB)) {
const struct ethhdr *eth;
struct ethhdr _eth;
eth = __skb_header_pointer(skb, nhoff,
sizeof(_eth),
data, hlen, &_eth);
if (!eth)
return false;
proto = eth->h_proto;
nhoff += sizeof(*eth);
}
goto again;
}
break;
}
case IPPROTO_IPIP:
proto = htons(ETH_P_IP);
goto ip;
case IPPROTO_IPV6:
proto = htons(ETH_P_IPV6);
goto ipv6;
default:
break;
}
flow->n_proto = proto;
flow->ip_proto = ip_proto;
flow->thoff = (u16) nhoff;
/* unless skb is set we don't need to record port info */
if (skb)
flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
data, hlen);
return true;
}
EXPORT_SYMBOL(__skb_flow_dissect);
static u32 hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c)
{
__flow_hash_secret_init();
return jhash_3words(a, b, c, hashrnd);
}
static inline u32 __flow_hash_from_keys(struct flow_keys *keys)
{
u32 hash;
/* get a consistent hash (same value on both flow directions) */
if (((__force u32)keys->dst < (__force u32)keys->src) ||
(((__force u32)keys->dst == (__force u32)keys->src) &&
((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
swap(keys->dst, keys->src);
swap(keys->port16[0], keys->port16[1]);
}
hash = __flow_hash_3words((__force u32)keys->dst,
(__force u32)keys->src,
(__force u32)keys->ports);
if (!hash)
hash = 1;
return hash;
}
u32 flow_hash_from_keys(struct flow_keys *keys)
{
return __flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(flow_hash_from_keys);
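/*
 * Illustrative sketch: __flow_hash_from_keys() makes the hash
 * direction-independent by ordering (src, dst) and the two ports before
 * mixing, so A->B and B->A land on the same value. The helper below is
 * hypothetical and shows only that canonicalization step.
 */
static void flow_canonicalize_sketch(u32 *src, u32 *dst,
				     u16 *sport, u16 *dport)
{
	if (*dst < *src || (*dst == *src && *dport < *sport)) {
		swap(*src, *dst);
		swap(*sport, *dport);
	}
}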
/*
* __skb_get_hash: calculate a flow hash based on src/dst addresses
* and src/dst port numbers. Sets hash in skb to non-zero hash value
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
void __skb_get_hash(struct sk_buff *skb)
{
struct flow_keys keys;
if (!skb_flow_dissect(skb, &keys))
return;
if (keys.ports)
skb->l4_hash = 1;
skb->sw_hash = 1;
skb->hash = __flow_hash_from_keys(&keys);
}
EXPORT_SYMBOL(__skb_get_hash);
/*
* Returns a Tx hash based on the given packet descriptor and the number
* of Tx queues to be used as a distribution range.
*/
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
unsigned int num_tx_queues)
{
u32 hash;
u16 qoffset = 0;
u16 qcount = num_tx_queues;
if (skb_rx_queue_recorded(skb)) {
hash = skb_get_rx_queue(skb);
while (unlikely(hash >= num_tx_queues))
hash -= num_tx_queues;
return hash;
}
if (dev->num_tc) {
u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
qoffset = dev->tc_to_txq[tc].offset;
qcount = dev->tc_to_txq[tc].count;
}
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);
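/*
 * Illustrative sketch: reciprocal_scale(), as used above, maps a 32-bit hash
 * into [0, qcount) without a division by taking the upper 32 bits of
 * hash * qcount; adding qoffset then selects a queue inside the traffic
 * class range. The helper below is hypothetical and restates that mapping.
 */
static u16 tx_hash_to_queue_sketch(u32 hash, u16 qoffset, u16 qcount)
{
	return (u16)(((u64)hash * qcount) >> 32) + qoffset;
}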
u32 __skb_get_poff(const struct sk_buff *skb, void *data,
const struct flow_keys *keys, int hlen)
{
u32 poff = keys->thoff;
switch (keys->ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
const u8 *doff;
u8 _doff;
doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
data, hlen, &_doff);
if (!doff)
return poff;
poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
break;
}
case IPPROTO_UDP:
case IPPROTO_UDPLITE:
poff += sizeof(struct udphdr);
break;
/* For the rest, we do not really care about header
* extensions at this point for now.
*/
case IPPROTO_ICMP:
poff += sizeof(struct icmphdr);
break;
case IPPROTO_ICMPV6:
poff += sizeof(struct icmp6hdr);
break;
case IPPROTO_IGMP:
poff += sizeof(struct igmphdr);
break;
case IPPROTO_DCCP:
poff += sizeof(struct dccp_hdr);
break;
case IPPROTO_SCTP:
poff += sizeof(struct sctphdr);
break;
}
return poff;
}
/* skb_get_poff() returns the offset to the payload as far as it could
* be dissected. The main user is currently BPF, so that we can dynamically
* truncate packets without needing to push actual payload to the user
* space and can analyze headers only, instead.
*/
u32 skb_get_poff(const struct sk_buff *skb)
{
struct flow_keys keys;
if (!skb_flow_dissect(skb, &keys))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
struct xps_dev_maps *dev_maps;
struct xps_map *map;
int queue_index = -1;
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
if (dev_maps) {
map = rcu_dereference(
dev_maps->cpu_map[raw_smp_processor_id()]);
if (map) {
if (map->len == 1)
queue_index = map->queues[0];
else
queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
map->len)];
if (unlikely(queue_index >= dev->real_num_tx_queues))
queue_index = -1;
}
}
rcu_read_unlock();
return queue_index;
#else
return -1;
#endif
}
static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
if (queue_index < 0 || skb->ooo_okay ||
queue_index >= dev->real_num_tx_queues) {
int new_index = get_xps_queue(dev, skb);
if (new_index < 0)
new_index = skb_tx_hash(dev, skb);
if (queue_index != new_index && sk &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
queue_index = new_index;
}
return queue_index;
}
struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
void *accel_priv)
{
int queue_index = 0;
if (dev->real_num_tx_queues != 1) {
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
__netdev_pick_tx);
else
queue_index = __netdev_pick_tx(dev, skb);
if (!accel_priv)
queue_index = netdev_cap_txqueue(dev, queue_index);
}
skb_set_queue_mapping(skb, queue_index);
return netdev_get_tx_queue(dev, queue_index);
}

328
net/core/gen_estimator.c Normal file
View file

@ -0,0 +1,328 @@
/*
* net/sched/gen_estimator.c Simple rate estimator.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
 * Jamal Hadi Salim - moved it to net/core and reshuffled
 * names to make it usable in the general net subsystem.
*/
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/gen_stats.h>
/*
This code is NOT intended to be used for statistics collection,
its purpose is to provide a base for statistical multiplexing
for controlled load service.
If you need only statistics, run a user level daemon which
periodically reads byte counters.
Unfortunately, rate estimation is not a very easy task.
F.e. I did not find a simple way to estimate the current peak rate
and even failed to formulate the problem 8)8)
So I preferred not to build an estimator into the scheduler,
but run this task separately.
Ideally, it should be kernel thread(s), but for now it runs
from timers, which puts apparent top bounds on the number of rated
flows, has minimal overhead when that number is small, and is enough
to handle controlled load service and sets of aggregates.
We measure rate over A=(1<<interval) seconds and evaluate EWMA:
avrate = avrate*(1-W) + rate*W
where W is chosen as negative power of 2: W = 2^(-ewma_log)
The resulting time constant is:
T = A/(-ln(1-W))
NOTES.
* avbps is scaled by 2^5, avpps is scaled by 2^10.
* both values are reported as 32 bit unsigned values. bps can
overflow for fast links : max speed being 34360Mbit/sec
* Minimal interval is HZ/4=250msec (it is the greatest common divisor
for HZ=100 and HZ=1024 8)), maximal interval
is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
are too expensive, longer ones can be implemented
at user level painlessly.
*/
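/*
 * Worked example of the EWMA above: with ewma_log = 2 the weight is
 * W = 2^-2 = 1/4, so on every interval the estimate moves a quarter of
 * the way towards the measured rate, exactly as est_timer() computes:
 *
 *	avrate += (rate >> ewma_log) - (avrate >> ewma_log);
 *
 * Starting from avrate = 0 with a steady measured rate R, successive
 * intervals give R/4, 7R/16, 37R/64, ... converging on R. For A = 1 sec
 * the time constant is T = A/(-ln(1 - 1/4)), roughly 3.5 seconds.
 */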
#define EST_MAX_INTERVAL 5
struct gen_estimator
{
struct list_head list;
struct gnet_stats_basic_packed *bstats;
struct gnet_stats_rate_est64 *rate_est;
spinlock_t *stats_lock;
int ewma_log;
u64 last_bytes;
u64 avbps;
u32 last_packets;
u32 avpps;
struct rcu_head e_rcu;
struct rb_node node;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct rcu_head head;
};
struct gen_estimator_head
{
struct timer_list timer;
struct list_head list;
};
static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
/* Protects against NULL dereference */
static DEFINE_RWLOCK(est_lock);
/* Protects against soft lockup during large deletion */
static struct rb_root est_root = RB_ROOT;
static DEFINE_SPINLOCK(est_tree_lock);
static void est_timer(unsigned long arg)
{
int idx = (int)arg;
struct gen_estimator *e;
rcu_read_lock();
list_for_each_entry_rcu(e, &elist[idx].list, list) {
struct gnet_stats_basic_packed b = {0};
u64 brate;
u32 rate;
spin_lock(e->stats_lock);
read_lock(&est_lock);
if (e->bstats == NULL)
goto skip;
__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
brate = (b.bytes - e->last_bytes)<<(7 - idx);
e->last_bytes = b.bytes;
e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
e->rate_est->bps = (e->avbps+0xF)>>5;
rate = (b.packets - e->last_packets)<<(12 - idx);
e->last_packets = b.packets;
e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
e->rate_est->pps = (e->avpps+0x1FF)>>10;
skip:
read_unlock(&est_lock);
spin_unlock(e->stats_lock);
}
if (!list_empty(&elist[idx].list))
mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
rcu_read_unlock();
}
static void gen_add_node(struct gen_estimator *est)
{
struct rb_node **p = &est_root.rb_node, *parent = NULL;
while (*p) {
struct gen_estimator *e;
parent = *p;
e = rb_entry(parent, struct gen_estimator, node);
if (est->bstats > e->bstats)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&est->node, parent, p);
rb_insert_color(&est->node, &est_root);
}
static
struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
const struct gnet_stats_rate_est64 *rate_est)
{
struct rb_node *p = est_root.rb_node;
while (p) {
struct gen_estimator *e;
e = rb_entry(p, struct gen_estimator, node);
if (bstats > e->bstats)
p = p->rb_right;
else if (bstats < e->bstats || rate_est != e->rate_est)
p = p->rb_left;
else
return e;
}
return NULL;
}
/**
* gen_new_estimator - create a new rate estimator
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
* as destination. A new timer with the interval specified in the
* configuration TLV is created. Upon each interval, the latest statistics
* will be read from &bstats and the estimated rate will be stored in
* &rate_est with the statistics lock grabbed during this period.
*
* Returns 0 on success or a negative error code.
*
*/
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock,
struct nlattr *opt)
{
struct gen_estimator *est;
struct gnet_estimator *parm = nla_data(opt);
struct gnet_stats_basic_packed b = {0};
int idx;
if (nla_len(opt) < sizeof(*parm))
return -EINVAL;
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
est = kzalloc(sizeof(*est), GFP_KERNEL);
if (est == NULL)
return -ENOBUFS;
__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
idx = parm->interval + 2;
est->bstats = bstats;
est->rate_est = rate_est;
est->stats_lock = stats_lock;
est->ewma_log = parm->ewma_log;
est->last_bytes = b.bytes;
est->avbps = rate_est->bps<<5;
est->last_packets = b.packets;
est->avpps = rate_est->pps<<10;
est->cpu_bstats = cpu_bstats;
spin_lock_bh(&est_tree_lock);
if (!elist[idx].timer.function) {
INIT_LIST_HEAD(&elist[idx].list);
setup_timer(&elist[idx].timer, est_timer, idx);
}
if (list_empty(&elist[idx].list))
mod_timer(&elist[idx].timer, jiffies + ((HZ/4) << idx));
list_add_rcu(&est->list, &elist[idx].list);
gen_add_node(est);
spin_unlock_bh(&est_tree_lock);
return 0;
}
EXPORT_SYMBOL(gen_new_estimator);
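/*
 * Illustrative call (the q->... names belong to a hypothetical caller,
 * not to this file): a queueing discipline holding its own byte/packet
 * counters and wanting a one-second measurement interval would pass
 * something like
 *
 *	err = gen_new_estimator(&q->bstats, NULL, &q->rate_est,
 *				&q->stats_lock, opt);
 *
 * where opt is a TLV carrying a struct gnet_estimator with interval = 0
 * and the desired ewma_log. Passing a per-cpu counter set instead of
 * NULL makes est_timer() fold the per-cpu counters on every interval.
 * gen_kill_estimator(&q->bstats, &q->rate_est) tears the estimator down.
 */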
/**
* gen_kill_estimator - remove a rate estimator
* @bstats: basic statistics
* @rate_est: rate estimator statistics
*
* Removes the rate estimator specified by &bstats and &rate_est.
*
* Note : Caller should respect an RCU grace period before freeing stats_lock
*/
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_rate_est64 *rate_est)
{
struct gen_estimator *e;
spin_lock_bh(&est_tree_lock);
while ((e = gen_find_node(bstats, rate_est))) {
rb_erase(&e->node, &est_root);
write_lock(&est_lock);
e->bstats = NULL;
write_unlock(&est_lock);
list_del_rcu(&e->list);
kfree_rcu(e, e_rcu);
}
spin_unlock_bh(&est_tree_lock);
}
EXPORT_SYMBOL(gen_kill_estimator);
/**
* gen_replace_estimator - replace rate estimator configuration
 * @bstats: basic statistics
 * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
* gen_kill_estimator() and gen_new_estimator().
*
* Returns 0 on success or a negative error code.
*/
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
spinlock_t *stats_lock, struct nlattr *opt)
{
gen_kill_estimator(bstats, rate_est);
return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
}
EXPORT_SYMBOL(gen_replace_estimator);
/**
* gen_estimator_active - test if estimator is currently in use
* @bstats: basic statistics
* @rate_est: rate estimator statistics
*
* Returns true if estimator is active, and false if not.
*/
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
const struct gnet_stats_rate_est64 *rate_est)
{
bool res;
ASSERT_RTNL();
spin_lock_bh(&est_tree_lock);
res = gen_find_node(bstats, rate_est) != NULL;
spin_unlock_bh(&est_tree_lock);
return res;
}
EXPORT_SYMBOL(gen_estimator_active);

364
net/core/gen_stats.c Normal file
View file

@ -0,0 +1,364 @@
/*
* net/core/gen_stats.c
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* See Documentation/networking/gen_stats.txt
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/gen_stats.h>
#include <net/netlink.h>
#include <net/gen_stats.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size)
{
if (nla_put(d->skb, type, size, buf))
goto nla_put_failure;
return 0;
nla_put_failure:
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return -1;
}
/**
* gnet_stats_start_copy_compat - start dumping procedure in compatibility mode
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV
* @xstats_type: TLV type for backward compatibility xstats TLV
* @lock: statistics lock
* @d: dumping handle
*
* Initializes the dumping handle, grabs the statistic lock and appends
 * an empty TLV header to the socket buffer for use as a container for all
 * other statistic TLVs.
*
* The dumping handle is marked to be in backward compatibility mode telling
* all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats.
*
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
int xstats_type, spinlock_t *lock, struct gnet_dump *d)
__acquires(lock)
{
memset(d, 0, sizeof(*d));
spin_lock_bh(lock);
d->lock = lock;
if (type)
d->tail = (struct nlattr *)skb_tail_pointer(skb);
d->skb = skb;
d->compat_tc_stats = tc_stats_type;
d->compat_xstats = xstats_type;
if (d->tail)
return gnet_stats_copy(d, type, NULL, 0);
return 0;
}
EXPORT_SYMBOL(gnet_stats_start_copy_compat);
/**
 * gnet_stats_start_copy - start dumping procedure
* @skb: socket buffer to put statistics TLVs into
* @type: TLV type for top level statistic TLV
* @lock: statistics lock
* @d: dumping handle
*
* Initializes the dumping handle, grabs the statistic lock and appends
 * an empty TLV header to the socket buffer for use as a container for all
 * other statistic TLVs.
*
* Returns 0 on success or -1 if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
struct gnet_dump *d)
{
return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d);
}
EXPORT_SYMBOL(gnet_stats_start_copy);
static void
__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu)
{
int i;
for_each_possible_cpu(i) {
struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
unsigned int start;
u64 bytes;
u32 packets;
do {
start = u64_stats_fetch_begin_irq(&bcpu->syncp);
bytes = bcpu->bstats.bytes;
packets = bcpu->bstats.packets;
} while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
bstats->bytes += bytes;
bstats->packets += packets;
}
}
void
__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
if (cpu) {
__gnet_stats_copy_basic_cpu(bstats, cpu);
} else {
bstats->bytes = b->bytes;
bstats->packets = b->packets;
}
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
 * @cpu: per cpu statistics
 * @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_basic(struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
{
struct gnet_stats_basic_packed bstats = {0};
__gnet_stats_copy_basic(&bstats, cpu, b);
if (d->compat_tc_stats) {
d->tc_stats.bytes = bstats.bytes;
d->tc_stats.packets = bstats.packets;
}
if (d->tail) {
struct gnet_stats_basic sb;
memset(&sb, 0, sizeof(sb));
sb.bytes = bstats.bytes;
sb.packets = bstats.packets;
return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb));
}
return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
* @b: basic statistics
* @r: rate estimator statistics
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
const struct gnet_stats_basic_packed *b,
struct gnet_stats_rate_est64 *r)
{
struct gnet_stats_rate_est est;
int res;
if (b && !gen_estimator_active(b, r))
return 0;
est.bps = min_t(u64, UINT_MAX, r->bps);
/* we have some time before reaching 2^32 packets per second */
est.pps = r->pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
d->tc_stats.pps = est.pps;
}
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est));
if (res < 0 || est.bps == r->bps)
return res;
/* emit 64bit stats only if needed */
return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r));
}
return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
static void
__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
const struct gnet_stats_queue __percpu *q)
{
int i;
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
qstats->qlen = 0;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
qstats->overlimits += qcpu->overlimits;
}
}
static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
const struct gnet_stats_queue __percpu *cpu,
const struct gnet_stats_queue *q,
__u32 qlen)
{
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
} else {
qstats->qlen = q->qlen;
qstats->backlog = q->backlog;
qstats->drops = q->drops;
qstats->requeues = q->requeues;
qstats->overlimits = q->overlimits;
}
qstats->qlen = qlen;
}
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
* @d: dumping handle
* @cpu_q: per cpu queue statistics
* @q: queue statistics
* @qlen: queue length statistics
*
* Appends the queue statistics to the top level TLV created by
 * gnet_stats_start_copy(), using per cpu queue statistics if
 * they are available.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_queue(struct gnet_dump *d,
struct gnet_stats_queue __percpu *cpu_q,
struct gnet_stats_queue *q, __u32 qlen)
{
struct gnet_stats_queue qstats = {0};
__gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
if (d->compat_tc_stats) {
d->tc_stats.drops = qstats.drops;
d->tc_stats.qlen = qstats.qlen;
d->tc_stats.backlog = qstats.backlog;
d->tc_stats.overlimits = qstats.overlimits;
}
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_QUEUE,
&qstats, sizeof(qstats));
return 0;
}
EXPORT_SYMBOL(gnet_stats_copy_queue);
/**
* gnet_stats_copy_app - copy application specific statistics into statistics TLV
* @d: dumping handle
* @st: application specific statistics data
* @len: length of data
*
* Appends the application specific statistics to the top level TLV created by
* gnet_stats_start_copy() and remembers the data for XSTATS if the dumping
* handle is in backward compatibility mode.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
{
if (d->compat_xstats) {
d->xstats = kmemdup(st, len, GFP_ATOMIC);
if (!d->xstats)
goto err_out;
d->xstats_len = len;
}
if (d->tail)
return gnet_stats_copy(d, TCA_STATS_APP, st, len);
return 0;
err_out:
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return -1;
}
EXPORT_SYMBOL(gnet_stats_copy_app);
/**
* gnet_stats_finish_copy - finish dumping procedure
* @d: dumping handle
*
* Corrects the length of the top level TLV to include all TLVs added
* by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs
* if gnet_stats_start_copy_compat() was used and releases the statistics
* lock.
*
* Returns 0 on success or -1 with the statistic lock released
* if the room in the socket buffer was not sufficient.
*/
int
gnet_stats_finish_copy(struct gnet_dump *d)
{
if (d->tail)
d->tail->nla_len = skb_tail_pointer(d->skb) - (u8 *)d->tail;
if (d->compat_tc_stats)
if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats,
sizeof(d->tc_stats)) < 0)
return -1;
if (d->compat_xstats && d->xstats) {
if (gnet_stats_copy(d, d->compat_xstats, d->xstats,
d->xstats_len) < 0)
return -1;
}
kfree(d->xstats);
d->xstats = NULL;
d->xstats_len = 0;
spin_unlock_bh(d->lock);
return 0;
}
EXPORT_SYMBOL(gnet_stats_finish_copy);
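/*
 * Typical dump sequence (sketch; the q->... fields and the TCA_STATS2
 * attribute type are the caller's choice, error paths elided): the
 * helpers above are meant to be chained as
 *
 *	struct gnet_dump d;
 *
 *	if (gnet_stats_start_copy(skb, TCA_STATS2, &q->lock, &d) < 0)
 *		return -1;
 *	if (gnet_stats_copy_basic(&d, q->cpu_bstats, &q->bstats) < 0 ||
 *	    gnet_stats_copy_queue(&d, q->cpu_qstats, &q->qstats, qlen) < 0)
 *		return -1;
 *	return gnet_stats_finish_copy(&d);
 *
 * Any -1 return means the helper has already released the statistics lock.
 */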

184
net/core/iovec.c Normal file
View file

@ -0,0 +1,184 @@
/*
* iovec manipulation routines.
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes:
* Andrew Lunn : Errors in iovec copying.
* Pedro Roque : Added memcpy_fromiovecend and
* csum_..._fromiovecend.
* Andi Kleen : fixed error handling for 2.1
* Alexey Kuznetsov: 2.1 optimisations
* Andi Kleen : Fix csum*fromiovecend for IPv6.
*/
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <asm/uaccess.h>
#include <asm/byteorder.h>
#include <net/checksum.h>
#include <net/sock.h>
/*
* Verify iovec. The caller must ensure that the iovec is big enough
* to hold the message iovec.
*
* Save time not doing access_ok. copy_*_user will make this work
* in any case.
*/
int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode)
{
int size, ct, err;
if (m->msg_name && m->msg_namelen) {
if (mode == VERIFY_READ) {
void __user *namep;
namep = (void __user __force *) m->msg_name;
err = move_addr_to_kernel(namep, m->msg_namelen,
address);
if (err < 0)
return err;
}
m->msg_name = address;
} else {
m->msg_name = NULL;
m->msg_namelen = 0;
}
size = m->msg_iovlen * sizeof(struct iovec);
if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
return -EFAULT;
m->msg_iov = iov;
err = 0;
for (ct = 0; ct < m->msg_iovlen; ct++) {
size_t len = iov[ct].iov_len;
if (len > INT_MAX - err) {
len = INT_MAX - err;
iov[ct].iov_len = len;
}
err += len;
}
return err;
}
/*
* And now for the all-in-one: copy and checksum from a user iovec
* directly to a datagram
 * All calls to csum_partial but the last must be in 32 bit chunks
*
* ip_build_xmit must ensure that when fragmenting only the last
* call to this function will be unaligned also.
*/
int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov,
int offset, unsigned int len, __wsum *csump)
{
__wsum csum = *csump;
int partial_cnt = 0, err = 0;
/* Skip over the finished iovecs */
while (offset >= iov->iov_len) {
offset -= iov->iov_len;
iov++;
}
while (len > 0) {
u8 __user *base = iov->iov_base + offset;
int copy = min_t(unsigned int, len, iov->iov_len - offset);
offset = 0;
/* There is a remnant from previous iov. */
if (partial_cnt) {
int par_len = 4 - partial_cnt;
/* iov component is too short ... */
if (par_len > copy) {
if (copy_from_user(kdata, base, copy))
goto out_fault;
kdata += copy;
base += copy;
partial_cnt += copy;
len -= copy;
iov++;
if (len)
continue;
*csump = csum_partial(kdata - partial_cnt,
partial_cnt, csum);
goto out;
}
if (copy_from_user(kdata, base, par_len))
goto out_fault;
csum = csum_partial(kdata - partial_cnt, 4, csum);
kdata += par_len;
base += par_len;
copy -= par_len;
len -= par_len;
partial_cnt = 0;
}
if (len > copy) {
partial_cnt = copy % 4;
if (partial_cnt) {
copy -= partial_cnt;
if (copy_from_user(kdata + copy, base + copy,
partial_cnt))
goto out_fault;
}
}
if (copy) {
csum = csum_and_copy_from_user(base, kdata, copy,
csum, &err);
if (err)
goto out;
}
len -= copy + partial_cnt;
kdata += copy + partial_cnt;
iov++;
}
*csump = csum;
out:
return err;
out_fault:
err = -EFAULT;
goto out;
}
EXPORT_SYMBOL(csum_partial_copy_fromiovecend);
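/*
 * Example of the remnant handling above: copying 16 bytes from two
 * iovecs of 6 and 10 bytes leaves partial_cnt = 2 after the first iovec
 * (6 % 4); the second iteration first copies par_len = 2 bytes to
 * complete the 4-byte group, checksums that group with csum_partial(),
 * and only then checksums the remaining 8 bytes in a single aligned
 * call to csum_and_copy_from_user().
 */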
unsigned long iov_pages(const struct iovec *iov, int offset,
unsigned long nr_segs)
{
unsigned long seg, base;
int pages = 0, len, size;
while (nr_segs && (offset >= iov->iov_len)) {
offset -= iov->iov_len;
++iov;
--nr_segs;
}
for (seg = 0; seg < nr_segs; seg++) {
base = (unsigned long)iov[seg].iov_base + offset;
len = iov[seg].iov_len - offset;
size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
pages += size;
offset = 0;
}
return pages;
}
EXPORT_SYMBOL(iov_pages);
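/*
 * Example: with 4 KiB pages, a single iovec of length 5000 whose base
 * starts 100 bytes into a page covers bytes 100..5099 of that mapping,
 * i.e. ((100 + 5000 + 4095) >> 12) = 2 pages, which is what the
 * computation above returns.
 */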

253
net/core/link_watch.c Normal file
View file

@ -0,0 +1,253 @@
/*
* Linux network device link state notification
*
* Author:
* Stefan Rompf <sux@loplof.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <linux/rtnetlink.h>
#include <linux/jiffies.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <asm/types.h>
enum lw_bits {
LW_URGENT = 0,
};
static unsigned long linkwatch_flags;
static unsigned long linkwatch_nextevent;
static void linkwatch_event(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
static unsigned char default_operstate(const struct net_device *dev)
{
if (!netif_carrier_ok(dev))
return (dev->ifindex != dev->iflink ?
IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
if (netif_dormant(dev))
return IF_OPER_DORMANT;
return IF_OPER_UP;
}
static void rfc2863_policy(struct net_device *dev)
{
unsigned char operstate = default_operstate(dev);
if (operstate == dev->operstate)
return;
write_lock_bh(&dev_base_lock);
switch(dev->link_mode) {
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
case IF_LINK_MODE_DEFAULT:
default:
break;
}
dev->operstate = operstate;
write_unlock_bh(&dev_base_lock);
}
void linkwatch_init_dev(struct net_device *dev)
{
/* Handle pre-registration link state changes */
if (!netif_carrier_ok(dev) || netif_dormant(dev))
rfc2863_policy(dev);
}
static bool linkwatch_urgent_event(struct net_device *dev)
{
if (!netif_running(dev))
return false;
if (dev->ifindex != dev->iflink)
return true;
if (dev->priv_flags & IFF_TEAM_PORT)
return true;
return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
static void linkwatch_add_event(struct net_device *dev)
{
unsigned long flags;
spin_lock_irqsave(&lweventlist_lock, flags);
if (list_empty(&dev->link_watch_list)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
dev_hold(dev);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
}
static void linkwatch_schedule_work(int urgent)
{
unsigned long delay = linkwatch_nextevent - jiffies;
if (test_bit(LW_URGENT, &linkwatch_flags))
return;
/* Minimise down-time: drop delay for up event. */
if (urgent) {
if (test_and_set_bit(LW_URGENT, &linkwatch_flags))
return;
delay = 0;
}
/* If we wrap around we'll delay it by at most HZ. */
if (delay > HZ)
delay = 0;
/*
* If urgent, schedule immediate execution; otherwise, don't
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
mod_delayed_work(system_wq, &linkwatch_work, 0);
else
schedule_delayed_work(&linkwatch_work, delay);
}
static void linkwatch_do_dev(struct net_device *dev)
{
/*
* Make sure the above read is complete since it can be
* rewritten as soon as we clear the bit below.
*/
smp_mb__before_atomic();
/* We are about to handle this device,
* so new events can be accepted
*/
clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state);
rfc2863_policy(dev);
if (dev->flags & IFF_UP) {
if (netif_carrier_ok(dev))
dev_activate(dev);
else
dev_deactivate(dev);
netdev_state_change(dev);
}
dev_put(dev);
}
static void __linkwatch_run_queue(int urgent_only)
{
struct net_device *dev;
LIST_HEAD(wrk);
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
* cause a storm of messages on the netlink
* socket. This limit does not apply to up events
* while the device qdisc is down.
*/
if (!urgent_only)
linkwatch_nextevent = jiffies + HZ;
/* Limit wrap-around effect on delay. */
else if (time_after(linkwatch_nextevent, jiffies + HZ))
linkwatch_nextevent = jiffies;
clear_bit(LW_URGENT, &linkwatch_flags);
spin_lock_irq(&lweventlist_lock);
list_splice_init(&lweventlist, &wrk);
while (!list_empty(&wrk)) {
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
if (urgent_only && !linkwatch_urgent_event(dev)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
spin_unlock_irq(&lweventlist_lock);
linkwatch_do_dev(dev);
spin_lock_irq(&lweventlist_lock);
}
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
}
void linkwatch_forget_dev(struct net_device *dev)
{
unsigned long flags;
int clean = 0;
spin_lock_irqsave(&lweventlist_lock, flags);
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
clean = 1;
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
if (clean)
linkwatch_do_dev(dev);
}
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
{
__linkwatch_run_queue(0);
}
static void linkwatch_event(struct work_struct *dummy)
{
rtnl_lock();
__linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies));
rtnl_unlock();
}
void linkwatch_fire_event(struct net_device *dev)
{
bool urgent = linkwatch_urgent_event(dev);
if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
linkwatch_add_event(dev);
} else if (!urgent)
return;
linkwatch_schedule_work(urgent);
}
EXPORT_SYMBOL(linkwatch_fire_event);

3143
net/core/neighbour.c Normal file

File diff suppressed because it is too large Load diff

423
net/core/net-procfs.c Normal file
View file

@ -0,0 +1,423 @@
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/wext.h>
#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
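/*
 * The seq_file position encodes a (bucket, offset) pair: the high bits
 * select one of the NETDEV_HASHENTRIES name-hash buckets, the low
 * BUCKET_SPACE bits give the 1-based position inside that bucket.
 * E.g. set_bucket_offset(3, 2) == (3 << BUCKET_SPACE) | 2 names the
 * second device hashed into bucket 3.
 */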
extern struct list_head ptype_all __read_mostly;
extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
{
struct net *net = seq_file_net(seq);
struct net_device *dev;
struct hlist_head *h;
unsigned int count = 0, offset = get_offset(*pos);
h = &net->dev_name_head[get_bucket(*pos)];
hlist_for_each_entry_rcu(dev, h, name_hlist) {
if (++count == offset)
return dev;
}
return NULL;
}
static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
{
struct net_device *dev;
unsigned int bucket;
do {
dev = dev_from_same_bucket(seq, pos);
if (dev)
return dev;
bucket = get_bucket(*pos) + 1;
*pos = set_bucket_offset(bucket, 1);
} while (bucket < NETDEV_HASHENTRIES);
return NULL;
}
/*
* This is invoked by the /proc filesystem handler to display a device
* in detail.
*/
static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
if (!*pos)
return SEQ_START_TOKEN;
if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
return NULL;
return dev_from_bucket(seq, pos);
}
static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return dev_from_bucket(seq, pos);
}
static void dev_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
{
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
"%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
dev->name, stats->rx_bytes, stats->rx_packets,
stats->rx_errors,
stats->rx_dropped + stats->rx_missed_errors,
stats->rx_fifo_errors,
stats->rx_length_errors + stats->rx_over_errors +
stats->rx_crc_errors + stats->rx_frame_errors,
stats->rx_compressed, stats->multicast,
stats->tx_bytes, stats->tx_packets,
stats->tx_errors, stats->tx_dropped,
stats->tx_fifo_errors, stats->collisions,
stats->tx_carrier_errors +
stats->tx_aborted_errors +
stats->tx_window_errors +
stats->tx_heartbeat_errors,
stats->tx_compressed);
}
/*
* Called from the PROCfs module. This now uses the new arbitrary sized
* /proc/net interface to create /proc/net/dev
*/
static int dev_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Inter-| Receive "
" | Transmit\n"
" face |bytes packets errs drop fifo frame "
"compressed multicast|bytes packets errs "
"drop fifo colls carrier compressed\n");
else
dev_seq_printf_stats(seq, v);
return 0;
}
static struct softnet_data *softnet_get_online(loff_t *pos)
{
struct softnet_data *sd = NULL;
while (*pos < nr_cpu_ids)
if (cpu_online(*pos)) {
sd = &per_cpu(softnet_data, *pos);
break;
} else
++*pos;
return sd;
}
static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
{
return softnet_get_online(pos);
}
static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return softnet_get_online(pos);
}
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct softnet_data *sd = v;
unsigned int flow_limit_count = 0;
#ifdef CONFIG_NET_FLOW_LIMIT
struct sd_flow_limit *fl;
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl)
flow_limit_count = fl->count;
rcu_read_unlock();
#endif
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
sd->processed, sd->dropped, sd->time_squeeze, 0,
0, 0, 0, 0, /* was fastroute */
sd->cpu_collision, sd->received_rps, flow_limit_count);
return 0;
}
static const struct seq_operations dev_seq_ops = {
.start = dev_seq_start,
.next = dev_seq_next,
.stop = dev_seq_stop,
.show = dev_seq_show,
};
static int dev_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &dev_seq_ops,
sizeof(struct seq_net_private));
}
static const struct file_operations dev_seq_fops = {
.owner = THIS_MODULE,
.open = dev_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
static const struct seq_operations softnet_seq_ops = {
.start = softnet_seq_start,
.next = softnet_seq_next,
.stop = softnet_seq_stop,
.show = softnet_seq_show,
};
static int softnet_seq_open(struct inode *inode, struct file *file)
{
return seq_open(file, &softnet_seq_ops);
}
static const struct file_operations softnet_seq_fops = {
.owner = THIS_MODULE,
.open = softnet_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static void *ptype_get_idx(loff_t pos)
{
struct packet_type *pt = NULL;
loff_t i = 0;
int t;
list_for_each_entry_rcu(pt, &ptype_all, list) {
if (i == pos)
return pt;
++i;
}
for (t = 0; t < PTYPE_HASH_SIZE; t++) {
list_for_each_entry_rcu(pt, &ptype_base[t], list) {
if (i == pos)
return pt;
++i;
}
}
return NULL;
}
static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
}
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct packet_type *pt;
struct list_head *nxt;
int hash;
++*pos;
if (v == SEQ_START_TOKEN)
return ptype_get_idx(0);
pt = v;
nxt = pt->list.next;
if (pt->type == htons(ETH_P_ALL)) {
if (nxt != &ptype_all)
goto found;
hash = 0;
nxt = ptype_base[0].next;
} else
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
while (nxt == &ptype_base[hash]) {
if (++hash >= PTYPE_HASH_SIZE)
return NULL;
nxt = ptype_base[hash].next;
}
found:
return list_entry(nxt, struct packet_type, list);
}
static void ptype_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
rcu_read_unlock();
}
static int ptype_seq_show(struct seq_file *seq, void *v)
{
struct packet_type *pt = v;
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Type Device Function\n");
else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
if (pt->type == htons(ETH_P_ALL))
seq_puts(seq, "ALL ");
else
seq_printf(seq, "%04x", ntohs(pt->type));
seq_printf(seq, " %-8s %pf\n",
pt->dev ? pt->dev->name : "", pt->func);
}
return 0;
}
static const struct seq_operations ptype_seq_ops = {
.start = ptype_seq_start,
.next = ptype_seq_next,
.stop = ptype_seq_stop,
.show = ptype_seq_show,
};
static int ptype_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &ptype_seq_ops,
sizeof(struct seq_net_private));
}
static const struct file_operations ptype_seq_fops = {
.owner = THIS_MODULE,
.open = ptype_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
static int __net_init dev_proc_net_init(struct net *net)
{
int rc = -ENOMEM;
if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
goto out;
if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
&softnet_seq_fops))
goto out_dev;
if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
goto out_softnet;
if (wext_proc_init(net))
goto out_ptype;
rc = 0;
out:
return rc;
out_ptype:
remove_proc_entry("ptype", net->proc_net);
out_softnet:
remove_proc_entry("softnet_stat", net->proc_net);
out_dev:
remove_proc_entry("dev", net->proc_net);
goto out;
}
static void __net_exit dev_proc_net_exit(struct net *net)
{
wext_proc_exit(net);
remove_proc_entry("ptype", net->proc_net);
remove_proc_entry("softnet_stat", net->proc_net);
remove_proc_entry("dev", net->proc_net);
}
static struct pernet_operations __net_initdata dev_proc_ops = {
.init = dev_proc_net_init,
.exit = dev_proc_net_exit,
};
static int dev_mc_seq_show(struct seq_file *seq, void *v)
{
struct netdev_hw_addr *ha;
struct net_device *dev = v;
if (v == SEQ_START_TOKEN)
return 0;
netif_addr_lock_bh(dev);
netdev_for_each_mc_addr(ha, dev) {
int i;
seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
dev->name, ha->refcount, ha->global_use);
for (i = 0; i < dev->addr_len; i++)
seq_printf(seq, "%02x", ha->addr[i]);
seq_putc(seq, '\n');
}
netif_addr_unlock_bh(dev);
return 0;
}
static const struct seq_operations dev_mc_seq_ops = {
.start = dev_seq_start,
.next = dev_seq_next,
.stop = dev_seq_stop,
.show = dev_mc_seq_show,
};
static int dev_mc_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &dev_mc_seq_ops,
sizeof(struct seq_net_private));
}
static const struct file_operations dev_mc_seq_fops = {
.owner = THIS_MODULE,
.open = dev_mc_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
static int __net_init dev_mc_net_init(struct net *net)
{
if (!proc_create("dev_mcast", 0, net->proc_net, &dev_mc_seq_fops))
return -ENOMEM;
return 0;
}
static void __net_exit dev_mc_net_exit(struct net *net)
{
remove_proc_entry("dev_mcast", net->proc_net);
}
static struct pernet_operations __net_initdata dev_mc_net_ops = {
.init = dev_mc_net_init,
.exit = dev_mc_net_exit,
};
int __init dev_proc_init(void)
{
int ret = register_pernet_subsys(&dev_proc_ops);
if (!ret)
return register_pernet_subsys(&dev_mc_net_ops);
return ret;
}

1429
net/core/net-sysfs.c Normal file

File diff suppressed because it is too large Load diff

11
net/core/net-sysfs.h Normal file
View file

@ -0,0 +1,11 @@
#ifndef __NET_SYSFS_H__
#define __NET_SYSFS_H__
int __init netdev_kobject_init(void);
int netdev_register_kobject(struct net_device *);
void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
int old_num, int new_num);
#endif

37
net/core/net-traces.c Normal file
View file

@ -0,0 +1,37 @@
/*
* consolidates trace point definitions
*
* Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
*/
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
#include <linux/slab.h>
#include <asm/unaligned.h>
#include <asm/bitops.h>
#define CREATE_TRACE_POINTS
#include <trace/events/skb.h>
#include <trace/events/net.h>
#include <trace/events/napi.h>
#include <trace/events/sock.h>
#include <trace/events/udp.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);

678
net/core/net_namespace.c Normal file
View file

@ -0,0 +1,678 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
/*
* Our network namespace constructor/destructor lists
*/
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
DEFINE_MUTEX(net_mutex);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
struct net init_net = {
.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
};
EXPORT_SYMBOL(init_net);
#define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
ng->len = max_gen_ptrs;
return ng;
}
static int net_assign_generic(struct net *net, int id, void *data)
{
struct net_generic *ng, *old_ng;
BUG_ON(!mutex_is_locked(&net_mutex));
BUG_ON(id == 0);
old_ng = rcu_dereference_protected(net->gen,
lockdep_is_held(&net_mutex));
ng = old_ng;
if (old_ng->len >= id)
goto assign;
ng = net_alloc_generic();
if (ng == NULL)
return -ENOMEM;
/*
* Some synchronisation notes:
*
 * The net_generic code explores the net->gen array inside an rcu
 * read section. Besides, once set, the net->gen->ptr[x]
 * pointer never changes (see rules in netns/generic.h).
*
* That said, we simply duplicate this array and schedule
* the old copy for kfree after a grace period.
*/
memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*));
rcu_assign_pointer(net->gen, ng);
kfree_rcu(old_ng, rcu);
assign:
ng->ptr[id - 1] = data;
return 0;
}
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
int err = -ENOMEM;
void *data = NULL;
if (ops->id && ops->size) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
err = net_assign_generic(net, *ops->id, data);
if (err)
goto cleanup;
}
err = 0;
if (ops->init)
err = ops->init(net);
if (!err)
return 0;
cleanup:
kfree(data);
out:
return err;
}
static void ops_free(const struct pernet_operations *ops, struct net *net)
{
if (ops->id && ops->size) {
int id = *ops->id;
kfree(net_generic(net, id));
}
}
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
struct net *net;
if (ops->exit) {
list_for_each_entry(net, net_exit_list, exit_list)
ops->exit(net);
}
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
static void ops_free_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
struct net *net;
if (ops->size && ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
ops_free(ops, net);
}
}
/*
* setup_net runs the initializers for the network namespace object.
*/
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
/* Must be called with net_mutex held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
atomic_set(&net->count, 1);
atomic_set(&net->passive, 1);
net->dev_base_seq = 1;
net->user_ns = user_ns;
#ifdef NETNS_REFCNT_DEBUG
atomic_set(&net->use_count, 0);
#endif
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
if (error < 0)
goto out_undo;
}
out:
return error;
out_undo:
/* Walk through the list backwards calling the exit functions
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
ops = saved_ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
rcu_barrier();
goto out;
}
#ifdef CONFIG_NET_NS
static struct kmem_cache *net_cachep;
static struct workqueue_struct *netns_wq;
static struct net *net_alloc(void)
{
struct net *net = NULL;
struct net_generic *ng;
ng = net_alloc_generic();
if (!ng)
goto out;
net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
if (!net)
goto out_free;
rcu_assign_pointer(net->gen, ng);
out:
return net;
out_free:
kfree(ng);
goto out;
}
static void net_free(struct net *net)
{
#ifdef NETNS_REFCNT_DEBUG
if (unlikely(atomic_read(&net->use_count) != 0)) {
pr_emerg("network namespace not free! Usage: %d\n",
atomic_read(&net->use_count));
return;
}
#endif
kfree(rcu_access_pointer(net->gen));
kmem_cache_free(net_cachep, net);
}
void net_drop_ns(void *p)
{
struct net *ns = p;
if (ns && atomic_dec_and_test(&ns->passive))
net_free(ns);
}
struct net *copy_net_ns(unsigned long flags,
struct user_namespace *user_ns, struct net *old_net)
{
struct net *net;
int rv;
if (!(flags & CLONE_NEWNET))
return get_net(old_net);
net = net_alloc();
if (!net)
return ERR_PTR(-ENOMEM);
get_user_ns(user_ns);
mutex_lock(&net_mutex);
rv = setup_net(net, user_ns);
if (rv == 0) {
rtnl_lock();
list_add_tail_rcu(&net->list, &net_namespace_list);
rtnl_unlock();
}
mutex_unlock(&net_mutex);
if (rv < 0) {
put_user_ns(user_ns);
net_drop_ns(net);
return ERR_PTR(rv);
}
return net;
}
static DEFINE_SPINLOCK(cleanup_list_lock);
static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */
static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
struct net *net, *tmp;
struct list_head net_kill_list;
LIST_HEAD(net_exit_list);
/* Atomically snapshot the list of namespaces to cleanup */
spin_lock_irq(&cleanup_list_lock);
list_replace_init(&cleanup_list, &net_kill_list);
spin_unlock_irq(&cleanup_list_lock);
mutex_lock(&net_mutex);
/* Don't let anyone else find us. */
rtnl_lock();
list_for_each_entry(net, &net_kill_list, cleanup_list) {
list_del_rcu(&net->list);
list_add_tail(&net->exit_list, &net_exit_list);
}
rtnl_unlock();
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
*/
synchronize_rcu();
/* Run all of the network namespace exit methods */
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
/* Free the net generic variables */
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
mutex_unlock(&net_mutex);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
*/
rcu_barrier();
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
put_user_ns(net->user_ns);
net_drop_ns(net);
}
}
static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
/* Cleanup the network namespace in process context */
unsigned long flags;
spin_lock_irqsave(&cleanup_list_lock, flags);
list_add(&net->cleanup_list, &cleanup_list);
spin_unlock_irqrestore(&cleanup_list_lock, flags);
queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
struct net *get_net_ns_by_fd(int fd)
{
struct proc_ns *ei;
struct file *file;
struct net *net;
file = proc_ns_fget(fd);
if (IS_ERR(file))
return ERR_CAST(file);
ei = get_proc_ns(file_inode(file));
if (ei->ns_ops == &netns_operations)
net = get_net(ei->ns);
else
net = ERR_PTR(-EINVAL);
fput(file);
return net;
}
#else
struct net *get_net_ns_by_fd(int fd)
{
return ERR_PTR(-EINVAL);
}
#endif
struct net *get_net_ns_by_pid(pid_t pid)
{
struct task_struct *tsk;
struct net *net;
/* Lookup the network namespace */
net = ERR_PTR(-ESRCH);
rcu_read_lock();
tsk = find_task_by_vpid(pid);
if (tsk) {
struct nsproxy *nsproxy;
task_lock(tsk);
nsproxy = tsk->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
task_unlock(tsk);
}
rcu_read_unlock();
return net;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
static __net_init int net_ns_net_init(struct net *net)
{
return proc_alloc_inum(&net->proc_inum);
}
static __net_exit void net_ns_net_exit(struct net *net)
{
proc_free_inum(net->proc_inum);
}
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
.exit = net_ns_net_exit,
};
static int __init net_ns_init(void)
{
struct net_generic *ng;
#ifdef CONFIG_NET_NS
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC, NULL);
/* Create workqueue for cleanup */
netns_wq = create_singlethread_workqueue("netns");
if (!netns_wq)
panic("Could not create netns workq");
#endif
ng = net_alloc_generic();
if (!ng)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
mutex_lock(&net_mutex);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
rtnl_lock();
list_add_tail_rcu(&init_net.list, &net_namespace_list);
rtnl_unlock();
mutex_unlock(&net_mutex);
register_pernet_subsys(&net_ns_ops);
return 0;
}
pure_initcall(net_ns_init);
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
struct net *net;
int error;
LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
if (ops->init || (ops->id && ops->size)) {
for_each_net(net) {
error = ops_init(ops, net);
if (error)
goto out_undo;
list_add_tail(&net->exit_list, &net_exit_list);
}
}
return 0;
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
struct net *net;
LIST_HEAD(net_exit_list);
list_del(&ops->list);
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
#else
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
return ops_init(ops, &init_net);
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
#endif /* CONFIG_NET_NS */
static DEFINE_IDA(net_generic_ids);
static int register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
int error;
if (ops->id) {
again:
error = ida_get_new_above(&net_generic_ids, 1, ops->id);
if (error < 0) {
if (error == -EAGAIN) {
ida_pre_get(&net_generic_ids, GFP_KERNEL);
goto again;
}
return error;
}
max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id);
}
error = __register_pernet_operations(list, ops);
if (error) {
rcu_barrier();
if (ops->id)
ida_remove(&net_generic_ids, *ops->id);
}
return error;
}
static void unregister_pernet_operations(struct pernet_operations *ops)
{
__unregister_pernet_operations(ops);
rcu_barrier();
if (ops->id)
ida_remove(&net_generic_ids, *ops->id);
}
/**
* register_pernet_subsys - register a network namespace subsystem
* @ops: pernet operations structure for the subsystem
*
* Register a subsystem which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
 * When registered, all network namespace init functions are
 * called for every existing network namespace, allowing kernel
 * modules to have a race-free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
mutex_lock(&net_mutex);
error = register_pernet_operations(first_device, ops);
mutex_unlock(&net_mutex);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
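/*
 * Minimal usage sketch (the foo_* names are illustrative only),
 * mirroring how net_ns_ops is registered above:
 *
 *	static __net_init int foo_net_init(struct net *net)
 *	{
 *		return 0;
 *	}
 *
 *	static __net_exit void foo_net_exit(struct net *net)
 *	{
 *	}
 *
 *	static struct pernet_operations foo_net_ops = {
 *		.init = foo_net_init,
 *		.exit = foo_net_exit,
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return register_pernet_subsys(&foo_net_ops);
 *	}
 *
 * Setting .id (pointing at a static int) and .size in addition reserves
 * per-namespace storage reachable via net_generic(net, id).
 */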
/**
* unregister_pernet_subsys - unregister a network namespace subsystem
* @ops: pernet operations structure to manipulate
*
* Remove the pernet operations structure from the list to be
* used when network namespaces are created or destroyed. In
* addition run the exit method for all existing network
* namespaces.
*/
void unregister_pernet_subsys(struct pernet_operations *ops)
{
mutex_lock(&net_mutex);
unregister_pernet_operations(ops);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
/**
* register_pernet_device - register a network namespace device
* @ops: pernet operations structure for the subsystem
*
* Register a device which has init and exit functions
* that are called when network namespaces are created and
* destroyed respectively.
*
 * When registered, all network namespace init functions are
 * called for every existing network namespace, allowing kernel
 * modules to have a race-free view of the set of network namespaces.
*
* When a new network namespace is created all of the init
* methods are called in the order in which they were registered.
*
* When a network namespace is destroyed all of the exit methods
* are called in the reverse of the order with which they were
* registered.
*/
int register_pernet_device(struct pernet_operations *ops)
{
int error;
mutex_lock(&net_mutex);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
mutex_unlock(&net_mutex);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
/**
* unregister_pernet_device - unregister a network namespace netdevice
* @ops: pernet operations structure to manipulate
*
* Remove the pernet operations structure from the list to be
* used when network namespaces are created or destroyed. In
* addition run the exit method for all existing network
* namespaces.
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
mutex_lock(&net_mutex);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
mutex_unlock(&net_mutex);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
#ifdef CONFIG_NET_NS
static void *netns_get(struct task_struct *task)
{
struct net *net = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy)
net = get_net(nsproxy->net_ns);
task_unlock(task);
return net;
}
static void netns_put(void *ns)
{
put_net(ns);
}
static int netns_install(struct nsproxy *nsproxy, void *ns)
{
struct net *net = ns;
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
nsproxy->net_ns = get_net(net);
return 0;
}
static unsigned int netns_inum(void *ns)
{
struct net *net = ns;
return net->proc_inum;
}
const struct proc_ns_operations netns_operations = {
.name = "net",
.type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
.inum = netns_inum,
};
#endif

111
net/core/netclassid_cgroup.c Normal file
View file

@ -0,0 +1,111 @@
/*
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
#include <net/cls_cgroup.h>
#include <net/sock.h>
static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct cgroup_cls_state, css) : NULL;
}
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
return css_cls_state(task_css(p, net_cls_cgrp_id));
}
EXPORT_SYMBOL_GPL(task_cls_state);
static struct cgroup_subsys_state *
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_cls_state *cs;
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
return &cs->css;
}
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
struct cgroup_cls_state *cs = css_cls_state(css);
struct cgroup_cls_state *parent = css_cls_state(css->parent);
if (parent)
cs->classid = parent->classid;
return 0;
}
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
kfree(css_cls_state(css));
}
static int update_classid(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
if (sock)
sock->sk->sk_classid = (u32)(unsigned long)v;
return 0;
}
static void cgrp_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct cgroup_cls_state *cs = css_cls_state(css);
void *v = (void *)(unsigned long)cs->classid;
struct task_struct *p;
cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_classid, v);
task_unlock(p);
}
}
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
{
return css_cls_state(css)->classid;
}
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
u64 value)
{
css_cls_state(css)->classid = (u32) value;
return 0;
}
static struct cftype ss_files[] = {
{
.name = "classid",
.read_u64 = read_classid,
.write_u64 = write_classid,
},
{ } /* terminate */
};
struct cgroup_subsys net_cls_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = cgrp_attach,
.legacy_cftypes = ss_files,
};

70
net/core/netevent.c Normal file
View file

@ -0,0 +1,70 @@
/*
* Network event notifiers
*
* Authors:
* Tom Tucker <tom@opengridcomputing.com>
* Steve Wise <swise@opengridcomputing.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes:
*/
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
#include <linux/export.h>
#include <net/netevent.h>
static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
/**
* register_netevent_notifier - register a netevent notifier block
* @nb: notifier
*
* Register a notifier to be called when a netevent occurs.
* The notifier passed is linked into the kernel structures and must
* not be reused until it has been unregistered. A negative errno code
* is returned on a failure.
*/
int register_netevent_notifier(struct notifier_block *nb)
{
int err;
err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
return err;
}
EXPORT_SYMBOL_GPL(register_netevent_notifier);
/**
 * unregister_netevent_notifier - unregister a netevent notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
 * register_netevent_notifier(). The notifier is unlinked from the
* kernel structures and may then be reused. A negative errno code
* is returned on a failure.
*/
int unregister_netevent_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&netevent_notif_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_netevent_notifier);
/**
* call_netevent_notifiers - call all netevent notifier blocks
* @val: value passed unmodified to notifier function
* @v: pointer passed unmodified to notifier function
*
* Call all netevent notifier blocks. Parameters and return value
* are as for notifier_call_chain().
*/
int call_netevent_notifiers(unsigned long val, void *v)
{
return atomic_notifier_call_chain(&netevent_notif_chain, val, v);
}
EXPORT_SYMBOL_GPL(call_netevent_notifiers);
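A brief usage sketch (assumed, not from this commit): a consumer registers a notifier_block whose callback is invoked by call_netevent_notifiers() above; NETEVENT_NEIGH_UPDATE carries a struct neighbour pointer. The demo_* names are hypothetical.

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <net/netevent.h>
#include <net/neighbour.h>

static int demo_netevent_cb(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	if (event == NETEVENT_NEIGH_UPDATE) {
		struct neighbour *neigh = ptr;	/* payload for this event */

		pr_debug("neighbour update on %s\n", neigh->dev->name);
	}
	return NOTIFY_DONE;
}

static struct notifier_block demo_netevent_nb = {
	.notifier_call = demo_netevent_cb,
};

/* Typically from module init/exit:
 *	register_netevent_notifier(&demo_netevent_nb);
 *	...
 *	unregister_netevent_notifier(&demo_netevent_nb);
 */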

852
net/core/netpoll.c Normal file
View file

@ -0,0 +1,852 @@
/*
* Common framework for low-level network console, dump, and debugger code
*
* Sep 8 2003 Matt Mackall <mpm@selenic.com>
*
* based on the netconsole code from:
*
* Copyright (C) 2001 Ingo Molnar <mingo@redhat.com>
* Copyright (C) 2002 Red Hat, Inc.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>
#include <linux/if_arp.h>
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/if_vlan.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/ip6_checksum.h>
#include <asm/unaligned.h>
#include <trace/events/napi.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
* message gets out even in extreme OOM situations.
*/
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
static struct sk_buff_head skb_pool;
DEFINE_STATIC_SRCU(netpoll_srcu);
#define USEC_PER_POLL 50
#define MAX_SKB_SIZE \
(sizeof(struct ethhdr) + \
sizeof(struct iphdr) + \
sizeof(struct udphdr) + \
MAX_UDP_CHUNK)
static void zap_completion_queue(void);
static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
module_param(carrier_timeout, uint, 0644);
#define np_info(np, fmt, ...) \
pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
#define np_err(np, fmt, ...) \
pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
#define np_notice(np, fmt, ...) \
pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
{
int status = NETDEV_TX_OK;
netdev_features_t features;
features = netif_skb_features(skb);
if (vlan_tx_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto)) {
skb = __vlan_hwaccel_push_inside(skb);
if (unlikely(!skb)) {
/* This is actually a packet drop, but we
* don't want the code that calls this
* function to try and operate on a NULL skb.
*/
goto out;
}
}
status = netdev_start_xmit(skb, dev, txq, false);
out:
return status;
}
static void queue_process(struct work_struct *work)
{
struct netpoll_info *npinfo =
container_of(work, struct netpoll_info, tx_work.work);
struct sk_buff *skb;
unsigned long flags;
while ((skb = skb_dequeue(&npinfo->txq))) {
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
if (!netif_device_present(dev) || !netif_running(dev)) {
kfree_skb(skb);
continue;
}
txq = skb_get_tx_queue(dev, skb);
local_irq_save(flags);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
schedule_delayed_work(&npinfo->tx_work, HZ/10);
return;
}
HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
}
}
/*
* Check whether delayed processing was scheduled for our NIC. If so,
* we attempt to grab the poll lock and use ->poll() to pump the card.
* If this fails, either we've recursed in ->poll() or it's already
* running on another CPU.
*
* Note: we don't mask interrupts with this lock because we're using
* trylock here and interrupts are already disabled in the softirq
* case. Further, we test the poll_owner to avoid recursion on UP
* systems where the lock doesn't exist.
*/
static int poll_one_napi(struct napi_struct *napi, int budget)
{
int work;
/* net_rx_action's ->poll() invocations and ours are
* synchronized by this test which is only made while
* holding the napi->poll_lock.
*/
if (!test_bit(NAPI_STATE_SCHED, &napi->state))
return budget;
set_bit(NAPI_STATE_NPSVC, &napi->state);
work = napi->poll(napi, budget);
WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
return budget - work;
}
static void poll_napi(struct net_device *dev, int budget)
{
struct napi_struct *napi;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner != smp_processor_id() &&
spin_trylock(&napi->poll_lock)) {
budget = poll_one_napi(napi, budget);
spin_unlock(&napi->poll_lock);
}
}
}
static void netpoll_poll_dev(struct net_device *dev)
{
const struct net_device_ops *ops;
struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
int budget = 0;
/* Don't do any rx activity if the dev_lock mutex is held
* the dev_open/close paths use this to block netpoll activity
* while changing device state
*/
if (down_trylock(&ni->dev_lock))
return;
if (!netif_running(dev)) {
up(&ni->dev_lock);
return;
}
ops = dev->netdev_ops;
if (!ops->ndo_poll_controller) {
up(&ni->dev_lock);
return;
}
/* Process pending work on NIC */
ops->ndo_poll_controller(dev);
poll_napi(dev, budget);
up(&ni->dev_lock);
zap_completion_queue();
}
void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
int idx;
might_sleep();
idx = srcu_read_lock(&netpoll_srcu);
ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
if (ni)
down(&ni->dev_lock);
srcu_read_unlock(&netpoll_srcu, idx);
}
EXPORT_SYMBOL(netpoll_poll_disable);
void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
rcu_read_lock();
ni = rcu_dereference(dev->npinfo);
if (ni)
up(&ni->dev_lock);
rcu_read_unlock();
}
EXPORT_SYMBOL(netpoll_poll_enable);
static void refill_skbs(void)
{
struct sk_buff *skb;
unsigned long flags;
spin_lock_irqsave(&skb_pool.lock, flags);
while (skb_pool.qlen < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
__skb_queue_tail(&skb_pool, skb);
}
spin_unlock_irqrestore(&skb_pool.lock, flags);
}
static void zap_completion_queue(void)
{
unsigned long flags;
struct softnet_data *sd = &get_cpu_var(softnet_data);
if (sd->completion_queue) {
struct sk_buff *clist;
local_irq_save(flags);
clist = sd->completion_queue;
sd->completion_queue = NULL;
local_irq_restore(flags);
while (clist != NULL) {
struct sk_buff *skb = clist;
clist = clist->next;
if (!skb_irq_freeable(skb)) {
atomic_inc(&skb->users);
dev_kfree_skb_any(skb); /* put this one back */
} else {
__kfree_skb(skb);
}
}
}
put_cpu_var(softnet_data);
}
static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
{
int count = 0;
struct sk_buff *skb;
zap_completion_queue();
refill_skbs();
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
if (!skb)
skb = skb_dequeue(&skb_pool);
if (!skb) {
if (++count < 10) {
netpoll_poll_dev(np->dev);
goto repeat;
}
return NULL;
}
atomic_set(&skb->users, 1);
skb_reserve(skb, reserve);
return skb;
}
static int netpoll_owner_active(struct net_device *dev)
{
struct napi_struct *napi;
list_for_each_entry(napi, &dev->napi_list, dev_list) {
if (napi->poll_owner == smp_processor_id())
return 1;
}
return 0;
}
/* call with IRQ disabled */
void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
struct net_device *dev)
{
int status = NETDEV_TX_BUSY;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
WARN_ON_ONCE(!irqs_disabled());
npinfo = rcu_dereference_bh(np->dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
return;
}
/* don't get messages out of order, and no recursion */
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
struct netdev_queue *txq;
txq = netdev_pick_tx(dev, skb, NULL);
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
if (HARD_TX_TRYLOCK(dev, txq)) {
if (!netif_xmit_stopped(txq))
status = netpoll_start_xmit(skb, dev, txq);
HARD_TX_UNLOCK(dev, txq);
if (status == NETDEV_TX_OK)
break;
}
/* tickle the device, maybe there is some cleanup */
netpoll_poll_dev(np->dev);
udelay(USEC_PER_POLL);
}
WARN_ONCE(!irqs_disabled(),
"netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
dev->name, dev->netdev_ops->ndo_start_xmit);
}
if (status != NETDEV_TX_OK) {
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work, 0);
}
}
EXPORT_SYMBOL(netpoll_send_skb_on_dev);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
int total_len, ip_len, udp_len;
struct sk_buff *skb;
struct udphdr *udph;
struct iphdr *iph;
struct ethhdr *eth;
static atomic_t ip_ident;
struct ipv6hdr *ip6h;
udp_len = len + sizeof(*udph);
if (np->ipv6)
ip_len = udp_len + sizeof(*ip6h);
else
ip_len = udp_len + sizeof(*iph);
total_len = ip_len + LL_RESERVED_SPACE(np->dev);
skb = find_skb(np, total_len + np->dev->needed_tailroom,
total_len - len);
if (!skb)
return;
skb_copy_to_linear_data(skb, msg, len);
skb_put(skb, len);
skb_push(skb, sizeof(*udph));
skb_reset_transport_header(skb);
udph = udp_hdr(skb);
udph->source = htons(np->local_port);
udph->dest = htons(np->remote_port);
udph->len = htons(udp_len);
if (np->ipv6) {
udph->check = 0;
udph->check = csum_ipv6_magic(&np->local_ip.in6,
&np->remote_ip.in6,
udp_len, IPPROTO_UDP,
csum_partial(udph, udp_len, 0));
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
skb_push(skb, sizeof(*ip6h));
skb_reset_network_header(skb);
ip6h = ipv6_hdr(skb);
/* ip6h->version = 6; ip6h->priority = 0; */
put_unaligned(0x60, (unsigned char *)ip6h);
ip6h->flow_lbl[0] = 0;
ip6h->flow_lbl[1] = 0;
ip6h->flow_lbl[2] = 0;
ip6h->payload_len = htons(sizeof(struct udphdr) + len);
ip6h->nexthdr = IPPROTO_UDP;
ip6h->hop_limit = 32;
ip6h->saddr = np->local_ip.in6;
ip6h->daddr = np->remote_ip.in6;
eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
} else {
udph->check = 0;
udph->check = csum_tcpudp_magic(np->local_ip.ip,
np->remote_ip.ip,
udp_len, IPPROTO_UDP,
csum_partial(udph, udp_len, 0));
if (udph->check == 0)
udph->check = CSUM_MANGLED_0;
skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
/* iph->version = 4; iph->ihl = 5; */
put_unaligned(0x45, (unsigned char *)iph);
iph->tos = 0;
put_unaligned(htons(ip_len), &(iph->tot_len));
iph->id = htons(atomic_inc_return(&ip_ident));
iph->frag_off = 0;
iph->ttl = 64;
iph->protocol = IPPROTO_UDP;
iph->check = 0;
put_unaligned(np->local_ip.ip, &(iph->saddr));
put_unaligned(np->remote_ip.ip, &(iph->daddr));
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
skb_reset_mac_header(skb);
skb->protocol = eth->h_proto = htons(ETH_P_IP);
}
ether_addr_copy(eth->h_source, np->dev->dev_addr);
ether_addr_copy(eth->h_dest, np->remote_mac);
skb->dev = np->dev;
netpoll_send_skb(np, skb);
}
EXPORT_SYMBOL(netpoll_send_udp);
void netpoll_print_options(struct netpoll *np)
{
np_info(np, "local port %d\n", np->local_port);
if (np->ipv6)
np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
else
np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
np_info(np, "interface '%s'\n", np->dev_name);
np_info(np, "remote port %d\n", np->remote_port);
if (np->ipv6)
np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
else
np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
np_info(np, "remote ethernet address %pM\n", np->remote_mac);
}
EXPORT_SYMBOL(netpoll_print_options);
static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
{
const char *end;
if (!strchr(str, ':') &&
in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
if (!*end)
return 0;
}
if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
#if IS_ENABLED(CONFIG_IPV6)
if (!*end)
return 1;
#else
return -1;
#endif
}
return -1;
}
int netpoll_parse_options(struct netpoll *np, char *opt)
{
char *cur=opt, *delim;
int ipv6;
bool ipversion_set = false;
if (*cur != '@') {
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
*delim = 0;
if (kstrtou16(cur, 10, &np->local_port))
goto parse_failed;
cur = delim;
}
cur++;
if (*cur != '/') {
ipversion_set = true;
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
if (ipv6 < 0)
goto parse_failed;
else
np->ipv6 = (bool)ipv6;
cur = delim;
}
cur++;
if (*cur != ',') {
/* parse out dev name */
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
*delim = 0;
strlcpy(np->dev_name, cur, sizeof(np->dev_name));
cur = delim;
}
cur++;
if (*cur != '@') {
/* dst port */
if ((delim = strchr(cur, '@')) == NULL)
goto parse_failed;
*delim = 0;
if (*cur == ' ' || *cur == '\t')
np_info(np, "warning: whitespace is not allowed\n");
if (kstrtou16(cur, 10, &np->remote_port))
goto parse_failed;
cur = delim;
}
cur++;
/* dst ip */
if ((delim = strchr(cur, '/')) == NULL)
goto parse_failed;
*delim = 0;
ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
if (ipv6 < 0)
goto parse_failed;
else if (ipversion_set && np->ipv6 != (bool)ipv6)
goto parse_failed;
else
np->ipv6 = (bool)ipv6;
cur = delim + 1;
if (*cur != 0) {
/* MAC address */
if (!mac_pton(cur, np->remote_mac))
goto parse_failed;
}
netpoll_print_options(np);
return 0;
parse_failed:
np_info(np, "couldn't parse config at '%s'!\n", cur);
return -1;
}
EXPORT_SYMBOL(netpoll_parse_options);
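For reference, the string accepted above uses the familiar netconsole layout local_port@local_ip/dev_name,remote_port@remote_ip/remote_mac, where any field may be left empty to keep its default. A hedged sketch of driving the parser (all values and the demo_* names are invented; the buffer must be writable because parsing inserts NUL terminators):

	static struct netpoll demo_np = {
		.name = "demo",		/* used by the np_info()/np_err() macros */
	};
	static char demo_cfg[] =
		"6665@192.168.0.1/eth0,6666@192.168.0.2/00:11:22:33:44:55";

	if (netpoll_parse_options(&demo_np, demo_cfg) == 0)
		netpoll_setup(&demo_np);	/* binds to eth0, opening it if needed */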
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
int err;
np->dev = ndev;
strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
!ndev->netdev_ops->ndo_poll_controller) {
np_err(np, "%s doesn't support polling, aborting\n",
np->dev_name);
err = -ENOTSUPP;
goto out;
}
if (!ndev->npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
goto out;
}
sema_init(&npinfo->dev_lock, 1);
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
atomic_set(&npinfo->refcnt, 1);
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_setup) {
err = ops->ndo_netpoll_setup(ndev, npinfo);
if (err)
goto free_npinfo;
}
} else {
npinfo = rtnl_dereference(ndev->npinfo);
atomic_inc(&npinfo->refcnt);
}
npinfo->netpoll = np;
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
return 0;
free_npinfo:
kfree(npinfo);
out:
return err;
}
EXPORT_SYMBOL_GPL(__netpoll_setup);
int netpoll_setup(struct netpoll *np)
{
struct net_device *ndev = NULL;
struct in_device *in_dev;
int err;
rtnl_lock();
if (np->dev_name) {
struct net *net = current->nsproxy->net_ns;
ndev = __dev_get_by_name(net, np->dev_name);
}
if (!ndev) {
np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
err = -ENODEV;
goto unlock;
}
dev_hold(ndev);
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
err = -EBUSY;
goto put;
}
if (!netif_running(ndev)) {
unsigned long atmost, atleast;
np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
err = dev_open(ndev);
if (err) {
np_err(np, "failed to open %s\n", ndev->name);
goto put;
}
rtnl_unlock();
atleast = jiffies + HZ/10;
atmost = jiffies + carrier_timeout * HZ;
while (!netif_carrier_ok(ndev)) {
if (time_after(jiffies, atmost)) {
np_notice(np, "timeout waiting for carrier\n");
break;
}
msleep(1);
}
/* If carrier appears to come up instantly, we don't
* trust it and pause so that we don't pump all our
* queued console messages into the bitbucket.
*/
if (time_before(jiffies, atleast)) {
np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
msleep(4000);
}
rtnl_lock();
}
if (!np->local_ip.ip) {
if (!np->ipv6) {
in_dev = __in_dev_get_rtnl(ndev);
if (!in_dev || !in_dev->ifa_list) {
np_err(np, "no IP address for %s, aborting\n",
np->dev_name);
err = -EDESTADDRREQ;
goto put;
}
np->local_ip.ip = in_dev->ifa_list->ifa_local;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_dev *idev;
err = -EDESTADDRREQ;
idev = __in6_dev_get(ndev);
if (idev) {
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
list_for_each_entry(ifp, &idev->addr_list, if_list) {
if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
continue;
np->local_ip.in6 = ifp->addr;
err = 0;
break;
}
read_unlock_bh(&idev->lock);
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
np->dev_name);
goto put;
} else
np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
#else
np_err(np, "IPv6 is not supported %s, aborting\n",
np->dev_name);
err = -EINVAL;
goto put;
#endif
}
}
/* fill up the skb queue */
refill_skbs();
err = __netpoll_setup(np, ndev);
if (err)
goto put;
rtnl_unlock();
return 0;
put:
dev_put(ndev);
unlock:
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(netpoll_setup);
static int __init netpoll_init(void)
{
skb_queue_head_init(&skb_pool);
return 0;
}
core_initcall(netpoll_init);
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
struct netpoll_info *npinfo =
container_of(rcu_head, struct netpoll_info, rcu);
skb_queue_purge(&npinfo->txq);
/* we can't call cancel_delayed_work_sync here, as we are in softirq */
cancel_delayed_work(&npinfo->tx_work);
/* clean after last, unfinished work */
__skb_queue_purge(&npinfo->txq);
/* now cancel it again */
cancel_delayed_work(&npinfo->tx_work);
kfree(npinfo);
}
void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
/* rtnl_dereference would be preferable here but
* rcu_cleanup_netpoll path can put us in here safely without
* holding the rtnl, so plain rcu_dereference it is
*/
npinfo = rtnl_dereference(np->dev->npinfo);
if (!npinfo)
return;
synchronize_srcu(&netpoll_srcu);
if (atomic_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
RCU_INIT_POINTER(np->dev->npinfo, NULL);
call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
} else
RCU_INIT_POINTER(np->dev->npinfo, NULL);
}
EXPORT_SYMBOL_GPL(__netpoll_cleanup);
static void netpoll_async_cleanup(struct work_struct *work)
{
struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
rtnl_lock();
__netpoll_cleanup(np);
rtnl_unlock();
kfree(np);
}
void __netpoll_free_async(struct netpoll *np)
{
schedule_work(&np->cleanup_work);
}
EXPORT_SYMBOL_GPL(__netpoll_free_async);
void netpoll_cleanup(struct netpoll *np)
{
rtnl_lock();
if (!np->dev)
goto out;
__netpoll_cleanup(np);
dev_put(np->dev);
np->dev = NULL;
out:
rtnl_unlock();
}
EXPORT_SYMBOL(netpoll_cleanup);

288
net/core/netprio_cgroup.c Normal file
View file

@ -0,0 +1,288 @@
/*
* net/core/netprio_cgroup.c Priority Control Group
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Neil Horman <nhorman@tuxdriver.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/netprio_cgroup.h>
#include <linux/fdtable.h>
#define PRIOMAP_MIN_SZ 128
/*
* Extend @dev->priomap so that it's large enough to accommodate
* @target_idx. @dev->priomap.priomap_len > @target_idx after successful
* return. Must be called under rtnl lock.
*/
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
{
struct netprio_map *old, *new;
size_t new_sz, new_len;
/* is the existing priomap large enough? */
old = rtnl_dereference(dev->priomap);
if (old && old->priomap_len > target_idx)
return 0;
/*
* Determine the new size. Let's keep it power-of-two. We start
* from PRIOMAP_MIN_SZ and double it until it's large enough to
* accommodate @target_idx.
*/
new_sz = PRIOMAP_MIN_SZ;
while (true) {
new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
sizeof(new->priomap[0]);
if (new_len > target_idx)
break;
new_sz *= 2;
/* overflowed? */
if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
return -ENOSPC;
}
/* allocate & copy */
new = kzalloc(new_sz, GFP_KERNEL);
if (!new)
return -ENOMEM;
if (old)
memcpy(new->priomap, old->priomap,
old->priomap_len * sizeof(old->priomap[0]));
new->priomap_len = new_len;
/* install the new priomap */
rcu_assign_pointer(dev->priomap, new);
if (old)
kfree_rcu(old, rcu);
return 0;
}
/**
* netprio_prio - return the effective netprio of a cgroup-net_device pair
* @css: css part of the target pair
* @dev: net_device part of the target pair
*
* Should be called under RCU read or rtnl lock.
*/
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
{
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
int id = css->cgroup->id;
if (map && id < map->priomap_len)
return map->priomap[id];
return 0;
}
/**
* netprio_set_prio - set netprio on a cgroup-net_device pair
* @css: css part of the target pair
* @dev: net_device part of the target pair
* @prio: prio to set
*
* Set netprio to @prio on @css-@dev pair. Should be called under rtnl
* lock and may fail under memory pressure for non-zero @prio.
*/
static int netprio_set_prio(struct cgroup_subsys_state *css,
struct net_device *dev, u32 prio)
{
struct netprio_map *map;
int id = css->cgroup->id;
int ret;
/* avoid extending priomap for zero writes */
map = rtnl_dereference(dev->priomap);
if (!prio && (!map || map->priomap_len <= id))
return 0;
ret = extend_netdev_table(dev, id);
if (ret)
return ret;
map = rtnl_dereference(dev->priomap);
map->priomap[id] = prio;
return 0;
}
static struct cgroup_subsys_state *
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css;
css = kzalloc(sizeof(*css), GFP_KERNEL);
if (!css)
return ERR_PTR(-ENOMEM);
return css;
}
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *parent_css = css->parent;
struct net_device *dev;
int ret = 0;
if (!parent_css)
return 0;
rtnl_lock();
/*
* Inherit prios from the parent. As all prios are set during
* onlining, there is no need to clear them on offline.
*/
for_each_netdev(&init_net, dev) {
u32 prio = netprio_prio(parent_css, dev);
ret = netprio_set_prio(css, dev, prio);
if (ret)
break;
}
rtnl_unlock();
return ret;
}
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
kfree(css);
}
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
{
return css->cgroup->id;
}
static int read_priomap(struct seq_file *sf, void *v)
{
struct net_device *dev;
rcu_read_lock();
for_each_netdev_rcu(&init_net, dev)
seq_printf(sf, "%s %u\n", dev->name,
netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
return 0;
}
static ssize_t write_priomap(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
int ret;
if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
return -EINVAL;
dev = dev_get_by_name(&init_net, devname);
if (!dev)
return -ENODEV;
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
rtnl_unlock();
dev_put(dev);
return ret ?: nbytes;
}
static int update_netprio(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
if (sock)
sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
return 0;
}
static void net_prio_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *p;
void *v = (void *)(unsigned long)css->cgroup->id;
cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
}
}
static struct cftype ss_files[] = {
{
.name = "prioidx",
.read_u64 = read_prioidx,
},
{
.name = "ifpriomap",
.seq_show = read_priomap,
.write = write_priomap,
},
{ } /* terminate */
};
struct cgroup_subsys net_prio_cgrp_subsys = {
.css_alloc = cgrp_css_alloc,
.css_online = cgrp_css_online,
.css_free = cgrp_css_free,
.attach = net_prio_attach,
.legacy_cftypes = ss_files,
};
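To connect the pieces (an assumed sketch of what the transmit path in net/core/dev.c effectively does via skb_update_prio()): at transmit time the device's priomap is indexed by the socket's stored cgroup index, so a value written through ifpriomap above ends up in skb->priority.

#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/netprio_cgroup.h>

/* Hypothetical helper mirroring the real lookup, for illustration only. */
static void demo_update_skb_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
	u32 idx = skb->sk ? skb->sk->sk_cgrp_prioidx : 0;

	if (map && idx < map->priomap_len && map->priomap[idx])
		skb->priority = map->priomap[idx];
}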
static int netprio_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct netprio_map *old;
/*
* Note this is called with rtnl_lock held so we have update side
* protection on our rcu assignments
*/
switch (event) {
case NETDEV_UNREGISTER:
old = rtnl_dereference(dev->priomap);
RCU_INIT_POINTER(dev->priomap, NULL);
if (old)
kfree_rcu(old, rcu);
break;
}
return NOTIFY_DONE;
}
static struct notifier_block netprio_device_notifier = {
.notifier_call = netprio_device_event
};
static int __init init_cgroup_netprio(void)
{
register_netdevice_notifier(&netprio_device_notifier);
return 0;
}
subsys_initcall(init_cgroup_netprio);
MODULE_LICENSE("GPL v2");

3863
net/core/pktgen.c Normal file

File diff suppressed because it is too large

193
net/core/ptp_classifier.c Normal file
View file

@ -0,0 +1,193 @@
/* PTP classifier
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/* The below program is the bpf_asm (tools/net/) representation of
* the opcode array in the ptp_filter structure.
*
* For convenience, this can easily be altered and reviewed with
* bpf_asm and bpf_dbg, e.g. `./bpf_asm -c prog` where prog is a
* simple file containing the below program:
*
* ldh [12] ; load ethertype
*
* ; PTP over UDP over IPv4 over Ethernet
* test_ipv4:
* jneq #0x800, test_ipv6 ; ETH_P_IP ?
* ldb [23] ; load proto
* jneq #17, drop_ipv4 ; IPPROTO_UDP ?
* ldh [20] ; load frag offset field
* jset #0x1fff, drop_ipv4 ; don't allow fragments
* ldxb 4*([14]&0xf) ; load IP header len
* ldh [x + 16] ; load UDP dst port
* jneq #319, drop_ipv4 ; is port PTP_EV_PORT ?
* ldh [x + 22] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x10 ; PTP_CLASS_IPV4
* ret a ; return PTP class
* drop_ipv4: ret #0x0 ; PTP_CLASS_NONE
*
* ; PTP over UDP over IPv6 over Ethernet
* test_ipv6:
* jneq #0x86dd, test_8021q ; ETH_P_IPV6 ?
* ldb [20] ; load proto
* jneq #17, drop_ipv6 ; IPPROTO_UDP ?
* ldh [56] ; load UDP dst port
* jneq #319, drop_ipv6 ; is port PTP_EV_PORT ?
* ldh [62] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x20 ; PTP_CLASS_IPV6
* ret a ; return PTP class
* drop_ipv6: ret #0x0 ; PTP_CLASS_NONE
*
* ; PTP over 802.1Q over Ethernet
* test_8021q:
* jneq #0x8100, test_ieee1588 ; ETH_P_8021Q ?
* ldh [16] ; load inner type
* jneq #0x88f7, test_8021q_ipv4 ; ETH_P_1588 ?
* ldb [18] ; load payload
* and #0x8 ; as we don't have ports here, test
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
* ldh [18] ; reload payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x70 ; PTP_CLASS_VLAN|PTP_CLASS_L2
* ret a ; return PTP class
*
* ; PTP over UDP over IPv4 over 802.1Q over Ethernet
* test_8021q_ipv4:
* jneq #0x800, test_8021q_ipv6 ; ETH_P_IP ?
* ldb [27] ; load proto
* jneq #17, drop_8021q_ipv4 ; IPPROTO_UDP ?
* ldh [24] ; load frag offset field
* jset #0x1fff, drop_8021q_ipv4; don't allow fragments
* ldxb 4*([18]&0xf) ; load IP header len
* ldh [x + 20] ; load UDP dst port
* jneq #319, drop_8021q_ipv4 ; is port PTP_EV_PORT ?
* ldh [x + 26] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x50 ; PTP_CLASS_VLAN|PTP_CLASS_IPV4
* ret a ; return PTP class
* drop_8021q_ipv4: ret #0x0 ; PTP_CLASS_NONE
*
* ; PTP over UDP over IPv6 over 802.1Q over Ethernet
* test_8021q_ipv6:
* jneq #0x86dd, drop_8021q_ipv6 ; ETH_P_IPV6 ?
* ldb [24] ; load proto
* jneq #17, drop_8021q_ipv6 ; IPPROTO_UDP ?
* ldh [60] ; load UDP dst port
* jneq #319, drop_8021q_ipv6 ; is port PTP_EV_PORT ?
* ldh [66] ; load payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x60 ; PTP_CLASS_VLAN|PTP_CLASS_IPV6
* ret a ; return PTP class
* drop_8021q_ipv6: ret #0x0 ; PTP_CLASS_NONE
*
* ; PTP over Ethernet
* test_ieee1588:
* jneq #0x88f7, drop_ieee1588 ; ETH_P_1588 ?
* ldb [14] ; load payload
* and #0x8 ; as we don't have ports here, test
* jneq #0x0, drop_ieee1588 ; for PTP_GEN_BIT and drop these
* ldh [14] ; reload payload
* and #0xf ; mask PTP_CLASS_VMASK
* or #0x30 ; PTP_CLASS_L2
* ret a ; return PTP class
* drop_ieee1588: ret #0x0 ; PTP_CLASS_NONE
*/
#include <linux/skbuff.h>
#include <linux/filter.h>
#include <linux/ptp_classify.h>
static struct bpf_prog *ptp_insns __read_mostly;
unsigned int ptp_classify_raw(const struct sk_buff *skb)
{
return BPF_PROG_RUN(ptp_insns, skb);
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
void __init ptp_classifier_init(void)
{
static struct sock_filter ptp_filter[] __initdata = {
{ 0x28, 0, 0, 0x0000000c },
{ 0x15, 0, 12, 0x00000800 },
{ 0x30, 0, 0, 0x00000017 },
{ 0x15, 0, 9, 0x00000011 },
{ 0x28, 0, 0, 0x00000014 },
{ 0x45, 7, 0, 0x00001fff },
{ 0xb1, 0, 0, 0x0000000e },
{ 0x48, 0, 0, 0x00000010 },
{ 0x15, 0, 4, 0x0000013f },
{ 0x48, 0, 0, 0x00000016 },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000010 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 9, 0x000086dd },
{ 0x30, 0, 0, 0x00000014 },
{ 0x15, 0, 6, 0x00000011 },
{ 0x28, 0, 0, 0x00000038 },
{ 0x15, 0, 4, 0x0000013f },
{ 0x28, 0, 0, 0x0000003e },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000020 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 32, 0x00008100 },
{ 0x28, 0, 0, 0x00000010 },
{ 0x15, 0, 7, 0x000088f7 },
{ 0x30, 0, 0, 0x00000012 },
{ 0x54, 0, 0, 0x00000008 },
{ 0x15, 0, 35, 0x00000000 },
{ 0x28, 0, 0, 0x00000012 },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000070 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x15, 0, 12, 0x00000800 },
{ 0x30, 0, 0, 0x0000001b },
{ 0x15, 0, 9, 0x00000011 },
{ 0x28, 0, 0, 0x00000018 },
{ 0x45, 7, 0, 0x00001fff },
{ 0xb1, 0, 0, 0x00000012 },
{ 0x48, 0, 0, 0x00000014 },
{ 0x15, 0, 4, 0x0000013f },
{ 0x48, 0, 0, 0x0000001a },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000050 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 8, 0x000086dd },
{ 0x30, 0, 0, 0x00000018 },
{ 0x15, 0, 6, 0x00000011 },
{ 0x28, 0, 0, 0x0000003c },
{ 0x15, 0, 4, 0x0000013f },
{ 0x28, 0, 0, 0x00000042 },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000060 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
{ 0x15, 0, 7, 0x000088f7 },
{ 0x30, 0, 0, 0x0000000e },
{ 0x54, 0, 0, 0x00000008 },
{ 0x15, 0, 4, 0x00000000 },
{ 0x28, 0, 0, 0x0000000e },
{ 0x54, 0, 0, 0x0000000f },
{ 0x44, 0, 0, 0x00000030 },
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
};
struct sock_fprog_kern ptp_prog = {
.len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
};
BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
}
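A small usage sketch (assumed; the helper name is made up): callers such as timestamping drivers run the classifier on an skb and inspect the returned class, using the PTP_CLASS_* constants from <linux/ptp_classify.h> that the BPF comment above refers to.

#include <linux/skbuff.h>
#include <linux/ptp_classify.h>

/* Hypothetical check: is this a PTPv2 event message we may want to timestamp? */
static bool demo_is_ptpv2_event(const struct sk_buff *skb)
{
	unsigned int type = ptp_classify_raw(skb);

	if (type == PTP_CLASS_NONE)
		return false;

	/* the low nibble carries the PTP version (PTP_CLASS_VMASK) */
	return (type & PTP_CLASS_VMASK) == PTP_CLASS_V2;
}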

205
net/core/request_sock.c Normal file
View file

@ -0,0 +1,205 @@
/*
* NET Generic infrastructure for Network protocols.
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* From code originally in include/net/tcp.h
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/tcp.h>
#include <linux/vmalloc.h>
#include <net/request_sock.h>
/*
* Maximum number of SYN_RECV sockets in queue per LISTEN socket.
* One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
* It would be better to replace it with a global counter for all sockets
* but then some measure against one socket starving all other sockets
* would be needed.
*
* The minimum value of it is 128. Experiments with real servers show that
* it is absolutely not enough even at 100 conn/sec. 256 cures most
* of the problems.
* This value is adjusted to 128 for low-memory machines,
* and it will increase in proportion to the memory of the machine.
* Note: Don't forget somaxconn, which may limit the backlog too.
*/
int sysctl_max_syn_backlog = 256;
EXPORT_SYMBOL(sysctl_max_syn_backlog);
int reqsk_queue_alloc(struct request_sock_queue *queue,
unsigned int nr_table_entries)
{
size_t lopt_size = sizeof(struct listen_sock);
struct listen_sock *lopt = NULL;
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
lopt = kzalloc(lopt_size, GFP_KERNEL |
__GFP_NOWARN |
__GFP_NORETRY);
if (!lopt)
lopt = vzalloc(lopt_size);
if (!lopt)
return -ENOMEM;
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
rwlock_init(&queue->syn_wait_lock);
queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
lopt->max_qlen_log = ilog2(nr_table_entries);
write_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
write_unlock_bh(&queue->syn_wait_lock);
return 0;
}
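A quick worked example of the sizing above (illustrative numbers only): with a listen() backlog of 100 and the default sysctl_max_syn_backlog of 256, nr_table_entries becomes min(100, 256) = 100, then max(100, 8) = 100, and roundup_pow_of_two(100 + 1) = 128 hash buckets, so max_qlen_log ends up as ilog2(128) = 7.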
void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* This is an error recovery path only, no locking needed */
kvfree(queue->listen_opt);
}
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
struct request_sock_queue *queue)
{
struct listen_sock *lopt;
write_lock_bh(&queue->syn_wait_lock);
lopt = queue->listen_opt;
queue->listen_opt = NULL;
write_unlock_bh(&queue->syn_wait_lock);
return lopt;
}
void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
if (lopt->qlen != 0) {
unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
while ((req = lopt->syn_table[i]) != NULL) {
lopt->syn_table[i] = req->dl_next;
lopt->qlen--;
reqsk_free(req);
}
}
}
WARN_ON(lopt->qlen != 0);
kvfree(lopt);
}
/*
* This function is called to set a Fast Open socket's "fastopen_rsk" field
* to NULL when a TFO socket no longer needs to access the request_sock.
* This happens only after 3WHS has been either completed or aborted (e.g.,
* RST is received).
*
* Before TFO, a child socket is created only after 3WHS is completed,
* hence it never needs to access the request_sock. Things get a lot more
* complex with TFO. A child socket, accepted or not, has to access its
* request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
* until 3WHS is either completed or aborted. Afterwards the req will stay
* until either the child socket is accepted, or in the rare case when the
* listener is closed before the child is accepted.
*
* In short, a request socket is only freed after BOTH 3WHS has completed
* (or aborted) and the child socket has been accepted (or listener closed).
* When a child socket is accepted, its corresponding req->sk is set to
* NULL since it's no longer needed. More importantly, "req->sk == NULL"
* will be used by the code below to determine if a child socket has been
* accepted or not, and the check is protected by the fastopenq->lock
* described below.
*
* Note that fastopen_rsk is only accessed from the child socket's context
* with its socket lock held. But a request_sock (req) can be accessed by
* both its child socket through fastopen_rsk, and a listener socket through
* icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
* lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
* Only in the rare case when both the listener and the child locks are held,
* e.g., in inet_csk_listen_stop(), do we not need to acquire the lock.
* The lock also protects other fields such as fastopenq->qlen, which is
* decremented by this function when fastopen_rsk is no longer needed.
*
* Note that another solution was to simply use the existing socket lock
* from the listener. But first socket lock is difficult to use. It is not
* a simple spin lock - one must consider sock_owned_by_user() and arrange
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
* acquire a child's lock while holding listener's socket lock. A corner
* case might also exist in tcp_v4_hnd_req() that will trigger this locking
* order.
*
* When a TFO req is created, it needs to sock_hold its listener to prevent
* the latter data structure from going away.
*
* This function also sets "treq->listener" to NULL and unreferences the listener
* socket. treq->listener is used by the listener so it is protected by the
* fastopenq->lock in this function.
*/
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset)
{
struct sock *lsk = tcp_rsk(req)->listener;
struct fastopen_queue *fastopenq =
inet_csk(lsk)->icsk_accept_queue.fastopenq;
tcp_sk(sk)->fastopen_rsk = NULL;
spin_lock_bh(&fastopenq->lock);
fastopenq->qlen--;
tcp_rsk(req)->listener = NULL;
if (req->sk) /* the child socket hasn't been accepted yet */
goto out;
if (!reset || lsk->sk_state != TCP_LISTEN) {
/* If the listener has been closed don't bother with the
* special RST handling below.
*/
spin_unlock_bh(&fastopenq->lock);
sock_put(lsk);
reqsk_free(req);
return;
}
/* Wait for 60 seconds before removing a req that has triggered RST.
* This is a simple defense against TFO spoofing attack - by
* counting the req against fastopen.max_qlen, and disabling
* TFO when the qlen exceeds max_qlen.
*
* For more details see CoNext'11 "TCP Fast Open" paper.
*/
req->expires = jiffies + 60*HZ;
if (fastopenq->rskq_rst_head == NULL)
fastopenq->rskq_rst_head = req;
else
fastopenq->rskq_rst_tail->dl_next = req;
req->dl_next = NULL;
fastopenq->rskq_rst_tail = req;
fastopenq->qlen++;
out:
spin_unlock_bh(&fastopenq->lock);
sock_put(lsk);
}

3082
net/core/rtnetlink.c Normal file

File diff suppressed because it is too large

341
net/core/scm.c Normal file
View file

@ -0,0 +1,341 @@
/* scm.c - Socket level control messages processing.
*
* Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Alignment and value checking mods by Craig Metz
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
/*
* Only allow a user to send credentials, that they could set with
* setu(g)id.
*/
static __inline__ int scm_check_creds(struct ucred *creds)
{
const struct cred *cred = current_cred();
kuid_t uid = make_kuid(cred->user_ns, creds->uid);
kgid_t gid = make_kgid(cred->user_ns, creds->gid);
if (!uid_valid(uid) || !gid_valid(gid))
return -EINVAL;
if ((creds->pid == task_tgid_vnr(current) ||
ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) ||
uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) ||
gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) {
return 0;
}
return -EPERM;
}
static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
{
int *fdp = (int*)CMSG_DATA(cmsg);
struct scm_fp_list *fpl = *fplp;
struct file **fpp;
int i, num;
num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int);
if (num <= 0)
return 0;
if (num > SCM_MAX_FD)
return -EINVAL;
if (!fpl)
{
fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
}
fpp = &fpl->fp[fpl->count];
if (fpl->count + num > fpl->max)
return -EINVAL;
/*
* Verify the descriptors and increment the usage count.
*/
for (i=0; i< num; i++)
{
int fd = fdp[i];
struct file *file;
if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
*fpp++ = file;
fpl->count++;
}
return num;
}
void __scm_destroy(struct scm_cookie *scm)
{
struct scm_fp_list *fpl = scm->fp;
int i;
if (fpl) {
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
kfree(fpl);
}
}
EXPORT_SYMBOL(__scm_destroy);
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
struct cmsghdr *cmsg;
int err;
for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
{
err = -EINVAL;
/* Verify that cmsg_len is at least sizeof(struct cmsghdr) */
/* The first check was omitted in <= 2.2.5. The reasoning was
that the parser checks cmsg_len in any case, so the
additional check would be duplicated work.
But if cmsg_level is not SOL_SOCKET, we do not check
for a too-short ancillary data object at all! Oops.
OK, let's add it...
*/
if (!CMSG_OK(msg, cmsg))
goto error;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
if (!sock->ops || sock->ops->family != PF_UNIX)
goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
goto error;
break;
case SCM_CREDENTIALS:
{
struct ucred creds;
kuid_t uid;
kgid_t gid;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
goto error;
memcpy(&creds, CMSG_DATA(cmsg), sizeof(struct ucred));
err = scm_check_creds(&creds);
if (err)
goto error;
p->creds.pid = creds.pid;
if (!p->pid || pid_vnr(p->pid) != creds.pid) {
struct pid *pid;
err = -ESRCH;
pid = find_get_pid(creds.pid);
if (!pid)
goto error;
put_pid(p->pid);
p->pid = pid;
}
err = -EINVAL;
uid = make_kuid(current_user_ns(), creds.uid);
gid = make_kgid(current_user_ns(), creds.gid);
if (!uid_valid(uid) || !gid_valid(gid))
goto error;
p->creds.uid = uid;
p->creds.gid = gid;
break;
}
default:
goto error;
}
}
if (p->fp && !p->fp->count)
{
kfree(p->fp);
p->fp = NULL;
}
return 0;
error:
scm_destroy(p);
return err;
}
EXPORT_SYMBOL(__scm_send);
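For context, a user-space sketch of the classic call that arrives here as an SCM_RIGHTS control message and is picked apart by scm_fp_copy() above (assumes sock_fd is a connected AF_UNIX socket and fd_to_send an open descriptor):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Pass one file descriptor over an AF_UNIX socket; returns 0 on success. */
static int send_fd(int sock_fd, int fd_to_send)
{
	char dummy = '*';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		struct cmsghdr align;		/* force correct alignment */
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg;

	memset(u.buf, 0, sizeof(u.buf));
	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));

	return sendmsg(sock_fd, &msg, 0) == 1 ? 0 : -1;
}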
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
struct cmsghdr __user *cm
= (__force struct cmsghdr __user *)msg->msg_control;
struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
int err;
if (MSG_CMSG_COMPAT & msg->msg_flags)
return put_cmsg_compat(msg, level, type, len, data);
if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
if (msg->msg_controllen < cmlen) {
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
cmhdr.cmsg_level = level;
cmhdr.cmsg_type = type;
cmhdr.cmsg_len = cmlen;
err = -EFAULT;
if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
goto out;
if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
goto out;
cmlen = CMSG_SPACE(len);
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
err = 0;
out:
return err;
}
EXPORT_SYMBOL(put_cmsg);
void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
{
struct cmsghdr __user *cm
= (__force struct cmsghdr __user*)msg->msg_control;
int fdmax = 0;
int fdnum = scm->fp->count;
struct file **fp = scm->fp->fp;
int __user *cmfptr;
int err = 0, i;
if (MSG_CMSG_COMPAT & msg->msg_flags) {
scm_detach_fds_compat(msg, scm);
return;
}
if (msg->msg_controllen > sizeof(struct cmsghdr))
fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
/ sizeof(int));
if (fdnum < fdmax)
fdmax = fdnum;
for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
i++, cmfptr++)
{
struct socket *sock;
int new_fd;
err = security_file_receive(fp[i]);
if (err)
break;
err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
? O_CLOEXEC : 0);
if (err < 0)
break;
new_fd = err;
err = put_user(new_fd, cmfptr);
if (err) {
put_unused_fd(new_fd);
break;
}
/* Bump the usage count and install the file. */
sock = sock_from_file(fp[i], &err);
if (sock) {
sock_update_netprioidx(sock->sk);
sock_update_classid(sock->sk);
}
fd_install(new_fd, get_file(fp[i]));
}
if (i > 0)
{
int cmlen = CMSG_LEN(i*sizeof(int));
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_SPACE(i*sizeof(int));
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
}
if (i < fdnum || (fdnum && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
* All of the files that fit in the message have had their
* usage counts incremented, so we just free the list.
*/
__scm_destroy(scm);
}
EXPORT_SYMBOL(scm_detach_fds);
struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
{
struct scm_fp_list *new_fpl;
int i;
if (!fpl)
return NULL;
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
GFP_KERNEL);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
}
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);

173
net/core/secure_seq.c Normal file
View file

@ -0,0 +1,173 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cryptohash.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/random.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/string.h>
#include <linux/net.h>
#include <net/secure_seq.h>
#if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
#define NET_SECRET_SIZE (MD5_MESSAGE_BYTES / 4)
static u32 net_secret[NET_SECRET_SIZE] ____cacheline_aligned;
static __always_inline void net_secret_init(void)
{
net_get_random_once(net_secret, sizeof(net_secret));
}
#endif
#ifdef CONFIG_INET
static u32 seq_scale(u32 seq)
{
/*
* As close as possible to RFC 793, which
* suggests using a 250 kHz clock.
* Further reading shows this assumes 2 Mb/s networks.
* For 10 Mb/s Ethernet, a 1 MHz clock is appropriate.
* For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but
* we also need to limit the resolution so that the u32 seq
* overlaps less than one time per MSL (2 minutes).
* Choosing a clock of 64 ns period is OK. (period of 274 s)
*/
return seq + (ktime_get_real_ns() >> 6);
}
#endif
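Putting numbers on the comment above (simple arithmetic, not from the commit): a 64 ns tick means the 32-bit offset added by seq_scale() wraps only after 2^32 * 64 ns ≈ 275 seconds, well beyond the 2 minute MSL, while still advancing at roughly 15.6 million increments per second.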
#if IS_ENABLED(CONFIG_IPV6)
__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u32 i;
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + (__force u32)daddr[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
secret[i] = net_secret[i];
md5_transform(hash, secret);
return seq_scale(hash[0]);
}
EXPORT_SYMBOL(secure_tcpv6_sequence_number);
u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u32 i;
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + (__force u32) daddr[i];
secret[4] = net_secret[4] + (__force u32)dport;
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
secret[i] = net_secret[i];
md5_transform(hash, secret);
return hash[0];
}
EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
__u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
u32 hash[MD5_DIGEST_WORDS];
net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
hash[3] = net_secret[15];
md5_transform(hash, net_secret);
return seq_scale(hash[0]);
}
u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
u32 hash[MD5_DIGEST_WORDS];
net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = (__force u32)dport ^ net_secret[14];
hash[3] = net_secret[15];
md5_transform(hash, net_secret);
return hash[0];
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
#if IS_ENABLED(CONFIG_IP_DCCP)
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
u32 hash[MD5_DIGEST_WORDS];
u64 seq;
net_secret_init();
hash[0] = (__force u32)saddr;
hash[1] = (__force u32)daddr;
hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
hash[3] = net_secret[15];
md5_transform(hash, net_secret);
seq = hash[0] | (((u64)hash[1]) << 32);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
}
EXPORT_SYMBOL(secure_dccp_sequence_number);
#if IS_ENABLED(CONFIG_IPV6)
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
u32 hash[MD5_DIGEST_WORDS];
u64 seq;
u32 i;
net_secret_init();
memcpy(hash, saddr, 16);
for (i = 0; i < 4; i++)
secret[i] = net_secret[i] + daddr[i];
secret[4] = net_secret[4] +
(((__force u16)sport << 16) + (__force u16)dport);
for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++)
secret[i] = net_secret[i];
md5_transform(hash, secret);
seq = hash[0] | (((u64)hash[1]) << 32);
seq += ktime_get_real_ns();
seq &= (1ull << 48) - 1;
return seq;
}
EXPORT_SYMBOL(secure_dccpv6_sequence_number);
#endif
#endif

4231
net/core/skbuff.c Normal file

File diff suppressed because it is too large

2956
net/core/sock.c Normal file

File diff suppressed because it is too large

248
net/core/sock_diag.c Normal file
View file

@ -0,0 +1,248 @@
#include <linux/mutex.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <linux/module.h>
#include <net/sock.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
int sock_diag_check_cookie(void *sk, __u32 *cookie)
{
if ((cookie[0] != INET_DIAG_NOCOOKIE ||
cookie[1] != INET_DIAG_NOCOOKIE) &&
((u32)(unsigned long)sk != cookie[0] ||
(u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
return -ESTALE;
else
return 0;
}
EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
void sock_diag_save_cookie(void *sk, __u32 *cookie)
{
cookie[0] = (u32)(unsigned long)sk;
cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
}
EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
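How the two helpers pair up in a diag module (an assumed fragment; sk and req_cookie stand for a socket being dumped and the cookie echoed back by user space). The ">> 31 >> 1" above simply splits the pointer into two 32-bit halves without performing an undefined 32-bit shift on 32-bit builds.

	__u32 cookie[2];

	sock_diag_save_cookie(sk, cookie);	/* e.g. into r->id.idiag_cookie */

	/* ... later, when a request names one specific socket ... */
	if (sock_diag_check_cookie(sk, req_cookie))
		return -ESTALE;			/* socket pointer no longer matches */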
int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
{
u32 mem[SK_MEMINFO_VARS];
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
return nla_put(skb, attrtype, sizeof(mem), &mem);
}
EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk,
struct sk_buff *skb, int attrtype)
{
struct sock_fprog_kern *fprog;
struct sk_filter *filter;
struct nlattr *attr;
unsigned int flen;
int err = 0;
if (!may_report_filterinfo) {
nla_reserve(skb, attrtype, 0);
return 0;
}
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
if (!filter)
goto out;
fprog = filter->prog->orig_prog;
flen = bpf_classic_proglen(fprog);
attr = nla_reserve(skb, attrtype, flen);
if (attr == NULL) {
err = -EMSGSIZE;
goto out;
}
memcpy(nla_data(attr), fprog->filter, flen);
out:
rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(sock_diag_put_filterinfo);
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
mutex_lock(&sock_diag_table_mutex);
inet_rcv_compat = fn;
mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
{
mutex_lock(&sock_diag_table_mutex);
inet_rcv_compat = NULL;
mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
int sock_diag_register(const struct sock_diag_handler *hndl)
{
int err = 0;
if (hndl->family >= AF_MAX)
return -EINVAL;
mutex_lock(&sock_diag_table_mutex);
if (sock_diag_handlers[hndl->family])
err = -EBUSY;
else
sock_diag_handlers[hndl->family] = hndl;
mutex_unlock(&sock_diag_table_mutex);
return err;
}
EXPORT_SYMBOL_GPL(sock_diag_register);
void sock_diag_unregister(const struct sock_diag_handler *hnld)
{
int family = hnld->family;
if (family >= AF_MAX)
return;
mutex_lock(&sock_diag_table_mutex);
BUG_ON(sock_diag_handlers[family] != hnld);
sock_diag_handlers[family] = NULL;
mutex_unlock(&sock_diag_table_mutex);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int err;
struct sock_diag_req *req = nlmsg_data(nlh);
const struct sock_diag_handler *hndl;
if (nlmsg_len(nlh) < sizeof(*req))
return -EINVAL;
if (req->sdiag_family >= AF_MAX)
return -EINVAL;
if (sock_diag_handlers[req->sdiag_family] == NULL)
request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
NETLINK_SOCK_DIAG, req->sdiag_family);
mutex_lock(&sock_diag_table_mutex);
hndl = sock_diag_handlers[req->sdiag_family];
if (hndl == NULL)
err = -ENOENT;
else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
else if (nlh->nlmsg_type == SOCK_DESTROY_BACKPORT && hndl->destroy)
err = hndl->destroy(skb, nlh);
else
err = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
return err;
}
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int ret;
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
case DCCPDIAG_GETSOCK:
if (inet_rcv_compat == NULL)
request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
NETLINK_SOCK_DIAG, AF_INET);
mutex_lock(&sock_diag_table_mutex);
if (inet_rcv_compat != NULL)
ret = inet_rcv_compat(skb, nlh);
else
ret = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
return ret;
case SOCK_DIAG_BY_FAMILY:
case SOCK_DESTROY_BACKPORT:
return __sock_diag_cmd(skb, nlh);
default:
return -EINVAL;
}
}
static DEFINE_MUTEX(sock_diag_mutex);
static void sock_diag_rcv(struct sk_buff *skb)
{
mutex_lock(&sock_diag_mutex);
netlink_rcv_skb(skb, &sock_diag_rcv_msg);
mutex_unlock(&sock_diag_mutex);
}
int sock_diag_destroy(struct sock *sk, int err)
{
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (!sk->sk_prot->diag_destroy)
return -EOPNOTSUPP;
return sk->sk_prot->diag_destroy(sk, err);
}
EXPORT_SYMBOL_GPL(sock_diag_destroy);
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = sock_diag_rcv,
};
net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg);
return net->diag_nlsk == NULL ? -ENOMEM : 0;
}
static void __net_exit diag_net_exit(struct net *net)
{
netlink_kernel_release(net->diag_nlsk);
net->diag_nlsk = NULL;
}
static struct pernet_operations diag_net_ops = {
.init = diag_net_init,
.exit = diag_net_exit,
};
static int __init sock_diag_init(void)
{
return register_pernet_subsys(&diag_net_ops);
}
static void __exit sock_diag_exit(void)
{
unregister_pernet_subsys(&diag_net_ops);
}
module_init(sock_diag_init);
module_exit(sock_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);

208
net/core/stream.c Normal file
View file

@ -0,0 +1,208 @@
/*
* SUCS NET3:
*
* Generic stream handling routines. These are generic for most
* protocols. Even IP. Tonight 8-).
* This is used because TCP, LLC (others too) layer all have mostly
* identical sendmsg() and recvmsg() code.
* So we (will) share it here.
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* (from old tcp.c code)
* Alan Cox <alan@lxorguk.ukuu.org.uk> (Borrowed comments 8-))
*/
#include <linux/module.h>
#include <linux/net.h>
#include <linux/signal.h>
#include <linux/tcp.h>
#include <linux/wait.h>
#include <net/sock.h>
/**
* sk_stream_write_space - stream socket write_space callback.
* @sk: socket
*
 * Called when transmit buffer space becomes available again: if the socket
 * is now writeable, clears SOCK_NOSPACE, wakes any writer sleeping on
 * POLLOUT and, unless the send side has been shut down, notifies
 * asynchronous (SIGIO) waiters.
*/
void sk_stream_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
struct socket_wq *wq;
if (sk_stream_is_writeable(sk) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
}
EXPORT_SYMBOL(sk_stream_write_space);
/**
* sk_stream_wait_connect - Wait for a socket to get into the connected state
* @sk: sock to wait on
* @timeo_p: for how long to wait
*
* Must be called with the socket locked.
*/
int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
{
struct task_struct *tsk = current;
DEFINE_WAIT(wait);
int done;
do {
int err = sock_error(sk);
if (err)
return err;
if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
return -EPIPE;
if (!*timeo_p)
return -EAGAIN;
if (signal_pending(tsk))
return sock_intr_errno(*timeo_p);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
!sk->sk_err &&
!((1 << sk->sk_state) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)));
finish_wait(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
return 0;
}
EXPORT_SYMBOL(sk_stream_wait_connect);
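/*
 * Illustrative sketch only: the usual calling pattern for the helper above,
 * modelled on tcp_sendmsg(). example_stream_sendmsg_prologue() is a
 * hypothetical name; sock_sndtimeo() and lock_sock()/release_sock() are the
 * ordinary kernel helpers.
 */
static int example_stream_sendmsg_prologue(struct sock *sk, int msg_flags)
{
	long timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT);
	int err = 0;

	lock_sock(sk);
	/* Wait until the connection is established (or fail fast). */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		err = sk_stream_wait_connect(sk, &timeo);
	release_sock(sk);
	return err;
}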
/**
 * sk_stream_closing - Return 1 if the socket is still in a closing state
 * (FIN_WAIT1, CLOSING or LAST_ACK) and may therefore have data left to send.
* @sk: socket to verify
*/
static inline int sk_stream_closing(struct sock *sk)
{
return (1 << sk->sk_state) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
}
void sk_stream_wait_close(struct sock *sk, long timeout)
{
if (timeout) {
DEFINE_WAIT(wait);
do {
prepare_to_wait(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk)))
break;
} while (!signal_pending(current) && timeout);
finish_wait(sk_sleep(sk), &wait);
}
}
EXPORT_SYMBOL(sk_stream_wait_close);
/**
* sk_stream_wait_memory - Wait for more memory for a socket
* @sk: socket to wait for memory
* @timeo_p: for how long
*/
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
int err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
DEFINE_WAIT(wait);
if (sk_stream_memory_free(sk))
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
while (1) {
set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
if (!*timeo_p)
goto do_nonblock;
if (signal_pending(current))
goto do_interrupted;
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
if (sk_stream_memory_free(sk) && !vm_wait)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
sk_wait_event(sk, &current_timeo, sk->sk_err ||
(sk->sk_shutdown & SEND_SHUTDOWN) ||
(sk_stream_memory_free(sk) &&
!vm_wait));
sk->sk_write_pending--;
if (vm_wait) {
vm_wait -= current_timeo;
current_timeo = *timeo_p;
if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
(current_timeo -= vm_wait) < 0)
current_timeo = 0;
vm_wait = 0;
}
*timeo_p = current_timeo;
}
out:
finish_wait(sk_sleep(sk), &wait);
return err;
do_error:
err = -EPIPE;
goto out;
do_nonblock:
err = -EAGAIN;
goto out;
do_interrupted:
err = sock_intr_errno(*timeo_p);
goto out;
}
EXPORT_SYMBOL(sk_stream_wait_memory);
int sk_stream_error(struct sock *sk, int flags, int err)
{
if (err == -EPIPE)
err = sock_error(sk) ? : -EPIPE;
if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
return err;
}
EXPORT_SYMBOL(sk_stream_error);
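/*
 * Illustrative sketch only: how a protocol's sendmsg() path typically pairs
 * sk_stream_wait_memory() with sk_stream_error(), modelled on tcp_sendmsg().
 * example_wait_for_sndbuf() is a hypothetical name.
 */
static int example_wait_for_sndbuf(struct sock *sk, long *timeo, int msg_flags)
{
	int err;

	if (sk_stream_memory_free(sk))
		return 0;

	set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
	err = sk_stream_wait_memory(sk, timeo);
	if (err)
		/* Translates -EPIPE into the pending socket error and raises
		 * SIGPIPE unless MSG_NOSIGNAL was passed by the caller. */
		err = sk_stream_error(sk, msg_flags, err);
	return err;
}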
void sk_stream_kill_queues(struct sock *sk)
{
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
/* Next, the error queue. */
__skb_queue_purge(&sk->sk_error_queue);
/* Next, the write queue. */
WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
sk_mem_reclaim(sk);
WARN_ON(sk->sk_wmem_queued);
WARN_ON(sk->sk_forward_alloc);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
 * have gone away; only the net layer can still touch it.
*/
}
EXPORT_SYMBOL(sk_stream_kill_queues);

435
net/core/sysctl_net_core.c Normal file

@ -0,0 +1,435 @@
/* -*- linux-c -*-
* sysctl_net_core.c: sysctl interface to net core subsystem.
*
* Begun April 1, 1996, Mike Shaver.
* Added /proc/sys/net/core directory entry (empty =) ). [MS]
*/
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
static int zero = 0;
static int ushort_max = USHRT_MAX;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
struct ctl_table tmp = {
.data = &size,
.maxlen = sizeof(size),
.mode = table->mode
};
struct rps_sock_flow_table *orig_sock_table, *sock_table;
static DEFINE_MUTEX(sock_flow_mutex);
mutex_lock(&sock_flow_mutex);
orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
lockdep_is_held(&sock_flow_mutex));
size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (write) {
if (size) {
if (size > 1<<30) {
/* Enforce limit to prevent overflow */
mutex_unlock(&sock_flow_mutex);
return -EINVAL;
}
size = roundup_pow_of_two(size);
if (size != orig_size) {
sock_table =
vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
if (!sock_table) {
mutex_unlock(&sock_flow_mutex);
return -ENOMEM;
}
sock_table->mask = size - 1;
} else
sock_table = orig_sock_table;
for (i = 0; i < size; i++)
sock_table->ents[i] = RPS_NO_CPU;
} else
sock_table = NULL;
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
if (sock_table)
static_key_slow_inc(&rps_needed);
if (orig_sock_table) {
static_key_slow_dec(&rps_needed);
synchronize_rcu();
vfree(orig_sock_table);
}
}
}
mutex_unlock(&sock_flow_mutex);
return ret;
}
#endif /* CONFIG_RPS */
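/*
 * Illustrative example only, and user-space rather than kernel code: writing
 * /proc/sys/net/core/rps_sock_flow_entries exercises rps_sock_flow_sysctl()
 * above. The value is rounded up to a power of two and a table of that many
 * entries is allocated; values above 1 << 30 are rejected.
 */
#if 0	/* user-space sketch, not built with the kernel */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/rps_sock_flow_entries", "w");

	if (!f)
		return 1;
	fprintf(f, "32768\n");	/* already a power of two, kept as-is */
	return fclose(f) ? 1 : 0;
}
#endif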
#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);
static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
struct sd_flow_limit *cur;
struct softnet_data *sd;
cpumask_var_t mask;
int i, len, ret = 0;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
if (write) {
ret = cpumask_parse_user(buffer, *lenp, mask);
if (ret)
goto done;
mutex_lock(&flow_limit_update_mutex);
len = sizeof(*cur) + netdev_flow_limit_table_len;
for_each_possible_cpu(i) {
sd = &per_cpu(softnet_data, i);
cur = rcu_dereference_protected(sd->flow_limit,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
synchronize_rcu();
kfree(cur);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
if (!cur) {
/* not unwinding previous changes */
ret = -ENOMEM;
goto write_unlock;
}
cur->num_buckets = netdev_flow_limit_table_len;
rcu_assign_pointer(sd->flow_limit, cur);
}
}
write_unlock:
mutex_unlock(&flow_limit_update_mutex);
} else {
char kbuf[128];
if (*ppos || !*lenp) {
*lenp = 0;
goto done;
}
cpumask_clear(mask);
rcu_read_lock();
for_each_possible_cpu(i) {
sd = &per_cpu(softnet_data, i);
if (rcu_dereference(sd->flow_limit))
cpumask_set_cpu(i, mask);
}
rcu_read_unlock();
len = min(sizeof(kbuf) - 1, *lenp);
len = cpumask_scnprintf(kbuf, len, mask);
if (!len) {
*lenp = 0;
goto done;
}
if (len < *lenp)
kbuf[len++] = '\n';
if (copy_to_user(buffer, kbuf, len)) {
ret = -EFAULT;
goto done;
}
*lenp = len;
*ppos += len;
}
done:
free_cpumask_var(mask);
return ret;
}
static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
unsigned int old, *ptr;
int ret;
mutex_lock(&flow_limit_update_mutex);
ptr = table->data;
old = *ptr;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (!ret && write && !is_power_of_2(*ptr)) {
*ptr = old;
ret = -EINVAL;
}
mutex_unlock(&flow_limit_update_mutex);
return ret;
}
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_SCHED
static int set_default_qdisc(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
struct ctl_table tbl = {
.data = id,
.maxlen = IFNAMSIZ,
};
int ret;
qdisc_get_default(id, IFNAMSIZ);
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0)
ret = qdisc_set_default(id);
return ret;
}
#endif
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
.procname = "wmem_max",
.data = &sysctl_wmem_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_sndbuf,
},
{
.procname = "rmem_max",
.data = &sysctl_rmem_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_rcvbuf,
},
{
.procname = "wmem_default",
.data = &sysctl_wmem_default,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_sndbuf,
},
{
.procname = "rmem_default",
.data = &sysctl_rmem_default,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_rcvbuf,
},
{
.procname = "dev_weight",
.data = &weight_p,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "netdev_max_backlog",
.data = &netdev_max_backlog,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#ifdef CONFIG_BPF_JIT
{
.procname = "bpf_jit_enable",
.data = &bpf_jit_enable,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#endif
{
.procname = "netdev_tstamp_prequeue",
.data = &netdev_tstamp_prequeue,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "message_cost",
.data = &net_ratelimit_state.interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "message_burst",
.data = &net_ratelimit_state.burst,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "optmem_max",
.data = &sysctl_optmem_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#ifdef CONFIG_RPS
{
.procname = "rps_sock_flow_entries",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = rps_sock_flow_sysctl
},
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
{
.procname = "flow_limit_cpu_bitmap",
.mode = 0644,
.proc_handler = flow_limit_cpu_sysctl
},
{
.procname = "flow_limit_table_len",
.data = &netdev_flow_limit_table_len,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = flow_limit_table_len_sysctl
},
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_RX_BUSY_POLL
{
.procname = "busy_poll",
.data = &sysctl_net_busy_poll,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "busy_read",
.data = &sysctl_net_busy_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec
},
#endif
#ifdef CONFIG_NET_SCHED
{
.procname = "default_qdisc",
.mode = 0644,
.maxlen = IFNAMSIZ,
.proc_handler = set_default_qdisc
},
#endif
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
.data = &netdev_budget,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
.procname = "warnings",
.data = &net_msg_warn,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{ }
};
static struct ctl_table netns_core_table[] = {
{
.procname = "somaxconn",
.data = &init_net.core.sysctl_somaxconn,
.maxlen = sizeof(int),
.mode = 0644,
.extra1 = &zero,
.extra2 = &ushort_max,
.proc_handler = proc_dointvec_minmax
},
{ }
};
static __net_init int sysctl_core_net_init(struct net *net)
{
struct ctl_table *tbl;
net->core.sysctl_somaxconn = SOMAXCONN;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
tbl[0].data = &net->core.sysctl_somaxconn;
/* Don't export any sysctls to unprivileged users */
if (net->user_ns != &init_user_ns) {
tbl[0].procname = NULL;
}
}
net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
if (net->core.sysctl_hdr == NULL)
goto err_reg;
return 0;
err_reg:
if (tbl != netns_core_table)
kfree(tbl);
err_dup:
return -ENOMEM;
}
static __net_exit void sysctl_core_net_exit(struct net *net)
{
struct ctl_table *tbl;
tbl = net->core.sysctl_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->core.sysctl_hdr);
BUG_ON(tbl == netns_core_table);
kfree(tbl);
}
static __net_initdata struct pernet_operations sysctl_core_ops = {
.init = sysctl_core_net_init,
.exit = sysctl_core_net_exit,
};
static __init int sysctl_core_init(void)
{
register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops);
}
fs_initcall(sysctl_core_init);

80
net/core/timestamping.c Normal file

@ -0,0 +1,80 @@
/*
* PTP 1588 clock support - support for timestamping in PHY devices
*
* Copyright (C) 2010 OMICRON electronics GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/errqueue.h>
#include <linux/phy.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
static unsigned int classify(const struct sk_buff *skb)
{
if (likely(skb->dev && skb->dev->phydev &&
skb->dev->phydev->drv))
return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
}
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
if (!skb->sk)
return;
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
phydev = skb->dev->phydev;
if (likely(phydev->drv->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
return;
phydev->drv->txtstamp(phydev, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
struct phy_device *phydev;
unsigned int type;
if (skb_headroom(skb) < ETH_HLEN)
return false;
__skb_push(skb, ETH_HLEN);
type = classify(skb);
__skb_pull(skb, ETH_HLEN);
if (type == PTP_CLASS_NONE)
return false;
phydev = skb->dev->phydev;
if (likely(phydev->drv->rxtstamp))
return phydev->drv->rxtstamp(phydev, skb, type);
return false;
}
EXPORT_SYMBOL_GPL(skb_defer_rx_timestamp);
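/*
 * Illustrative sketch only: the shape of the PHY driver callbacks invoked
 * above. example_txtstamp()/example_rxtstamp() are hypothetical; a real
 * driver (e.g. dp83640) would queue the skb to its timestamping hardware and
 * later complete it with skb_complete_tx_timestamp() or netif_rx().
 */
static void example_txtstamp(struct phy_device *phydev,
			     struct sk_buff *skb, int type)
{
	/* No hardware here: free the clone handed over by
	 * skb_clone_tx_timestamp() so it is not leaked. */
	kfree_skb(skb);
}

static bool example_rxtstamp(struct phy_device *phydev,
			     struct sk_buff *skb, int type)
{
	/* Returning false tells skb_defer_rx_timestamp() to let the stack
	 * deliver the packet immediately, without a hardware timestamp. */
	return false;
}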

78
net/core/tso.c Normal file

@ -0,0 +1,78 @@
#include <linux/export.h>
#include <net/ip.h>
#include <net/tso.h>
#include <asm/unaligned.h>
/* Calculate expected number of TX descriptors */
int tso_count_descs(struct sk_buff *skb)
{
/* The Marvell Way */
return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
}
EXPORT_SYMBOL(tso_count_descs);
void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
{
struct iphdr *iph;
struct tcphdr *tcph;
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
int mac_hdr_len = skb_network_offset(skb);
memcpy(hdr, skb->data, hdr_len);
iph = (struct iphdr *)(hdr + mac_hdr_len);
iph->id = htons(tso->ip_id);
iph->tot_len = htons(size + hdr_len - mac_hdr_len);
tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
put_unaligned_be32(tso->tcp_seq, &tcph->seq);
tso->ip_id++;
if (!is_last) {
/* Clear all special flags for not last packet */
tcph->psh = 0;
tcph->fin = 0;
tcph->rst = 0;
}
}
EXPORT_SYMBOL(tso_build_hdr);
void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
{
tso->tcp_seq += size;
tso->size -= size;
tso->data += size;
if ((tso->size == 0) &&
(tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
tso->size = frag->size;
tso->data = page_address(frag->page.p) + frag->page_offset;
tso->next_frag_idx++;
}
}
EXPORT_SYMBOL(tso_build_data);
void tso_start(struct sk_buff *skb, struct tso_t *tso)
{
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
tso->ip_id = ntohs(ip_hdr(skb)->id);
tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
tso->next_frag_idx = 0;
/* Build first data */
tso->size = skb_headlen(skb) - hdr_len;
tso->data = skb->data + hdr_len;
if ((tso->size == 0) &&
(tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
tso->size = frag->size;
tso->data = page_address(frag->page.p) + frag->page_offset;
tso->next_frag_idx++;
}
}
EXPORT_SYMBOL(tso_start);
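/*
 * Illustrative sketch only: the calling pattern these helpers are designed
 * for, modelled on drivers such as mvneta. example_xmit_tso() and the
 * "queue a descriptor" steps are hypothetical placeholders for real ring
 * handling; a real driver reserves tso_count_descs(skb) slots up front and
 * uses one header buffer per segment.
 */
static void example_xmit_tso(struct sk_buff *skb, char *hdr_buf)
{
	int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
	int total_len = skb->len - hdr_len;
	struct tso_t tso;

	tso_start(skb, &tso);
	while (total_len > 0) {
		int data_left = min_t(int, skb_shinfo(skb)->gso_size,
				      total_len);

		total_len -= data_left;
		/* Per-segment header: queue hdr_buf as the first descriptor */
		tso_build_hdr(skb, hdr_buf, &tso, data_left, total_len == 0);
		while (data_left > 0) {
			int size = min_t(int, tso.size, data_left);

			/* queue a data descriptor covering tso.data / size */
			data_left -= size;
			tso_build_data(skb, &tso, size);
		}
	}
}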

387
net/core/utils.c Normal file

@ -0,0 +1,387 @@
/*
 * Generic address resolution entity
*
* Authors:
* net_random Alan Cox
* net_ratelimit Andi Kleen
* in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
*
* Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/ratelimit.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <asm/byteorder.h>
#include <asm/uaccess.h>
int net_msg_warn __read_mostly = 1;
EXPORT_SYMBOL(net_msg_warn);
DEFINE_RATELIMIT_STATE(net_ratelimit_state, 5 * HZ, 10);
/*
* All net warning printk()s should be guarded by this function.
*/
int net_ratelimit(void)
{
return __ratelimit(&net_ratelimit_state);
}
EXPORT_SYMBOL(net_ratelimit);
/*
* Convert an ASCII string to binary IP.
* This is outside of net/ipv4/ because various code that uses IP addresses
* is otherwise not dependent on the TCP/IP stack.
*/
__be32 in_aton(const char *str)
{
unsigned long l;
unsigned int val;
int i;
l = 0;
for (i = 0; i < 4; i++) {
l <<= 8;
if (*str != '\0') {
val = 0;
while (*str != '\0' && *str != '.' && *str != '\n') {
val *= 10;
val += *str - '0';
str++;
}
l |= val;
if (*str != '\0')
str++;
}
}
return htonl(l);
}
EXPORT_SYMBOL(in_aton);
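/*
 * Illustrative example only: in_aton() returns the address already in
 * network byte order, so the result can be stored or compared directly.
 * example_default_gateway() is a hypothetical name.
 */
static inline __be32 example_default_gateway(void)
{
	return in_aton("192.168.1.1");	/* == htonl(0xc0a80101) */
}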
#define IN6PTON_XDIGIT 0x00010000
#define IN6PTON_DIGIT 0x00020000
#define IN6PTON_COLON_MASK 0x00700000
#define IN6PTON_COLON_1 0x00100000 /* single : requested */
#define IN6PTON_COLON_2 0x00200000 /* second : requested */
#define IN6PTON_COLON_1_2 0x00400000 /* :: requested */
#define IN6PTON_DOT 0x00800000 /* . */
#define IN6PTON_DELIM 0x10000000
#define IN6PTON_NULL 0x20000000 /* first/tail */
#define IN6PTON_UNKNOWN 0x40000000
static inline int xdigit2bin(char c, int delim)
{
int val;
if (c == delim || c == '\0')
return IN6PTON_DELIM;
if (c == ':')
return IN6PTON_COLON_MASK;
if (c == '.')
return IN6PTON_DOT;
val = hex_to_bin(c);
if (val >= 0)
return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0);
if (delim == -1)
return IN6PTON_DELIM;
return IN6PTON_UNKNOWN;
}
/**
* in4_pton - convert an IPv4 address from literal to binary representation
* @src: the start of the IPv4 address string
* @srclen: the length of the string, -1 means strlen(src)
* @dst: the binary (u8[4] array) representation of the IPv4 address
* @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter
* @end: A pointer to the end of the parsed string will be placed here
*
 * Returns 1 on success and 0 on any error; in either case, if @end is
 * non-NULL it is set to the character at which parsing stopped.
*
*/
int in4_pton(const char *src, int srclen,
u8 *dst,
int delim, const char **end)
{
const char *s;
u8 *d;
u8 dbuf[4];
int ret = 0;
int i;
int w = 0;
if (srclen < 0)
srclen = strlen(src);
s = src;
d = dbuf;
i = 0;
while(1) {
int c;
c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
if (!(c & (IN6PTON_DIGIT | IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK))) {
goto out;
}
if (c & (IN6PTON_DOT | IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
if (w == 0)
goto out;
*d++ = w & 0xff;
w = 0;
i++;
if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
if (i != 4)
goto out;
break;
}
goto cont;
}
w = (w * 10) + c;
if ((w & 0xffff) > 255) {
goto out;
}
cont:
if (i >= 4)
goto out;
s++;
srclen--;
}
ret = 1;
memcpy(dst, dbuf, sizeof(dbuf));
out:
if (end)
*end = s;
return ret;
}
EXPORT_SYMBOL(in4_pton);
/**
* in6_pton - convert an IPv6 address from literal to binary representation
* @src: the start of the IPv6 address string
* @srclen: the length of the string, -1 means strlen(src)
* @dst: the binary (u8[16] array) representation of the IPv6 address
* @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter
* @end: A pointer to the end of the parsed string will be placed here
*
 * Returns 1 on success and 0 on any error; in either case, if @end is
 * non-NULL it is set to the character at which parsing stopped.
*
*/
int in6_pton(const char *src, int srclen,
u8 *dst,
int delim, const char **end)
{
const char *s, *tok = NULL;
u8 *d, *dc = NULL;
u8 dbuf[16];
int ret = 0;
int i;
int state = IN6PTON_COLON_1_2 | IN6PTON_XDIGIT | IN6PTON_NULL;
int w = 0;
memset(dbuf, 0, sizeof(dbuf));
s = src;
d = dbuf;
if (srclen < 0)
srclen = strlen(src);
while (1) {
int c;
c = xdigit2bin(srclen > 0 ? *s : '\0', delim);
if (!(c & state))
goto out;
if (c & (IN6PTON_DELIM | IN6PTON_COLON_MASK)) {
/* process one 16-bit word */
if (!(state & IN6PTON_NULL)) {
*d++ = (w >> 8) & 0xff;
*d++ = w & 0xff;
}
w = 0;
if (c & IN6PTON_DELIM) {
/* We've processed last word */
break;
}
/*
* COLON_1 => XDIGIT
* COLON_2 => XDIGIT|DELIM
* COLON_1_2 => COLON_2
*/
switch (state & IN6PTON_COLON_MASK) {
case IN6PTON_COLON_2:
dc = d;
state = IN6PTON_XDIGIT | IN6PTON_DELIM;
if (dc - dbuf >= sizeof(dbuf))
state |= IN6PTON_NULL;
break;
case IN6PTON_COLON_1|IN6PTON_COLON_1_2:
state = IN6PTON_XDIGIT | IN6PTON_COLON_2;
break;
case IN6PTON_COLON_1:
state = IN6PTON_XDIGIT;
break;
case IN6PTON_COLON_1_2:
state = IN6PTON_COLON_2;
break;
default:
state = 0;
}
tok = s + 1;
goto cont;
}
if (c & IN6PTON_DOT) {
ret = in4_pton(tok ? tok : s, srclen + (int)(s - tok), d, delim, &s);
if (ret > 0) {
d += 4;
break;
}
goto out;
}
w = (w << 4) | (0xff & c);
state = IN6PTON_COLON_1 | IN6PTON_DELIM;
if (!(w & 0xf000)) {
state |= IN6PTON_XDIGIT;
}
if (!dc && d + 2 < dbuf + sizeof(dbuf)) {
state |= IN6PTON_COLON_1_2;
state &= ~IN6PTON_DELIM;
}
if (d + 2 >= dbuf + sizeof(dbuf)) {
state &= ~(IN6PTON_COLON_1|IN6PTON_COLON_1_2);
}
cont:
if ((dc && d + 4 < dbuf + sizeof(dbuf)) ||
d + 4 == dbuf + sizeof(dbuf)) {
state |= IN6PTON_DOT;
}
if (d >= dbuf + sizeof(dbuf)) {
state &= ~(IN6PTON_XDIGIT|IN6PTON_COLON_MASK);
}
s++;
srclen--;
}
i = 15; d--;
if (dc) {
while(d >= dc)
dst[i--] = *d--;
while(i >= dc - dbuf)
dst[i--] = 0;
while(i >= 0)
dst[i--] = *d--;
} else
memcpy(dst, dbuf, sizeof(dbuf));
ret = 1;
out:
if (end)
*end = s;
return ret;
}
EXPORT_SYMBOL(in6_pton);
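/*
 * Illustrative sketch only: parsing either address family from text, the way
 * callers such as netconsole do. example_parse_addr() is a hypothetical
 * name; the buffers are assumed to be at least 4 and 16 bytes respectively.
 */
static int example_parse_addr(const char *str, u8 *buf4, u8 *buf6)
{
	const char *end;

	/* Accepts "192.0.2.1:1234": parsing stops at the ':' delimiter. */
	if (in4_pton(str, -1, buf4, ':', &end))
		return AF_INET;
	/* -1 == no delimiter: the whole string must be an IPv6 literal. */
	if (in6_pton(str, -1, buf6, -1, &end))
		return AF_INET6;
	return -EINVAL;
}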
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, int pseudohdr)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
*sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from),
to));
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to);
} else if (pseudohdr)
*sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from),
to));
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
int pseudohdr)
{
__be32 diff[] = {
~from[0], ~from[1], ~from[2], ~from[3],
to[0], to[1], to[2], to[3],
};
if (skb->ip_summed != CHECKSUM_PARTIAL) {
*sum = csum_fold(csum_partial(diff, sizeof(diff),
~csum_unfold(*sum)));
if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
skb->csum = ~csum_partial(diff, sizeof(diff),
~skb->csum);
} else if (pseudohdr)
*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
csum_unfold(*sum)));
}
EXPORT_SYMBOL(inet_proto_csum_replace16);
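/*
 * Illustrative sketch only: a NAT-style rewrite of the IPv4 source address,
 * showing why the pseudo-header flag matters. example_rewrite_saddr() is
 * hypothetical and assumes <linux/ip.h> and <linux/tcp.h> are available;
 * csum_replace4() is the plain (non-pseudo-header) helper from
 * <net/checksum.h>.
 */
static void example_rewrite_saddr(struct sk_buff *skb, struct iphdr *iph,
				  struct tcphdr *tcph, __be32 new_saddr)
{
	/* The TCP checksum covers the pseudo header, so pseudohdr = 1. */
	inet_proto_csum_replace4(&tcph->check, skb, iph->saddr, new_saddr, 1);
	/* The IP header checksum does not, so the plain helper suffices. */
	csum_replace4(&iph->check, iph->saddr, new_saddr);
	iph->saddr = new_saddr;
}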
struct __net_random_once_work {
struct work_struct work;
struct static_key *key;
};
static void __net_random_once_deferred(struct work_struct *w)
{
struct __net_random_once_work *work =
container_of(w, struct __net_random_once_work, work);
BUG_ON(!static_key_enabled(work->key));
static_key_slow_dec(work->key);
kfree(work);
}
static void __net_random_once_disable_jump(struct static_key *key)
{
struct __net_random_once_work *w;
w = kmalloc(sizeof(*w), GFP_ATOMIC);
if (!w)
return;
INIT_WORK(&w->work, __net_random_once_deferred);
w->key = key;
schedule_work(&w->work);
}
bool __net_get_random_once(void *buf, int nbytes, bool *done,
struct static_key *once_key)
{
static DEFINE_SPINLOCK(lock);
unsigned long flags;
spin_lock_irqsave(&lock, flags);
if (*done) {
spin_unlock_irqrestore(&lock, flags);
return false;
}
get_random_bytes(buf, nbytes);
*done = true;
spin_unlock_irqrestore(&lock, flags);
__net_random_once_disable_jump(once_key);
return true;
}
EXPORT_SYMBOL(__net_get_random_once);
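/*
 * Illustrative sketch only: the intended consumer of __net_get_random_once()
 * is the net_get_random_once() macro from <linux/net.h>, typically used to
 * lazily seed a hash secret on first use. example_hash() and example_secret
 * are hypothetical; jhash_1word() assumes <linux/jhash.h>.
 */
static u32 example_secret __read_mostly;

static u32 example_hash(__be32 addr)
{
	/* Fills example_secret exactly once, then patches out the branch. */
	net_get_random_once(&example_secret, sizeof(example_secret));
	return jhash_1word((__force u32)addr, example_secret);
}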