mirror of
https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git
synced 2025-10-28 23:08:52 +01:00
Fixed MTP to work with TWRP
This commit is contained in:
commit
f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions
686
net/ipv4/Kconfig
Normal file
686
net/ipv4/Kconfig
Normal file
|
|
@ -0,0 +1,686 @@
|
|||
#
|
||||
# IP configuration
|
||||
#
|
||||
config IP_MULTICAST
|
||||
bool "IP: multicasting"
|
||||
help
|
||||
This is code for addressing several networked computers at once,
|
||||
enlarging your kernel by about 2 KB. You need multicasting if you
|
||||
intend to participate in the MBONE, a high bandwidth network on top
|
||||
of the Internet which carries audio and video broadcasts. More
|
||||
information about the MBONE is on the WWW at
|
||||
<http://www.savetz.com/mbone/>. For most people, it's safe to say N.
|
||||
|
||||
config IP_ADVANCED_ROUTER
|
||||
bool "IP: advanced router"
|
||||
---help---
|
||||
If you intend to run your Linux box mostly as a router, i.e. as a
|
||||
computer that forwards and redistributes network packets, say Y; you
|
||||
will then be presented with several options that allow more precise
|
||||
control about the routing process.
|
||||
|
||||
The answer to this question won't directly affect the kernel:
|
||||
answering N will just cause the configurator to skip all the
|
||||
questions about advanced routing.
|
||||
|
||||
Note that your box can only act as a router if you enable IP
|
||||
forwarding in your kernel; you can do that by saying Y to "/proc
|
||||
file system support" and "Sysctl support" below and executing the
|
||||
line
|
||||
|
||||
echo "1" > /proc/sys/net/ipv4/ip_forward
|
||||
|
||||
at boot time after the /proc file system has been mounted.
|
||||
|
||||
If you turn on IP forwarding, you should consider the rp_filter, which
|
||||
automatically rejects incoming packets if the routing table entry
|
||||
for their source address doesn't match the network interface they're
|
||||
arriving on. This has security advantages because it prevents the
|
||||
so-called IP spoofing, however it can pose problems if you use
|
||||
asymmetric routing (packets from you to a host take a different path
|
||||
than packets from that host to you) or if you operate a non-routing
|
||||
host which has several IP addresses on different interfaces. To turn
|
||||
rp_filter on use:
|
||||
|
||||
echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
|
||||
or
|
||||
echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
|
||||
|
||||
Note that some distributions enable it in startup scripts.
|
||||
For details about rp_filter strict and loose mode read
|
||||
<file:Documentation/networking/ip-sysctl.txt>.
|
||||
|
||||
If unsure, say N here.
|
||||
|
||||
config IP_FIB_TRIE_STATS
|
||||
bool "FIB TRIE statistics"
|
||||
depends on IP_ADVANCED_ROUTER
|
||||
---help---
|
||||
Keep track of statistics on structure of FIB TRIE table.
|
||||
Useful for testing and measuring TRIE performance.
|
||||
|
||||
config IP_MULTIPLE_TABLES
|
||||
bool "IP: policy routing"
|
||||
depends on IP_ADVANCED_ROUTER
|
||||
select FIB_RULES
|
||||
---help---
|
||||
Normally, a router decides what to do with a received packet based
|
||||
solely on the packet's final destination address. If you say Y here,
|
||||
the Linux router will also be able to take the packet's source
|
||||
address into account. Furthermore, the TOS (Type-Of-Service) field
|
||||
of the packet can be used for routing decisions as well.
|
||||
|
||||
If you are interested in this, please see the preliminary
|
||||
documentation at <http://www.compendium.com.ar/policy-routing.txt>
|
||||
and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
|
||||
You will need supporting software from
|
||||
<ftp://ftp.tux.org/pub/net/ip-routing/>.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config IP_ROUTE_MULTIPATH
|
||||
bool "IP: equal cost multipath"
|
||||
depends on IP_ADVANCED_ROUTER
|
||||
help
|
||||
Normally, the routing tables specify a single action to be taken in
|
||||
a deterministic manner for a given packet. If you say Y here
|
||||
however, it becomes possible to attach several actions to a packet
|
||||
pattern, in effect specifying several alternative paths to travel
|
||||
for those packets. The router considers all these paths to be of
|
||||
equal "cost" and chooses one of them in a non-deterministic fashion
|
||||
if a matching packet arrives.
|
||||
|
||||
config IP_ROUTE_VERBOSE
|
||||
bool "IP: verbose route monitoring"
|
||||
depends on IP_ADVANCED_ROUTER
|
||||
help
|
||||
If you say Y here, which is recommended, then the kernel will print
|
||||
verbose messages regarding the routing, for example warnings about
|
||||
received packets which look strange and could be evidence of an
|
||||
attack or a misconfigured system somewhere. The information is
|
||||
handled by the klogd daemon which is responsible for kernel messages
|
||||
("man klogd").
|
||||
|
||||
config IP_ROUTE_CLASSID
|
||||
bool
|
||||
|
||||
config IP_PNP
|
||||
bool "IP: kernel level autoconfiguration"
|
||||
help
|
||||
This enables automatic configuration of IP addresses of devices and
|
||||
of the routing table during kernel boot, based on either information
|
||||
supplied on the kernel command line or by BOOTP or RARP protocols.
|
||||
You need to say Y only for diskless machines requiring network
|
||||
access to boot (in which case you want to say Y to "Root file system
|
||||
on NFS" as well), because all other machines configure the network
|
||||
in their startup scripts.
|
||||
|
||||
config IP_PNP_DHCP
|
||||
bool "IP: DHCP support"
|
||||
depends on IP_PNP
|
||||
---help---
|
||||
If you want your Linux box to mount its whole root file system (the
|
||||
one containing the directory /) from some other computer over the
|
||||
net via NFS and you want the IP address of your computer to be
|
||||
discovered automatically at boot time using the DHCP protocol (a
|
||||
special protocol designed for doing this job), say Y here. In case
|
||||
the boot ROM of your network card was designed for booting Linux and
|
||||
does DHCP itself, providing all necessary information on the kernel
|
||||
command line, you can say N here.
|
||||
|
||||
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
|
||||
must be operating on your network. Read
|
||||
<file:Documentation/filesystems/nfs/nfsroot.txt> for details.
|
||||
|
||||
config IP_PNP_BOOTP
|
||||
bool "IP: BOOTP support"
|
||||
depends on IP_PNP
|
||||
---help---
|
||||
If you want your Linux box to mount its whole root file system (the
|
||||
one containing the directory /) from some other computer over the
|
||||
net via NFS and you want the IP address of your computer to be
|
||||
discovered automatically at boot time using the BOOTP protocol (a
|
||||
special protocol designed for doing this job), say Y here. In case
|
||||
the boot ROM of your network card was designed for booting Linux and
|
||||
does BOOTP itself, providing all necessary information on the kernel
|
||||
command line, you can say N here. If unsure, say Y. Note that if you
|
||||
want to use BOOTP, a BOOTP server must be operating on your network.
|
||||
Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
|
||||
|
||||
config IP_PNP_RARP
|
||||
bool "IP: RARP support"
|
||||
depends on IP_PNP
|
||||
help
|
||||
If you want your Linux box to mount its whole root file system (the
|
||||
one containing the directory /) from some other computer over the
|
||||
net via NFS and you want the IP address of your computer to be
|
||||
discovered automatically at boot time using the RARP protocol (an
|
||||
older protocol which is being obsoleted by BOOTP and DHCP), say Y
|
||||
here. Note that if you want to use RARP, a RARP server must be
|
||||
operating on your network. Read
|
||||
<file:Documentation/filesystems/nfs/nfsroot.txt> for details.
|
||||
|
||||
config NET_IPIP
|
||||
tristate "IP: tunneling"
|
||||
select INET_TUNNEL
|
||||
select NET_IP_TUNNEL
|
||||
---help---
|
||||
Tunneling means encapsulating data of one protocol type within
|
||||
another protocol and sending it over a channel that understands the
|
||||
encapsulating protocol. This particular tunneling driver implements
|
||||
encapsulation of IP within IP, which sounds kind of pointless, but
|
||||
can be useful if you want to make your (or some other) machine
|
||||
appear on a different network than it physically is, or to use
|
||||
mobile-IP facilities (allowing laptops to seamlessly move between
|
||||
networks without changing their IP addresses).
|
||||
|
||||
Saying Y to this option will produce two modules ( = code which can
|
||||
be inserted in and removed from the running kernel whenever you
|
||||
want). Most people won't need this and can say N.
|
||||
|
||||
config NET_IPGRE_DEMUX
|
||||
tristate "IP: GRE demultiplexer"
|
||||
help
|
||||
This is helper module to demultiplex GRE packets on GRE version field criteria.
|
||||
Required by ip_gre and pptp modules.
|
||||
|
||||
config NET_IP_TUNNEL
|
||||
tristate
|
||||
default n
|
||||
|
||||
config NET_IPGRE
|
||||
tristate "IP: GRE tunnels over IP"
|
||||
depends on (IPV6 || IPV6=n) && NET_IPGRE_DEMUX
|
||||
select NET_IP_TUNNEL
|
||||
help
|
||||
Tunneling means encapsulating data of one protocol type within
|
||||
another protocol and sending it over a channel that understands the
|
||||
encapsulating protocol. This particular tunneling driver implements
|
||||
GRE (Generic Routing Encapsulation) and at this time allows
|
||||
encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
|
||||
This driver is useful if the other endpoint is a Cisco router: Cisco
|
||||
likes GRE much better than the other Linux tunneling driver ("IP
|
||||
tunneling" above). In addition, GRE allows multicast redistribution
|
||||
through the tunnel.
|
||||
|
||||
config NET_IPGRE_BROADCAST
|
||||
bool "IP: broadcast GRE over IP"
|
||||
depends on IP_MULTICAST && NET_IPGRE
|
||||
help
|
||||
One application of GRE/IP is to construct a broadcast WAN (Wide Area
|
||||
Network), which looks like a normal Ethernet LAN (Local Area
|
||||
Network), but can be distributed all over the Internet. If you want
|
||||
to do that, say Y here and to "IP multicast routing" below.
|
||||
|
||||
config IP_MROUTE
|
||||
bool "IP: multicast routing"
|
||||
depends on IP_MULTICAST
|
||||
help
|
||||
This is used if you want your machine to act as a router for IP
|
||||
packets that have several destination addresses. It is needed on the
|
||||
MBONE, a high bandwidth network on top of the Internet which carries
|
||||
audio and video broadcasts. In order to do that, you would most
|
||||
likely run the program mrouted. If you haven't heard about it, you
|
||||
don't need it.
|
||||
|
||||
config IP_MROUTE_MULTIPLE_TABLES
|
||||
bool "IP: multicast policy routing"
|
||||
depends on IP_MROUTE && IP_ADVANCED_ROUTER
|
||||
select FIB_RULES
|
||||
help
|
||||
Normally, a multicast router runs a userspace daemon and decides
|
||||
what to do with a multicast packet based on the source and
|
||||
destination addresses. If you say Y here, the multicast router
|
||||
will also be able to take interfaces and packet marks into
|
||||
account and run multiple instances of userspace daemons
|
||||
simultaneously, each one handling a single table.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config IP_PIMSM_V1
|
||||
bool "IP: PIM-SM version 1 support"
|
||||
depends on IP_MROUTE
|
||||
help
|
||||
Kernel side support for Sparse Mode PIM (Protocol Independent
|
||||
Multicast) version 1. This multicast routing protocol is used widely
|
||||
because Cisco supports it. You need special software to use it
|
||||
(pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
|
||||
information about PIM.
|
||||
|
||||
Say Y if you want to use PIM-SM v1. Note that you can say N here if
|
||||
you just want to use Dense Mode PIM.
|
||||
|
||||
config IP_PIMSM_V2
|
||||
bool "IP: PIM-SM version 2 support"
|
||||
depends on IP_MROUTE
|
||||
help
|
||||
Kernel side support for Sparse Mode PIM version 2. In order to use
|
||||
this, you need an experimental routing daemon supporting it (pimd or
|
||||
gated-5). This routing protocol is not used widely, so say N unless
|
||||
you want to play with it.
|
||||
|
||||
config SYN_COOKIES
|
||||
bool "IP: TCP syncookie support"
|
||||
---help---
|
||||
Normal TCP/IP networking is open to an attack known as "SYN
|
||||
flooding". This denial-of-service attack prevents legitimate remote
|
||||
users from being able to connect to your computer during an ongoing
|
||||
attack and requires very little work from the attacker, who can
|
||||
operate from anywhere on the Internet.
|
||||
|
||||
SYN cookies provide protection against this type of attack. If you
|
||||
say Y here, the TCP/IP stack will use a cryptographic challenge
|
||||
protocol known as "SYN cookies" to enable legitimate users to
|
||||
continue to connect, even when your machine is under attack. There
|
||||
is no need for the legitimate users to change their TCP/IP software;
|
||||
SYN cookies work transparently to them. For technical information
|
||||
about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
|
||||
|
||||
If you are SYN flooded, the source address reported by the kernel is
|
||||
likely to have been forged by the attacker; it is only reported as
|
||||
an aid in tracing the packets to their actual source and should not
|
||||
be taken as absolute truth.
|
||||
|
||||
SYN cookies may prevent correct error reporting on clients when the
|
||||
server is really overloaded. If this happens frequently better turn
|
||||
them off.
|
||||
|
||||
If you say Y here, you can disable SYN cookies at run time by
|
||||
saying Y to "/proc file system support" and
|
||||
"Sysctl support" below and executing the command
|
||||
|
||||
echo 0 > /proc/sys/net/ipv4/tcp_syncookies
|
||||
|
||||
after the /proc file system has been mounted.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_IPVTI
|
||||
tristate "Virtual (secure) IP: tunneling"
|
||||
select INET_TUNNEL
|
||||
select NET_IP_TUNNEL
|
||||
depends on INET_XFRM_MODE_TUNNEL
|
||||
---help---
|
||||
Tunneling means encapsulating data of one protocol type within
|
||||
another protocol and sending it over a channel that understands the
|
||||
encapsulating protocol. This can be used with xfrm mode tunnel to give
|
||||
the notion of a secure tunnel for IPSEC and then use routing protocol
|
||||
on top.
|
||||
|
||||
config NET_UDP_TUNNEL
|
||||
tristate
|
||||
select NET_IP_TUNNEL
|
||||
default n
|
||||
|
||||
config NET_FOU
|
||||
tristate "IP: Foo (IP protocols) over UDP"
|
||||
select XFRM
|
||||
select NET_UDP_TUNNEL
|
||||
---help---
|
||||
Foo over UDP allows any IP protocol to be directly encapsulated
|
||||
over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
|
||||
network mechanisms and optimizations for UDP (such as ECMP
|
||||
and RSS) can be leveraged to provide better service.
|
||||
|
||||
config GENEVE
|
||||
tristate "Generic Network Virtualization Encapsulation (Geneve)"
|
||||
depends on INET
|
||||
select NET_UDP_TUNNEL
|
||||
---help---
|
||||
This allows one to create Geneve virtual interfaces that provide
|
||||
Layer 2 Networks over Layer 3 Networks. Geneve is often used
|
||||
to tunnel virtual network infrastructure in virtualized environments.
|
||||
For more information see:
|
||||
http://tools.ietf.org/html/draft-gross-geneve-01
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
|
||||
|
||||
config INET_AH
|
||||
tristate "IP: AH transformation"
|
||||
select XFRM_ALGO
|
||||
select CRYPTO
|
||||
select CRYPTO_HMAC
|
||||
select CRYPTO_MD5
|
||||
select CRYPTO_SHA1
|
||||
---help---
|
||||
Support for IPsec AH.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_ESP
|
||||
tristate "IP: ESP transformation"
|
||||
select XFRM_ALGO
|
||||
select CRYPTO
|
||||
select CRYPTO_AUTHENC
|
||||
select CRYPTO_HMAC
|
||||
select CRYPTO_MD5
|
||||
select CRYPTO_CBC
|
||||
select CRYPTO_SHA1
|
||||
select CRYPTO_DES
|
||||
---help---
|
||||
Support for IPsec ESP.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_IPCOMP
|
||||
tristate "IP: IPComp transformation"
|
||||
select INET_XFRM_TUNNEL
|
||||
select XFRM_IPCOMP
|
||||
---help---
|
||||
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
|
||||
typically needed for IPsec.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_XFRM_TUNNEL
|
||||
tristate
|
||||
select INET_TUNNEL
|
||||
default n
|
||||
|
||||
config INET_TUNNEL
|
||||
tristate
|
||||
default n
|
||||
|
||||
config INET_XFRM_MODE_TRANSPORT
|
||||
tristate "IP: IPsec transport mode"
|
||||
default y
|
||||
select XFRM
|
||||
---help---
|
||||
Support for IPsec transport mode.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_XFRM_MODE_TUNNEL
|
||||
tristate "IP: IPsec tunnel mode"
|
||||
default y
|
||||
select XFRM
|
||||
---help---
|
||||
Support for IPsec tunnel mode.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_XFRM_MODE_BEET
|
||||
tristate "IP: IPsec BEET mode"
|
||||
default y
|
||||
select XFRM
|
||||
---help---
|
||||
Support for IPsec BEET mode.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_LRO
|
||||
tristate "Large Receive Offload (ipv4/tcp)"
|
||||
default y
|
||||
---help---
|
||||
Support for Large Receive Offload (ipv4/tcp).
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_DIAG
|
||||
tristate "INET: socket monitoring interface"
|
||||
default y
|
||||
---help---
|
||||
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
|
||||
native Linux tools such as ss. ss is included in iproute2, currently
|
||||
downloadable at:
|
||||
|
||||
http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_TCP_DIAG
|
||||
depends on INET_DIAG
|
||||
def_tristate INET_DIAG
|
||||
|
||||
config INET_UDP_DIAG
|
||||
tristate "UDP: socket monitoring interface"
|
||||
depends on INET_DIAG && (IPV6 || IPV6=n)
|
||||
default n
|
||||
---help---
|
||||
Support for UDP socket monitoring interface used by the ss tool.
|
||||
If unsure, say Y.
|
||||
|
||||
config INET_DIAG_DESTROY
|
||||
bool "INET: allow privileged process to administratively close sockets"
|
||||
depends on INET_DIAG
|
||||
default n
|
||||
---help---
|
||||
Provides a SOCK_DESTROY operation that allows privileged processes
|
||||
(e.g., a connection manager or a network administration tool such as
|
||||
ss) to close sockets opened by other processes. Closing a socket in
|
||||
this way interrupts any blocking read/write/connect operations on
|
||||
the socket and causes future socket calls to behave as if the socket
|
||||
had been disconnected.
|
||||
If unsure, say N.
|
||||
|
||||
menuconfig TCP_CONG_ADVANCED
|
||||
bool "TCP: advanced congestion control"
|
||||
---help---
|
||||
Support for selection of various TCP congestion control
|
||||
modules.
|
||||
|
||||
Nearly all users can safely say no here, and a safe default
|
||||
selection will be made (CUBIC with new Reno as a fallback).
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
if TCP_CONG_ADVANCED
|
||||
|
||||
config TCP_CONG_BIC
|
||||
tristate "Binary Increase Congestion (BIC) control"
|
||||
default m
|
||||
---help---
|
||||
BIC-TCP is a sender-side only change that ensures a linear RTT
|
||||
fairness under large windows while offering both scalability and
|
||||
bounded TCP-friendliness. The protocol combines two schemes
|
||||
called additive increase and binary search increase. When the
|
||||
congestion window is large, additive increase with a large
|
||||
increment ensures linear RTT fairness as well as good
|
||||
scalability. Under small congestion windows, binary search
|
||||
increase provides TCP friendliness.
|
||||
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
|
||||
|
||||
config TCP_CONG_CUBIC
|
||||
tristate "CUBIC TCP"
|
||||
default y
|
||||
---help---
|
||||
This is version 2.0 of BIC-TCP which uses a cubic growth function
|
||||
among other techniques.
|
||||
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
|
||||
|
||||
config TCP_CONG_WESTWOOD
|
||||
tristate "TCP Westwood+"
|
||||
default m
|
||||
---help---
|
||||
TCP Westwood+ is a sender-side only modification of the TCP Reno
|
||||
protocol stack that optimizes the performance of TCP congestion
|
||||
control. It is based on end-to-end bandwidth estimation to set
|
||||
congestion window and slow start threshold after a congestion
|
||||
episode. Using this estimation, TCP Westwood+ adaptively sets a
|
||||
slow start threshold and a congestion window which takes into
|
||||
account the bandwidth used at the time congestion is experienced.
|
||||
TCP Westwood+ significantly increases fairness wrt TCP Reno in
|
||||
wired networks and throughput over wireless links.
|
||||
|
||||
config TCP_CONG_HTCP
|
||||
tristate "H-TCP"
|
||||
default m
|
||||
---help---
|
||||
H-TCP is a send-side only modifications of the TCP Reno
|
||||
protocol stack that optimizes the performance of TCP
|
||||
congestion control for high speed network links. It uses a
|
||||
modeswitch to change the alpha and beta parameters of TCP Reno
|
||||
based on network conditions and in a way so as to be fair with
|
||||
other Reno and H-TCP flows.
|
||||
|
||||
config TCP_CONG_HSTCP
|
||||
tristate "High Speed TCP"
|
||||
default n
|
||||
---help---
|
||||
Sally Floyd's High Speed TCP (RFC 3649) congestion control.
|
||||
A modification to TCP's congestion control mechanism for use
|
||||
with large congestion windows. A table indicates how much to
|
||||
increase the congestion window by when an ACK is received.
|
||||
For more detail see http://www.icir.org/floyd/hstcp.html
|
||||
|
||||
config TCP_CONG_HYBLA
|
||||
tristate "TCP-Hybla congestion control algorithm"
|
||||
default n
|
||||
---help---
|
||||
TCP-Hybla is a sender-side only change that eliminates penalization of
|
||||
long-RTT, large-bandwidth connections, like when satellite legs are
|
||||
involved, especially when sharing a common bottleneck with normal
|
||||
terrestrial connections.
|
||||
|
||||
config TCP_CONG_VEGAS
|
||||
tristate "TCP Vegas"
|
||||
default n
|
||||
---help---
|
||||
TCP Vegas is a sender-side only change to TCP that anticipates
|
||||
the onset of congestion by estimating the bandwidth. TCP Vegas
|
||||
adjusts the sending rate by modifying the congestion
|
||||
window. TCP Vegas should provide less packet loss, but it is
|
||||
not as aggressive as TCP Reno.
|
||||
|
||||
config TCP_CONG_SCALABLE
|
||||
tristate "Scalable TCP"
|
||||
default n
|
||||
---help---
|
||||
Scalable TCP is a sender-side only change to TCP which uses a
|
||||
MIMD congestion control algorithm which has some nice scaling
|
||||
properties, though is known to have fairness issues.
|
||||
See http://www.deneholme.net/tom/scalable/
|
||||
|
||||
config TCP_CONG_LP
|
||||
tristate "TCP Low Priority"
|
||||
default n
|
||||
---help---
|
||||
TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
|
||||
to utilize only the excess network bandwidth as compared to the
|
||||
``fair share`` of bandwidth as targeted by TCP.
|
||||
See http://www-ece.rice.edu/networks/TCP-LP/
|
||||
|
||||
config TCP_CONG_VENO
|
||||
tristate "TCP Veno"
|
||||
default n
|
||||
---help---
|
||||
TCP Veno is a sender-side only enhancement of TCP to obtain better
|
||||
throughput over wireless networks. TCP Veno makes use of state
|
||||
distinguishing to circumvent the difficult judgment of the packet loss
|
||||
type. TCP Veno cuts down less congestion window in response to random
|
||||
loss packets.
|
||||
See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
|
||||
|
||||
config TCP_CONG_YEAH
|
||||
tristate "YeAH TCP"
|
||||
select TCP_CONG_VEGAS
|
||||
default n
|
||||
---help---
|
||||
YeAH-TCP is a sender-side high-speed enabled TCP congestion control
|
||||
algorithm, which uses a mixed loss/delay approach to compute the
|
||||
congestion window. It's design goals target high efficiency,
|
||||
internal, RTT and Reno fairness, resilience to link loss while
|
||||
keeping network elements load as low as possible.
|
||||
|
||||
For further details look here:
|
||||
http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
|
||||
|
||||
config TCP_CONG_ILLINOIS
|
||||
tristate "TCP Illinois"
|
||||
default n
|
||||
---help---
|
||||
TCP-Illinois is a sender-side modification of TCP Reno for
|
||||
high speed long delay links. It uses round-trip-time to
|
||||
adjust the alpha and beta parameters to achieve a higher average
|
||||
throughput and maintain fairness.
|
||||
|
||||
For further details see:
|
||||
http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
|
||||
|
||||
config TCP_CONG_DCTCP
|
||||
tristate "DataCenter TCP (DCTCP)"
|
||||
default n
|
||||
---help---
|
||||
DCTCP leverages Explicit Congestion Notification (ECN) in the network to
|
||||
provide multi-bit feedback to the end hosts. It is designed to provide:
|
||||
|
||||
- High burst tolerance (incast due to partition/aggregate),
|
||||
- Low latency (short flows, queries),
|
||||
- High throughput (continuous data updates, large file transfers) with
|
||||
commodity, shallow-buffered switches.
|
||||
|
||||
All switches in the data center network running DCTCP must support
|
||||
ECN marking and be configured for marking when reaching defined switch
|
||||
buffer thresholds. The default ECN marking threshold heuristic for
|
||||
DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
|
||||
(~100KB) at 10Gbps, but might need further careful tweaking.
|
||||
|
||||
For further details see:
|
||||
http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
|
||||
|
||||
choice
|
||||
prompt "Default TCP congestion control"
|
||||
default DEFAULT_CUBIC
|
||||
help
|
||||
Select the TCP congestion control that will be used by default
|
||||
for all connections.
|
||||
|
||||
config DEFAULT_BIC
|
||||
bool "Bic" if TCP_CONG_BIC=y
|
||||
|
||||
config DEFAULT_CUBIC
|
||||
bool "Cubic" if TCP_CONG_CUBIC=y
|
||||
|
||||
config DEFAULT_HTCP
|
||||
bool "Htcp" if TCP_CONG_HTCP=y
|
||||
|
||||
config DEFAULT_HYBLA
|
||||
bool "Hybla" if TCP_CONG_HYBLA=y
|
||||
|
||||
config DEFAULT_VEGAS
|
||||
bool "Vegas" if TCP_CONG_VEGAS=y
|
||||
|
||||
config DEFAULT_VENO
|
||||
bool "Veno" if TCP_CONG_VENO=y
|
||||
|
||||
config DEFAULT_WESTWOOD
|
||||
bool "Westwood" if TCP_CONG_WESTWOOD=y
|
||||
|
||||
config DEFAULT_DCTCP
|
||||
bool "DCTCP" if TCP_CONG_DCTCP=y
|
||||
|
||||
config DEFAULT_RENO
|
||||
bool "Reno"
|
||||
endchoice
|
||||
|
||||
endif
|
||||
|
||||
config TCP_CONG_CUBIC
|
||||
tristate
|
||||
depends on !TCP_CONG_ADVANCED
|
||||
default y
|
||||
|
||||
config DEFAULT_TCP_CONG
|
||||
string
|
||||
default "bic" if DEFAULT_BIC
|
||||
default "cubic" if DEFAULT_CUBIC
|
||||
default "htcp" if DEFAULT_HTCP
|
||||
default "hybla" if DEFAULT_HYBLA
|
||||
default "vegas" if DEFAULT_VEGAS
|
||||
default "westwood" if DEFAULT_WESTWOOD
|
||||
default "veno" if DEFAULT_VENO
|
||||
default "reno" if DEFAULT_RENO
|
||||
default "dctcp" if DEFAULT_DCTCP
|
||||
default "cubic"
|
||||
|
||||
config TCP_MD5SIG
|
||||
bool "TCP: MD5 Signature Option support (RFC2385)"
|
||||
select CRYPTO
|
||||
select CRYPTO_MD5
|
||||
---help---
|
||||
RFC2385 specifies a method of giving MD5 protection to TCP sessions.
|
||||
Its main (only?) use is to protect BGP sessions between core routers
|
||||
on the Internet.
|
||||
|
||||
If unsure, say N.
|
||||
63
net/ipv4/Makefile
Normal file
63
net/ipv4/Makefile
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#
|
||||
# Makefile for the Linux TCP/IP (INET) layer.
|
||||
#
|
||||
|
||||
obj-y := route.o inetpeer.o protocol.o \
|
||||
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
|
||||
ip_output.o ip_sockglue.o inet_hashtables.o \
|
||||
inet_timewait_sock.o inet_connection_sock.o \
|
||||
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
|
||||
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
|
||||
tcp_offload.o datagram.o raw.o udp.o udplite.o \
|
||||
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
|
||||
fib_frontend.o fib_semantics.o fib_trie.o \
|
||||
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o
|
||||
|
||||
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
|
||||
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
|
||||
obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
|
||||
obj-$(CONFIG_PROC_FS) += proc.o
|
||||
obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
|
||||
obj-$(CONFIG_IP_MROUTE) += ipmr.o
|
||||
obj-$(CONFIG_NET_IPIP) += ipip.o
|
||||
gre-y := gre_demux.o
|
||||
obj-$(CONFIG_NET_FOU) += fou.o
|
||||
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
|
||||
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
|
||||
obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
|
||||
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
|
||||
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
|
||||
obj-$(CONFIG_INET_AH) += ah4.o
|
||||
obj-$(CONFIG_INET_ESP) += esp4.o
|
||||
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
|
||||
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
|
||||
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
|
||||
obj-$(CONFIG_INET_LRO) += inet_lro.o
|
||||
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
|
||||
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
|
||||
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
|
||||
obj-$(CONFIG_IP_PNP) += ipconfig.o
|
||||
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
|
||||
obj-$(CONFIG_INET_DIAG) += inet_diag.o
|
||||
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
|
||||
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
|
||||
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
|
||||
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
|
||||
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
|
||||
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o
|
||||
obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o
|
||||
obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o
|
||||
obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o
|
||||
obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o
|
||||
obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o
|
||||
obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o
|
||||
obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
|
||||
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
|
||||
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
|
||||
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
|
||||
obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
|
||||
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
|
||||
obj-$(CONFIG_GENEVE) += geneve.o
|
||||
|
||||
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
|
||||
xfrm4_output.o xfrm4_protocol.o
|
||||
1858
net/ipv4/af_inet.c
Normal file
1858
net/ipv4/af_inet.c
Normal file
File diff suppressed because it is too large
Load diff
589
net/ipv4/ah4.c
Normal file
589
net/ipv4/ah4.c
Normal file
|
|
@ -0,0 +1,589 @@
|
|||
#define pr_fmt(fmt) "IPsec: " fmt
|
||||
|
||||
#include <crypto/hash.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/ah.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/pfkeyv2.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/protocol.h>
|
||||
|
||||
struct ah_skb_cb {
|
||||
struct xfrm_skb_cb xfrm;
|
||||
void *tmp;
|
||||
};
|
||||
|
||||
#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
|
||||
|
||||
static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
|
||||
unsigned int size)
|
||||
{
|
||||
unsigned int len;
|
||||
|
||||
len = size + crypto_ahash_digestsize(ahash) +
|
||||
(crypto_ahash_alignmask(ahash) &
|
||||
~(crypto_tfm_ctx_alignment() - 1));
|
||||
|
||||
len = ALIGN(len, crypto_tfm_ctx_alignment());
|
||||
|
||||
len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
|
||||
len = ALIGN(len, __alignof__(struct scatterlist));
|
||||
|
||||
len += sizeof(struct scatterlist) * nfrags;
|
||||
|
||||
return kmalloc(len, GFP_ATOMIC);
|
||||
}
|
||||
|
||||
static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
|
||||
{
|
||||
return tmp + offset;
|
||||
}
|
||||
|
||||
static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
|
||||
unsigned int offset)
|
||||
{
|
||||
return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
|
||||
}
|
||||
|
||||
static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
|
||||
u8 *icv)
|
||||
{
|
||||
struct ahash_request *req;
|
||||
|
||||
req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
|
||||
crypto_tfm_ctx_alignment());
|
||||
|
||||
ahash_request_set_tfm(req, ahash);
|
||||
|
||||
return req;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
|
||||
struct ahash_request *req)
|
||||
{
|
||||
return (void *)ALIGN((unsigned long)(req + 1) +
|
||||
crypto_ahash_reqsize(ahash),
|
||||
__alignof__(struct scatterlist));
|
||||
}
|
||||
|
||||
/* Clear mutable options and find final destination to substitute
|
||||
* into IP header for icv calculation. Options are already checked
|
||||
* for validity, so paranoia is not required. */
|
||||
|
||||
static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
|
||||
{
|
||||
unsigned char *optptr = (unsigned char *)(iph+1);
|
||||
int l = iph->ihl*4 - sizeof(struct iphdr);
|
||||
int optlen;
|
||||
|
||||
while (l > 0) {
|
||||
switch (*optptr) {
|
||||
case IPOPT_END:
|
||||
return 0;
|
||||
case IPOPT_NOOP:
|
||||
l--;
|
||||
optptr++;
|
||||
continue;
|
||||
}
|
||||
optlen = optptr[1];
|
||||
if (optlen<2 || optlen>l)
|
||||
return -EINVAL;
|
||||
switch (*optptr) {
|
||||
case IPOPT_SEC:
|
||||
case 0x85: /* Some "Extended Security" crap. */
|
||||
case IPOPT_CIPSO:
|
||||
case IPOPT_RA:
|
||||
case 0x80|21: /* RFC1770 */
|
||||
break;
|
||||
case IPOPT_LSRR:
|
||||
case IPOPT_SSRR:
|
||||
if (optlen < 6)
|
||||
return -EINVAL;
|
||||
memcpy(daddr, optptr+optlen-4, 4);
|
||||
/* Fall through */
|
||||
default:
|
||||
memset(optptr, 0, optlen);
|
||||
}
|
||||
l -= optlen;
|
||||
optptr += optlen;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ah_output_done(struct crypto_async_request *base, int err)
|
||||
{
|
||||
u8 *icv;
|
||||
struct iphdr *iph;
|
||||
struct sk_buff *skb = base->data;
|
||||
struct xfrm_state *x = skb_dst(skb)->xfrm;
|
||||
struct ah_data *ahp = x->data;
|
||||
struct iphdr *top_iph = ip_hdr(skb);
|
||||
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
|
||||
int ihl = ip_hdrlen(skb);
|
||||
|
||||
iph = AH_SKB_CB(skb)->tmp;
|
||||
icv = ah_tmp_icv(ahp->ahash, iph, ihl);
|
||||
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
|
||||
|
||||
top_iph->tos = iph->tos;
|
||||
top_iph->ttl = iph->ttl;
|
||||
top_iph->frag_off = iph->frag_off;
|
||||
if (top_iph->ihl != 5) {
|
||||
top_iph->daddr = iph->daddr;
|
||||
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
|
||||
}
|
||||
|
||||
kfree(AH_SKB_CB(skb)->tmp);
|
||||
xfrm_output_resume(skb, err);
|
||||
}
|
||||
|
||||
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
|
||||
{
|
||||
int err;
|
||||
int nfrags;
|
||||
int ihl;
|
||||
u8 *icv;
|
||||
struct sk_buff *trailer;
|
||||
struct crypto_ahash *ahash;
|
||||
struct ahash_request *req;
|
||||
struct scatterlist *sg;
|
||||
struct iphdr *iph, *top_iph;
|
||||
struct ip_auth_hdr *ah;
|
||||
struct ah_data *ahp;
|
||||
int seqhi_len = 0;
|
||||
__be32 *seqhi;
|
||||
int sglists = 0;
|
||||
struct scatterlist *seqhisg;
|
||||
|
||||
ahp = x->data;
|
||||
ahash = ahp->ahash;
|
||||
|
||||
if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
|
||||
goto out;
|
||||
nfrags = err;
|
||||
|
||||
skb_push(skb, -skb_network_offset(skb));
|
||||
ah = ip_auth_hdr(skb);
|
||||
ihl = ip_hdrlen(skb);
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ESN) {
|
||||
sglists = 1;
|
||||
seqhi_len = sizeof(*seqhi);
|
||||
}
|
||||
err = -ENOMEM;
|
||||
iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len);
|
||||
if (!iph)
|
||||
goto out;
|
||||
seqhi = (__be32 *)((char *)iph + ihl);
|
||||
icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
|
||||
req = ah_tmp_req(ahash, icv);
|
||||
sg = ah_req_sg(ahash, req);
|
||||
seqhisg = sg + nfrags;
|
||||
|
||||
memset(ah->auth_data, 0, ahp->icv_trunc_len);
|
||||
|
||||
top_iph = ip_hdr(skb);
|
||||
|
||||
iph->tos = top_iph->tos;
|
||||
iph->ttl = top_iph->ttl;
|
||||
iph->frag_off = top_iph->frag_off;
|
||||
|
||||
if (top_iph->ihl != 5) {
|
||||
iph->daddr = top_iph->daddr;
|
||||
memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
|
||||
err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
|
||||
if (err)
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
ah->nexthdr = *skb_mac_header(skb);
|
||||
*skb_mac_header(skb) = IPPROTO_AH;
|
||||
|
||||
top_iph->tos = 0;
|
||||
top_iph->tot_len = htons(skb->len);
|
||||
top_iph->frag_off = 0;
|
||||
top_iph->ttl = 0;
|
||||
top_iph->check = 0;
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ALIGN4)
|
||||
ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
|
||||
else
|
||||
ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
|
||||
|
||||
ah->reserved = 0;
|
||||
ah->spi = x->id.spi;
|
||||
ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
|
||||
|
||||
sg_init_table(sg, nfrags + sglists);
|
||||
skb_to_sgvec_nomark(skb, sg, 0, skb->len);
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ESN) {
|
||||
/* Attach seqhi sg right after packet payload */
|
||||
*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
|
||||
sg_set_buf(seqhisg, seqhi, seqhi_len);
|
||||
}
|
||||
ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
|
||||
ahash_request_set_callback(req, 0, ah_output_done, skb);
|
||||
|
||||
AH_SKB_CB(skb)->tmp = iph;
|
||||
|
||||
err = crypto_ahash_digest(req);
|
||||
if (err) {
|
||||
if (err == -EINPROGRESS)
|
||||
goto out;
|
||||
|
||||
if (err == -EBUSY)
|
||||
err = NET_XMIT_DROP;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
|
||||
|
||||
top_iph->tos = iph->tos;
|
||||
top_iph->ttl = iph->ttl;
|
||||
top_iph->frag_off = iph->frag_off;
|
||||
if (top_iph->ihl != 5) {
|
||||
top_iph->daddr = iph->daddr;
|
||||
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
|
||||
}
|
||||
|
||||
out_free:
|
||||
kfree(iph);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ah_input_done(struct crypto_async_request *base, int err)
|
||||
{
|
||||
u8 *auth_data;
|
||||
u8 *icv;
|
||||
struct iphdr *work_iph;
|
||||
struct sk_buff *skb = base->data;
|
||||
struct xfrm_state *x = xfrm_input_state(skb);
|
||||
struct ah_data *ahp = x->data;
|
||||
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
|
||||
int ihl = ip_hdrlen(skb);
|
||||
int ah_hlen = (ah->hdrlen + 2) << 2;
|
||||
|
||||
work_iph = AH_SKB_CB(skb)->tmp;
|
||||
auth_data = ah_tmp_auth(work_iph, ihl);
|
||||
icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
|
||||
|
||||
err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ah->nexthdr;
|
||||
|
||||
skb->network_header += ah_hlen;
|
||||
memcpy(skb_network_header(skb), work_iph, ihl);
|
||||
__skb_pull(skb, ah_hlen + ihl);
|
||||
|
||||
if (x->props.mode == XFRM_MODE_TUNNEL)
|
||||
skb_reset_transport_header(skb);
|
||||
else
|
||||
skb_set_transport_header(skb, -ihl);
|
||||
out:
|
||||
kfree(AH_SKB_CB(skb)->tmp);
|
||||
xfrm_input_resume(skb, err);
|
||||
}
|
||||
|
||||
static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
|
||||
{
|
||||
int ah_hlen;
|
||||
int ihl;
|
||||
int nexthdr;
|
||||
int nfrags;
|
||||
u8 *auth_data;
|
||||
u8 *icv;
|
||||
struct sk_buff *trailer;
|
||||
struct crypto_ahash *ahash;
|
||||
struct ahash_request *req;
|
||||
struct scatterlist *sg;
|
||||
struct iphdr *iph, *work_iph;
|
||||
struct ip_auth_hdr *ah;
|
||||
struct ah_data *ahp;
|
||||
int err = -ENOMEM;
|
||||
int seqhi_len = 0;
|
||||
__be32 *seqhi;
|
||||
int sglists = 0;
|
||||
struct scatterlist *seqhisg;
|
||||
|
||||
if (!pskb_may_pull(skb, sizeof(*ah)))
|
||||
goto out;
|
||||
|
||||
ah = (struct ip_auth_hdr *)skb->data;
|
||||
ahp = x->data;
|
||||
ahash = ahp->ahash;
|
||||
|
||||
nexthdr = ah->nexthdr;
|
||||
ah_hlen = (ah->hdrlen + 2) << 2;
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ALIGN4) {
|
||||
if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) &&
|
||||
ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len))
|
||||
goto out;
|
||||
} else {
|
||||
if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
|
||||
ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!pskb_may_pull(skb, ah_hlen))
|
||||
goto out;
|
||||
|
||||
/* We are going to _remove_ AH header to keep sockets happy,
|
||||
* so... Later this can change. */
|
||||
if (skb_unclone(skb, GFP_ATOMIC))
|
||||
goto out;
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
|
||||
if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
|
||||
goto out;
|
||||
nfrags = err;
|
||||
|
||||
ah = (struct ip_auth_hdr *)skb->data;
|
||||
iph = ip_hdr(skb);
|
||||
ihl = ip_hdrlen(skb);
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ESN) {
|
||||
sglists = 1;
|
||||
seqhi_len = sizeof(*seqhi);
|
||||
}
|
||||
|
||||
work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl +
|
||||
ahp->icv_trunc_len + seqhi_len);
|
||||
if (!work_iph)
|
||||
goto out;
|
||||
|
||||
seqhi = (__be32 *)((char *)work_iph + ihl);
|
||||
auth_data = ah_tmp_auth(seqhi, seqhi_len);
|
||||
icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
|
||||
req = ah_tmp_req(ahash, icv);
|
||||
sg = ah_req_sg(ahash, req);
|
||||
seqhisg = sg + nfrags;
|
||||
|
||||
memcpy(work_iph, iph, ihl);
|
||||
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
|
||||
memset(ah->auth_data, 0, ahp->icv_trunc_len);
|
||||
|
||||
iph->ttl = 0;
|
||||
iph->tos = 0;
|
||||
iph->frag_off = 0;
|
||||
iph->check = 0;
|
||||
if (ihl > sizeof(*iph)) {
|
||||
__be32 dummy;
|
||||
err = ip_clear_mutable_options(iph, &dummy);
|
||||
if (err)
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
skb_push(skb, ihl);
|
||||
|
||||
sg_init_table(sg, nfrags + sglists);
|
||||
skb_to_sgvec_nomark(skb, sg, 0, skb->len);
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ESN) {
|
||||
/* Attach seqhi sg right after packet payload */
|
||||
*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
|
||||
sg_set_buf(seqhisg, seqhi, seqhi_len);
|
||||
}
|
||||
ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len);
|
||||
ahash_request_set_callback(req, 0, ah_input_done, skb);
|
||||
|
||||
AH_SKB_CB(skb)->tmp = work_iph;
|
||||
|
||||
err = crypto_ahash_digest(req);
|
||||
if (err) {
|
||||
if (err == -EINPROGRESS)
|
||||
goto out;
|
||||
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
|
||||
if (err)
|
||||
goto out_free;
|
||||
|
||||
skb->network_header += ah_hlen;
|
||||
memcpy(skb_network_header(skb), work_iph, ihl);
|
||||
__skb_pull(skb, ah_hlen + ihl);
|
||||
if (x->props.mode == XFRM_MODE_TUNNEL)
|
||||
skb_reset_transport_header(skb);
|
||||
else
|
||||
skb_set_transport_header(skb, -ihl);
|
||||
|
||||
err = nexthdr;
|
||||
|
||||
out_free:
|
||||
kfree (work_iph);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ah4_err(struct sk_buff *skb, u32 info)
|
||||
{
|
||||
struct net *net = dev_net(skb->dev);
|
||||
const struct iphdr *iph = (const struct iphdr *)skb->data;
|
||||
struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
|
||||
struct xfrm_state *x;
|
||||
|
||||
switch (icmp_hdr(skb)->type) {
|
||||
case ICMP_DEST_UNREACH:
|
||||
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
|
||||
return 0;
|
||||
case ICMP_REDIRECT:
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
|
||||
ah->spi, IPPROTO_AH, AF_INET);
|
||||
if (!x)
|
||||
return 0;
|
||||
|
||||
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
|
||||
ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
|
||||
else
|
||||
ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
|
||||
xfrm_state_put(x);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ah_init_state(struct xfrm_state *x)
|
||||
{
|
||||
struct ah_data *ahp = NULL;
|
||||
struct xfrm_algo_desc *aalg_desc;
|
||||
struct crypto_ahash *ahash;
|
||||
|
||||
if (!x->aalg)
|
||||
goto error;
|
||||
|
||||
if (x->encap)
|
||||
goto error;
|
||||
|
||||
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
|
||||
if (!ahp)
|
||||
return -ENOMEM;
|
||||
|
||||
ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
|
||||
if (IS_ERR(ahash))
|
||||
goto error;
|
||||
|
||||
ahp->ahash = ahash;
|
||||
if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
|
||||
(x->aalg->alg_key_len + 7) / 8))
|
||||
goto error;
|
||||
|
||||
/*
|
||||
* Lookup the algorithm description maintained by xfrm_algo,
|
||||
* verify crypto transform properties, and store information
|
||||
* we need for AH processing. This lookup cannot fail here
|
||||
* after a successful crypto_alloc_ahash().
|
||||
*/
|
||||
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
|
||||
BUG_ON(!aalg_desc);
|
||||
|
||||
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
|
||||
crypto_ahash_digestsize(ahash)) {
|
||||
pr_info("%s: %s digestsize %u != %hu\n",
|
||||
__func__, x->aalg->alg_name,
|
||||
crypto_ahash_digestsize(ahash),
|
||||
aalg_desc->uinfo.auth.icv_fullbits / 8);
|
||||
goto error;
|
||||
}
|
||||
|
||||
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
|
||||
ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ALIGN4)
|
||||
x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) +
|
||||
ahp->icv_trunc_len);
|
||||
else
|
||||
x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
|
||||
ahp->icv_trunc_len);
|
||||
if (x->props.mode == XFRM_MODE_TUNNEL)
|
||||
x->props.header_len += sizeof(struct iphdr);
|
||||
x->data = ahp;
|
||||
|
||||
return 0;
|
||||
|
||||
error:
|
||||
if (ahp) {
|
||||
crypto_free_ahash(ahp->ahash);
|
||||
kfree(ahp);
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void ah_destroy(struct xfrm_state *x)
|
||||
{
|
||||
struct ah_data *ahp = x->data;
|
||||
|
||||
if (!ahp)
|
||||
return;
|
||||
|
||||
crypto_free_ahash(ahp->ahash);
|
||||
kfree(ahp);
|
||||
}
|
||||
|
||||
static int ah4_rcv_cb(struct sk_buff *skb, int err)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct xfrm_type ah_type =
|
||||
{
|
||||
.description = "AH4",
|
||||
.owner = THIS_MODULE,
|
||||
.proto = IPPROTO_AH,
|
||||
.flags = XFRM_TYPE_REPLAY_PROT,
|
||||
.init_state = ah_init_state,
|
||||
.destructor = ah_destroy,
|
||||
.input = ah_input,
|
||||
.output = ah_output
|
||||
};
|
||||
|
||||
static struct xfrm4_protocol ah4_protocol = {
|
||||
.handler = xfrm4_rcv,
|
||||
.input_handler = xfrm_input,
|
||||
.cb_handler = ah4_rcv_cb,
|
||||
.err_handler = ah4_err,
|
||||
.priority = 0,
|
||||
};
|
||||
|
||||
static int __init ah4_init(void)
|
||||
{
|
||||
if (xfrm_register_type(&ah_type, AF_INET) < 0) {
|
||||
pr_info("%s: can't add xfrm type\n", __func__);
|
||||
return -EAGAIN;
|
||||
}
|
||||
if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) {
|
||||
pr_info("%s: can't add protocol\n", __func__);
|
||||
xfrm_unregister_type(&ah_type, AF_INET);
|
||||
return -EAGAIN;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __exit ah4_fini(void)
|
||||
{
|
||||
if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
|
||||
pr_info("%s: can't remove protocol\n", __func__);
|
||||
if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
|
||||
pr_info("%s: can't remove xfrm type\n", __func__);
|
||||
}
|
||||
|
||||
module_init(ah4_init);
|
||||
module_exit(ah4_fini);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
|
||||
1467
net/ipv4/arp.c
Normal file
1467
net/ipv4/arp.c
Normal file
File diff suppressed because it is too large
Load diff
2355
net/ipv4/cipso_ipv4.c
Normal file
2355
net/ipv4/cipso_ipv4.c
Normal file
File diff suppressed because it is too large
Load diff
123
net/ipv4/datagram.c
Normal file
123
net/ipv4/datagram.c
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* common UDP/RAW code
|
||||
* Linux INET implementation
|
||||
*
|
||||
* Authors:
|
||||
* Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/in.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/route.h>
|
||||
#include <net/tcp_states.h>
|
||||
|
||||
int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
|
||||
{
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
|
||||
struct flowi4 *fl4;
|
||||
struct rtable *rt;
|
||||
__be32 saddr;
|
||||
int oif;
|
||||
int err;
|
||||
|
||||
|
||||
if (addr_len < sizeof(*usin))
|
||||
return -EINVAL;
|
||||
|
||||
if (usin->sin_family != AF_INET)
|
||||
return -EAFNOSUPPORT;
|
||||
|
||||
sk_dst_reset(sk);
|
||||
|
||||
lock_sock(sk);
|
||||
|
||||
oif = sk->sk_bound_dev_if;
|
||||
saddr = inet->inet_saddr;
|
||||
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
|
||||
if (!oif)
|
||||
oif = inet->mc_index;
|
||||
if (!saddr)
|
||||
saddr = inet->mc_addr;
|
||||
}
|
||||
fl4 = &inet->cork.fl.u.ip4;
|
||||
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
|
||||
RT_CONN_FLAGS(sk), oif,
|
||||
sk->sk_protocol,
|
||||
inet->inet_sport, usin->sin_port, sk);
|
||||
if (IS_ERR(rt)) {
|
||||
err = PTR_ERR(rt);
|
||||
if (err == -ENETUNREACH)
|
||||
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
|
||||
ip_rt_put(rt);
|
||||
err = -EACCES;
|
||||
goto out;
|
||||
}
|
||||
if (!inet->inet_saddr)
|
||||
inet->inet_saddr = fl4->saddr; /* Update source address */
|
||||
if (!inet->inet_rcv_saddr) {
|
||||
inet->inet_rcv_saddr = fl4->saddr;
|
||||
if (sk->sk_prot->rehash)
|
||||
sk->sk_prot->rehash(sk);
|
||||
}
|
||||
inet->inet_daddr = fl4->daddr;
|
||||
inet->inet_dport = usin->sin_port;
|
||||
sk->sk_state = TCP_ESTABLISHED;
|
||||
inet_set_txhash(sk);
|
||||
inet->inet_id = jiffies;
|
||||
|
||||
sk_dst_set(sk, &rt->dst);
|
||||
err = 0;
|
||||
out:
|
||||
release_sock(sk);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(ip4_datagram_connect);
|
||||
|
||||
/* Because UDP xmit path can manipulate sk_dst_cache without holding
|
||||
* socket lock, we need to use sk_dst_set() here,
|
||||
* even if we own the socket lock.
|
||||
*/
|
||||
void ip4_datagram_release_cb(struct sock *sk)
|
||||
{
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
const struct ip_options_rcu *inet_opt;
|
||||
__be32 daddr = inet->inet_daddr;
|
||||
struct dst_entry *dst;
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
dst = __sk_dst_get(sk);
|
||||
if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
}
|
||||
inet_opt = rcu_dereference(inet->inet_opt);
|
||||
if (inet_opt && inet_opt->opt.srr)
|
||||
daddr = inet_opt->opt.faddr;
|
||||
rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
|
||||
inet->inet_saddr, inet->inet_dport,
|
||||
inet->inet_sport, sk->sk_protocol,
|
||||
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
|
||||
|
||||
dst = !IS_ERR(rt) ? &rt->dst : NULL;
|
||||
sk_dst_set(sk, dst);
|
||||
|
||||
rcu_read_unlock();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ip4_datagram_release_cb);
|
||||
2363
net/ipv4/devinet.c
Normal file
2363
net/ipv4/devinet.c
Normal file
File diff suppressed because it is too large
Load diff
728
net/ipv4/esp4.c
Normal file
728
net/ipv4/esp4.c
Normal file
|
|
@ -0,0 +1,728 @@
|
|||
#define pr_fmt(fmt) "IPsec: " fmt
|
||||
|
||||
#include <crypto/aead.h>
|
||||
#include <crypto/authenc.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/esp.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/pfkeyv2.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/in6.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/udp.h>
|
||||
|
||||
struct esp_skb_cb {
|
||||
struct xfrm_skb_cb xfrm;
|
||||
void *tmp;
|
||||
};
|
||||
|
||||
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
|
||||
|
||||
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
|
||||
|
||||
/*
|
||||
* Allocate an AEAD request structure with extra space for SG and IV.
|
||||
*
|
||||
* For alignment considerations the IV is placed at the front, followed
|
||||
* by the request and finally the SG list.
|
||||
*
|
||||
* TODO: Use spare space in skb for this where possible.
|
||||
*/
|
||||
static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen)
|
||||
{
|
||||
unsigned int len;
|
||||
|
||||
len = seqhilen;
|
||||
|
||||
len += crypto_aead_ivsize(aead);
|
||||
|
||||
if (len) {
|
||||
len += crypto_aead_alignmask(aead) &
|
||||
~(crypto_tfm_ctx_alignment() - 1);
|
||||
len = ALIGN(len, crypto_tfm_ctx_alignment());
|
||||
}
|
||||
|
||||
len += sizeof(struct aead_givcrypt_request) + crypto_aead_reqsize(aead);
|
||||
len = ALIGN(len, __alignof__(struct scatterlist));
|
||||
|
||||
len += sizeof(struct scatterlist) * nfrags;
|
||||
|
||||
return kmalloc(len, GFP_ATOMIC);
|
||||
}
|
||||
|
||||
static inline __be32 *esp_tmp_seqhi(void *tmp)
|
||||
{
|
||||
return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
|
||||
}
|
||||
static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
|
||||
{
|
||||
return crypto_aead_ivsize(aead) ?
|
||||
PTR_ALIGN((u8 *)tmp + seqhilen,
|
||||
crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
|
||||
}
|
||||
|
||||
static inline struct aead_givcrypt_request *esp_tmp_givreq(
|
||||
struct crypto_aead *aead, u8 *iv)
|
||||
{
|
||||
struct aead_givcrypt_request *req;
|
||||
|
||||
req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
|
||||
crypto_tfm_ctx_alignment());
|
||||
aead_givcrypt_set_tfm(req, aead);
|
||||
return req;
|
||||
}
|
||||
|
||||
static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
|
||||
{
|
||||
struct aead_request *req;
|
||||
|
||||
req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
|
||||
crypto_tfm_ctx_alignment());
|
||||
aead_request_set_tfm(req, aead);
|
||||
return req;
|
||||
}
|
||||
|
||||
static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
|
||||
struct aead_request *req)
|
||||
{
|
||||
return (void *)ALIGN((unsigned long)(req + 1) +
|
||||
crypto_aead_reqsize(aead),
|
||||
__alignof__(struct scatterlist));
|
||||
}
|
||||
|
||||
static inline struct scatterlist *esp_givreq_sg(
|
||||
struct crypto_aead *aead, struct aead_givcrypt_request *req)
|
||||
{
|
||||
return (void *)ALIGN((unsigned long)(req + 1) +
|
||||
crypto_aead_reqsize(aead),
|
||||
__alignof__(struct scatterlist));
|
||||
}
|
||||
|
||||
static void esp_output_done(struct crypto_async_request *base, int err)
|
||||
{
|
||||
struct sk_buff *skb = base->data;
|
||||
|
||||
kfree(ESP_SKB_CB(skb)->tmp);
|
||||
xfrm_output_resume(skb, err);
|
||||
}
|
||||
|
||||
static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
|
||||
{
|
||||
int err;
|
||||
struct ip_esp_hdr *esph;
|
||||
struct crypto_aead *aead;
|
||||
struct aead_givcrypt_request *req;
|
||||
struct scatterlist *sg;
|
||||
struct scatterlist *asg;
|
||||
struct sk_buff *trailer;
|
||||
void *tmp;
|
||||
u8 *iv;
|
||||
u8 *tail;
|
||||
int blksize;
|
||||
int clen;
|
||||
int alen;
|
||||
int plen;
|
||||
int tfclen;
|
||||
int nfrags;
|
||||
int assoclen;
|
||||
int sglists;
|
||||
int seqhilen;
|
||||
__be32 *seqhi;
|
||||
|
||||
/* skb is pure payload to encrypt */
|
||||
|
||||
aead = x->data;
|
||||
alen = crypto_aead_authsize(aead);
|
||||
|
||||
tfclen = 0;
|
||||
if (x->tfcpad) {
|
||||
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
|
||||
u32 padto;
|
||||
|
||||
padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
|
||||
if (skb->len < padto)
|
||||
tfclen = padto - skb->len;
|
||||
}
|
||||
blksize = ALIGN(crypto_aead_blocksize(aead), 4);
|
||||
clen = ALIGN(skb->len + 2 + tfclen, blksize);
|
||||
plen = clen - skb->len - tfclen;
|
||||
|
||||
err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
|
||||
if (err < 0)
|
||||
goto error;
|
||||
nfrags = err;
|
||||
|
||||
assoclen = sizeof(*esph);
|
||||
sglists = 1;
|
||||
seqhilen = 0;
|
||||
|
||||
if (x->props.flags & XFRM_STATE_ESN) {
|
||||
sglists += 2;
|
||||
seqhilen += sizeof(__be32);
|
||||
assoclen += seqhilen;
|
||||
}
|
||||
|
||||
tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
|
||||
if (!tmp) {
|
||||
err = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
seqhi = esp_tmp_seqhi(tmp);
|
||||
iv = esp_tmp_iv(aead, tmp, seqhilen);
|
||||
req = esp_tmp_givreq(aead, iv);
|
||||
asg = esp_givreq_sg(aead, req);
|
||||
sg = asg + sglists;
|
||||
|
||||
/* Fill padding... */
|
||||
tail = skb_tail_pointer(trailer);
|
||||
if (tfclen) {
|
||||
memset(tail, 0, tfclen);
|
||||
tail += tfclen;
|
||||
}
|
||||
do {
|
||||
int i;
|
||||
for (i = 0; i < plen - 2; i++)
|
||||
tail[i] = i + 1;
|
||||
} while (0);
|
||||
tail[plen - 2] = plen - 2;
|
||||
tail[plen - 1] = *skb_mac_header(skb);
|
||||
pskb_put(skb, trailer, clen - skb->len + alen);
|
||||
|
||||
skb_push(skb, -skb_network_offset(skb));
|
||||
esph = ip_esp_hdr(skb);
|
||||
*skb_mac_header(skb) = IPPROTO_ESP;
|
||||
|
||||
/* this is non-NULL only with UDP Encapsulation */
|
||||
if (x->encap) {
|
||||
struct xfrm_encap_tmpl *encap = x->encap;
|
||||
struct udphdr *uh;
|
||||
__be32 *udpdata32;
|
||||
__be16 sport, dport;
|
||||
int encap_type;
|
||||
|
||||
spin_lock_bh(&x->lock);
|
||||
sport = encap->encap_sport;
|
||||
dport = encap->encap_dport;
|
||||
encap_type = encap->encap_type;
|
||||
spin_unlock_bh(&x->lock);
|
||||
|
||||
uh = (struct udphdr *)esph;
|
||||
uh->source = sport;
|
||||
uh->dest = dport;
|
||||
uh->len = htons(skb->len - skb_transport_offset(skb));
|
||||
uh->check = 0;
|
||||
|
||||
switch (encap_type) {
|
||||
default:
|
||||
case UDP_ENCAP_ESPINUDP:
|
||||
esph = (struct ip_esp_hdr *)(uh + 1);
|
||||
break;
|
||||
case UDP_ENCAP_ESPINUDP_NON_IKE:
|
||||
udpdata32 = (__be32 *)(uh + 1);
|
||||
udpdata32[0] = udpdata32[1] = 0;
|
||||
esph = (struct ip_esp_hdr *)(udpdata32 + 2);
|
||||
break;
|
||||
}
|
||||
|
||||
*skb_mac_header(skb) = IPPROTO_UDP;
|
||||
}
|
||||
|
||||
esph->spi = x->id.spi;
|
||||
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
|
||||
|
||||
sg_init_table(sg, nfrags);
|
||||
skb_to_sgvec(skb, sg,
|
||||
esph->enc_data + crypto_aead_ivsize(aead) - skb->data,
|
||||
clen + alen);
|
||||
|
||||
if ((x->props.flags & XFRM_STATE_ESN)) {
|
||||
sg_init_table(asg, 3);
|
||||
sg_set_buf(asg, &esph->spi, sizeof(__be32));
|
||||
*seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
|
||||
sg_set_buf(asg + 1, seqhi, seqhilen);
|
||||
sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
|
||||
} else
|
||||
sg_init_one(asg, esph, sizeof(*esph));
|
||||
|
||||
aead_givcrypt_set_callback(req, 0, esp_output_done, skb);
|
||||
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
|
||||
aead_givcrypt_set_assoc(req, asg, assoclen);
|
||||
aead_givcrypt_set_giv(req, esph->enc_data,
|
||||
XFRM_SKB_CB(skb)->seq.output.low);
|
||||
|
||||
ESP_SKB_CB(skb)->tmp = tmp;
|
||||
err = crypto_aead_givencrypt(req);
|
||||
if (err == -EINPROGRESS)
|
||||
goto error;
|
||||
|
||||
if (err == -EBUSY)
|
||||
err = NET_XMIT_DROP;
|
||||
|
||||
kfree(tmp);
|
||||
|
||||
error:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int esp_input_done2(struct sk_buff *skb, int err)
|
||||
{
|
||||
const struct iphdr *iph;
|
||||
struct xfrm_state *x = xfrm_input_state(skb);
|
||||
struct crypto_aead *aead = x->data;
|
||||
int alen = crypto_aead_authsize(aead);
|
||||
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
|
||||
int elen = skb->len - hlen;
|
||||
int ihl;
|
||||
u8 nexthdr[2];
|
||||
int padlen;
|
||||
|
||||
kfree(ESP_SKB_CB(skb)->tmp);
|
||||
|
||||
if (unlikely(err))
|
||||
goto out;
|
||||
|
||||
if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
|
||||
BUG();
|
||||
|
||||
err = -EINVAL;
|
||||
padlen = nexthdr[0];
|
||||
if (padlen + 2 + alen >= elen)
|
||||
goto out;
|
||||
|
||||
/* ... check padding bits here. Silly. :-) */
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
ihl = iph->ihl * 4;
|
||||
|
||||
if (x->encap) {
|
||||
struct xfrm_encap_tmpl *encap = x->encap;
|
||||
struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
|
||||
|
||||
/*
|
||||
* 1) if the NAT-T peer's IP or port changed then
|
||||
* advertize the change to the keying daemon.
|
||||
* This is an inbound SA, so just compare
|
||||
* SRC ports.
|
||||
*/
|
||||
if (iph->saddr != x->props.saddr.a4 ||
|
||||
uh->source != encap->encap_sport) {
|
||||
xfrm_address_t ipaddr;
|
||||
|
||||
ipaddr.a4 = iph->saddr;
|
||||
km_new_mapping(x, &ipaddr, uh->source);
|
||||
|
||||
/* XXX: perhaps add an extra
|
||||
* policy check here, to see
|
||||
* if we should allow or
|
||||
* reject a packet from a
|
||||
* different source
|
||||
* address/port.
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
* 2) ignore UDP/TCP checksums in case
|
||||
* of NAT-T in Transport Mode, or
|
||||
* perform other post-processing fixes
|
||||
* as per draft-ietf-ipsec-udp-encaps-06,
|
||||
* section 3.1.2
|
||||
*/
|
||||
if (x->props.mode == XFRM_MODE_TRANSPORT)
|
||||
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
||||
}
|
||||
|
||||
pskb_trim(skb, skb->len - alen - padlen - 2);
|
||||
__skb_pull(skb, hlen);
|
||||
if (x->props.mode == XFRM_MODE_TUNNEL)
|
||||
skb_reset_transport_header(skb);
|
||||
else
|
||||
skb_set_transport_header(skb, -ihl);
|
||||
|
||||
err = nexthdr[1];
|
||||
|
||||
/* RFC4303: Drop dummy packets without any error */
|
||||
if (err == IPPROTO_NONE)
|
||||
err = -EINVAL;
|
||||
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void esp_input_done(struct crypto_async_request *base, int err)
|
||||
{
|
||||
struct sk_buff *skb = base->data;
|
||||
|
||||
xfrm_input_resume(skb, esp_input_done2(skb, err));
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: detecting truncated vs. non-truncated authentication data is very
|
||||
* expensive, so we only support truncated data, which is the recommended
|
||||
* and common case.
|
||||
*/
|
||||
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
	struct ip_esp_hdr *esph;
	struct crypto_aead *aead = x->data;
	struct aead_request *req;
	struct sk_buff *trailer;
	/* Ciphertext length: everything after the ESP header and IV. */
	int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead);
	int nfrags;
	int assoclen;
	int sglists;
	int seqhilen;
	__be32 *seqhi;
	void *tmp;
	u8 *iv;
	struct scatterlist *sg;
	struct scatterlist *asg;
	int err = -EINVAL;

	/* Header + IV must be linear before we dereference esph below. */
	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
		goto out;

	if (elen <= 0)
		goto out;

	/* Make the data writable; on success returns the fragment count. */
	if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
		goto out;
	nfrags = err;

	/* Associated data covered by the ICV: the ESP header itself, plus
	 * the high sequence bits when extended sequence numbers are on.
	 */
	assoclen = sizeof(*esph);
	sglists = 1;
	seqhilen = 0;

	if (x->props.flags & XFRM_STATE_ESN) {
		sglists += 2;
		seqhilen += sizeof(__be32);
		assoclen += seqhilen;
	}

	/* One scratch buffer holds seqhi, IV, request, and SG tables. */
	err = -ENOMEM;
	tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
	if (!tmp)
		goto out;

	ESP_SKB_CB(skb)->tmp = tmp;
	seqhi = esp_tmp_seqhi(tmp);
	iv = esp_tmp_iv(aead, tmp, seqhilen);
	req = esp_tmp_req(aead, iv);
	asg = esp_req_sg(aead, req);
	sg = asg + sglists;

	skb->ip_summed = CHECKSUM_NONE;

	esph = (struct ip_esp_hdr *)skb->data;

	/* Get ivec. This can be wrong, check against another impls. */
	iv = esph->enc_data;

	sg_init_table(sg, nfrags);
	skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen);

	if ((x->props.flags & XFRM_STATE_ESN)) {
		/* ESN: associated data is spi | seq-hi | seq-lo, split
		 * across three SG entries so seq-hi can live in tmp.
		 */
		sg_init_table(asg, 3);
		sg_set_buf(asg, &esph->spi, sizeof(__be32));
		*seqhi = XFRM_SKB_CB(skb)->seq.input.hi;
		sg_set_buf(asg + 1, seqhi, seqhilen);
		sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32));
	} else
		sg_init_one(asg, esph, sizeof(*esph));

	aead_request_set_callback(req, 0, esp_input_done, skb);
	aead_request_set_crypt(req, sg, sg, elen, iv);
	aead_request_set_assoc(req, asg, assoclen);

	/* -EINPROGRESS means the driver completes asynchronously and
	 * esp_input_done() will finish the job.
	 */
	err = crypto_aead_decrypt(req);
	if (err == -EINPROGRESS)
		goto out;

	err = esp_input_done2(skb, err);

out:
	return err;
}
|
||||
|
||||
static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
|
||||
{
|
||||
struct crypto_aead *aead = x->data;
|
||||
u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
|
||||
unsigned int net_adj;
|
||||
|
||||
switch (x->props.mode) {
|
||||
case XFRM_MODE_TRANSPORT:
|
||||
case XFRM_MODE_BEET:
|
||||
net_adj = sizeof(struct iphdr);
|
||||
break;
|
||||
case XFRM_MODE_TUNNEL:
|
||||
net_adj = 0;
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
|
||||
net_adj) & ~(blksize - 1)) + net_adj - 2;
|
||||
}
|
||||
|
||||
/* ICMP error handler for ESP: reacts to PMTU (frag-needed) and
 * redirect messages for an established state, ignores everything else.
 */
static int esp4_err(struct sk_buff *skb, u32 info)
{
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
	struct xfrm_state *x;

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		/* fall through - frag-needed is handled like redirect below */
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	/* Only act if the SPI maps to a state we actually own. */
	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
			      esph->spi, IPPROTO_ESP, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
	else
		ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
	xfrm_state_put(x);

	return 0;
}
|
||||
|
||||
/* xfrm_type destructor: release the AEAD transform attached to the
 * state, if one was ever allocated (x->data may still be NULL when
 * esp_init_state() failed early).
 */
static void esp_destroy(struct xfrm_state *x)
{
	struct crypto_aead *aead = x->data;

	if (!aead)
		return;

	crypto_free_aead(aead);
}
|
||||
|
||||
/* Set up a genuine AEAD algorithm (e.g. rfc4106 GCM) for this state:
 * allocate the transform, install key and ICV length from x->aead.
 * On failure the transform stays in x->data and is freed later by
 * esp_destroy().
 */
static int esp_init_aead(struct xfrm_state *x)
{
	struct crypto_aead *aead;
	int err;

	aead = crypto_alloc_aead(x->aead->alg_name, 0, 0);
	err = PTR_ERR(aead);
	if (IS_ERR(aead))
		goto error;

	x->data = aead;

	/* alg_key_len / alg_icv_len are in bits; the API wants bytes. */
	err = crypto_aead_setkey(aead, x->aead->alg_key,
				 (x->aead->alg_key_len + 7) / 8);
	if (err)
		goto error;

	err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
	if (err)
		goto error;

error:
	return err;
}
|
||||
|
||||
/* Build a composite authenc(auth,enc) AEAD from separate auth and
 * encryption algorithms.  The key material is packed into the rtattr
 * format expected by the authenc template: a key-param header, the
 * auth key, then the cipher key.
 */
static int esp_init_authenc(struct xfrm_state *x)
{
	struct crypto_aead *aead;
	struct crypto_authenc_key_param *param;
	struct rtattr *rta;
	char *key;
	char *p;
	char authenc_name[CRYPTO_MAX_ALG_NAME];
	unsigned int keylen;
	int err;

	err = -EINVAL;
	if (x->ealg == NULL)
		goto error;

	err = -ENAMETOOLONG;

	/* ESN states need the authencesn() template so the high sequence
	 * bits get authenticated; otherwise plain authenc().
	 */
	if ((x->props.flags & XFRM_STATE_ESN)) {
		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
			     "authencesn(%s,%s)",
			     x->aalg ? x->aalg->alg_name : "digest_null",
			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
			goto error;
	} else {
		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
			     "authenc(%s,%s)",
			     x->aalg ? x->aalg->alg_name : "digest_null",
			     x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME)
			goto error;
	}

	aead = crypto_alloc_aead(authenc_name, 0, 0);
	err = PTR_ERR(aead);
	if (IS_ERR(aead))
		goto error;

	x->data = aead;

	/* Total packed key: optional auth key + cipher key + rtattr header. */
	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
	err = -ENOMEM;
	key = kmalloc(keylen, GFP_KERNEL);
	if (!key)
		goto error;

	p = key;
	rta = (void *)p;
	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
	rta->rta_len = RTA_LENGTH(sizeof(*param));
	param = RTA_DATA(rta);
	p += RTA_SPACE(sizeof(*param));

	if (x->aalg) {
		struct xfrm_algo_desc *aalg_desc;

		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
		p += (x->aalg->alg_key_len + 7) / 8;

		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
		BUG_ON(!aalg_desc);

		/* The transform's full digest size must agree with the
		 * algorithm descriptor before we truncate the ICV.
		 */
		err = -EINVAL;
		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
		    crypto_aead_authsize(aead)) {
			NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n",
				 x->aalg->alg_name,
				 crypto_aead_authsize(aead),
				 aalg_desc->uinfo.auth.icv_fullbits/8);
			goto free_key;
		}

		err = crypto_aead_setauthsize(
			aead, x->aalg->alg_trunc_len / 8);
		if (err)
			goto free_key;
	}

	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);

	err = crypto_aead_setkey(aead, key, keylen);

free_key:
	kfree(key);

error:
	return err;
}
|
||||
|
||||
static int esp_init_state(struct xfrm_state *x)
|
||||
{
|
||||
struct crypto_aead *aead;
|
||||
u32 align;
|
||||
int err;
|
||||
|
||||
x->data = NULL;
|
||||
|
||||
if (x->aead)
|
||||
err = esp_init_aead(x);
|
||||
else
|
||||
err = esp_init_authenc(x);
|
||||
|
||||
if (err)
|
||||
goto error;
|
||||
|
||||
aead = x->data;
|
||||
|
||||
x->props.header_len = sizeof(struct ip_esp_hdr) +
|
||||
crypto_aead_ivsize(aead);
|
||||
if (x->props.mode == XFRM_MODE_TUNNEL)
|
||||
x->props.header_len += sizeof(struct iphdr);
|
||||
else if (x->props.mode == XFRM_MODE_BEET && x->sel.family != AF_INET6)
|
||||
x->props.header_len += IPV4_BEET_PHMAXLEN;
|
||||
if (x->encap) {
|
||||
struct xfrm_encap_tmpl *encap = x->encap;
|
||||
|
||||
switch (encap->encap_type) {
|
||||
default:
|
||||
goto error;
|
||||
case UDP_ENCAP_ESPINUDP:
|
||||
x->props.header_len += sizeof(struct udphdr);
|
||||
break;
|
||||
case UDP_ENCAP_ESPINUDP_NON_IKE:
|
||||
x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
align = ALIGN(crypto_aead_blocksize(aead), 4);
|
||||
x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
|
||||
|
||||
error:
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Post-receive callback required by struct xfrm4_protocol; ESP has no
 * extra work to do here, so always report success.
 */
static int esp4_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;
}
|
||||
|
||||
/* xfrm type registration for IPv4 ESP: ties the IPPROTO_ESP state
 * machine to the handlers above.  XFRM_TYPE_REPLAY_PROT enables the
 * generic anti-replay window for this protocol.
 */
static const struct xfrm_type esp_type =
{
	.description	= "ESP4",
	.owner		= THIS_MODULE,
	.proto	     	= IPPROTO_ESP,
	.flags		= XFRM_TYPE_REPLAY_PROT,
	.init_state	= esp_init_state,
	.destructor	= esp_destroy,
	.get_mtu	= esp4_get_mtu,
	.input		= esp_input,
	.output		= esp_output
};
|
||||
|
||||
/* Protocol hooks for IPPROTO_ESP on IPv4: generic xfrm receive path
 * plus the ESP-specific callback and ICMP error handler above.
 */
static struct xfrm4_protocol esp4_protocol = {
	.handler	=	xfrm4_rcv,
	.input_handler	=	xfrm_input,
	.cb_handler	=	esp4_rcv_cb,
	.err_handler	=	esp4_err,
	.priority	=	0,
};
|
||||
|
||||
/* Module init: register the ESP xfrm type, then the protocol handler;
 * roll back the type registration if the second step fails.
 */
static int __init esp4_init(void)
{
	if (xfrm_register_type(&esp_type, AF_INET) < 0) {
		pr_info("%s: can't add xfrm type\n", __func__);
		return -EAGAIN;
	}
	if (xfrm4_protocol_register(&esp4_protocol, IPPROTO_ESP) < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		xfrm_unregister_type(&esp_type, AF_INET);
		return -EAGAIN;
	}
	return 0;
}
|
||||
|
||||
/* Module exit: undo esp4_init() in reverse order; failures are only
 * logged since there is nothing else to do at teardown.
 */
static void __exit esp4_fini(void)
{
	if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
		pr_info("%s: can't remove xfrm type\n", __func__);
}
|
||||
|
||||
module_init(esp4_init);
|
||||
module_exit(esp4_fini);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
|
||||
1186
net/ipv4/fib_frontend.c
Normal file
1186
net/ipv4/fib_frontend.c
Normal file
File diff suppressed because it is too large
Load diff
51
net/ipv4/fib_lookup.h
Normal file
51
net/ipv4/fib_lookup.h
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
#ifndef _FIB_LOOKUP_H
|
||||
#define _FIB_LOOKUP_H
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/list.h>
|
||||
#include <net/ip_fib.h>
|
||||
|
||||
/* One (tos, type) variant of a route attached to a FIB node; freed via
 * RCU so lockless lookups can keep dereferencing it.
 */
struct fib_alias {
	struct list_head	fa_list;
	struct fib_info		*fa_info;	/* shared route metadata */
	u8			fa_tos;
	u8			fa_type;
	u8			fa_state;	/* FA_S_* flags */
	struct rcu_head		rcu;
};
|
||||
|
||||
#define FA_S_ACCESSED 0x01
|
||||
|
||||
/* Dont write on fa_state unless needed, to keep it shared on all cpus */
|
||||
/* Mark an alias as used by a lookup.  The test-before-set keeps the
 * cache line shared across CPUs in the common already-set case (see
 * the comment above: avoid writing fa_state unless needed).
 */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
	if (!(fa->fa_state & FA_S_ACCESSED))
		fa->fa_state |= FA_S_ACCESSED;
}
|
||||
|
||||
/* Exported by fib_semantics.c */
|
||||
void fib_release_info(struct fib_info *);
|
||||
struct fib_info *fib_create_info(struct fib_config *cfg);
|
||||
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
|
||||
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
|
||||
u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
|
||||
unsigned int);
|
||||
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
|
||||
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
|
||||
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
|
||||
|
||||
/* Point a lookup result at a fib_info without taking a reference.
 */
static inline void fib_result_assign(struct fib_result *res,
				     struct fib_info *fi)
{
	/* we used to play games with refcounts, but we now use RCU */
	res->fi = fi;
}
|
||||
|
||||
/* Per-route-type properties: the error returned for that type and its
 * default scope (indexed by RTN_* in fib_props[]).
 */
struct fib_prop {
	int	error;
	u8	scope;
};
|
||||
|
||||
extern const struct fib_prop fib_props[RTN_MAX + 1];
|
||||
|
||||
#endif /* _FIB_LOOKUP_H */
|
||||
356
net/ipv4/fib_rules.c
Normal file
356
net/ipv4/fib_rules.c
Normal file
|
|
@ -0,0 +1,356 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* IPv4 Forwarding Information Base: policy rules.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* Thomas Graf <tgraf@suug.ch>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Fixes:
|
||||
* Rani Assaf : local_rule cannot be deleted
|
||||
* Marc Boucher : routing by fwmark
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/export.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/route.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/ip_fib.h>
|
||||
#include <net/fib_rules.h>
|
||||
|
||||
/* IPv4 policy-routing rule: the generic fib_rule plus the IPv4-only
 * selectors (source/destination prefixes, TOS, optional classid).
 */
struct fib4_rule {
	struct fib_rule		common;
	u8			dst_len;	/* prefix lengths in bits */
	u8			src_len;
	u8			tos;
	__be32			src;
	__be32			srcmask;	/* derived from src_len */
	__be32			dst;
	__be32			dstmask;	/* derived from dst_len */
#ifdef CONFIG_IP_ROUTE_CLASSID
	u32			tclassid;	/* routing realm/classid */
#endif
};
|
||||
|
||||
/* Rule-based FIB lookup: walk the policy rules for @flp and fill @res.
 * Results are returned without references (FIB_LOOKUP_NOREF); -ESRCH
 * from the rule walk is translated to the routing error -ENETUNREACH.
 */
int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res)
{
	struct fib_lookup_arg arg = {
		.result = res,
		.flags = FIB_LOOKUP_NOREF,
	};
	int err;

	err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Propagate the matching rule's classid (0 when no rule matched). */
	if (arg.rule)
		res->tclassid = ((struct fib4_rule *)arg.rule)->tclassid;
	else
		res->tclassid = 0;
#endif

	if (err == -ESRCH)
		err = -ENETUNREACH;

	return err;
}
EXPORT_SYMBOL_GPL(__fib_lookup);
|
||||
|
||||
/* Execute a matched rule: map terminal actions to their errors, or for
 * FR_ACT_TO_TBL perform the table lookup.  -EAGAIN tells the rule walk
 * to continue with the next rule.
 */
static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
	int err = -EAGAIN;
	struct fib_table *tbl;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;

	case FR_ACT_UNREACHABLE:
		err = -ENETUNREACH;
		goto errout;

	case FR_ACT_PROHIBIT:
		err = -EACCES;
		goto errout;

	case FR_ACT_BLACKHOLE:
	default:
		err = -EINVAL;
		goto errout;
	}

	/* Missing table: fall through to the next rule (err stays -EAGAIN). */
	tbl = fib_get_table(rule->fr_net, rule->table);
	if (!tbl)
		goto errout;

	err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
	/* Positive return means "no match here"; keep walking rules. */
	if (err > 0)
		err = -EAGAIN;
errout:
	return err;
}
|
||||
|
||||
/* Decide whether a successful lookup result should be suppressed by
 * this rule (prefix too short, or the egress device belongs to a
 * forbidden interface group).  Returns true to discard the result.
 */
static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
	struct fib_result *result = (struct fib_result *) arg->result;
	struct net_device *dev = NULL;

	if (result->fi)
		dev = result->fi->fib_dev;

	/* do not accept result if the route does
	 * not meet the required prefix length
	 */
	if (result->prefixlen <= rule->suppress_prefixlen)
		goto suppress_route;

	/* do not accept result if the route uses a device
	 * belonging to a forbidden interface group
	 */
	if (rule->suppress_ifgroup != -1 && dev && dev->group == rule->suppress_ifgroup)
		goto suppress_route;

	return false;

suppress_route:
	/* Drop the reference the lookup took unless it was ref-less. */
	if (!(arg->flags & FIB_LOOKUP_NOREF))
		fib_info_put(result->fi);
	return true;
}
|
||||
|
||||
static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
|
||||
{
|
||||
struct fib4_rule *r = (struct fib4_rule *) rule;
|
||||
struct flowi4 *fl4 = &fl->u.ip4;
|
||||
__be32 daddr = fl4->daddr;
|
||||
__be32 saddr = fl4->saddr;
|
||||
|
||||
if (((saddr ^ r->src) & r->srcmask) ||
|
||||
((daddr ^ r->dst) & r->dstmask))
|
||||
return 0;
|
||||
|
||||
if (r->tos && (r->tos != fl4->flowi4_tos))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static struct fib_table *fib_empty_table(struct net *net)
|
||||
{
|
||||
u32 id;
|
||||
|
||||
for (id = 1; id <= RT_TABLE_MAX; id++)
|
||||
if (fib_get_table(net, id) == NULL)
|
||||
return fib_new_table(net, id);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Netlink attribute policy: the generic rule attributes plus the
 * IPv4-only FRA_FLOW (realms/classid) attribute.
 */
static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
	FRA_GENERIC_POLICY,
	[FRA_FLOW]	= { .type = NLA_U32 },
};
|
||||
|
||||
/* Fill in the IPv4 parts of a new rule from a netlink request: validate
 * the TOS, auto-allocate a table for table-less FR_ACT_TO_TBL rules,
 * and copy prefixes/classid from the attributes.
 */
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh,
			       struct nlattr **tb)
{
	struct net *net = sock_net(skb->sk);
	int err = -EINVAL;
	struct fib4_rule *rule4 = (struct fib4_rule *) rule;

	if (frh->tos & ~IPTOS_TOS_MASK)
		goto errout;

	/* No explicit table: pick a fresh one for TO_TBL actions. */
	if (rule->table == RT_TABLE_UNSPEC) {
		if (rule->action == FR_ACT_TO_TBL) {
			struct fib_table *table;

			table = fib_empty_table(net);
			if (table == NULL) {
				err = -ENOBUFS;
				goto errout;
			}

			rule->table = table->tb_id;
		}
	}

	if (frh->src_len)
		rule4->src = nla_get_be32(tb[FRA_SRC]);

	if (frh->dst_len)
		rule4->dst = nla_get_be32(tb[FRA_DST]);

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (tb[FRA_FLOW]) {
		rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
		/* Track classid users so lookups know to fill tclassid. */
		if (rule4->tclassid)
			net->ipv4.fib_num_tclassid_users++;
	}
#endif

	rule4->src_len = frh->src_len;
	rule4->srcmask = inet_make_mask(rule4->src_len);
	rule4->dst_len = frh->dst_len;
	rule4->dstmask = inet_make_mask(rule4->dst_len);
	rule4->tos = frh->tos;

	net->ipv4.fib_has_custom_rules = true;
	err = 0;
errout:
	return err;
}
|
||||
|
||||
/* Tear down the IPv4 parts of a rule being removed: drop the classid
 * user count and record that the rule set was customized.
 */
static void fib4_rule_delete(struct fib_rule *rule)
{
	struct net *net = rule->fr_net;
#ifdef CONFIG_IP_ROUTE_CLASSID
	struct fib4_rule *rule4 = (struct fib4_rule *) rule;

	if (rule4->tclassid)
		net->ipv4.fib_num_tclassid_users--;
#endif
	/* NOTE(review): set to true even on delete - deleting a rule is
	 * itself a customization of the default rule set; confirm this
	 * matches the intent elsewhere in the FIB code.
	 */
	net->ipv4.fib_has_custom_rules = true;
}
|
||||
|
||||
/* Compare an existing rule against a netlink request; only fields the
 * request actually carries are checked.  Returns 1 when they match.
 */
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	struct fib4_rule *rule4 = (struct fib4_rule *) rule;

	if (frh->src_len && (rule4->src_len != frh->src_len))
		return 0;

	if (frh->dst_len && (rule4->dst_len != frh->dst_len))
		return 0;

	if (frh->tos && (rule4->tos != frh->tos))
		return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
		return 0;
#endif

	if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC])))
		return 0;

	if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST])))
		return 0;

	return 1;
}
|
||||
|
||||
/* Dump the IPv4 parts of a rule into a netlink message; prefixes and
 * classid are emitted only when set.
 */
static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	struct fib4_rule *rule4 = (struct fib4_rule *) rule;

	frh->dst_len = rule4->dst_len;
	frh->src_len = rule4->src_len;
	frh->tos = rule4->tos;

	if ((rule4->dst_len &&
	     nla_put_be32(skb, FRA_DST, rule4->dst)) ||
	    (rule4->src_len &&
	     nla_put_be32(skb, FRA_SRC, rule4->src)))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rule4->tclassid &&
	    nla_put_u32(skb, FRA_FLOW, rule4->tclassid))
		goto nla_put_failure;
#endif
	return 0;

nla_put_failure:
	return -ENOBUFS;
}
|
||||
|
||||
/* Worst-case netlink payload added by fib4_rule_fill() for one rule.
 */
static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
	return nla_total_size(4) /* dst */
	       + nla_total_size(4) /* src */
	       + nla_total_size(4); /* flow */
}
|
||||
|
||||
/* Rule set changed: invalidate cached routing decisions for this netns.
 */
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
{
	rt_cache_flush(ops->fro_net);
}
|
||||
|
||||
/* Per-netns template wiring the IPv4 handlers above into the generic
 * fib_rules framework; cloned by fib_rules_register() for each netns.
 */
static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
	.family		= AF_INET,
	.rule_size	= sizeof(struct fib4_rule),
	.addr_size	= sizeof(u32),
	.action		= fib4_rule_action,
	.suppress	= fib4_rule_suppress,
	.match		= fib4_rule_match,
	.configure	= fib4_rule_configure,
	.delete		= fib4_rule_delete,
	.compare	= fib4_rule_compare,
	.fill		= fib4_rule_fill,
	.default_pref	= fib_default_rule_pref,
	.nlmsg_payload	= fib4_rule_nlmsg_payload,
	.flush_cache	= fib4_rule_flush_cache,
	.nlgroup	= RTNLGRP_IPV4_RULE,
	.policy		= fib4_rule_policy,
	.owner		= THIS_MODULE,
};
|
||||
|
||||
static int fib_default_rules_init(struct fib_rules_ops *ops)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
|
||||
if (err < 0)
|
||||
return err;
|
||||
err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
|
||||
if (err < 0)
|
||||
return err;
|
||||
err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
|
||||
if (err < 0)
|
||||
return err;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Per-netns setup: register the IPv4 rules ops and seed the default
 * rules; on failure everything registered so far is torn down.
 */
int __net_init fib4_rules_init(struct net *net)
{
	int err;
	struct fib_rules_ops *ops;

	ops = fib_rules_register(&fib4_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	err = fib_default_rules_init(ops);
	if (err < 0)
		goto fail;
	net->ipv4.rules_ops = ops;
	net->ipv4.fib_has_custom_rules = false;
	return 0;

fail:
	/* also cleans all rules already added */
	fib_rules_unregister(ops);
	return err;
}
|
||||
|
||||
/* Per-netns teardown counterpart of fib4_rules_init().
 */
void __net_exit fib4_rules_exit(struct net *net)
{
	fib_rules_unregister(net->ipv4.rules_ops);
}
|
||||
1336
net/ipv4/fib_semantics.c
Normal file
1336
net/ipv4/fib_semantics.c
Normal file
File diff suppressed because it is too large
Load diff
2610
net/ipv4/fib_trie.c
Normal file
2610
net/ipv4/fib_trie.c
Normal file
File diff suppressed because it is too large
Load diff
519
net/ipv4/fou.c
Normal file
519
net/ipv4/fou.c
Normal file
|
|
@ -0,0 +1,519 @@
|
|||
#include <linux/module.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <net/genetlink.h>
|
||||
#include <net/gue.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/udp_tunnel.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <uapi/linux/fou.h>
|
||||
#include <uapi/linux/genetlink.h>
|
||||
|
||||
/* Global list of active FOU listener ports, guarded by fou_lock. */
static DEFINE_SPINLOCK(fou_lock);
static LIST_HEAD(fou_list);

/* One foo-over-UDP listener: its UDP socket, the inner protocol it
 * delivers to, and the GRO offload hooks registered for its port.
 */
struct fou {
	struct socket *sock;
	u8 protocol;
	u16 port;
	struct udp_offload udp_offloads;
	struct list_head list;
};

/* Configuration parsed from netlink for creating/destroying a fou. */
struct fou_cfg {
	u16 type;		/* FOU_ENCAP_DIRECT or FOU_ENCAP_GUE */
	u8 protocol;
	struct udp_port_cfg udp_config;
};
|
||||
|
||||
/* Recover the fou instance attached to a tunnel socket (set in
 * fou_create(); NULL once the listener is being torn down).
 */
static inline struct fou *fou_from_sock(struct sock *sk)
{
	return sk->sk_user_data;
}
|
||||
|
||||
/* Strip the encapsulation and hand the packet back to the IP stack as
 * @protocol.  Returning a negative protocol number tells the UDP
 * encap_rcv machinery to resubmit the skb with that protocol.
 */
static int fou_udp_encap_recv_deliver(struct sk_buff *skb,
				      u8 protocol, size_t len)
{
	struct iphdr *iph = ip_hdr(skb);

	/* Remove 'len' bytes from the packet (UDP header and
	 * FOU header if present), modify the protocol to the one
	 * we found, and then call rcv_encap.
	 */
	iph->tot_len = htons(ntohs(iph->tot_len) - len);
	__skb_pull(skb, len);
	skb_postpull_rcsum(skb, udp_hdr(skb), len);
	skb_reset_transport_header(skb);

	return -protocol;
}
|
||||
|
||||
/* encap_rcv hook for plain FOU: strip only the UDP header and deliver
 * as the configured inner protocol.  Returning 1 (no fou attached)
 * lets UDP process the packet normally.
 */
static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
{
	struct fou *fou = fou_from_sock(sk);

	if (!fou)
		return 1;

	return fou_udp_encap_recv_deliver(skb, fou->protocol,
					  sizeof(struct udphdr));
}
|
||||
|
||||
/* encap_rcv hook for GUE: validate the GUE header (version 0, no flags
 * supported yet), then strip UDP + GUE + options and deliver as the
 * protocol carried in next_hdr.
 */
static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
{
	struct fou *fou = fou_from_sock(sk);
	size_t len;
	struct guehdr *guehdr;
	struct udphdr *uh;

	if (!fou)
		return 1;

	len = sizeof(struct udphdr) + sizeof(struct guehdr);
	if (!pskb_may_pull(skb, len))
		goto drop;

	uh = udp_hdr(skb);
	guehdr = (struct guehdr *)&uh[1];

	/* Pull in the GUE optional fields as well (hlen is in words). */
	len += guehdr->hlen << 2;
	if (!pskb_may_pull(skb, len))
		goto drop;

	/* pskb_may_pull may have reallocated the header; re-read. */
	uh = udp_hdr(skb);
	guehdr = (struct guehdr *)&uh[1];

	if (guehdr->version != 0)
		goto drop;

	if (guehdr->flags) {
		/* No support yet */
		goto drop;
	}

	return fou_udp_encap_recv_deliver(skb, guehdr->next_hdr, len);
drop:
	kfree_skb(skb);
	return 0;
}
|
||||
|
||||
/* GRO receive for plain FOU: simply delegate to the inner protocol's
 * gro_receive (the proto was stashed in NAPI_GRO_CB by the UDP layer).
 */
static struct sk_buff **fou_gro_receive(struct sk_buff **head,
					struct sk_buff *skb)
{
	const struct net_offload *ops;
	struct sk_buff **pp = NULL;
	u8 proto = NAPI_GRO_CB(skb)->proto;
	const struct net_offload **offloads;

	rcu_read_lock();
	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
	ops = rcu_dereference(offloads[proto]);
	if (!ops || !ops->callbacks.gro_receive)
		goto out_unlock;

	pp = ops->callbacks.gro_receive(head, skb);

out_unlock:
	rcu_read_unlock();

	return pp;
}
|
||||
|
||||
/* GRO complete for plain FOU: finish the UDP tunnel aggregation, then
 * delegate to the inner protocol's gro_complete.
 */
static int fou_gro_complete(struct sk_buff *skb, int nhoff)
{
	const struct net_offload *ops;
	u8 proto = NAPI_GRO_CB(skb)->proto;
	int err = -ENOSYS;
	const struct net_offload **offloads;

	udp_tunnel_gro_complete(skb, nhoff);

	rcu_read_lock();
	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
	ops = rcu_dereference(offloads[proto]);
	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
		goto out_unlock;

	err = ops->callbacks.gro_complete(skb, nhoff);

out_unlock:
	rcu_read_unlock();

	return err;
}
|
||||
|
||||
/* GRO receive for GUE: parse the GUE header, flow-match candidate skbs
 * on the full header (base word plus optional fields), then delegate to
 * the inner protocol's gro_receive.
 */
static struct sk_buff **gue_gro_receive(struct sk_buff **head,
					struct sk_buff *skb)
{
	const struct net_offload **offloads;
	const struct net_offload *ops;
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	u8 proto;
	struct guehdr *guehdr;
	unsigned int hlen, guehlen;
	unsigned int off;
	int flush = 1;

	/* Make sure the fixed GUE header is available in the GRO header. */
	off = skb_gro_offset(skb);
	hlen = off + sizeof(*guehdr);
	guehdr = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		guehdr = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!guehdr))
			goto out;
	}

	proto = guehdr->next_hdr;

	rcu_read_lock();
	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
	ops = rcu_dereference(offloads[proto]);
	if (WARN_ON(!ops || !ops->callbacks.gro_receive))
		goto out_unlock;

	/* Now also pull the optional fields (hlen is in 32-bit words). */
	guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);

	hlen = off + guehlen;
	if (skb_gro_header_hard(skb, hlen)) {
		guehdr = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!guehdr))
			goto out_unlock;
	}

	flush = 0;

	for (p = *head; p; p = p->next) {
		const struct guehdr *guehdr2;

		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		guehdr2 = (struct guehdr *)(p->data + off);

		/* Compare base GUE header to be equal (covers
		 * hlen, version, next_hdr, and flags.
		 */
		if (guehdr->word != guehdr2->word) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		/* Compare optional fields are the same. */
		if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1],
					   guehdr->hlen << 2)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	skb_gro_pull(skb, guehlen);

	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
	skb_gro_postpull_rcsum(skb, guehdr, guehlen);

	pp = ops->callbacks.gro_receive(head, skb);

out_unlock:
	rcu_read_unlock();
out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
|
||||
|
||||
/* GRO complete for GUE: skip over the GUE header (fixed part plus
 * options) and delegate to the inner protocol's gro_complete.
 */
static int gue_gro_complete(struct sk_buff *skb, int nhoff)
{
	const struct net_offload **offloads;
	struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
	const struct net_offload *ops;
	unsigned int guehlen;
	u8 proto;
	int err = -ENOENT;

	proto = guehdr->next_hdr;

	guehlen = sizeof(*guehdr) + (guehdr->hlen << 2);

	rcu_read_lock();
	offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
	ops = rcu_dereference(offloads[proto]);
	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
		goto out_unlock;

	err = ops->callbacks.gro_complete(skb, nhoff + guehlen);

out_unlock:
	rcu_read_unlock();
	return err;
}
|
||||
|
||||
/* Add a new listener to the global list, rejecting duplicate ports.
 * Returns -EALREADY if another fou already owns the port.
 */
static int fou_add_to_port_list(struct fou *fou)
{
	struct fou *fout;

	spin_lock(&fou_lock);
	list_for_each_entry(fout, &fou_list, list) {
		if (fou->port == fout->port) {
			spin_unlock(&fou_lock);
			return -EALREADY;
		}
	}

	list_add(&fou->list, &fou_list);
	spin_unlock(&fou_lock);

	return 0;
}
|
||||
|
||||
/* Tear down one listener: unregister its GRO offload, unlink it from
 * the global list, detach it from the socket and free everything.
 * NOTE(review): assumes the caller holds fou_lock for the list_del -
 * confirm against the callers (fou_destroy iterates under fou_lock).
 */
static void fou_release(struct fou *fou)
{
	struct socket *sock = fou->sock;
	struct sock *sk = sock->sk;

	udp_del_offload(&fou->udp_offloads);

	list_del(&fou->list);

	/* Remove hooks into tunnel socket */
	sk->sk_user_data = NULL;

	sock_release(sock);

	kfree(fou);
}
|
||||
|
||||
/* Wire up a plain-FOU listener: install the receive hook and the GRO
 * offload callbacks for its UDP port / inner protocol.
 */
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
	udp_sk(sk)->encap_rcv = fou_udp_recv;
	fou->protocol = cfg->protocol;
	fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
	fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
	fou->udp_offloads.port = cfg->udp_config.local_udp_port;
	fou->udp_offloads.ipproto = cfg->protocol;

	return 0;
}
|
||||
|
||||
/* Wire up a GUE listener: the inner protocol comes from the GUE header
 * at receive time, so only the port-level hooks are installed here.
 */
static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
{
	udp_sk(sk)->encap_rcv = gue_udp_recv;
	fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
	fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
	fou->udp_offloads.port = cfg->udp_config.local_udp_port;

	return 0;
}
|
||||
|
||||
/* Create a FOU/GUE listener from a parsed config: open the UDP socket,
 * hook up the encap type, enable UDP encapsulation, register GRO
 * offloads (IPv4 only) and publish the listener on the port list.
 */
static int fou_create(struct net *net, struct fou_cfg *cfg,
		      struct socket **sockp)
{
	struct fou *fou = NULL;
	int err;
	struct socket *sock = NULL;
	struct sock *sk;

	/* Open UDP socket */
	err = udp_sock_create(net, &cfg->udp_config, &sock);
	if (err < 0)
		goto error;

	/* Allocate FOU port structure */
	fou = kzalloc(sizeof(*fou), GFP_KERNEL);
	if (!fou) {
		err = -ENOMEM;
		goto error;
	}

	sk = sock->sk;

	fou->port = cfg->udp_config.local_udp_port;

	/* Initial for fou type */
	switch (cfg->type) {
	case FOU_ENCAP_DIRECT:
		err = fou_encap_init(sk, fou, cfg);
		if (err)
			goto error;
		break;
	case FOU_ENCAP_GUE:
		err = gue_encap_init(sk, fou, cfg);
		if (err)
			goto error;
		break;
	default:
		err = -EINVAL;
		goto error;
	}

	udp_sk(sk)->encap_type = 1;
	udp_encap_enable();

	sk->sk_user_data = fou;
	fou->sock = sock;

	udp_set_convert_csum(sk, true);

	/* Receive path may run in atomic context. */
	sk->sk_allocation = GFP_ATOMIC;

	if (cfg->udp_config.family == AF_INET) {
		err = udp_add_offload(&fou->udp_offloads);
		if (err)
			goto error;
	}

	err = fou_add_to_port_list(fou);
	if (err)
		goto error;

	if (sockp)
		*sockp = sock;

	return 0;

error:
	/* NOTE(review): on late failures sk->sk_user_data still points at
	 * the fou being freed; sock_release() below drops the socket, but
	 * confirm no receive can race with this window.
	 */
	kfree(fou);
	if (sock)
		sock_release(sock);

	return err;
}
|
||||
|
||||
/* Tear down the FOU port matching cfg->udp_config.local_udp_port.
 *
 * Walks the global fou_list under fou_lock; on a match the GRO
 * offload registration is removed first, then fou_release() drops the
 * socket and frees the entry (which also unlinks it, hence the
 * immediate break).  Returns 0 on success, -EINVAL if no port with
 * that number exists.
 */
static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
	struct fou *fou;
	u16 port = cfg->udp_config.local_udp_port;
	int err = -EINVAL;

	spin_lock(&fou_lock);
	list_for_each_entry(fou, &fou_list, list) {
		if (fou->port == port) {
			/* Stop GRO matching before the entry goes away. */
			udp_del_offload(&fou->udp_offloads);
			fou_release(fou);
			err = 0;
			/* fou is freed by fou_release(); must not keep
			 * iterating with it.
			 */
			break;
		}
	}
	spin_unlock(&fou_lock);

	return err;
}
|
||||
|
||||
/* Generic netlink family through which userspace (e.g. "ip fou")
 * adds and removes FOU receive ports.  netnsok allows use from
 * non-init network namespaces.
 */
static struct genl_family fou_nl_family = {
	.id		= GENL_ID_GENERATE,	/* dynamic family id */
	.hdrsize	= 0,
	.name		= FOU_GENL_NAME,
	.version	= FOU_GENL_VERSION,
	.maxattr	= FOU_ATTR_MAX,
	.netnsok	= true,
};
|
||||
|
||||
/* Attribute validation policy for FOU netlink requests: the UDP port,
 * address family, inner IP protocol and encapsulation type.
 */
static struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
	[FOU_ATTR_PORT] = { .type = NLA_U16, },		/* local UDP port */
	[FOU_ATTR_AF] = { .type = NLA_U8, },		/* AF_INET / AF_INET6 */
	[FOU_ATTR_IPPROTO] = { .type = NLA_U8, },	/* inner protocol */
	[FOU_ATTR_TYPE] = { .type = NLA_U8, },		/* FOU_ENCAP_* */
};
|
||||
|
||||
static int parse_nl_config(struct genl_info *info,
|
||||
struct fou_cfg *cfg)
|
||||
{
|
||||
memset(cfg, 0, sizeof(*cfg));
|
||||
|
||||
cfg->udp_config.family = AF_INET;
|
||||
|
||||
if (info->attrs[FOU_ATTR_AF]) {
|
||||
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
|
||||
|
||||
if (family != AF_INET && family != AF_INET6)
|
||||
return -EINVAL;
|
||||
|
||||
cfg->udp_config.family = family;
|
||||
}
|
||||
|
||||
if (info->attrs[FOU_ATTR_PORT]) {
|
||||
u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]);
|
||||
|
||||
cfg->udp_config.local_udp_port = port;
|
||||
}
|
||||
|
||||
if (info->attrs[FOU_ATTR_IPPROTO])
|
||||
cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]);
|
||||
|
||||
if (info->attrs[FOU_ATTR_TYPE])
|
||||
cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
|
||||
{
|
||||
struct fou_cfg cfg;
|
||||
int err;
|
||||
|
||||
err = parse_nl_config(info, &cfg);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
return fou_create(&init_net, &cfg, NULL);
|
||||
}
|
||||
|
||||
static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
|
||||
{
|
||||
struct fou_cfg cfg;
|
||||
|
||||
parse_nl_config(info, &cfg);
|
||||
|
||||
return fou_destroy(&init_net, &cfg);
|
||||
}
|
||||
|
||||
/* Generic netlink operations: add/remove FOU ports.  Both require
 * CAP_NET_ADMIN (GENL_ADMIN_PERM) and validate attributes against
 * fou_nl_policy.
 */
static const struct genl_ops fou_nl_ops[] = {
	{
		.cmd = FOU_CMD_ADD,
		.doit = fou_nl_cmd_add_port,
		.policy = fou_nl_policy,
		.flags = GENL_ADMIN_PERM,
	},
	{
		.cmd = FOU_CMD_DEL,
		.doit = fou_nl_cmd_rm_port,
		.policy = fou_nl_policy,
		.flags = GENL_ADMIN_PERM,
	},
};
|
||||
|
||||
/* Module init: register the FOU generic netlink family and its ops.
 * There is nothing else to set up, so the registration result is the
 * module's init result.
 */
static int __init fou_init(void)
{
	return genl_register_family_with_ops(&fou_nl_family, fou_nl_ops);
}
|
||||
|
||||
/* Module exit: stop accepting netlink commands first, then release
 * every remaining FOU port under fou_lock.  The _safe iterator is
 * required because fou_release() unlinks and frees the entry.
 */
static void __exit fou_fini(void)
{
	struct fou *fou, *next;

	genl_unregister_family(&fou_nl_family);

	/* Close all the FOU sockets */

	spin_lock(&fou_lock);
	list_for_each_entry_safe(fou, next, &fou_list, list)
		fou_release(fou);
	spin_unlock(&fou_lock);
}
|
||||
|
||||
/* Module entry/exit points and metadata for fou.ko. */
module_init(fou_init);
module_exit(fou_fini);
MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
MODULE_LICENSE("GPL");
|
||||
390
net/ipv4/geneve.c
Normal file
390
net/ipv4/geneve.c
Normal file
|
|
@ -0,0 +1,390 @@
|
|||
/*
|
||||
* Geneve: Generic Network Virtualization Encapsulation
|
||||
*
|
||||
* Copyright (c) 2014 Nicira, Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rculist.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/ethtool.h>
|
||||
#include <net/arp.h>
|
||||
#include <net/ndisc.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/ip_tunnels.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/rtnetlink.h>
|
||||
#include <net/route.h>
|
||||
#include <net/dsfield.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
#include <net/geneve.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/udp_tunnel.h>
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
#include <net/ipv6.h>
|
||||
#include <net/addrconf.h>
|
||||
#include <net/ip6_tunnel.h>
|
||||
#include <net/ip6_checksum.h>
|
||||
#endif
|
||||
|
||||
/* Size of the per-namespace socket hash table (256 buckets). */
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)

/* per-network namespace private data for this module */
struct geneve_net {
	struct hlist_head	sock_list[PORT_HASH_SIZE];	/* keyed by UDP port */
	spinlock_t		sock_lock;   /* Protects sock_list */
};

/* net_generic() id allocated by register_pernet_subsys(). */
static int geneve_net_id;

/* Workqueue on which deferred socket teardown (geneve_del_work) runs. */
static struct workqueue_struct *geneve_wq;
|
||||
|
||||
/* The Geneve header sits immediately after the UDP header. */
static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
{
	void *hdr = udp_hdr(skb) + 1;

	return (struct genevehdr *)hdr;
}
|
||||
|
||||
/* Return the hash bucket holding sockets bound to @port in @net. */
static struct hlist_head *gs_head(struct net *net, __be16 port)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	unsigned int bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &gn->sock_list[bucket];
}
|
||||
|
||||
/* Find geneve socket based on network namespace and UDP port.
 *
 * Walks the hash bucket with an RCU-safe iterator; callers must hold
 * rcu_read_lock() or gn->sock_lock.  Returns NULL when no socket is
 * bound to @port.
 */
static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port)
{
	struct geneve_sock *gs;

	hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) {
		if (inet_sk(gs->sock->sk)->inet_sport == port)
			return gs;
	}

	return NULL;
}
|
||||
|
||||
static void geneve_build_header(struct genevehdr *geneveh,
|
||||
__be16 tun_flags, u8 vni[3],
|
||||
u8 options_len, u8 *options)
|
||||
{
|
||||
geneveh->ver = GENEVE_VER;
|
||||
geneveh->opt_len = options_len / 4;
|
||||
geneveh->oam = !!(tun_flags & TUNNEL_OAM);
|
||||
geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
|
||||
geneveh->rsvd1 = 0;
|
||||
memcpy(geneveh->vni, vni, 3);
|
||||
geneveh->proto_type = htons(ETH_P_TEB);
|
||||
geneveh->rsvd2 = 0;
|
||||
|
||||
memcpy(geneveh->options, options, options_len);
|
||||
}
|
||||
|
||||
/* Transmit a fully formatted Geneve frame.
 *
 * When calling this function the skb->data should point to the inner
 * frame; this function prepends the Geneve header and then hands the
 * packet to the UDP tunnel layer, which adds the UDP/IP headers.
 *
 * Note: the skb may be reallocated or consumed by the helpers below;
 * on error the skb has already been freed by the failing helper.
 */
int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
		    struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
		    __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
		    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
		    bool xnet)
{
	struct genevehdr *gnvh;
	int min_headroom;
	int err;

	/* Worst-case headroom: link layer + dst header + Geneve (with
	 * options) + outer IP + possible VLAN tag.
	 */
	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
			+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);

	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		return err;

	/* Move a hw-accel VLAN tag into the payload before encapsulating. */
	skb = vlan_hwaccel_push_inside(skb);
	if (unlikely(!skb))
		return -ENOMEM;

	/* Resolve checksum/GSO offload state for the tunnel path. */
	skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx);
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);

	skb_set_inner_protocol(skb, htons(ETH_P_TEB));

	return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst,
				   tos, ttl, df, src_port, dst_port, xnet);
}
EXPORT_SYMBOL_GPL(geneve_xmit_skb);
|
||||
|
||||
static void geneve_notify_add_rx_port(struct geneve_sock *gs)
|
||||
{
|
||||
struct sock *sk = gs->sock->sk;
|
||||
sa_family_t sa_family = sk->sk_family;
|
||||
int err;
|
||||
|
||||
if (sa_family == AF_INET) {
|
||||
err = udp_add_offload(&gs->udp_offloads);
|
||||
if (err)
|
||||
pr_warn("geneve: udp_add_offload failed with status %d\n",
|
||||
err);
|
||||
}
|
||||
}
|
||||
|
||||
static void geneve_notify_del_rx_port(struct geneve_sock *gs)
|
||||
{
|
||||
struct sock *sk = gs->sock->sk;
|
||||
sa_family_t sa_family = sk->sk_family;
|
||||
|
||||
if (sa_family == AF_INET)
|
||||
udp_del_offload(&gs->udp_offloads);
|
||||
}
|
||||
|
||||
/* Callback from net/ipv4/udp.c to receive packets.
 *
 * Return convention (udp encap_rcv): 0 means the skb was consumed
 * here (delivered or dropped); 1 hands the skb back to the normal
 * UDP receive path.
 */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct genevehdr *geneveh;
	struct geneve_sock *gs;
	int opts_len;

	/* Need Geneve and inner Ethernet header to be present */
	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
		goto error;

	/* Return packets with reserved bits set */
	geneveh = geneve_hdr(skb);

	if (unlikely(geneveh->ver != GENEVE_VER))
		goto error;

	/* Only Ethernet payloads are handled here. */
	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
		goto error;

	/* opt_len is in 4-byte words; strip base header plus options. */
	opts_len = geneveh->opt_len * 4;
	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
				 htons(ETH_P_TEB)))
		goto drop;

	/* Socket user data is the geneve_sock (RCU-protected). */
	gs = rcu_dereference_sk_user_data(sk);
	if (!gs)
		goto drop;

	/* Hand the decapsulated frame to the registered receiver. */
	gs->rcv(gs, skb);
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;

error:
	/* Let the UDP layer deal with the skb */
	return 1;
}
|
||||
|
||||
/* Deferred teardown of a geneve socket, run on geneve_wq: release the
 * underlying UDP tunnel socket and free the geneve_sock after an RCU
 * grace period (readers may still hold a reference via RCU).
 */
static void geneve_del_work(struct work_struct *work)
{
	struct geneve_sock *gs = container_of(work, struct geneve_sock,
					      del_work);

	udp_tunnel_sock_release(gs->sock);
	kfree_rcu(gs, rcu);
}
|
||||
|
||||
static struct socket *geneve_create_sock(struct net *net, bool ipv6,
|
||||
__be16 port)
|
||||
{
|
||||
struct socket *sock;
|
||||
struct udp_port_cfg udp_conf;
|
||||
int err;
|
||||
|
||||
memset(&udp_conf, 0, sizeof(udp_conf));
|
||||
|
||||
if (ipv6) {
|
||||
udp_conf.family = AF_INET6;
|
||||
} else {
|
||||
udp_conf.family = AF_INET;
|
||||
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
|
||||
}
|
||||
|
||||
udp_conf.local_udp_port = port;
|
||||
|
||||
/* Open UDP socket */
|
||||
err = udp_sock_create(net, &udp_conf, &sock);
|
||||
if (err < 0)
|
||||
return ERR_PTR(err);
|
||||
|
||||
return sock;
|
||||
}
|
||||
|
||||
/* Create new listen socket if needed.
 *
 * Allocates a geneve_sock, opens the UDP socket, publishes it in the
 * per-namespace hash (and registers GRO offload) under sock_lock, and
 * finally marks the socket as an encapsulation socket so UDP hands
 * its packets to geneve_udp_encap_recv.  Returns the new geneve_sock
 * or an ERR_PTR.
 */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
						geneve_rcv_t *rcv, void *data,
						bool ipv6)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;
	struct socket *sock;
	struct udp_tunnel_sock_cfg tunnel_cfg;

	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
	if (!gs)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&gs->del_work, geneve_del_work);

	sock = geneve_create_sock(net, ipv6, port);
	if (IS_ERR(sock)) {
		kfree(gs);
		return ERR_CAST(sock);
	}

	gs->sock = sock;
	/* Initial reference, dropped by geneve_sock_release(). */
	atomic_set(&gs->refcnt, 1);
	gs->rcv = rcv;
	gs->rcv_data = data;

	/* Initialize the geneve udp offloads structure */
	gs->udp_offloads.port = port;
	gs->udp_offloads.callbacks.gro_receive = NULL;
	gs->udp_offloads.callbacks.gro_complete = NULL;

	/* Publish in the hash and register offloads atomically with
	 * respect to other creators/destroyers.
	 */
	spin_lock(&gn->sock_lock);
	hlist_add_head_rcu(&gs->hlist, gs_head(net, port));
	geneve_notify_add_rx_port(gs);
	spin_unlock(&gn->sock_lock);

	/* Mark socket as an encapsulation socket */
	tunnel_cfg.sk_user_data = gs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;
	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return gs;
}
|
||||
|
||||
/* Obtain a geneve socket for @port: create one, or — if creation
 * failed (typically because the port is already bound) and sharing is
 * allowed — take a reference on the existing socket with the same
 * receive callback.  Returns the socket or an ERR_PTR (-EINVAL when
 * sharing is disallowed or no socket exists, -EBUSY when the existing
 * socket has a different rcv or is going away).
 */
struct geneve_sock *geneve_sock_add(struct net *net, __be16 port,
				    geneve_rcv_t *rcv, void *data,
				    bool no_share, bool ipv6)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;

	gs = geneve_socket_create(net, port, rcv, data, ipv6);
	if (!IS_ERR(gs))
		return gs;

	if (no_share)	/* Return error if sharing is not allowed. */
		return ERR_PTR(-EINVAL);

	spin_lock(&gn->sock_lock);
	gs = geneve_find_sock(net, port);
	/* atomic_add_unless refuses refcnt==0, i.e. a socket that is
	 * already being torn down.
	 */
	if (gs && ((gs->rcv != rcv) ||
		   !atomic_add_unless(&gs->refcnt, 1, 0)))
		gs = ERR_PTR(-EBUSY);
	spin_unlock(&gn->sock_lock);

	if (!gs)
		gs = ERR_PTR(-EINVAL);

	return gs;
}
EXPORT_SYMBOL_GPL(geneve_sock_add);
|
||||
|
||||
/* Drop a reference on a geneve socket.  The last reference unhashes
 * the socket and removes its offload registration under sock_lock,
 * then defers the actual socket release and free to the workqueue
 * (this path may be called from contexts that cannot sleep).
 */
void geneve_sock_release(struct geneve_sock *gs)
{
	struct net *net = sock_net(gs->sock->sk);
	struct geneve_net *gn = net_generic(net, geneve_net_id);

	if (!atomic_dec_and_test(&gs->refcnt))
		return;

	spin_lock(&gn->sock_lock);
	hlist_del_rcu(&gs->hlist);
	geneve_notify_del_rx_port(gs);
	spin_unlock(&gn->sock_lock);

	queue_work(geneve_wq, &gs->del_work);
}
EXPORT_SYMBOL_GPL(geneve_sock_release);
|
||||
|
||||
static __net_init int geneve_init_net(struct net *net)
|
||||
{
|
||||
struct geneve_net *gn = net_generic(net, geneve_net_id);
|
||||
unsigned int h;
|
||||
|
||||
spin_lock_init(&gn->sock_lock);
|
||||
|
||||
for (h = 0; h < PORT_HASH_SIZE; ++h)
|
||||
INIT_HLIST_HEAD(&gn->sock_list[h]);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Pernet hooks: allocate/initialize struct geneve_net per namespace.
 * No exit hook — sockets are reference counted and released by their
 * users before a namespace goes away.
 */
static struct pernet_operations geneve_net_ops = {
	.init = geneve_init_net,
	.exit = NULL,
	.id   = &geneve_net_id,
	.size = sizeof(struct geneve_net),
};
|
||||
|
||||
/* Module init: create the teardown workqueue and register per-netns
 * state.  Returns 0 or a negative errno; on failure nothing is left
 * allocated.
 */
static int __init geneve_init_module(void)
{
	int rc;

	geneve_wq = alloc_workqueue("geneve", 0, 0);
	if (!geneve_wq)
		return -ENOMEM;

	rc = register_pernet_subsys(&geneve_net_ops);
	if (rc) {
		/* BUG FIX: the workqueue was leaked when pernet
		 * registration failed.
		 */
		destroy_workqueue(geneve_wq);
		return rc;
	}

	pr_info("Geneve driver\n");

	return 0;
}
late_initcall(geneve_init_module);
|
||||
|
||||
/* Module exit: flush and destroy the teardown workqueue (running any
 * pending geneve_del_work items), then unregister per-netns state.
 */
static void __exit geneve_cleanup_module(void)
{
	destroy_workqueue(geneve_wq);
	unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);
|
||||
|
||||
/* Module metadata for geneve.ko. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>");
MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");
|
||||
367
net/ipv4/gre_demux.c
Normal file
367
net/ipv4/gre_demux.c
Normal file
|
|
@ -0,0 +1,367 @@
|
|||
/*
|
||||
* GRE over IPv4 demultiplexer driver
|
||||
*
|
||||
* Authors: Dmitry Kozlov (xeb@mail.ru)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/if.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/if_tunnel.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/gre.h>
|
||||
|
||||
#include <net/icmp.h>
|
||||
#include <net/route.h>
|
||||
#include <net/xfrm.h>
|
||||
|
||||
/* RCU-protected handler tables: one slot per GRE version, and one per
 * priority for version-0 ("Cisco") sub-protocols.  Slots are claimed
 * and vacated with cmpxchg (see gre_add_protocol/gre_cisco_register).
 */
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
|
||||
/* Register a handler for GRE version @version.  Returns 0 on success,
 * -EINVAL for an out-of-range version, -EBUSY if the slot is taken.
 */
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
	if (version >= GREPROTO_MAX)
		return -EINVAL;

	/* Atomically claim the slot; fail if another handler owns it. */
	if (cmpxchg((const struct gre_protocol **)&gre_proto[version],
		    NULL, proto) != NULL)
		return -EBUSY;

	return 0;
}
EXPORT_SYMBOL_GPL(gre_add_protocol);
|
||||
|
||||
/* Unregister a GRE version handler.  Only the registered owner may
 * remove itself (-EBUSY otherwise).  Waits for an RCU grace period so
 * no CPU still runs the handler when this returns.
 */
int gre_del_protocol(const struct gre_protocol *proto, u8 version)
{
	if (version >= GREPROTO_MAX)
		return -EINVAL;

	if (cmpxchg((const struct gre_protocol **)&gre_proto[version],
		    proto, NULL) != proto)
		return -EBUSY;

	/* Let in-flight RCU readers finish before the caller frees proto. */
	synchronize_rcu();
	return 0;
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
|
||||
|
||||
/* Prepend a GRE header of @hdr_len bytes described by @tpi.
 *
 * The optional fields (checksum, key, sequence) are written back to
 * front: ptr starts at the last 4 bytes of the header and moves down
 * as each present field is filled, matching the on-wire order
 * csum|key|seq after the base header.
 */
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		      int hdr_len)
{
	struct gre_base_hdr *greh;

	skb_push(skb, hdr_len);

	skb_reset_transport_header(skb);
	greh = (struct gre_base_hdr *)skb->data;
	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
	greh->protocol = tpi->proto;

	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
		/* Last 32-bit word of the header; fields fill downward. */
		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

		if (tpi->flags&TUNNEL_SEQ) {
			*ptr = tpi->seq;
			ptr--;
		}
		if (tpi->flags&TUNNEL_KEY) {
			*ptr = tpi->key;
			ptr--;
		}
		/* Software checksum only when GSO won't compute it later. */
		if (tpi->flags&TUNNEL_CSUM &&
		    !(skb_shinfo(skb)->gso_type &
		      (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) {
			*ptr = 0;
			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
								 skb->len, 0));
		}
	}
}
EXPORT_SYMBOL_GPL(gre_build_header);
|
||||
|
||||
/* Parse and strip a GRE header into @tpi.
 *
 * Rejects non-zero version and routing-present headers.  Validates
 * the checksum if GRE_CSUM is set (*csum_err is set on a bad csum so
 * callers can distinguish checksum failures from malformed headers).
 * Returns the result of iptunnel_pull_header() (0 on success) or a
 * negative errno.
 */
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
			    bool *csum_err)
{
	const struct gre_base_hdr *greh;
	__be32 *options;
	int hdr_len;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
		return -EINVAL;

	greh = (struct gre_base_hdr *)skb_transport_header(skb);
	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
		return -EINVAL;

	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
	hdr_len = ip_gre_calc_hlen(tpi->flags);

	if (!pskb_may_pull(skb, hdr_len))
		return -EINVAL;

	/* pskb_may_pull may have moved skb->data; re-read the header. */
	greh = (struct gre_base_hdr *)skb_transport_header(skb);
	tpi->proto = greh->protocol;

	options = (__be32 *)(greh + 1);
	if (greh->flags & GRE_CSUM) {
		if (skb_checksum_simple_validate(skb)) {
			*csum_err = true;
			return -EINVAL;
		}

		skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
					 null_compute_pseudo);

		options++;
	}

	if (greh->flags & GRE_KEY) {
		tpi->key = *options;
		options++;
	} else
		tpi->key = 0;

	if (unlikely(greh->flags & GRE_SEQ)) {
		tpi->seq = *options;
		options++;
	} else
		tpi->seq = 0;

	/* WCCP version 1 and 2 protocol decoding.
	 * - Change protocol to IP
	 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
	 */
	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
		tpi->proto = htons(ETH_P_IP);
		/* Not an IPv4 header right after GRE => WCCPv2 redirect
		 * header is present; account for its 4 bytes.
		 */
		if ((*(u8 *)options & 0xF0) != 0x40) {
			hdr_len += 4;
			if (!pskb_may_pull(skb, hdr_len))
				return -EINVAL;
		}
	}

	return iptunnel_pull_header(skb, hdr_len, tpi->proto);
}
|
||||
|
||||
/* Receive handler for version-0 GRE: parse the header, then offer the
 * packet to each registered sub-protocol in priority order.  If none
 * accepts it, answer with ICMP port-unreachable and drop.  Always
 * consumes the skb (returns 0).
 */
static int gre_cisco_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	int i;
	bool csum_err = false;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	if (parse_gre_header(skb, &tpi, &csum_err) < 0)
		goto drop;

	rcu_read_lock();
	for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
		struct gre_cisco_protocol *proto;
		int ret;

		proto = rcu_dereference(gre_cisco_proto_list[i]);
		if (!proto)
			continue;
		ret = proto->handler(skb, &tpi);
		if (ret == PACKET_RCVD) {
			/* Handler took ownership of the skb. */
			rcu_read_unlock();
			return 0;
		}
	}
	rcu_read_unlock();

	/* No handler claimed the packet: tell the sender. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}
|
||||
|
||||
/* ICMP error handler for version-0 GRE packets we originated. */
static void gre_cisco_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means, that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put GRE key to the third word
	 * in GRE header. It makes impossible maintaining even soft
	 * state for keyed
	 * GRE tunnels with enabled checksum. Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by Cisco employee,
	 * what the hell these idiots break standards established
	 * by themselves???
	 */

	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int i;

	/* Parse failures other than checksum errors abort; checksum
	 * errors are tolerated because only a truncated payload is
	 * available here (see comment above).
	 */
	if (parse_gre_header(skb, &tpi, &csum_err)) {
		if (!csum_err)		/* ignore csum errors. */
			return;
	}

	/* PMTU and redirect updates are handled generically here. */
	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	/* Otherwise let the first interested sub-protocol handle it. */
	rcu_read_lock();
	for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
		struct gre_cisco_protocol *proto;

		proto = rcu_dereference(gre_cisco_proto_list[i]);
		if (!proto)
			continue;

		if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
			goto out;

	}
out:
	rcu_read_unlock();
}
|
||||
|
||||
/* Top-level IPPROTO_GRE receive: demultiplex on the GRE version bits
 * (byte 1, low 3 bits of the on-wire flags word) and dispatch to the
 * registered version handler under RCU.  Drops the packet when no
 * handler is registered.
 */
static int gre_rcv(struct sk_buff *skb)
{
	const struct gre_protocol *proto;
	u8 ver;
	int ret;

	/* Enough for base header plus the largest optional fields
	 * any version handler peeks at.
	 */
	if (!pskb_may_pull(skb, 12))
		goto drop;

	ver = skb->data[1]&0x7f;
	if (ver >= GREPROTO_MAX)
		goto drop;

	rcu_read_lock();
	proto = rcu_dereference(gre_proto[ver]);
	if (!proto || !proto->handler)
		goto drop_unlock;
	ret = proto->handler(skb);
	rcu_read_unlock();
	return ret;

drop_unlock:
	rcu_read_unlock();
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
|
||||
|
||||
/* Top-level IPPROTO_GRE ICMP error hook: extract the GRE version from
 * the embedded packet (which still starts at the IP header here) and
 * forward the error to that version's handler, if any.
 */
static void gre_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
	const struct gre_protocol *proto;

	if (ver >= GREPROTO_MAX)
		return;

	rcu_read_lock();
	proto = rcu_dereference(gre_proto[ver]);
	if (proto && proto->err_handler)
		proto->err_handler(skb, info);
	rcu_read_unlock();
}
|
||||
|
||||
/* IP-level hook: all IPPROTO_GRE traffic and related ICMP errors
 * enter through gre_rcv()/gre_err().
 */
static const struct net_protocol net_gre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
	.netns_ok    = 1,
};

/* Version-0 ("Cisco") GRE handler registered in the version table. */
static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_cisco_rcv,
	.err_handler = gre_cisco_err,
};
|
||||
|
||||
int gre_cisco_register(struct gre_cisco_protocol *newp)
|
||||
{
|
||||
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
|
||||
&gre_cisco_proto_list[newp->priority];
|
||||
|
||||
return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(gre_cisco_register);
|
||||
|
||||
int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
|
||||
{
|
||||
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
|
||||
&gre_cisco_proto_list[del_proto->priority];
|
||||
int ret;
|
||||
|
||||
ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
synchronize_net();
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(gre_cisco_unregister);
|
||||
|
||||
/* Module init: hook IPPROTO_GRE into the IP stack, then install the
 * version-0 handler.  Registrations are unwound in reverse order on
 * failure.  Note: any registration failure is reported as -EAGAIN.
 */
static int __init gre_init(void)
{
	pr_info("GRE over IPv4 demultiplexor driver\n");

	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
		pr_err("can't add protocol\n");
		goto err;
	}

	if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
		pr_info("%s: can't add ipgre handler\n", __func__);
		goto err_gre;
	}

	return 0;
err_gre:
	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
err:
	return -EAGAIN;
}
|
||||
|
||||
/* Module exit: remove registrations in reverse order of gre_init(). */
static void __exit gre_exit(void)
{
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
|
||||
|
||||
/* Module entry/exit points and metadata for gre.ko (demux). */
module_init(gre_init);
module_exit(gre_exit);

MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
MODULE_LICENSE("GPL");
|
||||
268
net/ipv4/gre_offload.c
Normal file
268
net/ipv4/gre_offload.c
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
/*
|
||||
* IPV4 GSO/GRO offload support
|
||||
* Linux INET implementation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* GRE GSO support
|
||||
*/
|
||||
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/init.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/gre.h>
|
||||
|
||||
/* GSO segmentation for GRE-encapsulated skbs.
 *
 * Strips the GRE header, segments the inner packet with the device's
 * hw_enc_features, then re-prepends the tunnel headers on every
 * resulting segment, recomputing the GRE checksum per segment when
 * GRE_CSUM is set.  Returns the segment list or an ERR_PTR; on error
 * the original skb's headers are restored via skb_gso_error_unwind().
 */
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
				       netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	netdev_features_t enc_features;
	int ghl;
	struct gre_base_hdr *greh;
	/* Saved outer-header state, restored on each segment below. */
	u16 mac_offset = skb->mac_header;
	int mac_len = skb->mac_len;
	__be16 protocol = skb->protocol;
	int tnl_hlen;
	bool csum;

	/* Refuse gso_type combinations this path cannot produce. */
	if (unlikely(skb_shinfo(skb)->gso_type &
				~(SKB_GSO_TCPV4 |
				  SKB_GSO_TCPV6 |
				  SKB_GSO_UDP |
				  SKB_GSO_DODGY |
				  SKB_GSO_TCP_ECN |
				  SKB_GSO_GRE |
				  SKB_GSO_GRE_CSUM |
				  SKB_GSO_IPIP)))
		goto out;

	if (!skb->encapsulation)
		goto out;

	if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
		goto out;

	greh = (struct gre_base_hdr *)skb_transport_header(skb);

	/* Full GRE header length = gap between outer transport header
	 * and the inner MAC header.
	 */
	ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
	if (unlikely(ghl < sizeof(*greh)))
		goto out;

	csum = !!(greh->flags & GRE_CSUM);
	if (csum)
		skb->encap_hdr_csum = 1;

	/* setup inner skb. */
	skb->protocol = greh->protocol;
	skb->encapsulation = 0;

	if (unlikely(!pskb_may_pull(skb, ghl)))
		goto out;

	__skb_pull(skb, ghl);
	skb_reset_mac_header(skb);
	skb_set_network_header(skb, skb_inner_network_offset(skb));
	skb->mac_len = skb_inner_network_offset(skb);

	/* segment inner packet. */
	enc_features = skb->dev->hw_enc_features & features;
	segs = skb_mac_gso_segment(skb, enc_features);
	if (IS_ERR_OR_NULL(segs)) {
		skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
		goto out;
	}

	skb = segs;
	tnl_hlen = skb_tnl_header_len(skb);
	do {
		__skb_push(skb, ghl);
		if (csum) {
			__be32 *pcsum;

			/* Checksumming may touch shared page frags;
			 * linearize first to keep them private.
			 */
			if (skb_has_shared_frag(skb)) {
				int err;

				err = __skb_linearize(skb);
				if (err) {
					kfree_skb_list(segs);
					segs = ERR_PTR(err);
					goto out;
				}
			}

			skb_reset_transport_header(skb);

			greh = (struct gre_base_hdr *)
			    skb_transport_header(skb);
			/* Checksum field is the first word after the
			 * base header.
			 */
			pcsum = (__be32 *)(greh + 1);
			*pcsum = 0;
			*(__sum16 *)pcsum = gso_make_checksum(skb, 0);
		}
		__skb_push(skb, tnl_hlen - ghl);

		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;

		/* Restore the saved outer-header layout. */
		skb_reset_mac_header(skb);
		skb_set_network_header(skb, mac_len);
		skb->mac_len = mac_len;
		skb->protocol = protocol;
	} while ((skb = skb->next));
out:
	return segs;
}
|
||||
|
||||
/* GRO receive for GRE: validate the header, optionally verify the GRE
 * checksum, mark packets from other tunnels as different flows, then
 * strip the GRE header and hand the inner packet to the payload
 * type's gro_receive.  Sets NAPI_GRO_CB(skb)->flush on anything that
 * cannot be aggregated.
 */
static struct sk_buff **gre_gro_receive(struct sk_buff **head,
					struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct sk_buff *p;
	const struct gre_base_hdr *greh;
	unsigned int hlen, grehlen;
	unsigned int off;
	int flush = 1;
	struct packet_offload *ptype;
	__be16 type;

	off = skb_gro_offset(skb);
	hlen = off + sizeof(*greh);
	greh = skb_gro_header_fast(skb, off);
	if (skb_gro_header_hard(skb, hlen)) {
		/* Base header not in the linear area; pull it in. */
		greh = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!greh))
			goto out;
	}

	/* Only support version 0 and K (key), C (csum) flags. Note that
	 * although the support for the S (seq#) flag can be added easily
	 * for GRO, this is problematic for GSO hence can not be enabled
	 * here because a GRO pkt may end up in the forwarding path, thus
	 * requiring GSO support to break it up correctly.
	 */
	if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
		goto out;

	type = greh->protocol;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(type);
	if (ptype == NULL)
		goto out_unlock;

	/* Header length grows by one 4-byte section per present flag. */
	grehlen = GRE_HEADER_SECTION;

	if (greh->flags & GRE_KEY)
		grehlen += GRE_HEADER_SECTION;

	if (greh->flags & GRE_CSUM)
		grehlen += GRE_HEADER_SECTION;

	hlen = off + grehlen;
	if (skb_gro_header_hard(skb, hlen)) {
		greh = skb_gro_header_slow(skb, hlen, off);
		if (unlikely(!greh))
			goto out_unlock;
	}

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
		if (skb_gro_checksum_simple_validate(skb))
			goto out_unlock;

		skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
					     null_compute_pseudo);
	}

	flush = 0;

	for (p = *head; p; p = p->next) {
		const struct gre_base_hdr *greh2;

		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		/* The following checks are needed to ensure only pkts
		 * from the same tunnel are considered for aggregation.
		 * The criteria for "the same tunnel" includes:
		 * 1) same version (we only support version 0 here)
		 * 2) same protocol (we only support ETH_P_IP for now)
		 * 3) same set of flags
		 * 4) same key if the key field is present.
		 */
		greh2 = (struct gre_base_hdr *)(p->data + off);

		if (greh2->flags != greh->flags ||
		    greh2->protocol != greh->protocol) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
		if (greh->flags & GRE_KEY) {
			/* compare keys */
			if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) {
				NAPI_GRO_CB(p)->same_flow = 0;
				continue;
			}
		}
	}

	skb_gro_pull(skb, grehlen);

	/* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/
	skb_gro_postpull_rcsum(skb, greh, grehlen);

	pp = ptype->callbacks.gro_receive(head, skb);

out_unlock:
	rcu_read_unlock();
out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
|
||||
|
||||
static int gre_gro_complete(struct sk_buff *skb, int nhoff)
|
||||
{
|
||||
struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff);
|
||||
struct packet_offload *ptype;
|
||||
unsigned int grehlen = sizeof(*greh);
|
||||
int err = -ENOENT;
|
||||
__be16 type;
|
||||
|
||||
skb->encapsulation = 1;
|
||||
skb_shinfo(skb)->gso_type = SKB_GSO_GRE;
|
||||
|
||||
type = greh->protocol;
|
||||
if (greh->flags & GRE_KEY)
|
||||
grehlen += GRE_HEADER_SECTION;
|
||||
|
||||
if (greh->flags & GRE_CSUM)
|
||||
grehlen += GRE_HEADER_SECTION;
|
||||
|
||||
rcu_read_lock();
|
||||
ptype = gro_find_complete_by_type(type);
|
||||
if (ptype != NULL)
|
||||
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
skb_set_inner_mac_header(skb, nhoff + grehlen);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static const struct net_offload gre_offload = {
|
||||
.callbacks = {
|
||||
.gso_segment = gre_gso_segment,
|
||||
.gro_receive = gre_gro_receive,
|
||||
.gro_complete = gre_gro_complete,
|
||||
},
|
||||
};
|
||||
|
||||
/* Register the GRE offload handlers at boot. */
static int __init gre_offload_init(void)
{
	return inet_add_offload(&gre_offload, IPPROTO_GRE);
}
device_initcall(gre_offload_init);

1206	net/ipv4/icmp.c	(new file) — file diff suppressed because it is too large; load diff to view.
2800	net/ipv4/igmp.c	(new file) — file diff suppressed because it is too large; load diff to view.
940	net/ipv4/inet_connection_sock.c	(new file)
@@ -0,0 +1,940 @@
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Support for INET connection oriented protocols.
 *
 * Authors:	See the TCP sources
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <linux/jhash.h>

#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>

#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
#endif

void inet_get_local_port_range(struct net *net, int *low, int *high)
|
||||
{
|
||||
unsigned int seq;
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
|
||||
|
||||
*low = net->ipv4.ip_local_ports.range[0];
|
||||
*high = net->ipv4.ip_local_ports.range[1];
|
||||
} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
|
||||
}
|
||||
EXPORT_SYMBOL(inet_get_local_port_range);
|
||||
|
||||
/* Decide whether binding @sk to the port represented by @tb conflicts
 * with any socket already bound to that port.  @relax loosens the check
 * for SO_REUSEADDR pairs during ephemeral port selection.  Returns
 * non-zero when a conflicting socket was found.
 */
int inet_csk_bind_conflict(const struct sock *sk,
			   const struct inet_bind_bucket *tb, bool relax)
{
	struct sock *sk2;
	int reuse = sk->sk_reuse;
	int reuseport = sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
	 * Unlike other sk lookup places we do not check
	 * for sk_net here, since _all_ the socks listed
	 * in tb->owners list belong to the same net - the
	 * one this bucket belongs to.
	 */

	sk_for_each_bound(sk2, &tb->owners) {
		if (sk != sk2 &&
		    !inet_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			    (sk2->sk_state != TCP_TIME_WAIT &&
			     !uid_eq(uid, sock_i_uid(sk2))))) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {

				if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
				    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
					break;
			}
		}
	}
	return sk2 != NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);

/* Obtain a reference to a local port for the given sock,
|
||||
* if snum is zero it means select any available local port.
|
||||
*/
|
||||
int inet_csk_get_port(struct sock *sk, unsigned short snum)
|
||||
{
|
||||
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
|
||||
struct inet_bind_hashbucket *head;
|
||||
struct inet_bind_bucket *tb;
|
||||
int ret, attempts = 5;
|
||||
struct net *net = sock_net(sk);
|
||||
int smallest_size = -1, smallest_rover;
|
||||
kuid_t uid = sock_i_uid(sk);
|
||||
|
||||
local_bh_disable();
|
||||
if (!snum) {
|
||||
int remaining, rover, low, high;
|
||||
|
||||
again:
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
remaining = (high - low) + 1;
|
||||
smallest_rover = rover = prandom_u32() % remaining + low;
|
||||
|
||||
smallest_size = -1;
|
||||
do {
|
||||
if (inet_is_local_reserved_port(net, rover))
|
||||
goto next_nolock;
|
||||
head = &hashinfo->bhash[inet_bhashfn(net, rover,
|
||||
hashinfo->bhash_size)];
|
||||
spin_lock(&head->lock);
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == rover) {
|
||||
if (((tb->fastreuse > 0 &&
|
||||
sk->sk_reuse &&
|
||||
sk->sk_state != TCP_LISTEN) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport &&
|
||||
uid_eq(tb->fastuid, uid))) &&
|
||||
(tb->num_owners < smallest_size || smallest_size == -1)) {
|
||||
smallest_size = tb->num_owners;
|
||||
smallest_rover = rover;
|
||||
if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
|
||||
!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
|
||||
snum = smallest_rover;
|
||||
goto tb_found;
|
||||
}
|
||||
}
|
||||
if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
|
||||
snum = rover;
|
||||
goto tb_found;
|
||||
}
|
||||
goto next;
|
||||
}
|
||||
break;
|
||||
next:
|
||||
spin_unlock(&head->lock);
|
||||
next_nolock:
|
||||
if (++rover > high)
|
||||
rover = low;
|
||||
} while (--remaining > 0);
|
||||
|
||||
/* Exhausted local port range during search? It is not
|
||||
* possible for us to be holding one of the bind hash
|
||||
* locks if this test triggers, because if 'remaining'
|
||||
* drops to zero, we broke out of the do/while loop at
|
||||
* the top level, not from the 'break;' statement.
|
||||
*/
|
||||
ret = 1;
|
||||
if (remaining <= 0) {
|
||||
if (smallest_size != -1) {
|
||||
snum = smallest_rover;
|
||||
goto have_snum;
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
/* OK, here is the one we will use. HEAD is
|
||||
* non-NULL and we hold it's mutex.
|
||||
*/
|
||||
snum = rover;
|
||||
} else {
|
||||
have_snum:
|
||||
head = &hashinfo->bhash[inet_bhashfn(net, snum,
|
||||
hashinfo->bhash_size)];
|
||||
spin_lock(&head->lock);
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->port == snum)
|
||||
goto tb_found;
|
||||
}
|
||||
tb = NULL;
|
||||
goto tb_not_found;
|
||||
tb_found:
|
||||
if (!hlist_empty(&tb->owners)) {
|
||||
if (sk->sk_reuse == SK_FORCE_REUSE)
|
||||
goto success;
|
||||
|
||||
if (((tb->fastreuse > 0 &&
|
||||
sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
|
||||
smallest_size == -1) {
|
||||
goto success;
|
||||
} else {
|
||||
ret = 1;
|
||||
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
|
||||
if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
|
||||
(tb->fastreuseport > 0 &&
|
||||
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
|
||||
smallest_size != -1 && --attempts >= 0) {
|
||||
spin_unlock(&head->lock);
|
||||
goto again;
|
||||
}
|
||||
|
||||
goto fail_unlock;
|
||||
}
|
||||
}
|
||||
}
|
||||
tb_not_found:
|
||||
ret = 1;
|
||||
if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
|
||||
net, head, snum)) == NULL)
|
||||
goto fail_unlock;
|
||||
if (hlist_empty(&tb->owners)) {
|
||||
if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
|
||||
tb->fastreuse = 1;
|
||||
else
|
||||
tb->fastreuse = 0;
|
||||
if (sk->sk_reuseport) {
|
||||
tb->fastreuseport = 1;
|
||||
tb->fastuid = uid;
|
||||
} else
|
||||
tb->fastreuseport = 0;
|
||||
} else {
|
||||
if (tb->fastreuse &&
|
||||
(!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
|
||||
tb->fastreuse = 0;
|
||||
if (tb->fastreuseport &&
|
||||
(!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
|
||||
tb->fastreuseport = 0;
|
||||
}
|
||||
success:
|
||||
if (!inet_csk(sk)->icsk_bind_hash)
|
||||
inet_bind_hash(sk, tb, snum);
|
||||
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
|
||||
ret = 0;
|
||||
|
||||
fail_unlock:
|
||||
spin_unlock(&head->lock);
|
||||
fail:
|
||||
local_bh_enable();
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_get_port);
|
||||
|
||||
/*
|
||||
* Wait for an incoming connection, avoid race conditions. This must be called
|
||||
* with the socket locked.
|
||||
*/
|
||||
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
DEFINE_WAIT(wait);
|
||||
int err;
|
||||
|
||||
/*
|
||||
* True wake-one mechanism for incoming connections: only
|
||||
* one process gets woken up, not the 'whole herd'.
|
||||
* Since we do not 'race & poll' for established sockets
|
||||
* anymore, the common case will execute the loop only once.
|
||||
*
|
||||
* Subtle issue: "add_wait_queue_exclusive()" will be added
|
||||
* after any current non-exclusive waiters, and we know that
|
||||
* it will always _stay_ after any new non-exclusive waiters
|
||||
* because all non-exclusive waiters are added at the
|
||||
* beginning of the wait-queue. As such, it's ok to "drop"
|
||||
* our exclusiveness temporarily when we get woken up without
|
||||
* having to remove and re-insert us on the wait queue.
|
||||
*/
|
||||
for (;;) {
|
||||
prepare_to_wait_exclusive(sk_sleep(sk), &wait,
|
||||
TASK_INTERRUPTIBLE);
|
||||
release_sock(sk);
|
||||
if (reqsk_queue_empty(&icsk->icsk_accept_queue))
|
||||
timeo = schedule_timeout(timeo);
|
||||
lock_sock(sk);
|
||||
err = 0;
|
||||
if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
|
||||
break;
|
||||
err = -EINVAL;
|
||||
if (sk->sk_state != TCP_LISTEN)
|
||||
break;
|
||||
err = sock_intr_errno(timeo);
|
||||
if (signal_pending(current))
|
||||
break;
|
||||
err = -EAGAIN;
|
||||
if (!timeo)
|
||||
break;
|
||||
}
|
||||
finish_wait(sk_sleep(sk), &wait);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* This will accept the next outstanding connection.
|
||||
*/
|
||||
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
|
||||
struct sock *newsk;
|
||||
struct request_sock *req;
|
||||
int error;
|
||||
|
||||
lock_sock(sk);
|
||||
|
||||
/* We need to make sure that this socket is listening,
|
||||
* and that it has something pending.
|
||||
*/
|
||||
error = -EINVAL;
|
||||
if (sk->sk_state != TCP_LISTEN)
|
||||
goto out_err;
|
||||
|
||||
/* Find already established connection */
|
||||
if (reqsk_queue_empty(queue)) {
|
||||
long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
|
||||
|
||||
/* If this is a non blocking socket don't sleep */
|
||||
error = -EAGAIN;
|
||||
if (!timeo)
|
||||
goto out_err;
|
||||
|
||||
error = inet_csk_wait_for_connect(sk, timeo);
|
||||
if (error)
|
||||
goto out_err;
|
||||
}
|
||||
req = reqsk_queue_remove(queue);
|
||||
newsk = req->sk;
|
||||
|
||||
sk_acceptq_removed(sk);
|
||||
if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) {
|
||||
spin_lock_bh(&queue->fastopenq->lock);
|
||||
if (tcp_rsk(req)->listener) {
|
||||
/* We are still waiting for the final ACK from 3WHS
|
||||
* so can't free req now. Instead, we set req->sk to
|
||||
* NULL to signify that the child socket is taken
|
||||
* so reqsk_fastopen_remove() will free the req
|
||||
* when 3WHS finishes (or is aborted).
|
||||
*/
|
||||
req->sk = NULL;
|
||||
req = NULL;
|
||||
}
|
||||
spin_unlock_bh(&queue->fastopenq->lock);
|
||||
}
|
||||
out:
|
||||
release_sock(sk);
|
||||
if (req)
|
||||
__reqsk_free(req);
|
||||
return newsk;
|
||||
out_err:
|
||||
newsk = NULL;
|
||||
req = NULL;
|
||||
*err = error;
|
||||
goto out;
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_accept);
|
||||
|
||||
/*
|
||||
* Using different timers for retransmit, delayed acks and probes
|
||||
* We may wish use just one timer maintaining a list of expire jiffies
|
||||
* to optimize.
|
||||
*/
|
||||
void inet_csk_init_xmit_timers(struct sock *sk,
|
||||
void (*retransmit_handler)(unsigned long),
|
||||
void (*delack_handler)(unsigned long),
|
||||
void (*keepalive_handler)(unsigned long))
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
|
||||
setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler,
|
||||
(unsigned long)sk);
|
||||
setup_timer(&icsk->icsk_delack_timer, delack_handler,
|
||||
(unsigned long)sk);
|
||||
setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk);
|
||||
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_init_xmit_timers);
|
||||
|
||||
void inet_csk_clear_xmit_timers(struct sock *sk)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
|
||||
icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
|
||||
|
||||
sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
|
||||
sk_stop_timer(sk, &icsk->icsk_delack_timer);
|
||||
sk_stop_timer(sk, &sk->sk_timer);
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
|
||||
|
||||
void inet_csk_delete_keepalive_timer(struct sock *sk)
|
||||
{
|
||||
sk_stop_timer(sk, &sk->sk_timer);
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
|
||||
|
||||
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
|
||||
{
|
||||
sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
|
||||
|
||||
struct dst_entry *inet_csk_route_req(struct sock *sk,
|
||||
struct flowi4 *fl4,
|
||||
const struct request_sock *req)
|
||||
{
|
||||
struct rtable *rt;
|
||||
const struct inet_request_sock *ireq = inet_rsk(req);
|
||||
struct ip_options_rcu *opt = inet_rsk(req)->opt;
|
||||
struct net *net = sock_net(sk);
|
||||
int flags = inet_sk_flowi_flags(sk);
|
||||
|
||||
flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark,
|
||||
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
|
||||
sk->sk_protocol,
|
||||
flags,
|
||||
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
|
||||
ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport,
|
||||
sock_i_uid(sk));
|
||||
security_req_classify_flow(req, flowi4_to_flowi(fl4));
|
||||
rt = ip_route_output_flow(net, fl4, sk);
|
||||
if (IS_ERR(rt))
|
||||
goto no_route;
|
||||
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
|
||||
goto route_err;
|
||||
return &rt->dst;
|
||||
|
||||
route_err:
|
||||
ip_rt_put(rt);
|
||||
no_route:
|
||||
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_route_req);
|
||||
|
||||
struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
|
||||
struct sock *newsk,
|
||||
const struct request_sock *req)
|
||||
{
|
||||
const struct inet_request_sock *ireq = inet_rsk(req);
|
||||
struct inet_sock *newinet = inet_sk(newsk);
|
||||
struct ip_options_rcu *opt;
|
||||
struct net *net = sock_net(sk);
|
||||
struct flowi4 *fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
fl4 = &newinet->cork.fl.u.ip4;
|
||||
|
||||
rcu_read_lock();
|
||||
opt = rcu_dereference(newinet->inet_opt);
|
||||
flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark,
|
||||
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
|
||||
sk->sk_protocol, inet_sk_flowi_flags(sk),
|
||||
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
|
||||
ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport,
|
||||
sock_i_uid(sk));
|
||||
security_req_classify_flow(req, flowi4_to_flowi(fl4));
|
||||
rt = ip_route_output_flow(net, fl4, sk);
|
||||
if (IS_ERR(rt))
|
||||
goto no_route;
|
||||
if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
|
||||
goto route_err;
|
||||
rcu_read_unlock();
|
||||
return &rt->dst;
|
||||
|
||||
route_err:
|
||||
ip_rt_put(rt);
|
||||
no_route:
|
||||
rcu_read_unlock();
|
||||
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
|
||||
|
||||
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
|
||||
const u32 rnd, const u32 synq_hsize)
|
||||
{
|
||||
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
|
||||
}
|
||||
|
||||
/* With IPv6 enabled the shared request tables can hold both families,
 * so lookups must filter on AF_INET; otherwise the test is a no-op.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) 1
#endif

struct request_sock *inet_csk_search_req(const struct sock *sk,
|
||||
struct request_sock ***prevp,
|
||||
const __be16 rport, const __be32 raddr,
|
||||
const __be32 laddr)
|
||||
{
|
||||
const struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
|
||||
struct request_sock *req, **prev;
|
||||
|
||||
for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd,
|
||||
lopt->nr_table_entries)];
|
||||
(req = *prev) != NULL;
|
||||
prev = &req->dl_next) {
|
||||
const struct inet_request_sock *ireq = inet_rsk(req);
|
||||
|
||||
if (ireq->ir_rmt_port == rport &&
|
||||
ireq->ir_rmt_addr == raddr &&
|
||||
ireq->ir_loc_addr == laddr &&
|
||||
AF_INET_FAMILY(req->rsk_ops->family)) {
|
||||
WARN_ON(req->sk);
|
||||
*prevp = prev;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return req;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_search_req);
|
||||
|
||||
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
|
||||
unsigned long timeout)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
|
||||
const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
|
||||
inet_rsk(req)->ir_rmt_port,
|
||||
lopt->hash_rnd, lopt->nr_table_entries);
|
||||
|
||||
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
|
||||
inet_csk_reqsk_queue_added(sk, timeout);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
|
||||
|
||||
/* Only thing we need from tcp.h */
|
||||
extern int sysctl_tcp_synack_retries;
|
||||
|
||||
|
||||
/* Decide when to expire the request and when to resend SYN-ACK */
|
||||
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
|
||||
const int max_retries,
|
||||
const u8 rskq_defer_accept,
|
||||
int *expire, int *resend)
|
||||
{
|
||||
if (!rskq_defer_accept) {
|
||||
*expire = req->num_timeout >= thresh;
|
||||
*resend = 1;
|
||||
return;
|
||||
}
|
||||
*expire = req->num_timeout >= thresh &&
|
||||
(!inet_rsk(req)->acked || req->num_timeout >= max_retries);
|
||||
/*
|
||||
* Do not resend while waiting for data after ACK,
|
||||
* start to resend on end of deferring period to give
|
||||
* last chance for data or ACK to create established socket.
|
||||
*/
|
||||
*resend = !inet_rsk(req)->acked ||
|
||||
req->num_timeout >= rskq_defer_accept - 1;
|
||||
}
|
||||
|
||||
int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
|
||||
{
|
||||
int err = req->rsk_ops->rtx_syn_ack(parent, req);
|
||||
|
||||
if (!err)
|
||||
req->num_retrans++;
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(inet_rtx_syn_ack);
|
||||
|
||||
void inet_csk_reqsk_queue_prune(struct sock *parent,
|
||||
const unsigned long interval,
|
||||
const unsigned long timeout,
|
||||
const unsigned long max_rto)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(parent);
|
||||
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
|
||||
struct listen_sock *lopt = queue->listen_opt;
|
||||
int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
|
||||
int thresh = max_retries;
|
||||
unsigned long now = jiffies;
|
||||
struct request_sock **reqp, *req;
|
||||
int i, budget;
|
||||
|
||||
if (lopt == NULL || lopt->qlen == 0)
|
||||
return;
|
||||
|
||||
/* Normally all the openreqs are young and become mature
|
||||
* (i.e. converted to established socket) for first timeout.
|
||||
* If synack was not acknowledged for 1 second, it means
|
||||
* one of the following things: synack was lost, ack was lost,
|
||||
* rtt is high or nobody planned to ack (i.e. synflood).
|
||||
* When server is a bit loaded, queue is populated with old
|
||||
* open requests, reducing effective size of queue.
|
||||
* When server is well loaded, queue size reduces to zero
|
||||
* after several minutes of work. It is not synflood,
|
||||
* it is normal operation. The solution is pruning
|
||||
* too old entries overriding normal timeout, when
|
||||
* situation becomes dangerous.
|
||||
*
|
||||
* Essentially, we reserve half of room for young
|
||||
* embrions; and abort old ones without pity, if old
|
||||
* ones are about to clog our table.
|
||||
*/
|
||||
if (lopt->qlen>>(lopt->max_qlen_log-1)) {
|
||||
int young = (lopt->qlen_young<<1);
|
||||
|
||||
while (thresh > 2) {
|
||||
if (lopt->qlen < young)
|
||||
break;
|
||||
thresh--;
|
||||
young <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (queue->rskq_defer_accept)
|
||||
max_retries = queue->rskq_defer_accept;
|
||||
|
||||
budget = 2 * (lopt->nr_table_entries / (timeout / interval));
|
||||
i = lopt->clock_hand;
|
||||
|
||||
do {
|
||||
reqp=&lopt->syn_table[i];
|
||||
while ((req = *reqp) != NULL) {
|
||||
if (time_after_eq(now, req->expires)) {
|
||||
int expire = 0, resend = 0;
|
||||
|
||||
syn_ack_recalc(req, thresh, max_retries,
|
||||
queue->rskq_defer_accept,
|
||||
&expire, &resend);
|
||||
req->rsk_ops->syn_ack_timeout(parent, req);
|
||||
if (!expire &&
|
||||
(!resend ||
|
||||
!inet_rtx_syn_ack(parent, req) ||
|
||||
inet_rsk(req)->acked)) {
|
||||
unsigned long timeo;
|
||||
|
||||
if (req->num_timeout++ == 0)
|
||||
lopt->qlen_young--;
|
||||
timeo = min(timeout << req->num_timeout,
|
||||
max_rto);
|
||||
req->expires = now + timeo;
|
||||
reqp = &req->dl_next;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Drop this request */
|
||||
inet_csk_reqsk_queue_unlink(parent, req, reqp);
|
||||
reqsk_queue_removed(queue, req);
|
||||
reqsk_free(req);
|
||||
continue;
|
||||
}
|
||||
reqp = &req->dl_next;
|
||||
}
|
||||
|
||||
i = (i + 1) & (lopt->nr_table_entries - 1);
|
||||
|
||||
} while (--budget > 0);
|
||||
|
||||
lopt->clock_hand = i;
|
||||
|
||||
if (lopt->qlen)
|
||||
inet_csk_reset_keepalive_timer(parent, interval);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
|
||||
|
||||
/**
|
||||
* inet_csk_clone_lock - clone an inet socket, and lock its clone
|
||||
* @sk: the socket to clone
|
||||
* @req: request_sock
|
||||
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
|
||||
*
|
||||
* Caller must unlock socket even in error path (bh_unlock_sock(newsk))
|
||||
*/
|
||||
struct sock *inet_csk_clone_lock(const struct sock *sk,
|
||||
const struct request_sock *req,
|
||||
const gfp_t priority)
|
||||
{
|
||||
struct sock *newsk = sk_clone_lock(sk, priority);
|
||||
|
||||
if (newsk != NULL) {
|
||||
struct inet_connection_sock *newicsk = inet_csk(newsk);
|
||||
|
||||
newsk->sk_state = TCP_SYN_RECV;
|
||||
newicsk->icsk_bind_hash = NULL;
|
||||
|
||||
inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
|
||||
inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
|
||||
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
|
||||
newsk->sk_write_space = sk_stream_write_space;
|
||||
|
||||
newsk->sk_mark = inet_rsk(req)->ir_mark;
|
||||
|
||||
newicsk->icsk_retransmits = 0;
|
||||
newicsk->icsk_backoff = 0;
|
||||
newicsk->icsk_probes_out = 0;
|
||||
|
||||
/* Deinitialize accept_queue to trap illegal accesses. */
|
||||
memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
|
||||
|
||||
security_inet_csk_clone(newsk, req);
|
||||
}
|
||||
return newsk;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
|
||||
|
||||
/*
|
||||
* At this point, there should be no process reference to this
|
||||
* socket, and thus no user references at all. Therefore we
|
||||
* can assume the socket waitqueue is inactive and nobody will
|
||||
* try to jump onto it.
|
||||
*/
|
||||
void inet_csk_destroy_sock(struct sock *sk)
|
||||
{
|
||||
WARN_ON(sk->sk_state != TCP_CLOSE);
|
||||
WARN_ON(!sock_flag(sk, SOCK_DEAD));
|
||||
|
||||
/* It cannot be in hash table! */
|
||||
WARN_ON(!sk_unhashed(sk));
|
||||
|
||||
/* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
|
||||
WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
|
||||
|
||||
sk->sk_prot->destroy(sk);
|
||||
|
||||
sk_stream_kill_queues(sk);
|
||||
|
||||
xfrm_sk_free_policy(sk);
|
||||
|
||||
sk_refcnt_debug_release(sk);
|
||||
|
||||
percpu_counter_dec(sk->sk_prot->orphan_count);
|
||||
sock_put(sk);
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_destroy_sock);
|
||||
|
||||
/* This function allows to force a closure of a socket after the call to
|
||||
* tcp/dccp_create_openreq_child().
|
||||
*/
|
||||
void inet_csk_prepare_forced_close(struct sock *sk)
|
||||
__releases(&sk->sk_lock.slock)
|
||||
{
|
||||
/* sk_clone_lock locked the socket and set refcnt to 2 */
|
||||
bh_unlock_sock(sk);
|
||||
sock_put(sk);
|
||||
|
||||
/* The below has to be done to allow calling inet_csk_destroy_sock */
|
||||
sock_set_flag(sk, SOCK_DEAD);
|
||||
percpu_counter_inc(sk->sk_prot->orphan_count);
|
||||
inet_sk(sk)->inet_num = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
|
||||
|
||||
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
|
||||
{
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
|
||||
|
||||
if (rc != 0)
|
||||
return rc;
|
||||
|
||||
sk->sk_max_ack_backlog = 0;
|
||||
sk->sk_ack_backlog = 0;
|
||||
inet_csk_delack_init(sk);
|
||||
|
||||
/* There is race window here: we announce ourselves listening,
|
||||
* but this transition is still not validated by get_port().
|
||||
* It is OK, because this socket enters to hash table only
|
||||
* after validation is complete.
|
||||
*/
|
||||
sk->sk_state = TCP_LISTEN;
|
||||
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
|
||||
inet->inet_sport = htons(inet->inet_num);
|
||||
|
||||
sk_dst_reset(sk);
|
||||
sk->sk_prot->hash(sk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
sk->sk_state = TCP_CLOSE;
|
||||
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
|
||||
return -EADDRINUSE;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
|
||||
|
||||
/*
|
||||
* This routine closes sockets which have been at least partially
|
||||
* opened, but not yet accepted.
|
||||
*/
|
||||
void inet_csk_listen_stop(struct sock *sk)
|
||||
{
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
|
||||
struct request_sock *acc_req;
|
||||
struct request_sock *req;
|
||||
|
||||
inet_csk_delete_keepalive_timer(sk);
|
||||
|
||||
/* make all the listen_opt local to us */
|
||||
acc_req = reqsk_queue_yank_acceptq(queue);
|
||||
|
||||
/* Following specs, it would be better either to send FIN
|
||||
* (and enter FIN-WAIT-1, it is normal close)
|
||||
* or to send active reset (abort).
|
||||
* Certainly, it is pretty dangerous while synflood, but it is
|
||||
* bad justification for our negligence 8)
|
||||
* To be honest, we are not able to make either
|
||||
* of the variants now. --ANK
|
||||
*/
|
||||
reqsk_queue_destroy(queue);
|
||||
|
||||
while ((req = acc_req) != NULL) {
|
||||
struct sock *child = req->sk;
|
||||
|
||||
acc_req = req->dl_next;
|
||||
|
||||
local_bh_disable();
|
||||
bh_lock_sock(child);
|
||||
WARN_ON(sock_owned_by_user(child));
|
||||
sock_hold(child);
|
||||
|
||||
sk->sk_prot->disconnect(child, O_NONBLOCK);
|
||||
|
||||
sock_orphan(child);
|
||||
|
||||
percpu_counter_inc(sk->sk_prot->orphan_count);
|
||||
|
||||
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
|
||||
BUG_ON(tcp_sk(child)->fastopen_rsk != req);
|
||||
BUG_ON(sk != tcp_rsk(req)->listener);
|
||||
|
||||
/* Paranoid, to prevent race condition if
|
||||
* an inbound pkt destined for child is
|
||||
* blocked by sock lock in tcp_v4_rcv().
|
||||
* Also to satisfy an assertion in
|
||||
* tcp_v4_destroy_sock().
|
||||
*/
|
||||
tcp_sk(child)->fastopen_rsk = NULL;
|
||||
sock_put(sk);
|
||||
}
|
||||
inet_csk_destroy_sock(child);
|
||||
|
||||
bh_unlock_sock(child);
|
||||
local_bh_enable();
|
||||
sock_put(child);
|
||||
|
||||
sk_acceptq_removed(sk);
|
||||
__reqsk_free(req);
|
||||
}
|
||||
if (queue->fastopenq != NULL) {
|
||||
/* Free all the reqs queued in rskq_rst_head. */
|
||||
spin_lock_bh(&queue->fastopenq->lock);
|
||||
acc_req = queue->fastopenq->rskq_rst_head;
|
||||
queue->fastopenq->rskq_rst_head = NULL;
|
||||
spin_unlock_bh(&queue->fastopenq->lock);
|
||||
while ((req = acc_req) != NULL) {
|
||||
acc_req = req->dl_next;
|
||||
__reqsk_free(req);
|
||||
}
|
||||
}
|
||||
WARN_ON(sk->sk_ack_backlog);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
|
||||
|
||||
void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
|
||||
{
|
||||
struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
|
||||
sin->sin_family = AF_INET;
|
||||
sin->sin_addr.s_addr = inet->inet_daddr;
|
||||
sin->sin_port = inet->inet_dport;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
/* getsockopt() path for 32-bit tasks on a 64-bit kernel: use the address
 * family's compat handler when it exists, otherwise fall back to the
 * native handler.
 */
int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, int __user *optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_getsockopt != NULL)
		return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->getsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
|
||||
|
||||
/* setsockopt() path for 32-bit tasks on a 64-bit kernel; mirrors
 * inet_csk_compat_getsockopt(): prefer the af-specific compat handler,
 * fall back to the native one.
 */
int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
			       char __user *optval, unsigned int optlen)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_af_ops->compat_setsockopt != NULL)
		return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
							    optval, optlen);
	return icsk->icsk_af_ops->setsockopt(sk, level, optname,
					     optval, optlen);
}
EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
|
||||
#endif
|
||||
|
||||
/* Re-resolve the output route for a connected socket (e.g. after its
 * cached dst was invalidated).  Honours a source-routing (SRR) IP option
 * by routing towards the first hop (faddr) instead of the final
 * destination.  On success the new dst is attached to the socket via
 * sk_setup_caps().
 *
 * NOTE(review): on failure this returns &rt->dst with rt == NULL; this
 * appears to rely on dst being at offset 0 of struct rtable so the
 * result compares equal to NULL in the callers' !dst checks — confirm
 * against the struct layout before touching this.
 */
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;
	struct flowi4 *fl4;
	struct rtable *rt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;	/* route via first SRR hop */
	fl4 = &fl->u.ip4;
	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
				   inet->inet_saddr, inet->inet_dport,
				   inet->inet_sport, sk->sk_protocol,
				   RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
	if (IS_ERR(rt))
		rt = NULL;
	if (rt)
		sk_setup_caps(sk, &rt->dst);
	rcu_read_unlock();

	return &rt->dst;
}
|
||||
|
||||
/* Propagate a new path MTU @mtu to the socket's cached route.
 * Rebuilds the route when no valid dst is cached, applies the update,
 * then re-checks: update_pmtu() may have obsoleted the entry, in which
 * case the route is rebuilt once more.
 * Returns the (possibly new) dst, or NULL if no route can be built.
 */
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);
	struct inet_sock *inet = inet_sk(sk);

	if (!dst) {
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
		if (!dst)
			goto out;
	}
	dst->ops->update_pmtu(dst, sk, NULL, mtu);

	/* update_pmtu() may have invalidated the cached route; re-check. */
	dst = __sk_dst_check(sk, 0);
	if (!dst)
		dst = inet_csk_rebuild_route(sk, &inet->cork.fl);
out:
	return dst;
}
EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
|
||||
1231
net/ipv4/inet_diag.c
Normal file
1231
net/ipv4/inet_diag.c
Normal file
File diff suppressed because it is too large
Load diff
463
net/ipv4/inet_fragment.c
Normal file
463
net/ipv4/inet_fragment.c
Normal file
|
|
@ -0,0 +1,463 @@
|
|||
/*
|
||||
* inet fragments management
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Pavel Emelyanov <xemul@openvz.org>
|
||||
* Started as consolidation of ipv4/ip_fragment.c,
|
||||
* ipv6/reassembly. and ipv6 nf conntrack reassembly
|
||||
*/
|
||||
|
||||
#include <linux/list.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/inet_frag.h>
|
||||
#include <net/inet_ecn.h>
|
||||
|
||||
#define INETFRAGS_EVICT_BUCKETS 128
|
||||
#define INETFRAGS_EVICT_MAX 512
|
||||
|
||||
/* don't rebuild inetfrag table with new secret more often than this */
|
||||
#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
|
||||
|
||||
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
|
||||
* Value : 0xff if frame should be dropped.
|
||||
* 0 or INET_ECN_CE value, to be ORed in to final iph->tos field
|
||||
*/
|
||||
/* Indexed by the OR of every fragment's IPFRAG_ECN_* bit; unlisted
 * combinations resolve to 0 (keep the reassembled tos unchanged). */
const u8 ip_frag_ecn_table[16] = {
	/* at least one fragment had CE, and others ECT_0 or ECT_1 */
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE,
	[IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE,

	/* invalid combinations : drop frame */
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff,
	[IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff,
};
EXPORT_SYMBOL(ip_frag_ecn_table);
|
||||
|
||||
static unsigned int
|
||||
inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
|
||||
{
|
||||
return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
|
||||
}
|
||||
|
||||
static bool inet_frag_may_rebuild(struct inet_frags *f)
|
||||
{
|
||||
return time_after(jiffies,
|
||||
f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
|
||||
}
|
||||
|
||||
/* Pick a fresh hash secret and move every queue to the bucket the new
 * secret maps it to.  Serialized against lookups via rnd_seqlock; the
 * rebuild rate-limit is re-checked under the lock.
 */
static void inet_frag_secret_rebuild(struct inet_frags *f)
{
	int i;

	write_seqlock_bh(&f->rnd_seqlock);

	if (!inet_frag_may_rebuild(f))
		goto out;

	get_random_bytes(&f->rnd, sizeof(u32));

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb;
		struct inet_frag_queue *q;
		struct hlist_node *n;

		hb = &f->hash[i];
		spin_lock(&hb->chain_lock);

		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
			unsigned int hval = inet_frag_hashfn(f, q);

			if (hval != i) {
				struct inet_frag_bucket *hb_dest;

				hlist_del(&q->list);

				/* Relink to new hash chain. */
				hb_dest = &f->hash[hval];

				/* This is the only place where we take
				 * another chain_lock while already holding
				 * one. As this will not run concurrently,
				 * we cannot deadlock on hb_dest lock below, if its
				 * already locked it will be released soon since
				 * other caller cannot be waiting for hb lock
				 * that we've taken above.
				 */
				spin_lock_nested(&hb_dest->chain_lock,
						 SINGLE_DEPTH_NESTING);
				hlist_add_head(&q->list, &hb_dest->chain);
				spin_unlock(&hb_dest->chain_lock);
			}
		}
		spin_unlock(&hb->chain_lock);
	}

	f->rebuild = false;
	f->last_rebuild_jiffies = jiffies;
out:
	write_sequnlock_bh(&f->rnd_seqlock);
}
|
||||
|
||||
static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
|
||||
{
|
||||
return q->net->low_thresh == 0 ||
|
||||
frag_mem_limit(q->net) >= q->net->low_thresh;
|
||||
}
|
||||
|
||||
/* Evict every queue in bucket @hb that inet_fragq_should_evict()
 * accepts.  A victim's timer is stopped first; if the timer is already
 * firing we drop the lock, wait for it to finish, and rescan the
 * bucket.  Expiry handlers run outside the chain lock.
 * Returns the number of queues evicted.
 */
static unsigned int
inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
{
	struct inet_frag_queue *fq;
	struct hlist_node *n;
	unsigned int evicted = 0;
	HLIST_HEAD(expired);

evict_again:
	spin_lock(&hb->chain_lock);

	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
		if (!inet_fragq_should_evict(fq))
			continue;

		if (!del_timer(&fq->timer)) {
			/* q expiring right now thus increment its refcount so
			 * it won't be freed under us and wait until the timer
			 * has finished executing then destroy it
			 */
			atomic_inc(&fq->refcnt);
			spin_unlock(&hb->chain_lock);
			del_timer_sync(&fq->timer);
			inet_frag_put(fq, f);
			goto evict_again;
		}

		fq->flags |= INET_FRAG_EVICTED;
		hlist_del(&fq->list);
		hlist_add_head(&fq->list, &expired);
		++evicted;
	}

	spin_unlock(&hb->chain_lock);

	/* Run the expire handler for each victim; it disposes of the queue. */
	hlist_for_each_entry_safe(fq, n, &expired, list)
		f->frag_expire((unsigned long) fq);

	return evicted;
}
|
||||
|
||||
/* Deferred eviction: walk up to INETFRAGS_EVICT_BUCKETS buckets starting
 * where the previous run stopped, stop early after INETFRAGS_EVICT_MAX
 * evictions, then perform a pending secret rebuild if the rate limit
 * allows.
 */
static void inet_frag_worker(struct work_struct *work)
{
	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
	unsigned int i, evicted = 0;
	struct inet_frags *f;

	f = container_of(work, struct inet_frags, frags_work);

	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);

	local_bh_disable();

	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
		evicted += inet_evict_bucket(f, &f->hash[i]);
		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
		if (evicted > INETFRAGS_EVICT_MAX)
			break;
	}

	/* Resume the sweep from here on the next run. */
	f->next_bucket = i;

	local_bh_enable();

	if (f->rebuild && inet_frag_may_rebuild(f))
		inet_frag_secret_rebuild(f);
}
|
||||
|
||||
static void inet_frag_schedule_worker(struct inet_frags *f)
|
||||
{
|
||||
if (unlikely(!work_pending(&f->frags_work)))
|
||||
schedule_work(&f->frags_work);
|
||||
}
|
||||
|
||||
/* One-time setup of an inet_frags instance: hash buckets, eviction
 * worker, secret seqlock and the per-protocol queue slab cache.
 * Returns 0 on success, -ENOMEM if the slab cache cannot be created.
 */
int inet_frags_init(struct inet_frags *f)
{
	int i;

	INIT_WORK(&f->frags_work, inet_frag_worker);

	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
		struct inet_frag_bucket *hb = &f->hash[i];

		spin_lock_init(&hb->chain_lock);
		INIT_HLIST_HEAD(&hb->chain);
	}

	seqlock_init(&f->rnd_seqlock);
	f->last_rebuild_jiffies = 0;
	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
					    NULL);
	if (!f->frags_cachep)
		return -ENOMEM;

	return 0;
}
EXPORT_SYMBOL(inet_frags_init);
|
||||
|
||||
/* Per-namespace init: set up the fragment memory accounting counter. */
void inet_frags_init_net(struct netns_frags *nf)
{
	init_frag_mem_limit(nf);
}
EXPORT_SYMBOL(inet_frags_init_net);
|
||||
|
||||
/* Tear down an inet_frags instance; pairs with inet_frags_init().
 * Waits for any in-flight eviction work before freeing the slab cache.
 */
void inet_frags_fini(struct inet_frags *f)
{
	cancel_work_sync(&f->frags_work);
	kmem_cache_destroy(f->frags_cachep);
}
EXPORT_SYMBOL(inet_frags_fini);
|
||||
|
||||
/* Per-namespace teardown: force-evict every queue belonging to @nf
 * (zeroing low_thresh makes inet_fragq_should_evict() accept all), and
 * retry the whole sweep if a secret rebuild moved queues while we
 * walked the table.
 */
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
{
	unsigned int seq;
	int i;

	nf->low_thresh = 0;
	local_bh_disable();

evict_again:
	seq = read_seqbegin(&f->rnd_seqlock);

	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
		inet_evict_bucket(f, &f->hash[i]);

	if (read_seqretry(&f->rnd_seqlock, seq))
		goto evict_again;

	local_bh_enable();

	percpu_counter_destroy(&nf->mem);
}
EXPORT_SYMBOL(inet_frags_exit_net);
|
||||
|
||||
/* Return @fq's bucket with its chain_lock held, retrying if the hash
 * secret changed between computing the hash and acquiring the lock.
 */
static struct inet_frag_bucket *
get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
__acquires(hb->chain_lock)
{
	struct inet_frag_bucket *hb;
	unsigned int seq, hash;

restart:
	seq = read_seqbegin(&f->rnd_seqlock);

	hash = inet_frag_hashfn(f, fq);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	if (read_seqretry(&f->rnd_seqlock, seq)) {
		spin_unlock(&hb->chain_lock);
		goto restart;
	}

	return hb;
}
|
||||
|
||||
/* Remove @fq from its hash chain, unless eviction already unlinked it
 * (INET_FRAG_EVICTED queues sit on the evictor's private list).
 */
static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;

	hb = get_frag_bucket_locked(fq, f);
	if (!(fq->flags & INET_FRAG_EVICTED))
		hlist_del(&fq->list);
	spin_unlock(&hb->chain_lock);
}
|
||||
|
||||
/* Mark @fq complete and drop the timer's and the hash table's
 * references.  Safe to call more than once; only the first call
 * unhashes the queue.
 */
void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
{
	if (del_timer(&fq->timer))
		atomic_dec(&fq->refcnt);	/* timer's reference */

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq_unlink(fq, f);
		atomic_dec(&fq->refcnt);	/* hash table's reference */
		fq->flags |= INET_FRAG_COMPLETE;
	}
}
EXPORT_SYMBOL(inet_frag_kill);
|
||||
|
||||
/* Free one queued fragment, giving the protocol an optional pre-free
 * callback (f->skb_free) first.
 */
static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
				  struct sk_buff *skb)
{
	if (f->skb_free)
		f->skb_free(skb);
	kfree_skb(skb);
}
|
||||
|
||||
/* Final teardown of a dead (INET_FRAG_COMPLETE) queue: free all queued
 * fragments, return the accounted memory (truesize sum + queue object
 * size), run the protocol destructor and release the queue itself.
 */
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
	struct sk_buff *fp;
	struct netns_frags *nf;
	unsigned int sum, sum_truesize = 0;

	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
	WARN_ON(del_timer(&q->timer) != 0);

	/* Release all fragment data. */
	fp = q->fragments;
	nf = q->net;
	while (fp) {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		frag_kfree_skb(nf, f, fp);
		fp = xp;
	}
	sum = sum_truesize + f->qsize;
	sub_frag_mem_limit(q, sum);

	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}
EXPORT_SYMBOL(inet_frag_destroy);
|
||||
|
||||
/* Insert the freshly allocated @qp_in into the hash.  If another CPU
 * raced us and inserted an equal queue first, drop ours and return the
 * winner instead.  The returned queue carries a reference for the
 * caller in addition to the hash table's own.
 */
static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
						struct inet_frag_queue *qp_in,
						struct inet_frags *f,
						void *arg)
{
	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
	struct inet_frag_queue *qp;

#ifdef CONFIG_SMP
	/* With SMP race we have to recheck hash table, because
	 * such entry could have been created on other cpu before
	 * we acquired hash bucket lock.
	 */
	hlist_for_each_entry(qp, &hb->chain, list) {
		if (qp->net == nf && f->match(qp, arg)) {
			atomic_inc(&qp->refcnt);
			spin_unlock(&hb->chain_lock);
			qp_in->flags |= INET_FRAG_COMPLETE;
			inet_frag_put(qp_in, f);
			return qp;
		}
	}
#endif
	qp = qp_in;
	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
		atomic_inc(&qp->refcnt);	/* timer's reference */

	atomic_inc(&qp->refcnt);		/* hash table's reference */
	hlist_add_head(&qp->list, &hb->chain);

	spin_unlock(&hb->chain_lock);

	return qp;
}
|
||||
|
||||
/* Allocate and initialize a new fragment queue, charging f->qsize
 * against the namespace memory budget.  Refuses (and kicks the evictor)
 * when usage already exceeds high_thresh.  Returns NULL on failure.
 */
static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
					       struct inet_frags *f,
					       void *arg)
{
	struct inet_frag_queue *q;

	if (frag_mem_limit(nf) > nf->high_thresh) {
		inet_frag_schedule_worker(f);
		return NULL;
	}

	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
	if (q == NULL)
		return NULL;

	q->net = nf;
	f->constructor(q, arg);		/* protocol fills in its key fields */
	add_frag_mem_limit(q, f->qsize);

	setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
	spin_lock_init(&q->lock);
	atomic_set(&q->refcnt, 1);	/* caller's reference */

	return q;
}
|
||||
|
||||
static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
|
||||
struct inet_frags *f,
|
||||
void *arg)
|
||||
{
|
||||
struct inet_frag_queue *q;
|
||||
|
||||
q = inet_frag_alloc(nf, f, arg);
|
||||
if (q == NULL)
|
||||
return NULL;
|
||||
|
||||
return inet_frag_intern(nf, q, f, arg);
|
||||
}
|
||||
|
||||
/* Look up the queue matching @key in bucket @hash, creating one when
 * none exists.  Returns a referenced queue, NULL on allocation failure,
 * or ERR_PTR(-ENOBUFS) when the chain grew past INETFRAGS_MAXDEPTH
 * (suggesting hash abuse; a secret rebuild is requested).
 */
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
				       struct inet_frags *f, void *key,
				       unsigned int hash)
{
	struct inet_frag_bucket *hb;
	struct inet_frag_queue *q;
	int depth = 0;

	if (frag_mem_limit(nf) > nf->low_thresh)
		inet_frag_schedule_worker(f);

	hash &= (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	hlist_for_each_entry(q, &hb->chain, list) {
		if (q->net == nf && f->match(q, key)) {
			atomic_inc(&q->refcnt);
			spin_unlock(&hb->chain_lock);
			return q;
		}
		depth++;
	}
	spin_unlock(&hb->chain_lock);

	if (depth <= INETFRAGS_MAXDEPTH)
		return inet_frag_create(nf, f, key);

	/* Chain too long: schedule a secret rebuild and refuse. */
	if (inet_frag_may_rebuild(f)) {
		if (!f->rebuild)
			f->rebuild = true;
		inet_frag_schedule_worker(f);
	}

	return ERR_PTR(-ENOBUFS);
}
EXPORT_SYMBOL(inet_frag_find);
|
||||
|
||||
/* Rate-limited warning for the ERR_PTR(-ENOBUFS) result of
 * inet_frag_find(); @prefix identifies the calling protocol.
 */
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
				   const char *prefix)
{
	static const char msg[] = "inet_frag_find: Fragment hash bucket"
		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
		". Dropping fragment.\n";

	if (PTR_ERR(q) == -ENOBUFS)
		LIMIT_NETDEBUG(KERN_WARNING "%s%s", prefix, msg);
}
EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
|
||||
608
net/ipv4/inet_hashtables.c
Normal file
608
net/ipv4/inet_hashtables.c
Normal file
|
|
@ -0,0 +1,608 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* Generic INET transport hashtables
|
||||
*
|
||||
* Authors: Lotsa people, from code originally in tcp
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
#include <net/inet_connection_sock.h>
|
||||
#include <net/inet_hashtables.h>
|
||||
#include <net/secure_seq.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
/* Established-table hash of the 4-tuple, keyed by a lazily initialized
 * random secret mixed with the netns to resist hash-collision attacks.
 */
static unsigned int inet_ehashfn(struct net *net, const __be32 laddr,
				 const __u16 lport, const __be32 faddr,
				 const __be16 fport)
{
	static u32 inet_ehash_secret __read_mostly;

	net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));

	return __inet_ehashfn(laddr, lport, faddr, fport,
			      inet_ehash_secret + net_hash_mix(net));
}
|
||||
|
||||
|
||||
/* Established-table hash for @sk, computed from its own 4-tuple. */
static unsigned int inet_sk_ehashfn(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const __be32 laddr = inet->inet_rcv_saddr;
	const __u16 lport = inet->inet_num;
	const __be32 faddr = inet->inet_daddr;
	const __be16 fport = inet->inet_dport;
	struct net *net = sock_net(sk);

	return inet_ehashfn(net, laddr, lport, faddr, fport);
}
|
||||
|
||||
/*
|
||||
* Allocate and initialize a new local port bind bucket.
|
||||
* The bindhash mutex for snum's hash chain must be held here.
|
||||
*/
|
||||
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net));
		tb->port = snum;
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners);
		/* Publish in the bhash chain; caller holds the chain lock. */
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;	/* NULL on allocation failure */
}
|
||||
|
||||
/*
|
||||
* Caller must hold hashbucket lock for this tb with local BH disabled
|
||||
*/
|
||||
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
	/* Only free the bucket once its last owning socket is gone. */
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		release_net(ib_net(tb));
		kmem_cache_free(cachep, tb);
	}
}
|
||||
|
||||
/* Record that @sk now owns local port @snum through bind bucket @tb.
 * Caller holds the bucket's hash chain lock.
 */
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);

	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
|
||||
|
||||
/*
|
||||
* Get rid of any references to a local port held by the given sock.
|
||||
*/
|
||||
/* Drop @sk's ownership of its bound local port and release the bind
 * bucket if it became empty.  Caller has BHs disabled.
 */
static void __inet_put_port(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
			hashinfo->bhash_size);
	struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
	struct inet_bind_bucket *tb;

	atomic_dec(&hashinfo->bsockets);

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	__sk_del_bind_node(sk);
	tb->num_owners--;
	inet_csk(sk)->icsk_bind_hash = NULL;
	inet_sk(sk)->inet_num = 0;
	/* Frees tb if @sk was its last owner. */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	spin_unlock(&head->lock);
}
|
||||
|
||||
/* BH-safe wrapper around __inet_put_port(). */
void inet_put_port(struct sock *sk)
{
	local_bh_disable();
	__inet_put_port(sk);
	local_bh_enable();
}
EXPORT_SYMBOL(inet_put_port);
|
||||
|
||||
/* Attach @child to the bind bucket for its local port — normally the
 * parent listener's own bucket, but see the tproxy note below.
 * Returns 0 on success, -ENOMEM if a needed bucket can't be allocated.
 */
int __inet_inherit_port(struct sock *sk, struct sock *child)
{
	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
	unsigned short port = inet_sk(child)->inet_num;
	const int bhash = inet_bhashfn(sock_net(sk), port,
			table->bhash_size);
	struct inet_bind_hashbucket *head = &table->bhash[bhash];
	struct inet_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = inet_csk(sk)->icsk_bind_hash;
	if (tb->port != port) {
		/* NOTE: using tproxy and redirecting skbs to a proxy
		 * on a different listener port breaks the assumption
		 * that the listener socket's icsk_bind_hash is the same
		 * as that of the child socket. We have to look up or
		 * create a new bind bucket for the child here. */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), sock_net(sk)) &&
			    tb->port == port)
				break;
		}
		if (!tb) {
			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
						     sock_net(sk), head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				return -ENOMEM;
			}
		}
	}
	inet_bind_hash(child, tb, port);
	spin_unlock(&head->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
|
||||
|
||||
/* Score a listening socket against an incoming (daddr, hnum, dif).
 * -1 means no match; otherwise higher is more specific: +4 for an exact
 * bound address, +4 for a matching bound device, and AF_INET sockets
 * start one point above dual-stack AF_INET6 ones.
 */
static inline int compute_score(struct sock *sk, struct net *net,
				const unsigned short hnum, const __be32 daddr,
				const int dif)
{
	int score = -1;
	struct inet_sock *inet = inet_sk(sk);

	if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
	    !ipv6_only_sock(sk)) {
		__be32 rcv_saddr = inet->inet_rcv_saddr;
		score = sk->sk_family == PF_INET ? 2 : 1;
		if (rcv_saddr) {
			if (rcv_saddr != daddr)
				return -1;	/* bound to another address */
			score += 4;
		}
		if (sk->sk_bound_dev_if) {
			if (sk->sk_bound_dev_if != dif)
				return -1;	/* bound to another device */
			score += 4;
		}
	}
	return score;
}
|
||||
|
||||
/*
|
||||
* Don't inline this cruft. Here are some nice properties to exploit here. The
|
||||
* BSD API does not allow a listening sock to specify the remote port nor the
|
||||
* remote address for the connection. So always assume those are both
|
||||
* wildcarded during the search since they can never be otherwise.
|
||||
*/
|
||||
|
||||
|
||||
/* Lockless (RCU) search for the best-matching listening socket.  Among
 * equal-score SO_REUSEPORT listeners, one is picked pseudo-randomly
 * from a hash of the packet source.  On success the returned socket
 * carries a reference.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	struct sock *sk, *result;
	struct hlist_nulls_node *node;
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore, matches = 0, reuseport = 0;
	u32 phash = 0;

	rcu_read_lock();
begin:
	result = NULL;
	hiscore = 0;
	sk_nulls_for_each_rcu(sk, node, &ilb->head) {
		score = compute_score(sk, net, hnum, daddr, dif);
		if (score > hiscore) {
			result = sk;
			hiscore = score;
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,
						     saddr, sport);
				matches = 1;
			}
		} else if (score == hiscore && reuseport) {
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
		goto begin;
	if (result) {
		if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
			result = NULL;
		else if (unlikely(compute_score(result, net, hnum, daddr,
				  dif) < hiscore)) {
			/* Socket changed identity under us; retry. */
			sock_put(result);
			goto begin;
		}
	}
	rcu_read_unlock();
	return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
|
||||
|
||||
/* All sockets share common refcount, but have different destructors */
|
||||
/* Drop a reference obtained by a lookup; dispatches to the right
 * destructor, since timewait and full sockets share sk_refcnt but are
 * freed differently.
 */
void sock_gen_put(struct sock *sk)
{
	if (!atomic_dec_and_test(&sk->sk_refcnt))
		return;

	if (sk->sk_state == TCP_TIME_WAIT)
		inet_twsk_free(inet_twsk(sk));
	else
		sk_free(sk);
}
EXPORT_SYMBOL_GPL(sock_gen_put);
|
||||
|
||||
/* Lockless (RCU) lookup in the established hash; also matches timewait
 * sockets, which live in the same table.  Returns a referenced socket
 * or NULL.
 */
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	const struct hlist_nulls_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

	rcu_read_lock();
begin:
	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash != hash)
			continue;
		if (likely(INET_MATCH(sk, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto out;
			/* Re-check after taking the reference: the socket
			 * may have been reused for another connection. */
			if (unlikely(!INET_MATCH(sk, net, acookie,
						 saddr, daddr, ports, dif))) {
				sock_gen_put(sk);
				goto begin;
			}
			goto found;
		}
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(node) != slot)
		goto begin;
out:
	sk = NULL;
found:
	rcu_read_unlock();
	return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
|
||||
|
||||
/* called with local bh disabled */
|
||||
/* Verify, under the ehash bucket lock, that this socket's would-be
 * 4-tuple (with local port @lport) is not already in use, allowing
 * timewait recycling via twsk_unique().  On success the socket is
 * inserted into the established hash with its final identity and any
 * displaced timewait socket is handed back via *twp or descheduled
 * here.  Returns 0 on success, -EADDRNOTAVAIL if the tuple is taken.
 */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
				    struct sock *sk, __u16 lport,
				    struct inet_timewait_sock **twp)
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_sock *inet = inet_sk(sk);
	__be32 daddr = inet->inet_rcv_saddr;
	__be32 saddr = inet->inet_daddr;
	int dif = sk->sk_bound_dev_if;
	INET_ADDR_COOKIE(acookie, saddr, daddr);
	const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
	struct net *net = sock_net(sk);
	unsigned int hash = inet_ehashfn(net, daddr, lport,
					 saddr, inet->inet_dport);
	struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
	spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
	struct sock *sk2;
	const struct hlist_nulls_node *node;
	struct inet_timewait_sock *tw = NULL;
	int twrefcnt = 0;

	spin_lock(lock);

	sk_nulls_for_each(sk2, node, &head->chain) {
		if (sk2->sk_hash != hash)
			continue;

		if (likely(INET_MATCH(sk2, net, acookie,
				      saddr, daddr, ports, dif))) {
			if (sk2->sk_state == TCP_TIME_WAIT) {
				tw = inet_twsk(sk2);
				if (twsk_unique(sk, sk2, twp))
					break;	/* may recycle this timewait */
			}
			goto not_unique;
		}
	}

	/* Must record num and sport now. Otherwise we will see
	 * in hash table socket with a funny identity.
	 */
	inet->inet_num = lport;
	inet->inet_sport = htons(lport);
	sk->sk_hash = hash;
	WARN_ON(!sk_unhashed(sk));
	__sk_nulls_add_node_rcu(sk, &head->chain);
	if (tw) {
		twrefcnt = inet_twsk_unhash(tw);
		NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
	}
	spin_unlock(lock);
	if (twrefcnt)
		inet_twsk_put(tw);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

	if (twp) {
		*twp = tw;
	} else if (tw) {
		/* Silly. Should hash-dance instead... */
		inet_twsk_deschedule(tw, death_row);

		inet_twsk_put(tw);
	}
	return 0;

not_unique:
	spin_unlock(lock);
	return -EADDRNOTAVAIL;
}
|
||||
|
||||
/* Secret per-destination offset into the ephemeral port search, so the
 * port allocation order is not globally predictable.
 */
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
					  inet->inet_daddr,
					  inet->inet_dport);
}
|
||||
|
||||
/* Insert a non-listening socket into the established hash, optionally
 * unhashing a timewait socket @tw that occupied the same identity.
 * Returns the number of references the caller must later drop on @tw.
 */
int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	spinlock_t *lock;
	struct inet_ehash_bucket *head;
	int twrefcnt = 0;

	WARN_ON(!sk_unhashed(sk));

	sk->sk_hash = inet_sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock(lock);
	__sk_nulls_add_node_rcu(sk, list);
	if (tw) {
		WARN_ON(sk->sk_hash != tw->tw_hash);
		twrefcnt = inet_twsk_unhash(tw);
	}
	spin_unlock(lock);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
|
||||
|
||||
/* Hash @sk into the listening table (TCP_LISTEN) or the established
 * table (anything else).  Caller has BHs disabled.
 */
static void __inet_hash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;

	if (sk->sk_state != TCP_LISTEN) {
		__inet_hash_nolisten(sk, NULL);
		return;
	}

	WARN_ON(!sk_unhashed(sk));
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	__sk_nulls_add_node_rcu(sk, &ilb->head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	spin_unlock(&ilb->lock);
}
|
||||
|
||||
/* Hash @sk unless it is closed; BH-safe wrapper around __inet_hash(). */
void inet_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__inet_hash(sk);
		local_bh_enable();
	}
}
EXPORT_SYMBOL_GPL(inet_hash);
|
||||
|
||||
/* Remove @sk from whichever hash table (listening or established)
 * currently holds it and update the protocol's inuse counter.
 * No-op for an already-unhashed socket.
 */
void inet_unhash(struct sock *sk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	spinlock_t *lock;
	int done;

	if (sk_unhashed(sk))
		return;

	if (sk->sk_state == TCP_LISTEN)
		lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
	else
		lock = inet_ehash_lockp(hashinfo, sk->sk_hash);

	spin_lock_bh(lock);
	done = __sk_nulls_del_node_init_rcu(sk);
	if (done)
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(inet_unhash);
|
||||
|
||||
/*
 * Bind a local port for an outgoing connection and insert the socket
 * into the established hash.
 *
 * @death_row:		TIME_WAIT bookkeeping for this protocol
 * @sk:			the connecting socket
 * @port_offset:	per-destination offset into the port search sequence
 * @check_established:	4-tuple uniqueness check; may hand back a
 *			recyclable TIME_WAIT socket via its last argument
 * @hash:		inserts @sk into the established hash and unhashes
 *			a recycled timewait socket, returning extra
 *			tw refcounts to drop
 *
 * If the socket is unbound (snum == 0) an ephemeral port is searched
 * within the local port range; otherwise the already-bound port is
 * re-validated.  Returns 0 on success, -EADDRNOTAVAIL if no port is
 * free, or the result of @check_established.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u32 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **),
		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	const unsigned short snum = inet_sk(sk)->inet_num;
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret;
	struct net *net = sock_net(sk);
	int twrefcnt = 1;	/* one ref for the deschedule below */

	if (!snum) {
		int i, remaining, low, high, port;
		static u32 hint;	/* rotates the start of the search */
		u32 offset = hint + port_offset;
		struct inet_timewait_sock *tw = NULL;

		inet_get_local_port_range(net, &low, &high);
		remaining = (high - low) + 1;

		local_bh_disable();
		for (i = 1; i <= remaining; i++) {
			port = low + (i + offset) % remaining;
			if (inet_is_local_reserved_port(net, port))
				continue;
			head = &hinfo->bhash[inet_bhashfn(net, port,
					hinfo->bhash_size)];
			spin_lock(&head->lock);

			/* Does not bother with rcv_saddr checks,
			 * because the established check is already
			 * unique enough.
			 */
			inet_bind_bucket_for_each(tb, &head->chain) {
				if (net_eq(ib_net(tb), net) &&
				    tb->port == port) {
					/* ports open for reuse by bind()
					 * are not safe for connect() */
					if (tb->fastreuse >= 0 ||
					    tb->fastreuseport >= 0)
						goto next_port;
					WARN_ON(hlist_empty(&tb->owners));
					if (!check_established(death_row, sk,
							      port, &tw))
						goto ok;
					goto next_port;
				}
			}

			/* no bucket for this port yet: claim it */
			tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					net, head, port);
			if (!tb) {
				spin_unlock(&head->lock);
				break;
			}
			tb->fastreuse = -1;
			tb->fastreuseport = -1;
			goto ok;

		next_port:
			spin_unlock(&head->lock);
		}
		local_bh_enable();

		return -EADDRNOTAVAIL;

ok:
		hint += i;

		/* Head lock still held and bh's disabled */
		inet_bind_hash(sk, tb, port);
		if (sk_unhashed(sk)) {
			inet_sk(sk)->inet_sport = htons(port);
			twrefcnt += hash(sk, tw);
		}
		if (tw)
			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
		spin_unlock(&head->lock);

		if (tw) {
			/* kill the recycled TIME_WAIT socket and drop
			 * every reference we accumulated above */
			inet_twsk_deschedule(tw, death_row);
			while (twrefcnt) {
				twrefcnt--;
				inet_twsk_put(tw);
			}
		}

		ret = 0;
		goto out;
	}

	/* socket was already bound to snum: re-check that port */
	head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
	tb = inet_csk(sk)->icsk_bind_hash;
	spin_lock_bh(&head->lock);
	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
		/* sole owner of the bucket: no conflict possible */
		hash(sk, NULL);
		spin_unlock_bh(&head->lock);
		return 0;
	} else {
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, snum, NULL);
out:
		local_bh_enable();
		return ret;
	}
}
|
||||
|
||||
/*
 * Bind a port for a connect operation and hash it.
 * Thin wrapper that plugs the IPv4 established-check and hash
 * callbacks into __inet_hash_connect().
 */
int inet_hash_connect(struct inet_timewait_death_row *death_row,
		      struct sock *sk)
{
	return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
			__inet_check_established, __inet_hash_nolisten);
}
EXPORT_SYMBOL_GPL(inet_hash_connect);
|
||||
|
||||
/*
 * Initialise the listening-hash part of @h.  Each chain gets a
 * distinct nulls marker (i + LISTENING_NULLS_BASE) so lockless RCU
 * lookups can detect that they drifted onto a different chain.
 */
void inet_hashinfo_init(struct inet_hashinfo *h)
{
	int i;

	atomic_set(&h->bsockets, 0);
	for (i = 0; i < INET_LHTABLE_SIZE; i++) {
		spin_lock_init(&h->listening_hash[i].lock);
		INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
				      i + LISTENING_NULLS_BASE);
	}
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
|
||||
374
net/ipv4/inet_lro.c
Normal file
374
net/ipv4/inet_lro.c
Normal file
|
|
@ -0,0 +1,374 @@
|
|||
/*
|
||||
* linux/net/ipv4/inet_lro.c
|
||||
*
|
||||
* Large Receive Offload (ipv4 / tcp)
|
||||
*
|
||||
* (C) Copyright IBM Corp. 2007
|
||||
*
|
||||
* Authors:
|
||||
* Jan-Bernd Themann <themann@de.ibm.com>
|
||||
* Christoph Raisch <raisch@de.ibm.com>
|
||||
*
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/inet_lro.h>
|
||||
#include <net/checksum.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
|
||||
MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
|
||||
|
||||
#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
|
||||
#define IP_HDR_LEN(iph) (iph->ihl << 2)
|
||||
#define TCP_PAYLOAD_LENGTH(iph, tcph) \
|
||||
(ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
|
||||
|
||||
#define IPH_LEN_WO_OPTIONS 5
|
||||
#define TCPH_LEN_WO_OPTIONS 5
|
||||
#define TCPH_LEN_W_TIMESTAMP 8
|
||||
|
||||
#define LRO_MAX_PG_HLEN 64
|
||||
|
||||
#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
|
||||
|
||||
/*
 * Basic tcp checks whether packet is suitable for LRO.
 * Returns 0 if the segment may be aggregated, -1 otherwise.
 * @lro_desc may be NULL when probing the first packet of a session.
 */

static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
			    int len, const struct net_lro_desc *lro_desc)
{
	/* check ip header: don't aggregate padded frames */
	if (ntohs(iph->tot_len) != len)
		return -1;

	/* pure ACKs carry no payload worth aggregating */
	if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
		return -1;

	/* no IP options allowed */
	if (iph->ihl != IPH_LEN_WO_OPTIONS)
		return -1;

	/* any control flag or urgent data must reach the stack at once */
	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
	    tcph->rst || tcph->syn || tcph->fin)
		return -1;

	/* ECN congestion-experienced marks must not be coalesced away */
	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
		return -1;

	if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
	    tcph->doff != TCPH_LEN_W_TIMESTAMP)
		return -1;

	/* check tcp options (only timestamp allowed) */
	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
		__be32 *topt = (__be32 *)(tcph + 1);

		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
				   | (TCPOPT_TIMESTAMP << 8)
				   | TCPOLEN_TIMESTAMP))
			return -1;

		/* timestamp should be in right order */
		topt++;
		if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
				      ntohl(*topt)))
			return -1;

		/* timestamp reply should not be zero */
		topt++;
		if (*topt == 0)
			return -1;
	}

	return 0;
}
|
||||
|
||||
/*
 * Rewrite the IP/TCP headers of an aggregated super-packet before it
 * is handed to the stack: latest ACK/window, cumulative IP length and
 * a freshly computed TCP checksum over the merged payload.
 */
static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
{
	struct iphdr *iph = lro_desc->iph;
	struct tcphdr *tcph = lro_desc->tcph;
	__be32 *p;
	__wsum tcp_hdr_csum;

	tcph->ack_seq = lro_desc->tcp_ack;
	tcph->window = lro_desc->tcp_window;

	if (lro_desc->tcp_saw_tstamp) {
		/* propagate the most recent timestamp echo reply */
		p = (__be32 *)(tcph + 1);
		*(p+2) = lro_desc->tcp_rcv_tsecr;
	}

	/* patch tot_len, fixing the IP checksum incrementally */
	csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
	iph->tot_len = htons(lro_desc->ip_tot_len);

	/* recompute the TCP checksum: header csum (with check = 0)
	 * plus accumulated payload csum plus pseudo header */
	tcph->check = 0;
	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
	lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
	tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
					lro_desc->ip_tot_len -
					IP_HDR_LEN(iph), IPPROTO_TCP,
					lro_desc->data_csum);
}
|
||||
|
||||
/*
 * Recover the checksum of the TCP payload alone by folding the TCP
 * header and pseudo-header contributions back out of the packet's
 * checksum field.
 */
static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
{
	__wsum tcp_csum;
	__wsum tcp_hdr_csum;
	__wsum tcp_ps_hdr_csum;

	tcp_csum = ~csum_unfold(tcph->check);
	/* header csum, seeded with the (complemented) packet csum */
	tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);

	tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
					     len + TCP_HDR_LEN(tcph),
					     IPPROTO_TCP, 0);

	return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
			tcp_ps_hdr_csum);
}
|
||||
|
||||
/*
 * Start a new LRO session on @lro_desc with @skb as the parent packet.
 * Records the connection headers, the expected next sequence number
 * and the running payload checksum.
 */
static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
			  struct iphdr *iph, struct tcphdr *tcph)
{
	int nr_frags;
	__be32 *ptr;
	u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);

	nr_frags = skb_shinfo(skb)->nr_frags;
	lro_desc->parent = skb;
	lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
	lro_desc->iph = iph;
	lro_desc->tcph = tcph;
	lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
	lro_desc->tcp_ack = tcph->ack_seq;
	lro_desc->tcp_window = tcph->window;

	lro_desc->pkt_aggr_cnt = 1;
	lro_desc->ip_tot_len = ntohs(iph->tot_len);

	/* doff == 8 means header with timestamp option
	 * (TCPH_LEN_W_TIMESTAMP, validated by lro_tcp_ip_check()) */
	if (tcph->doff == 8) {
		ptr = (__be32 *)(tcph+1);
		lro_desc->tcp_saw_tstamp = 1;
		lro_desc->tcp_rcv_tsval = *(ptr+1);
		lro_desc->tcp_rcv_tsecr = *(ptr+2);
	}

	lro_desc->mss = tcp_data_len;
	lro_desc->active = 1;

	lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
						tcp_data_len);
}
|
||||
|
||||
static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
|
||||
{
|
||||
memset(lro_desc, 0, sizeof(struct net_lro_desc));
|
||||
}
|
||||
|
||||
/*
 * Account a newly merged segment in the session state shared by the
 * frag and skb aggregation paths: lengths, latest ACK/window, echo
 * reply, and the running payload checksum.
 */
static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
			   struct tcphdr *tcph, int tcp_data_len)
{
	struct sk_buff *parent = lro_desc->parent;
	__be32 *topt;

	lro_desc->pkt_aggr_cnt++;
	lro_desc->ip_tot_len += tcp_data_len;
	lro_desc->tcp_next_seq += tcp_data_len;
	lro_desc->tcp_window = tcph->window;
	lro_desc->tcp_ack = tcph->ack_seq;

	/* don't update tcp_rcv_tsval, would not work with PAWS */
	if (lro_desc->tcp_saw_tstamp) {
		topt = (__be32 *) (tcph + 1);
		lro_desc->tcp_rcv_tsecr = *(topt + 2);
	}

	/* fold the new payload csum in at the parent's current length */
	lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
					     lro_tcp_data_csum(iph, tcph,
							       tcp_data_len),
					     parent->len);

	parent->len += tcp_data_len;
	parent->data_len += tcp_data_len;
	if (tcp_data_len > lro_desc->mss)
		lro_desc->mss = tcp_data_len;
}
|
||||
|
||||
/*
 * Merge a whole skb into the session: strip its headers and chain the
 * payload onto the parent's frag_list.
 */
static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
			   struct iphdr *iph, struct tcphdr *tcph)
{
	struct sk_buff *parent = lro_desc->parent;
	int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);

	lro_add_common(lro_desc, iph, tcph, tcp_data_len);

	/* drop everything up to the payload */
	skb_pull(skb, (skb->len - tcp_data_len));
	parent->truesize += skb->truesize;

	if (lro_desc->last_skb)
		lro_desc->last_skb->next = skb;
	else
		skb_shinfo(parent)->frag_list = skb;

	lro_desc->last_skb = skb;
}
|
||||
|
||||
|
||||
static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
|
||||
struct iphdr *iph,
|
||||
struct tcphdr *tcph)
|
||||
{
|
||||
if ((lro_desc->iph->saddr != iph->saddr) ||
|
||||
(lro_desc->iph->daddr != iph->daddr) ||
|
||||
(lro_desc->tcph->source != tcph->source) ||
|
||||
(lro_desc->tcph->dest != tcph->dest))
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
|
||||
struct net_lro_desc *lro_arr,
|
||||
struct iphdr *iph,
|
||||
struct tcphdr *tcph)
|
||||
{
|
||||
struct net_lro_desc *lro_desc = NULL;
|
||||
struct net_lro_desc *tmp;
|
||||
int max_desc = lro_mgr->max_desc;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < max_desc; i++) {
|
||||
tmp = &lro_arr[i];
|
||||
if (tmp->active)
|
||||
if (!lro_check_tcp_conn(tmp, iph, tcph)) {
|
||||
lro_desc = tmp;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < max_desc; i++) {
|
||||
if (!lro_arr[i].active) {
|
||||
lro_desc = &lro_arr[i];
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
LRO_INC_STATS(lro_mgr, no_desc);
|
||||
out:
|
||||
return lro_desc;
|
||||
}
|
||||
|
||||
/*
 * Close an LRO session: fix up the aggregated headers (if more than
 * one packet was merged), set gso_size so GRO/GSO sees the original
 * MSS, and push the super-packet up the stack.
 */
static void lro_flush(struct net_lro_mgr *lro_mgr,
		      struct net_lro_desc *lro_desc)
{
	if (lro_desc->pkt_aggr_cnt > 1)
		lro_update_tcp_ip_header(lro_desc);

	skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;

	/* deliver via NAPI or the legacy path, per driver choice */
	if (lro_mgr->features & LRO_F_NAPI)
		netif_receive_skb(lro_desc->parent);
	else
		netif_rx(lro_desc->parent);

	LRO_INC_STATS(lro_mgr, flushed);
	lro_clear_desc(lro_desc);
}
|
||||
|
||||
/*
 * Try to aggregate @skb into an LRO session.
 * Returns 0 if the skb was consumed (aggregated or started a new
 * session), 1 if the caller must deliver it to the stack itself.
 */
static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
			  void *priv)
{
	struct net_lro_desc *lro_desc;
	struct iphdr *iph;
	struct tcphdr *tcph;
	u64 flags;
	int vlan_hdr_len = 0;

	/* driver-supplied header parser; bail out if absent or failing */
	if (!lro_mgr->get_skb_header ||
	    lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
				    &flags, priv))
		goto out;

	/* only TCP over IPv4 is handled */
	if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
		goto out;

	lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
	if (!lro_desc)
		goto out;

	/* non-stripped VLAN tags inflate skb->len past iph->tot_len */
	if ((skb->protocol == htons(ETH_P_8021Q)) &&
	    !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
		vlan_hdr_len = VLAN_HLEN;

	if (!lro_desc->active) { /* start new lro session */
		if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
			goto out;

		skb->ip_summed = lro_mgr->ip_summed_aggr;
		lro_init_desc(lro_desc, skb, iph, tcph);
		LRO_INC_STATS(lro_mgr, aggregated);
		return 0;
	}

	/* out-of-order segment ends the session */
	if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
		goto out2;

	if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
		goto out2;

	lro_add_packet(lro_desc, skb, iph, tcph);
	LRO_INC_STATS(lro_mgr, aggregated);

	/* flush before the aggregate outgrows the IP length field */
	if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
	    lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
		lro_flush(lro_mgr, lro_desc);

	return 0;

out2: /* send aggregated SKBs to stack */
	lro_flush(lro_mgr, lro_desc);

out:
	return 1;
}
|
||||
|
||||
/*
 * Driver entry point: feed @skb into LRO; if it was not aggregated,
 * deliver it to the stack directly (NAPI or legacy path).
 */
void lro_receive_skb(struct net_lro_mgr *lro_mgr,
		     struct sk_buff *skb,
		     void *priv)
{
	if (__lro_proc_skb(lro_mgr, skb, priv)) {
		if (lro_mgr->features & LRO_F_NAPI)
			netif_receive_skb(skb);
		else
			netif_rx(skb);
	}
}
EXPORT_SYMBOL(lro_receive_skb);
|
||||
|
||||
void lro_flush_all(struct net_lro_mgr *lro_mgr)
|
||||
{
|
||||
int i;
|
||||
struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
|
||||
|
||||
for (i = 0; i < lro_mgr->max_desc; i++) {
|
||||
if (lro_desc[i].active)
|
||||
lro_flush(lro_mgr, &lro_desc[i]);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(lro_flush_all);
|
||||
525
net/ipv4/inet_timewait_sock.c
Normal file
525
net/ipv4/inet_timewait_sock.c
Normal file
|
|
@ -0,0 +1,525 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* Generic TIME_WAIT sockets functions
|
||||
*
|
||||
* From code orinally in TCP
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/kmemcheck.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/inet_hashtables.h>
|
||||
#include <net/inet_timewait_sock.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
|
||||
/**
 * inet_twsk_unhash - unhash a timewait socket from established hash
 * @tw: timewait socket
 *
 * unhash a timewait socket from established hash, if hashed.
 * ehash lock must be held by caller.
 * Returns 1 if caller should call inet_twsk_put() after lock release.
 */
int inet_twsk_unhash(struct inet_timewait_sock *tw)
{
	if (hlist_nulls_unhashed(&tw->tw_node))
		return 0;

	hlist_nulls_del_rcu(&tw->tw_node);
	sk_nulls_node_init(&tw->tw_node);
	/*
	 * We cannot call inet_twsk_put() ourself under lock,
	 * caller must call it for us.
	 */
	return 1;
}
|
||||
|
||||
/**
 * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
 * @tw: timewait socket
 * @hashinfo: hashinfo pointer
 *
 * unhash a timewait socket from bind hash, if hashed.
 * bind hash lock must be held by caller.
 * Returns 1 if caller should call inet_twsk_put() after lock release.
 */
int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
			  struct inet_hashinfo *hashinfo)
{
	struct inet_bind_bucket *tb = tw->tw_tb;

	if (!tb)
		return 0;

	__hlist_del(&tw->tw_bind_node);
	tw->tw_tb = NULL;
	/* frees the bucket if this was its last owner */
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
	/*
	 * We cannot call inet_twsk_put() ourself under lock,
	 * caller must call it for us.
	 */
	return 1;
}
|
||||
|
||||
/* Must be called with locally disabled BHs.
 * Removes @tw from both the established and the bind hash and drops
 * the references those chains held in one atomic_sub.
 */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
			     struct inet_hashinfo *hashinfo)
{
	struct inet_bind_hashbucket *bhead;
	int refcnt;
	/* Unlink from established hashes. */
	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);

	spin_lock(lock);
	refcnt = inet_twsk_unhash(tw);
	spin_unlock(lock);

	/* Disassociate with bind bucket. */
	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
			hashinfo->bhash_size)];

	spin_lock(&bhead->lock);
	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
	spin_unlock(&bhead->lock);

	/* the caller must still hold at least one reference */
	BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt));
	atomic_sub(refcnt, &tw->tw_refcnt);
}
|
||||
|
||||
/*
 * Final destruction of a timewait socket: run the protocol destructor,
 * release the netns reference and return the object to its slab.
 * Called only when the last reference is gone (see inet_twsk_put()).
 */
void inet_twsk_free(struct inet_timewait_sock *tw)
{
	struct module *owner = tw->tw_prot->owner;
	twsk_destructor((struct sock *)tw);
#ifdef SOCK_REFCNT_DEBUG
	pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
#endif
	release_net(twsk_net(tw));
	kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
	/* module ref taken in inet_twsk_alloc() */
	module_put(owner);
}
|
||||
|
||||
/* Drop one reference on @tw and free it when the count hits zero. */
void inet_twsk_put(struct inet_timewait_sock *tw)
{
	if (atomic_dec_and_test(&tw->tw_refcnt))
		inet_twsk_free(tw);
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
|
||||
|
||||
/* Link @tw onto an established-hash chain, RCU-safely. */
static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
				   struct hlist_nulls_head *list)
{
	hlist_nulls_add_head_rcu(&tw->tw_node, list);
}
|
||||
|
||||
/* Link @tw onto a bind-bucket owner list (bhead lock held by caller). */
static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
				    struct hlist_head *list)
{
	hlist_add_head(&tw->tw_bind_node, list);
}
|
||||
|
||||
/*
 * Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the relevant info into it
 * from the SK, and mess with hash chains and list linkage.
 */
void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
			   struct inet_hashinfo *hashinfo)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	struct inet_bind_hashbucket *bhead;
	/* Step 1: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with inet->num != 0 MUST be bound in
	   binding cache, even if it is closed.
	 */
	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
			hashinfo->bhash_size)];
	spin_lock(&bhead->lock);
	tw->tw_tb = icsk->icsk_bind_hash;
	WARN_ON(!icsk->icsk_bind_hash);
	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
	spin_unlock(&bhead->lock);

	spin_lock(lock);

	/*
	 * Step 2: Hash TW into tcp ehash chain.
	 * Notes :
	 * - tw_refcnt is set to 3 because :
	 * - We have one reference from bhash chain.
	 * - We have one reference from ehash chain.
	 *   (the third is presumably held by the caller — confirm
	 *    against the tcp_time_wait() call path)
	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
	 * committed into memory all tw fields.
	 */
	atomic_set(&tw->tw_refcnt, 1 + 1 + 1);
	inet_twsk_add_node_rcu(tw, &ehead->chain);

	/* Step 3: Remove SK from hash chain */
	if (__sk_nulls_del_node_init_rcu(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);

	spin_unlock(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
|
||||
|
||||
/*
 * Allocate and initialise a timewait socket copying identity from @sk.
 * @state is stored as tw_substate (e.g. FIN_WAIT2 vs TIME_WAIT);
 * tw_state itself is always TCP_TIME_WAIT.  The refcount is left at 0
 * on purpose — see the comment below; __inet_twsk_hashdance() sets it.
 */
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
{
	struct inet_timewait_sock *tw =
		kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
				 GFP_ATOMIC);
	if (tw != NULL) {
		const struct inet_sock *inet = inet_sk(sk);

		kmemcheck_annotate_bitfield(tw, flags);

		/* Give us an identity. */
		tw->tw_daddr = inet->inet_daddr;
		tw->tw_rcv_saddr = inet->inet_rcv_saddr;
		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
		tw->tw_tos = inet->tos;
		tw->tw_num = inet->inet_num;
		tw->tw_state = TCP_TIME_WAIT;
		tw->tw_substate = state;
		tw->tw_sport = inet->inet_sport;
		tw->tw_dport = inet->inet_dport;
		tw->tw_family = sk->sk_family;
		tw->tw_reuse = sk->sk_reuse;
		tw->tw_hash = sk->sk_hash;
		tw->tw_ipv6only = 0;
		tw->tw_transparent = inet->transparent;
		tw->tw_prot = sk->sk_prot_creator;
		twsk_net_set(tw, hold_net(sock_net(sk)));
		/*
		 * Because we use RCU lookups, we should not set tw_refcnt
		 * to a non null value before everything is setup for this
		 * timewait socket.
		 */
		atomic_set(&tw->tw_refcnt, 0);
		inet_twsk_dead_node_init(tw);
		/* pin the protocol module while the tw socket lives */
		__module_get(tw->tw_prot->owner);
	}

	return tw;
}
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
|
||||
|
||||
/* Returns non-zero if quota exceeded.
 * Reap the timewait sockets parked in death-row slot @slot.  Called
 * with twdr->death_lock held; the lock is dropped around each kill,
 * hence the rescan of the list head after every iteration.
 */
static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr,
				    const int slot)
{
	struct inet_timewait_sock *tw;
	unsigned int killed;
	int ret;

	/* NOTE: compare this to previous version where lock
	 * was released after detaching chain. It was racy,
	 * because tw buckets are scheduled in not serialized context
	 * in 2.3 (with netfilter), and with softnet it is common, because
	 * soft irqs are not sequenced.
	 */
	killed = 0;
	ret = 0;
rescan:
	inet_twsk_for_each_inmate(tw, &twdr->cells[slot]) {
		__inet_twsk_del_dead_node(tw);
		spin_unlock(&twdr->death_lock);
		__inet_twsk_kill(tw, twdr->hashinfo);
#ifdef CONFIG_NET_NS
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
#endif
		inet_twsk_put(tw);
		killed++;
		spin_lock(&twdr->death_lock);
		if (killed > INET_TWDR_TWKILL_QUOTA) {
			ret = 1;
			break;
		}

		/* While we dropped twdr->death_lock, another cpu may have
		 * killed off the next TW bucket in the list, therefore
		 * do a fresh re-read of the hlist head node with the
		 * lock reacquired. We still use the hlist traversal
		 * macro in order to get the prefetches.
		 */
		goto rescan;
	}

	twdr->tw_count -= killed;
#ifndef CONFIG_NET_NS
	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed);
#endif
	return ret;
}
|
||||
|
||||
/*
 * Slow timer handler for the timewait death row.  Reaps the current
 * slot; if the per-tick kill quota is exceeded the remainder is
 * deferred to the twkill workqueue, otherwise the hand advances to
 * the next slot.
 */
void inet_twdr_hangman(unsigned long data)
{
	struct inet_timewait_death_row *twdr;
	unsigned int need_timer;

	twdr = (struct inet_timewait_death_row *)data;
	spin_lock(&twdr->death_lock);

	if (twdr->tw_count == 0)
		goto out;

	need_timer = 0;
	if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
		/* quota exceeded: let the workqueue finish this slot */
		twdr->thread_slots |= (1 << twdr->slot);
		schedule_work(&twdr->twkill_work);
		need_timer = 1;
	} else {
		/* We purged the entire slot, anything left? */
		if (twdr->tw_count)
			need_timer = 1;
		twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
	}
	if (need_timer)
		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
	spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
|
||||
|
||||
/*
 * Workqueue fallback for inet_twdr_hangman(): drains every slot whose
 * bit is set in thread_slots, yielding the CPU between quota-sized
 * batches so a huge backlog cannot stall the system.
 */
void inet_twdr_twkill_work(struct work_struct *work)
{
	struct inet_timewait_death_row *twdr =
		container_of(work, struct inet_timewait_death_row, twkill_work);
	int i;

	/* every slot must fit into the thread_slots bitmask */
	BUILD_BUG_ON((INET_TWDR_TWKILL_SLOTS - 1) >
			(sizeof(twdr->thread_slots) * 8));

	while (twdr->thread_slots) {
		spin_lock_bh(&twdr->death_lock);
		for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) {
			if (!(twdr->thread_slots & (1 << i)))
				continue;

			while (inet_twdr_do_twkill_work(twdr, i) != 0) {
				if (need_resched()) {
					spin_unlock_bh(&twdr->death_lock);
					schedule();
					spin_lock_bh(&twdr->death_lock);
				}
			}

			twdr->thread_slots &= ~(1 << i);
		}
		spin_unlock_bh(&twdr->death_lock);
	}
}
EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
|
||||
|
||||
/* These are always called from BH context. See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets.
 * Unlinks @tw from the death row (dropping the reference the row
 * held) and removes it from the hash tables immediately.
 */
void inet_twsk_deschedule(struct inet_timewait_sock *tw,
			  struct inet_timewait_death_row *twdr)
{
	spin_lock(&twdr->death_lock);
	if (inet_twsk_del_dead_node(tw)) {
		inet_twsk_put(tw);
		/* stop the slow timer when the row runs empty */
		if (--twdr->tw_count == 0)
			del_timer(&twdr->tw_timer);
	}
	spin_unlock(&twdr->death_lock);
	__inet_twsk_kill(tw, twdr->hashinfo);
}
EXPORT_SYMBOL(inet_twsk_deschedule);
|
||||
|
||||
/*
 * Park @tw on the death row for @timeo jiffies (capped by
 * @timewait_len).  Short timeouts go onto the fine-grained recycle
 * wheel (twcal_row), long ones onto the coarse slow-timer wheel
 * (cells).  Re-scheduling an already queued socket is allowed.
 */
void inet_twsk_schedule(struct inet_timewait_sock *tw,
			struct inet_timewait_death_row *twdr,
			const int timeo, const int timewait_len)
{
	struct hlist_head *list;
	int slot;

	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
	 * FINs (or previous seqments) are lost (probability of such event
	 * is p^(N+1), where p is probability to lose single packet and
	 * time to detect the loss is about RTO*(2^N - 1) with exponential
	 * backoff). Normal timewait length is calculated so, that we
	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
	 * [ BTW Linux. following BSD, violates this requirement waiting
	 *   only for 60sec, we should wait at least for 240 secs.
	 *   Well, 240 consumes too much of resources 8)
	 * ]
	 * This interval is not reduced to catch old duplicate and
	 * responces to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect
	 * old duplicates, we can reduce the interval to bounds required
	 * by RTO, rather than MSL. So, if peer understands PAWS, we
	 * kill tw bucket after 3.5*RTO (it is important that this number
	 * is greater than TS tick!) and detect old duplicates with help
	 * of PAWS.
	 */
	slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK;

	spin_lock(&twdr->death_lock);

	/* Unlink it, if it was scheduled */
	if (inet_twsk_del_dead_node(tw))
		twdr->tw_count--;
	else
		atomic_inc(&tw->tw_refcnt);

	if (slot >= INET_TWDR_RECYCLE_SLOTS) {
		/* Schedule to slow timer */
		if (timeo >= timewait_len) {
			slot = INET_TWDR_TWKILL_SLOTS - 1;
		} else {
			slot = DIV_ROUND_UP(timeo, twdr->period);
			if (slot >= INET_TWDR_TWKILL_SLOTS)
				slot = INET_TWDR_TWKILL_SLOTS - 1;
		}
		tw->tw_ttd = inet_tw_time_stamp() + timeo;
		/* slot is relative to the current hand position */
		slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1);
		list = &twdr->cells[slot];
	} else {
		tw->tw_ttd = inet_tw_time_stamp() + (slot << INET_TWDR_RECYCLE_TICK);

		if (twdr->twcal_hand < 0) {
			/* recycle timer idle: start it at slot 0 */
			twdr->twcal_hand = 0;
			twdr->twcal_jiffie = jiffies;
			twdr->twcal_timer.expires = twdr->twcal_jiffie +
					      (slot << INET_TWDR_RECYCLE_TICK);
			add_timer(&twdr->twcal_timer);
		} else {
			/* pull the timer forward if this entry is sooner */
			if (time_after(twdr->twcal_timer.expires,
				       jiffies + (slot << INET_TWDR_RECYCLE_TICK)))
				mod_timer(&twdr->twcal_timer,
					  jiffies + (slot << INET_TWDR_RECYCLE_TICK));
			slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1);
		}
		list = &twdr->twcal_row[slot];
	}

	hlist_add_head(&tw->tw_death_node, list);

	/* first entry arms the slow timer */
	if (twdr->tw_count++ == 0)
		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
	spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twsk_schedule);
|
||||
|
||||
/*
 * Timer handler for the fine-grained recycle wheel: kill every entry
 * whose slot has expired, then either re-arm the timer for the first
 * still-pending slot or mark the wheel idle (twcal_hand = -1).
 */
void inet_twdr_twcal_tick(unsigned long data)
{
	struct inet_timewait_death_row *twdr;
	int n, slot;
	unsigned long j;
	unsigned long now = jiffies;
	int killed = 0;
	int adv = 0;

	twdr = (struct inet_timewait_death_row *)data;

	spin_lock(&twdr->death_lock);
	if (twdr->twcal_hand < 0)
		goto out;

	slot = twdr->twcal_hand;
	j = twdr->twcal_jiffie;

	for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) {
		if (time_before_eq(j, now)) {
			/* slot deadline passed: reap everything in it */
			struct hlist_node *safe;
			struct inet_timewait_sock *tw;

			inet_twsk_for_each_inmate_safe(tw, safe,
						       &twdr->twcal_row[slot]) {
				__inet_twsk_del_dead_node(tw);
				__inet_twsk_kill(tw, twdr->hashinfo);
#ifdef CONFIG_NET_NS
				NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
#endif
				inet_twsk_put(tw);
				killed++;
			}
		} else {
			if (!adv) {
				/* remember where the hand stops */
				adv = 1;
				twdr->twcal_jiffie = j;
				twdr->twcal_hand = slot;
			}

			if (!hlist_empty(&twdr->twcal_row[slot])) {
				mod_timer(&twdr->twcal_timer, j);
				goto out;
			}
		}
		j += 1 << INET_TWDR_RECYCLE_TICK;
		slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1);
	}
	twdr->twcal_hand = -1;

out:
	if ((twdr->tw_count -= killed) == 0)
		del_timer(&twdr->tw_timer);
#ifndef CONFIG_NET_NS
	NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed);
#endif
	spin_unlock(&twdr->death_lock);
}
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
|
||||
|
||||
/*
 * Kill every TIME_WAIT socket of @family whose network namespace is
 * already dead (net->count == 0) — used on netns teardown.  The
 * established hash is walked under RCU; a reference is taken before
 * leaving the RCU section and each victim is descheduled with BHs
 * disabled.
 */
void inet_twsk_purge(struct inet_hashinfo *hashinfo,
		     struct inet_timewait_death_row *twdr, int family)
{
	struct inet_timewait_sock *tw;
	struct sock *sk;
	struct hlist_nulls_node *node;
	unsigned int slot;

	for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
restart_rcu:
		rcu_read_lock();
restart:
		sk_nulls_for_each_rcu(sk, node, &head->chain) {
			if (sk->sk_state != TCP_TIME_WAIT)
				continue;
			tw = inet_twsk(sk);
			if ((tw->tw_family != family) ||
				atomic_read(&twsk_net(tw)->count))
				continue;

			/* the socket may be freed concurrently; only
			 * proceed if we can still pin it */
			if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
				continue;

			/* re-check under our reference: the slot may have
			 * been reused for a different socket meanwhile */
			if (unlikely((tw->tw_family != family) ||
				     atomic_read(&twsk_net(tw)->count))) {
				inet_twsk_put(tw);
				goto restart;
			}

			rcu_read_unlock();
			local_bh_disable();
			inet_twsk_deschedule(tw, twdr);
			local_bh_enable();
			inet_twsk_put(tw);
			goto restart_rcu;
		}
		/* If the nulls value we got at the end of this lookup is
		 * not the expected one, we must restart lookup.
		 * We probably met an item that was moved to another chain.
		 */
		if (get_nulls_value(node) != slot)
			goto restart;
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);
|
||||
558
net/ipv4/inetpeer.c
Normal file
558
net/ipv4/inetpeer.c
Normal file
|
|
@ -0,0 +1,558 @@
|
|||
/*
|
||||
* INETPEER - A storage for permanent information about peers
|
||||
*
|
||||
* This source is covered by the GNU GPL, the same as all kernel sources.
|
||||
*
|
||||
* Authors: Andrey V. Savochkin <saw@msu.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/workqueue.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/inetpeer.h>
|
||||
#include <net/secure_seq.h>
|
||||
|
||||
/*
|
||||
* Theory of operations.
|
||||
* We keep one entry for each peer IP address. The nodes contains long-living
|
||||
* information about the peer which doesn't depend on routes.
|
||||
*
|
||||
* Nodes are removed only when reference counter goes to 0.
|
||||
* When it's happened the node may be removed when a sufficient amount of
|
||||
* time has been passed since its last use. The less-recently-used entry can
|
||||
* also be removed if the pool is overloaded i.e. if the total amount of
|
||||
* entries is greater-or-equal than the threshold.
|
||||
*
|
||||
* Node pool is organised as an AVL tree.
|
||||
* Such an implementation has been chosen not just for fun. It's a way to
|
||||
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
|
||||
* amount of long living nodes in a single hash slot would significantly delay
|
||||
* lookups performed with disabled BHs.
|
||||
*
|
||||
* Serialisation issues.
|
||||
* 1. Nodes may appear in the tree only with the pool lock held.
|
||||
* 2. Nodes may disappear from the tree only with the pool lock held
|
||||
* AND reference count being 0.
|
||||
* 3. Global variable peer_total is modified under the pool lock.
|
||||
* 4. struct inet_peer fields modification:
|
||||
* avl_left, avl_right, avl_parent, avl_height: pool lock
|
||||
* refcnt: atomically against modifications on other CPU;
|
||||
* usually under some other lock to prevent node disappearing
|
||||
* daddr: unchangeable
|
||||
*/
|
||||
|
||||
static struct kmem_cache *peer_cachep __read_mostly;
|
||||
|
||||
static LIST_HEAD(gc_list);
|
||||
static const int gc_delay = 60 * HZ;
|
||||
static struct delayed_work gc_work;
|
||||
static DEFINE_SPINLOCK(gc_lock);
|
||||
|
||||
#define node_height(x) x->avl_height
|
||||
|
||||
#define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
|
||||
#define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
|
||||
static const struct inet_peer peer_fake_node = {
|
||||
.avl_left = peer_avl_empty_rcu,
|
||||
.avl_right = peer_avl_empty_rcu,
|
||||
.avl_height = 0
|
||||
};
|
||||
|
||||
void inet_peer_base_init(struct inet_peer_base *bp)
|
||||
{
|
||||
bp->root = peer_avl_empty_rcu;
|
||||
seqlock_init(&bp->lock);
|
||||
bp->total = 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_peer_base_init);
|
||||
|
||||
#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
|
||||
|
||||
/* Exported for sysctl_net_ipv4. */
|
||||
int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
|
||||
* aggressively at this stage */
|
||||
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
|
||||
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
|
||||
|
||||
/* Deferred destruction of invalidated peer trees (queued by
 * inetpeer_inval_rcu()): flatten each tree on gc_list by pushing the
 * children of every visited node onto the list, free nodes whose refcnt
 * has dropped to zero, and reschedule itself while referenced nodes
 * remain.
 */
static void inetpeer_gc_worker(struct work_struct *work)
{
	struct inet_peer *p, *n, *c;
	struct list_head list;

	/* Take the current batch, leaving gc_list empty for new arrivals. */
	spin_lock_bh(&gc_lock);
	list_replace_init(&gc_list, &list);
	spin_unlock_bh(&gc_lock);

	if (list_empty(&list))
		return;

	list_for_each_entry_safe(p, n, &list, gc_list) {

		if (need_resched())
			cond_resched();

		/* Queue the children and detach them, so every node is
		 * visited exactly once.  The "1" protection argument is
		 * fine: the tree was already unpublished via RCU.
		 */
		c = rcu_dereference_protected(p->avl_left, 1);
		if (c != peer_avl_empty) {
			list_add_tail(&c->gc_list, &list);
			p->avl_left = peer_avl_empty_rcu;
		}

		c = rcu_dereference_protected(p->avl_right, 1);
		if (c != peer_avl_empty) {
			list_add_tail(&c->gc_list, &list);
			p->avl_right = peer_avl_empty_rcu;
		}

		/* Children were appended after the iterator sampled "n";
		 * refresh it so they are not skipped.
		 */
		n = list_entry(p->gc_list.next, struct inet_peer, gc_list);

		/* Only unreferenced nodes may be freed; the rest stay
		 * queued for the next pass.
		 */
		if (!atomic_read(&p->refcnt)) {
			list_del(&p->gc_list);
			kmem_cache_free(peer_cachep, p);
		}
	}

	if (list_empty(&list))
		return;

	/* Still-referenced nodes: put them back and retry after gc_delay. */
	spin_lock_bh(&gc_lock);
	list_splice(&list, &gc_list);
	spin_unlock_bh(&gc_lock);

	schedule_delayed_work(&gc_work, gc_delay);
}
|
||||
|
||||
/* Called from ip_output.c:ip_init */
/* Boot-time initialisation of the inetpeer subsystem: scale the GC
 * threshold to the machine's memory, create the slab cache and set up
 * the deferred GC work item.
 */
void __init inet_initpeers(void)
{
	struct sysinfo si;

	/* Use the straight interface to information about memory. */
	si_meminfo(&si);
	/* The values below were suggested by Alexey Kuznetsov
	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
	 * myself.  --SAW
	 */
	/* The checks are cumulative: a machine with <= 8MB of RAM ends up
	 * shifting the threshold right by 4 in total.
	 */
	if (si.totalram <= (32768*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
	if (si.totalram <= (16384*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* about 512KB */
	if (si.totalram <= (8192*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 2; /* about 128KB */

	/* SLAB_PANIC: allocation failure here is fatal at boot. */
	peer_cachep = kmem_cache_create("inet_peer_cache",
			sizeof(struct inet_peer),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			NULL);

	INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker);
}
|
||||
|
||||
static int addr_compare(const struct inetpeer_addr *a,
|
||||
const struct inetpeer_addr *b)
|
||||
{
|
||||
int i, n = (a->family == AF_INET ? 1 : 4);
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
if (a->addr.a6[i] == b->addr.a6[i])
|
||||
continue;
|
||||
if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Dereference a tree pointer on the writer side: caller must hold the
 * base's seqlock (checked via lockdep).
 */
#define rcu_deref_locked(X, BASE)				\
	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))

/*
 * Called with local BH disabled and the pool lock held.
 */
/* AVL lookup of _daddr in _base.  Records every slot on the path from
 * the root into _stack through the caller-declared "stackptr" cursor,
 * so that link_to_pool()/peer_avl_rebalance()/unlink_from_pool() can
 * later walk back up.  Evaluates to the matching node, or
 * peer_avl_empty when the address is absent.
 */
#define lookup(_daddr, _stack, _base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
								\
	stackptr = _stack;					\
	*stackptr++ = &_base->root;				\
	for (u = rcu_deref_locked(_base->root, _base);		\
	     u != peer_avl_empty;) {				\
		int cmp = addr_compare(_daddr, &u->daddr);	\
		if (cmp == 0)					\
			break;					\
		if (cmp == -1)					\
			v = &u->avl_left;			\
		else						\
			v = &u->avl_right;			\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, _base);		\
	}							\
	u;							\
})
|
||||
|
||||
/*
|
||||
* Called with rcu_read_lock()
|
||||
* Because we hold no lock against a writer, its quite possible we fall
|
||||
* in an endless loop.
|
||||
* But every pointer we follow is guaranteed to be valid thanks to RCU.
|
||||
* We exit from this function if number of links exceeds PEER_MAXDEPTH
|
||||
*/
|
||||
/* Lockless AVL lookup.  On success the entry is returned with its
 * refcnt already incremented; returns NULL on miss, on a deleted entry,
 * or when the walk exceeds PEER_MAXDEPTH links (possible because a
 * concurrent writer may rotate the tree under us).
 */
static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr,
				    struct inet_peer_base *base)
{
	struct inet_peer *u = rcu_dereference(base->root);
	int count = 0;

	while (u != peer_avl_empty) {
		int cmp = addr_compare(daddr, &u->daddr);
		if (cmp == 0) {
			/* Before taking a reference, check if this entry was
			 * deleted (refcnt=-1)
			 */
			if (!atomic_add_unless(&u->refcnt, 1, -1))
				u = NULL;
			return u;
		}
		if (cmp == -1)
			u = rcu_dereference(u->avl_left);
		else
			u = rcu_dereference(u->avl_right);
		/* Depth bound guards against chasing a tree being
		 * restructured by a concurrent writer.
		 */
		if (unlikely(++count == PEER_MAXDEPTH))
			break;
	}
	return NULL;
}
|
||||
|
||||
/* Called with local BH disabled and the pool lock held. */
/* Find the in-order predecessor of "start": go one step left, then
 * follow avl_right links to the bottom, pushing every visited slot on
 * the caller's stack.  Used by unlink_from_pool() to pick the
 * replacement node for a deletion with two children.
 */
#define lookup_rightempty(start, base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
	*stackptr++ = &start->avl_left;				\
	v = &start->avl_left;					\
	for (u = rcu_deref_locked(*v, base);			\
	     u->avl_right != peer_avl_empty_rcu;) {		\
		v = &u->avl_right;				\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, base);			\
	}							\
	u;							\
})
|
||||
|
||||
/* Called with local BH disabled and the pool lock held.
|
||||
* Variable names are the proof of operation correctness.
|
||||
* Look into mm/map_avl.c for more detail description of the ideas.
|
||||
*/
|
||||
/* Restore the AVL balance invariant along the path recorded on the
 * lookup stack, walking from the deepest touched slot back to the root.
 * The four classic rotation cases (left-left, left-right, right-right,
 * right-left) are spelled out; the RH/LH annotations prove the height
 * bookkeeping.  New subtree roots are published with RCU_INIT_POINTER,
 * which is safe because concurrent lockless readers tolerate a
 * transiently inconsistent tree (see lookup_rcu()'s depth bound).
 */
static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
			       struct inet_peer __rcu ***stackend,
			       struct inet_peer_base *base)
{
	struct inet_peer __rcu **nodep;
	struct inet_peer *node, *l, *r;
	int lh, rh;

	while (stackend > stack) {
		nodep = *--stackend;
		node = rcu_deref_locked(*nodep, base);
		l = rcu_deref_locked(node->avl_left, base);
		r = rcu_deref_locked(node->avl_right, base);
		lh = node_height(l);
		rh = node_height(r);
		if (lh > rh + 1) { /* l: RH+2 */
			struct inet_peer *ll, *lr, *lrl, *lrr;
			int lrh;
			ll = rcu_deref_locked(l->avl_left, base);
			lr = rcu_deref_locked(l->avl_right, base);
			lrh = node_height(lr);
			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
				/* Single right rotation. */
				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
				RCU_INIT_POINTER(l->avl_left, ll);       /* ll: RH+1 */
				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
				l->avl_height = node->avl_height + 1;
				RCU_INIT_POINTER(*nodep, l);
			} else { /* ll: RH, lr: RH+1 */
				/* Double left-right rotation. */
				lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */
				lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */
				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
				node->avl_height = rh + 1; /* node: RH+1 */
				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
				l->avl_height = rh + 1;	/* l: RH+1 */
				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
				lr->avl_height = rh + 2;
				RCU_INIT_POINTER(*nodep, lr);
			}
		} else if (rh > lh + 1) { /* r: LH+2 */
			struct inet_peer *rr, *rl, *rlr, *rll;
			int rlh;
			rr = rcu_deref_locked(r->avl_right, base);
			rl = rcu_deref_locked(r->avl_left, base);
			rlh = node_height(rl);
			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
				/* Single left rotation. */
				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
				r->avl_height = node->avl_height + 1;
				RCU_INIT_POINTER(*nodep, r);
			} else { /* rr: RH, rl: RH+1 */
				/* Double right-left rotation. */
				rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */
				rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */
				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
				node->avl_height = lh + 1; /* node: LH+1 */
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
				r->avl_height = lh + 1;	/* r: LH+1 */
				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
				rl->avl_height = lh + 2;
				RCU_INIT_POINTER(*nodep, rl);
			}
		} else {
			/* Already balanced: just refresh the cached height. */
			node->avl_height = (lh > rh ? lh : rh) + 1;
		}
	}
}
|
||||
|
||||
/* Called with local BH disabled and the pool lock held. */
/* Insert leaf n at the empty slot left at the top of the stack by a
 * failed lookup(), then rebalance the recorded path.  The
 * rcu_assign_pointer() publishes the node, so lockless readers can find
 * it from this point on.
 */
#define link_to_pool(n, base)					\
do {								\
	n->avl_height = 1;					\
	n->avl_left = peer_avl_empty_rcu;			\
	n->avl_right = peer_avl_empty_rcu;			\
	/* lockless readers can catch us now */			\
	rcu_assign_pointer(**--stackptr, n);			\
	peer_avl_rebalance(stack, stackptr, base);		\
} while (0)
|
||||
|
||||
static void inetpeer_free_rcu(struct rcu_head *head)
|
||||
{
|
||||
kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
|
||||
}
|
||||
|
||||
/* Remove p from its base's AVL tree and free it after an RCU grace
 * period.  Called with local BH disabled and the pool lock held, after
 * inet_peer_gc() has switched p's refcnt from 0 to -1 so no new
 * reference can be taken.
 */
static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
{
	struct inet_peer __rcu ***stackptr, ***delp;

	/* Re-run the lookup to rebuild the root-to-p path on the stack;
	 * p must still be present.
	 */
	if (lookup(&p->daddr, stack, base) != p)
		BUG();
	delp = stackptr - 1; /* *delp[0] == p */
	if (p->avl_left == peer_avl_empty_rcu) {
		/* No left subtree: splice the right child into p's slot. */
		*delp[0] = p->avl_right;
		--stackptr;
	} else {
		/* look for a node to insert instead of p */
		struct inet_peer *t;
		t = lookup_rightempty(p, base);
		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t);
		**--stackptr = t->avl_left;
		/* t is removed, t->daddr > x->daddr for any
		 * x in p->avl_left subtree.
		 * Put t in the old place of p. */
		RCU_INIT_POINTER(*delp[0], t);
		t->avl_left = p->avl_left;
		t->avl_right = p->avl_right;
		t->avl_height = p->avl_height;
		BUG_ON(delp[1] != &p->avl_left);
		delp[1] = &t->avl_left; /* was &p->avl_left */
	}
	peer_avl_rebalance(stack, stackptr, base);
	base->total--;
	/* Lockless readers may still traverse p; defer the actual free. */
	call_rcu(&p->rcu, inetpeer_free_rcu);
}
|
||||
|
||||
/* perform garbage collect on all items stacked during a lookup */
/* Reap unreferenced entries along the just-built lookup path whose idle
 * time (jiffies - dtime) exceeds a ttl scaled by pool pressure: zero
 * when base->total is at/above inet_peer_threshold, otherwise a value
 * between inet_peer_minttl and inet_peer_maxttl.  Returns the number of
 * entries freed.  Called with the base write seqlock held.
 */
static int inet_peer_gc(struct inet_peer_base *base,
			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
			struct inet_peer __rcu ***stackptr)
{
	struct inet_peer *p, *gchead = NULL;
	__u32 delta, ttl;
	int cnt = 0;

	if (base->total >= inet_peer_threshold)
		ttl = 0; /* be aggressive */
	else
		ttl = inet_peer_maxttl
				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
					base->total / inet_peer_threshold * HZ;
	stackptr--; /* last stack slot is peer_avl_empty */
	while (stackptr > stack) {
		stackptr--;
		p = rcu_deref_locked(**stackptr, base);
		if (atomic_read(&p->refcnt) == 0) {
			/* Pairs with smp_mb__before_atomic() in
			 * inet_putpeer(): dtime was written before the
			 * refcount dropped to zero.
			 */
			smp_rmb();
			delta = (__u32)jiffies - p->dtime;
			if (delta >= ttl &&
			    atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
				/* refcnt is now -1, so lookup_rcu() will
				 * refuse to resurrect it; queue for unlink.
				 */
				p->gc_next = gchead;
				gchead = p;
			}
		}
	}
	while ((p = gchead) != NULL) {
		gchead = p->gc_next;
		cnt++;
		unlink_from_pool(p, base, stack);
	}
	return cnt;
}
|
||||
|
||||
/* Look up (and with create != 0, allocate on miss) the inet_peer entry
 * for daddr in base.  Returns the entry with its refcnt held -- caller
 * releases it with inet_putpeer() -- or NULL on miss/allocation failure.
 */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
			       const struct inetpeer_addr *daddr,
			       int create)
{
	struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr;
	struct inet_peer *p;
	unsigned int sequence;
	int invalidated, gccnt = 0;

	/* Attempt a lockless lookup first.
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
	rcu_read_lock();
	sequence = read_seqbegin(&base->lock);
	p = lookup_rcu(daddr, base);
	invalidated = read_seqretry(&base->lock, sequence);
	rcu_read_unlock();

	if (p)
		return p;

	/* If no writer did a change during our lookup, we can return early. */
	if (!create && !invalidated)
		return NULL;

	/* retry an exact lookup, taking the lock before.
	 * At least, nodes should be hot in our cache.
	 */
	write_seqlock_bh(&base->lock);
relookup:
	p = lookup(daddr, stack, base);
	if (p != peer_avl_empty) {
		atomic_inc(&p->refcnt);
		write_sequnlock_bh(&base->lock);
		return p;
	}
	/* Miss: garbage-collect along the lookup path once, then retry,
	 * since GC may have made room / changed the tree.
	 */
	if (!gccnt) {
		gccnt = inet_peer_gc(base, stack, stackptr);
		if (gccnt && create)
			goto relookup;
	}
	p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
	if (p) {
		p->daddr = *daddr;
		atomic_set(&p->refcnt, 1);	/* caller's reference */
		atomic_set(&p->rid, 0);
		p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
		p->rate_tokens = 0;
		/* 60*HZ is arbitrary, but chosen enough high so that the first
		 * calculation of tokens is at its maximum.
		 */
		p->rate_last = jiffies - 60*HZ;
		INIT_LIST_HEAD(&p->gc_list);

		/* Link the node. */
		link_to_pool(p, base);
		base->total++;
	}
	write_sequnlock_bh(&base->lock);

	return p;
}
EXPORT_SYMBOL_GPL(inet_getpeer);
|
||||
|
||||
/* Release a reference taken by inet_getpeer()/lookup_rcu(). */
void inet_putpeer(struct inet_peer *p)
{
	/* Stamp the release time; inet_peer_gc() compares it against the
	 * pool's ttl to decide when the entry may be reaped.
	 */
	p->dtime = (__u32)jiffies;
	/* Pairs with the smp_rmb() in inet_peer_gc(): make the dtime
	 * store visible before the refcount can be observed as zero.
	 */
	smp_mb__before_atomic();
	atomic_dec(&p->refcnt);
}
EXPORT_SYMBOL_GPL(inet_putpeer);
|
||||
|
||||
/*
|
||||
* Check transmit rate limitation for given message.
|
||||
* The rate information is held in the inet_peer entries now.
|
||||
* This function is generic and could be used for other purposes
|
||||
* too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
|
||||
*
|
||||
* Note that the same inet_peer fields are modified by functions in
|
||||
* route.c too, but these work for packet destinations while xrlim_allow
|
||||
* works for icmp destinations. This means the rate limiting information
|
||||
* for one "ip object" is shared - and these ICMPs are twice limited:
|
||||
* by source and by destination.
|
||||
*
|
||||
* RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
|
||||
* SHOULD allow setting of rate limits
|
||||
*
|
||||
* Shared between ICMPv4 and ICMPv6.
|
||||
*/
|
||||
#define XRLIM_BURST_FACTOR 6
|
||||
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
|
||||
{
|
||||
unsigned long now, token;
|
||||
bool rc = false;
|
||||
|
||||
if (!peer)
|
||||
return true;
|
||||
|
||||
token = peer->rate_tokens;
|
||||
now = jiffies;
|
||||
token += now - peer->rate_last;
|
||||
peer->rate_last = now;
|
||||
if (token > XRLIM_BURST_FACTOR * timeout)
|
||||
token = XRLIM_BURST_FACTOR * timeout;
|
||||
if (token >= timeout) {
|
||||
token -= timeout;
|
||||
rc = true;
|
||||
}
|
||||
peer->rate_tokens = token;
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL(inet_peer_xrlim_allow);
|
||||
|
||||
/* RCU callback for inetpeer_invalidate_tree(): once the grace period
 * has elapsed, queue the detached tree root on gc_list and kick the
 * worker that tears the tree down (inetpeer_gc_worker()).
 */
static void inetpeer_inval_rcu(struct rcu_head *head)
{
	struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu);

	spin_lock_bh(&gc_lock);
	list_add_tail(&p->gc_list, &gc_list);
	spin_unlock_bh(&gc_lock);

	schedule_delayed_work(&gc_work, gc_delay);
}
|
||||
|
||||
/* Detach the whole AVL tree of a peer base under its write seqlock and
 * schedule its destruction (RCU grace period, then the gc worker).
 * Readers racing with us either see the old tree (kept alive by RCU) or
 * the new empty one.
 */
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
	struct inet_peer *root;

	write_seqlock_bh(&base->lock);

	root = rcu_deref_locked(base->root, base);
	if (root != peer_avl_empty) {
		base->root = peer_avl_empty_rcu;
		base->total = 0;
		call_rcu(&root->gc_rcu, inetpeer_inval_rcu);
	}

	write_sequnlock_bh(&base->lock);
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
|
||||
158
net/ipv4/ip_forward.c
Normal file
158
net/ipv4/ip_forward.c
Normal file
|
|
@ -0,0 +1,158 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* The IP forwarding functionality.
|
||||
*
|
||||
* Authors: see ip.c
|
||||
*
|
||||
* Fixes:
|
||||
* Many : Split from ip.c , see ip_input.c for
|
||||
* history.
|
||||
* Dave Gregorich : NULL ip_rt_put fix for multicast
|
||||
* routing.
|
||||
* Jos Vos : Add call_out_firewall before sending,
|
||||
* use output device for accounting.
|
||||
* Jos Vos : Call forward firewall after routing
|
||||
* (always use output device).
|
||||
* Mike McLagan : Routing by source
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/icmp.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/checksum.h>
|
||||
#include <linux/route.h>
|
||||
#include <net/route.h>
|
||||
#include <net/xfrm.h>
|
||||
|
||||
static bool ip_may_fragment(const struct sk_buff *skb)
|
||||
{
|
||||
return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) ||
|
||||
skb->ignore_df;
|
||||
}
|
||||
|
||||
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
|
||||
{
|
||||
if (skb->len <= mtu)
|
||||
return false;
|
||||
|
||||
if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/* Final forwarding step, invoked after the NF_INET_FORWARD hook: bump
 * the forwarding counters, re-process IP options if the packet carries
 * any, and hand the skb to the output path.
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	struct ip_options *opt	= &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}
|
||||
|
||||
/* Forward an IPv4 packet: validate it, enforce TTL and PMTU/DF rules,
 * decrement the TTL and push it through the NF_INET_FORWARD hook to
 * ip_forward_finish().  Returns NET_RX_SUCCESS or NET_RX_DROP; the skb
 * is always consumed.
 */
int ip_forward(struct sk_buff *skb)
{
	u32 mtu;
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options *opt	= &(IPCB(skb)->opt);

	/* that should never happen */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A locally-owned skb must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	/* LRO-merged frames cannot be forwarded intact. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
		goto drop;

	/* Router-alert option: let registered listeners consume it. */
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;

	skb_forward_csum(skb);

	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 */
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;

	if (!xfrm4_route_forward(skb))
		goto drop;

	rt = skb_rtable(skb);

	if (opt->is_strictroute && rt->rt_uses_gateway)
		goto sr_failed;

	IPCB(skb)->flags |= IPSKB_FORWARDED;
	mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
	if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) {
		/* DF set and packet too big: report the path MTU back. */
		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(mtu));
		goto drop;
	}

	/* We are about to mangle packet. Copy it! */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);

	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);

	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
	    !skb_sec_path(skb))
		ip_rt_send_redirect(skb);

	skb->priority = rt_tos2priority(iph->tos);

	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
		       rt->dst.dev, ip_forward_finish);

sr_failed:
	/*
	 *	Strict routing permits no gatewaying
	 */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	goto drop;

too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
|
||||
870
net/ipv4/ip_fragment.c
Normal file
870
net/ipv4/ip_fragment.c
Normal file
|
|
@ -0,0 +1,870 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* The IP fragmentation functionality.
|
||||
*
|
||||
* Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
|
||||
* Alan Cox <alan@lxorguk.ukuu.org.uk>
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : Split from ip.c , see ip_input.c for history.
|
||||
* David S. Miller : Begin massive cleanup...
|
||||
* Andi Kleen : Add sysctls.
|
||||
* xxxx : Overlapfrag bug.
|
||||
* Ultima : ip_expire() kernel panic.
|
||||
* Bill Hawes : Frag accounting and evictor fixes.
|
||||
* John McDonald : 0 length frag bug.
|
||||
* Alexey Kuznetsov: SMP races, threading, cleanup.
|
||||
* Patrick McHardy : LRU queue of frag heads for evictor.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "IPv4: " fmt
|
||||
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/route.h>
|
||||
#include <net/dst.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/inetpeer.h>
|
||||
#include <net/inet_frag.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/inet_ecn.h>
|
||||
|
||||
/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
|
||||
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
|
||||
* as well. Or notify me, at least. --ANK
|
||||
*/
|
||||
|
||||
static int sysctl_ipfrag_max_dist __read_mostly = 64;
|
||||
static const char ip_frag_cache_name[] = "ip4-frags";
|
||||
|
||||
/* Per-skb state kept while a fragment waits in a reassembly queue. */
struct ipfrag_skb_cb
{
	struct inet_skb_parm h;
	int offset;	/* fragment's offset within the original datagram
			 * (maintained by the queueing code, not shown in
			 * this chunk) */
};

/* Overlay on skb->cb for queued fragments. */
#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
	struct inet_frag_queue q;

	u32		user;		/* IP_DEFRAG_* caller identity (key) */
	__be32		saddr;		/* key: source address */
	__be32		daddr;		/* key: destination address */
	__be16		id;		/* key: IP identification */
	u8		protocol;	/* key: transport protocol */
	u8		ecn; /* RFC3168 support */
	int		iif;		/* ingress ifindex, used by ip_expire() */
	unsigned int	rid;
	struct inet_peer *peer;		/* source-peer ref taken in ip4_frag_init() */
};
|
||||
|
||||
/* One-hot encode the 2-bit ECN field of a TOS byte, so the codepoints
 * seen on a datagram's fragments can be OR-accumulated in ipq::ecn.
 */
static inline u8 ip4_frag_ecn(u8 tos)
{
	u8 ecn_bits = tos & INET_ECN_MASK;

	return 1 << ecn_bits;
}
|
||||
|
||||
static struct inet_frags ip4_frags;
|
||||
|
||||
/* Total memory currently used by this netns' IPv4 reassembly queues. */
int ip_frag_mem(struct net *net)
{
	return sum_frag_mem_limit(&net->ipv4.frags);
}
|
||||
|
||||
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
|
||||
struct net_device *dev);
|
||||
|
||||
struct ip4_create_arg {
|
||||
struct iphdr *iph;
|
||||
u32 user;
|
||||
};
|
||||
|
||||
/* Hash the (id, saddr, daddr, protocol) tuple that identifies a
 * fragment queue.  The random seed is initialised lazily, exactly once,
 * so the hash is unpredictable to off-host attackers.
 */
static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
{
	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
	return jhash_3words((__force u32)id << 16 | prot,
			    (__force u32)saddr, (__force u32)daddr,
			    ip4_frags.rnd);
}
|
||||
|
||||
static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
|
||||
{
|
||||
const struct ipq *ipq;
|
||||
|
||||
ipq = container_of(q, struct ipq, q);
|
||||
return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
|
||||
}
|
||||
|
||||
static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
|
||||
{
|
||||
const struct ipq *qp;
|
||||
const struct ip4_create_arg *arg = a;
|
||||
|
||||
qp = container_of(q, struct ipq, q);
|
||||
return qp->id == arg->iph->id &&
|
||||
qp->saddr == arg->iph->saddr &&
|
||||
qp->daddr == arg->iph->daddr &&
|
||||
qp->protocol == arg->iph->protocol &&
|
||||
qp->user == arg->user;
|
||||
}
|
||||
|
||||
/* inet_frags constructor: copy the identifying fields from the IP
 * header into the freshly allocated queue and, when fragment-distance
 * checking is enabled (sysctl_ipfrag_max_dist != 0), take a reference
 * on the source peer entry for use by the too-far heuristic.
 */
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	/* Recover the owning struct net from the embedded netns_ipv4. */
	struct net *net = container_of(ipv4, struct net, ipv4);

	const struct ip4_create_arg *arg = a;

	qp->protocol = arg->iph->protocol;
	qp->id = arg->iph->id;
	qp->ecn = ip4_frag_ecn(arg->iph->tos);
	qp->saddr = arg->iph->saddr;
	qp->daddr = arg->iph->daddr;
	qp->user = arg->user;
	qp->peer = sysctl_ipfrag_max_dist ?
		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL;
}
|
||||
|
||||
static __inline__ void ip4_frag_free(struct inet_frag_queue *q)
|
||||
{
|
||||
struct ipq *qp;
|
||||
|
||||
qp = container_of(q, struct ipq, q);
|
||||
if (qp->peer)
|
||||
inet_putpeer(qp->peer);
|
||||
}
|
||||
|
||||
|
||||
/* Destruction primitives. */
|
||||
|
||||
/* Drop one reference on the queue; the queue (and via ip4_frag_free()
 * its peer reference) is destroyed when the last reference goes away.
 */
static __inline__ void ipq_put(struct ipq *ipq)
{
	inet_frag_put(&ipq->q, &ip4_frags);
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
	inet_frag_kill(&ipq->q, &ip4_frags);
}
|
||||
|
||||
/*
|
||||
* Oops, a fragment queue timed out. Kill it and send an ICMP reply.
|
||||
*/
|
||||
/* Reassembly-timeout timer callback; "arg" is the inet_frag_queue
 * embedded in the ipq.  Kills the queue and, when appropriate, sends an
 * ICMP "Fragment Reassembly Timeout" back to the sender.
 */
static void ip_expire(unsigned long arg)
{
	struct ipq *qp;
	struct net *net;

	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
	net = container_of(qp->q.net, struct net, ipv4.frags);

	spin_lock(&qp->q.lock);

	/* Already completed (or killed) -- nothing left to do. */
	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto out;

	ipq_kill(qp);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);

	/* Queues torn down by the memory evictor stay silent; only a real
	 * timeout is worth an ICMP reply.
	 */
	if (!(qp->q.flags & INET_FRAG_EVICTED)) {
		struct sk_buff *head = qp->q.fragments;
		const struct iphdr *iph;
		int err;

		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);

		/* We can only report back if we hold the first fragment. */
		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
			goto out;

		rcu_read_lock();
		head->dev = dev_get_by_index_rcu(net, qp->iif);
		if (!head->dev)
			goto out_rcu_unlock;

		/* skb has no dst, perform route lookup again */
		iph = ip_hdr(head);
		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
					   iph->tos, head->dev);
		if (err)
			goto out_rcu_unlock;

		/* Only an end host needs to send an ICMP
		 * "Fragment Reassembly Timeout" message, per RFC792.
		 */
		if (qp->user == IP_DEFRAG_AF_PACKET ||
		    ((qp->user >= IP_DEFRAG_CONNTRACK_IN) &&
		     (qp->user <= __IP_DEFRAG_CONNTRACK_IN_END) &&
		     (skb_rtable(head)->rt_type != RTN_LOCAL)))
			goto out_rcu_unlock;

		/* Send an ICMP "Fragment Reassembly Timeout" message. */
		icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
out_rcu_unlock:
		rcu_read_unlock();
	}
out:
	spin_unlock(&qp->q.lock);
	ipq_put(qp);	/* drop the timer's reference */
}
|
||||
|
||||
/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 *
 * Returns a referenced ipq (caller must ipq_put()), or NULL on
 * allocation failure / hash-bucket overflow.
 */
static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user)
{
	struct inet_frag_queue *q;
	struct ip4_create_arg arg;
	unsigned int hash;

	arg.iph = iph;
	arg.user = user;

	/* Reassembly queues are keyed on (id, saddr, daddr, protocol). */
	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);

	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
	if (IS_ERR_OR_NULL(q)) {
		inet_frag_maybe_warn_overflow(q, pr_fmt());
		return NULL;
	}
	return container_of(q, struct ipq, q);
}
|
||||
|
||||
/* Is the fragment too far ahead to be part of ipq?
 *
 * Per-peer policing (sysctl ipfrag_max_dist): each arriving fragment
 * from a source bumps the peer's generation counter; if more than
 * 'max' fragments from the same source arrived since this queue last
 * saw one, the datagram is assumed stale and the queue is flushed.
 * Returns non-zero when the queue should be reinitialised.
 */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = sysctl_ipfrag_max_dist;
	unsigned int start, end;

	int rc;

	/* Policing disabled, or no peer was tracked for this source. */
	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

	/* Only a non-empty queue can be "too far"; unsigned subtraction
	 * handles counter wraparound. */
	rc = qp->q.fragments && (end - start) > max;

	if (rc) {
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
		IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
	}

	return rc;
}
|
||||
|
||||
/* Flush a queue declared stale by ip_frag_too_far() and reset it to
 * the empty state so reassembly can restart with the new fragment.
 * Returns 0 on success, -ETIMEDOUT if the queue's expiry timer has
 * already fired (the timer callback then owns teardown).
 */
static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *fp;
	unsigned int sum_truesize = 0;

	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
		/* Timer was inactive (callback running/ran): grab a ref
		 * on its behalf and bail out. */
		atomic_inc(&qp->q.refcnt);
		return -ETIMEDOUT;
	}

	/* Free every queued fragment, summing truesize so the frag
	 * memory accounting can be corrected in one call. */
	fp = qp->q.fragments;
	do {
		struct sk_buff *xp = fp->next;

		sum_truesize += fp->truesize;
		kfree_skb(fp);
		fp = xp;
	} while (fp);
	sub_frag_mem_limit(&qp->q, sum_truesize);

	/* Back to a pristine, empty queue. */
	qp->q.flags = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
	qp->q.fragments_tail = NULL;
	qp->iif = 0;
	qp->ecn = 0;

	return 0;
}
|
||||
|
||||
/* Add new segment to existing queue.
 *
 * Called with qp->q.lock held.  Validates the fragment against the
 * datagram length learned so far, trims overlaps with neighbouring
 * fragments, links the skb into the sorted fragment list, and kicks
 * off ip_frag_reasm() once first+last are present and the byte count
 * ("meat") matches the total length.
 *
 * Returns 0 on successful reassembly, -EINPROGRESS while waiting for
 * more fragments, or a negative error (the skb is freed on error).
 */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct sk_buff *prev, *next;
	struct net_device *dev;
	int flags, offset;
	int ihl, end;
	int err = -ENOENT;
	u8 ecn;

	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto err;

	/* Stale-queue policing: flush and restart unless reinit fails
	 * (note: the assignment inside unlikely() is intentional). */
	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		ipq_kill(qp);
		goto err;
	}

	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);

	/* Determine the position of this fragment. */
	end = offset + skb->len - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto err;
		qp->q.flags |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		/* Non-final fragments must end on an 8-byte boundary;
		 * trim a ragged tail and invalidate the hw checksum. */
		if (end&7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.flags & INET_FRAG_LAST_IN)
				goto err;
			qp->q.len = end;
		}
	}
	/* Zero-length payload fragment - drop. */
	if (end == offset)
		goto err;

	err = -ENOMEM;
	if (pskb_pull(skb, ihl) == NULL)
		goto err;

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto err;

	/* Find out which fragments are in front and at the back of us
	 * in the chain of fragments so far.  We must know where to put
	 * this fragment, right?
	 */
	prev = qp->q.fragments_tail;
	if (!prev || FRAG_CB(prev)->offset < offset) {
		/* Common in-order case: append at the tail. */
		next = NULL;
		goto found;
	}
	prev = NULL;
	for (next = qp->q.fragments; next != NULL; next = next->next) {
		if (FRAG_CB(next)->offset >= offset)
			break;	/* bingo! */
		prev = next;
	}

found:
	/* We found where to put this one.  Check for overlap with
	 * preceding fragment, and, if needed, align things so that
	 * any overlaps are eliminated.
	 */
	if (prev) {
		int i = (FRAG_CB(prev)->offset + prev->len) - offset;

		if (i > 0) {
			/* Trim the overlapping head off the new skb. */
			offset += i;
			err = -EINVAL;
			if (end <= offset)
				goto err;
			err = -ENOMEM;
			if (!pskb_pull(skb, i))
				goto err;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
	}

	err = -ENOMEM;

	/* Resolve overlaps with fragments that follow us: the new data
	 * wins, so trim or drop the old fragments. */
	while (next && FRAG_CB(next)->offset < end) {
		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */

		if (i < next->len) {
			/* Eat head of the next overlapped fragment
			 * and leave the loop. The next ones cannot overlap.
			 */
			if (!pskb_pull(next, i))
				goto err;
			FRAG_CB(next)->offset += i;
			qp->q.meat -= i;
			if (next->ip_summed != CHECKSUM_UNNECESSARY)
				next->ip_summed = CHECKSUM_NONE;
			break;
		} else {
			struct sk_buff *free_it = next;

			/* Old fragment is completely overridden with
			 * new one drop it.
			 */
			next = next->next;

			if (prev)
				prev->next = next;
			else
				qp->q.fragments = next;

			qp->q.meat -= free_it->len;
			sub_frag_mem_limit(&qp->q, free_it->truesize);
			kfree_skb(free_it);
		}
	}

	FRAG_CB(skb)->offset = offset;

	/* Insert this fragment in the chain of fragments. */
	skb->next = next;
	if (!next)
		qp->q.fragments_tail = skb;
	if (prev)
		prev->next = skb;
	else
		qp->q.fragments = skb;

	/* Remember the arrival interface (for ip_expire's ICMP) and
	 * detach the device from the queued skb. */
	dev = skb->dev;
	if (dev) {
		qp->iif = dev->ifindex;
		skb->dev = NULL;
	}
	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
	qp->ecn |= ecn;
	add_frag_mem_limit(&qp->q, skb->truesize);
	if (offset == 0)
		qp->q.flags |= INET_FRAG_FIRST_IN;

	/* Track the largest DF-marked fragment for frag_max_size. */
	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    skb->len + ihl > qp->q.max_size)
		qp->q.max_size = skb->len + ihl;

	/* Complete datagram: all bytes seen, first and last present. */
	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		/* Hide skb's dst during reassembly, restore after. */
		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, prev, dev);
		skb->_skb_refdst = orefdst;
		return err;
	}

	skb_dst_drop(skb);
	return -EINPROGRESS;

err:
	kfree_skb(skb);
	return err;
}
|
||||
|
||||
|
||||
/* Build a new IP datagram from all its fragments.
 *
 * Called from ip_frag_queue() with the queue lock held and the chain
 * complete.  'prev' is the fragment preceding the one that completed
 * the datagram (so that one can be made the head), 'dev' the arrival
 * device.  Glues the chain onto the head skb (coalescing or via
 * frag_list), rewrites the IP header, and returns 0 or a negative
 * error.
 */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
			 struct net_device *dev)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct iphdr *iph;
	struct sk_buff *fp, *head = qp->q.fragments;
	int len;
	int ihlen;
	int err;
	int sum_truesize;
	u8 ecn;

	ipq_kill(qp);

	/* RFC 3168-style ECN reconciliation; 0xff marks an invalid
	 * combination of per-fragment ECN bits. */
	ecn = ip_frag_ecn_table[qp->ecn];
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}
	/* Make the one we just received the head. */
	if (prev) {
		head = prev->next;
		fp = skb_clone(head, GFP_ATOMIC);
		if (!fp)
			goto out_nomem;

		fp->next = head->next;
		if (!fp->next)
			qp->q.fragments_tail = fp;
		prev->next = fp;

		/* Swap identities: 'head' takes over the old list head. */
		skb_morph(head, qp->q.fragments);
		head->next = qp->q.fragments->next;

		consume_skb(qp->q.fragments);
		qp->q.fragments = head;
	}

	WARN_ON(head == NULL);
	WARN_ON(FRAG_CB(head)->offset != 0);

	/* Allocate a new buffer for the datagram. */
	ihlen = ip_hdrlen(head);
	len = ihlen + qp->q.len;

	/* Reassembled length must fit the 16-bit tot_len field. */
	err = -E2BIG;
	if (len > 65535)
		goto out_oversize;

	/* Head of list must not be cloned. */
	if (skb_unclone(head, GFP_ATOMIC))
		goto out_nomem;

	/* If the first fragment is fragmented itself, we split
	 * it to two chunks: the first with data and paged part
	 * and the second, holding only fragments. */
	if (skb_has_frag_list(head)) {
		struct sk_buff *clone;
		int i, plen = 0;

		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
			goto out_nomem;
		clone->next = head->next;
		head->next = clone;
		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
		skb_frag_list_init(head);
		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
		clone->len = clone->data_len = head->data_len - plen;
		head->data_len -= clone->len;
		head->len -= clone->len;
		clone->csum = 0;
		clone->ip_summed = head->ip_summed;
		add_frag_mem_limit(&qp->q, clone->truesize);
	}

	/* Expose the IP header again on the head skb. */
	skb_push(head, head->data - skb_network_header(head));

	/* Fold every remaining fragment into 'head', coalescing into
	 * its linear/paged area when possible, else chaining onto
	 * frag_list; checksums are combined along the way. */
	sum_truesize = head->truesize;
	for (fp = head->next; fp;) {
		bool headstolen;
		int delta;
		struct sk_buff *next = fp->next;

		sum_truesize += fp->truesize;
		if (head->ip_summed != fp->ip_summed)
			head->ip_summed = CHECKSUM_NONE;
		else if (head->ip_summed == CHECKSUM_COMPLETE)
			head->csum = csum_add(head->csum, fp->csum);

		if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
			kfree_skb_partial(fp, headstolen);
		} else {
			if (!skb_shinfo(head)->frag_list)
				skb_shinfo(head)->frag_list = fp;
			head->data_len += fp->len;
			head->len += fp->len;
			head->truesize += fp->truesize;
		}
		fp = next;
	}
	sub_frag_mem_limit(&qp->q, sum_truesize);

	head->next = NULL;
	head->dev = dev;
	head->tstamp = qp->q.stamp;
	IPCB(head)->frag_max_size = qp->q.max_size;

	iph = ip_hdr(head);
	/* max_size != 0 implies at least one fragment had IP_DF set */
	iph->frag_off = qp->q.max_size ? htons(IP_DF) : 0;
	iph->tot_len = htons(len);
	iph->tos |= ecn;
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
	qp->q.fragments = NULL;
	qp->q.fragments_tail = NULL;
	return 0;

out_nomem:
	LIMIT_NETDEBUG(KERN_ERR pr_fmt("queue_glue: no memory for gluing queue %p\n"),
		       qp);
	err = -ENOMEM;
	goto out_fail;
out_oversize:
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
	return err;
}
|
||||
|
||||
/* Process an incoming IP datagram fragment.
 *
 * Entry point for all defragmentation users ('user' identifies the
 * caller, e.g. local delivery vs. conntrack).  Finds or creates the
 * queue for this fragment and feeds it in under the queue lock.
 * Consumes the skb in all cases; returns 0 when the datagram became
 * complete, -EINPROGRESS while more fragments are needed, or a
 * negative error.
 */
int ip_defrag(struct sk_buff *skb, u32 user)
{
	struct ipq *qp;
	struct net *net;

	net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev);
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);

	/* Lookup (or create) queue header */
	if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) {
		int ret;

		spin_lock(&qp->q.lock);

		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	/* No queue could be found/created (memory pressure). */
	IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
EXPORT_SYMBOL(ip_defrag);
|
||||
|
||||
/* Defragment an skb only if it actually is an IPv4 fragment; used by
 * callers (e.g. packet sockets / macvlan) that receive raw frames.
 * Non-fragment or non-IPv4 skbs are returned untouched.  For a
 * fragment, the skb is unshared and fed to ip_defrag(): NULL is
 * returned while reassembly is in progress (the skb was consumed),
 * otherwise the (possibly reassembled) skb.
 */
struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	/* Copy the header out rather than pulling: the skb may be
	 * shared at this point. */
	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	/* Sanity-check tot_len against the actual skb length. */
	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		/* Unshare before ip_defrag() mutates the skb. */
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
				return skb;
			if (pskb_trim_rcsum(skb, netoff + len))
				return skb;
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(skb, user))
				return NULL;
			/* Reassembled skb: any rx hash is now stale. */
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);
|
||||
|
||||
#ifdef CONFIG_SYSCTL
/* Lower bound for the minmax handlers below. */
static int zero;

/* Per-network-namespace fragment sysctls (net.ipv4.ipfrag_*).  The
 * .data/.extra pointers reference init_net here and are repointed at
 * the actual netns in ip4_frags_ns_ctl_register().  high/low thresh
 * are cross-clamped so low <= high always holds.
 */
static struct ctl_table ip4_frags_ns_ctl_table[] = {
	{
		.procname = "ipfrag_high_thresh",
		.data = &init_net.ipv4.frags.high_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &init_net.ipv4.frags.low_thresh
	},
	{
		.procname = "ipfrag_low_thresh",
		.data = &init_net.ipv4.frags.low_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &zero,
		.extra2 = &init_net.ipv4.frags.high_thresh
	},
	{
		.procname = "ipfrag_time",
		.data = &init_net.ipv4.frags.timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{ }
};
|
||||
|
||||
/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;

/* Global (init_net only) fragment sysctls.  ipfrag_secret_interval is
 * kept as a writable dummy purely for ABI compatibility; its value is
 * never read.
 */
static struct ctl_table ip4_frags_ctl_table[] = {
	{
		.procname = "ipfrag_secret_interval",
		.data = &ip4_frags_secret_interval_unused,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "ipfrag_max_dist",
		.data = &sysctl_ipfrag_max_dist,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = &zero
	},
	{ }
};
|
||||
|
||||
/* Register the per-netns fragment sysctls.  For init_net the static
 * table is used directly; other namespaces get a kmemdup'd copy with
 * the .data/.extra pointers repointed at their own counters.  Returns
 * 0 or -ENOMEM.
 */
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (table == NULL)
			goto err_alloc;

		table[0].data = &net->ipv4.frags.high_thresh;
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		/* high_thresh in a child netns is capped by init_net's. */
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[1].extra2 = &net->ipv4.frags.high_thresh;
		table[2].data = &net->ipv4.frags.timeout;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (hdr == NULL)
		goto err_reg;

	/* Keep the header so ip4_frags_ns_ctl_unregister can find the
	 * table copy again. */
	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}
|
||||
|
||||
/* Tear down the per-netns fragment sysctls and free the table copy
 * allocated by ip4_frags_ns_ctl_register().
 */
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table_header *hdr = net->ipv4.frags_hdr;
	/* Grab the table pointer before unregistering invalidates it. */
	struct ctl_table *tbl = hdr->ctl_table_arg;

	unregister_net_sysctl_table(hdr);
	kfree(tbl);
}
|
||||
|
||||
/* Register the global (non-netns) fragment sysctls once at boot. */
static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
/* CONFIG_SYSCTL disabled: no-op stubs so callers stay unconditional. */
static inline int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}

static inline void ip4_frags_ns_ctl_unregister(struct net *net)
{
}

static inline void __init ip4_frags_ctl_register(void)
{
}
#endif
|
||||
|
||||
/* pernet init: set this namespace's fragment-cache defaults and
 * register its sysctls.
 */
static int __net_init ipv4_frags_init_net(struct net *net)
{
	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;

	inet_frags_init_net(&net->ipv4.frags);

	return ip4_frags_ns_ctl_register(net);
}
|
||||
|
||||
/* pernet exit: remove sysctls first, then drain and free every
 * fragment queue belonging to this namespace.
 */
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
}
|
||||
|
||||
/* Per-network-namespace lifecycle hooks for the IPv4 frag cache. */
static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};
|
||||
|
||||
/* Boot-time initialisation of IPv4 fragment reassembly: register
 * sysctls and pernet hooks, then wire up the ip4_frags descriptor
 * (hash/match/constructor/destructor/expiry callbacks) and create the
 * backing slab cache.
 */
void __init ipfrag_init(void)
{
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
	ip4_frags.hashfn = ip4_hashfn;
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.skb_free = NULL;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.match = ip4_frag_match;
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
}
|
||||
921
net/ipv4/ip_gre.c
Normal file
921
net/ipv4/ip_gre.c
Normal file
|
|
@ -0,0 +1,921 @@
|
|||
/*
|
||||
* Linux NET3: GRE over IP protocol decoder.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/mroute.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/if_ether.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/ip_tunnels.h>
|
||||
#include <net/arp.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/dsfield.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
#include <net/rtnetlink.h>
|
||||
#include <net/gre.h>
|
||||
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
#include <net/ipv6.h>
|
||||
#include <net/ip6_fib.h>
|
||||
#include <net/ip6_route.h>
|
||||
#endif
|
||||
|
||||
/*
|
||||
Problems & solutions
|
||||
--------------------
|
||||
|
||||
1. The most important issue is detecting local dead loops.
|
||||
They would cause complete host lockup in transmit, which
|
||||
would be "resolved" by stack overflow or, if queueing is enabled,
|
||||
with infinite looping in net_bh.
|
||||
|
||||
We cannot track such dead loops during route installation,
|
||||
it is infeasible task. The most general solutions would be
|
||||
to keep skb->encapsulation counter (sort of local ttl),
|
||||
and silently drop packet when it expires. It is a good
|
||||
solution, but it supposes maintaining new variable in ALL
|
||||
skb, even if no tunneling is used.
|
||||
|
||||
Current solution: xmit_recursion breaks dead loops. This is a percpu
|
||||
counter, since when we enter the first ndo_xmit(), cpu migration is
|
||||
forbidden. We force an exit if this counter reaches RECURSION_LIMIT
|
||||
|
||||
2. Networking dead loops would not kill routers, but would really
|
||||
kill network. IP hop limit plays role of "t->recursion" in this case,
|
||||
if we copy it from packet being encapsulated to upper header.
|
||||
It is very good solution, but it introduces two problems:
|
||||
|
||||
- Routing protocols, using packets with ttl=1 (OSPF, RIP2),
|
||||
do not work over tunnels.
|
||||
- traceroute does not work. I planned to relay ICMP from tunnel,
|
||||
so that this problem would be solved and traceroute output
|
||||
would even more informative. This idea appeared to be wrong:
|
||||
only Linux complies to rfc1812 now (yes, guys, Linux is the only
|
||||
true router now :-)), all routers (at least, in neighbourhood of mine)
|
||||
return only 8 bytes of payload. It is the end.
|
||||
|
||||
Hence, if we want that OSPF worked or traceroute said something reasonable,
|
||||
we should search for another solution.
|
||||
|
||||
One of them is to parse packet trying to detect inner encapsulation
|
||||
made by our node. It is difficult or even impossible, especially,
|
||||
taking into account fragmentation. TO be short, ttl is not solution at all.
|
||||
|
||||
Current solution: The solution was UNEXPECTEDLY SIMPLE.
|
||||
We force DF flag on tunnels with preconfigured hop limit,
|
||||
that is ALL. :-) Well, it does not remove the problem completely,
|
||||
but exponential growth of network traffic is changed to linear
|
||||
(branches, that exceed pmtu are pruned) and tunnel mtu
|
||||
rapidly degrades to value <68, where looping stops.
|
||||
Yes, it is not good if there exists a router in the loop,
|
||||
which does not force DF, even when encapsulating packets have DF set.
|
||||
But it is not our problem! Nobody could accuse us, we made
|
||||
all that we could make. Even if it is your gated who injected
|
||||
fatal route to network, even if it were you who configured
|
||||
fatal static route: you are innocent. :-)
|
||||
|
||||
Alexey Kuznetsov.
|
||||
*/
|
||||
|
||||
/* When set (default), packets arriving with an invalid ECN combination
 * are logged (rate-limited) before being dropped in ip_tunnel_rcv(). */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);

/* net_generic() ids for the two tunnel flavours: plain GRE (layer 3)
 * and gretap (Ethernet-over-GRE). */
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
|
||||
|
||||
/* ICMP error handler for GRE-encapsulated traffic: map the inner
 * header back to a tunnel and record the error in its err_count /
 * err_time state.  Returns PACKET_RCVD when the error is consumed,
 * PACKET_REJECT when no matching tunnel exists.
 */
static int ipgre_err(struct sk_buff *skb, u32 info,
		     const struct tnl_ptk_info *tpi)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means, that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put GRE key to the third word
	   in GRE header. It makes impossible maintaining even soft
	   state for keyed GRE tunnels with enabled checksum. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by Cisco employee,
	   what the hell these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;

	/* Filter out ICMP types/codes we never act on. */
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return PACKET_RCVD;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return PACKET_RCVD;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return PACKET_RCVD;
		break;

	case ICMP_REDIRECT:
		break;
	}

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	/* Inner IP header of the offending packet, quoted right after
	 * the ICMP header. */
	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (t == NULL)
		return PACKET_REJECT;

	/* NBMA / multicast tunnels: no single remote to attribute the
	 * error to. */
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return PACKET_RCVD;

	/* ttl inherited from inner packet: TTL-exceeded is expected. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return PACKET_RCVD;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
	return PACKET_RCVD;
}
|
||||
|
||||
/* Receive path: look up the tunnel matching the outer addresses and
 * GRE flags/key and hand the decapsulated skb to the generic tunnel
 * receive code.  Returns PACKET_RCVD on a match, PACKET_REJECT
 * otherwise (caller may then send ICMP port-unreachable).
 */
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	/* ETH_P_TEB payload means gretap (Ethernet-in-GRE). */
	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		skb_pop_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_REJECT;
}
|
||||
|
||||
/* Common transmit tail for gre and gretap: build the GRE header from
 * the tunnel's output parameters (flags/key/seq) and pass the skb to
 * the generic IP tunnel transmit path for outer-IP encapsulation.
 */
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct tnl_ptk_info tpi;

	tpi.flags = tunnel->parms.o_flags;
	tpi.proto = proto;
	tpi.key = tunnel->parms.o_key;
	/* Per-tunnel sequence number, only when TUNNEL_SEQ is set. */
	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;
	tpi.seq = htonl(tunnel->o_seqno);

	/* Push GRE header. */
	gre_build_header(skb, &tpi, tunnel->tun_hlen);

	skb_set_inner_protocol(skb, tpi.proto);

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
|
||||
|
||||
/* ndo_start_xmit for layer-3 GRE devices.  Two cases: devices with
 * header_ops already carry outer-IP + GRE headers built by
 * ipgre_header() (NBMA/broadcast mode) and only need the headers
 * pulled back off; normal devices use the tunnel's configured outer
 * header.  Always returns NETDEV_TX_OK (errors are counted as drops).
 */
static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		/* Outer IP header was prebuilt by ipgre_header(). */
		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to gre header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	/* GSO/checksum handling; may segment or replace the skb. */
	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
	if (IS_ERR(skb))
		goto out;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);

	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
out:
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
|
||||
|
||||
/* ndo_start_xmit for gretap (Ethernet-over-GRE) devices: the whole
 * Ethernet frame is the payload, transmitted with protocol ETH_P_TEB.
 * Always returns NETDEV_TX_OK; failures increment tx_dropped.
 */
static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	/* GSO/checksum handling; may segment or replace the skb. */
	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
	if (IS_ERR(skb))
		goto out;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));

	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
out:
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}
|
||||
|
||||
/* SIOC{GET,ADD,CHG,DEL}TUNNEL ioctl handler: validate the user's
 * ip_tunnel_parm, translate GRE wire-format flags to/from internal
 * TUNNEL_* flags, and delegate to the generic ip_tunnel_ioctl().
 * Returns 0 or a negative errno.
 */
static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	int err;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		/* Outer header must be plain IPv4/GRE, no options, only
		 * DF in frag_off, and no GRE version/routing bits. */
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			return -EINVAL;
	}
	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	/* Convert back to GRE wire format before copying out. */
	p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}
|
||||
|
||||
/* Nice toy. Unfortunately, useless in real life :-)
|
||||
It allows to construct virtual multiprotocol broadcast "LAN"
|
||||
over the Internet, provided multicast routing is tuned.
|
||||
|
||||
|
||||
I have no idea was this bicycle invented before me,
|
||||
so that I had to set ARPHRD_IPGRE to a random value.
|
||||
I have an impression, that Cisco could make something similar,
|
||||
but this feature is apparently missing in IOS<=11.2(8).
|
||||
|
||||
I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
|
||||
with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
|
||||
|
||||
ping -t 255 224.66.66.66
|
||||
|
||||
If nobody answers, mbone does not work.
|
||||
|
||||
ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
|
||||
ip addr add 10.66.66.<somewhat>/24 dev Universe
|
||||
ifconfig Universe up
|
||||
ifconfig Universe add fe80::<Your_real_addr>/10
|
||||
ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
|
||||
ftp 10.66.66.66
|
||||
...
|
||||
ftp fec0:6666:6666::193.233.7.65
|
||||
...
|
||||
*/
|
||||
/* hard_header_ops->create for broadcast GRE: prepend the tunnel's outer
 * IPv4 header plus a GRE base header to @skb.  @saddr/@daddr, when given,
 * override the 4-byte addresses copied from the tunnel parameters.
 * Returns the pushed length, negated when daddr is still unset (caller
 * must fill it in before transmit).
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph+1);
	greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	/* Template outer header comes from the configured tunnel parms. */
	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	/* Negative return: header incomplete, destination not yet known. */
	return -(t->hlen + sizeof(*iph));
}
|
||||
|
||||
/* header_ops->parse: extract the "hardware" address of the sender, which
 * for an IPGRE device is the 4-byte outer IPv4 source address.
 */
static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;	/* address length */
}
|
||||
|
||||
/* Link-layer header ops used for broadcast/NBMA-style GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
|
||||
|
||||
#ifdef CONFIG_NET_IPGRE_BROADCAST
|
||||
static int ipgre_open(struct net_device *dev)
|
||||
{
|
||||
struct ip_tunnel *t = netdev_priv(dev);
|
||||
|
||||
if (ipv4_is_multicast(t->parms.iph.daddr)) {
|
||||
struct flowi4 fl4;
|
||||
struct rtable *rt;
|
||||
|
||||
rt = ip_route_output_gre(t->net, &fl4,
|
||||
t->parms.iph.daddr,
|
||||
t->parms.iph.saddr,
|
||||
t->parms.o_key,
|
||||
RT_TOS(t->parms.iph.tos),
|
||||
t->parms.link);
|
||||
if (IS_ERR(rt))
|
||||
return -EADDRNOTAVAIL;
|
||||
dev = rt->dst.dev;
|
||||
ip_rt_put(rt);
|
||||
if (__in_dev_get_rtnl(dev) == NULL)
|
||||
return -EADDRNOTAVAIL;
|
||||
t->mlink = dev->ifindex;
|
||||
ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ipgre_close(struct net_device *dev)
|
||||
{
|
||||
struct ip_tunnel *t = netdev_priv(dev);
|
||||
|
||||
if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
|
||||
struct in_device *in_dev;
|
||||
in_dev = inetdev_by_index(t->net, t->mlink);
|
||||
if (in_dev)
|
||||
ip_mc_dec_group(in_dev, t->parms.iph.daddr);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Device operations for plain (layer-3) "gre" tunnel devices. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};
|
||||
|
||||
/* Offload features advertised by GRE devices (scatter-gather, frag lists,
 * high DMA, hardware checksum).
 */
#define GRE_FEATURES	(NETIF_F_SG |		\
			 NETIF_F_FRAGLIST |	\
			 NETIF_F_HIGHDMA |	\
			 NETIF_F_HW_CSUM)
|
||||
|
||||
/* rtnl_link_ops->setup for "gre" devices: wire in the netdev ops, mark the
 * device type, and hand off to the generic ip_tunnel setup.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}
|
||||
|
||||
/* Common init for both "gre" and "gretap" devices: compute header sizes
 * from the configured output flags, derive headroom/MTU, and advertise
 * offload features.  GSO and lockless transmit are only enabled when no
 * output sequence numbers are generated (TUNNEL_SEQ clear).
 */
static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	int t_hlen;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	t_hlen = tunnel->hlen + sizeof(struct iphdr);

	/* +4 / -4: room for the GRE type field on top of the tunnel header. */
	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
	dev->mtu = ETH_DATA_LEN - t_hlen - 4;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported. */
		dev->features    |= NETIF_F_GSO_SOFTWARE;
		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}
|
||||
|
||||
/* ndo_init for "gre" devices.  Seeds dev_addr/broadcast from the tunnel's
 * outer addresses and installs header_ops for NBMA (no daddr) or multicast
 * (broadcast) operation; a multicast remote requires a local address.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	/* The 4-byte outer addresses double as the "hardware" addresses. */
	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: destination supplied per-packet via header_ops. */
		dev->header_ops = &ipgre_header_ops;

	return ip_tunnel_init(dev);
}
|
||||
|
||||
/* GRE demux registration: receive and ICMP-error handlers for this module. */
static struct gre_cisco_protocol ipgre_protocol = {
	.handler        = ipgre_rcv,
	.err_handler    = ipgre_err,
	.priority       = 0,
};
|
||||
|
||||
/* Per-network-namespace init: create the "gre" tunnel registry (and its
 * fallback device) for this netns.
 */
static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

/* Per-netns teardown: destroy all "gre" tunnels belonging to @net. */
static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
	ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
|
||||
|
||||
/* Netlink validation for "gre" links: reject GRE flag bits this
 * implementation does not support (version != 0, routing present).
 */
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}
|
||||
|
||||
/* Netlink validation for "gretap" links: additionally require a valid
 * Ethernet MAC (if supplied) and a non-zero remote address, then apply
 * the common GRE flag checks.
 */
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}
|
||||
|
||||
/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm.  Absent
 * attributes leave the zeroed defaults, except PMTU discovery which
 * defaults to on (DF set) when the attribute is missing or non-zero.
 */
static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	/* Wire-format GRE flag bits are converted to internal TUNNEL_* bits. */
	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
|
||||
|
||||
/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_u16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_u16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}
|
||||
|
||||
/* ndo_init for "gretap" (Ethernet-over-GRE) devices: only the common GRE
 * setup is needed; Ethernet specifics were handled in ipgre_tap_setup().
 */
static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);

	return ip_tunnel_init(dev);
}
|
||||
|
||||
/* Device operations for "gretap" (Ethernet-over-GRE) devices. */
static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};
|
||||
|
||||
/* rtnl_link_ops->setup for "gretap": initialise as an Ethernet device,
 * then register with the gretap per-netns tunnel registry.
 */
static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops		= &gre_tap_netdev_ops;
	dev->priv_flags 	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}
|
||||
|
||||
/* rtnl_link_ops->newlink: apply optional FOU/GUE encapsulation settings,
 * parse the tunnel parameters, and create the device.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		int err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	ipgre_netlink_parms(data, tb, &p);
	return ip_tunnel_newlink(dev, tb, &p);
}
|
||||
|
||||
/* rtnl_link_ops->changelink: same attribute handling as ipgre_newlink(),
 * but reconfigures an existing device.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		int err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	ipgre_netlink_parms(data, tb, &p);
	return ip_tunnel_changelink(dev, tb, &p);
}
|
||||
|
||||
/* Worst-case netlink payload for ipgre_fill_info(); must account for every
 * attribute that function can emit.
 */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		0;
}
|
||||
|
||||
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
||||
{
|
||||
struct ip_tunnel *t = netdev_priv(dev);
|
||||
struct ip_tunnel_parm *p = &t->parms;
|
||||
|
||||
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
|
||||
nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
|
||||
nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
|
||||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
|
||||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
|
||||
nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
|
||||
nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
|
||||
nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
|
||||
nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
|
||||
nla_put_u8(skb, IFLA_GRE_PMTUDISC,
|
||||
!!(p->iph.frag_off & htons(IP_DF))))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
|
||||
t->encap.type) ||
|
||||
nla_put_u16(skb, IFLA_GRE_ENCAP_SPORT,
|
||||
t->encap.sport) ||
|
||||
nla_put_u16(skb, IFLA_GRE_ENCAP_DPORT,
|
||||
t->encap.dport) ||
|
||||
nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
|
||||
t->encap.dport))
|
||||
goto nla_put_failure;
|
||||
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
/* Netlink attribute policy shared by the "gre" and "gretap" link ops. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
};
|
||||
|
||||
/* rtnetlink registration for plain layer-3 "gre" links. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/* rtnetlink registration for Ethernet-over-GRE "gretap" links; shares
 * policy and dump helpers with "gre", differs in setup/validate.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
|
||||
|
||||
/* Per-netns init/exit for the "gretap" tunnel registry (no fallback dev). */
static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
|
||||
|
||||
/* Module init: register per-netns state for both device kinds, hook into
 * the GRE protocol demux, then register the rtnetlink link ops.  On any
 * failure, unwind everything registered so far in reverse order (goto
 * ladder).  Fixed misspelled label "pnet_tap_faied" -> "pnet_tap_failed".
 */
static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = gre_cisco_register(&ipgre_protocol);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	return 0;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_cisco_unregister(&ipgre_protocol);
add_proto_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}
|
||||
|
||||
/* Module exit: tear down in exact reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	gre_cisco_unregister(&ipgre_protocol);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
}
|
||||
|
||||
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Allow auto-loading via "ip link add ... type gre|gretap" and via the
 * legacy fallback device names.
 */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
|
||||
464
net/ipv4/ip_input.c
Normal file
464
net/ipv4/ip_input.c
Normal file
|
|
@ -0,0 +1,464 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* The Internet Protocol (IP) module.
|
||||
*
|
||||
* Authors: Ross Biro
|
||||
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
||||
* Donald Becker, <becker@super.org>
|
||||
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
|
||||
* Richard Underwood
|
||||
* Stefan Becker, <stefanb@yello.ping.de>
|
||||
* Jorge Cwik, <jorge@laser.satlink.net>
|
||||
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
|
||||
*
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : Commented a couple of minor bits of surplus code
|
||||
* Alan Cox : Undefining IP_FORWARD doesn't include the code
|
||||
* (just stops a compiler warning).
|
||||
* Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes
|
||||
* are junked rather than corrupting things.
|
||||
* Alan Cox : Frames to bad broadcast subnets are dumped
|
||||
* We used to process them non broadcast and
|
||||
* boy could that cause havoc.
|
||||
* Alan Cox : ip_forward sets the free flag on the
|
||||
* new frame it queues. Still crap because
|
||||
* it copies the frame but at least it
|
||||
* doesn't eat memory too.
|
||||
* Alan Cox : Generic queue code and memory fixes.
|
||||
* Fred Van Kempen : IP fragment support (borrowed from NET2E)
|
||||
* Gerhard Koerting: Forward fragmented frames correctly.
|
||||
* Gerhard Koerting: Fixes to my fix of the above 8-).
|
||||
* Gerhard Koerting: IP interface addressing fix.
|
||||
* Linus Torvalds : More robustness checks
|
||||
* Alan Cox : Even more checks: Still not as robust as it ought to be
|
||||
* Alan Cox : Save IP header pointer for later
|
||||
* Alan Cox : ip option setting
|
||||
* Alan Cox : Use ip_tos/ip_ttl settings
|
||||
* Alan Cox : Fragmentation bogosity removed
|
||||
* (Thanks to Mark.Bush@prg.ox.ac.uk)
|
||||
* Dmitry Gorodchanin : Send of a raw packet crash fix.
|
||||
* Alan Cox : Silly ip bug when an overlength
|
||||
* fragment turns up. Now frees the
|
||||
* queue.
|
||||
* Linus Torvalds/ : Memory leakage on fragmentation
|
||||
* Alan Cox : handling.
|
||||
* Gerhard Koerting: Forwarding uses IP priority hints
|
||||
* Teemu Rantanen : Fragment problems.
|
||||
* Alan Cox : General cleanup, comments and reformat
|
||||
* Alan Cox : SNMP statistics
|
||||
* Alan Cox : BSD address rule semantics. Also see
|
||||
* UDP as there is a nasty checksum issue
|
||||
* if you do things the wrong way.
|
||||
* Alan Cox : Always defrag, moved IP_FORWARD to the config.in file
|
||||
* Alan Cox : IP options adjust sk->priority.
|
||||
* Pedro Roque : Fix mtu/length error in ip_forward.
|
||||
* Alan Cox : Avoid ip_chk_addr when possible.
|
||||
* Richard Underwood : IP multicasting.
|
||||
* Alan Cox : Cleaned up multicast handlers.
|
||||
* Alan Cox : RAW sockets demultiplex in the BSD style.
|
||||
* Gunther Mayer : Fix the SNMP reporting typo
|
||||
* Alan Cox : Always in group 224.0.0.1
|
||||
* Pauline Middelink : Fast ip_checksum update when forwarding
|
||||
* Masquerading support.
|
||||
* Alan Cox : Multicast loopback error for 224.0.0.1
|
||||
* Alan Cox : IP_MULTICAST_LOOP option.
|
||||
* Alan Cox : Use notifiers.
|
||||
* Bjorn Ekwall : Removed ip_csum (from slhc.c too)
|
||||
* Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!)
|
||||
* Stefan Becker : Send out ICMP HOST REDIRECT
|
||||
* Arnt Gulbrandsen : ip_build_xmit
|
||||
* Alan Cox : Per socket routing cache
|
||||
* Alan Cox : Fixed routing cache, added header cache.
|
||||
* Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it.
|
||||
* Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net.
|
||||
* Alan Cox : Incoming IP option handling.
|
||||
* Alan Cox : Set saddr on raw output frames as per BSD.
|
||||
* Alan Cox : Stopped broadcast source route explosions.
|
||||
* Alan Cox : Can disable source routing
|
||||
* Takeshi Sone : Masquerading didn't work.
|
||||
* Dave Bonn,Alan Cox : Faster IP forwarding whenever possible.
|
||||
* Alan Cox : Memory leaks, tramples, misc debugging.
|
||||
* Alan Cox : Fixed multicast (by popular demand 8))
|
||||
* Alan Cox : Fixed forwarding (by even more popular demand 8))
|
||||
* Alan Cox : Fixed SNMP statistics [I think]
|
||||
* Gerhard Koerting : IP fragmentation forwarding fix
|
||||
* Alan Cox : Device lock against page fault.
|
||||
* Alan Cox : IP_HDRINCL facility.
|
||||
* Werner Almesberger : Zero fragment bug
|
||||
* Alan Cox : RAW IP frame length bug
|
||||
* Alan Cox : Outgoing firewall on build_xmit
|
||||
* A.N.Kuznetsov : IP_OPTIONS support throughout the kernel
|
||||
* Alan Cox : Multicast routing hooks
|
||||
* Jos Vos : Do accounting *before* call_in_firewall
|
||||
* Willy Konynenberg : Transparent proxying support
|
||||
*
|
||||
*
|
||||
*
|
||||
* To Fix:
|
||||
* IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
|
||||
* and could be made very efficient with the addition of some virtual memory hacks to permit
|
||||
* the allocation of a buffer that can then be 'grown' by twiddling page tables.
|
||||
* Output fragmentation wants updating along with the buffer management to use a single
|
||||
* interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
|
||||
* output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
|
||||
* fragmentation anyway.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "IPv4: " fmt
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include <linux/net.h>
|
||||
#include <linux/socket.h>
|
||||
#include <linux/sockios.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/inet.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/etherdevice.h>
|
||||
|
||||
#include <net/snmp.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/route.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/arp.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/raw.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <linux/mroute.h>
|
||||
#include <linux/netlink.h>
|
||||
|
||||
/*
|
||||
* Process Router Attention IP option (RFC 2113)
|
||||
*/
|
||||
/* Deliver @skb to every raw socket registered on the Router Alert chain
 * (RFC 2113) that matches its protocol, bound device, and netns.  Each
 * matching socket but the last gets a clone; the last consumes @skb
 * itself.  Returns true if the packet was delivered (caller must not
 * process it further), false if no socket matched.
 * Caller context: runs under rcu_read_lock (ip_ra_chain is RCU-protected).
 */
bool ip_call_ra_chain(struct sk_buff *skb)
{
	struct ip_ra_chain *ra;
	u8 protocol = ip_hdr(skb)->protocol;
	struct sock *last = NULL;
	struct net_device *dev = skb->dev;

	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
		struct sock *sk = ra->sk;

		/* If socket is bound to an interface, only report
		 * the packet if it came from that interface.
		 */
		if (sk && inet_sk(sk)->inet_num == protocol &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == dev->ifindex) &&
		    net_eq(sock_net(sk), dev_net(dev))) {
			if (ip_is_fragment(ip_hdr(skb))) {
				/* Reassemble before delivery; a queued
				 * fragment counts as consumed.
				 */
				if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
					return true;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					raw_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		raw_rcv(last, skb);
		return true;
	}
	return false;
}
|
||||
|
||||
/* Final stage of local delivery: strip the IP header and hand the payload
 * to the matching L4 protocol handler (after raw-socket delivery and XFRM
 * policy checks).  A handler returning a negative value requests
 * resubmission under protocol -ret (used by e.g. decapsulating protocols).
 * Unclaimed protocols trigger ICMP protocol-unreachable.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	__skb_pull(skb, skb_network_header_len(skb));

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		const struct net_protocol *ipprot;
		int raw;

	resubmit:
		/* Raw sockets get a copy first; 'raw' records whether any did. */
		raw = raw_local_deliver(skb, protocol);

		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot != NULL) {
			int ret;

			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			ret = ipprot->handler(skb);
			if (ret < 0) {
				/* Negative return: re-dispatch as protocol -ret. */
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
		} else {
			if (!raw) {
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
				kfree_skb(skb);
			} else {
				IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
				consume_skb(skb);
			}
		}
	}
 out:
	rcu_read_unlock();

	return 0;
}
|
||||
|
||||
/*
|
||||
* Deliver IP Packets to the higher protocol layers.
|
||||
*/
|
||||
/* Deliver a packet addressed to this host: reassemble fragments first,
 * then pass it through the NF_INET_LOCAL_IN netfilter hook on its way to
 * ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 *	Reassemble IP fragments.
	 */

	if (ip_is_fragment(ip_hdr(skb))) {
		/* Non-zero: fragment queued, full datagram not ready yet. */
		if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
			return 0;
	}

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
|
||||
|
||||
/* Parse and validate IP options on an inbound packet.  Returns true if the
 * packet must be dropped (bad options, disallowed source route), false to
 * continue processing.  The skb is made writable first because option
 * processing may mangle the header.
 */
static inline bool ip_rcv_options(struct sk_buff *skb)
{
	struct ip_options *opt;
	const struct iphdr *iph;
	struct net_device *dev = skb->dev;

	/* It looks as overkill, because not all
	   IP options require packet mangling.
	   But it is the easiest for now, especially taking
	   into account that combination of IP options
	   and running sniffer is extremely rare condition.
					      --ANK (980813)
	*/
	if (skb_cow(skb, skb_headroom(skb))) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	iph = ip_hdr(skb);
	opt = &(IPCB(skb)->opt);
	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);

	if (ip_options_compile(dev_net(dev), opt, skb)) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
		goto drop;
	}

	if (unlikely(opt->srr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			/* Source routing may be administratively disabled. */
			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
				if (IN_DEV_LOG_MARTIANS(in_dev))
					net_info_ratelimited("source route option %pI4 -> %pI4\n",
							     &iph->saddr,
							     &iph->daddr);
				goto drop;
			}
		}

		if (ip_options_rcv_srr(skb))
			goto drop;
	}

	return false;
drop:
	return true;
}
|
||||
|
||||
/* net.ipv4.ip_early_demux sysctl knob: enables early socket demux in
 * ip_rcv_finish() (default on).
 */
int sysctl_ip_early_demux __read_mostly = 1;
EXPORT_SYMBOL(sysctl_ip_early_demux);
|
||||
|
||||
/* Post-PREROUTING stage of receive: optionally early-demux the owning
 * socket, attach a route (dst) to the skb, account route-classid and
 * multicast/broadcast stats, process IP options, and finally dispatch via
 * dst_input() (local delivery or forwarding).
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
		const struct net_protocol *ipprot;
		int protocol = iph->protocol;

		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot && ipprot->early_demux) {
			ipprot->early_demux(skb);
			/* must reload iph, skb->head might have changed */
			iph = ip_hdr(skb);
		}
	}

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */
	if (!skb_dst(skb)) {
		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					       iph->tos, skb->dev);
		if (unlikely(err)) {
			/* -EXDEV: packet failed the reverse-path filter. */
			if (err == -EXDEV)
				NET_INC_STATS_BH(dev_net(skb->dev),
						 LINUX_MIB_IPRPFILTER);
			goto drop;
		}
	}

#ifdef CONFIG_IP_ROUTE_CLASSID
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif

	/* ihl > 5 means IP options are present. */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	rt = skb_rtable(skb);
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
				   skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
				   skb->len);

	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
|
||||
|
||||
/*
|
||||
* Main IP Receive routine.
|
||||
*/
|
||||
/*
 * 	Main IP Receive routine.  Entry point from the device layer for
 *	IPv4 packets: validates the header (version, length, checksum),
 *	trims link-layer padding, updates SNMP/ECN counters, then passes
 *	the packet through NF_INET_PRE_ROUTING towards ip_rcv_finish().
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	const struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;


	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

	/* Unshare before mutating; a clone is delivered to sniffers etc. */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto out;
	}

	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */

	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;

	/* ECN counters are indexed as NOECTPKTS + the 2 ECN bits of TOS. */
	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
	IP_ADD_STATS_BH(dev_net(dev),
			IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
			max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));

	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	/* pskb_may_pull may have reallocated; reload the header pointer. */
	iph = ip_hdr(skb);

	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto csum_error;

	len = ntohs(iph->tot_len);
	if (skb->len < len) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb->transport_header = skb->network_header + iph->ihl*4;

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);

	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

csum_error:
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS);
	/* fall through: a checksum error is also a header error */
inhdr_error:
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}
|
||||
663
net/ipv4/ip_options.c
Normal file
663
net/ipv4/ip_options.c
Normal file
|
|
@ -0,0 +1,663 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* The options processing module for ip.c
|
||||
*
|
||||
* Authors: A.N.Kuznetsov
|
||||
*
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "IPv4: " fmt
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/route.h>
|
||||
#include <net/cipso_ipv4.h>
|
||||
#include <net/ip_fib.h>
|
||||
|
||||
/*
|
||||
* Write options to IP header, record destination address to
|
||||
* source route option, address of outgoing interface
|
||||
* (we should already know it, so that this function is allowed be
|
||||
* called only after routing decision) and timestamp,
|
||||
* if we originate this datagram.
|
||||
*
|
||||
* daddr is real destination address, next hop is recorded in IP header.
|
||||
* saddr is address of outgoing interface.
|
||||
*/
|
||||
|
||||
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
|
||||
__be32 daddr, struct rtable *rt, int is_frag)
|
||||
{
|
||||
unsigned char *iph = skb_network_header(skb);
|
||||
|
||||
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
|
||||
memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
|
||||
opt = &(IPCB(skb)->opt);
|
||||
|
||||
if (opt->srr)
|
||||
memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
|
||||
|
||||
if (!is_frag) {
|
||||
if (opt->rr_needaddr)
|
||||
ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
|
||||
if (opt->ts_needaddr)
|
||||
ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
|
||||
if (opt->ts_needtime) {
|
||||
struct timespec tv;
|
||||
__be32 midtime;
|
||||
getnstimeofday(&tv);
|
||||
midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
|
||||
memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (opt->rr) {
|
||||
memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
|
||||
opt->rr = 0;
|
||||
opt->rr_needaddr = 0;
|
||||
}
|
||||
if (opt->ts) {
|
||||
memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
|
||||
opt->ts = 0;
|
||||
opt->ts_needaddr = opt->ts_needtime = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Provided (sopt, skb) points to received options,
|
||||
* build in dopt compiled option set appropriate for answering.
|
||||
* i.e. invert SRR option, copy anothers,
|
||||
* and grab room in RR/TS options.
|
||||
*
|
||||
* NOTE: dopt cannot point to skb.
|
||||
*/
|
||||
|
||||
int __ip_options_echo(struct ip_options *dopt, struct sk_buff *skb,
|
||||
const struct ip_options *sopt)
|
||||
{
|
||||
unsigned char *sptr, *dptr;
|
||||
int soffset, doffset;
|
||||
int optlen;
|
||||
|
||||
memset(dopt, 0, sizeof(struct ip_options));
|
||||
|
||||
if (sopt->optlen == 0)
|
||||
return 0;
|
||||
|
||||
sptr = skb_network_header(skb);
|
||||
dptr = dopt->__data;
|
||||
|
||||
if (sopt->rr) {
|
||||
optlen = sptr[sopt->rr+1];
|
||||
soffset = sptr[sopt->rr+2];
|
||||
dopt->rr = dopt->optlen + sizeof(struct iphdr);
|
||||
memcpy(dptr, sptr+sopt->rr, optlen);
|
||||
if (sopt->rr_needaddr && soffset <= optlen) {
|
||||
if (soffset + 3 > optlen)
|
||||
return -EINVAL;
|
||||
dptr[2] = soffset + 4;
|
||||
dopt->rr_needaddr = 1;
|
||||
}
|
||||
dptr += optlen;
|
||||
dopt->optlen += optlen;
|
||||
}
|
||||
if (sopt->ts) {
|
||||
optlen = sptr[sopt->ts+1];
|
||||
soffset = sptr[sopt->ts+2];
|
||||
dopt->ts = dopt->optlen + sizeof(struct iphdr);
|
||||
memcpy(dptr, sptr+sopt->ts, optlen);
|
||||
if (soffset <= optlen) {
|
||||
if (sopt->ts_needaddr) {
|
||||
if (soffset + 3 > optlen)
|
||||
return -EINVAL;
|
||||
dopt->ts_needaddr = 1;
|
||||
soffset += 4;
|
||||
}
|
||||
if (sopt->ts_needtime) {
|
||||
if (soffset + 3 > optlen)
|
||||
return -EINVAL;
|
||||
if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
|
||||
dopt->ts_needtime = 1;
|
||||
soffset += 4;
|
||||
} else {
|
||||
dopt->ts_needtime = 0;
|
||||
|
||||
if (soffset + 7 <= optlen) {
|
||||
__be32 addr;
|
||||
|
||||
memcpy(&addr, dptr+soffset-1, 4);
|
||||
if (inet_addr_type(dev_net(skb_dst(skb)->dev), addr) != RTN_UNICAST) {
|
||||
dopt->ts_needtime = 1;
|
||||
soffset += 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dptr[2] = soffset;
|
||||
}
|
||||
dptr += optlen;
|
||||
dopt->optlen += optlen;
|
||||
}
|
||||
if (sopt->srr) {
|
||||
unsigned char *start = sptr+sopt->srr;
|
||||
__be32 faddr;
|
||||
|
||||
optlen = start[1];
|
||||
soffset = start[2];
|
||||
doffset = 0;
|
||||
if (soffset > optlen)
|
||||
soffset = optlen + 1;
|
||||
soffset -= 4;
|
||||
if (soffset > 3) {
|
||||
memcpy(&faddr, &start[soffset-1], 4);
|
||||
for (soffset -= 4, doffset = 4; soffset > 3; soffset -= 4, doffset += 4)
|
||||
memcpy(&dptr[doffset-1], &start[soffset-1], 4);
|
||||
/*
|
||||
* RFC1812 requires to fix illegal source routes.
|
||||
*/
|
||||
if (memcmp(&ip_hdr(skb)->saddr,
|
||||
&start[soffset + 3], 4) == 0)
|
||||
doffset -= 4;
|
||||
}
|
||||
if (doffset > 3) {
|
||||
__be32 daddr = fib_compute_spec_dst(skb);
|
||||
|
||||
memcpy(&start[doffset-1], &daddr, 4);
|
||||
dopt->faddr = faddr;
|
||||
dptr[0] = start[0];
|
||||
dptr[1] = doffset+3;
|
||||
dptr[2] = 4;
|
||||
dptr += doffset+3;
|
||||
dopt->srr = dopt->optlen + sizeof(struct iphdr);
|
||||
dopt->optlen += doffset+3;
|
||||
dopt->is_strictroute = sopt->is_strictroute;
|
||||
}
|
||||
}
|
||||
if (sopt->cipso) {
|
||||
optlen = sptr[sopt->cipso+1];
|
||||
dopt->cipso = dopt->optlen+sizeof(struct iphdr);
|
||||
memcpy(dptr, sptr+sopt->cipso, optlen);
|
||||
dptr += optlen;
|
||||
dopt->optlen += optlen;
|
||||
}
|
||||
while (dopt->optlen & 3) {
|
||||
*dptr++ = IPOPT_END;
|
||||
dopt->optlen++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Options "fragmenting", just fill options not
|
||||
* allowed in fragments with NOOPs.
|
||||
* Simple and stupid 8), but the most efficient way.
|
||||
*/
|
||||
|
||||
void ip_options_fragment(struct sk_buff *skb)
|
||||
{
|
||||
unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr);
|
||||
struct ip_options *opt = &(IPCB(skb)->opt);
|
||||
int l = opt->optlen;
|
||||
int optlen;
|
||||
|
||||
while (l > 0) {
|
||||
switch (*optptr) {
|
||||
case IPOPT_END:
|
||||
return;
|
||||
case IPOPT_NOOP:
|
||||
l--;
|
||||
optptr++;
|
||||
continue;
|
||||
}
|
||||
optlen = optptr[1];
|
||||
if (optlen < 2 || optlen > l)
|
||||
return;
|
||||
if (!IPOPT_COPIED(*optptr))
|
||||
memset(optptr, IPOPT_NOOP, optlen);
|
||||
l -= optlen;
|
||||
optptr += optlen;
|
||||
}
|
||||
opt->ts = 0;
|
||||
opt->rr = 0;
|
||||
opt->rr_needaddr = 0;
|
||||
opt->ts_needaddr = 0;
|
||||
opt->ts_needtime = 0;
|
||||
}
|
||||
|
||||
/* helper used by ip_options_compile() to call fib_compute_spec_dst()
|
||||
* at most one time.
|
||||
*/
|
||||
static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
|
||||
{
|
||||
if (*spec_dst == htonl(INADDR_ANY))
|
||||
*spec_dst = fib_compute_spec_dst(skb);
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify options and fill pointers in struct options.
|
||||
* Caller should clear *opt, and set opt->data.
|
||||
* If opt == NULL, then skb->data should point to IP header.
|
||||
*/
|
||||
|
||||
int ip_options_compile(struct net *net,
|
||||
struct ip_options *opt, struct sk_buff *skb)
|
||||
{
|
||||
__be32 spec_dst = htonl(INADDR_ANY);
|
||||
unsigned char *pp_ptr = NULL;
|
||||
struct rtable *rt = NULL;
|
||||
unsigned char *optptr;
|
||||
unsigned char *iph;
|
||||
int optlen, l;
|
||||
|
||||
if (skb != NULL) {
|
||||
rt = skb_rtable(skb);
|
||||
optptr = (unsigned char *)&(ip_hdr(skb)[1]);
|
||||
} else
|
||||
optptr = opt->__data;
|
||||
iph = optptr - sizeof(struct iphdr);
|
||||
|
||||
for (l = opt->optlen; l > 0; ) {
|
||||
switch (*optptr) {
|
||||
case IPOPT_END:
|
||||
for (optptr++, l--; l > 0; optptr++, l--) {
|
||||
if (*optptr != IPOPT_END) {
|
||||
*optptr = IPOPT_END;
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
}
|
||||
goto eol;
|
||||
case IPOPT_NOOP:
|
||||
l--;
|
||||
optptr++;
|
||||
continue;
|
||||
}
|
||||
if (unlikely(l < 2)) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
optlen = optptr[1];
|
||||
if (optlen < 2 || optlen > l) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
switch (*optptr) {
|
||||
case IPOPT_SSRR:
|
||||
case IPOPT_LSRR:
|
||||
if (optlen < 3) {
|
||||
pp_ptr = optptr + 1;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] < 4) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
/* NB: cf RFC-1812 5.2.4.1 */
|
||||
if (opt->srr) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
if (!skb) {
|
||||
if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
|
||||
pp_ptr = optptr + 1;
|
||||
goto error;
|
||||
}
|
||||
memcpy(&opt->faddr, &optptr[3], 4);
|
||||
if (optlen > 7)
|
||||
memmove(&optptr[3], &optptr[7], optlen-7);
|
||||
}
|
||||
opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
|
||||
opt->srr = optptr - iph;
|
||||
break;
|
||||
case IPOPT_RR:
|
||||
if (opt->rr) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
if (optlen < 3) {
|
||||
pp_ptr = optptr + 1;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] < 4) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] <= optlen) {
|
||||
if (optptr[2]+3 > optlen) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
if (rt) {
|
||||
spec_dst_fill(&spec_dst, skb);
|
||||
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
optptr[2] += 4;
|
||||
opt->rr_needaddr = 1;
|
||||
}
|
||||
opt->rr = optptr - iph;
|
||||
break;
|
||||
case IPOPT_TIMESTAMP:
|
||||
if (opt->ts) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
if (optlen < 4) {
|
||||
pp_ptr = optptr + 1;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] < 5) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] <= optlen) {
|
||||
unsigned char *timeptr = NULL;
|
||||
if (optptr[2]+3 > optlen) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
switch (optptr[3]&0xF) {
|
||||
case IPOPT_TS_TSONLY:
|
||||
if (skb)
|
||||
timeptr = &optptr[optptr[2]-1];
|
||||
opt->ts_needtime = 1;
|
||||
optptr[2] += 4;
|
||||
break;
|
||||
case IPOPT_TS_TSANDADDR:
|
||||
if (optptr[2]+7 > optlen) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
if (rt) {
|
||||
spec_dst_fill(&spec_dst, skb);
|
||||
memcpy(&optptr[optptr[2]-1], &spec_dst, 4);
|
||||
timeptr = &optptr[optptr[2]+3];
|
||||
}
|
||||
opt->ts_needaddr = 1;
|
||||
opt->ts_needtime = 1;
|
||||
optptr[2] += 8;
|
||||
break;
|
||||
case IPOPT_TS_PRESPEC:
|
||||
if (optptr[2]+7 > optlen) {
|
||||
pp_ptr = optptr + 2;
|
||||
goto error;
|
||||
}
|
||||
{
|
||||
__be32 addr;
|
||||
memcpy(&addr, &optptr[optptr[2]-1], 4);
|
||||
if (inet_addr_type(net, addr) == RTN_UNICAST)
|
||||
break;
|
||||
if (skb)
|
||||
timeptr = &optptr[optptr[2]+3];
|
||||
}
|
||||
opt->ts_needtime = 1;
|
||||
optptr[2] += 8;
|
||||
break;
|
||||
default:
|
||||
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
|
||||
pp_ptr = optptr + 3;
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (timeptr) {
|
||||
struct timespec tv;
|
||||
u32 midtime;
|
||||
getnstimeofday(&tv);
|
||||
midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
|
||||
put_unaligned_be32(midtime, timeptr);
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
|
||||
unsigned int overflow = optptr[3]>>4;
|
||||
if (overflow == 15) {
|
||||
pp_ptr = optptr + 3;
|
||||
goto error;
|
||||
}
|
||||
if (skb) {
|
||||
optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
}
|
||||
opt->ts = optptr - iph;
|
||||
break;
|
||||
case IPOPT_RA:
|
||||
if (optlen < 4) {
|
||||
pp_ptr = optptr + 1;
|
||||
goto error;
|
||||
}
|
||||
if (optptr[2] == 0 && optptr[3] == 0)
|
||||
opt->router_alert = optptr - iph;
|
||||
break;
|
||||
case IPOPT_CIPSO:
|
||||
if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
opt->cipso = optptr - iph;
|
||||
if (cipso_v4_validate(skb, &optptr)) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
case IPOPT_SEC:
|
||||
case IPOPT_SID:
|
||||
default:
|
||||
if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
|
||||
pp_ptr = optptr;
|
||||
goto error;
|
||||
}
|
||||
break;
|
||||
}
|
||||
l -= optlen;
|
||||
optptr += optlen;
|
||||
}
|
||||
|
||||
eol:
|
||||
if (!pp_ptr)
|
||||
return 0;
|
||||
|
||||
error:
|
||||
if (skb) {
|
||||
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
EXPORT_SYMBOL(ip_options_compile);
|
||||
|
||||
/*
|
||||
* Undo all the changes done by ip_options_compile().
|
||||
*/
|
||||
|
||||
void ip_options_undo(struct ip_options *opt)
|
||||
{
|
||||
if (opt->srr) {
|
||||
unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
|
||||
memmove(optptr+7, optptr+3, optptr[1]-7);
|
||||
memcpy(optptr+3, &opt->faddr, 4);
|
||||
}
|
||||
if (opt->rr_needaddr) {
|
||||
unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
|
||||
optptr[2] -= 4;
|
||||
memset(&optptr[optptr[2]-1], 0, 4);
|
||||
}
|
||||
if (opt->ts) {
|
||||
unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
|
||||
if (opt->ts_needtime) {
|
||||
optptr[2] -= 4;
|
||||
memset(&optptr[optptr[2]-1], 0, 4);
|
||||
if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
|
||||
optptr[2] -= 4;
|
||||
}
|
||||
if (opt->ts_needaddr) {
|
||||
optptr[2] -= 4;
|
||||
memset(&optptr[optptr[2]-1], 0, 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
|
||||
{
|
||||
return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
|
||||
GFP_KERNEL);
|
||||
}
|
||||
|
||||
static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
|
||||
struct ip_options_rcu *opt, int optlen)
|
||||
{
|
||||
while (optlen & 3)
|
||||
opt->opt.__data[optlen++] = IPOPT_END;
|
||||
opt->opt.optlen = optlen;
|
||||
if (optlen && ip_options_compile(net, &opt->opt, NULL)) {
|
||||
kfree(opt);
|
||||
return -EINVAL;
|
||||
}
|
||||
kfree(*optp);
|
||||
*optp = opt;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
|
||||
unsigned char __user *data, int optlen)
|
||||
{
|
||||
struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
|
||||
|
||||
if (!opt)
|
||||
return -ENOMEM;
|
||||
if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
|
||||
kfree(opt);
|
||||
return -EFAULT;
|
||||
}
|
||||
return ip_options_get_finish(net, optp, opt, optlen);
|
||||
}
|
||||
|
||||
int ip_options_get(struct net *net, struct ip_options_rcu **optp,
|
||||
unsigned char *data, int optlen)
|
||||
{
|
||||
struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
|
||||
|
||||
if (!opt)
|
||||
return -ENOMEM;
|
||||
if (optlen)
|
||||
memcpy(opt->opt.__data, data, optlen);
|
||||
return ip_options_get_finish(net, optp, opt, optlen);
|
||||
}
|
||||
|
||||
void ip_forward_options(struct sk_buff *skb)
|
||||
{
|
||||
struct ip_options *opt = &(IPCB(skb)->opt);
|
||||
unsigned char *optptr;
|
||||
struct rtable *rt = skb_rtable(skb);
|
||||
unsigned char *raw = skb_network_header(skb);
|
||||
|
||||
if (opt->rr_needaddr) {
|
||||
optptr = (unsigned char *)raw + opt->rr;
|
||||
ip_rt_get_source(&optptr[optptr[2]-5], skb, rt);
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
if (opt->srr_is_hit) {
|
||||
int srrptr, srrspace;
|
||||
|
||||
optptr = raw + opt->srr;
|
||||
|
||||
for ( srrptr = optptr[2], srrspace = optptr[1];
|
||||
srrptr <= srrspace;
|
||||
srrptr += 4
|
||||
) {
|
||||
if (srrptr + 3 > srrspace)
|
||||
break;
|
||||
if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0)
|
||||
break;
|
||||
}
|
||||
if (srrptr + 3 <= srrspace) {
|
||||
opt->is_changed = 1;
|
||||
ip_hdr(skb)->daddr = opt->nexthop;
|
||||
ip_rt_get_source(&optptr[srrptr-1], skb, rt);
|
||||
optptr[2] = srrptr+4;
|
||||
} else {
|
||||
net_crit_ratelimited("%s(): Argh! Destination lost!\n",
|
||||
__func__);
|
||||
}
|
||||
if (opt->ts_needaddr) {
|
||||
optptr = raw + opt->ts;
|
||||
ip_rt_get_source(&optptr[optptr[2]-9], skb, rt);
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
}
|
||||
if (opt->is_changed) {
|
||||
opt->is_changed = 0;
|
||||
ip_send_check(ip_hdr(skb));
|
||||
}
|
||||
}
|
||||
|
||||
int ip_options_rcv_srr(struct sk_buff *skb)
|
||||
{
|
||||
struct ip_options *opt = &(IPCB(skb)->opt);
|
||||
int srrspace, srrptr;
|
||||
__be32 nexthop;
|
||||
struct iphdr *iph = ip_hdr(skb);
|
||||
unsigned char *optptr = skb_network_header(skb) + opt->srr;
|
||||
struct rtable *rt = skb_rtable(skb);
|
||||
struct rtable *rt2;
|
||||
unsigned long orefdst;
|
||||
int err;
|
||||
|
||||
if (!rt)
|
||||
return 0;
|
||||
|
||||
if (skb->pkt_type != PACKET_HOST)
|
||||
return -EINVAL;
|
||||
if (rt->rt_type == RTN_UNICAST) {
|
||||
if (!opt->is_strictroute)
|
||||
return 0;
|
||||
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
|
||||
return -EINVAL;
|
||||
}
|
||||
if (rt->rt_type != RTN_LOCAL)
|
||||
return -EINVAL;
|
||||
|
||||
for (srrptr = optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
|
||||
if (srrptr + 3 > srrspace) {
|
||||
icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
|
||||
return -EINVAL;
|
||||
}
|
||||
memcpy(&nexthop, &optptr[srrptr-1], 4);
|
||||
|
||||
orefdst = skb->_skb_refdst;
|
||||
skb_dst_set(skb, NULL);
|
||||
err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
|
||||
rt2 = skb_rtable(skb);
|
||||
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
|
||||
skb_dst_drop(skb);
|
||||
skb->_skb_refdst = orefdst;
|
||||
return -EINVAL;
|
||||
}
|
||||
refdst_drop(orefdst);
|
||||
if (rt2->rt_type != RTN_LOCAL)
|
||||
break;
|
||||
/* Superfast 8) loopback forward */
|
||||
iph->daddr = nexthop;
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
if (srrptr <= srrspace) {
|
||||
opt->srr_is_hit = 1;
|
||||
opt->nexthop = nexthop;
|
||||
opt->is_changed = 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(ip_options_rcv_srr);
|
||||
1589
net/ipv4/ip_output.c
Normal file
1589
net/ipv4/ip_output.c
Normal file
File diff suppressed because it is too large
Load diff
1423
net/ipv4/ip_sockglue.c
Normal file
1423
net/ipv4/ip_sockglue.c
Normal file
File diff suppressed because it is too large
Load diff
1176
net/ipv4/ip_tunnel.c
Normal file
1176
net/ipv4/ip_tunnel.c
Normal file
File diff suppressed because it is too large
Load diff
205
net/ipv4/ip_tunnel_core.c
Normal file
205
net/ipv4/ip_tunnel_core.c
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
/*
|
||||
* Copyright (c) 2013 Nicira, Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of version 2 of the GNU General Public
|
||||
* License as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful, but
|
||||
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
* 02110-1301, USA
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/mroute.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/in6.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/etherdevice.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/if_vlan.h>
|
||||
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/ip_tunnels.h>
|
||||
#include <net/arp.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/dsfield.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
#include <net/rtnetlink.h>
|
||||
|
||||
/*
 * Encapsulate @skb in a fresh outer IPv4 header built from the supplied
 * src/dst/proto/tos/ttl/df and transmit it over route @rt.  Returns the
 * byte count handed to the stack, or 0 if the transmit was dropped.
 */
int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
		  __be32 src, __be32 dst, __u8 proto,
		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
	int pkt_len = skb->len;
	struct iphdr *iph;
	int err;

	skb_scrub_packet(skb, xnet);

	skb_clear_hash(skb);
	skb_dst_set(skb, &rt->dst);
	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

	/* Push down and install the IP header. */
	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);

	iph = ip_hdr(skb);

	iph->version	= 4;
	iph->ihl	= sizeof(struct iphdr) >> 2;
	iph->frag_off	= df;
	iph->protocol	= proto;
	iph->tos	= tos;
	iph->daddr	= dst;
	iph->saddr	= src;
	iph->ttl	= ttl;
	__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);

	err = ip_local_out_sk(sk, skb);
	if (unlikely(net_xmit_eval(err)))
		pkt_len = 0;
	return pkt_len;
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
|
||||
|
||||
/*
 * Strip hdr_len bytes of tunnel header from @skb, set skb->protocol from
 * @inner_proto (peeking into the Ethernet header for ETH_P_TEB), and
 * reset per-packet state so the skb can re-enter the stack as a fresh
 * receive.  Returns 0 on success or -ENOMEM if the header is not linear.
 */
int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
{
	if (unlikely(!pskb_may_pull(skb, hdr_len)))
		return -ENOMEM;

	skb_pull_rcsum(skb, hdr_len);

	if (inner_proto == htons(ETH_P_TEB)) {
		struct ethhdr *eh;

		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
			return -ENOMEM;

		eh = (struct ethhdr *)skb->data;
		if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
			skb->protocol = eh->h_proto;
		else
			skb->protocol = htons(ETH_P_802_2);

	} else {
		skb->protocol = inner_proto;
	}

	/* Forget everything the outer packet carried. */
	nf_reset(skb);
	secpath_reset(skb);
	skb_clear_hash_if_not_l4(skb);
	skb_dst_drop(skb);
	skb->vlan_tci = 0;
	skb_set_queue_mapping(skb, 0);
	skb->pkt_type = PACKET_HOST;
	return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
|
||||
|
||||
struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
|
||||
bool csum_help,
|
||||
int gso_type_mask)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (likely(!skb->encapsulation)) {
|
||||
skb_reset_inner_headers(skb);
|
||||
skb->encapsulation = 1;
|
||||
}
|
||||
|
||||
if (skb_is_gso(skb)) {
|
||||
err = skb_unclone(skb, GFP_ATOMIC);
|
||||
if (unlikely(err))
|
||||
goto error;
|
||||
skb_shinfo(skb)->gso_type |= gso_type_mask;
|
||||
return skb;
|
||||
}
|
||||
|
||||
/* If packet is not gso and we are resolving any partial checksum,
|
||||
* clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
|
||||
* on the outer header without confusing devices that implement
|
||||
* NETIF_F_IP_CSUM with encapsulation.
|
||||
*/
|
||||
if (csum_help)
|
||||
skb->encapsulation = 0;
|
||||
|
||||
if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
|
||||
err = skb_checksum_help(skb);
|
||||
if (unlikely(err))
|
||||
goto error;
|
||||
} else if (skb->ip_summed != CHECKSUM_PARTIAL)
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return skb;
|
||||
error:
|
||||
kfree_skb(skb);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
|
||||
|
||||
/* Often modified stats are per cpu, other are shared (netdev->stats) */
|
||||
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
|
||||
struct rtnl_link_stats64 *tot)
|
||||
{
|
||||
int i;
|
||||
|
||||
for_each_possible_cpu(i) {
|
||||
const struct pcpu_sw_netstats *tstats =
|
||||
per_cpu_ptr(dev->tstats, i);
|
||||
u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
|
||||
unsigned int start;
|
||||
|
||||
do {
|
||||
start = u64_stats_fetch_begin_irq(&tstats->syncp);
|
||||
rx_packets = tstats->rx_packets;
|
||||
tx_packets = tstats->tx_packets;
|
||||
rx_bytes = tstats->rx_bytes;
|
||||
tx_bytes = tstats->tx_bytes;
|
||||
} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
|
||||
|
||||
tot->rx_packets += rx_packets;
|
||||
tot->tx_packets += tx_packets;
|
||||
tot->rx_bytes += rx_bytes;
|
||||
tot->tx_bytes += tx_bytes;
|
||||
}
|
||||
|
||||
tot->multicast = dev->stats.multicast;
|
||||
|
||||
tot->rx_crc_errors = dev->stats.rx_crc_errors;
|
||||
tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
|
||||
tot->rx_length_errors = dev->stats.rx_length_errors;
|
||||
tot->rx_frame_errors = dev->stats.rx_frame_errors;
|
||||
tot->rx_errors = dev->stats.rx_errors;
|
||||
|
||||
tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
|
||||
tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
|
||||
tot->tx_dropped = dev->stats.tx_dropped;
|
||||
tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
|
||||
tot->tx_errors = dev->stats.tx_errors;
|
||||
|
||||
tot->collisions = dev->stats.collisions;
|
||||
|
||||
return tot;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
|
||||
592
net/ipv4/ip_vti.c
Normal file
592
net/ipv4/ip_vti.c
Normal file
|
|
@ -0,0 +1,592 @@
|
|||
/*
|
||||
* Linux NET3: IP/IP protocol decoder modified to support
|
||||
* virtual tunnel interface
|
||||
*
|
||||
* Authors:
|
||||
* Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c
|
||||
|
||||
For comments look at net/ipv4/ip_gre.c --ANK
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/mroute.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/if_ether.h>
|
||||
#include <linux/icmpv6.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/ip_tunnels.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
|
||||
static struct rtnl_link_ops vti_link_ops __read_mostly;
|
||||
|
||||
static int vti_net_id __read_mostly;
|
||||
static int vti_tunnel_init(struct net_device *dev);
|
||||
|
||||
/* Look up the vti tunnel matching the outer addresses and, after a
 * policy check, hand the packet to the xfrm input path.  Returns what
 * xfrm_input() returns, -EINVAL when no tunnel matches, or 0 after
 * dropping a policy-rejected packet.
 */
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
		     int encap_type)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
				  iph->saddr, iph->daddr, 0);
	if (tunnel != NULL) {
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;

		XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
		/* The tunnel's i_key doubles as the fwmark for policy lookup. */
		skb->mark = be32_to_cpu(tunnel->parms.i_key);

		return xfrm_input(skb, nexthdr, spi, encap_type);
	}

	return -EINVAL;
drop:
	kfree_skb(skb);
	return 0;
}
|
||||
|
||||
static int vti_rcv(struct sk_buff *skb)
|
||||
{
|
||||
XFRM_SPI_SKB_CB(skb)->family = AF_INET;
|
||||
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
|
||||
|
||||
return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
|
||||
}
|
||||
|
||||
static int vti_rcv_cb(struct sk_buff *skb, int err)
|
||||
{
|
||||
unsigned short family;
|
||||
struct net_device *dev;
|
||||
struct pcpu_sw_netstats *tstats;
|
||||
struct xfrm_state *x;
|
||||
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
|
||||
|
||||
if (!tunnel)
|
||||
return 1;
|
||||
|
||||
dev = tunnel->dev;
|
||||
|
||||
if (err) {
|
||||
dev->stats.rx_errors++;
|
||||
dev->stats.rx_dropped++;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
x = xfrm_input_state(skb);
|
||||
family = x->inner_mode->afinfo->family;
|
||||
|
||||
if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
|
||||
return -EPERM;
|
||||
|
||||
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
|
||||
skb->dev = dev;
|
||||
|
||||
tstats = this_cpu_ptr(dev->tstats);
|
||||
|
||||
u64_stats_update_begin(&tstats->syncp);
|
||||
tstats->rx_packets++;
|
||||
tstats->rx_bytes += skb->len;
|
||||
u64_stats_update_end(&tstats->syncp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return true when xfrm state @x is a usable IPv4 tunnel-mode transform
 * whose endpoints match @src/@dst (only the source is checked when the
 * tunnel destination is wildcarded).
 */
static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src)
{
	xfrm_address_t *daddr = (xfrm_address_t *)&dst;
	xfrm_address_t *saddr = (xfrm_address_t *)&src;

	/* if there is no transform then this tunnel is not functional.
	 * Or if the xfrm is not mode tunnel.
	 */
	if (!x || x->props.mode != XFRM_MODE_TUNNEL ||
	    x->props.family != AF_INET)
		return false;

	if (!dst)
		return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET);

	if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET))
		return false;

	return true;
}
|
||||
|
||||
static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
|
||||
struct flowi *fl)
|
||||
{
|
||||
struct ip_tunnel *tunnel = netdev_priv(dev);
|
||||
struct ip_tunnel_parm *parms = &tunnel->parms;
|
||||
struct dst_entry *dst = skb_dst(skb);
|
||||
struct net_device *tdev; /* Device to other host */
|
||||
int err;
|
||||
|
||||
if (!dst) {
|
||||
dev->stats.tx_carrier_errors++;
|
||||
goto tx_error_icmp;
|
||||
}
|
||||
|
||||
dst_hold(dst);
|
||||
dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
|
||||
if (IS_ERR(dst)) {
|
||||
dev->stats.tx_carrier_errors++;
|
||||
goto tx_error_icmp;
|
||||
}
|
||||
|
||||
if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
|
||||
dev->stats.tx_carrier_errors++;
|
||||
dst_release(dst);
|
||||
goto tx_error_icmp;
|
||||
}
|
||||
|
||||
tdev = dst->dev;
|
||||
|
||||
if (tdev == dev) {
|
||||
dst_release(dst);
|
||||
dev->stats.collisions++;
|
||||
goto tx_error;
|
||||
}
|
||||
|
||||
if (tunnel->err_count > 0) {
|
||||
if (time_before(jiffies,
|
||||
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
|
||||
tunnel->err_count--;
|
||||
dst_link_failure(skb);
|
||||
} else
|
||||
tunnel->err_count = 0;
|
||||
}
|
||||
|
||||
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
|
||||
skb_dst_set(skb, dst);
|
||||
skb->dev = skb_dst(skb)->dev;
|
||||
|
||||
err = dst_output(skb);
|
||||
if (net_xmit_eval(err) == 0)
|
||||
err = skb->len;
|
||||
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
|
||||
return NETDEV_TX_OK;
|
||||
|
||||
tx_error_icmp:
|
||||
dst_link_failure(skb);
|
||||
tx_error:
|
||||
dev->stats.tx_errors++;
|
||||
kfree_skb(skb);
|
||||
return NETDEV_TX_OK;
|
||||
}
|
||||
|
||||
/* This function assumes it is being called from dev_queue_xmit()
 * and that skb is filled properly by that function.
 */
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));

	/* The output key doubles as the skb mark used by xfrm lookups. */
	skb->mark = be32_to_cpu(tunnel->parms.o_key);

	/* Decode the flow from the inner packet so xfrm can match policy;
	 * clear the IP control block left over from the upper layers. */
	switch (skb->protocol) {
	case htons(ETH_P_IP):
		xfrm_decode_session(skb, &fl, AF_INET);
		memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
		break;
	case htons(ETH_P_IPV6):
		xfrm_decode_session(skb, &fl, AF_INET6);
		memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
		break;
	default:
		/* Unsupported inner protocol: count and drop. */
		dev->stats.tx_errors++;
		dev_kfree_skb(skb);
		return NETDEV_TX_OK;
	}

	return vti_xmit(skb, dev, &fl);
}
|
||||
|
||||
/* ICMP error handler for VTI: extract the SPI from the ESP/AH/IPComp
 * header embedded in the ICMP payload, look up the matching xfrm state
 * and propagate PMTU updates or redirects to it. Returns -1 when no
 * tunnel matches the addresses, 0 otherwise.
 */
static int vti4_err(struct sk_buff *skb, u32 info)
{
	__be32 spi;
	__u32 mark;
	struct xfrm_state *x;
	struct ip_tunnel *tunnel;
	struct ip_esp_hdr *esph;
	struct ip_auth_hdr *ah;
	struct ip_comp_hdr *ipch;
	struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	int protocol = iph->protocol;
	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
				  iph->daddr, iph->saddr, 0);
	if (!tunnel)
		return -1;

	/* The tunnel's output key is the mark used for state lookup. */
	mark = be32_to_cpu(tunnel->parms.o_key);

	switch (protocol) {
	case IPPROTO_ESP:
		esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
		spi = esph->spi;
		break;
	case IPPROTO_AH:
		ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
		spi = ah->spi;
		break;
	case IPPROTO_COMP:
		/* IPComp carries a 16-bit CPI; widen it into SPI form. */
		ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
		spi = htonl(ntohs(ipch->cpi));
		break;
	default:
		return 0;
	}

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		/* fall through - FRAG_NEEDED is processed below */
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr,
			      spi, protocol, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
	else
		ipv4_redirect(skb, net, 0, 0, protocol, 0);
	xfrm_state_put(x);

	return 0;
}
|
||||
|
||||
/* Legacy SIOC{ADD,CHG,DEL,GET}TUNNEL ioctl handler: copies the tunnel
 * parameters from/to userspace around the generic ip_tunnel_ioctl().
 */
static int
vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		/* VTI uses plain IPIP framing: fixed header, no options. */
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
		    p.iph.ihl != 5)
			return -EINVAL;
	}

	/* Keys are only meaningful when the legacy GRE_KEY flag is set. */
	if (!(p.i_flags & GRE_KEY))
		p.i_key = 0;
	if (!(p.o_flags & GRE_KEY))
		p.o_key = 0;

	p.i_flags = VTI_ISVTI;

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	/* Report keys back to userspace in the legacy GRE_KEY format. */
	if (cmd != SIOCDELTUNNEL) {
		p.i_flags |= GRE_KEY;
		p.o_flags |= GRE_KEY;
	}

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}
|
||||
|
||||
/* net_device callbacks for VTI interfaces; generic ip_tunnel helpers
 * cover everything except init/xmit/ioctl. */
static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= ip_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_do_ioctl	= vti_tunnel_ioctl,
	.ndo_change_mtu	= ip_tunnel_change_mtu,
	.ndo_get_stats64 = ip_tunnel_get_stats64,
};
|
||||
|
||||
/* rtnl ->setup callback: basic device configuration before ndo_init. */
static void vti_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &vti_netdev_ops;
	dev->type		= ARPHRD_TUNNEL;
	ip_tunnel_setup(dev, vti_net_id);
}
|
||||
|
||||
/* ndo_init: derive the device hardware/broadcast addresses from the
 * tunnel endpoints, set link-level parameters, then run the generic
 * ip_tunnel initialization.
 */
static int vti_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	/* dev_addr/broadcast carry the IPv4 endpoints (addr_len == 4). */
	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
	dev->mtu		= ETH_DATA_LEN;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_LLTX;	/* lockless transmit */
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}
|
||||
|
||||
/* Initialize the per-netns fallback device's IP header template. */
static void __net_init vti_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;
}
|
||||
|
||||
/* xfrm protocol hooks: VTI claims ESP, AH and IPComp input at high
 * priority (100) so decapsulated traffic is steered to the vti device
 * ahead of the plain transform handlers. */
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
	.handler	=	vti_rcv,
	.input_handler	=	vti_input,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
	.handler	=	vti_rcv,
	.input_handler	=	vti_input,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};

static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
	.handler	=	vti_rcv,
	.input_handler	=	vti_input,
	.cb_handler	=	vti_rcv_cb,
	.err_handler	=	vti4_err,
	.priority	=	100,
};
|
||||
|
||||
/* Per-netns setup: create the tunnel table plus the "ip_vti0" fallback
 * device and give the fallback a valid IP header template. */
static int __net_init vti_init_net(struct net *net)
{
	int err;
	struct ip_tunnel_net *itn;

	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
	if (err)
		return err;
	itn = net_generic(net, vti_net_id);
	vti_fb_tunnel_init(itn->fb_tunnel_dev);
	return 0;
}
|
||||
|
||||
/* Per-netns teardown: remove every VTI tunnel registered in this netns. */
static void __net_exit vti_exit_net(struct net *net)
{
	ip_tunnel_delete_net(net_generic(net, vti_net_id), &vti_link_ops);
}
|
||||
|
||||
/* Per-network-namespace lifecycle hooks and private-data sizing. */
static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit = vti_exit_net,
	.id   = &vti_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
|
||||
|
||||
/* rtnl ->validate: no constraints beyond the vti_policy table. */
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	return 0;
}
|
||||
|
||||
/* Fill *parms from IFLA_VTI_* netlink attributes. Absent attributes
 * leave the zeroed defaults in place; the tunnel always uses IPIP
 * framing and, when attributes are present, the VTI_ISVTI flag.
 */
static void vti_netlink_parms(struct nlattr *data[],
			      struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_IPIP;

	if (!data)
		return;

	parms->i_flags = VTI_ISVTI;

	if (data[IFLA_VTI_LINK])
		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);

	if (data[IFLA_VTI_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);

	if (data[IFLA_VTI_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);

	if (data[IFLA_VTI_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);

	if (data[IFLA_VTI_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);

}
|
||||
|
||||
static int vti_newlink(struct net *src_net, struct net_device *dev,
|
||||
struct nlattr *tb[], struct nlattr *data[])
|
||||
{
|
||||
struct ip_tunnel_parm parms;
|
||||
|
||||
vti_netlink_parms(data, &parms);
|
||||
return ip_tunnel_newlink(dev, tb, &parms);
|
||||
}
|
||||
|
||||
/* rtnl ->changelink: re-parse attributes and update the tunnel. */
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
			  struct nlattr *data[])
{
	struct ip_tunnel_parm p;

	vti_netlink_parms(data, &p);
	return ip_tunnel_changelink(dev, tb, &p);
}
|
||||
|
||||
/* Upper bound on the netlink payload emitted by vti_fill_info(). */
static size_t vti_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(4) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(4) +
		0;
}
|
||||
|
||||
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
||||
{
|
||||
struct ip_tunnel *t = netdev_priv(dev);
|
||||
struct ip_tunnel_parm *p = &t->parms;
|
||||
|
||||
nla_put_u32(skb, IFLA_VTI_LINK, p->link);
|
||||
nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
|
||||
nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
|
||||
nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
|
||||
nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Netlink attribute validation policy for IFLA_VTI_* attributes. */
static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
};
|
||||
|
||||
/* rtnetlink registration for "ip link add ... type vti". */
static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.dellink        = ip_tunnel_dellink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
};
|
||||
|
||||
/* Module init: register the per-netns device table, claim the ESP/AH/
 * IPComp xfrm protocol hooks, then register the rtnl link ops. On any
 * failure the already-completed steps are unwound in reverse order.
 */
static int __init vti_init(void)
{
	const char *msg;
	int err;

	pr_info("IPv4 over IPsec tunneling driver\n");

	msg = "tunnel device";
	err = register_pernet_device(&vti_net_ops);
	if (err < 0)
		goto pernet_dev_failed;

	msg = "tunnel protocols";
	err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP);
	if (err < 0)
		goto xfrm_proto_esp_failed;
	err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH);
	if (err < 0)
		goto xfrm_proto_ah_failed;
	err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP);
	if (err < 0)
		goto xfrm_proto_comp_failed;

	msg = "netlink interface";
	err = rtnl_link_register(&vti_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return err;

rtnl_link_failed:
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm_proto_ah_failed:
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
xfrm_proto_esp_failed:
	unregister_pernet_device(&vti_net_ops);
pernet_dev_failed:
	pr_err("vti init: failed to register %s\n", msg);
	return err;
}
|
||||
|
||||
/* Module exit: undo vti_init() registrations in reverse order. */
static void __exit vti_fini(void)
{
	rtnl_link_unregister(&vti_link_ops);
	xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
	xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
	xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
	unregister_pernet_device(&vti_net_ops);
}
|
||||
|
||||
/* Module entry points and load-by-name aliases. */
module_init(vti_init);
module_exit(vti_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
|
||||
204
net/ipv4/ipcomp.c
Normal file
204
net/ipv4/ipcomp.c
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
/*
|
||||
* IP Payload Compression Protocol (IPComp) - RFC3173.
|
||||
*
|
||||
* Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* Todo:
|
||||
* - Tunable compression parameters.
|
||||
* - Compression stats.
|
||||
* - Adaptive compression.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/ipcomp.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/* ICMP error handler for IPComp: find the state matching the CPI in the
 * embedded header and propagate PMTU updates or redirects. Always
 * returns 0.
 */
static int ipcomp4_err(struct sk_buff *skb, u32 info)
{
	struct net *net = dev_net(skb->dev);
	__be32 spi;
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
	struct xfrm_state *x;

	switch (icmp_hdr(skb)->type) {
	case ICMP_DEST_UNREACH:
		if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
			return 0;
		/* fall through - FRAG_NEEDED is processed below */
	case ICMP_REDIRECT:
		break;
	default:
		return 0;
	}

	/* IPComp carries a 16-bit CPI; widen it into SPI form for lookup. */
	spi = htonl(ntohs(ipch->cpi));
	x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
			      spi, IPPROTO_COMP, AF_INET);
	if (!x)
		return 0;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
		ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
	else
		ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
	xfrm_state_put(x);

	return 0;
}
|
||||
|
||||
/* We always hold one tunnel user reference to indicate a tunnel.
 *
 * Build an IPIP xfrm state mirroring the IPComp state x, used to carry
 * packets whose compressed payload must still be tunneled. Returns the
 * new state or NULL on allocation/init failure.
 */
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
	struct net *net = xs_net(x);
	struct xfrm_state *t;

	t = xfrm_state_alloc(net);
	if (t == NULL)
		goto out;

	t->id.proto = IPPROTO_IPIP;
	/* IPIP has no real SPI; key the state by our source address. */
	t->id.spi = x->props.saddr.a4;
	t->id.daddr.a4 = x->id.daddr.a4;
	memcpy(&t->sel, &x->sel, sizeof(t->sel));
	t->props.family = AF_INET;
	t->props.mode = x->props.mode;
	t->props.saddr.a4 = x->props.saddr.a4;
	t->props.flags = x->props.flags;
	t->props.extra_flags = x->props.extra_flags;
	memcpy(&t->mark, &x->mark, sizeof(t->mark));

	if (xfrm_init_state(t))
		goto error;

	atomic_set(&t->tunnel_users, 1);
out:
	return t;

error:
	/* Mark dead before dropping the last reference so teardown is clean. */
	t->km.state = XFRM_STATE_DEAD;
	xfrm_state_put(t);
	t = NULL;
	goto out;
}
|
||||
|
||||
/*
 * Must be protected by xfrm_cfg_mutex.  State and tunnel user references are
 * always incremented on success.
 */
static int ipcomp_tunnel_attach(struct xfrm_state *x)
{
	struct net *net = xs_net(x);
	int err = 0;
	struct xfrm_state *t;
	u32 mark = x->mark.v & x->mark.m;

	/* Reuse an existing IPIP state for the same endpoints if present,
	 * otherwise create and insert a fresh one. */
	t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
			      x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
	if (!t) {
		t = ipcomp_tunnel_create(x);
		if (!t) {
			err = -EINVAL;
			goto out;
		}
		xfrm_state_insert(t);
		xfrm_state_hold(t);
	}
	x->tunnel = t;
	atomic_inc(&t->tunnel_users);
out:
	return err;
}
|
||||
|
||||
/* xfrm_type ->init_state for IPv4 IPComp: size the header, run the
 * generic IPComp initialization and, in tunnel mode, attach the IPIP
 * carrier state. Returns 0 or a negative errno.
 */
static int ipcomp4_init_state(struct xfrm_state *x)
{
	int err = -EINVAL;

	x->props.header_len = 0;
	switch (x->props.mode) {
	case XFRM_MODE_TRANSPORT:
		break;
	case XFRM_MODE_TUNNEL:
		/* Tunnel mode prepends an outer IPv4 header. */
		x->props.header_len += sizeof(struct iphdr);
		break;
	default:
		goto out;
	}

	err = ipcomp_init_state(x);
	if (err)
		goto out;

	if (x->props.mode == XFRM_MODE_TUNNEL) {
		err = ipcomp_tunnel_attach(x);
		if (err)
			goto out;
	}

	err = 0;
out:
	return err;
}
|
||||
|
||||
/* Receive-callback hook: IPComp needs no post-input processing. */
static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
{
	return 0;
}
|
||||
|
||||
/* xfrm transform type and protocol registration tables for IPComp/IPv4;
 * priority 0 so higher-priority handlers (e.g. VTI) can intercept first. */
static const struct xfrm_type ipcomp_type = {
	.description	= "IPCOMP4",
	.owner		= THIS_MODULE,
	.proto	     	= IPPROTO_COMP,
	.init_state	= ipcomp4_init_state,
	.destructor	= ipcomp_destroy,
	.input		= ipcomp_input,
	.output		= ipcomp_output
};

static struct xfrm4_protocol ipcomp4_protocol = {
	.handler	= xfrm4_rcv,
	.input_handler	= xfrm_input,
	.cb_handler	= ipcomp4_rcv_cb,
	.err_handler	= ipcomp4_err,
	.priority	= 0,
};
|
||||
|
||||
/* Module init: register the IPComp xfrm type, then the protocol handler;
 * unregisters the type again if the protocol registration fails. */
static int __init ipcomp4_init(void)
{
	if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
		pr_info("%s: can't add xfrm type\n", __func__);
		return -EAGAIN;
	}
	if (xfrm4_protocol_register(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		xfrm_unregister_type(&ipcomp_type, AF_INET);
		return -EAGAIN;
	}
	return 0;
}
|
||||
|
||||
/* Module exit: deregister in reverse order of ipcomp4_init(). */
static void __exit ipcomp4_fini(void)
{
	if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
		pr_info("%s: can't remove xfrm type\n", __func__);
}
|
||||
|
||||
/* Module entry points and metadata. */
module_init(ipcomp4_init);
module_exit(ipcomp4_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp/IPv4) - RFC3173");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");

MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_COMP);
||||
1692
net/ipv4/ipconfig.c
Normal file
1692
net/ipv4/ipconfig.c
Normal file
File diff suppressed because it is too large
Load diff
568
net/ipv4/ipip.c
Normal file
568
net/ipv4/ipip.c
Normal file
|
|
@ -0,0 +1,568 @@
|
|||
/*
|
||||
* Linux NET3: IP/IP protocol decoder.
|
||||
*
|
||||
* Authors:
|
||||
* Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : Merged and made usable non modular (its so tiny its silly as
|
||||
* a module taking up 2 pages).
|
||||
* Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
|
||||
* to keep ip_forward happy.
|
||||
* Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
|
||||
* Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL
|
||||
* David Woodhouse : Perform some basic ICMP handling.
|
||||
* IPIP Routing without decapsulation.
|
||||
* Carlos Picoto : GRE over IP support
|
||||
* Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
|
||||
* I do not want to merge them together.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
/* tunnel.c: an IP tunnel driver
|
||||
|
||||
The purpose of this driver is to provide an IP tunnel through
|
||||
which you can tunnel network traffic transparently across subnets.
|
||||
|
||||
This was written by looking at Nick Holloway's dummy driver
|
||||
Thanks for the great code!
|
||||
|
||||
-Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95
|
||||
|
||||
Minor tweaks:
|
||||
Cleaned up the code a little and added some pre-1.3.0 tweaks.
|
||||
dev->hard_header/hard_header_len changed to use no headers.
|
||||
Comments/bracketing tweaked.
|
||||
Made the tunnels use dev->name not tunnel: when error reporting.
|
||||
Added tx_dropped stat
|
||||
|
||||
-Alan Cox (alan@lxorguk.ukuu.org.uk) 21 March 95
|
||||
|
||||
Reworked:
|
||||
Changed to tunnel to destination gateway in addition to the
|
||||
tunnel's pointopoint address
|
||||
Almost completely rewritten
|
||||
Note: There is currently no firewall or ICMP handling done.
|
||||
|
||||
-Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96
|
||||
|
||||
*/
|
||||
|
||||
/* Things I wish I had known when writing the tunnel driver:
|
||||
|
||||
When the tunnel_xmit() function is called, the skb contains the
|
||||
packet to be sent (plus a great deal of extra info), and dev
|
||||
contains the tunnel device that _we_ are.
|
||||
|
||||
When we are passed a packet, we are expected to fill in the
|
||||
source address with our source IP address.
|
||||
|
||||
What is the proper way to allocate, copy and free a buffer?
|
||||
After you allocate it, it is a "0 length" chunk of memory
|
||||
starting at zero. If you want to add headers to the buffer
|
||||
later, you'll have to call "skb_reserve(skb, amount)" with
|
||||
the amount of memory you want reserved. Then, you call
|
||||
"skb_put(skb, amount)" with the amount of space you want in
|
||||
the buffer. skb_put() returns a pointer to the top (#0) of
|
||||
that buffer. skb->len is set to the amount of space you have
|
||||
"allocated" with skb_put(). You can then write up to skb->len
|
||||
bytes to that buffer. If you need more, you can call skb_put()
|
||||
again with the additional amount of space you need. You can
|
||||
find out how much more space you can allocate by calling
|
||||
"skb_tailroom(skb)".
|
||||
Now, to add header space, call "skb_push(skb, header_len)".
|
||||
This creates space at the beginning of the buffer and returns
|
||||
a pointer to this new space. If later you need to strip a
|
||||
header from a buffer, call "skb_pull(skb, header_len)".
|
||||
skb_headroom() will return how much space is left at the top
|
||||
of the buffer (before the main data). Remember, this headroom
|
||||
space must be reserved before the skb_put() function is called.
|
||||
*/
|
||||
|
||||
/*
|
||||
This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
|
||||
|
||||
For comments look at net/ipv4/ip_gre.c --ANK
|
||||
*/
|
||||
|
||||
|
||||
#include <linux/capability.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/mroute.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/if_ether.h>
|
||||
|
||||
#include <net/sock.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/ip_tunnels.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
|
||||
/* Log packets received with a corrupted ECN field (runtime tunable). */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Per-netns id for the ipip tunnel table. */
static int ipip_net_id __read_mostly;

static int ipip_tunnel_init(struct net_device *dev);
static struct rtnl_link_ops ipip_link_ops __read_mostly;
|
||||
/* ICMP error handler for IPIP tunnels: update PMTU / redirect routes for
 * the matching tunnel, otherwise remember the error for later reporting
 * from the transmit path. Returns 0 when handled, -ENOENT when no
 * tunnel matches.
 */
static int ipip_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.
 */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct ip_tunnel *t;
	int err;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;

	err = -ENOENT;
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
			     iph->daddr, iph->saddr, 0);
	if (t == NULL)
		goto out;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_IPIP, 0);
		err = 0;
		goto out;
	}

	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_IPIP, 0);
		err = 0;
		goto out;
	}

	/* Unconfigured remote endpoint: nothing to report against. */
	if (t->parms.iph.daddr == 0)
		goto out;

	err = 0;
	/* ttl==0 means "inherit"; TIME_EXCEEDED is then expected noise. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Accumulate the error for rate-limited reporting on transmit. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

out:
	return err;
}
|
||||
|
||||
static const struct tnl_ptk_info tpi = {
	/* no tunnel info required for ipip. */
	.proto = htons(ETH_P_IP),
};
|
||||
|
||||
/* Receive handler for protocol 4 (IPIP): find the tunnel for the outer
 * addresses, strip the outer header and deliver the inner packet.
 * Returns -1 to let other handlers try when no tunnel matches.
 */
static int ipip_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
			iph->saddr, iph->daddr, 0);
	if (tunnel) {
		/* IPsec policy must permit decapsulation on this flow. */
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			goto drop;
		if (iptunnel_pull_header(skb, 0, tpi.proto))
			goto drop;
		return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
	}

	return -1;

drop:
	kfree_skb(skb);
	return 0;
}
|
||||
|
||||
/*
 *	This function assumes it is being called from dev_queue_xmit()
 *	and that skb is filled properly by that function.
 *
 *	Only IPv4 payloads are accepted; GSO offloads are prepared before
 *	handing the packet to the generic ip_tunnel transmit path.
 */
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr  *tiph = &tunnel->parms.iph;

	if (unlikely(skb->protocol != htons(ETH_P_IP)))
		goto tx_error;

	/* May segment/checksum the skb; returns ERR_PTR on failure. */
	skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
	if (IS_ERR(skb))
		goto out;

	skb_set_inner_ipproto(skb, IPPROTO_IPIP);

	ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
	return NETDEV_TX_OK;

tx_error:
	kfree_skb(skb);
out:
	dev->stats.tx_errors++;
	return NETDEV_TX_OK;
}
|
||||
|
||||
/* Legacy SIOC{ADD,CHG,DEL,GET}TUNNEL ioctl handler for IPIP devices. */
static int
ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		/* Plain IPIP header, no options, no reserved frag bits. */
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
			return -EINVAL;
	}

	/* IPIP tunnels carry no keys or flags. */
	p.i_key = p.o_key = p.i_flags = p.o_flags = 0;
	/* Fixed TTL implies path-MTU discovery (DF set). */
	if (p.iph.ttl)
		p.iph.frag_off |= htons(IP_DF);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;

	return 0;
}
|
||||
|
||||
/* net_device callbacks for IPIP interfaces. */
static const struct net_device_ops ipip_netdev_ops = {
	.ndo_init       = ipip_tunnel_init,
	.ndo_uninit     = ip_tunnel_uninit,
	.ndo_start_xmit	= ipip_tunnel_xmit,
	.ndo_do_ioctl	= ipip_tunnel_ioctl,
	.ndo_change_mtu = ip_tunnel_change_mtu,
	.ndo_get_stats64 = ip_tunnel_get_stats64,
};
|
||||
|
||||
/* Offload features the IPIP device can pass through to the lower dev. */
#define IPIP_FEATURES (NETIF_F_SG |		\
		       NETIF_F_FRAGLIST |	\
		       NETIF_F_HIGHDMA |	\
		       NETIF_F_GSO_SOFTWARE |	\
		       NETIF_F_HW_CSUM)
|
||||
/* rtnl ->setup callback: basic device configuration before ndo_init. */
static void ipip_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipip_netdev_ops;

	dev->type		= ARPHRD_TUNNEL;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* IPv4 endpoint addresses */
	dev->features		|= NETIF_F_LLTX;
	netif_keep_dst(dev);

	dev->features		|= IPIP_FEATURES;
	dev->hw_features	|= IPIP_FEATURES;
	ip_tunnel_setup(dev, ipip_net_id);
}
|
||||
|
||||
/* ndo_init: derive device addresses from the tunnel endpoints, set the
 * header sizes and protocol, then run the generic initialization. */
static int ipip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	/* No per-tunnel header of its own; any extra comes from encap. */
	tunnel->tun_hlen = 0;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
	tunnel->parms.iph.protocol = IPPROTO_IPIP;
	return ip_tunnel_init(dev);
}
|
||||
|
||||
/* Fill *parms from IFLA_IPTUN_* netlink attributes. Absent attributes
 * leave the IPIP defaults (version 4, ihl 5, protocol IPIP) in place;
 * PMTU discovery defaults to on unless explicitly disabled.
 */
static void ipip_netlink_parms(struct nlattr *data[],
			       struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.version = 4;
	parms->iph.protocol = IPPROTO_IPIP;
	parms->iph.ihl = 5;

	if (!data)
		return;

	if (data[IFLA_IPTUN_LINK])
		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);

	if (data[IFLA_IPTUN_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);

	if (data[IFLA_IPTUN_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);

	if (data[IFLA_IPTUN_TTL]) {
		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
		/* A fixed TTL implies path-MTU discovery (DF set). */
		if (parms->iph.ttl)
			parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_IPTUN_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);

	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
|
||||
|
||||
/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipip_netlink_encap_parms(struct nlattr *data[],
				     struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
	}

	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
	}

	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]);
	}

	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]);
	}

	return ret;
}
|
||||
|
||||
/* rtnl ->newlink: apply optional FOU/GUE encapsulation settings, then
 * parse the tunnel parameters and create the device. */
static int ipip_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;

	if (ipip_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		int err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	ipip_netlink_parms(data, &p);
	return ip_tunnel_newlink(dev, tb, &p);
}
|
||||
|
||||
/* rtnl ->changelink: like newlink, but a device may not switch between
 * point-to-point and NBMA mode (daddr set vs unset) after creation. */
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
			   struct nlattr *data[])
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;

	if (ipip_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);
		int err = ip_tunnel_encap_setup(t, &ipencap);

		if (err < 0)
			return err;
	}

	ipip_netlink_parms(data, &p);

	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
		return -EINVAL;

	return ip_tunnel_changelink(dev, tb, &p);
}
|
||||
|
||||
/* Upper bound on the netlink payload emitted by ipip_fill_info(). */
static size_t ipip_get_size(const struct net_device *dev)
{
	return
		/* IFLA_IPTUN_LINK */
		nla_total_size(4) +
		/* IFLA_IPTUN_LOCAL */
		nla_total_size(4) +
		/* IFLA_IPTUN_REMOTE */
		nla_total_size(4) +
		/* IFLA_IPTUN_TTL */
		nla_total_size(1) +
		/* IFLA_IPTUN_TOS */
		nla_total_size(1) +
		/* IFLA_IPTUN_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_IPTUN_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_IPTUN_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_IPTUN_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_IPTUN_ENCAP_DPORT */
		nla_total_size(2) +
		0;
}
|
||||
|
||||
static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
|
||||
{
|
||||
struct ip_tunnel *tunnel = netdev_priv(dev);
|
||||
struct ip_tunnel_parm *parm = &tunnel->parms;
|
||||
|
||||
if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
|
||||
nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
|
||||
nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
|
||||
nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
|
||||
nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
|
||||
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
|
||||
!!(parm->iph.frag_off & htons(IP_DF))))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
|
||||
tunnel->encap.type) ||
|
||||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT,
|
||||
tunnel->encap.sport) ||
|
||||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT,
|
||||
tunnel->encap.dport) ||
|
||||
nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
|
||||
tunnel->encap.dport))
|
||||
goto nla_put_failure;
|
||||
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
|
||||
[IFLA_IPTUN_LINK] = { .type = NLA_U32 },
|
||||
[IFLA_IPTUN_LOCAL] = { .type = NLA_U32 },
|
||||
[IFLA_IPTUN_REMOTE] = { .type = NLA_U32 },
|
||||
[IFLA_IPTUN_TTL] = { .type = NLA_U8 },
|
||||
[IFLA_IPTUN_TOS] = { .type = NLA_U8 },
|
||||
[IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 },
|
||||
[IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 },
|
||||
[IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 },
|
||||
[IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 },
|
||||
[IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 },
|
||||
};
|
||||
|
||||
static struct rtnl_link_ops ipip_link_ops __read_mostly = {
|
||||
.kind = "ipip",
|
||||
.maxtype = IFLA_IPTUN_MAX,
|
||||
.policy = ipip_policy,
|
||||
.priv_size = sizeof(struct ip_tunnel),
|
||||
.setup = ipip_tunnel_setup,
|
||||
.newlink = ipip_newlink,
|
||||
.changelink = ipip_changelink,
|
||||
.dellink = ip_tunnel_dellink,
|
||||
.get_size = ipip_get_size,
|
||||
.fill_info = ipip_fill_info,
|
||||
};
|
||||
|
||||
static struct xfrm_tunnel ipip_handler __read_mostly = {
|
||||
.handler = ipip_rcv,
|
||||
.err_handler = ipip_err,
|
||||
.priority = 1,
|
||||
};
|
||||
|
||||
static int __net_init ipip_init_net(struct net *net)
|
||||
{
|
||||
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
|
||||
}
|
||||
|
||||
static void __net_exit ipip_exit_net(struct net *net)
|
||||
{
|
||||
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
|
||||
ip_tunnel_delete_net(itn, &ipip_link_ops);
|
||||
}
|
||||
|
||||
static struct pernet_operations ipip_net_ops = {
|
||||
.init = ipip_init_net,
|
||||
.exit = ipip_exit_net,
|
||||
.id = &ipip_net_id,
|
||||
.size = sizeof(struct ip_tunnel_net),
|
||||
};
|
||||
|
||||
static int __init ipip_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_info("ipip: IPv4 over IPv4 tunneling driver\n");
|
||||
|
||||
err = register_pernet_device(&ipip_net_ops);
|
||||
if (err < 0)
|
||||
return err;
|
||||
err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
|
||||
if (err < 0) {
|
||||
pr_info("%s: can't register tunnel\n", __func__);
|
||||
goto xfrm_tunnel_failed;
|
||||
}
|
||||
err = rtnl_link_register(&ipip_link_ops);
|
||||
if (err < 0)
|
||||
goto rtnl_link_failed;
|
||||
|
||||
out:
|
||||
return err;
|
||||
|
||||
rtnl_link_failed:
|
||||
xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
|
||||
xfrm_tunnel_failed:
|
||||
unregister_pernet_device(&ipip_net_ops);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static void __exit ipip_fini(void)
|
||||
{
|
||||
rtnl_link_unregister(&ipip_link_ops);
|
||||
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
|
||||
pr_info("%s: can't deregister tunnel\n", __func__);
|
||||
|
||||
unregister_pernet_device(&ipip_net_ops);
|
||||
}
|
||||
|
||||
module_init(ipip_init);
|
||||
module_exit(ipip_fini);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_RTNL_LINK("ipip");
|
||||
MODULE_ALIAS_NETDEV("tunl0");
|
||||
2781
net/ipv4/ipmr.c
Normal file
2781
net/ipv4/ipmr.c
Normal file
File diff suppressed because it is too large
Load diff
207
net/ipv4/netfilter.c
Normal file
207
net/ipv4/netfilter.c
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
/*
|
||||
* IPv4 specific functions of netfilter core
|
||||
*
|
||||
* Rusty Russell (C) 2000 -- This code is GPL.
|
||||
* Patrick McHardy (C) 2006-2012
|
||||
*/
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/export.h>
|
||||
#include <net/route.h>
|
||||
#include <net/xfrm.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/netfilter/nf_queue.h>
|
||||
|
||||
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
|
||||
int ip_route_me_harder(struct sk_buff *skb, unsigned int addr_type)
|
||||
{
|
||||
struct net *net = dev_net(skb_dst(skb)->dev);
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
struct rtable *rt;
|
||||
struct flowi4 fl4 = {};
|
||||
__be32 saddr = iph->saddr;
|
||||
__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
|
||||
unsigned int hh_len;
|
||||
|
||||
if (addr_type == RTN_UNSPEC)
|
||||
addr_type = inet_addr_type(net, saddr);
|
||||
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
|
||||
flags |= FLOWI_FLAG_ANYSRC;
|
||||
else
|
||||
saddr = 0;
|
||||
|
||||
/* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
|
||||
* packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook.
|
||||
*/
|
||||
fl4.daddr = iph->daddr;
|
||||
fl4.saddr = saddr;
|
||||
fl4.flowi4_tos = RT_TOS(iph->tos);
|
||||
fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
|
||||
fl4.flowi4_mark = skb->mark;
|
||||
fl4.flowi4_flags = flags;
|
||||
rt = ip_route_output_key(net, &fl4);
|
||||
if (IS_ERR(rt))
|
||||
return PTR_ERR(rt);
|
||||
|
||||
/* Drop old route. */
|
||||
skb_dst_drop(skb);
|
||||
skb_dst_set(skb, &rt->dst);
|
||||
|
||||
if (skb_dst(skb)->error)
|
||||
return skb_dst(skb)->error;
|
||||
|
||||
#ifdef CONFIG_XFRM
|
||||
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
|
||||
xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
|
||||
struct dst_entry *dst = skb_dst(skb);
|
||||
skb_dst_set(skb, NULL);
|
||||
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0);
|
||||
if (IS_ERR(dst))
|
||||
return PTR_ERR(dst);
|
||||
skb_dst_set(skb, dst);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Change in oif may mean change in hh_len. */
|
||||
hh_len = skb_dst(skb)->dev->hard_header_len;
|
||||
if (skb_headroom(skb) < hh_len &&
|
||||
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
|
||||
0, GFP_ATOMIC))
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(ip_route_me_harder);
|
||||
|
||||
/*
|
||||
* Extra routing may needed on local out, as the QUEUE target never
|
||||
* returns control to the table.
|
||||
*/
|
||||
|
||||
struct ip_rt_info {
|
||||
__be32 daddr;
|
||||
__be32 saddr;
|
||||
u_int8_t tos;
|
||||
u_int32_t mark;
|
||||
};
|
||||
|
||||
static void nf_ip_saveroute(const struct sk_buff *skb,
|
||||
struct nf_queue_entry *entry)
|
||||
{
|
||||
struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
|
||||
|
||||
if (entry->hook == NF_INET_LOCAL_OUT) {
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
|
||||
rt_info->tos = iph->tos;
|
||||
rt_info->daddr = iph->daddr;
|
||||
rt_info->saddr = iph->saddr;
|
||||
rt_info->mark = skb->mark;
|
||||
}
|
||||
}
|
||||
|
||||
static int nf_ip_reroute(struct sk_buff *skb,
|
||||
const struct nf_queue_entry *entry)
|
||||
{
|
||||
const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
|
||||
|
||||
if (entry->hook == NF_INET_LOCAL_OUT) {
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
|
||||
if (!(iph->tos == rt_info->tos &&
|
||||
skb->mark == rt_info->mark &&
|
||||
iph->daddr == rt_info->daddr &&
|
||||
iph->saddr == rt_info->saddr))
|
||||
return ip_route_me_harder(skb, RTN_UNSPEC);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
|
||||
unsigned int dataoff, u_int8_t protocol)
|
||||
{
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
__sum16 csum = 0;
|
||||
|
||||
switch (skb->ip_summed) {
|
||||
case CHECKSUM_COMPLETE:
|
||||
if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
|
||||
break;
|
||||
if ((protocol == 0 && !csum_fold(skb->csum)) ||
|
||||
!csum_tcpudp_magic(iph->saddr, iph->daddr,
|
||||
skb->len - dataoff, protocol,
|
||||
skb->csum)) {
|
||||
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
||||
break;
|
||||
}
|
||||
/* fall through */
|
||||
case CHECKSUM_NONE:
|
||||
if (protocol == 0)
|
||||
skb->csum = 0;
|
||||
else
|
||||
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
|
||||
skb->len - dataoff,
|
||||
protocol, 0);
|
||||
csum = __skb_checksum_complete(skb);
|
||||
}
|
||||
return csum;
|
||||
}
|
||||
EXPORT_SYMBOL(nf_ip_checksum);
|
||||
|
||||
static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
|
||||
unsigned int dataoff, unsigned int len,
|
||||
u_int8_t protocol)
|
||||
{
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
__sum16 csum = 0;
|
||||
|
||||
switch (skb->ip_summed) {
|
||||
case CHECKSUM_COMPLETE:
|
||||
if (len == skb->len - dataoff)
|
||||
return nf_ip_checksum(skb, hook, dataoff, protocol);
|
||||
/* fall through */
|
||||
case CHECKSUM_NONE:
|
||||
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
|
||||
skb->len - dataoff, 0);
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
return __skb_checksum_complete_head(skb, dataoff + len);
|
||||
}
|
||||
return csum;
|
||||
}
|
||||
|
||||
static int nf_ip_route(struct net *net, struct dst_entry **dst,
|
||||
struct flowi *fl, bool strict __always_unused)
|
||||
{
|
||||
struct rtable *rt = ip_route_output_key(net, &fl->u.ip4);
|
||||
if (IS_ERR(rt))
|
||||
return PTR_ERR(rt);
|
||||
*dst = &rt->dst;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nf_afinfo nf_ip_afinfo = {
|
||||
.family = AF_INET,
|
||||
.checksum = nf_ip_checksum,
|
||||
.checksum_partial = nf_ip_checksum_partial,
|
||||
.route = nf_ip_route,
|
||||
.saveroute = nf_ip_saveroute,
|
||||
.reroute = nf_ip_reroute,
|
||||
.route_key_size = sizeof(struct ip_rt_info),
|
||||
};
|
||||
|
||||
static int __init ipv4_netfilter_init(void)
|
||||
{
|
||||
return nf_register_afinfo(&nf_ip_afinfo);
|
||||
}
|
||||
|
||||
static void __exit ipv4_netfilter_fini(void)
|
||||
{
|
||||
nf_unregister_afinfo(&nf_ip_afinfo);
|
||||
}
|
||||
|
||||
module_init(ipv4_netfilter_init);
|
||||
module_exit(ipv4_netfilter_fini);
|
||||
393
net/ipv4/netfilter/Kconfig
Normal file
393
net/ipv4/netfilter/Kconfig
Normal file
|
|
@ -0,0 +1,393 @@
|
|||
#
|
||||
# IP netfilter configuration
|
||||
#
|
||||
|
||||
menu "IP: Netfilter Configuration"
|
||||
depends on INET && NETFILTER
|
||||
|
||||
config NF_DEFRAG_IPV4
|
||||
tristate
|
||||
default n
|
||||
|
||||
config NF_CONNTRACK_IPV4
|
||||
tristate "IPv4 connection tracking support (required for NAT)"
|
||||
depends on NF_CONNTRACK
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NF_DEFRAG_IPV4
|
||||
---help---
|
||||
Connection tracking keeps a record of what packets have passed
|
||||
through your machine, in order to figure out how they are related
|
||||
into connections.
|
||||
|
||||
This is IPv4 support on Layer 3 independent connection tracking.
|
||||
Layer 3 independent connection tracking is experimental scheme
|
||||
which generalize ip_conntrack to support other layer 3 protocols.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config NF_CONNTRACK_PROC_COMPAT
|
||||
bool "proc/sysctl compatibility with old connection tracking"
|
||||
depends on NF_CONNTRACK_PROCFS && NF_CONNTRACK_IPV4
|
||||
default y
|
||||
help
|
||||
This option enables /proc and sysctl compatibility with the old
|
||||
layer 3 dependent connection tracking. This is needed to keep
|
||||
old programs that have not been adapted to the new names working.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config NF_LOG_ARP
|
||||
tristate "ARP packet logging"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NF_LOG_COMMON
|
||||
|
||||
config NF_LOG_IPV4
|
||||
tristate "IPv4 packet logging"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NF_LOG_COMMON
|
||||
|
||||
config NF_TABLES_IPV4
|
||||
depends on NF_TABLES
|
||||
tristate "IPv4 nf_tables support"
|
||||
help
|
||||
This option enables the IPv4 support for nf_tables.
|
||||
|
||||
config NFT_CHAIN_ROUTE_IPV4
|
||||
depends on NF_TABLES_IPV4
|
||||
tristate "IPv4 nf_tables route chain support"
|
||||
help
|
||||
This option enables the "route" chain for IPv4 in nf_tables. This
|
||||
chain type is used to force packet re-routing after mangling header
|
||||
fields such as the source, destination, type of service and
|
||||
the packet mark.
|
||||
|
||||
config NF_REJECT_IPV4
|
||||
tristate "IPv4 packet rejection"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
|
||||
config NFT_REJECT_IPV4
|
||||
depends on NF_TABLES_IPV4
|
||||
select NF_REJECT_IPV4
|
||||
default NFT_REJECT
|
||||
tristate
|
||||
|
||||
config NF_TABLES_ARP
|
||||
depends on NF_TABLES
|
||||
tristate "ARP nf_tables support"
|
||||
help
|
||||
This option enables the ARP support for nf_tables.
|
||||
|
||||
config NF_NAT_IPV4
|
||||
tristate "IPv4 NAT"
|
||||
depends on NF_CONNTRACK_IPV4
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NF_NAT
|
||||
help
|
||||
The IPv4 NAT option allows masquerading, port forwarding and other
|
||||
forms of full Network Address Port Translation. This can be
|
||||
controlled by iptables or nft.
|
||||
|
||||
if NF_NAT_IPV4
|
||||
|
||||
config NFT_CHAIN_NAT_IPV4
|
||||
depends on NF_TABLES_IPV4
|
||||
tristate "IPv4 nf_tables nat chain support"
|
||||
help
|
||||
This option enables the "nat" chain for IPv4 in nf_tables. This
|
||||
chain type is used to perform Network Address Translation (NAT)
|
||||
packet transformations such as the source, destination address and
|
||||
source and destination ports.
|
||||
|
||||
config NF_NAT_MASQUERADE_IPV4
|
||||
tristate "IPv4 masquerade support"
|
||||
help
|
||||
This is the kernel functionality to provide NAT in the masquerade
|
||||
flavour (automatic source address selection).
|
||||
|
||||
config NFT_MASQ_IPV4
|
||||
tristate "IPv4 masquerading support for nf_tables"
|
||||
depends on NF_TABLES_IPV4
|
||||
depends on NFT_MASQ
|
||||
select NF_NAT_MASQUERADE_IPV4
|
||||
help
|
||||
This is the expression that provides IPv4 masquerading support for
|
||||
nf_tables.
|
||||
|
||||
config NF_NAT_SNMP_BASIC
|
||||
tristate "Basic SNMP-ALG support"
|
||||
depends on NF_CONNTRACK_SNMP
|
||||
depends on NETFILTER_ADVANCED
|
||||
default NF_NAT && NF_CONNTRACK_SNMP
|
||||
---help---
|
||||
|
||||
This module implements an Application Layer Gateway (ALG) for
|
||||
SNMP payloads. In conjunction with NAT, it allows a network
|
||||
management system to access multiple private networks with
|
||||
conflicting addresses. It works by modifying IP addresses
|
||||
inside SNMP payloads to match IP-layer NAT mapping.
|
||||
|
||||
This is the "basic" form of SNMP-ALG, as described in RFC 2962
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config NF_NAT_PROTO_GRE
|
||||
tristate
|
||||
depends on NF_CT_PROTO_GRE
|
||||
|
||||
config NF_NAT_PPTP
|
||||
tristate
|
||||
depends on NF_CONNTRACK
|
||||
default NF_CONNTRACK_PPTP
|
||||
select NF_NAT_PROTO_GRE
|
||||
|
||||
config NF_NAT_H323
|
||||
tristate
|
||||
depends on NF_CONNTRACK
|
||||
default NF_CONNTRACK_H323
|
||||
|
||||
endif # NF_NAT_IPV4
|
||||
|
||||
config IP_NF_IPTABLES
|
||||
tristate "IP tables support (required for filtering/masq/NAT)"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NETFILTER_XTABLES
|
||||
help
|
||||
iptables is a general, extensible packet identification framework.
|
||||
The packet filtering and full NAT (masquerading, port forwarding,
|
||||
etc) subsystems now use this: say `Y' or `M' here if you want to use
|
||||
either of those.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
if IP_NF_IPTABLES
|
||||
|
||||
# The matches.
|
||||
config IP_NF_MATCH_AH
|
||||
tristate '"ah" match support'
|
||||
depends on NETFILTER_ADVANCED
|
||||
help
|
||||
This match extension allows you to match a range of SPIs
|
||||
inside AH header of IPSec packets.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_MATCH_ECN
|
||||
tristate '"ecn" match support'
|
||||
depends on NETFILTER_ADVANCED
|
||||
select NETFILTER_XT_MATCH_ECN
|
||||
---help---
|
||||
This is a backwards-compat option for the user's convenience
|
||||
(e.g. when running oldconfig). It selects
|
||||
CONFIG_NETFILTER_XT_MATCH_ECN.
|
||||
|
||||
config IP_NF_MATCH_RPFILTER
|
||||
tristate '"rpfilter" reverse path filter match support'
|
||||
depends on NETFILTER_ADVANCED && (IP_NF_MANGLE || IP_NF_RAW)
|
||||
---help---
|
||||
This option allows you to match packets whose replies would
|
||||
go out via the interface the packet came in.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
The module will be called ipt_rpfilter.
|
||||
|
||||
config IP_NF_MATCH_TTL
|
||||
tristate '"ttl" match support'
|
||||
depends on NETFILTER_ADVANCED
|
||||
select NETFILTER_XT_MATCH_HL
|
||||
---help---
|
||||
This is a backwards-compat option for the user's convenience
|
||||
(e.g. when running oldconfig). It selects
|
||||
CONFIG_NETFILTER_XT_MATCH_HL.
|
||||
|
||||
# `filter', generic and specific targets
|
||||
config IP_NF_FILTER
|
||||
tristate "Packet filtering"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
help
|
||||
Packet filtering defines a table `filter', which has a series of
|
||||
rules for simple packet filtering at local input, forwarding and
|
||||
local output. See the man page for iptables(8).
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_REJECT
|
||||
tristate "REJECT target support"
|
||||
depends on IP_NF_FILTER
|
||||
select NF_REJECT_IPV4
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
help
|
||||
The REJECT target allows a filtering rule to specify that an ICMP
|
||||
error should be issued in response to an incoming packet, rather
|
||||
than silently being dropped.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_SYNPROXY
|
||||
tristate "SYNPROXY target support"
|
||||
depends on NF_CONNTRACK && NETFILTER_ADVANCED
|
||||
select NETFILTER_SYNPROXY
|
||||
select SYN_COOKIES
|
||||
help
|
||||
The SYNPROXY target allows you to intercept TCP connections and
|
||||
establish them using syncookies before they are passed on to the
|
||||
server. This allows to avoid conntrack and server resource usage
|
||||
during SYN-flood attacks.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
# NAT + specific targets: nf_conntrack
|
||||
config IP_NF_NAT
|
||||
tristate "iptables NAT support"
|
||||
depends on NF_CONNTRACK_IPV4
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
select NF_NAT
|
||||
select NF_NAT_IPV4
|
||||
select NETFILTER_XT_NAT
|
||||
help
|
||||
This enables the `nat' table in iptables. This allows masquerading,
|
||||
port forwarding and other forms of full Network Address Port
|
||||
Translation.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
if IP_NF_NAT
|
||||
|
||||
config IP_NF_TARGET_MASQUERADE
|
||||
tristate "MASQUERADE target support"
|
||||
select NF_NAT_MASQUERADE_IPV4
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
help
|
||||
Masquerading is a special case of NAT: all outgoing connections are
|
||||
changed to seem to come from a particular interface's address, and
|
||||
if the interface goes down, those connections are lost. This is
|
||||
only useful for dialup accounts with dynamic IP address (ie. your IP
|
||||
address will be different on next dialup).
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_NETMAP
|
||||
tristate "NETMAP target support"
|
||||
depends on NETFILTER_ADVANCED
|
||||
select NETFILTER_XT_TARGET_NETMAP
|
||||
---help---
|
||||
This is a backwards-compat option for the user's convenience
|
||||
(e.g. when running oldconfig). It selects
|
||||
CONFIG_NETFILTER_XT_TARGET_NETMAP.
|
||||
|
||||
config IP_NF_TARGET_REDIRECT
|
||||
tristate "REDIRECT target support"
|
||||
depends on NETFILTER_ADVANCED
|
||||
select NETFILTER_XT_TARGET_REDIRECT
|
||||
---help---
|
||||
This is a backwards-compat option for the user's convenience
|
||||
(e.g. when running oldconfig). It selects
|
||||
CONFIG_NETFILTER_XT_TARGET_REDIRECT.
|
||||
|
||||
endif # IP_NF_NAT
|
||||
|
||||
# mangle + specific targets
|
||||
config IP_NF_MANGLE
|
||||
tristate "Packet mangling"
|
||||
default m if NETFILTER_ADVANCED=n
|
||||
help
|
||||
This option adds a `mangle' table to iptables: see the man page for
|
||||
iptables(8). This table is used for various packet alterations
|
||||
which can effect how the packet is routed.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_CLUSTERIP
|
||||
tristate "CLUSTERIP target support"
|
||||
depends on IP_NF_MANGLE
|
||||
depends on NF_CONNTRACK_IPV4
|
||||
depends on NETFILTER_ADVANCED
|
||||
select NF_CONNTRACK_MARK
|
||||
help
|
||||
The CLUSTERIP target allows you to build load-balancing clusters of
|
||||
network servers without having a dedicated load-balancing
|
||||
router/server/switch.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_ECN
|
||||
tristate "ECN target support"
|
||||
depends on IP_NF_MANGLE
|
||||
depends on NETFILTER_ADVANCED
|
||||
---help---
|
||||
This option adds a `ECN' target, which can be used in the iptables mangle
|
||||
table.
|
||||
|
||||
You can use this target to remove the ECN bits from the IPv4 header of
|
||||
an IP packet. This is particularly useful, if you need to work around
|
||||
existing ECN blackholes on the internet, but don't want to disable
|
||||
ECN support in general.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_TARGET_TTL
|
||||
tristate '"TTL" target support'
|
||||
depends on NETFILTER_ADVANCED && IP_NF_MANGLE
|
||||
select NETFILTER_XT_TARGET_HL
|
||||
---help---
|
||||
This is a backwards-compatible option for the user's convenience
|
||||
(e.g. when running oldconfig). It selects
|
||||
CONFIG_NETFILTER_XT_TARGET_HL.
|
||||
|
||||
# raw + specific targets
|
||||
config IP_NF_RAW
|
||||
tristate 'raw table support (required for NOTRACK/TRACE)'
|
||||
help
|
||||
This option adds a `raw' table to iptables. This table is the very
|
||||
first in the netfilter framework and hooks in at the PREROUTING
|
||||
and OUTPUT chains.
|
||||
|
||||
If you want to compile it as a module, say M here and read
|
||||
<file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
|
||||
|
||||
# security table for MAC policy
|
||||
config IP_NF_SECURITY
|
||||
tristate "Security table"
|
||||
depends on SECURITY
|
||||
depends on NETFILTER_ADVANCED
|
||||
help
|
||||
This option adds a `security' table to iptables, for use
|
||||
with Mandatory Access Control (MAC) policy.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
endif # IP_NF_IPTABLES
|
||||
|
||||
# ARP tables
|
||||
config IP_NF_ARPTABLES
|
||||
tristate "ARP tables support"
|
||||
select NETFILTER_XTABLES
|
||||
depends on NETFILTER_ADVANCED
|
||||
help
|
||||
arptables is a general, extensible packet identification framework.
|
||||
The ARP packet filtering and mangling (manipulation)subsystems
|
||||
use this: say Y or M here if you want to use either of those.
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
if IP_NF_ARPTABLES
|
||||
|
||||
config IP_NF_ARPFILTER
|
||||
tristate "ARP packet filtering"
|
||||
help
|
||||
ARP packet filtering defines a table `filter', which has a series of
|
||||
rules for simple ARP packet filtering at local input and
|
||||
local output. On a bridge, you can also specify filtering rules
|
||||
for forwarded ARP packets. See the man page for arptables(8).
|
||||
|
||||
To compile it as a module, choose M here. If unsure, say N.
|
||||
|
||||
config IP_NF_ARP_MANGLE
|
||||
tristate "ARP payload mangling"
|
||||
help
|
||||
Allows altering the ARP packet payload: source and destination
|
||||
hardware and network addresses.
|
||||
|
||||
endif # IP_NF_ARPTABLES
|
||||
|
||||
endmenu
|
||||
|
||||
71
net/ipv4/netfilter/Makefile
Normal file
71
net/ipv4/netfilter/Makefile
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
#
|
||||
# Makefile for the netfilter modules on top of IPv4.
|
||||
#
|
||||
|
||||
# objects for l3 independent conntrack
|
||||
nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
|
||||
ifeq ($(CONFIG_NF_CONNTRACK_PROC_COMPAT),y)
|
||||
ifeq ($(CONFIG_PROC_FS),y)
|
||||
nf_conntrack_ipv4-objs += nf_conntrack_l3proto_ipv4_compat.o
|
||||
endif
|
||||
endif
|
||||
|
||||
# connection tracking
|
||||
obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
|
||||
|
||||
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
|
||||
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
|
||||
|
||||
# defrag
|
||||
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
|
||||
|
||||
# logging
|
||||
obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
|
||||
obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
|
||||
|
||||
# reject
|
||||
obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
|
||||
|
||||
# NAT helpers (nf_conntrack)
|
||||
obj-$(CONFIG_NF_NAT_H323) += nf_nat_h323.o
|
||||
obj-$(CONFIG_NF_NAT_PPTP) += nf_nat_pptp.o
|
||||
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
|
||||
obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
|
||||
|
||||
# NAT protocols (nf_nat)
|
||||
obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
|
||||
|
||||
obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
|
||||
obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
|
||||
obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
|
||||
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
|
||||
obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
|
||||
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
|
||||
|
||||
# generic IP tables
|
||||
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
|
||||
|
||||
# the three instances of ip_tables
|
||||
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
|
||||
obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
|
||||
obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
|
||||
obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
|
||||
obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o
|
||||
|
||||
# matches
|
||||
obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
|
||||
obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
|
||||
|
||||
# targets
|
||||
obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
|
||||
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
|
||||
obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
|
||||
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
|
||||
obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
|
||||
|
||||
# generic ARP tables
|
||||
obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
|
||||
obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
|
||||
|
||||
# just filtering instance of ARP tables for now
|
||||
obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
|
||||
1921
net/ipv4/netfilter/arp_tables.c
Normal file
1921
net/ipv4/netfilter/arp_tables.c
Normal file
File diff suppressed because it is too large
Load diff
91
net/ipv4/netfilter/arpt_mangle.c
Normal file
91
net/ipv4/netfilter/arpt_mangle.c
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
/* module that allows mangling of the arp payload */
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_arp/arpt_mangle.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
|
||||
MODULE_DESCRIPTION("arptables arp payload mangle target");
|
||||
|
||||
static unsigned int
|
||||
target(struct sk_buff *skb, const struct xt_action_param *par)
|
||||
{
|
||||
const struct arpt_mangle *mangle = par->targinfo;
|
||||
const struct arphdr *arp;
|
||||
unsigned char *arpptr;
|
||||
int pln, hln;
|
||||
|
||||
if (!skb_make_writable(skb, skb->len))
|
||||
return NF_DROP;
|
||||
|
||||
arp = arp_hdr(skb);
|
||||
arpptr = skb_network_header(skb) + sizeof(*arp);
|
||||
pln = arp->ar_pln;
|
||||
hln = arp->ar_hln;
|
||||
/* We assume that pln and hln were checked in the match */
|
||||
if (mangle->flags & ARPT_MANGLE_SDEV) {
|
||||
if (ARPT_DEV_ADDR_LEN_MAX < hln ||
|
||||
(arpptr + hln > skb_tail_pointer(skb)))
|
||||
return NF_DROP;
|
||||
memcpy(arpptr, mangle->src_devaddr, hln);
|
||||
}
|
||||
arpptr += hln;
|
||||
if (mangle->flags & ARPT_MANGLE_SIP) {
|
||||
if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
|
||||
(arpptr + pln > skb_tail_pointer(skb)))
|
||||
return NF_DROP;
|
||||
memcpy(arpptr, &mangle->u_s.src_ip, pln);
|
||||
}
|
||||
arpptr += pln;
|
||||
if (mangle->flags & ARPT_MANGLE_TDEV) {
|
||||
if (ARPT_DEV_ADDR_LEN_MAX < hln ||
|
||||
(arpptr + hln > skb_tail_pointer(skb)))
|
||||
return NF_DROP;
|
||||
memcpy(arpptr, mangle->tgt_devaddr, hln);
|
||||
}
|
||||
arpptr += hln;
|
||||
if (mangle->flags & ARPT_MANGLE_TIP) {
|
||||
if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
|
||||
(arpptr + pln > skb_tail_pointer(skb)))
|
||||
return NF_DROP;
|
||||
memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
|
||||
}
|
||||
return mangle->target;
|
||||
}
|
||||
|
||||
static int checkentry(const struct xt_tgchk_param *par)
|
||||
{
|
||||
const struct arpt_mangle *mangle = par->targinfo;
|
||||
|
||||
if (mangle->flags & ~ARPT_MANGLE_MASK ||
|
||||
!(mangle->flags & ARPT_MANGLE_MASK))
|
||||
return -EINVAL;
|
||||
|
||||
if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
|
||||
mangle->target != XT_CONTINUE)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct xt_target arpt_mangle_reg __read_mostly = {
|
||||
.name = "mangle",
|
||||
.family = NFPROTO_ARP,
|
||||
.target = target,
|
||||
.targetsize = sizeof(struct arpt_mangle),
|
||||
.checkentry = checkentry,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init arpt_mangle_init(void)
|
||||
{
|
||||
return xt_register_target(&arpt_mangle_reg);
|
||||
}
|
||||
|
||||
static void __exit arpt_mangle_fini(void)
|
||||
{
|
||||
xt_unregister_target(&arpt_mangle_reg);
|
||||
}
|
||||
|
||||
module_init(arpt_mangle_init);
|
||||
module_exit(arpt_mangle_fini);
|
||||
92
net/ipv4/netfilter/arptable_filter.c
Normal file
92
net/ipv4/netfilter/arptable_filter.c
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* Filtering ARP tables module.
|
||||
*
|
||||
* Copyright (C) 2002 David S. Miller (davem@redhat.com)
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <linux/netfilter_arp/arp_tables.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
|
||||
MODULE_DESCRIPTION("arptables filter table");
|
||||
|
||||
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
|
||||
(1 << NF_ARP_FORWARD))
|
||||
|
||||
static const struct xt_table packet_filter = {
|
||||
.name = "filter",
|
||||
.valid_hooks = FILTER_VALID_HOOKS,
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_ARP,
|
||||
.priority = NF_IP_PRI_FILTER,
|
||||
};
|
||||
|
||||
/* The work comes in here from netfilter.c */
|
||||
static unsigned int
|
||||
arptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
|
||||
const struct net_device *in, const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
const struct net *net = dev_net((in != NULL) ? in : out);
|
||||
|
||||
return arpt_do_table(skb, ops->hooknum, in, out,
|
||||
net->ipv4.arptable_filter);
|
||||
}
|
||||
|
||||
static struct nf_hook_ops *arpfilter_ops __read_mostly;
|
||||
|
||||
static int __net_init arptable_filter_net_init(struct net *net)
|
||||
{
|
||||
struct arpt_replace *repl;
|
||||
|
||||
repl = arpt_alloc_initial_table(&packet_filter);
|
||||
if (repl == NULL)
|
||||
return -ENOMEM;
|
||||
net->ipv4.arptable_filter =
|
||||
arpt_register_table(net, &packet_filter, repl);
|
||||
kfree(repl);
|
||||
return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
|
||||
}
|
||||
|
||||
static void __net_exit arptable_filter_net_exit(struct net *net)
|
||||
{
|
||||
arpt_unregister_table(net->ipv4.arptable_filter);
|
||||
}
|
||||
|
||||
static struct pernet_operations arptable_filter_net_ops = {
|
||||
.init = arptable_filter_net_init,
|
||||
.exit = arptable_filter_net_exit,
|
||||
};
|
||||
|
||||
static int __init arptable_filter_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_pernet_subsys(&arptable_filter_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
|
||||
if (IS_ERR(arpfilter_ops)) {
|
||||
ret = PTR_ERR(arpfilter_ops);
|
||||
goto cleanup_table;
|
||||
}
|
||||
return ret;
|
||||
|
||||
cleanup_table:
|
||||
unregister_pernet_subsys(&arptable_filter_net_ops);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit arptable_filter_fini(void)
|
||||
{
|
||||
xt_hook_unlink(&packet_filter, arpfilter_ops);
|
||||
unregister_pernet_subsys(&arptable_filter_net_ops);
|
||||
}
|
||||
|
||||
module_init(arptable_filter_init);
|
||||
module_exit(arptable_filter_fini);
|
||||
2276
net/ipv4/netfilter/ip_tables.c
Normal file
2276
net/ipv4/netfilter/ip_tables.c
Normal file
File diff suppressed because it is too large
Load diff
789
net/ipv4/netfilter/ipt_CLUSTERIP.c
Normal file
789
net/ipv4/netfilter/ipt_CLUSTERIP.c
Normal file
|
|
@ -0,0 +1,789 @@
|
|||
/* Cluster IP hashmark target
|
||||
* (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
|
||||
* based on ideas of Fabio Olive Leite <olive@unixforge.org>
|
||||
*
|
||||
* Development of this code funded by SuSE Linux AG, http://www.suse.com/
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/netfilter_arp.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/netns/generic.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#define CLUSTERIP_VERSION "0.8"
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
|
||||
|
||||
struct clusterip_config {
|
||||
struct list_head list; /* list of all configs */
|
||||
atomic_t refcount; /* reference count */
|
||||
atomic_t entries; /* number of entries/rules
|
||||
* referencing us */
|
||||
|
||||
__be32 clusterip; /* the IP address */
|
||||
u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
|
||||
struct net_device *dev; /* device */
|
||||
u_int16_t num_total_nodes; /* total number of nodes */
|
||||
unsigned long local_nodes; /* node number array */
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
struct proc_dir_entry *pde; /* proc dir entry */
|
||||
#endif
|
||||
enum clusterip_hashmode hash_mode; /* which hashing mode */
|
||||
u_int32_t hash_initval; /* hash initialization */
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
static const struct file_operations clusterip_proc_fops;
|
||||
#endif
|
||||
|
||||
static int clusterip_net_id __read_mostly;
|
||||
|
||||
struct clusterip_net {
|
||||
struct list_head configs;
|
||||
/* lock protects the configs list */
|
||||
spinlock_t lock;
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
struct proc_dir_entry *procdir;
|
||||
#endif
|
||||
};
|
||||
|
||||
static inline void
|
||||
clusterip_config_get(struct clusterip_config *c)
|
||||
{
|
||||
atomic_inc(&c->refcount);
|
||||
}
|
||||
|
||||
|
||||
static void clusterip_config_rcu_free(struct rcu_head *head)
|
||||
{
|
||||
kfree(container_of(head, struct clusterip_config, rcu));
|
||||
}
|
||||
|
||||
static inline void
|
||||
clusterip_config_put(struct clusterip_config *c)
|
||||
{
|
||||
if (atomic_dec_and_test(&c->refcount))
|
||||
call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
|
||||
}
|
||||
|
||||
/* decrease the count of entries using/referencing this config. If last
|
||||
* entry(rule) is removed, remove the config from lists, but don't free it
|
||||
* yet, since proc-files could still be holding references */
|
||||
static inline void
|
||||
clusterip_config_entry_put(struct clusterip_config *c)
|
||||
{
|
||||
struct net *net = dev_net(c->dev);
|
||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||
|
||||
local_bh_disable();
|
||||
if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
|
||||
list_del_rcu(&c->list);
|
||||
spin_unlock(&cn->lock);
|
||||
local_bh_enable();
|
||||
|
||||
dev_mc_del(c->dev, c->clustermac);
|
||||
dev_put(c->dev);
|
||||
|
||||
/* In case anyone still accesses the file, the open/close
|
||||
* functions are also incrementing the refcount on their own,
|
||||
* so it's safe to remove the entry even if it's in use. */
|
||||
#ifdef CONFIG_PROC_FS
|
||||
proc_remove(c->pde);
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
local_bh_enable();
|
||||
}
|
||||
|
||||
static struct clusterip_config *
|
||||
__clusterip_config_find(struct net *net, __be32 clusterip)
|
||||
{
|
||||
struct clusterip_config *c;
|
||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||
|
||||
list_for_each_entry_rcu(c, &cn->configs, list) {
|
||||
if (c->clusterip == clusterip)
|
||||
return c;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct clusterip_config *
|
||||
clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
|
||||
{
|
||||
struct clusterip_config *c;
|
||||
|
||||
rcu_read_lock_bh();
|
||||
c = __clusterip_config_find(net, clusterip);
|
||||
if (c) {
|
||||
if (unlikely(!atomic_inc_not_zero(&c->refcount)))
|
||||
c = NULL;
|
||||
else if (entry)
|
||||
atomic_inc(&c->entries);
|
||||
}
|
||||
rcu_read_unlock_bh();
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
static void
|
||||
clusterip_config_init_nodelist(struct clusterip_config *c,
|
||||
const struct ipt_clusterip_tgt_info *i)
|
||||
{
|
||||
int n;
|
||||
|
||||
for (n = 0; n < i->num_local_nodes; n++)
|
||||
set_bit(i->local_nodes[n] - 1, &c->local_nodes);
|
||||
}
|
||||
|
||||
static struct clusterip_config *
|
||||
clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
|
||||
struct net_device *dev)
|
||||
{
|
||||
struct clusterip_config *c;
|
||||
struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
|
||||
|
||||
c = kzalloc(sizeof(*c), GFP_ATOMIC);
|
||||
if (!c)
|
||||
return NULL;
|
||||
|
||||
c->dev = dev;
|
||||
c->clusterip = ip;
|
||||
memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
|
||||
c->num_total_nodes = i->num_total_nodes;
|
||||
clusterip_config_init_nodelist(c, i);
|
||||
c->hash_mode = i->hash_mode;
|
||||
c->hash_initval = i->hash_initval;
|
||||
atomic_set(&c->refcount, 1);
|
||||
atomic_set(&c->entries, 1);
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
{
|
||||
char buffer[16];
|
||||
|
||||
/* create proc dir entry */
|
||||
sprintf(buffer, "%pI4", &ip);
|
||||
c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
|
||||
cn->procdir,
|
||||
&clusterip_proc_fops, c);
|
||||
if (!c->pde) {
|
||||
kfree(c);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
spin_lock_bh(&cn->lock);
|
||||
list_add_rcu(&c->list, &cn->configs);
|
||||
spin_unlock_bh(&cn->lock);
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
static int
|
||||
clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
|
||||
{
|
||||
|
||||
if (nodenum == 0 ||
|
||||
nodenum > c->num_total_nodes)
|
||||
return 1;
|
||||
|
||||
/* check if we already have this number in our bitfield */
|
||||
if (test_and_set_bit(nodenum - 1, &c->local_nodes))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool
|
||||
clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
|
||||
{
|
||||
if (nodenum == 0 ||
|
||||
nodenum > c->num_total_nodes)
|
||||
return true;
|
||||
|
||||
if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline u_int32_t
|
||||
clusterip_hashfn(const struct sk_buff *skb,
|
||||
const struct clusterip_config *config)
|
||||
{
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
unsigned long hashval;
|
||||
u_int16_t sport = 0, dport = 0;
|
||||
int poff;
|
||||
|
||||
poff = proto_ports_offset(iph->protocol);
|
||||
if (poff >= 0) {
|
||||
const u_int16_t *ports;
|
||||
u16 _ports[2];
|
||||
|
||||
ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
|
||||
if (ports) {
|
||||
sport = ports[0];
|
||||
dport = ports[1];
|
||||
}
|
||||
} else {
|
||||
net_info_ratelimited("unknown protocol %u\n", iph->protocol);
|
||||
}
|
||||
|
||||
switch (config->hash_mode) {
|
||||
case CLUSTERIP_HASHMODE_SIP:
|
||||
hashval = jhash_1word(ntohl(iph->saddr),
|
||||
config->hash_initval);
|
||||
break;
|
||||
case CLUSTERIP_HASHMODE_SIP_SPT:
|
||||
hashval = jhash_2words(ntohl(iph->saddr), sport,
|
||||
config->hash_initval);
|
||||
break;
|
||||
case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
|
||||
hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
|
||||
config->hash_initval);
|
||||
break;
|
||||
default:
|
||||
/* to make gcc happy */
|
||||
hashval = 0;
|
||||
/* This cannot happen, unless the check function wasn't called
|
||||
* at rule load time */
|
||||
pr_info("unknown mode %u\n", config->hash_mode);
|
||||
BUG();
|
||||
break;
|
||||
}
|
||||
|
||||
/* node numbers are 1..n, not 0..n */
|
||||
return reciprocal_scale(hashval, config->num_total_nodes) + 1;
|
||||
}
|
||||
|
||||
static inline int
|
||||
clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
|
||||
{
|
||||
return test_bit(hash - 1, &config->local_nodes);
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* IPTABLES TARGET
|
||||
***********************************************************************/
|
||||
|
||||
static unsigned int
|
||||
clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
|
||||
{
|
||||
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
|
||||
struct nf_conn *ct;
|
||||
enum ip_conntrack_info ctinfo;
|
||||
u_int32_t hash;
|
||||
|
||||
/* don't need to clusterip_config_get() here, since refcount
|
||||
* is only decremented by destroy() - and ip_tables guarantees
|
||||
* that the ->target() function isn't called after ->destroy() */
|
||||
|
||||
ct = nf_ct_get(skb, &ctinfo);
|
||||
if (ct == NULL)
|
||||
return NF_DROP;
|
||||
|
||||
/* special case: ICMP error handling. conntrack distinguishes between
|
||||
* error messages (RELATED) and information requests (see below) */
|
||||
if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
|
||||
(ctinfo == IP_CT_RELATED ||
|
||||
ctinfo == IP_CT_RELATED_REPLY))
|
||||
return XT_CONTINUE;
|
||||
|
||||
/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
|
||||
* TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
|
||||
* on, which all have an ID field [relevant for hashing]. */
|
||||
|
||||
hash = clusterip_hashfn(skb, cipinfo->config);
|
||||
|
||||
switch (ctinfo) {
|
||||
case IP_CT_NEW:
|
||||
ct->mark = hash;
|
||||
break;
|
||||
case IP_CT_RELATED:
|
||||
case IP_CT_RELATED_REPLY:
|
||||
/* FIXME: we don't handle expectations at the moment.
|
||||
* They can arrive on a different node than
|
||||
* the master connection (e.g. FTP passive mode) */
|
||||
case IP_CT_ESTABLISHED:
|
||||
case IP_CT_ESTABLISHED_REPLY:
|
||||
break;
|
||||
default: /* Prevent gcc warnings */
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
|
||||
#endif
|
||||
pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
|
||||
if (!clusterip_responsible(cipinfo->config, hash)) {
|
||||
pr_debug("not responsible\n");
|
||||
return NF_DROP;
|
||||
}
|
||||
pr_debug("responsible\n");
|
||||
|
||||
/* despite being received via linklayer multicast, this is
|
||||
* actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
|
||||
skb->pkt_type = PACKET_HOST;
|
||||
|
||||
return XT_CONTINUE;
|
||||
}
|
||||
|
||||
static int clusterip_tg_check(const struct xt_tgchk_param *par)
|
||||
{
|
||||
struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
|
||||
const struct ipt_entry *e = par->entryinfo;
|
||||
struct clusterip_config *config;
|
||||
int ret;
|
||||
|
||||
if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
|
||||
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
|
||||
cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
|
||||
pr_info("unknown mode %u\n", cipinfo->hash_mode);
|
||||
return -EINVAL;
|
||||
|
||||
}
|
||||
if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
|
||||
e->ip.dst.s_addr == 0) {
|
||||
pr_info("Please specify destination IP\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* FIXME: further sanity checks */
|
||||
|
||||
config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
|
||||
if (!config) {
|
||||
if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
|
||||
pr_info("no config found for %pI4, need 'new'\n",
|
||||
&e->ip.dst.s_addr);
|
||||
return -EINVAL;
|
||||
} else {
|
||||
struct net_device *dev;
|
||||
|
||||
if (e->ip.iniface[0] == '\0') {
|
||||
pr_info("Please specify an interface name\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dev = dev_get_by_name(par->net, e->ip.iniface);
|
||||
if (!dev) {
|
||||
pr_info("no such interface %s\n",
|
||||
e->ip.iniface);
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
config = clusterip_config_init(cipinfo,
|
||||
e->ip.dst.s_addr, dev);
|
||||
if (!config) {
|
||||
dev_put(dev);
|
||||
return -ENOMEM;
|
||||
}
|
||||
dev_mc_add(config->dev, config->clustermac);
|
||||
}
|
||||
}
|
||||
cipinfo->config = config;
|
||||
|
||||
ret = nf_ct_l3proto_try_module_get(par->family);
|
||||
if (ret < 0)
|
||||
pr_info("cannot load conntrack support for proto=%u\n",
|
||||
par->family);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* drop reference count of cluster config when rule is deleted */
|
||||
static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
|
||||
{
|
||||
const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
|
||||
|
||||
/* if no more entries are referencing the config, remove it
|
||||
* from the list and destroy the proc entry */
|
||||
clusterip_config_entry_put(cipinfo->config);
|
||||
|
||||
clusterip_config_put(cipinfo->config);
|
||||
|
||||
nf_ct_l3proto_module_put(par->family);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
struct compat_ipt_clusterip_tgt_info
|
||||
{
|
||||
u_int32_t flags;
|
||||
u_int8_t clustermac[6];
|
||||
u_int16_t num_total_nodes;
|
||||
u_int16_t num_local_nodes;
|
||||
u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
|
||||
u_int32_t hash_mode;
|
||||
u_int32_t hash_initval;
|
||||
compat_uptr_t config;
|
||||
};
|
||||
#endif /* CONFIG_COMPAT */
|
||||
|
||||
static struct xt_target clusterip_tg_reg __read_mostly = {
|
||||
.name = "CLUSTERIP",
|
||||
.family = NFPROTO_IPV4,
|
||||
.target = clusterip_tg,
|
||||
.checkentry = clusterip_tg_check,
|
||||
.destroy = clusterip_tg_destroy,
|
||||
.targetsize = sizeof(struct ipt_clusterip_tgt_info),
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
|
||||
#endif /* CONFIG_COMPAT */
|
||||
.me = THIS_MODULE
|
||||
};
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* ARP MANGLING CODE
|
||||
***********************************************************************/
|
||||
|
||||
/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
|
||||
struct arp_payload {
|
||||
u_int8_t src_hw[ETH_ALEN];
|
||||
__be32 src_ip;
|
||||
u_int8_t dst_hw[ETH_ALEN];
|
||||
__be32 dst_ip;
|
||||
} __packed;
|
||||
|
||||
#ifdef DEBUG
|
||||
static void arp_print(struct arp_payload *payload)
|
||||
{
|
||||
#define HBUFFERLEN 30
|
||||
char hbuffer[HBUFFERLEN];
|
||||
int j,k;
|
||||
|
||||
for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
|
||||
hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
|
||||
hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
|
||||
hbuffer[k++]=':';
|
||||
}
|
||||
hbuffer[--k]='\0';
|
||||
|
||||
pr_debug("src %pI4@%s, dst %pI4\n",
|
||||
&payload->src_ip, hbuffer, &payload->dst_ip);
|
||||
}
|
||||
#endif
|
||||
|
||||
static unsigned int
|
||||
arp_mangle(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
struct arphdr *arp = arp_hdr(skb);
|
||||
struct arp_payload *payload;
|
||||
struct clusterip_config *c;
|
||||
struct net *net = dev_net(in ? in : out);
|
||||
|
||||
/* we don't care about non-ethernet and non-ipv4 ARP */
|
||||
if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
|
||||
arp->ar_pro != htons(ETH_P_IP) ||
|
||||
arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* we only want to mangle arp requests and replies */
|
||||
if (arp->ar_op != htons(ARPOP_REPLY) &&
|
||||
arp->ar_op != htons(ARPOP_REQUEST))
|
||||
return NF_ACCEPT;
|
||||
|
||||
payload = (void *)(arp+1);
|
||||
|
||||
/* if there is no clusterip configuration for the arp reply's
|
||||
* source ip, we don't want to mangle it */
|
||||
c = clusterip_config_find_get(net, payload->src_ip, 0);
|
||||
if (!c)
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* normally the linux kernel always replies to arp queries of
|
||||
* addresses on different interfacs. However, in the CLUSTERIP case
|
||||
* this wouldn't work, since we didn't subscribe the mcast group on
|
||||
* other interfaces */
|
||||
if (c->dev != out) {
|
||||
pr_debug("not mangling arp reply on different "
|
||||
"interface: cip'%s'-skb'%s'\n",
|
||||
c->dev->name, out->name);
|
||||
clusterip_config_put(c);
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* mangle reply hardware address */
|
||||
memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
|
||||
|
||||
#ifdef DEBUG
|
||||
pr_debug("mangled arp reply: ");
|
||||
arp_print(payload);
|
||||
#endif
|
||||
|
||||
clusterip_config_put(c);
|
||||
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
static struct nf_hook_ops cip_arp_ops __read_mostly = {
|
||||
.hook = arp_mangle,
|
||||
.pf = NFPROTO_ARP,
|
||||
.hooknum = NF_ARP_OUT,
|
||||
.priority = -1
|
||||
};
|
||||
|
||||
/***********************************************************************
|
||||
* PROC DIR HANDLING
|
||||
***********************************************************************/
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
||||
struct clusterip_seq_position {
|
||||
unsigned int pos; /* position */
|
||||
unsigned int weight; /* number of bits set == size */
|
||||
unsigned int bit; /* current bit */
|
||||
unsigned long val; /* current value */
|
||||
};
|
||||
|
||||
static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
|
||||
{
|
||||
struct clusterip_config *c = s->private;
|
||||
unsigned int weight;
|
||||
u_int32_t local_nodes;
|
||||
struct clusterip_seq_position *idx;
|
||||
|
||||
/* FIXME: possible race */
|
||||
local_nodes = c->local_nodes;
|
||||
weight = hweight32(local_nodes);
|
||||
if (*pos >= weight)
|
||||
return NULL;
|
||||
|
||||
idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
|
||||
if (!idx)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
idx->pos = *pos;
|
||||
idx->weight = weight;
|
||||
idx->bit = ffs(local_nodes);
|
||||
idx->val = local_nodes;
|
||||
clear_bit(idx->bit - 1, &idx->val);
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
|
||||
{
|
||||
struct clusterip_seq_position *idx = v;
|
||||
|
||||
*pos = ++idx->pos;
|
||||
if (*pos >= idx->weight) {
|
||||
kfree(v);
|
||||
return NULL;
|
||||
}
|
||||
idx->bit = ffs(idx->val);
|
||||
clear_bit(idx->bit - 1, &idx->val);
|
||||
return idx;
|
||||
}
|
||||
|
||||
static void clusterip_seq_stop(struct seq_file *s, void *v)
|
||||
{
|
||||
if (!IS_ERR(v))
|
||||
kfree(v);
|
||||
}
|
||||
|
||||
static int clusterip_seq_show(struct seq_file *s, void *v)
|
||||
{
|
||||
struct clusterip_seq_position *idx = v;
|
||||
|
||||
if (idx->pos != 0)
|
||||
seq_putc(s, ',');
|
||||
|
||||
seq_printf(s, "%u", idx->bit);
|
||||
|
||||
if (idx->pos == idx->weight - 1)
|
||||
seq_putc(s, '\n');
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations clusterip_seq_ops = {
|
||||
.start = clusterip_seq_start,
|
||||
.next = clusterip_seq_next,
|
||||
.stop = clusterip_seq_stop,
|
||||
.show = clusterip_seq_show,
|
||||
};
|
||||
|
||||
static int clusterip_proc_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
int ret = seq_open(file, &clusterip_seq_ops);
|
||||
|
||||
if (!ret) {
|
||||
struct seq_file *sf = file->private_data;
|
||||
struct clusterip_config *c = PDE_DATA(inode);
|
||||
|
||||
sf->private = c;
|
||||
|
||||
clusterip_config_get(c);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int clusterip_proc_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct clusterip_config *c = PDE_DATA(inode);
|
||||
int ret;
|
||||
|
||||
ret = seq_release(inode, file);
|
||||
|
||||
if (!ret)
|
||||
clusterip_config_put(c);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
|
||||
size_t size, loff_t *ofs)
|
||||
{
|
||||
struct clusterip_config *c = PDE_DATA(file_inode(file));
|
||||
#define PROC_WRITELEN 10
|
||||
char buffer[PROC_WRITELEN+1];
|
||||
unsigned long nodenum;
|
||||
int rc;
|
||||
|
||||
if (size > PROC_WRITELEN)
|
||||
return -EIO;
|
||||
if (copy_from_user(buffer, input, size))
|
||||
return -EFAULT;
|
||||
buffer[size] = 0;
|
||||
|
||||
if (*buffer == '+') {
|
||||
rc = kstrtoul(buffer+1, 10, &nodenum);
|
||||
if (rc)
|
||||
return rc;
|
||||
if (clusterip_add_node(c, nodenum))
|
||||
return -ENOMEM;
|
||||
} else if (*buffer == '-') {
|
||||
rc = kstrtoul(buffer+1, 10, &nodenum);
|
||||
if (rc)
|
||||
return rc;
|
||||
if (clusterip_del_node(c, nodenum))
|
||||
return -ENOENT;
|
||||
} else
|
||||
return -EIO;
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
static const struct file_operations clusterip_proc_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = clusterip_proc_open,
|
||||
.read = seq_read,
|
||||
.write = clusterip_proc_write,
|
||||
.llseek = seq_lseek,
|
||||
.release = clusterip_proc_release,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
static int clusterip_net_init(struct net *net)
|
||||
{
|
||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||
|
||||
INIT_LIST_HEAD(&cn->configs);
|
||||
|
||||
spin_lock_init(&cn->lock);
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
|
||||
if (!cn->procdir) {
|
||||
pr_err("Unable to proc dir entry\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
#endif /* CONFIG_PROC_FS */
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void clusterip_net_exit(struct net *net)
|
||||
{
|
||||
#ifdef CONFIG_PROC_FS
|
||||
struct clusterip_net *cn = net_generic(net, clusterip_net_id);
|
||||
proc_remove(cn->procdir);
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct pernet_operations clusterip_net_ops = {
|
||||
.init = clusterip_net_init,
|
||||
.exit = clusterip_net_exit,
|
||||
.id = &clusterip_net_id,
|
||||
.size = sizeof(struct clusterip_net),
|
||||
};
|
||||
|
||||
static int __init clusterip_tg_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_pernet_subsys(&clusterip_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = xt_register_target(&clusterip_tg_reg);
|
||||
if (ret < 0)
|
||||
goto cleanup_subsys;
|
||||
|
||||
ret = nf_register_hook(&cip_arp_ops);
|
||||
if (ret < 0)
|
||||
goto cleanup_target;
|
||||
|
||||
pr_info("ClusterIP Version %s loaded successfully\n",
|
||||
CLUSTERIP_VERSION);
|
||||
|
||||
return 0;
|
||||
|
||||
cleanup_target:
|
||||
xt_unregister_target(&clusterip_tg_reg);
|
||||
cleanup_subsys:
|
||||
unregister_pernet_subsys(&clusterip_net_ops);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit clusterip_tg_exit(void)
|
||||
{
|
||||
pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
|
||||
|
||||
nf_unregister_hook(&cip_arp_ops);
|
||||
xt_unregister_target(&clusterip_tg_reg);
|
||||
unregister_pernet_subsys(&clusterip_net_ops);
|
||||
|
||||
/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
|
||||
rcu_barrier_bh();
|
||||
}
|
||||
|
||||
module_init(clusterip_tg_init);
|
||||
module_exit(clusterip_tg_exit);
|
||||
138
net/ipv4/netfilter/ipt_ECN.c
Normal file
138
net/ipv4/netfilter/ipt_ECN.c
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
|
||||
*
|
||||
* (C) 2002 by Harald Welte <laforge@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/in.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/ip.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <net/checksum.h>
|
||||
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/netfilter_ipv4/ipt_ECN.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Xtables: Explicit Congestion Notification (ECN) flag modification");
|
||||
|
||||
/* set ECT codepoint from IP header.
|
||||
* return false if there was an error. */
|
||||
static inline bool
|
||||
set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
|
||||
{
|
||||
struct iphdr *iph = ip_hdr(skb);
|
||||
|
||||
if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
|
||||
__u8 oldtos;
|
||||
if (!skb_make_writable(skb, sizeof(struct iphdr)))
|
||||
return false;
|
||||
iph = ip_hdr(skb);
|
||||
oldtos = iph->tos;
|
||||
iph->tos &= ~IPT_ECN_IP_MASK;
|
||||
iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
|
||||
csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Return false if there was an error. */
|
||||
static inline bool
|
||||
set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
|
||||
{
|
||||
struct tcphdr _tcph, *tcph;
|
||||
__be16 oldval;
|
||||
|
||||
/* Not enough header? */
|
||||
tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
|
||||
if (!tcph)
|
||||
return false;
|
||||
|
||||
if ((!(einfo->operation & IPT_ECN_OP_SET_ECE) ||
|
||||
tcph->ece == einfo->proto.tcp.ece) &&
|
||||
(!(einfo->operation & IPT_ECN_OP_SET_CWR) ||
|
||||
tcph->cwr == einfo->proto.tcp.cwr))
|
||||
return true;
|
||||
|
||||
if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
|
||||
return false;
|
||||
tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
|
||||
|
||||
oldval = ((__be16 *)tcph)[6];
|
||||
if (einfo->operation & IPT_ECN_OP_SET_ECE)
|
||||
tcph->ece = einfo->proto.tcp.ece;
|
||||
if (einfo->operation & IPT_ECN_OP_SET_CWR)
|
||||
tcph->cwr = einfo->proto.tcp.cwr;
|
||||
|
||||
inet_proto_csum_replace2(&tcph->check, skb,
|
||||
oldval, ((__be16 *)tcph)[6], 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
ecn_tg(struct sk_buff *skb, const struct xt_action_param *par)
|
||||
{
|
||||
const struct ipt_ECN_info *einfo = par->targinfo;
|
||||
|
||||
if (einfo->operation & IPT_ECN_OP_SET_IP)
|
||||
if (!set_ect_ip(skb, einfo))
|
||||
return NF_DROP;
|
||||
|
||||
if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
|
||||
ip_hdr(skb)->protocol == IPPROTO_TCP)
|
||||
if (!set_ect_tcp(skb, einfo))
|
||||
return NF_DROP;
|
||||
|
||||
return XT_CONTINUE;
|
||||
}
|
||||
|
||||
static int ecn_tg_check(const struct xt_tgchk_param *par)
|
||||
{
|
||||
const struct ipt_ECN_info *einfo = par->targinfo;
|
||||
const struct ipt_entry *e = par->entryinfo;
|
||||
|
||||
if (einfo->operation & IPT_ECN_OP_MASK) {
|
||||
pr_info("unsupported ECN operation %x\n", einfo->operation);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
|
||||
pr_info("new ECT codepoint %x out of mask\n", einfo->ip_ect);
|
||||
return -EINVAL;
|
||||
}
|
||||
if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
|
||||
(e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
|
||||
pr_info("cannot use TCP operations on a non-tcp rule\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct xt_target ecn_tg_reg __read_mostly = {
|
||||
.name = "ECN",
|
||||
.family = NFPROTO_IPV4,
|
||||
.target = ecn_tg,
|
||||
.targetsize = sizeof(struct ipt_ECN_info),
|
||||
.table = "mangle",
|
||||
.checkentry = ecn_tg_check,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init ecn_tg_init(void)
|
||||
{
|
||||
return xt_register_target(&ecn_tg_reg);
|
||||
}
|
||||
|
||||
static void __exit ecn_tg_exit(void)
|
||||
{
|
||||
xt_unregister_target(&ecn_tg_reg);
|
||||
}
|
||||
|
||||
module_init(ecn_tg_init);
|
||||
module_exit(ecn_tg_exit);
|
||||
91
net/ipv4/netfilter/ipt_MASQUERADE.c
Normal file
91
net/ipv4/netfilter/ipt_MASQUERADE.c
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
/* Masquerade. Simple mapping which alters range to a local IP address
|
||||
(depending on route). */
|
||||
|
||||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/types.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/route.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/ipv4/nf_nat_masquerade.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
|
||||
|
||||
/* FIXME: Multiple targets. --RR */
|
||||
static int masquerade_tg_check(const struct xt_tgchk_param *par)
|
||||
{
|
||||
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
|
||||
|
||||
if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
|
||||
pr_debug("bad MAP_IPS.\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (mr->rangesize != 1) {
|
||||
pr_debug("bad rangesize %u\n", mr->rangesize);
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
|
||||
{
|
||||
struct nf_nat_range range;
|
||||
const struct nf_nat_ipv4_multi_range_compat *mr;
|
||||
|
||||
mr = par->targinfo;
|
||||
range.flags = mr->range[0].flags;
|
||||
range.min_proto = mr->range[0].min;
|
||||
range.max_proto = mr->range[0].max;
|
||||
|
||||
return nf_nat_masquerade_ipv4(skb, par->hooknum, &range, par->out);
|
||||
}
|
||||
|
||||
static struct xt_target masquerade_tg_reg __read_mostly = {
|
||||
.name = "MASQUERADE",
|
||||
.family = NFPROTO_IPV4,
|
||||
.target = masquerade_tg,
|
||||
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
|
||||
.table = "nat",
|
||||
.hooks = 1 << NF_INET_POST_ROUTING,
|
||||
.checkentry = masquerade_tg_check,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init masquerade_tg_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = xt_register_target(&masquerade_tg_reg);
|
||||
|
||||
if (ret == 0)
|
||||
nf_nat_masquerade_ipv4_register_notifier();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit masquerade_tg_exit(void)
|
||||
{
|
||||
xt_unregister_target(&masquerade_tg_reg);
|
||||
nf_nat_masquerade_ipv4_unregister_notifier();
|
||||
}
|
||||
|
||||
module_init(masquerade_tg_init);
|
||||
module_exit(masquerade_tg_exit);
|
||||
112
net/ipv4/netfilter/ipt_REJECT.c
Normal file
112
net/ipv4/netfilter/ipt_REJECT.c
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* This is a module which is used for rejecting packets.
|
||||
*/
|
||||
|
||||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/udp.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <net/icmp.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/netfilter_ipv4/ipt_REJECT.h>
|
||||
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
|
||||
#include <linux/netfilter_bridge.h>
|
||||
#endif
|
||||
|
||||
#include <net/netfilter/ipv4/nf_reject.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv4");
|
||||
|
||||
/* Target hook: answer the matched packet with the configured ICMP error
 * (or a TCP RST) and then drop the original.
 */
static unsigned int
reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
        const struct ipt_reject_info *reject = par->targinfo;

        switch (reject->with) {
        case IPT_ICMP_NET_UNREACHABLE:
                nf_send_unreach(skb, ICMP_NET_UNREACH);
                break;
        case IPT_ICMP_HOST_UNREACHABLE:
                nf_send_unreach(skb, ICMP_HOST_UNREACH);
                break;
        case IPT_ICMP_PROT_UNREACHABLE:
                nf_send_unreach(skb, ICMP_PROT_UNREACH);
                break;
        case IPT_ICMP_PORT_UNREACHABLE:
                nf_send_unreach(skb, ICMP_PORT_UNREACH);
                break;
        case IPT_ICMP_NET_PROHIBITED:
                nf_send_unreach(skb, ICMP_NET_ANO);
                break;
        case IPT_ICMP_HOST_PROHIBITED:
                nf_send_unreach(skb, ICMP_HOST_ANO);
                break;
        case IPT_ICMP_ADMIN_PROHIBITED:
                nf_send_unreach(skb, ICMP_PKT_FILTERED);
                break;
        case IPT_TCP_RESET:
                nf_send_reset(skb, par->hooknum);
                /* fall through - the RST needs no further action */
        case IPT_ICMP_ECHOREPLY:
                /* Doesn't happen. */
                /* (ECHOREPLY is rejected at checkentry time, see
                 * reject_tg_check())
                 */
                break;
        }

        /* The matched packet itself is always dropped. */
        return NF_DROP;
}
|
||||
|
||||
static int reject_tg_check(const struct xt_tgchk_param *par)
|
||||
{
|
||||
const struct ipt_reject_info *rejinfo = par->targinfo;
|
||||
const struct ipt_entry *e = par->entryinfo;
|
||||
|
||||
if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
|
||||
pr_info("ECHOREPLY no longer supported.\n");
|
||||
return -EINVAL;
|
||||
} else if (rejinfo->with == IPT_TCP_RESET) {
|
||||
/* Must specify that it's a TCP packet */
|
||||
if (e->ip.proto != IPPROTO_TCP ||
|
||||
(e->ip.invflags & XT_INV_PROTO)) {
|
||||
pr_info("TCP_RESET invalid for non-tcp\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct xt_target reject_tg_reg __read_mostly = {
|
||||
.name = "REJECT",
|
||||
.family = NFPROTO_IPV4,
|
||||
.target = reject_tg,
|
||||
.targetsize = sizeof(struct ipt_reject_info),
|
||||
.table = "filter",
|
||||
.hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) |
|
||||
(1 << NF_INET_LOCAL_OUT),
|
||||
.checkentry = reject_tg_check,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init reject_tg_init(void)
|
||||
{
|
||||
return xt_register_target(&reject_tg_reg);
|
||||
}
|
||||
|
||||
static void __exit reject_tg_exit(void)
|
||||
{
|
||||
xt_unregister_target(&reject_tg_reg);
|
||||
}
|
||||
|
||||
module_init(reject_tg_init);
|
||||
module_exit(reject_tg_exit);
|
||||
482
net/ipv4/netfilter/ipt_SYNPROXY.c
Normal file
482
net/ipv4/netfilter/ipt_SYNPROXY.c
Normal file
|
|
@ -0,0 +1,482 @@
|
|||
/*
|
||||
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <linux/netfilter/xt_SYNPROXY.h>
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#include <net/netfilter/nf_conntrack_seqadj.h>
|
||||
#include <net/netfilter/nf_conntrack_synproxy.h>
|
||||
|
||||
static struct iphdr *
|
||||
synproxy_build_ip(struct sk_buff *skb, u32 saddr, u32 daddr)
|
||||
{
|
||||
struct iphdr *iph;
|
||||
|
||||
skb_reset_network_header(skb);
|
||||
iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
|
||||
iph->version = 4;
|
||||
iph->ihl = sizeof(*iph) / 4;
|
||||
iph->tos = 0;
|
||||
iph->id = 0;
|
||||
iph->frag_off = htons(IP_DF);
|
||||
iph->ttl = sysctl_ip_default_ttl;
|
||||
iph->protocol = IPPROTO_TCP;
|
||||
iph->check = 0;
|
||||
iph->saddr = saddr;
|
||||
iph->daddr = daddr;
|
||||
|
||||
return iph;
|
||||
}
|
||||
|
||||
/* Finalize and transmit a SYNPROXY-generated TCP segment: seed the
 * partial TCP checksum, route the skb, optionally attach conntrack
 * state, and hand it to ip_local_out().
 */
static void
synproxy_send_tcp(const struct sk_buff *skb, struct sk_buff *nskb,
                  struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
                  struct iphdr *niph, struct tcphdr *nth,
                  unsigned int tcp_hdr_size)
{
        /* Pseudo-header checksum; the rest is completed later because
         * ip_summed is CHECKSUM_PARTIAL.
         */
        nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
        nskb->ip_summed = CHECKSUM_PARTIAL;
        nskb->csum_start = (unsigned char *)nth - nskb->head;
        nskb->csum_offset = offsetof(struct tcphdr, check);

        /* Borrow the original packet's dst without taking a reference. */
        skb_dst_set_noref(nskb, skb_dst(skb));
        nskb->protocol = htons(ETH_P_IP);
        if (ip_route_me_harder(nskb, RTN_UNSPEC))
                goto free_nskb;

        /* Attach conntrack state when the caller supplied one. */
        if (nfct) {
                nskb->nfct = nfct;
                nskb->nfctinfo = ctinfo;
                nf_conntrack_get(nfct);
        }

        ip_local_out(nskb);
        return;

free_nskb:
        kfree_skb(nskb);
}
|
||||
|
||||
/* Answer a client SYN with a cookie-encoded SYN/ACK on behalf of the
 * (not yet contacted) server.  window=0 keeps the client from sending
 * data before the real server connection exists.
 */
static void
synproxy_send_client_synack(const struct sk_buff *skb, const struct tcphdr *th,
                            const struct synproxy_options *opts)
{
        struct sk_buff *nskb;
        struct iphdr *iph, *niph;
        struct tcphdr *nth;
        unsigned int tcp_hdr_size;
        u16 mss = opts->mss;

        iph = ip_hdr(skb);

        tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
        nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
                         GFP_ATOMIC);
        if (nskb == NULL)
                return;
        skb_reserve(nskb, MAX_TCP_HEADER);

        /* Addresses reversed: this reply goes back to the client. */
        niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);

        skb_reset_transport_header(nskb);
        nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
        nth->source = th->dest;
        nth->dest = th->source;
        /* The ISN is a SYN cookie - no state is kept for the client yet. */
        nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
        nth->ack_seq = htonl(ntohl(th->seq) + 1);
        tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
        if (opts->options & XT_SYNPROXY_OPT_ECN)
                tcp_flag_word(nth) |= TCP_FLAG_ECE;
        nth->doff = tcp_hdr_size / 4;
        nth->window = 0;
        nth->check = 0;
        nth->urg_ptr = 0;

        synproxy_build_options(nth, opts);

        synproxy_send_tcp(skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
                          niph, nth, tcp_hdr_size);
}
|
||||
|
||||
/* After the client's cookie ACK validated, open the real connection:
 * send a SYN to the server replaying the client's original ISN
 * (recv_seq - 1), attached to the template conntrack as a NEW flow.
 */
static void
synproxy_send_server_syn(const struct synproxy_net *snet,
                         const struct sk_buff *skb, const struct tcphdr *th,
                         const struct synproxy_options *opts, u32 recv_seq)
{
        struct sk_buff *nskb;
        struct iphdr *iph, *niph;
        struct tcphdr *nth;
        unsigned int tcp_hdr_size;

        iph = ip_hdr(skb);

        tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
        nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
                         GFP_ATOMIC);
        if (nskb == NULL)
                return;
        skb_reserve(nskb, MAX_TCP_HEADER);

        /* Same direction as the client's packet: towards the server. */
        niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);

        skb_reset_transport_header(nskb);
        nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
        nth->source = th->source;
        nth->dest = th->dest;
        nth->seq = htonl(recv_seq - 1);
        /* ack_seq is used to relay our ISN to the synproxy hook to initialize
         * sequence number translation once a connection tracking entry exists.
         */
        nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
        tcp_flag_word(nth) = TCP_FLAG_SYN;
        if (opts->options & XT_SYNPROXY_OPT_ECN)
                tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
        nth->doff = tcp_hdr_size / 4;
        nth->window = th->window;
        nth->check = 0;
        nth->urg_ptr = 0;

        synproxy_build_options(nth, opts);

        synproxy_send_tcp(skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
                          niph, nth, tcp_hdr_size);
}
|
||||
|
||||
/* Complete the server-side three-way handshake: ACK the server's
 * SYN/ACK, advertising the window the client originally offered
 * (td_maxwin from the conntrack TCP state).  Sent without conntrack
 * attached.
 */
static void
synproxy_send_server_ack(const struct synproxy_net *snet,
                         const struct ip_ct_tcp *state,
                         const struct sk_buff *skb, const struct tcphdr *th,
                         const struct synproxy_options *opts)
{
        struct sk_buff *nskb;
        struct iphdr *iph, *niph;
        struct tcphdr *nth;
        unsigned int tcp_hdr_size;

        iph = ip_hdr(skb);

        tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
        nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
                         GFP_ATOMIC);
        if (nskb == NULL)
                return;
        skb_reserve(nskb, MAX_TCP_HEADER);

        /* Reply direction: back towards the server. */
        niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);

        skb_reset_transport_header(nskb);
        nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
        nth->source = th->dest;
        nth->dest = th->source;
        nth->seq = htonl(ntohl(th->ack_seq));
        nth->ack_seq = htonl(ntohl(th->seq) + 1);
        tcp_flag_word(nth) = TCP_FLAG_ACK;
        nth->doff = tcp_hdr_size / 4;
        nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
        nth->check = 0;
        nth->urg_ptr = 0;

        synproxy_build_options(nth, opts);

        synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
|
||||
|
||||
static void
|
||||
synproxy_send_client_ack(const struct synproxy_net *snet,
|
||||
const struct sk_buff *skb, const struct tcphdr *th,
|
||||
const struct synproxy_options *opts)
|
||||
{
|
||||
struct sk_buff *nskb;
|
||||
struct iphdr *iph, *niph;
|
||||
struct tcphdr *nth;
|
||||
unsigned int tcp_hdr_size;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
|
||||
tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
|
||||
nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
|
||||
GFP_ATOMIC);
|
||||
if (nskb == NULL)
|
||||
return;
|
||||
skb_reserve(nskb, MAX_TCP_HEADER);
|
||||
|
||||
niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
|
||||
|
||||
skb_reset_transport_header(nskb);
|
||||
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
|
||||
nth->source = th->source;
|
||||
nth->dest = th->dest;
|
||||
nth->seq = htonl(ntohl(th->seq) + 1);
|
||||
nth->ack_seq = th->ack_seq;
|
||||
tcp_flag_word(nth) = TCP_FLAG_ACK;
|
||||
nth->doff = tcp_hdr_size / 4;
|
||||
nth->window = ntohs(htons(th->window) >> opts->wscale);
|
||||
nth->check = 0;
|
||||
nth->urg_ptr = 0;
|
||||
|
||||
synproxy_build_options(nth, opts);
|
||||
|
||||
synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
|
||||
}
|
||||
|
||||
static bool
|
||||
synproxy_recv_client_ack(const struct synproxy_net *snet,
|
||||
const struct sk_buff *skb, const struct tcphdr *th,
|
||||
struct synproxy_options *opts, u32 recv_seq)
|
||||
{
|
||||
int mss;
|
||||
|
||||
mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
|
||||
if (mss == 0) {
|
||||
this_cpu_inc(snet->stats->cookie_invalid);
|
||||
return false;
|
||||
}
|
||||
|
||||
this_cpu_inc(snet->stats->cookie_valid);
|
||||
opts->mss = mss;
|
||||
opts->options |= XT_SYNPROXY_OPT_MSS;
|
||||
|
||||
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
|
||||
synproxy_check_timestamp_cookie(opts);
|
||||
|
||||
synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Target hook for new connections: the client-facing half of the SYN
 * proxy.  Client SYNs are answered locally with a cookie SYN/ACK; a
 * valid cookie ACK triggers the real server handshake.  Everything the
 * proxy answers itself is dropped here so it never reaches conntrack.
 */
static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
        const struct xt_synproxy_info *info = par->targinfo;
        struct synproxy_net *snet = synproxy_pernet(dev_net(par->in));
        struct synproxy_options opts = {};
        struct tcphdr *th, _th;

        /* Drop packets with a bad TCP checksum before doing any work. */
        if (nf_ip_checksum(skb, par->hooknum, par->thoff, IPPROTO_TCP))
                return NF_DROP;

        th = skb_header_pointer(skb, par->thoff, sizeof(_th), &_th);
        if (th == NULL)
                return NF_DROP;

        if (!synproxy_parse_options(skb, par->thoff, th, &opts))
                return NF_DROP;

        if (th->syn && !(th->ack || th->fin || th->rst)) {
                /* Initial SYN from client */
                this_cpu_inc(snet->stats->syn_received);

                if (th->ece && th->cwr)
                        opts.options |= XT_SYNPROXY_OPT_ECN;

                /* Only keep options the rule was configured to support;
                 * without timestamps, the cookie cannot encode
                 * wscale/SACK/ECN, so strip them.
                 */
                opts.options &= info->options;
                if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
                        synproxy_init_timestamp_cookie(info, &opts);
                else
                        opts.options &= ~(XT_SYNPROXY_OPT_WSCALE |
                                          XT_SYNPROXY_OPT_SACK_PERM |
                                          XT_SYNPROXY_OPT_ECN);

                synproxy_send_client_synack(skb, th, &opts);
                return NF_DROP;

        } else if (th->ack && !(th->fin || th->rst || th->syn)) {
                /* ACK from client */
                synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
                return NF_DROP;
        }

        /* Anything else falls through to normal rule processing. */
        return XT_CONTINUE;
}
|
||||
|
||||
/* Netfilter hook (just before conntrack confirm): the server-facing
 * half of the SYN proxy.  Tracks the proxied handshake through the
 * conntrack TCP state machine, sets up sequence-number and timestamp
 * translation, and completes the handshake towards both peers.
 */
static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
                                       struct sk_buff *skb,
                                       const struct net_device *in,
                                       const struct net_device *out,
                                       int (*okfn)(struct sk_buff *))
{
        struct synproxy_net *snet = synproxy_pernet(dev_net(in ? : out));
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct;
        struct nf_conn_synproxy *synproxy;
        struct synproxy_options opts = {};
        const struct ip_ct_tcp *state;
        struct tcphdr *th, _th;
        unsigned int thoff;

        /* Only packets with synproxy conntrack state are of interest. */
        ct = nf_ct_get(skb, &ctinfo);
        if (ct == NULL)
                return NF_ACCEPT;

        synproxy = nfct_synproxy(ct);
        if (synproxy == NULL)
                return NF_ACCEPT;

        if (nf_is_loopback_packet(skb))
                return NF_ACCEPT;

        thoff = ip_hdrlen(skb);
        th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
        if (th == NULL)
                return NF_DROP;

        state = &ct->proto.tcp;
        switch (state->state) {
        case TCP_CONNTRACK_CLOSE:
                /* RST before any reply: initialize seq adjustment from
                 * the stored ISN so the RST is translated correctly.
                 */
                if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
                        nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
                                                      ntohl(th->seq) + 1);
                        break;
                }

                if (!th->syn || th->ack ||
                    CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
                        break;

                /* Reopened connection - reset the sequence number and timestamp
                 * adjustments, they will get initialized once the connection is
                 * reestablished.
                 */
                nf_ct_seqadj_init(ct, ctinfo, 0);
                synproxy->tsoff = 0;
                this_cpu_inc(snet->stats->conn_reopened);

                /* fall through */
        case TCP_CONNTRACK_SYN_SENT:
                if (!synproxy_parse_options(skb, thoff, th, &opts))
                        return NF_DROP;

                if (!th->syn && th->ack &&
                    CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
                        /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
                         * therefore we need to add 1 to make the SYN sequence
                         * number match the one of first SYN.
                         */
                        if (synproxy_recv_client_ack(snet, skb, th, &opts,
                                                     ntohl(th->seq) + 1))
                                this_cpu_inc(snet->stats->cookie_retrans);

                        return NF_DROP;
                }

                /* Our SYN towards the server relays the ISN in ack_seq;
                 * remember it for later sequence adjustment.
                 */
                synproxy->isn = ntohl(th->ack_seq);
                if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
                        synproxy->its = opts.tsecr;
                break;
        case TCP_CONNTRACK_SYN_RECV:
                /* Waiting for the server's SYN/ACK. */
                if (!th->syn || !th->ack)
                        break;

                if (!synproxy_parse_options(skb, thoff, th, &opts))
                        return NF_DROP;

                if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
                        synproxy->tsoff = opts.tsval - synproxy->its;

                /* These options were already fixed towards the client. */
                opts.options &= ~(XT_SYNPROXY_OPT_MSS |
                                  XT_SYNPROXY_OPT_WSCALE |
                                  XT_SYNPROXY_OPT_SACK_PERM);

                /* ACK the server (timestamps mirrored for each peer)... */
                swap(opts.tsval, opts.tsecr);
                synproxy_send_server_ack(snet, state, skb, th, &opts);

                nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));

                /* ...then unblock the client with a window update. */
                swap(opts.tsval, opts.tsecr);
                synproxy_send_client_ack(snet, skb, th, &opts);

                /* The server's SYN/ACK itself must not be forwarded. */
                consume_skb(skb);
                return NF_STOLEN;
        default:
                break;
        }

        synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
        return NF_ACCEPT;
}
|
||||
|
||||
static int synproxy_tg4_check(const struct xt_tgchk_param *par)
|
||||
{
|
||||
const struct ipt_entry *e = par->entryinfo;
|
||||
|
||||
if (e->ip.proto != IPPROTO_TCP ||
|
||||
e->ip.invflags & XT_INV_PROTO)
|
||||
return -EINVAL;
|
||||
|
||||
return nf_ct_l3proto_try_module_get(par->family);
|
||||
}
|
||||
|
||||
static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
|
||||
{
|
||||
nf_ct_l3proto_module_put(par->family);
|
||||
}
|
||||
|
||||
static struct xt_target synproxy_tg4_reg __read_mostly = {
|
||||
.name = "SYNPROXY",
|
||||
.family = NFPROTO_IPV4,
|
||||
.hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD),
|
||||
.target = synproxy_tg4,
|
||||
.targetsize = sizeof(struct xt_synproxy_info),
|
||||
.checkentry = synproxy_tg4_check,
|
||||
.destroy = synproxy_tg4_destroy,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = {
|
||||
{
|
||||
.hook = ipv4_synproxy_hook,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_LOCAL_IN,
|
||||
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_synproxy_hook,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_POST_ROUTING,
|
||||
.priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
|
||||
},
|
||||
};
|
||||
|
||||
static int __init synproxy_tg4_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = nf_register_hooks(ipv4_synproxy_ops,
|
||||
ARRAY_SIZE(ipv4_synproxy_ops));
|
||||
if (err < 0)
|
||||
goto err1;
|
||||
|
||||
err = xt_register_target(&synproxy_tg4_reg);
|
||||
if (err < 0)
|
||||
goto err2;
|
||||
|
||||
return 0;
|
||||
|
||||
err2:
|
||||
nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
|
||||
err1:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void __exit synproxy_tg4_exit(void)
|
||||
{
|
||||
xt_unregister_target(&synproxy_tg4_reg);
|
||||
nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops));
|
||||
}
|
||||
|
||||
module_init(synproxy_tg4_init);
|
||||
module_exit(synproxy_tg4_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
91
net/ipv4/netfilter/ipt_ah.c
Normal file
91
net/ipv4/netfilter/ipt_ah.c
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
/* Kernel module to match AH parameters. */
|
||||
/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/in.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
|
||||
#include <linux/netfilter_ipv4/ipt_ah.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
|
||||
MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match");
|
||||
|
||||
/* Returns 1 if the spi is matched by the range, 0 otherwise */
|
||||
static inline bool
|
||||
spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert)
|
||||
{
|
||||
bool r;
|
||||
pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n",
|
||||
invert ? '!' : ' ', min, spi, max);
|
||||
r=(spi >= min && spi <= max) ^ invert;
|
||||
pr_debug(" result %s\n", r ? "PASS" : "FAILED");
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
|
||||
{
|
||||
struct ip_auth_hdr _ahdr;
|
||||
const struct ip_auth_hdr *ah;
|
||||
const struct ipt_ah *ahinfo = par->matchinfo;
|
||||
|
||||
/* Must not be a fragment. */
|
||||
if (par->fragoff != 0)
|
||||
return false;
|
||||
|
||||
ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr);
|
||||
if (ah == NULL) {
|
||||
/* We've been asked to examine this packet, and we
|
||||
* can't. Hence, no choice but to drop.
|
||||
*/
|
||||
pr_debug("Dropping evil AH tinygram.\n");
|
||||
par->hotdrop = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return spi_match(ahinfo->spis[0], ahinfo->spis[1],
|
||||
ntohl(ah->spi),
|
||||
!!(ahinfo->invflags & IPT_AH_INV_SPI));
|
||||
}
|
||||
|
||||
static int ah_mt_check(const struct xt_mtchk_param *par)
|
||||
{
|
||||
const struct ipt_ah *ahinfo = par->matchinfo;
|
||||
|
||||
/* Must specify no unknown invflags */
|
||||
if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
|
||||
pr_debug("unknown flags %X\n", ahinfo->invflags);
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct xt_match ah_mt_reg __read_mostly = {
|
||||
.name = "ah",
|
||||
.family = NFPROTO_IPV4,
|
||||
.match = ah_mt,
|
||||
.matchsize = sizeof(struct ipt_ah),
|
||||
.proto = IPPROTO_AH,
|
||||
.checkentry = ah_mt_check,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init ah_mt_init(void)
|
||||
{
|
||||
return xt_register_match(&ah_mt_reg);
|
||||
}
|
||||
|
||||
static void __exit ah_mt_exit(void)
|
||||
{
|
||||
xt_unregister_match(&ah_mt_reg);
|
||||
}
|
||||
|
||||
module_init(ah_mt_init);
|
||||
module_exit(ah_mt_exit);
|
||||
144
net/ipv4/netfilter/ipt_rpfilter.c
Normal file
144
net/ipv4/netfilter/ipt_rpfilter.c
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
/*
|
||||
* Copyright (c) 2011 Florian Westphal <fw@strlen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/ip_fib.h>
|
||||
#include <net/route.h>
|
||||
|
||||
#include <linux/netfilter/xt_rpfilter.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
|
||||
MODULE_DESCRIPTION("iptables: ipv4 reverse path filter match");
|
||||
|
||||
/* don't try to find route from mcast/bcast/zeronet */
|
||||
static __be32 rpfilter_get_saddr(__be32 addr)
|
||||
{
|
||||
if (ipv4_is_multicast(addr) || ipv4_is_lbcast(addr) ||
|
||||
ipv4_is_zeronet(addr))
|
||||
return 0;
|
||||
return addr;
|
||||
}
|
||||
|
||||
/* Reverse-path lookup: true when a FIB lookup of the (reversed) flow
 * routes back through @dev, or when the flags relax the check
 * (ACCEPT_LOCAL for local routes, LOOSE for any-device matches).
 */
static bool rpfilter_lookup_reverse(struct flowi4 *fl4,
                                    const struct net_device *dev, u8 flags)
{
        struct fib_result res;
        bool dev_match;
        struct net *net = dev_net(dev);
        int ret __maybe_unused;  /* only used as loop index w/ multipath */

        /* No route back at all -> fail the rp_filter test. */
        if (fib_lookup(net, fl4, &res))
                return false;

        /* Only unicast routes count, unless local routes are accepted. */
        if (res.type != RTN_UNICAST) {
                if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
                        return false;
        }
        dev_match = false;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        /* With multipath, any next-hop on @dev is good enough. */
        for (ret = 0; ret < res.fi->fib_nhs; ret++) {
                struct fib_nh *nh = &res.fi->fib_nh[ret];

                if (nh->nh_dev == dev) {
                        dev_match = true;
                        break;
                }
        }
#else
        if (FIB_RES_DEV(res) == dev)
                dev_match = true;
#endif
        /* In loose mode (or on a device match) also require the
         * next-hop scope to be at most host scope.
         */
        if (dev_match || flags & XT_RPFILTER_LOOSE)
                return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST;
        return dev_match;
}
|
||||
|
||||
static bool rpfilter_is_local(const struct sk_buff *skb)
|
||||
{
|
||||
const struct rtable *rt = skb_rtable(skb);
|
||||
return rt && (rt->rt_flags & RTCF_LOCAL);
|
||||
}
|
||||
|
||||
static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
|
||||
{
|
||||
const struct xt_rpfilter_info *info;
|
||||
const struct iphdr *iph;
|
||||
struct flowi4 flow;
|
||||
bool invert;
|
||||
|
||||
info = par->matchinfo;
|
||||
invert = info->flags & XT_RPFILTER_INVERT;
|
||||
|
||||
if (rpfilter_is_local(skb))
|
||||
return true ^ invert;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
if (ipv4_is_multicast(iph->daddr)) {
|
||||
if (ipv4_is_zeronet(iph->saddr))
|
||||
return ipv4_is_local_multicast(iph->daddr) ^ invert;
|
||||
}
|
||||
flow.flowi4_iif = LOOPBACK_IFINDEX;
|
||||
flow.daddr = iph->saddr;
|
||||
flow.saddr = rpfilter_get_saddr(iph->daddr);
|
||||
flow.flowi4_oif = 0;
|
||||
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
|
||||
flow.flowi4_tos = RT_TOS(iph->tos);
|
||||
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
|
||||
|
||||
return rpfilter_lookup_reverse(&flow, par->in, info->flags) ^ invert;
|
||||
}
|
||||
|
||||
static int rpfilter_check(const struct xt_mtchk_param *par)
|
||||
{
|
||||
const struct xt_rpfilter_info *info = par->matchinfo;
|
||||
unsigned int options = ~XT_RPFILTER_OPTION_MASK;
|
||||
if (info->flags & options) {
|
||||
pr_info("unknown options encountered");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (strcmp(par->table, "mangle") != 0 &&
|
||||
strcmp(par->table, "raw") != 0) {
|
||||
pr_info("match only valid in the \'raw\' "
|
||||
"or \'mangle\' tables, not \'%s\'.\n", par->table);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct xt_match rpfilter_mt_reg __read_mostly = {
|
||||
.name = "rpfilter",
|
||||
.family = NFPROTO_IPV4,
|
||||
.checkentry = rpfilter_check,
|
||||
.match = rpfilter_mt,
|
||||
.matchsize = sizeof(struct xt_rpfilter_info),
|
||||
.hooks = (1 << NF_INET_PRE_ROUTING),
|
||||
.me = THIS_MODULE
|
||||
};
|
||||
|
||||
static int __init rpfilter_mt_init(void)
|
||||
{
|
||||
return xt_register_match(&rpfilter_mt_reg);
|
||||
}
|
||||
|
||||
static void __exit rpfilter_mt_exit(void)
|
||||
{
|
||||
xt_unregister_match(&rpfilter_mt_reg);
|
||||
}
|
||||
|
||||
module_init(rpfilter_mt_init);
|
||||
module_exit(rpfilter_mt_exit);
|
||||
111
net/ipv4/netfilter/iptable_filter.c
Normal file
111
net/ipv4/netfilter/iptable_filter.c
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
/*
|
||||
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
|
||||
*
|
||||
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
|
||||
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
|
||||
MODULE_DESCRIPTION("iptables filter table");
|
||||
|
||||
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
|
||||
(1 << NF_INET_FORWARD) | \
|
||||
(1 << NF_INET_LOCAL_OUT))
|
||||
|
||||
static const struct xt_table packet_filter = {
|
||||
.name = "filter",
|
||||
.valid_hooks = FILTER_VALID_HOOKS,
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_IPV4,
|
||||
.priority = NF_IP_PRI_FILTER,
|
||||
};
|
||||
|
||||
static unsigned int
|
||||
iptable_filter_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
|
||||
const struct net_device *in, const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
const struct net *net;
|
||||
|
||||
if (ops->hooknum == NF_INET_LOCAL_OUT &&
|
||||
(skb->len < sizeof(struct iphdr) ||
|
||||
ip_hdrlen(skb) < sizeof(struct iphdr)))
|
||||
/* root is playing with raw sockets. */
|
||||
return NF_ACCEPT;
|
||||
|
||||
net = dev_net((in != NULL) ? in : out);
|
||||
return ipt_do_table(skb, ops->hooknum, in, out,
|
||||
net->ipv4.iptable_filter);
|
||||
}
|
||||
|
||||
/* Hook registration handle, shared between init and exit. */
static struct nf_hook_ops *filter_ops __read_mostly;

/* Default to forward because I got too much mail already. */
static bool forward = true;
module_param(forward, bool, 0000);

/* Per-netns init: build the initial (empty) filter table, patching the
 * FORWARD chain's default policy from the "forward" module parameter.
 */
static int __net_init iptable_filter_net_init(struct net *net)
{
        struct ipt_replace *repl;

        repl = ipt_alloc_initial_table(&packet_filter);
        if (repl == NULL)
                return -ENOMEM;
        /* Entry 1 is the FORWARD hook */
        /* Standard verdicts are encoded as -(verdict) - 1. */
        ((struct ipt_standard *)repl->entries)[1].target.verdict =
                forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;

        net->ipv4.iptable_filter =
                ipt_register_table(net, &packet_filter, repl);
        /* ipt_register_table() copied the template; free our copy. */
        kfree(repl);
        return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
}

/* Per-netns teardown: drop the namespace's filter table. */
static void __net_exit iptable_filter_net_exit(struct net *net)
{
        ipt_unregister_table(net, net->ipv4.iptable_filter);
}

static struct pernet_operations iptable_filter_net_ops = {
        .init = iptable_filter_net_init,
        .exit = iptable_filter_net_exit,
};

/* Module init: per-netns tables first, then the netfilter hooks;
 * unwinds the pernet registration if hooking fails.
 */
static int __init iptable_filter_init(void)
{
        int ret;

        ret = register_pernet_subsys(&iptable_filter_net_ops);
        if (ret < 0)
                return ret;

        /* Register hooks */
        filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
        if (IS_ERR(filter_ops)) {
                ret = PTR_ERR(filter_ops);
                unregister_pernet_subsys(&iptable_filter_net_ops);
        }

        return ret;
}

/* Module exit: unhook first, then drop the per-netns tables. */
static void __exit iptable_filter_fini(void)
{
        xt_hook_unlink(&packet_filter, filter_ops);
        unregister_pernet_subsys(&iptable_filter_net_ops);
}

module_init(iptable_filter_init);
module_exit(iptable_filter_fini);
|
||||
148
net/ipv4/netfilter/iptable_mangle.c
Normal file
148
net/ipv4/netfilter/iptable_mangle.c
Normal file
|
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
|
||||
*
|
||||
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
|
||||
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/route.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
|
||||
MODULE_DESCRIPTION("iptables mangle table");
|
||||
|
||||
#define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \
|
||||
(1 << NF_INET_LOCAL_IN) | \
|
||||
(1 << NF_INET_FORWARD) | \
|
||||
(1 << NF_INET_LOCAL_OUT) | \
|
||||
(1 << NF_INET_POST_ROUTING))
|
||||
|
||||
static const struct xt_table packet_mangler = {
|
||||
.name = "mangle",
|
||||
.valid_hooks = MANGLE_VALID_HOOKS,
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_IPV4,
|
||||
.priority = NF_IP_PRI_MANGLE,
|
||||
};
|
||||
|
||||
static unsigned int
|
||||
ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
|
||||
{
|
||||
unsigned int ret;
|
||||
const struct iphdr *iph;
|
||||
u_int8_t tos;
|
||||
__be32 saddr, daddr;
|
||||
u_int32_t mark;
|
||||
int err;
|
||||
|
||||
/* root is playing with raw sockets. */
|
||||
if (skb->len < sizeof(struct iphdr) ||
|
||||
ip_hdrlen(skb) < sizeof(struct iphdr))
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* Save things which could affect route */
|
||||
mark = skb->mark;
|
||||
iph = ip_hdr(skb);
|
||||
saddr = iph->saddr;
|
||||
daddr = iph->daddr;
|
||||
tos = iph->tos;
|
||||
|
||||
ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
|
||||
dev_net(out)->ipv4.iptable_mangle);
|
||||
/* Reroute for ANY change. */
|
||||
if (ret != NF_DROP && ret != NF_STOLEN) {
|
||||
iph = ip_hdr(skb);
|
||||
|
||||
if (iph->saddr != saddr ||
|
||||
iph->daddr != daddr ||
|
||||
skb->mark != mark ||
|
||||
iph->tos != tos) {
|
||||
err = ip_route_me_harder(skb, RTN_UNSPEC);
|
||||
if (err < 0)
|
||||
ret = NF_DROP_ERR(err);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* The work comes in here from netfilter.c. */
|
||||
static unsigned int
|
||||
iptable_mangle_hook(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
if (ops->hooknum == NF_INET_LOCAL_OUT)
|
||||
return ipt_mangle_out(skb, out);
|
||||
if (ops->hooknum == NF_INET_POST_ROUTING)
|
||||
return ipt_do_table(skb, ops->hooknum, in, out,
|
||||
dev_net(out)->ipv4.iptable_mangle);
|
||||
/* PREROUTING/INPUT/FORWARD: */
|
||||
return ipt_do_table(skb, ops->hooknum, in, out,
|
||||
dev_net(in)->ipv4.iptable_mangle);
|
||||
}
|
||||
|
||||
static struct nf_hook_ops *mangle_ops __read_mostly;
|
||||
|
||||
static int __net_init iptable_mangle_net_init(struct net *net)
|
||||
{
|
||||
struct ipt_replace *repl;
|
||||
|
||||
repl = ipt_alloc_initial_table(&packet_mangler);
|
||||
if (repl == NULL)
|
||||
return -ENOMEM;
|
||||
net->ipv4.iptable_mangle =
|
||||
ipt_register_table(net, &packet_mangler, repl);
|
||||
kfree(repl);
|
||||
return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
|
||||
}
|
||||
|
||||
static void __net_exit iptable_mangle_net_exit(struct net *net)
|
||||
{
|
||||
ipt_unregister_table(net, net->ipv4.iptable_mangle);
|
||||
}
|
||||
|
||||
static struct pernet_operations iptable_mangle_net_ops = {
|
||||
.init = iptable_mangle_net_init,
|
||||
.exit = iptable_mangle_net_exit,
|
||||
};
|
||||
|
||||
static int __init iptable_mangle_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_pernet_subsys(&iptable_mangle_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/* Register hooks */
|
||||
mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
|
||||
if (IS_ERR(mangle_ops)) {
|
||||
ret = PTR_ERR(mangle_ops);
|
||||
unregister_pernet_subsys(&iptable_mangle_net_ops);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit iptable_mangle_fini(void)
|
||||
{
|
||||
xt_hook_unlink(&packet_mangler, mangle_ops);
|
||||
unregister_pernet_subsys(&iptable_mangle_net_ops);
|
||||
}
|
||||
|
||||
module_init(iptable_mangle_init);
|
||||
module_exit(iptable_mangle_fini);
|
||||
163
net/ipv4/netfilter/iptable_nat.c
Normal file
163
net/ipv4/netfilter/iptable_nat.c
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
* (C) 2011 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_core.h>
|
||||
#include <net/netfilter/nf_nat_l3proto.h>
|
||||
|
||||
static const struct xt_table nf_nat_ipv4_table = {
|
||||
.name = "nat",
|
||||
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
|
||||
(1 << NF_INET_POST_ROUTING) |
|
||||
(1 << NF_INET_LOCAL_OUT) |
|
||||
(1 << NF_INET_LOCAL_IN),
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_IPV4,
|
||||
};
|
||||
|
||||
static unsigned int iptable_nat_do_chain(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
struct nf_conn *ct)
|
||||
{
|
||||
struct net *net = nf_ct_net(ct);
|
||||
|
||||
return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.nat_table);
|
||||
}
|
||||
|
||||
static unsigned int iptable_nat_ipv4_fn(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
return nf_nat_ipv4_fn(ops, skb, in, out, iptable_nat_do_chain);
|
||||
}
|
||||
|
||||
static unsigned int iptable_nat_ipv4_in(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
return nf_nat_ipv4_in(ops, skb, in, out, iptable_nat_do_chain);
|
||||
}
|
||||
|
||||
static unsigned int iptable_nat_ipv4_out(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
return nf_nat_ipv4_out(ops, skb, in, out, iptable_nat_do_chain);
|
||||
}
|
||||
|
||||
static unsigned int iptable_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
return nf_nat_ipv4_local_fn(ops, skb, in, out, iptable_nat_do_chain);
|
||||
}
|
||||
|
||||
/* The four NAT hooks: DNAT on the way in (PRE_ROUTING / LOCAL_IN as seen
 * from the reply direction) and SNAT on the way out.
 */
static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
        /* Before packet filtering, change destination */
        {
                .hook     = iptable_nat_ipv4_in,
                .owner    = THIS_MODULE,
                .pf       = NFPROTO_IPV4,
                .hooknum  = NF_INET_PRE_ROUTING,
                .priority = NF_IP_PRI_NAT_DST,
        },
        /* After packet filtering, change source */
        {
                .hook     = iptable_nat_ipv4_out,
                .owner    = THIS_MODULE,
                .pf       = NFPROTO_IPV4,
                .hooknum  = NF_INET_POST_ROUTING,
                .priority = NF_IP_PRI_NAT_SRC,
        },
        /* Before packet filtering, change destination */
        {
                .hook     = iptable_nat_ipv4_local_fn,
                .owner    = THIS_MODULE,
                .pf       = NFPROTO_IPV4,
                .hooknum  = NF_INET_LOCAL_OUT,
                .priority = NF_IP_PRI_NAT_DST,
        },
        /* After packet filtering, change source */
        {
                .hook     = iptable_nat_ipv4_fn,
                .owner    = THIS_MODULE,
                .pf       = NFPROTO_IPV4,
                .hooknum  = NF_INET_LOCAL_IN,
                .priority = NF_IP_PRI_NAT_SRC,
        },
};
|
||||
|
||||
static int __net_init iptable_nat_net_init(struct net *net)
|
||||
{
|
||||
struct ipt_replace *repl;
|
||||
|
||||
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
|
||||
if (repl == NULL)
|
||||
return -ENOMEM;
|
||||
net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
|
||||
kfree(repl);
|
||||
return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
|
||||
}
|
||||
|
||||
static void __net_exit iptable_nat_net_exit(struct net *net)
|
||||
{
|
||||
ipt_unregister_table(net, net->ipv4.nat_table);
|
||||
}
|
||||
|
||||
static struct pernet_operations iptable_nat_net_ops = {
|
||||
.init = iptable_nat_net_init,
|
||||
.exit = iptable_nat_net_exit,
|
||||
};
|
||||
|
||||
static int __init iptable_nat_init(void)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = register_pernet_subsys(&iptable_nat_net_ops);
|
||||
if (err < 0)
|
||||
goto err1;
|
||||
|
||||
err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
|
||||
if (err < 0)
|
||||
goto err2;
|
||||
return 0;
|
||||
|
||||
err2:
|
||||
unregister_pernet_subsys(&iptable_nat_net_ops);
|
||||
err1:
|
||||
return err;
|
||||
}
|
||||
|
||||
static void __exit iptable_nat_exit(void)
|
||||
{
|
||||
nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
|
||||
unregister_pernet_subsys(&iptable_nat_net_ops);
|
||||
}
|
||||
|
||||
module_init(iptable_nat_init);
|
||||
module_exit(iptable_nat_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
90
net/ipv4/netfilter/iptable_raw.c
Normal file
90
net/ipv4/netfilter/iptable_raw.c
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
|
||||
*
|
||||
* Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
|
||||
|
||||
static const struct xt_table packet_raw = {
|
||||
.name = "raw",
|
||||
.valid_hooks = RAW_VALID_HOOKS,
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_IPV4,
|
||||
.priority = NF_IP_PRI_RAW,
|
||||
};
|
||||
|
||||
/* The work comes in here from netfilter.c. */
|
||||
static unsigned int
|
||||
iptable_raw_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
|
||||
const struct net_device *in, const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
const struct net *net;
|
||||
|
||||
if (ops->hooknum == NF_INET_LOCAL_OUT &&
|
||||
(skb->len < sizeof(struct iphdr) ||
|
||||
ip_hdrlen(skb) < sizeof(struct iphdr)))
|
||||
/* root is playing with raw sockets. */
|
||||
return NF_ACCEPT;
|
||||
|
||||
net = dev_net((in != NULL) ? in : out);
|
||||
return ipt_do_table(skb, ops->hooknum, in, out, net->ipv4.iptable_raw);
|
||||
}
|
||||
|
||||
static struct nf_hook_ops *rawtable_ops __read_mostly;
|
||||
|
||||
static int __net_init iptable_raw_net_init(struct net *net)
|
||||
{
|
||||
struct ipt_replace *repl;
|
||||
|
||||
repl = ipt_alloc_initial_table(&packet_raw);
|
||||
if (repl == NULL)
|
||||
return -ENOMEM;
|
||||
net->ipv4.iptable_raw =
|
||||
ipt_register_table(net, &packet_raw, repl);
|
||||
kfree(repl);
|
||||
return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
|
||||
}
|
||||
|
||||
static void __net_exit iptable_raw_net_exit(struct net *net)
|
||||
{
|
||||
ipt_unregister_table(net, net->ipv4.iptable_raw);
|
||||
}
|
||||
|
||||
static struct pernet_operations iptable_raw_net_ops = {
|
||||
.init = iptable_raw_net_init,
|
||||
.exit = iptable_raw_net_exit,
|
||||
};
|
||||
|
||||
static int __init iptable_raw_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_pernet_subsys(&iptable_raw_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
/* Register hooks */
|
||||
rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
|
||||
if (IS_ERR(rawtable_ops)) {
|
||||
ret = PTR_ERR(rawtable_ops);
|
||||
unregister_pernet_subsys(&iptable_raw_net_ops);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit iptable_raw_fini(void)
|
||||
{
|
||||
xt_hook_unlink(&packet_raw, rawtable_ops);
|
||||
unregister_pernet_subsys(&iptable_raw_net_ops);
|
||||
}
|
||||
|
||||
module_init(iptable_raw_init);
|
||||
module_exit(iptable_raw_fini);
|
||||
MODULE_LICENSE("GPL");
|
||||
111
net/ipv4/netfilter/iptable_security.c
Normal file
111
net/ipv4/netfilter/iptable_security.c
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
/*
|
||||
* "security" table
|
||||
*
|
||||
* This is for use by Mandatory Access Control (MAC) security models,
|
||||
* which need to be able to manage security policy in separate context
|
||||
* to DAC.
|
||||
*
|
||||
* Based on iptable_mangle.c
|
||||
*
|
||||
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
|
||||
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
|
||||
* Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>");
|
||||
MODULE_DESCRIPTION("iptables security table, for MAC rules");
|
||||
|
||||
#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \
|
||||
(1 << NF_INET_FORWARD) | \
|
||||
(1 << NF_INET_LOCAL_OUT)
|
||||
|
||||
static const struct xt_table security_table = {
|
||||
.name = "security",
|
||||
.valid_hooks = SECURITY_VALID_HOOKS,
|
||||
.me = THIS_MODULE,
|
||||
.af = NFPROTO_IPV4,
|
||||
.priority = NF_IP_PRI_SECURITY,
|
||||
};
|
||||
|
||||
static unsigned int
|
||||
iptable_security_hook(const struct nf_hook_ops *ops, struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
const struct net *net;
|
||||
|
||||
if (ops->hooknum == NF_INET_LOCAL_OUT &&
|
||||
(skb->len < sizeof(struct iphdr) ||
|
||||
ip_hdrlen(skb) < sizeof(struct iphdr)))
|
||||
/* Somebody is playing with raw sockets. */
|
||||
return NF_ACCEPT;
|
||||
|
||||
net = dev_net((in != NULL) ? in : out);
|
||||
return ipt_do_table(skb, ops->hooknum, in, out,
|
||||
net->ipv4.iptable_security);
|
||||
}
|
||||
|
||||
static struct nf_hook_ops *sectbl_ops __read_mostly;
|
||||
|
||||
static int __net_init iptable_security_net_init(struct net *net)
|
||||
{
|
||||
struct ipt_replace *repl;
|
||||
|
||||
repl = ipt_alloc_initial_table(&security_table);
|
||||
if (repl == NULL)
|
||||
return -ENOMEM;
|
||||
net->ipv4.iptable_security =
|
||||
ipt_register_table(net, &security_table, repl);
|
||||
kfree(repl);
|
||||
return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
|
||||
}
|
||||
|
||||
static void __net_exit iptable_security_net_exit(struct net *net)
|
||||
{
|
||||
ipt_unregister_table(net, net->ipv4.iptable_security);
|
||||
}
|
||||
|
||||
static struct pernet_operations iptable_security_net_ops = {
|
||||
.init = iptable_security_net_init,
|
||||
.exit = iptable_security_net_exit,
|
||||
};
|
||||
|
||||
static int __init iptable_security_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = register_pernet_subsys(&iptable_security_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
|
||||
if (IS_ERR(sectbl_ops)) {
|
||||
ret = PTR_ERR(sectbl_ops);
|
||||
goto cleanup_table;
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
cleanup_table:
|
||||
unregister_pernet_subsys(&iptable_security_net_ops);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit iptable_security_fini(void)
|
||||
{
|
||||
xt_hook_unlink(&security_table, sectbl_ops);
|
||||
unregister_pernet_subsys(&iptable_security_net_ops);
|
||||
}
|
||||
|
||||
module_init(iptable_security_init);
|
||||
module_exit(iptable_security_fini);
|
||||
550
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
Normal file
550
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
Normal file
|
|
@ -0,0 +1,550 @@
|
|||
|
||||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <net/route.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#include <net/netfilter/nf_conntrack_helper.h>
|
||||
#include <net/netfilter/nf_conntrack_l4proto.h>
|
||||
#include <net/netfilter/nf_conntrack_l3proto.h>
|
||||
#include <net/netfilter/nf_conntrack_zones.h>
|
||||
#include <net/netfilter/nf_conntrack_core.h>
|
||||
#include <net/netfilter/nf_conntrack_seqadj.h>
|
||||
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
|
||||
#include <net/netfilter/nf_nat_helper.h>
|
||||
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
|
||||
#include <net/netfilter/nf_log.h>
|
||||
|
||||
static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
|
||||
struct nf_conntrack_tuple *tuple)
|
||||
{
|
||||
const __be32 *ap;
|
||||
__be32 _addrs[2];
|
||||
ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
|
||||
sizeof(u_int32_t) * 2, _addrs);
|
||||
if (ap == NULL)
|
||||
return false;
|
||||
|
||||
tuple->src.u3.ip = ap[0];
|
||||
tuple->dst.u3.ip = ap[1];
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
|
||||
const struct nf_conntrack_tuple *orig)
|
||||
{
|
||||
tuple->src.u3.ip = orig->dst.u3.ip;
|
||||
tuple->dst.u3.ip = orig->src.u3.ip;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int ipv4_print_tuple(struct seq_file *s,
|
||||
const struct nf_conntrack_tuple *tuple)
|
||||
{
|
||||
return seq_printf(s, "src=%pI4 dst=%pI4 ",
|
||||
&tuple->src.u3.ip, &tuple->dst.u3.ip);
|
||||
}
|
||||
|
||||
/* Locate the L4 header and protocol number behind the IPv4 header. */
static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
                            unsigned int *dataoff, u_int8_t *protonum)
{
        struct iphdr _iph;
        const struct iphdr *iph;
        unsigned int hdrlen;

        iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
        if (iph == NULL)
                return -NF_ACCEPT;

        /* Conntrack defragments packets, we might still see fragments
         * inside ICMP packets though.
         */
        if (iph->frag_off & htons(IP_OFFSET))
                return -NF_ACCEPT;

        hdrlen = iph->ihl << 2;
        *dataoff = nhoff + hdrlen;
        *protonum = iph->protocol;

        /* Check bogus IP headers */
        if (*dataoff > skb->len) {
                pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
                         "nhoff %u, ihl %u, skblen %u\n",
                         nhoff, hdrlen, skb->len);
                return -NF_ACCEPT;
        }

        return NF_ACCEPT;
}
|
||||
|
||||
static unsigned int ipv4_helper(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
struct nf_conn *ct;
|
||||
enum ip_conntrack_info ctinfo;
|
||||
const struct nf_conn_help *help;
|
||||
const struct nf_conntrack_helper *helper;
|
||||
|
||||
/* This is where we call the helper: as the packet goes out. */
|
||||
ct = nf_ct_get(skb, &ctinfo);
|
||||
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
|
||||
return NF_ACCEPT;
|
||||
|
||||
help = nfct_help(ct);
|
||||
if (!help)
|
||||
return NF_ACCEPT;
|
||||
|
||||
/* rcu_read_lock()ed by nf_hook_slow */
|
||||
helper = rcu_dereference(help->helper);
|
||||
if (!helper)
|
||||
return NF_ACCEPT;
|
||||
|
||||
return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
|
||||
ct, ctinfo);
|
||||
}
|
||||
|
||||
static unsigned int ipv4_confirm(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
struct nf_conn *ct;
|
||||
enum ip_conntrack_info ctinfo;
|
||||
|
||||
ct = nf_ct_get(skb, &ctinfo);
|
||||
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
|
||||
goto out;
|
||||
|
||||
/* adjust seqs for loopback traffic only in outgoing direction */
|
||||
if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
|
||||
!nf_is_loopback_packet(skb)) {
|
||||
if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
|
||||
NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
|
||||
return NF_DROP;
|
||||
}
|
||||
}
|
||||
out:
|
||||
/* We've seen it coming out the other side: confirm it */
|
||||
return nf_conntrack_confirm(skb);
|
||||
}
|
||||
|
||||
static unsigned int ipv4_conntrack_in(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
return nf_conntrack_in(dev_net(in), PF_INET, ops->hooknum, skb);
|
||||
}
|
||||
|
||||
static unsigned int ipv4_conntrack_local(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
int (*okfn)(struct sk_buff *))
|
||||
{
|
||||
/* root is playing with raw sockets. */
|
||||
if (skb->len < sizeof(struct iphdr) ||
|
||||
ip_hdrlen(skb) < sizeof(struct iphdr))
|
||||
return NF_ACCEPT;
|
||||
return nf_conntrack_in(dev_net(out), PF_INET, ops->hooknum, skb);
|
||||
}
|
||||
|
||||
/* Connection tracking may drop packets, but never alters them, so
|
||||
make it the first hook. */
|
||||
static struct nf_hook_ops ipv4_conntrack_ops[] __read_mostly = {
|
||||
{
|
||||
.hook = ipv4_conntrack_in,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_PRE_ROUTING,
|
||||
.priority = NF_IP_PRI_CONNTRACK,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_conntrack_local,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_LOCAL_OUT,
|
||||
.priority = NF_IP_PRI_CONNTRACK,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_helper,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_POST_ROUTING,
|
||||
.priority = NF_IP_PRI_CONNTRACK_HELPER,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_confirm,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_POST_ROUTING,
|
||||
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_helper,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_LOCAL_IN,
|
||||
.priority = NF_IP_PRI_CONNTRACK_HELPER,
|
||||
},
|
||||
{
|
||||
.hook = ipv4_confirm,
|
||||
.owner = THIS_MODULE,
|
||||
.pf = NFPROTO_IPV4,
|
||||
.hooknum = NF_INET_LOCAL_IN,
|
||||
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
|
||||
},
|
||||
};
|
||||
|
||||
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
static int log_invalid_proto_min = 0;
static int log_invalid_proto_max = 255;

/* NOTE: ipv4_init_net() fills in the .data pointers by positional index
 * (entries 0..4); keep the entry order unchanged.
 */
static struct ctl_table ip_ct_sysctl_table[] = {
        {
                .procname     = "ip_conntrack_max",
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname     = "ip_conntrack_count",
                .maxlen       = sizeof(int),
                .mode         = 0444,
                .proc_handler = proc_dointvec,
        },
        {
                .procname     = "ip_conntrack_buckets",
                .maxlen       = sizeof(unsigned int),
                .mode         = 0444,
                .proc_handler = proc_dointvec,
        },
        {
                .procname     = "ip_conntrack_checksum",
                .maxlen       = sizeof(int),
                .mode         = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname     = "ip_conntrack_log_invalid",
                .maxlen       = sizeof(unsigned int),
                .mode         = 0644,
                .proc_handler = proc_dointvec_minmax,
                .extra1       = &log_invalid_proto_min,
                .extra2       = &log_invalid_proto_max,
        },
        { }
};
#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
|
||||
|
||||
/* Fast function for those who don't want to parse /proc (and I don't
|
||||
blame them). */
|
||||
/* Reversing the socket's dst/src point of view gives us the reply
|
||||
mapping. */
|
||||
static int
|
||||
getorigdst(struct sock *sk, int optval, void __user *user, int *len)
|
||||
{
|
||||
const struct inet_sock *inet = inet_sk(sk);
|
||||
const struct nf_conntrack_tuple_hash *h;
|
||||
struct nf_conntrack_tuple tuple;
|
||||
|
||||
memset(&tuple, 0, sizeof(tuple));
|
||||
tuple.src.u3.ip = inet->inet_rcv_saddr;
|
||||
tuple.src.u.tcp.port = inet->inet_sport;
|
||||
tuple.dst.u3.ip = inet->inet_daddr;
|
||||
tuple.dst.u.tcp.port = inet->inet_dport;
|
||||
tuple.src.l3num = PF_INET;
|
||||
tuple.dst.protonum = sk->sk_protocol;
|
||||
|
||||
/* We only do TCP and SCTP at the moment: is there a better way? */
|
||||
if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_SCTP) {
|
||||
pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
|
||||
return -ENOPROTOOPT;
|
||||
}
|
||||
|
||||
if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
|
||||
pr_debug("SO_ORIGINAL_DST: len %d not %Zu\n",
|
||||
*len, sizeof(struct sockaddr_in));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
|
||||
if (h) {
|
||||
struct sockaddr_in sin;
|
||||
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
|
||||
|
||||
sin.sin_family = AF_INET;
|
||||
sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
|
||||
.tuple.dst.u.tcp.port;
|
||||
sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
|
||||
.tuple.dst.u3.ip;
|
||||
memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
|
||||
|
||||
pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
|
||||
&sin.sin_addr.s_addr, ntohs(sin.sin_port));
|
||||
nf_ct_put(ct);
|
||||
if (copy_to_user(user, &sin, sizeof(sin)) != 0)
|
||||
return -EFAULT;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
|
||||
&tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
|
||||
&tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Serialize an IPv4 tuple's addresses into netlink attributes. */
static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
                                const struct nf_conntrack_tuple *tuple)
{
        if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
            nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
                return -1;
        return 0;
}

static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
        [CTA_IP_V4_SRC] = { .type = NLA_U32 },
        [CTA_IP_V4_DST] = { .type = NLA_U32 },
};

/* Parse the addresses back out of netlink attributes. */
static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
                                struct nf_conntrack_tuple *t)
{
        if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
                return -EINVAL;

        t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]);
        t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]);
        return 0;
}

static int ipv4_nlattr_tuple_size(void)
{
        return nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1);
}
#endif
|
||||
|
||||
static struct nf_sockopt_ops so_getorigdst = {
|
||||
.pf = PF_INET,
|
||||
.get_optmin = SO_ORIGINAL_DST,
|
||||
.get_optmax = SO_ORIGINAL_DST+1,
|
||||
.get = getorigdst,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* Per-namespace init: duplicate the sysctl template and point its
 * entries (by positional index) at this namespace's counters.
 */
static int ipv4_init_net(struct net *net)
{
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
        struct nf_ip_net *in = &net->ct.nf_ct_proto;

        in->ctl_table = kmemdup(ip_ct_sysctl_table,
                                sizeof(ip_ct_sysctl_table),
                                GFP_KERNEL);
        if (!in->ctl_table)
                return -ENOMEM;

        in->ctl_table[0].data = &nf_conntrack_max;
        in->ctl_table[1].data = &net->ct.count;
        in->ctl_table[2].data = &net->ct.htable_size;
        in->ctl_table[3].data = &net->ct.sysctl_checksum;
        in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
#endif
        return 0;
}
|
||||
|
||||
struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = {
|
||||
.l3proto = PF_INET,
|
||||
.name = "ipv4",
|
||||
.pkt_to_tuple = ipv4_pkt_to_tuple,
|
||||
.invert_tuple = ipv4_invert_tuple,
|
||||
.print_tuple = ipv4_print_tuple,
|
||||
.get_l4proto = ipv4_get_l4proto,
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
|
||||
.tuple_to_nlattr = ipv4_tuple_to_nlattr,
|
||||
.nlattr_tuple_size = ipv4_nlattr_tuple_size,
|
||||
.nlattr_to_tuple = ipv4_nlattr_to_tuple,
|
||||
.nla_policy = ipv4_nla_policy,
|
||||
#endif
|
||||
#if defined(CONFIG_SYSCTL) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
|
||||
.ctl_table_path = "net/ipv4/netfilter",
|
||||
#endif
|
||||
.init_net = ipv4_init_net,
|
||||
.me = THIS_MODULE,
|
||||
};
|
||||
|
||||
module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
|
||||
&nf_conntrack_htable_size, 0600);
|
||||
|
||||
MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
|
||||
MODULE_ALIAS("ip_conntrack");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int ipv4_net_init(struct net *net)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
|
||||
if (ret < 0) {
|
||||
pr_err("nf_conntrack_tcp4: pernet registration failed\n");
|
||||
goto out_tcp;
|
||||
}
|
||||
ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_udp4);
|
||||
if (ret < 0) {
|
||||
pr_err("nf_conntrack_udp4: pernet registration failed\n");
|
||||
goto out_udp;
|
||||
}
|
||||
ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_icmp);
|
||||
if (ret < 0) {
|
||||
pr_err("nf_conntrack_icmp4: pernet registration failed\n");
|
||||
goto out_icmp;
|
||||
}
|
||||
ret = nf_ct_l3proto_pernet_register(net, &nf_conntrack_l3proto_ipv4);
|
||||
if (ret < 0) {
|
||||
pr_err("nf_conntrack_ipv4: pernet registration failed\n");
|
||||
goto out_ipv4;
|
||||
}
|
||||
return 0;
|
||||
out_ipv4:
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
|
||||
out_icmp:
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
|
||||
out_udp:
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
|
||||
out_tcp:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void ipv4_net_exit(struct net *net)
|
||||
{
|
||||
nf_ct_l3proto_pernet_unregister(net, &nf_conntrack_l3proto_ipv4);
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
|
||||
nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
|
||||
}
|
||||
|
||||
static struct pernet_operations ipv4_net_ops = {
|
||||
.init = ipv4_net_init,
|
||||
.exit = ipv4_net_exit,
|
||||
};
|
||||
|
||||
/* Module init: wire up the SO_ORIGINAL_DST sockopt, per-netns state,
 * the conntrack netfilter hooks, the tcp4/udp4/icmp l4 trackers, the
 * IPv4 l3 tracker and (optionally) the /proc compat interface.
 * Any failure unwinds every prior registration in reverse order. */
static int __init nf_conntrack_l3proto_ipv4_init(void)
{
	int ret = 0;

	need_conntrack();
	nf_defrag_ipv4_enable();

	ret = nf_register_sockopt(&so_getorigdst);
	if (ret < 0) {
		printk(KERN_ERR "Unable to register netfilter socket option\n");
		return ret;
	}

	ret = register_pernet_subsys(&ipv4_net_ops);
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
		goto cleanup_sockopt;
	}

	ret = nf_register_hooks(ipv4_conntrack_ops,
				ARRAY_SIZE(ipv4_conntrack_ops));
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register hooks.\n");
		goto cleanup_pernet;
	}

	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_tcp4);
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register tcp4 proto.\n");
		goto cleanup_hooks;
	}

	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_udp4);
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register udp4 proto.\n");
		goto cleanup_tcp4;
	}

	ret = nf_ct_l4proto_register(&nf_conntrack_l4proto_icmp);
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register icmpv4 proto.\n");
		goto cleanup_udp4;
	}

	ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
	if (ret < 0) {
		pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
		goto cleanup_icmpv4;
	}

#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
	ret = nf_conntrack_ipv4_compat_init();
	if (ret < 0)
		goto cleanup_proto;
#endif
	return ret;

	/* Error unwind: strict reverse order of registration. */
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
 cleanup_proto:
	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
#endif
 cleanup_icmpv4:
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
 cleanup_udp4:
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
 cleanup_tcp4:
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
 cleanup_hooks:
	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
 cleanup_pernet:
	unregister_pernet_subsys(&ipv4_net_ops);
 cleanup_sockopt:
	nf_unregister_sockopt(&so_getorigdst);
	return ret;
}
|
||||
|
||||
/* Module exit: tear everything down in reverse order of init.  The
 * synchronize_net() lets in-flight packets drain before unregistering. */
static void __exit nf_conntrack_l3proto_ipv4_fini(void)
{
	synchronize_net();
#if defined(CONFIG_PROC_FS) && defined(CONFIG_NF_CONNTRACK_PROC_COMPAT)
	nf_conntrack_ipv4_compat_fini();
#endif
	nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_icmp);
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_udp4);
	nf_ct_l4proto_unregister(&nf_conntrack_l4proto_tcp4);
	nf_unregister_hooks(ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops));
	unregister_pernet_subsys(&ipv4_net_ops);
	nf_unregister_sockopt(&so_getorigdst);
}

module_init(nf_conntrack_l3proto_ipv4_init);
module_exit(nf_conntrack_l3proto_ipv4_fini);
|
||||
464
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
Normal file
464
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
Normal file
|
|
@ -0,0 +1,464 @@
|
|||
/* ip_conntrack proc compat - based on ip_conntrack_standalone.c
|
||||
*
|
||||
* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
* (C) 2006-2010 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
#include <linux/types.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/percpu.h>
|
||||
#include <linux/security.h>
|
||||
#include <net/net_namespace.h>
|
||||
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/netfilter/nf_conntrack_core.h>
|
||||
#include <net/netfilter/nf_conntrack_l3proto.h>
|
||||
#include <net/netfilter/nf_conntrack_l4proto.h>
|
||||
#include <net/netfilter/nf_conntrack_expect.h>
|
||||
#include <net/netfilter/nf_conntrack_acct.h>
|
||||
#include <linux/rculist_nulls.h>
|
||||
#include <linux/export.h>
|
||||
|
||||
struct ct_iter_state {
|
||||
struct seq_net_private p;
|
||||
unsigned int bucket;
|
||||
};
|
||||
|
||||
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct ct_iter_state *st = seq->private;
|
||||
struct hlist_nulls_node *n;
|
||||
|
||||
for (st->bucket = 0;
|
||||
st->bucket < net->ct.htable_size;
|
||||
st->bucket++) {
|
||||
n = rcu_dereference(
|
||||
hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
|
||||
if (!is_a_nulls(n))
|
||||
return n;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
|
||||
struct hlist_nulls_node *head)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct ct_iter_state *st = seq->private;
|
||||
|
||||
head = rcu_dereference(hlist_nulls_next_rcu(head));
|
||||
while (is_a_nulls(head)) {
|
||||
if (likely(get_nulls_value(head) == st->bucket)) {
|
||||
if (++st->bucket >= net->ct.htable_size)
|
||||
return NULL;
|
||||
}
|
||||
head = rcu_dereference(
|
||||
hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
|
||||
}
|
||||
return head;
|
||||
}
|
||||
|
||||
static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
|
||||
{
|
||||
struct hlist_nulls_node *head = ct_get_first(seq);
|
||||
|
||||
if (head)
|
||||
while (pos && (head = ct_get_next(seq, head)))
|
||||
pos--;
|
||||
return pos ? NULL : head;
|
||||
}
|
||||
|
||||
static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
__acquires(RCU)
|
||||
{
|
||||
rcu_read_lock();
|
||||
return ct_get_idx(seq, *pos);
|
||||
}
|
||||
|
||||
static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
|
||||
{
|
||||
(*pos)++;
|
||||
return ct_get_next(s, v);
|
||||
}
|
||||
|
||||
static void ct_seq_stop(struct seq_file *s, void *v)
|
||||
__releases(RCU)
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NF_CONNTRACK_SECMARK
/* Print the LSM security context for this conntrack entry, if any.
 * A secid->secctx translation failure is silently treated as "nothing
 * to print" (returns 0) rather than an output error. */
static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
	int ret;
	u32 len;
	char *secctx;

	ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
	if (ret)
		return 0;

	ret = seq_printf(s, "secctx=%s ", secctx);

	security_release_secctx(secctx, len);
	return ret;
}
#else
/* No secmark support: nothing to print, never fails. */
static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
	return 0;
}
#endif
|
||||
|
||||
static int ct_seq_show(struct seq_file *s, void *v)
|
||||
{
|
||||
struct nf_conntrack_tuple_hash *hash = v;
|
||||
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
|
||||
const struct nf_conntrack_l3proto *l3proto;
|
||||
const struct nf_conntrack_l4proto *l4proto;
|
||||
int ret = 0;
|
||||
|
||||
NF_CT_ASSERT(ct);
|
||||
if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
|
||||
return 0;
|
||||
|
||||
|
||||
/* we only want to print DIR_ORIGINAL */
|
||||
if (NF_CT_DIRECTION(hash))
|
||||
goto release;
|
||||
if (nf_ct_l3num(ct) != AF_INET)
|
||||
goto release;
|
||||
|
||||
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
|
||||
NF_CT_ASSERT(l3proto);
|
||||
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
|
||||
NF_CT_ASSERT(l4proto);
|
||||
|
||||
ret = -ENOSPC;
|
||||
if (seq_printf(s, "%-8s %u %ld ",
|
||||
l4proto->name, nf_ct_protonum(ct),
|
||||
timer_pending(&ct->timeout)
|
||||
? (long)(ct->timeout.expires - jiffies)/HZ : 0) != 0)
|
||||
goto release;
|
||||
|
||||
if (l4proto->print_conntrack && l4proto->print_conntrack(s, ct))
|
||||
goto release;
|
||||
|
||||
if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
|
||||
l3proto, l4proto))
|
||||
goto release;
|
||||
|
||||
if (seq_print_acct(s, ct, IP_CT_DIR_ORIGINAL))
|
||||
goto release;
|
||||
|
||||
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
|
||||
if (seq_printf(s, "[UNREPLIED] "))
|
||||
goto release;
|
||||
|
||||
if (print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
|
||||
l3proto, l4proto))
|
||||
goto release;
|
||||
|
||||
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
|
||||
goto release;
|
||||
|
||||
if (test_bit(IPS_ASSURED_BIT, &ct->status))
|
||||
if (seq_printf(s, "[ASSURED] "))
|
||||
goto release;
|
||||
|
||||
#ifdef CONFIG_NF_CONNTRACK_MARK
|
||||
if (seq_printf(s, "mark=%u ", ct->mark))
|
||||
goto release;
|
||||
#endif
|
||||
|
||||
if (ct_show_secctx(s, ct))
|
||||
goto release;
|
||||
|
||||
if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
|
||||
goto release;
|
||||
ret = 0;
|
||||
release:
|
||||
nf_ct_put(ct);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const struct seq_operations ct_seq_ops = {
|
||||
.start = ct_seq_start,
|
||||
.next = ct_seq_next,
|
||||
.stop = ct_seq_stop,
|
||||
.show = ct_seq_show
|
||||
};
|
||||
|
||||
static int ct_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &ct_seq_ops,
|
||||
sizeof(struct ct_iter_state));
|
||||
}
|
||||
|
||||
static const struct file_operations ct_file_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = ct_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
/* expects */
|
||||
struct ct_expect_iter_state {
|
||||
struct seq_net_private p;
|
||||
unsigned int bucket;
|
||||
};
|
||||
|
||||
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct ct_expect_iter_state *st = seq->private;
|
||||
struct hlist_node *n;
|
||||
|
||||
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
|
||||
n = rcu_dereference(
|
||||
hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
|
||||
if (n)
|
||||
return n;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
|
||||
struct hlist_node *head)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
struct ct_expect_iter_state *st = seq->private;
|
||||
|
||||
head = rcu_dereference(hlist_next_rcu(head));
|
||||
while (head == NULL) {
|
||||
if (++st->bucket >= nf_ct_expect_hsize)
|
||||
return NULL;
|
||||
head = rcu_dereference(
|
||||
hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
|
||||
}
|
||||
return head;
|
||||
}
|
||||
|
||||
static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
|
||||
{
|
||||
struct hlist_node *head = ct_expect_get_first(seq);
|
||||
|
||||
if (head)
|
||||
while (pos && (head = ct_expect_get_next(seq, head)))
|
||||
pos--;
|
||||
return pos ? NULL : head;
|
||||
}
|
||||
|
||||
static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
__acquires(RCU)
|
||||
{
|
||||
rcu_read_lock();
|
||||
return ct_expect_get_idx(seq, *pos);
|
||||
}
|
||||
|
||||
static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
{
|
||||
(*pos)++;
|
||||
return ct_expect_get_next(seq, v);
|
||||
}
|
||||
|
||||
static void exp_seq_stop(struct seq_file *seq, void *v)
|
||||
__releases(RCU)
|
||||
{
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int exp_seq_show(struct seq_file *s, void *v)
|
||||
{
|
||||
struct nf_conntrack_expect *exp;
|
||||
const struct hlist_node *n = v;
|
||||
|
||||
exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
|
||||
|
||||
if (exp->tuple.src.l3num != AF_INET)
|
||||
return 0;
|
||||
|
||||
if (exp->timeout.function)
|
||||
seq_printf(s, "%ld ", timer_pending(&exp->timeout)
|
||||
? (long)(exp->timeout.expires - jiffies)/HZ : 0);
|
||||
else
|
||||
seq_printf(s, "- ");
|
||||
|
||||
seq_printf(s, "proto=%u ", exp->tuple.dst.protonum);
|
||||
|
||||
print_tuple(s, &exp->tuple,
|
||||
__nf_ct_l3proto_find(exp->tuple.src.l3num),
|
||||
__nf_ct_l4proto_find(exp->tuple.src.l3num,
|
||||
exp->tuple.dst.protonum));
|
||||
return seq_putc(s, '\n');
|
||||
}
|
||||
|
||||
static const struct seq_operations exp_seq_ops = {
|
||||
.start = exp_seq_start,
|
||||
.next = exp_seq_next,
|
||||
.stop = exp_seq_stop,
|
||||
.show = exp_seq_show
|
||||
};
|
||||
|
||||
static int exp_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &exp_seq_ops,
|
||||
sizeof(struct ct_expect_iter_state));
|
||||
}
|
||||
|
||||
static const struct file_operations ip_exp_file_ops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = exp_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
/* Iterator over per-CPU conntrack statistics for
 * /proc/net/stat/ip_conntrack.  Position 0 is the header line; positions
 * 1..N map to possible CPUs (holes in the CPU map are skipped). */
static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(net->ct.stat, cpu);
	}

	return NULL;
}

static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return per_cpu_ptr(net->ct.stat, cpu);
	}
	/* seq_file requires ->next() to advance *pos even when the
	 * sequence is exhausted; returning NULL without bumping the
	 * position can make the core re-call us and emit duplicate
	 * records (fixed the same way upstream for many ->next()
	 * implementations). */
	(*pos)++;

	return NULL;
}

static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
|
||||
|
||||
static int ct_cpu_seq_show(struct seq_file *seq, void *v)
|
||||
{
|
||||
struct net *net = seq_file_net(seq);
|
||||
unsigned int nr_conntracks = atomic_read(&net->ct.count);
|
||||
const struct ip_conntrack_stat *st = v;
|
||||
|
||||
if (v == SEQ_START_TOKEN) {
|
||||
seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
|
||||
"%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
|
||||
nr_conntracks,
|
||||
st->searched,
|
||||
st->found,
|
||||
st->new,
|
||||
st->invalid,
|
||||
st->ignore,
|
||||
st->delete,
|
||||
st->delete_list,
|
||||
st->insert,
|
||||
st->insert_failed,
|
||||
st->drop,
|
||||
st->early_drop,
|
||||
st->error,
|
||||
|
||||
st->expect_new,
|
||||
st->expect_create,
|
||||
st->expect_delete,
|
||||
st->search_restart
|
||||
);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct seq_operations ct_cpu_seq_ops = {
|
||||
.start = ct_cpu_seq_start,
|
||||
.next = ct_cpu_seq_next,
|
||||
.stop = ct_cpu_seq_stop,
|
||||
.show = ct_cpu_seq_show,
|
||||
};
|
||||
|
||||
static int ct_cpu_seq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return seq_open_net(inode, file, &ct_cpu_seq_ops,
|
||||
sizeof(struct seq_net_private));
|
||||
}
|
||||
|
||||
static const struct file_operations ct_cpu_seq_fops = {
|
||||
.owner = THIS_MODULE,
|
||||
.open = ct_cpu_seq_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release_net,
|
||||
};
|
||||
|
||||
static int __net_init ip_conntrack_net_init(struct net *net)
|
||||
{
|
||||
struct proc_dir_entry *proc, *proc_exp, *proc_stat;
|
||||
|
||||
proc = proc_create("ip_conntrack", 0440, net->proc_net, &ct_file_ops);
|
||||
if (!proc)
|
||||
goto err1;
|
||||
|
||||
proc_exp = proc_create("ip_conntrack_expect", 0440, net->proc_net,
|
||||
&ip_exp_file_ops);
|
||||
if (!proc_exp)
|
||||
goto err2;
|
||||
|
||||
proc_stat = proc_create("ip_conntrack", S_IRUGO,
|
||||
net->proc_net_stat, &ct_cpu_seq_fops);
|
||||
if (!proc_stat)
|
||||
goto err3;
|
||||
return 0;
|
||||
|
||||
err3:
|
||||
remove_proc_entry("ip_conntrack_expect", net->proc_net);
|
||||
err2:
|
||||
remove_proc_entry("ip_conntrack", net->proc_net);
|
||||
err1:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void __net_exit ip_conntrack_net_exit(struct net *net)
|
||||
{
|
||||
remove_proc_entry("ip_conntrack", net->proc_net_stat);
|
||||
remove_proc_entry("ip_conntrack_expect", net->proc_net);
|
||||
remove_proc_entry("ip_conntrack", net->proc_net);
|
||||
}
|
||||
|
||||
static struct pernet_operations ip_conntrack_net_ops = {
|
||||
.init = ip_conntrack_net_init,
|
||||
.exit = ip_conntrack_net_exit,
|
||||
};
|
||||
|
||||
int __init nf_conntrack_ipv4_compat_init(void)
|
||||
{
|
||||
return register_pernet_subsys(&ip_conntrack_net_ops);
|
||||
}
|
||||
|
||||
void __exit nf_conntrack_ipv4_compat_fini(void)
|
||||
{
|
||||
unregister_pernet_subsys(&ip_conntrack_net_ops);
|
||||
}
|
||||
428
net/ipv4/netfilter/nf_conntrack_proto_icmp.c
Normal file
428
net/ipv4/netfilter/nf_conntrack_proto_icmp.c
Normal file
|
|
@ -0,0 +1,428 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
* (C) 2006-2010 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/checksum.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/netfilter/nf_conntrack_tuple.h>
|
||||
#include <net/netfilter/nf_conntrack_l4proto.h>
|
||||
#include <net/netfilter/nf_conntrack_core.h>
|
||||
#include <net/netfilter/nf_conntrack_zones.h>
|
||||
#include <net/netfilter/nf_log.h>
|
||||
|
||||
static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
|
||||
|
||||
static inline struct nf_icmp_net *icmp_pernet(struct net *net)
|
||||
{
|
||||
return &net->ct.nf_ct_proto.icmp;
|
||||
}
|
||||
|
||||
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
|
||||
struct nf_conntrack_tuple *tuple)
|
||||
{
|
||||
const struct icmphdr *hp;
|
||||
struct icmphdr _hdr;
|
||||
|
||||
hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
|
||||
if (hp == NULL)
|
||||
return false;
|
||||
|
||||
tuple->dst.u.icmp.type = hp->type;
|
||||
tuple->src.u.icmp.id = hp->un.echo.id;
|
||||
tuple->dst.u.icmp.code = hp->code;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Add 1; spaces filled with 0. */
|
||||
static const u_int8_t invmap[] = {
|
||||
[ICMP_ECHO] = ICMP_ECHOREPLY + 1,
|
||||
[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
|
||||
[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
|
||||
[ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
|
||||
[ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
|
||||
[ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
|
||||
[ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
|
||||
[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
|
||||
};
|
||||
|
||||
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
|
||||
const struct nf_conntrack_tuple *orig)
|
||||
{
|
||||
if (orig->dst.u.icmp.type >= sizeof(invmap) ||
|
||||
!invmap[orig->dst.u.icmp.type])
|
||||
return false;
|
||||
|
||||
tuple->src.u.icmp.id = orig->src.u.icmp.id;
|
||||
tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
|
||||
tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Print out the per-protocol part of the tuple. */
|
||||
static int icmp_print_tuple(struct seq_file *s,
|
||||
const struct nf_conntrack_tuple *tuple)
|
||||
{
|
||||
return seq_printf(s, "type=%u code=%u id=%u ",
|
||||
tuple->dst.u.icmp.type,
|
||||
tuple->dst.u.icmp.code,
|
||||
ntohs(tuple->src.u.icmp.id));
|
||||
}
|
||||
|
||||
static unsigned int *icmp_get_timeouts(struct net *net)
|
||||
{
|
||||
return &icmp_pernet(net)->timeout;
|
||||
}
|
||||
|
||||
/* Returns verdict for packet, or -1 for invalid. */
|
||||
static int icmp_packet(struct nf_conn *ct,
|
||||
const struct sk_buff *skb,
|
||||
unsigned int dataoff,
|
||||
enum ip_conntrack_info ctinfo,
|
||||
u_int8_t pf,
|
||||
unsigned int hooknum,
|
||||
unsigned int *timeout)
|
||||
{
|
||||
/* Do not immediately delete the connection after the first
|
||||
successful reply to avoid excessive conntrackd traffic
|
||||
and also to handle correctly ICMP echo reply duplicates. */
|
||||
nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
|
||||
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* Called when a new connection for this protocol found. */
|
||||
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
|
||||
unsigned int dataoff, unsigned int *timeouts)
|
||||
{
|
||||
static const u_int8_t valid_new[] = {
|
||||
[ICMP_ECHO] = 1,
|
||||
[ICMP_TIMESTAMP] = 1,
|
||||
[ICMP_INFO_REQUEST] = 1,
|
||||
[ICMP_ADDRESS] = 1
|
||||
};
|
||||
|
||||
if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
|
||||
!valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
|
||||
/* Can't create a new ICMP `conn' with this. */
|
||||
pr_debug("icmp: can't create new conn with type %u\n",
|
||||
ct->tuplehash[0].tuple.dst.u.icmp.type);
|
||||
nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
|
||||
static int
|
||||
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
|
||||
enum ip_conntrack_info *ctinfo,
|
||||
unsigned int hooknum)
|
||||
{
|
||||
struct nf_conntrack_tuple innertuple, origtuple;
|
||||
const struct nf_conntrack_l4proto *innerproto;
|
||||
const struct nf_conntrack_tuple_hash *h;
|
||||
u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
|
||||
|
||||
NF_CT_ASSERT(skb->nfct == NULL);
|
||||
|
||||
/* Are they talking about one of our connections? */
|
||||
if (!nf_ct_get_tuplepr(skb,
|
||||
skb_network_offset(skb) + ip_hdrlen(skb)
|
||||
+ sizeof(struct icmphdr),
|
||||
PF_INET, &origtuple)) {
|
||||
pr_debug("icmp_error_message: failed to get tuple\n");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* rcu_read_lock()ed by nf_hook_slow */
|
||||
innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
|
||||
|
||||
/* Ordinarily, we'd expect the inverted tupleproto, but it's
|
||||
been preserved inside the ICMP. */
|
||||
if (!nf_ct_invert_tuple(&innertuple, &origtuple,
|
||||
&nf_conntrack_l3proto_ipv4, innerproto)) {
|
||||
pr_debug("icmp_error_message: no match\n");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
*ctinfo = IP_CT_RELATED;
|
||||
|
||||
h = nf_conntrack_find_get(net, zone, &innertuple);
|
||||
if (!h) {
|
||||
pr_debug("icmp_error_message: no match\n");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
|
||||
*ctinfo += IP_CT_IS_REPLY;
|
||||
|
||||
/* Update skb to refer to this connection */
|
||||
skb->nfct = &nf_ct_tuplehash_to_ctrack(h)->ct_general;
|
||||
skb->nfctinfo = *ctinfo;
|
||||
return NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* Small and modified version of icmp_rcv */
|
||||
static int
|
||||
icmp_error(struct net *net, struct nf_conn *tmpl,
|
||||
struct sk_buff *skb, unsigned int dataoff,
|
||||
enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
|
||||
{
|
||||
const struct icmphdr *icmph;
|
||||
struct icmphdr _ih;
|
||||
|
||||
/* Not enough header? */
|
||||
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih);
|
||||
if (icmph == NULL) {
|
||||
if (LOG_INVALID(net, IPPROTO_ICMP))
|
||||
nf_log_packet(net, PF_INET, 0, skb, NULL, NULL,
|
||||
NULL, "nf_ct_icmp: short packet ");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* See ip_conntrack_proto_tcp.c */
|
||||
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
|
||||
nf_ip_checksum(skb, hooknum, dataoff, 0)) {
|
||||
if (LOG_INVALID(net, IPPROTO_ICMP))
|
||||
nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
|
||||
"nf_ct_icmp: bad HW ICMP checksum ");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
/*
|
||||
* 18 is the highest 'known' ICMP type. Anything else is a mystery
|
||||
*
|
||||
* RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
|
||||
* discarded.
|
||||
*/
|
||||
if (icmph->type > NR_ICMP_TYPES) {
|
||||
if (LOG_INVALID(net, IPPROTO_ICMP))
|
||||
nf_log_packet(net, PF_INET, 0, skb, NULL, NULL, NULL,
|
||||
"nf_ct_icmp: invalid ICMP type ");
|
||||
return -NF_ACCEPT;
|
||||
}
|
||||
|
||||
/* Need to track icmp error message? */
|
||||
if (icmph->type != ICMP_DEST_UNREACH &&
|
||||
icmph->type != ICMP_SOURCE_QUENCH &&
|
||||
icmph->type != ICMP_TIME_EXCEEDED &&
|
||||
icmph->type != ICMP_PARAMETERPROB &&
|
||||
icmph->type != ICMP_REDIRECT)
|
||||
return NF_ACCEPT;
|
||||
|
||||
return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
|
||||
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

/* Serialize the ICMP part of a tuple into netlink attributes. */
static int icmp_tuple_to_nlattr(struct sk_buff *skb,
				const struct nf_conntrack_tuple *t)
{
	if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) ||
	    nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) ||
	    nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}

static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_ICMP_TYPE]	= { .type = NLA_U8 },
	[CTA_PROTO_ICMP_CODE]	= { .type = NLA_U8 },
	[CTA_PROTO_ICMP_ID]	= { .type = NLA_U16 },
};

/* Parse netlink attributes back into a tuple; all three attributes
 * are mandatory and the type must be an invertible one. */
static int icmp_nlattr_to_tuple(struct nlattr *tb[],
				struct nf_conntrack_tuple *tuple)
{
	if (!tb[CTA_PROTO_ICMP_TYPE] ||
	    !tb[CTA_PROTO_ICMP_CODE] ||
	    !tb[CTA_PROTO_ICMP_ID])
		return -EINVAL;

	tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
	tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
	tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);

	if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
	    !invmap[tuple->dst.u.icmp.type])
		return -EINVAL;

	return 0;
}

static int icmp_nlattr_tuple_size(void)
{
	return nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1);
}
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_cttimeout.h>

/* Read a cttimeout policy: use the attribute if present (seconds ->
 * jiffies), otherwise fall back to this namespace's default. */
static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
				      struct net *net, void *data)
{
	unsigned int *timeout = data;
	struct nf_icmp_net *in = icmp_pernet(net);

	if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
		*timeout =
			ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
	} else {
		/* Set default ICMP timeout. */
		*timeout = in->timeout;
	}
	return 0;
}

/* Dump the timeout back to userspace in seconds. */
static int
icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
{
	const unsigned int *timeout = data;

	if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ)))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -ENOSPC;
}

static const struct nla_policy
icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = {
	[CTA_TIMEOUT_ICMP_TIMEOUT]	= { .type = NLA_U32 },
};
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
|
||||
|
||||
#ifdef CONFIG_SYSCTL
/* Template sysctl table; .data is filled in per-namespace by
 * icmp_kmemdup_sysctl_table(). */
static struct ctl_table icmp_sysctl_table[] = {
	{
		.procname	= "nf_conntrack_icmp_timeout",
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
/* Same knob under the legacy ip_conntrack_* name. */
static struct ctl_table icmp_compat_sysctl_table[] = {
	{
		.procname	= "ip_conntrack_icmp_timeout",
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
/* Duplicate the sysctl template for a namespace and aim its single
 * entry at that namespace's timeout value. */
static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn,
				     struct nf_icmp_net *in)
{
#ifdef CONFIG_SYSCTL
	pn->ctl_table = kmemdup(icmp_sysctl_table,
				sizeof(icmp_sysctl_table), GFP_KERNEL);
	if (!pn->ctl_table)
		return -ENOMEM;

	pn->ctl_table[0].data = &in->timeout;
#endif
	return 0;
}

/* Same as above for the legacy-named compat table. */
static int icmp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
					    struct nf_icmp_net *in)
{
#ifdef CONFIG_SYSCTL
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
	pn->ctl_compat_table = kmemdup(icmp_compat_sysctl_table,
				       sizeof(icmp_compat_sysctl_table),
				       GFP_KERNEL);
	if (!pn->ctl_compat_table)
		return -ENOMEM;

	pn->ctl_compat_table[0].data = &in->timeout;
#endif
#endif
	return 0;
}
|
||||
|
||||
static int icmp_init_net(struct net *net, u_int16_t proto)
|
||||
{
|
||||
int ret;
|
||||
struct nf_icmp_net *in = icmp_pernet(net);
|
||||
struct nf_proto_net *pn = &in->pn;
|
||||
|
||||
in->timeout = nf_ct_icmp_timeout;
|
||||
|
||||
ret = icmp_kmemdup_compat_sysctl_table(pn, in);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = icmp_kmemdup_sysctl_table(pn, in);
|
||||
if (ret < 0)
|
||||
nf_ct_kfree_compat_sysctl_table(pn);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct nf_proto_net *icmp_get_net_proto(struct net *net)
|
||||
{
|
||||
return &net->ct.nf_ct_proto.icmp.pn;
|
||||
}
|
||||
|
||||
/* L4 protocol descriptor wiring the ICMP handlers above into the
 * conntrack core.  Netlink attribute translation and userspace-supplied
 * timeouts are only compiled in when the respective options are set.
 */
struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp __read_mostly =
{
	.l3proto		= PF_INET,
	.l4proto		= IPPROTO_ICMP,
	.name			= "icmp",
	.pkt_to_tuple		= icmp_pkt_to_tuple,
	.invert_tuple		= icmp_invert_tuple,
	.print_tuple		= icmp_print_tuple,
	.packet			= icmp_packet,
	.get_timeouts		= icmp_get_timeouts,
	.new			= icmp_new,
	.error			= icmp_error,
	.destroy		= NULL,	/* no per-conntrack teardown needed */
	.me			= NULL,	/* built-in: never module-owned */
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.tuple_to_nlattr	= icmp_tuple_to_nlattr,
	.nlattr_tuple_size	= icmp_nlattr_tuple_size,
	.nlattr_to_tuple	= icmp_nlattr_to_tuple,
	.nla_policy		= icmp_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
	.ctnl_timeout		= {
		.nlattr_to_obj	= icmp_timeout_nlattr_to_obj,
		.obj_to_nlattr	= icmp_timeout_obj_to_nlattr,
		.nlattr_max	= CTA_TIMEOUT_ICMP_MAX,
		.obj_size	= sizeof(unsigned int),
		.nla_policy	= icmp_timeout_nla_policy,
	},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
	.init_net		= icmp_init_net,
	.get_net_proto		= icmp_get_net_proto,
};
|
||||
131
net/ipv4/netfilter/nf_defrag_ipv4.c
Normal file
131
net/ipv4/netfilter/nf_defrag_ipv4.c
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/route.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#include <linux/netfilter_bridge.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#endif
|
||||
#include <net/netfilter/nf_conntrack_zones.h>
|
||||
|
||||
/* Reassemble an IPv4 fragment queue for conntrack.  Returns the
 * ip_defrag() result: 0 once the datagram is complete (skb now holds the
 * full packet), non-zero while fragments are still queued or on error.
 */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
{
	int err;

	skb_orphan(skb);

	/* ip_defrag() must run with BHs off (fragment queues are
	 * softirq-protected). */
	local_bh_disable();
	err = ip_defrag(skb, user);
	local_bh_enable();

	if (!err) {
		/* Header was rewritten during reassembly: fix checksum and
		 * allow the now-larger packet to be fragmented again. */
		ip_send_check(ip_hdr(skb));
		skb->ignore_df = 1;
	}

	return err;
}
|
||||
|
||||
/* Pick the defragmentation "user" identity for this packet so fragments
 * belonging to different conntrack zones / hook points never share a
 * reassembly queue.  The zone id is folded into the enum value.
 */
static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
					      struct sk_buff *skb)
{
	u16 zone = NF_CT_DEFAULT_ZONE;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (skb->nfct)
		zone = nf_ct_zone((struct nf_conn *)skb->nfct);
#endif

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	/* Bridged traffic gets its own queues, keyed off PREROUTING. */
	if (skb->nf_bridge &&
	    skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
#endif
	if (hooknum == NF_INET_PRE_ROUTING)
		return IP_DEFRAG_CONNTRACK_IN + zone;
	else
		return IP_DEFRAG_CONNTRACK_OUT + zone;
}
|
||||
|
||||
/* Netfilter hook: reassemble IPv4 fragments before conntrack sees them.
 * Returns NF_ACCEPT to continue traversal, or NF_STOLEN while the
 * fragment has been queued for reassembly (the skb is consumed).
 */
static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops,
					  struct sk_buff *skb,
					  const struct net_device *in,
					  const struct net_device *out,
					  int (*okfn)(struct sk_buff *))
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(skb->sk);

	/* Locally generated packets may opt out via IP_NODEFRAG. */
	if (sk && (sk->sk_family == PF_INET) &&
	    inet->nodefrag)
		return NF_ACCEPT;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#if !IS_ENABLED(CONFIG_NF_NAT)
	/* Previously seen (loopback)? Ignore. Do this before
	   fragment check. */
	if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
		return NF_ACCEPT;
#endif
#endif
	/* Gather fragments. */
	if (ip_is_fragment(ip_hdr(skb))) {
		enum ip_defrag_users user =
			nf_ct_defrag_user(ops->hooknum, skb);

		/* Non-zero: skb queued (or dropped) by the defragmenter. */
		if (nf_ct_ipv4_gather_frags(skb, user))
			return NF_STOLEN;
	}
	return NF_ACCEPT;
}
|
||||
|
||||
/* Hook registrations: defragment both inbound (PRE_ROUTING) and locally
 * generated (LOCAL_OUT) traffic, at the dedicated DEFRAG priority so it
 * runs before conntrack itself. */
static struct nf_hook_ops ipv4_defrag_ops[] = {
	{
		.hook		= ipv4_conntrack_defrag,
		.owner		= THIS_MODULE,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
	},
	{
		.hook           = ipv4_conntrack_defrag,
		.owner		= THIS_MODULE,
		.pf             = NFPROTO_IPV4,
		.hooknum        = NF_INET_LOCAL_OUT,
		.priority       = NF_IP_PRI_CONNTRACK_DEFRAG,
	},
};
|
||||
|
||||
/* Module init: register both defrag hooks; propagates any error. */
static int __init nf_defrag_init(void)
{
	return nf_register_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
}
|
||||
|
||||
/* Module exit: remove the defrag hooks registered at init. */
static void __exit nf_defrag_fini(void)
{
	nf_unregister_hooks(ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops));
}
|
||||
|
||||
/* Exported no-op: callers (conntrack/NAT) invoke this to express a
 * dependency on this module so the hooks above get loaded; the hooks
 * themselves are registered unconditionally at module init. */
void nf_defrag_ipv4_enable(void)
{
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
|
||||
|
||||
module_init(nf_defrag_init);
|
||||
module_exit(nf_defrag_fini);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
149
net/ipv4/netfilter/nf_log_arp.c
Normal file
149
net/ipv4/netfilter/nf_log_arp.c
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
/*
|
||||
* (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
|
||||
*
|
||||
* Based on code from ebt_log from:
|
||||
*
|
||||
* Bart De Schuymer <bdschuym@pandora.be>
|
||||
* Harald Welte <laforge@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/route.h>
|
||||
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter/xt_LOG.h>
|
||||
#include <net/netfilter/nf_log.h>
|
||||
|
||||
/* Logging defaults used when the caller passes no nf_loginfo:
 * KERN_NOTICE level (5) with all log flags enabled. */
static struct nf_loginfo default_loginfo = {
	.type	= NF_LOG_TYPE_LOG,
	.u = {
		.log = {
			.level	  = 5,
			.logflags = NF_LOG_MASK,
		},
	},
};
|
||||
|
||||
/* On-the-wire layout of an Ethernet/IPv4 ARP payload (the part after
 * struct arphdr): sender and target hardware/protocol addresses. */
struct arppayload {
	unsigned char mac_src[ETH_ALEN];
	unsigned char ip_src[4];
	unsigned char mac_dst[ETH_ALEN];
	unsigned char ip_dst[4];
};
|
||||
|
||||
static void dump_arp_packet(struct nf_log_buf *m,
|
||||
const struct nf_loginfo *info,
|
||||
const struct sk_buff *skb, unsigned int nhoff)
|
||||
{
|
||||
const struct arphdr *ah;
|
||||
struct arphdr _arph;
|
||||
const struct arppayload *ap;
|
||||
struct arppayload _arpp;
|
||||
|
||||
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
|
||||
if (ah == NULL) {
|
||||
nf_log_buf_add(m, "TRUNCATED");
|
||||
return;
|
||||
}
|
||||
nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
|
||||
ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
|
||||
|
||||
/* If it's for Ethernet and the lengths are OK, then log the ARP
|
||||
* payload.
|
||||
*/
|
||||
if (ah->ar_hrd != htons(1) ||
|
||||
ah->ar_hln != ETH_ALEN ||
|
||||
ah->ar_pln != sizeof(__be32))
|
||||
return;
|
||||
|
||||
ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
|
||||
if (ap == NULL) {
|
||||
nf_log_buf_add(m, " INCOMPLETE [%Zu bytes]",
|
||||
skb->len - sizeof(_arph));
|
||||
return;
|
||||
}
|
||||
nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
|
||||
ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
|
||||
}
|
||||
|
||||
/* nf_logger entry point for ARP: format the packet header line plus the
 * decoded ARP header into a log buffer and flush it.  Only runs in the
 * initial namespace for now (no per-container syslog yet).
 */
void nf_log_arp_packet(struct net *net, u_int8_t pf,
		       unsigned int hooknum, const struct sk_buff *skb,
		       const struct net_device *in,
		       const struct net_device *out,
		       const struct nf_loginfo *loginfo,
		       const char *prefix)
{
	struct nf_log_buf *m;

	/* FIXME: Disabled from containers until syslog ns is supported */
	if (!net_eq(net, &init_net))
		return;

	m = nf_log_buf_open();

	if (!loginfo)
		loginfo = &default_loginfo;

	nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
				  prefix);
	dump_arp_packet(m, loginfo, skb, 0);

	nf_log_buf_close(m);
}
|
||||
|
||||
/* Logger descriptor registered for NFPROTO_ARP. */
static struct nf_logger nf_arp_logger __read_mostly = {
	.name		= "nf_log_arp",
	.type		= NF_LOG_TYPE_LOG,
	.logfn		= nf_log_arp_packet,
	.me		= THIS_MODULE,
};
|
||||
|
||||
/* Per-netns init: make this logger the active one for ARP in 'net'. */
static int __net_init nf_log_arp_net_init(struct net *net)
{
	nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
	return 0;
}
|
||||
|
||||
/* Per-netns teardown: detach this logger from 'net'. */
static void __net_exit nf_log_arp_net_exit(struct net *net)
{
	nf_log_unset(net, &nf_arp_logger);
}
|
||||
|
||||
/* Pernet hooks wiring the per-namespace set/unset above. */
static struct pernet_operations nf_log_arp_net_ops = {
	.init = nf_log_arp_net_init,
	.exit = nf_log_arp_net_exit,
};
|
||||
|
||||
/* Module init: register the pernet operations, then the ARP logger.
 * Original code ignored nf_log_register()'s return value; on failure the
 * pernet subsys stayed registered and init reported success.  Check the
 * result and unwind so the module loads cleanly or not at all.
 */
static int __init nf_log_arp_init(void)
{
	int ret;

	ret = register_pernet_subsys(&nf_log_arp_net_ops);
	if (ret < 0)
		return ret;

	ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
	if (ret < 0)
		unregister_pernet_subsys(&nf_log_arp_net_ops);

	return ret;
}
|
||||
|
||||
/* Module exit: tear down in reverse — pernet hooks, then the logger. */
static void __exit nf_log_arp_exit(void)
{
	unregister_pernet_subsys(&nf_log_arp_net_ops);
	nf_log_unregister(&nf_arp_logger);
}
|
||||
|
||||
module_init(nf_log_arp_init);
|
||||
module_exit(nf_log_arp_exit);
|
||||
|
||||
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Netfilter ARP packet logging");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_NF_LOGGER(3, 0);
|
||||
385
net/ipv4/netfilter/nf_log_ipv4.c
Normal file
385
net/ipv4/netfilter/nf_log_ipv4.c
Normal file
|
|
@ -0,0 +1,385 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/ip.h>
|
||||
#include <net/ipv6.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/route.h>
|
||||
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter/xt_LOG.h>
|
||||
#include <net/netfilter/nf_log.h>
|
||||
|
||||
/* Logging defaults used when the caller passes no nf_loginfo:
 * KERN_NOTICE level (5) with all log flags enabled. */
static struct nf_loginfo default_loginfo = {
	.type	= NF_LOG_TYPE_LOG,
	.u = {
		.log = {
			.level	  = 5,
			.logflags = NF_LOG_MASK,
		},
	},
};
|
||||
|
||||
/* One level of recursion won't kill us */
/* Decode one IPv4 header (at offset 'iphoff' into the skb) plus its
 * transport header into the log buffer 'm'.  For ICMP errors the quoted
 * inner packet is dumped by recursing once (guarded by !iphoff).
 * Truncated data at any stage is reported rather than dumped.
 */
static void dump_ipv4_packet(struct nf_log_buf *m,
			     const struct nf_loginfo *info,
			     const struct sk_buff *skb, unsigned int iphoff)
{
	struct iphdr _iph;
	const struct iphdr *ih;
	unsigned int logflags;

	if (info->type == NF_LOG_TYPE_LOG)
		logflags = info->u.log.logflags;
	else
		logflags = NF_LOG_MASK;

	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
	if (ih == NULL) {
		nf_log_buf_add(m, "TRUNCATED");
		return;
	}

	/* Important fields:
	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
	nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);

	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
	nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
		       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
		       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));

	/* Max length: 6 "CE DF MF " */
	if (ntohs(ih->frag_off) & IP_CE)
		nf_log_buf_add(m, "CE ");
	if (ntohs(ih->frag_off) & IP_DF)
		nf_log_buf_add(m, "DF ");
	if (ntohs(ih->frag_off) & IP_MF)
		nf_log_buf_add(m, "MF ");

	/* Max length: 11 "FRAG:65535 " */
	if (ntohs(ih->frag_off) & IP_OFFSET)
		nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);

	/* Optionally hex-dump IP options when present. */
	if ((logflags & XT_LOG_IPOPT) &&
	    ih->ihl * 4 > sizeof(struct iphdr)) {
		const unsigned char *op;
		unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
		unsigned int i, optsize;

		optsize = ih->ihl * 4 - sizeof(struct iphdr);
		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
					optsize, _opt);
		if (op == NULL) {
			nf_log_buf_add(m, "TRUNCATED");
			return;
		}

		/* Max length: 127 "OPT (" 15*4*2chars ") " */
		nf_log_buf_add(m, "OPT (");
		for (i = 0; i < optsize; i++)
			nf_log_buf_add(m, "%02X", op[i]);
		nf_log_buf_add(m, ") ");
	}

	switch (ih->protocol) {
	case IPPROTO_TCP:
		if (nf_log_dump_tcp_header(m, skb, ih->protocol,
					   ntohs(ih->frag_off) & IP_OFFSET,
					   iphoff+ih->ihl*4, logflags))
			return;
		break;
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		if (nf_log_dump_udp_header(m, skb, ih->protocol,
					   ntohs(ih->frag_off) & IP_OFFSET,
					   iphoff+ih->ihl*4))
			return;
		break;
	case IPPROTO_ICMP: {
		struct icmphdr _icmph;
		const struct icmphdr *ich;
		/* Minimum ICMP message length per type; 0 = no minimum. */
		static const size_t required_len[NR_ICMP_TYPES+1]
			= { [ICMP_ECHOREPLY] = 4,
			    [ICMP_DEST_UNREACH]
			    = 8 + sizeof(struct iphdr),
			    [ICMP_SOURCE_QUENCH]
			    = 8 + sizeof(struct iphdr),
			    [ICMP_REDIRECT]
			    = 8 + sizeof(struct iphdr),
			    [ICMP_ECHO] = 4,
			    [ICMP_TIME_EXCEEDED]
			    = 8 + sizeof(struct iphdr),
			    [ICMP_PARAMETERPROB]
			    = 8 + sizeof(struct iphdr),
			    [ICMP_TIMESTAMP] = 20,
			    [ICMP_TIMESTAMPREPLY] = 20,
			    [ICMP_ADDRESS] = 12,
			    [ICMP_ADDRESSREPLY] = 12 };

		/* Max length: 11 "PROTO=ICMP " */
		nf_log_buf_add(m, "PROTO=ICMP ");

		/* Non-first fragments carry no ICMP header. */
		if (ntohs(ih->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
					 sizeof(_icmph), &_icmph);
		if (ich == NULL) {
			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
				       skb->len - iphoff - ih->ihl*4);
			break;
		}

		/* Max length: 18 "TYPE=255 CODE=255 " */
		nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		if (ich->type <= NR_ICMP_TYPES &&
		    required_len[ich->type] &&
		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
				       skb->len - iphoff - ih->ihl*4);
			break;
		}

		switch (ich->type) {
		case ICMP_ECHOREPLY:
		case ICMP_ECHO:
			/* Max length: 19 "ID=65535 SEQ=65535 " */
			nf_log_buf_add(m, "ID=%u SEQ=%u ",
				       ntohs(ich->un.echo.id),
				       ntohs(ich->un.echo.sequence));
			break;

		case ICMP_PARAMETERPROB:
			/* Max length: 14 "PARAMETER=255 " */
			nf_log_buf_add(m, "PARAMETER=%u ",
				       ntohl(ich->un.gateway) >> 24);
			break;
		case ICMP_REDIRECT:
			/* Max length: 24 "GATEWAY=255.255.255.255 " */
			nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
			/* Fall through */
		case ICMP_DEST_UNREACH:
		case ICMP_SOURCE_QUENCH:
		case ICMP_TIME_EXCEEDED:
			/* Max length: 3+maxlen */
			if (!iphoff) { /* Only recurse once. */
				nf_log_buf_add(m, "[");
				dump_ipv4_packet(m, info, skb,
					    iphoff + ih->ihl*4+sizeof(_icmph));
				nf_log_buf_add(m, "] ");
			}

			/* Max length: 10 "MTU=65535 " */
			if (ich->type == ICMP_DEST_UNREACH &&
			    ich->code == ICMP_FRAG_NEEDED) {
				nf_log_buf_add(m, "MTU=%u ",
					       ntohs(ich->un.frag.mtu));
			}
		}
		break;
	}
	/* Max Length */
	case IPPROTO_AH: {
		struct ip_auth_hdr _ahdr;
		const struct ip_auth_hdr *ah;

		if (ntohs(ih->frag_off) & IP_OFFSET)
			break;

		/* Max length: 9 "PROTO=AH " */
		nf_log_buf_add(m, "PROTO=AH ");

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
					sizeof(_ahdr), &_ahdr);
		if (ah == NULL) {
			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
				       skb->len - iphoff - ih->ihl*4);
			break;
		}

		/* Length: 15 "SPI=0xF1234567 " */
		nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
		break;
	}
	case IPPROTO_ESP: {
		struct ip_esp_hdr _esph;
		const struct ip_esp_hdr *eh;

		/* Max length: 10 "PROTO=ESP " */
		nf_log_buf_add(m, "PROTO=ESP ");

		if (ntohs(ih->frag_off) & IP_OFFSET)
			break;

		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
					sizeof(_esph), &_esph);
		if (eh == NULL) {
			nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
				       skb->len - iphoff - ih->ihl*4);
			break;
		}

		/* Length: 15 "SPI=0xF1234567 " */
		nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
		break;
	}
	/* Max length: 10 "PROTO 255 " */
	default:
		nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
	}

	/* Max length: 15 "UID=4294967295 " */
	if ((logflags & XT_LOG_UID) && !iphoff)
		nf_log_dump_sk_uid_gid(m, skb->sk);

	/* Max length: 16 "MARK=0xFFFFFFFF " */
	if (!iphoff && skb->mark)
		nf_log_buf_add(m, "MARK=0x%x ", skb->mark);

	/* Proto Max log string length */
	/* IP: 40+46+6+11+127 = 230 */
	/* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
	/* UDP: 10+max(25,20) = 35 */
	/* UDPLITE: 14+max(25,20) = 39 */
	/* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
	/* ESP: 10+max(25)+15 = 50 */
	/* AH: 9+max(25)+15 = 49 */
	/* unknown: 10 */

	/* (ICMP allows recursion one level deep) */
	/* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
	/* maxlen = 230+ 91 + 230 + 252 = 803 */
}
|
||||
|
||||
/* Append the link-layer header to the log buffer.  With XT_LOG_MACDECODE
 * set, known device types (Ethernet) get decoded fields; otherwise the
 * raw header bytes are dumped as colon-separated hex.
 */
static void dump_ipv4_mac_header(struct nf_log_buf *m,
				 const struct nf_loginfo *info,
				 const struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	unsigned int logflags = 0;

	if (info->type == NF_LOG_TYPE_LOG)
		logflags = info->u.log.logflags;

	if (!(logflags & XT_LOG_MACDECODE))
		goto fallback;

	switch (dev->type) {
	case ARPHRD_ETHER:
		nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
			       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
			       ntohs(eth_hdr(skb)->h_proto));
		return;
	default:
		break;
	}

fallback:
	/* Raw hex dump; only possible when a MAC header was actually set. */
	nf_log_buf_add(m, "MAC=");
	if (dev->hard_header_len &&
	    skb->mac_header != skb->network_header) {
		const unsigned char *p = skb_mac_header(skb);
		unsigned int i;

		nf_log_buf_add(m, "%02x", *p++);
		for (i = 1; i < dev->hard_header_len; i++, p++)
			nf_log_buf_add(m, ":%02x", *p);
	}
	nf_log_buf_add(m, " ");
}
|
||||
|
||||
/* nf_logger entry point for IPv4: common header line, optional MAC
 * header (only when an input device exists), then the full IPv4 dump.
 * Only runs in the initial namespace for now.
 */
static void nf_log_ip_packet(struct net *net, u_int8_t pf,
			     unsigned int hooknum, const struct sk_buff *skb,
			     const struct net_device *in,
			     const struct net_device *out,
			     const struct nf_loginfo *loginfo,
			     const char *prefix)
{
	struct nf_log_buf *m;

	/* FIXME: Disabled from containers until syslog ns is supported */
	if (!net_eq(net, &init_net))
		return;

	m = nf_log_buf_open();

	if (!loginfo)
		loginfo = &default_loginfo;

	nf_log_dump_packet_common(m, pf, hooknum, skb, in,
				  out, loginfo, prefix);

	if (in != NULL)
		dump_ipv4_mac_header(m, loginfo, skb);

	dump_ipv4_packet(m, loginfo, skb, 0);

	nf_log_buf_close(m);
}
|
||||
|
||||
/* Logger descriptor registered for NFPROTO_IPV4. */
static struct nf_logger nf_ip_logger __read_mostly = {
	.name		= "nf_log_ipv4",
	.type		= NF_LOG_TYPE_LOG,
	.logfn		= nf_log_ip_packet,
	.me		= THIS_MODULE,
};
|
||||
|
||||
/* Per-netns init: make this logger the active one for IPv4 in 'net'. */
static int __net_init nf_log_ipv4_net_init(struct net *net)
{
	nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
	return 0;
}
|
||||
|
||||
/* Per-netns teardown: detach this logger from 'net'. */
static void __net_exit nf_log_ipv4_net_exit(struct net *net)
{
	nf_log_unset(net, &nf_ip_logger);
}
|
||||
|
||||
/* Pernet hooks wiring the per-namespace set/unset above. */
static struct pernet_operations nf_log_ipv4_net_ops = {
	.init = nf_log_ipv4_net_init,
	.exit = nf_log_ipv4_net_exit,
};
|
||||
|
||||
/* Module init: register the pernet operations, then the IPv4 logger.
 * Original code ignored nf_log_register()'s return value; on failure the
 * pernet subsys stayed registered and init reported success.  Check the
 * result and unwind so the module loads cleanly or not at all.
 */
static int __init nf_log_ipv4_init(void)
{
	int ret;

	ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
	if (ret < 0)
		return ret;

	ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
	if (ret < 0)
		unregister_pernet_subsys(&nf_log_ipv4_net_ops);

	return ret;
}
|
||||
|
||||
/* Module exit: tear down in reverse — pernet hooks, then the logger. */
static void __exit nf_log_ipv4_exit(void)
{
	unregister_pernet_subsys(&nf_log_ipv4_net_ops);
	nf_log_unregister(&nf_ip_logger);
}
|
||||
|
||||
module_init(nf_log_ipv4_init);
|
||||
module_exit(nf_log_ipv4_exit);
|
||||
|
||||
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
|
||||
MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
|
||||
631
net/ipv4/netfilter/nf_nat_h323.c
Normal file
631
net/ipv4/netfilter/nf_nat_h323.c
Normal file
|
|
@ -0,0 +1,631 @@
|
|||
/*
|
||||
* H.323 extension for NAT alteration.
|
||||
*
|
||||
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
|
||||
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This source code is licensed under General Public License version 2.
|
||||
*
|
||||
* Based on the 'brute force' H.323 NAT module by
|
||||
* Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_helper.h>
|
||||
#include <net/netfilter/nf_conntrack_helper.h>
|
||||
#include <net/netfilter/nf_conntrack_expect.h>
|
||||
#include <linux/netfilter/nf_conntrack_h323.h>
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Overwrite an (ip, port) pair embedded at 'addroff' within the H.323
 * signalling payload and re-point *data at the (possibly relocated)
 * payload afterwards.  Works for both TCP and UDP carriers.
 * Returns 0 on success, -1 on mangling failure or truncated header.
 */
static int set_addr(struct sk_buff *skb, unsigned int protoff,
		    unsigned char **data, int dataoff,
		    unsigned int addroff, __be32 ip, __be16 port)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
	/* Wire format of the replacement: address then port, packed. */
	struct {
		__be32 ip;
		__be16 port;
	} __attribute__ ((__packed__)) buf;
	const struct tcphdr *th;
	struct tcphdr _tcph;

	buf.ip = ip;
	buf.port = port;
	addroff += dataoff;

	if (ip_hdr(skb)->protocol == IPPROTO_TCP) {
		if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
					      protoff, addroff, sizeof(buf),
					      (char *) &buf, sizeof(buf))) {
			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_tcp_packet error\n");
			return -1;
		}

		/* Relocate data pointer */
		th = skb_header_pointer(skb, ip_hdrlen(skb),
					sizeof(_tcph), &_tcph);
		if (th == NULL)
			return -1;
		*data = skb->data + ip_hdrlen(skb) + th->doff * 4 + dataoff;
	} else {
		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
					      protoff, addroff, sizeof(buf),
					      (char *) &buf, sizeof(buf))) {
			net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
			return -1;
		}
		/* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
		 * or pull everything in a linear buffer, so we can safely
		 * use the skb pointers now */
		*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
	}

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Rewrite an H.225 TransportAddress in the payload with (addr, port). */
static int set_h225_addr(struct sk_buff *skb, unsigned int protoff,
			 unsigned char **data, int dataoff,
			 TransportAddress *taddr,
			 union nf_inet_addr *addr, __be16 port)
{
	return set_addr(skb, protoff, data, dataoff, taddr->ipAddress.ip,
			addr->ip, port);
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Rewrite an H.245 unicast TransportAddress in the payload with
 * (addr, port). */
static int set_h245_addr(struct sk_buff *skb, unsigned protoff,
			 unsigned char **data, int dataoff,
			 H245_TransportAddress *taddr,
			 union nf_inet_addr *addr, __be16 port)
{
	return set_addr(skb, protoff, data, dataoff,
			taddr->unicastAddress.iPAddress.network,
			addr->ip, port);
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* NAT the H.225 signalling addresses in a RAS message: any address that
 * matches this connection's original source (GW->GK direction) or
 * destination (GK->GW direction) plus the recorded signalling port is
 * replaced with the translated tuple.  Returns set_h225_addr()'s result
 * for the first match, or 0 if nothing needed rewriting.
 */
static int set_sig_addr(struct sk_buff *skb, struct nf_conn *ct,
			enum ip_conntrack_info ctinfo,
			unsigned int protoff, unsigned char **data,
			TransportAddress *taddr, int count)
{
	const struct nf_ct_h323_master *info = nfct_help_data(ct);
	int dir = CTINFO2DIR(ctinfo);
	int i;
	__be16 port;
	union nf_inet_addr addr;

	for (i = 0; i < count; i++) {
		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port)) {
			if (addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
			    port == info->sig_port[dir]) {
				/* GW->GK */

				/* Fix for Gnomemeeting */
				/* If entry 0 is a loopback address, rewrite
				 * that one instead of the matched entry. */
				if (i > 0 &&
				    get_h225_addr(ct, *data, &taddr[0],
						  &addr, &port) &&
				    (ntohl(addr.ip) & 0xff000000) == 0x7f000000)
					i = 0;

				pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
					 &addr.ip, port,
					 &ct->tuplehash[!dir].tuple.dst.u3.ip,
					 info->sig_port[!dir]);
				return set_h225_addr(skb, protoff, data, 0,
						     &taddr[i],
						     &ct->tuplehash[!dir].
						     tuple.dst.u3,
						     info->sig_port[!dir]);
			} else if (addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
				   port == info->sig_port[dir]) {
				/* GK->GW */
				pr_debug("nf_nat_ras: set signal address %pI4:%hu->%pI4:%hu\n",
					 &addr.ip, port,
					 &ct->tuplehash[!dir].tuple.src.u3.ip,
					 info->sig_port[!dir]);
				return set_h225_addr(skb, protoff, data, 0,
						     &taddr[i],
						     &ct->tuplehash[!dir].
						     tuple.src.u3,
						     info->sig_port[!dir]);
			}
		}
	}

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* NAT the rasAddress entries in a RAS message: replace any address that
 * matches this connection's original UDP source tuple with the reply
 * direction's destination tuple.  Returns set_h225_addr()'s result for
 * the first match, or 0 if nothing needed rewriting.
 */
static int set_ras_addr(struct sk_buff *skb, struct nf_conn *ct,
			enum ip_conntrack_info ctinfo,
			unsigned int protoff, unsigned char **data,
			TransportAddress *taddr, int count)
{
	int dir = CTINFO2DIR(ctinfo);
	int i;
	__be16 port;
	union nf_inet_addr addr;

	for (i = 0; i < count; i++) {
		if (get_h225_addr(ct, *data, &taddr[i], &addr, &port) &&
		    addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
		    port == ct->tuplehash[dir].tuple.src.u.udp.port) {
			pr_debug("nf_nat_ras: set rasAddress %pI4:%hu->%pI4:%hu\n",
				 &addr.ip, ntohs(port),
				 &ct->tuplehash[!dir].tuple.dst.u3.ip,
				 ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port));
			return set_h225_addr(skb, protoff, data, 0, &taddr[i],
					     &ct->tuplehash[!dir].tuple.dst.u3,
					     ct->tuplehash[!dir].tuple.
					     dst.u.udp.port);
		}
	}

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* NAT an RTP/RTCP channel pair announced in H.245: reserve a consecutive
 * even/odd port pair via conntrack expectations, rewrite the H.245
 * transport address to the NATed pair, and remember the mapping in the
 * master conntrack.  Returns 0 on success (including "no ports free",
 * which leaves the packet untranslated) and -1 if the signal rewrite
 * fails.
 */
static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
			enum ip_conntrack_info ctinfo,
			unsigned int protoff, unsigned char **data, int dataoff,
			H245_TransportAddress *taddr,
			__be16 port, __be16 rtp_port,
			struct nf_conntrack_expect *rtp_exp,
			struct nf_conntrack_expect *rtcp_exp)
{
	struct nf_ct_h323_master *info = nfct_help_data(ct);
	int dir = CTINFO2DIR(ctinfo);
	int i;
	u_int16_t nated_port;

	/* Set expectations for NAT */
	rtp_exp->saved_proto.udp.port = rtp_exp->tuple.dst.u.udp.port;
	rtp_exp->expectfn = nf_nat_follow_master;
	rtp_exp->dir = !dir;
	rtcp_exp->saved_proto.udp.port = rtcp_exp->tuple.dst.u.udp.port;
	rtcp_exp->expectfn = nf_nat_follow_master;
	rtcp_exp->dir = !dir;

	/* Lookup existing expects */
	for (i = 0; i < H323_RTP_CHANNEL_MAX; i++) {
		if (info->rtp_port[i][dir] == rtp_port) {
			/* Expected */

			/* Use allocated ports first. This will refresh
			 * the expects */
			rtp_exp->tuple.dst.u.udp.port = info->rtp_port[i][dir];
			rtcp_exp->tuple.dst.u.udp.port =
			    htons(ntohs(info->rtp_port[i][dir]) + 1);
			break;
		} else if (info->rtp_port[i][dir] == 0) {
			/* Not expected */
			break;
		}
	}

	/* Run out of expectations */
	if (i >= H323_RTP_CHANNEL_MAX) {
		net_notice_ratelimited("nf_nat_h323: out of expectations\n");
		return 0;
	}

	/* Try to get a pair of ports. */
	/* RTP must land on an even port with RTCP on the next odd one, so
	 * probe in steps of two; nated_port wrapping to 0 ends the loop. */
	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
	     nated_port != 0; nated_port += 2) {
		int ret;

		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
		ret = nf_ct_expect_related(rtp_exp);
		if (ret == 0) {
			rtcp_exp->tuple.dst.u.udp.port =
			    htons(nated_port + 1);
			ret = nf_ct_expect_related(rtcp_exp);
			if (ret == 0)
				break;
			else if (ret == -EBUSY) {
				/* RTCP port taken: release RTP and retry
				 * with the next pair. */
				nf_ct_unexpect_related(rtp_exp);
				continue;
			} else if (ret < 0) {
				nf_ct_unexpect_related(rtp_exp);
				nated_port = 0;
				break;
			}
		} else if (ret != -EBUSY) {
			nated_port = 0;
			break;
		}
	}

	if (nated_port == 0) {	/* No port available */
		net_notice_ratelimited("nf_nat_h323: out of RTP ports\n");
		return 0;
	}

	/* Modify signal */
	/* Preserve the announced port's odd/even parity when rewriting. */
	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
			  &ct->tuplehash[!dir].tuple.dst.u3,
			  htons((port & htons(1)) ? nated_port + 1 :
						    nated_port)) == 0) {
		/* Save ports */
		info->rtp_port[i][dir] = rtp_port;
		info->rtp_port[i][!dir] = htons(nated_port);
	} else {
		nf_ct_unexpect_related(rtp_exp);
		nf_ct_unexpect_related(rtcp_exp);
		return -1;
	}

	/* Success */
	pr_debug("nf_nat_h323: expect RTP %pI4:%hu->%pI4:%hu\n",
		 &rtp_exp->tuple.src.u3.ip,
		 ntohs(rtp_exp->tuple.src.u.udp.port),
		 &rtp_exp->tuple.dst.u3.ip,
		 ntohs(rtp_exp->tuple.dst.u.udp.port));
	pr_debug("nf_nat_h323: expect RTCP %pI4:%hu->%pI4:%hu\n",
		 &rtcp_exp->tuple.src.u3.ip,
		 ntohs(rtcp_exp->tuple.src.u.udp.port),
		 &rtcp_exp->tuple.dst.u3.ip,
		 ntohs(rtcp_exp->tuple.dst.u.udp.port));

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* NAT a T.120 channel announced in H.245: reserve a TCP port via a
 * conntrack expectation (preferring the original port, falling back to
 * the next free one) and rewrite the H.245 transport address.  Returns 0
 * on success or when no port is available; -1 if the rewrite fails.
 */
static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
		    enum ip_conntrack_info ctinfo,
		    unsigned int protoff, unsigned char **data, int dataoff,
		    H245_TransportAddress *taddr, __be16 port,
		    struct nf_conntrack_expect *exp)
{
	int dir = CTINFO2DIR(ctinfo);
	u_int16_t nated_port = ntohs(port);

	/* Set expectations for NAT */
	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
	exp->expectfn = nf_nat_follow_master;
	exp->dir = !dir;

	/* Try to get same port: if not, try to change it. */
	/* nated_port wrapping to 0 terminates the search. */
	for (; nated_port != 0; nated_port++) {
		int ret;

		exp->tuple.dst.u.tcp.port = htons(nated_port);
		ret = nf_ct_expect_related(exp);
		if (ret == 0)
			break;
		else if (ret != -EBUSY) {
			nated_port = 0;
			break;
		}
	}

	if (nated_port == 0) {	/* No port available */
		net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
		return 0;
	}

	/* Modify signal */
	if (set_h245_addr(skb, protoff, data, dataoff, taddr,
			  &ct->tuplehash[!dir].tuple.dst.u3,
			  htons(nated_port)) < 0) {
		nf_ct_unexpect_related(exp);
		return -1;
	}

	pr_debug("nf_nat_h323: expect T.120 %pI4:%hu->%pI4:%hu\n",
		 &exp->tuple.src.u3.ip,
		 ntohs(exp->tuple.src.u.tcp.port),
		 &exp->tuple.dst.u3.ip,
		 ntohs(exp->tuple.dst.u.tcp.port));

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/* NAT the H.245 control channel address carried in an H.225 message.
 * Reuses an already-established mapping when the signalling port was
 * seen before (info->sig_port), otherwise searches for a free port.
 * Returns 0 on success or when no port is available; -1 if rewriting
 * the payload failed (the expectation is withdrawn).
 */
static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
		    enum ip_conntrack_info ctinfo,
		    unsigned int protoff, unsigned char **data, int dataoff,
		    TransportAddress *taddr, __be16 port,
		    struct nf_conntrack_expect *exp)
{
	struct nf_ct_h323_master *info = nfct_help_data(ct);
	int dir = CTINFO2DIR(ctinfo);
	u_int16_t nated_port = ntohs(port);

	/* Set expectations for NAT */
	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
	exp->expectfn = nf_nat_follow_master;
	exp->dir = !dir;

	/* Check existing expects: keep the previously chosen mapping. */
	if (info->sig_port[dir] == port)
		nated_port = ntohs(info->sig_port[!dir]);

	/* Try to get same port: if not, try to change it. */
	for (; nated_port != 0; nated_port++) {
		int ret;

		exp->tuple.dst.u.tcp.port = htons(nated_port);
		ret = nf_ct_expect_related(exp);
		if (ret == 0)
			break;
		else if (ret != -EBUSY) {
			nated_port = 0;
			break;
		}
	}

	if (nated_port == 0) {	/* No port available */
		net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
		return 0;
	}

	/* Modify signal; on success record the mapping for both directions. */
	if (set_h225_addr(skb, protoff, data, dataoff, taddr,
			  &ct->tuplehash[!dir].tuple.dst.u3,
			  htons(nated_port)) == 0) {
		/* Save ports */
		info->sig_port[dir] = port;
		info->sig_port[!dir] = htons(nated_port);
	} else {
		nf_ct_unexpect_related(exp);
		return -1;
	}

	pr_debug("nf_nat_q931: expect H.245 %pI4:%hu->%pI4:%hu\n",
		 &exp->tuple.src.u3.ip,
		 ntohs(exp->tuple.src.u.tcp.port),
		 &exp->tuple.dst.u3.ip,
		 ntohs(exp->tuple.dst.u.tcp.port));

	return 0;
}
|
||||
|
||||
/****************************************************************************
|
||||
* This conntrack expect function replaces nf_conntrack_q931_expect()
|
||||
* which was set by nf_conntrack_h323.c.
|
||||
****************************************************************************/
|
||||
/****************************************************************************
 * This conntrack expect function replaces nf_conntrack_q931_expect()
 * which was set by nf_conntrack_h323.c.
 *
 * Sets up SRC and DST NAT bindings for the expected Q.931 connection.
 * A non-zero expectation source address means the call comes from a
 * specific gatekeeper, in which case the generic follow-master binding
 * is sufficient.
 ****************************************************************************/
static void ip_nat_q931_expect(struct nf_conn *new,
			       struct nf_conntrack_expect *this)
{
	struct nf_nat_range range;

	if (this->tuple.src.u3.ip != 0) {	/* Only accept calls from GK */
		nf_nat_follow_master(new, this);
		return;
	}

	/* This must be a fresh one. */
	BUG_ON(new->status & IPS_NAT_DONE_MASK);

	/* Change src to where master sends to */
	range.flags = NF_NAT_RANGE_MAP_IPS;
	range.min_addr = range.max_addr =
	    new->tuplehash[!this->dir].tuple.src.u3;
	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);

	/* For DST manip, map port here to where it's expected.
	 * saved_proto holds the original (pre-NAT) port recorded when the
	 * expectation was created. */
	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
	range.min_proto = range.max_proto = this->saved_proto;
	range.min_addr = range.max_addr =
	    new->master->tuplehash[!this->dir].tuple.src.u3;
	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
}
|
||||
|
||||
/****************************************************************************/
|
||||
/* NAT a Q.931 call-signalling address announced in a RAS message.
 * Registers a TCP expectation (resolved by ip_nat_q931_expect) and
 * rewrites the address in the payload. Also contains a workaround for
 * Gnomemeeting, which may announce a 127.x.x.x address in taddr[0].
 * Returns 0 on success or when no free port exists; -1 if the payload
 * mangling failed.
 */
static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
		    enum ip_conntrack_info ctinfo,
		    unsigned int protoff, unsigned char **data,
		    TransportAddress *taddr, int idx,
		    __be16 port, struct nf_conntrack_expect *exp)
{
	struct nf_ct_h323_master *info = nfct_help_data(ct);
	int dir = CTINFO2DIR(ctinfo);
	u_int16_t nated_port = ntohs(port);
	union nf_inet_addr addr;

	/* Set expectations for NAT */
	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
	exp->expectfn = ip_nat_q931_expect;
	exp->dir = !dir;

	/* Check existing expects: reuse a previously chosen mapping. */
	if (info->sig_port[dir] == port)
		nated_port = ntohs(info->sig_port[!dir]);

	/* Try to get same port: if not, try to change it. */
	for (; nated_port != 0; nated_port++) {
		int ret;

		exp->tuple.dst.u.tcp.port = htons(nated_port);
		ret = nf_ct_expect_related(exp);
		if (ret == 0)
			break;
		else if (ret != -EBUSY) {
			nated_port = 0;
			break;
		}
	}

	if (nated_port == 0) {	/* No port available */
		net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
		return 0;
	}

	/* Modify signal */
	if (set_h225_addr(skb, protoff, data, 0, &taddr[idx],
			  &ct->tuplehash[!dir].tuple.dst.u3,
			  htons(nated_port)) == 0) {
		/* Save ports */
		info->sig_port[dir] = port;
		info->sig_port[!dir] = htons(nated_port);

		/* Fix for Gnomemeeting: if the first address entry is a
		 * loopback address (127.0.0.0/8), rewrite it too so the
		 * remote peer gets a routable address. */
		if (idx > 0 &&
		    get_h225_addr(ct, *data, &taddr[0], &addr, &port) &&
		    (ntohl(addr.ip) & 0xff000000) == 0x7f000000) {
			set_h225_addr(skb, protoff, data, 0, &taddr[0],
				      &ct->tuplehash[!dir].tuple.dst.u3,
				      info->sig_port[!dir]);
		}
	} else {
		nf_ct_unexpect_related(exp);
		return -1;
	}

	/* Success */
	pr_debug("nf_nat_ras: expect Q.931 %pI4:%hu->%pI4:%hu\n",
		 &exp->tuple.src.u3.ip,
		 ntohs(exp->tuple.src.u.tcp.port),
		 &exp->tuple.dst.u3.ip,
		 ntohs(exp->tuple.dst.u.tcp.port));

	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Expectfn for call-forwarding channels: SNAT the new connection to the
 * address the master connection talks to, and DNAT it back to the address
 * and port saved when the expectation was created (saved_addr/saved_proto).
 */
static void ip_nat_callforwarding_expect(struct nf_conn *new,
					 struct nf_conntrack_expect *this)
{
	struct nf_nat_range range;

	/* This must be a fresh one. */
	BUG_ON(new->status & IPS_NAT_DONE_MASK);

	/* Change src to where master sends to */
	range.flags = NF_NAT_RANGE_MAP_IPS;
	range.min_addr = range.max_addr =
	    new->tuplehash[!this->dir].tuple.src.u3;
	nf_nat_setup_info(new, &range, NF_NAT_MANIP_SRC);

	/* For DST manip, map port here to where it's expected. */
	range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
	range.min_proto = range.max_proto = this->saved_proto;
	range.min_addr = range.max_addr = this->saved_addr;
	nf_nat_setup_info(new, &range, NF_NAT_MANIP_DST);
}
|
||||
|
||||
/****************************************************************************/
|
||||
static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
|
||||
enum ip_conntrack_info ctinfo,
|
||||
unsigned int protoff,
|
||||
unsigned char **data, int dataoff,
|
||||
TransportAddress *taddr, __be16 port,
|
||||
struct nf_conntrack_expect *exp)
|
||||
{
|
||||
int dir = CTINFO2DIR(ctinfo);
|
||||
u_int16_t nated_port;
|
||||
|
||||
/* Set expectations for NAT */
|
||||
exp->saved_addr = exp->tuple.dst.u3;
|
||||
exp->tuple.dst.u3.ip = ct->tuplehash[!dir].tuple.dst.u3.ip;
|
||||
exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
|
||||
exp->expectfn = ip_nat_callforwarding_expect;
|
||||
exp->dir = !dir;
|
||||
|
||||
/* Try to get same port: if not, try to change it. */
|
||||
for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
|
||||
int ret;
|
||||
|
||||
exp->tuple.dst.u.tcp.port = htons(nated_port);
|
||||
ret = nf_ct_expect_related(exp);
|
||||
if (ret == 0)
|
||||
break;
|
||||
else if (ret != -EBUSY) {
|
||||
nated_port = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (nated_port == 0) { /* No port available */
|
||||
net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Modify signal */
|
||||
if (!set_h225_addr(skb, protoff, data, dataoff, taddr,
|
||||
&ct->tuplehash[!dir].tuple.dst.u3,
|
||||
htons(nated_port)) == 0) {
|
||||
nf_ct_unexpect_related(exp);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Success */
|
||||
pr_debug("nf_nat_q931: expect Call Forwarding %pI4:%hu->%pI4:%hu\n",
|
||||
&exp->tuple.src.u3.ip,
|
||||
ntohs(exp->tuple.src.u.tcp.port),
|
||||
&exp->tuple.dst.u3.ip,
|
||||
ntohs(exp->tuple.dst.u.tcp.port));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Expectfn registrations: let the conntrack helper look up these NAT
 * expect handlers by name. */
static struct nf_ct_helper_expectfn q931_nat = {
	.name		= "Q.931",
	.expectfn	= ip_nat_q931_expect,
};

static struct nf_ct_helper_expectfn callforwarding_nat = {
	.name		= "callforwarding",
	.expectfn	= ip_nat_callforwarding_expect,
};
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Module init: publish the NAT callbacks consumed (via RCU) by the
 * nf_conntrack_h323 helper, and register the named expectfns.
 * The BUG_ONs guard against double-loading. */
static int __init init(void)
{
	BUG_ON(set_h245_addr_hook != NULL);
	BUG_ON(set_h225_addr_hook != NULL);
	BUG_ON(set_sig_addr_hook != NULL);
	BUG_ON(set_ras_addr_hook != NULL);
	BUG_ON(nat_rtp_rtcp_hook != NULL);
	BUG_ON(nat_t120_hook != NULL);
	BUG_ON(nat_h245_hook != NULL);
	BUG_ON(nat_callforwarding_hook != NULL);
	BUG_ON(nat_q931_hook != NULL);

	RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
	RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
	RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
	RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
	RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
	RCU_INIT_POINTER(nat_t120_hook, nat_t120);
	RCU_INIT_POINTER(nat_h245_hook, nat_h245);
	RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
	RCU_INIT_POINTER(nat_q931_hook, nat_q931);
	nf_ct_helper_expectfn_register(&q931_nat);
	nf_ct_helper_expectfn_register(&callforwarding_nat);
	return 0;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/****************************************************************************/
/* Module exit: clear every hook, unregister the expectfns, then wait for
 * in-flight RCU readers before the module text goes away. */
static void __exit fini(void)
{
	RCU_INIT_POINTER(set_h245_addr_hook, NULL);
	RCU_INIT_POINTER(set_h225_addr_hook, NULL);
	RCU_INIT_POINTER(set_sig_addr_hook, NULL);
	RCU_INIT_POINTER(set_ras_addr_hook, NULL);
	RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
	RCU_INIT_POINTER(nat_t120_hook, NULL);
	RCU_INIT_POINTER(nat_h245_hook, NULL);
	RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
	RCU_INIT_POINTER(nat_q931_hook, NULL);
	nf_ct_helper_expectfn_unregister(&q931_nat);
	nf_ct_helper_expectfn_unregister(&callforwarding_nat);
	synchronize_rcu();	/* make sure no reader still uses the hooks */
}
|
||||
|
||||
/****************************************************************************/
|
||||
module_init(init);
|
||||
module_exit(fini);
|
||||
|
||||
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
|
||||
MODULE_DESCRIPTION("H.323 NAT helper");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS("ip_nat_h323");
|
||||
484
net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
Normal file
484
net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
Normal file
|
|
@ -0,0 +1,484 @@
|
|||
/*
|
||||
* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
* (C) 2011 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/icmp.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/secure_seq.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/route.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
#include <net/netfilter/nf_conntrack_core.h>
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#include <net/netfilter/nf_nat_core.h>
|
||||
#include <net/netfilter/nf_nat_l3proto.h>
|
||||
#include <net/netfilter/nf_nat_l4proto.h>
|
||||
|
||||
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
|
||||
|
||||
#ifdef CONFIG_XFRM
|
||||
/* Fill in the IPv4 flow key (fl4) for XFRM policy lookup from the NATed
 * conntrack tuple. The statusbit selects SRC vs DST NAT; XORing it with
 * IPS_NAT_MASK flips it to cover the other direction's mapping. Ports are
 * only meaningful for the port-carrying protocols listed. */
static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
				       const struct nf_conn *ct,
				       enum ip_conntrack_dir dir,
				       unsigned long statusbit,
				       struct flowi *fl)
{
	const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
	struct flowi4 *fl4 = &fl->u.ip4;

	if (ct->status & statusbit) {
		fl4->daddr = t->dst.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_dport = t->dst.u.all;
	}

	/* Flip SRC<->DST NAT bit to fill in the other side of the flow. */
	statusbit ^= IPS_NAT_MASK;

	if (ct->status & statusbit) {
		fl4->saddr = t->src.u3.ip;
		if (t->dst.protonum == IPPROTO_TCP ||
		    t->dst.protonum == IPPROTO_UDP ||
		    t->dst.protonum == IPPROTO_UDPLITE ||
		    t->dst.protonum == IPPROTO_DCCP ||
		    t->dst.protonum == IPPROTO_SCTP)
			fl4->fl4_sport = t->src.u.all;
	}
}
|
||||
#endif /* CONFIG_XFRM */
|
||||
|
||||
static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
|
||||
const struct nf_nat_range *range)
|
||||
{
|
||||
return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
|
||||
ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
|
||||
}
|
||||
|
||||
/* Derive an unpredictable ephemeral-port offset for this flow from the
 * tuple's endpoints (thin wrapper around the secure_seq helper). */
static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
				   __be16 dport)
{
	return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
}
|
||||
|
||||
/* Rewrite the IPv4 header (and, via the l4proto callback, the transport
 * header) of @skb to match @target for the given manip type, updating the
 * IP header checksum incrementally. Returns false if the skb could not be
 * made writable or the L4 manip failed. */
static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
				  unsigned int iphdroff,
				  const struct nf_nat_l4proto *l4proto,
				  const struct nf_conntrack_tuple *target,
				  enum nf_nat_manip_type maniptype)
{
	struct iphdr *iph;
	unsigned int hdroff;

	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
		return false;

	iph = (void *)skb->data + iphdroff;
	hdroff = iphdroff + iph->ihl * 4;

	if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
				target, maniptype))
		return false;
	/* Re-fetch: manip_pkt may have reallocated the skb data. */
	iph = (void *)skb->data + iphdroff;

	if (maniptype == NF_NAT_MANIP_SRC) {
		csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
		iph->saddr = target->src.u3.ip;
	} else {
		csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
		iph->daddr = target->dst.u3.ip;
	}
	return true;
}
|
||||
|
||||
/* Incrementally patch a transport-layer checksum (*check) for the address
 * substitution implied by @maniptype: old address taken from the IP header
 * at @iphdroff, new address from tuple @t. */
static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
				    unsigned int iphdroff, __sum16 *check,
				    const struct nf_conntrack_tuple *t,
				    enum nf_nat_manip_type maniptype)
{
	const struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
	bool snat = (maniptype == NF_NAT_MANIP_SRC);
	__be32 from = snat ? iph->saddr : iph->daddr;
	__be32 to = snat ? t->src.u3.ip : t->dst.u3.ip;

	inet_proto_csum_replace4(check, skb, from, to, 1);
}
|
||||
|
||||
/* Recompute or patch a transport checksum after the payload length changed
 * by (datalen - oldlen) bytes. If the device can offload (or the packet is
 * not locally delivered), set up CHECKSUM_PARTIAL with the pseudo-header
 * seed; otherwise compute the full checksum in software. For an skb already
 * in CHECKSUM_PARTIAL state only the length fields need patching. */
static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
				    u8 proto, void *data, __sum16 *check,
				    int datalen, int oldlen)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt = skb_rtable(skb);

	if (skb->ip_summed != CHECKSUM_PARTIAL) {
		if (!(rt->rt_flags & RTCF_LOCAL) &&
		    (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
			/* Hardware will finish the checksum on transmit. */
			skb->ip_summed = CHECKSUM_PARTIAL;
			skb->csum_start = skb_headroom(skb) +
					  skb_network_offset(skb) +
					  ip_hdrlen(skb);
			skb->csum_offset = (void *)check - data;
			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
						    datalen, proto, 0);
		} else {
			/* Software fallback: full recompute. */
			*check = 0;
			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
						   datalen, proto,
						   csum_partial(data, datalen,
								0));
			/* UDP uses 0 to mean "no checksum"; use the
			 * equivalent all-ones value instead. */
			if (proto == IPPROTO_UDP && !*check)
				*check = CSUM_MANGLED_0;
		}
	} else
		/* Offloaded checksum: only the length delta matters. */
		inet_proto_csum_replace2(check, skb,
					 htons(oldlen), htons(datalen), 1);
}
|
||||
|
||||
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
|
||||
/* Parse the ctnetlink IPv4 NAT attributes into @range. A missing MAXIP
 * collapses the range to the single MINIP address. Always returns 0. */
static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
				       struct nf_nat_range *range)
{
	if (tb[CTA_NAT_V4_MINIP]) {
		range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
		range->flags |= NF_NAT_RANGE_MAP_IPS;
	}

	if (tb[CTA_NAT_V4_MAXIP])
		range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
	else
		range->max_addr.ip = range->min_addr.ip;

	return 0;
}
|
||||
#endif
|
||||
|
||||
/* IPv4 L3 NAT protocol ops registered with the core NAT engine. */
static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
	.l3proto		= NFPROTO_IPV4,
	.in_range		= nf_nat_ipv4_in_range,
	.secure_port		= nf_nat_ipv4_secure_port,
	.manip_pkt		= nf_nat_ipv4_manip_pkt,
	.csum_update		= nf_nat_ipv4_csum_update,
	.csum_recalc		= nf_nat_ipv4_csum_recalc,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.nlattr_to_range	= nf_nat_ipv4_nlattr_to_range,
#endif
#ifdef CONFIG_XFRM
	.decode_session		= nf_nat_ipv4_decode_session,
#endif
};
|
||||
|
||||
int nf_nat_icmp_reply_translation(struct sk_buff *skb,
|
||||
struct nf_conn *ct,
|
||||
enum ip_conntrack_info ctinfo,
|
||||
unsigned int hooknum)
|
||||
{
|
||||
struct {
|
||||
struct icmphdr icmp;
|
||||
struct iphdr ip;
|
||||
} *inside;
|
||||
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
|
||||
enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
|
||||
unsigned int hdrlen = ip_hdrlen(skb);
|
||||
const struct nf_nat_l4proto *l4proto;
|
||||
struct nf_conntrack_tuple target;
|
||||
unsigned long statusbit;
|
||||
|
||||
NF_CT_ASSERT(ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY);
|
||||
|
||||
if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
|
||||
return 0;
|
||||
if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
|
||||
return 0;
|
||||
|
||||
inside = (void *)skb->data + hdrlen;
|
||||
if (inside->icmp.type == ICMP_REDIRECT) {
|
||||
if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
|
||||
return 0;
|
||||
if (ct->status & IPS_NAT_MASK)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (manip == NF_NAT_MANIP_SRC)
|
||||
statusbit = IPS_SRC_NAT;
|
||||
else
|
||||
statusbit = IPS_DST_NAT;
|
||||
|
||||
/* Invert if this is reply direction */
|
||||
if (dir == IP_CT_DIR_REPLY)
|
||||
statusbit ^= IPS_NAT_MASK;
|
||||
|
||||
if (!(ct->status & statusbit))
|
||||
return 1;
|
||||
|
||||
l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
|
||||
if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
|
||||
l4proto, &ct->tuplehash[!dir].tuple, !manip))
|
||||
return 0;
|
||||
|
||||
if (skb->ip_summed != CHECKSUM_PARTIAL) {
|
||||
/* Reloading "inside" here since manip_pkt may reallocate */
|
||||
inside = (void *)skb->data + hdrlen;
|
||||
inside->icmp.checksum = 0;
|
||||
inside->icmp.checksum =
|
||||
csum_fold(skb_checksum(skb, hdrlen,
|
||||
skb->len - hdrlen, 0));
|
||||
}
|
||||
|
||||
/* Change outer to look like the reply to an incoming packet */
|
||||
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
|
||||
l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
|
||||
if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
|
||||
|
||||
/* Core IPv4 NAT hook body shared by all hook positions: looks up the
 * conntrack entry, sets up a NAT binding on the first packet (via the
 * ruleset callback @do_chain, falling back to a null binding), and then
 * translates the packet. Returns a netfilter verdict. */
unsigned int
nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
	       const struct net_device *in, const struct net_device *out,
	       unsigned int (*do_chain)(const struct nf_hook_ops *ops,
					struct sk_buff *skb,
					const struct net_device *in,
					const struct net_device *out,
					struct nf_conn *ct))
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(ops->hooknum);

	/* We never see fragments: conntrack defrags on pre-routing
	 * and local-out, and nf_nat_out protects post-routing.
	 */
	NF_CT_ASSERT(!ip_is_fragment(ip_hdr(skb)));

	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track? It's not due to stress, or conntrack would
	 * have dropped it. Hence it's the user's responsibility to
	 * packet filter it out, or implement conntrack/NAT for that
	 * protocol. 8) --RR
	 */
	if (!ct)
		return NF_ACCEPT;

	/* Don't try to NAT if this packet is not conntracked */
	if (nf_ct_is_untracked(ct))
		return NF_ACCEPT;

	nat = nf_ct_nat_ext_add(ct);
	if (nat == NULL)
		return NF_ACCEPT;

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
							   ops->hooknum))
				return NF_DROP;
			else
				return NF_ACCEPT;
		}
		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
	case IP_CT_NEW:
		/* Seen it before? This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			unsigned int ret;

			/* Run the NAT rules; they may install a binding. */
			ret = do_chain(ops, skb, in, out, ct);
			if (ret != NF_ACCEPT)
				return ret;

			if (nf_nat_initialized(ct, HOOK2MANIP(ops->hooknum)))
				break;

			/* No rule matched: install an identity binding so
			 * the tuple is reserved. */
			ret = nf_nat_alloc_null_binding(ct, ops->hooknum);
			if (ret != NF_ACCEPT)
				return ret;
		} else {
			pr_debug("Already setup manip %s for ct %p\n",
				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct);
			if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
				goto oif_changed;
		}
		break;

	default:
		/* ESTABLISHED */
		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
			     ctinfo == IP_CT_ESTABLISHED_REPLY);
		if (nf_nat_oif_changed(ops->hooknum, ctinfo, nat, out))
			goto oif_changed;
	}

	return nf_nat_packet(ct, ctinfo, ops->hooknum, skb);

oif_changed:
	/* Output interface changed under a masqueraded connection: kill
	 * the entry so it is re-created with the new mapping. */
	nf_ct_kill_acct(ct, ctinfo, skb);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
|
||||
|
||||
unsigned int
|
||||
nf_nat_ipv4_in(const struct nf_hook_ops *ops, struct sk_buff *skb,
|
||||
const struct net_device *in, const struct net_device *out,
|
||||
unsigned int (*do_chain)(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
struct nf_conn *ct))
|
||||
{
|
||||
unsigned int ret;
|
||||
__be32 daddr = ip_hdr(skb)->daddr;
|
||||
|
||||
ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
|
||||
if (ret != NF_DROP && ret != NF_STOLEN &&
|
||||
daddr != ip_hdr(skb)->daddr)
|
||||
skb_dst_drop(skb);
|
||||
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nf_nat_ipv4_in);
|
||||
|
||||
/* POST_ROUTING wrapper: run the common NAT path, then (with XFRM) redo
 * the IPsec policy lookup if SNAT changed the address or port. */
unsigned int
nf_nat_ipv4_out(const struct nf_hook_ops *ops, struct sk_buff *skb,
		const struct net_device *in, const struct net_device *out,
		unsigned int (*do_chain)(const struct nf_hook_ops *ops,
					 struct sk_buff *skb,
					 const struct net_device *in,
					 const struct net_device *out,
					 struct nf_conn *ct))
{
#ifdef CONFIG_XFRM
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	int err;
#endif
	unsigned int ret;

	/* root is playing with raw sockets. */
	if (skb->len < sizeof(struct iphdr) ||
	    ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
#ifdef CONFIG_XFRM
	if (ret != NF_DROP && ret != NF_STOLEN &&
	    !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);

		/* Address or (for port-carrying protocols) port changed:
		 * the cached xfrm state may no longer match. */
		if ((ct->tuplehash[dir].tuple.src.u3.ip !=
		     ct->tuplehash[!dir].tuple.dst.u3.ip) ||
		    (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
		     ct->tuplehash[dir].tuple.src.u.all !=
		     ct->tuplehash[!dir].tuple.dst.u.all)) {
			err = nf_xfrm_me_harder(skb, AF_INET);
			if (err < 0)
				ret = NF_DROP_ERR(err);
		}
	}
#endif
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_out);
|
||||
|
||||
/* LOCAL_OUT wrapper: run the common NAT path, then re-route the packet
 * if DNAT changed the destination address (and, with XFRM, redo the
 * policy lookup if the reply port mapping changed). */
unsigned int
nf_nat_ipv4_local_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
		     const struct net_device *in, const struct net_device *out,
		     unsigned int (*do_chain)(const struct nf_hook_ops *ops,
					      struct sk_buff *skb,
					      const struct net_device *in,
					      const struct net_device *out,
					      struct nf_conn *ct))
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	unsigned int ret;
	int err;

	/* root is playing with raw sockets. */
	if (skb->len < sizeof(struct iphdr) ||
	    ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	ret = nf_nat_ipv4_fn(ops, skb, in, out, do_chain);
	if (ret != NF_DROP && ret != NF_STOLEN &&
	    (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);

		if (ct->tuplehash[dir].tuple.dst.u3.ip !=
		    ct->tuplehash[!dir].tuple.src.u3.ip) {
			/* Destination was rewritten: pick a new route. */
			err = ip_route_me_harder(skb, RTN_UNSPEC);
			if (err < 0)
				ret = NF_DROP_ERR(err);
		}
#ifdef CONFIG_XFRM
		else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
			 ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
			 ct->tuplehash[dir].tuple.dst.u.all !=
			 ct->tuplehash[!dir].tuple.src.u.all) {
			err = nf_xfrm_me_harder(skb, AF_INET);
			if (err < 0)
				ret = NF_DROP_ERR(err);
		}
#endif
	}
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_ipv4_local_fn);
|
||||
|
||||
/* Register the ICMP L4 handler and the IPv4 L3 ops with the NAT core,
 * unwinding with goto-cleanup on failure. */
static int __init nf_nat_l3proto_ipv4_init(void)
{
	int err;

	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
	if (err < 0)
		goto err1;
	err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
	if (err < 0)
		goto err2;
	return err;

err2:
	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
err1:
	return err;
}
|
||||
|
||||
/* Unregister in reverse order of registration. */
static void __exit nf_nat_l3proto_ipv4_exit(void)
{
	nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
|
||||
|
||||
module_init(nf_nat_l3proto_ipv4_init);
|
||||
module_exit(nf_nat_l3proto_ipv4_exit);
|
||||
161
net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
Normal file
161
net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
Normal file
|
|
@ -0,0 +1,161 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/timer.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/checksum.h>
|
||||
#include <net/route.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/netfilter/x_tables.h>
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/ipv4/nf_nat_masquerade.h>
|
||||
|
||||
/* Set up an SNAT binding that rewrites the source address to the outgoing
 * interface's primary address toward the next hop (classic masquerading).
 * Must run at POST_ROUTING on the first packet of a connection.
 * Returns a netfilter verdict from nf_nat_setup_info (or NF_ACCEPT /
 * NF_DROP for the early-out cases). */
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
		       const struct nf_nat_range *range,
		       const struct net_device *out)
{
	struct nf_conn *ct;
	struct nf_conn_nat *nat;
	enum ip_conntrack_info ctinfo;
	struct nf_nat_range newrange;
	const struct rtable *rt;
	__be32 newsrc, nh;

	NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING);

	ct = nf_ct_get(skb, &ctinfo);
	nat = nfct_nat(ct);

	NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
			    ctinfo == IP_CT_RELATED_REPLY));

	/* Source address is 0.0.0.0 - locally generated packet that is
	 * probably not supposed to be masqueraded.
	 */
	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
		return NF_ACCEPT;

	/* Pick the address the kernel would use to reach the next hop. */
	rt = skb_rtable(skb);
	nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
	newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
	if (!newsrc) {
		pr_info("%s ate my IP address\n", out->name);
		return NF_DROP;
	}

	/* Remember the interface so its conntracks can be flushed when it
	 * goes down (see device_cmp / masq_device_event). */
	nat->masq_index = out->ifindex;

	/* Transfer from original range. */
	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
	newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
	newrange.min_addr.ip = newsrc;
	newrange.max_addr.ip = newsrc;
	newrange.min_proto = range->min_proto;
	newrange.max_proto = range->max_proto;

	/* Hand modified range to generic setup. */
	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
|
||||
|
||||
static int device_cmp(struct nf_conn *i, void *ifindex)
|
||||
{
|
||||
const struct nf_conn_nat *nat = nfct_nat(i);
|
||||
|
||||
if (!nat)
|
||||
return 0;
|
||||
if (nf_ct_l3num(i) != NFPROTO_IPV4)
|
||||
return 0;
|
||||
return nat->masq_index == (int)(long)ifindex;
|
||||
}
|
||||
|
||||
/* Netdevice notifier: when an interface goes down, flush all conntracks
 * that were masqueraded through it so stale mappings don't linger. */
static int masq_device_event(struct notifier_block *this,
			     unsigned long event,
			     void *ptr)
{
	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_DOWN) {
		/* Device was downed. Search entire table for
		 * conntracks which were associated with that device,
		 * and forget them.
		 */
		NF_CT_ASSERT(dev->ifindex != 0);

		nf_ct_iterate_cleanup(net, device_cmp,
				      (void *)(long)dev->ifindex, 0, 0);
	}

	return NOTIFY_DONE;
}
|
||||
|
||||
/* Inet-address notifier: an address was removed from a live device, so
 * treat it like the device event and flush the affected conntracks. */
static int masq_inet_event(struct notifier_block *this,
			   unsigned long event,
			   void *ptr)
{
	struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
	struct netdev_notifier_info info;

	/* The masq_dev_notifier will catch the case of the device going
	 * down. So if the inetdev is dead and being destroyed we have
	 * no work to do. Otherwise this is an individual address removal
	 * and we have to perform the flush.
	 */
	if (idev->dead)
		return NOTIFY_DONE;

	netdev_notifier_info_init(&info, idev->dev);
	return masq_device_event(this, event, &info);
}
|
||||
|
||||
/* Notifier hooked into the netdevice chain (interface up/down events). */
static struct notifier_block masq_dev_notifier = {
	.notifier_call	= masq_device_event,
};

/* Notifier hooked into the inetaddr chain (IPv4 address changes). */
static struct notifier_block masq_inet_notifier = {
	.notifier_call	= masq_inet_event,
};

/* Number of users (xt/nft masquerade expressions) sharing the two
 * notifiers above; they are registered only for the first user and
 * unregistered when the last one goes away. */
static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
|
||||
|
||||
/* Register the device-down and address-change notifiers on behalf of a
 * masquerade user.  Refcounted: only the first caller actually registers.
 */
void nf_nat_masquerade_ipv4_register_notifier(void)
{
	/* check if the notifier was already set */
	if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
		return;

	/* Register for device down reports */
	register_netdevice_notifier(&masq_dev_notifier);
	/* Register IP address change reports */
	register_inetaddr_notifier(&masq_inet_notifier);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
|
||||
|
||||
/* Drop one reference on the shared notifiers; the last user unregisters
 * them from the netdevice and inetaddr chains.
 */
void nf_nat_masquerade_ipv4_unregister_notifier(void)
{
	/* check if the notifier still has clients */
	if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
		return;

	unregister_netdevice_notifier(&masq_dev_notifier);
	unregister_inetaddr_notifier(&masq_inet_notifier);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
|
||||
311
net/ipv4/netfilter/nf_nat_pptp.c
Normal file
311
net/ipv4/netfilter/nf_nat_pptp.c
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
/*
|
||||
* nf_nat_pptp.c
|
||||
*
|
||||
* NAT support for PPTP (Point to Point Tunneling Protocol).
|
||||
* PPTP is a protocol for creating virtual private networks.
|
||||
* It is a specification defined by Microsoft and some vendors
|
||||
* working with Microsoft. PPTP is built on top of a modified
|
||||
* version of the Internet Generic Routing Encapsulation Protocol.
|
||||
* GRE is defined in RFC 1701 and RFC 1702. Documentation of
|
||||
* PPTP can be found in RFC 2637
|
||||
*
|
||||
* (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*
|
||||
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* TODO: - NAT to a unique tuple, not to TCP source port
|
||||
* (needs netfilter tuple reservation)
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_helper.h>
|
||||
#include <net/netfilter/nf_conntrack_helper.h>
|
||||
#include <net/netfilter/nf_conntrack_expect.h>
|
||||
#include <net/netfilter/nf_conntrack_zones.h>
|
||||
#include <linux/netfilter/nf_conntrack_proto_gre.h>
|
||||
#include <linux/netfilter/nf_conntrack_pptp.h>
|
||||
|
||||
#define NF_NAT_PPTP_VERSION "3.0"
|
||||
|
||||
#define REQ_CID(req, off) (*(__be16 *)((char *)(req) + (off)))
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
|
||||
MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
|
||||
MODULE_ALIAS("ip_nat_pptp");
|
||||
|
||||
/* Expectation callback: a GRE data connection predicted by the PPTP
 * control session has just been seen.  Build the reverse-direction tuple
 * (to drop the stale opposite expectation, if any) and then install SRC
 * and DST NAT mappings on the new GRE conntrack so its call IDs line up
 * with what each endpoint expects after NAT of the control channel.
 */
static void pptp_nat_expected(struct nf_conn *ct,
			      struct nf_conntrack_expect *exp)
{
	struct net *net = nf_ct_net(ct);
	const struct nf_conn *master = ct->master;
	struct nf_conntrack_expect *other_exp;
	struct nf_conntrack_tuple t;
	const struct nf_ct_pptp_master *ct_pptp_info;
	const struct nf_nat_pptp *nat_pptp_info;
	struct nf_nat_range range;

	ct_pptp_info = nfct_help_data(master);
	nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info;

	/* And here goes the grand finale of corrosion... */
	if (exp->dir == IP_CT_DIR_ORIGINAL) {
		pr_debug("we are PNS->PAC\n");
		/* therefore, build tuple for PAC->PNS */
		t.src.l3num = AF_INET;
		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
		t.src.u.gre.key = ct_pptp_info->pac_call_id;
		t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
		t.dst.u.gre.key = ct_pptp_info->pns_call_id;
		t.dst.protonum = IPPROTO_GRE;
	} else {
		pr_debug("we are PAC->PNS\n");
		/* build tuple for PNS->PAC */
		t.src.l3num = AF_INET;
		t.src.u3.ip = master->tuplehash[!exp->dir].tuple.src.u3.ip;
		t.src.u.gre.key = nat_pptp_info->pns_call_id;
		t.dst.u3.ip = master->tuplehash[!exp->dir].tuple.dst.u3.ip;
		t.dst.u.gre.key = nat_pptp_info->pac_call_id;
		t.dst.protonum = IPPROTO_GRE;
	}

	pr_debug("trying to unexpect other dir: ");
	nf_ct_dump_tuple_ip(&t);
	other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
	if (other_exp) {
		nf_ct_unexpect_related(other_exp);
		nf_ct_expect_put(other_exp);
		pr_debug("success\n");
	} else {
		pr_debug("not found!\n");
	}

	/* This must be a fresh one. */
	BUG_ON(ct->status & IPS_NAT_DONE_MASK);

	/* Change src to where master sends to */
	range.flags = NF_NAT_RANGE_MAP_IPS;
	range.min_addr = range.max_addr
		= ct->master->tuplehash[!exp->dir].tuple.dst.u3;
	if (exp->dir == IP_CT_DIR_ORIGINAL) {
		/* Pin the GRE key saved when the expectation was mangled. */
		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
		range.min_proto = range.max_proto = exp->saved_proto;
	}
	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

	/* For DST manip, map port here to where it's expected. */
	range.flags = NF_NAT_RANGE_MAP_IPS;
	range.min_addr = range.max_addr
		= ct->master->tuplehash[!exp->dir].tuple.src.u3;
	if (exp->dir == IP_CT_DIR_REPLY) {
		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
		range.min_proto = range.max_proto = exp->saved_proto;
	}
	nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);
}
|
||||
|
||||
/* outbound packets == from PNS to PAC */
|
||||
/* Mangle an outbound (PNS -> PAC) PPTP control message: rewrite the call
 * ID in messages that carry one so the PAC sees our NATed call ID.
 * Returns NF_ACCEPT (possibly after mangling) or NF_DROP on mangling
 * failure.
 */
static int
pptp_outbound_pkt(struct sk_buff *skb,
		  struct nf_conn *ct,
		  enum ip_conntrack_info ctinfo,
		  unsigned int protoff,
		  struct PptpControlHeader *ctlh,
		  union pptp_ctrl_union *pptpReq)

{
	struct nf_ct_pptp_master *ct_pptp_info;
	struct nf_nat_pptp *nat_pptp_info;
	u_int16_t msg;
	__be16 new_callid;
	unsigned int cid_off;

	ct_pptp_info = nfct_help_data(ct);
	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;

	new_callid = ct_pptp_info->pns_call_id;

	switch (msg = ntohs(ctlh->messageType)) {
	case PPTP_OUT_CALL_REQUEST:
		cid_off = offsetof(union pptp_ctrl_union, ocreq.callID);
		/* FIXME: ideally we would want to reserve a call ID
		 * here. current netfilter NAT core is not able to do
		 * this :( For now we use TCP source port. This breaks
		 * multiple calls within one control session */

		/* save original call ID in nat_info */
		nat_pptp_info->pns_call_id = ct_pptp_info->pns_call_id;

		/* don't use tcph->source since we are at a DSTmanip
		 * hook (e.g. PREROUTING) and pkt is not mangled yet */
		new_callid = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;

		/* save new call ID in ct info */
		ct_pptp_info->pns_call_id = new_callid;
		break;
	case PPTP_IN_CALL_REPLY:
		cid_off = offsetof(union pptp_ctrl_union, icack.callID);
		break;
	case PPTP_CALL_CLEAR_REQUEST:
		cid_off = offsetof(union pptp_ctrl_union, clrreq.callID);
		break;
	default:
		pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
					       pptp_msg_name[0]);
		/* fall through */
	case PPTP_SET_LINK_INFO:
		/* only need to NAT in case PAC is behind NAT box */
	case PPTP_START_SESSION_REQUEST:
	case PPTP_START_SESSION_REPLY:
	case PPTP_STOP_SESSION_REQUEST:
	case PPTP_STOP_SESSION_REPLY:
	case PPTP_ECHO_REQUEST:
	case PPTP_ECHO_REPLY:
		/* no need to alter packet */
		return NF_ACCEPT;
	}

	/* only OUT_CALL_REQUEST, IN_CALL_REPLY, CALL_CLEAR_REQUEST pass
	 * down to here */
	pr_debug("altering call id from 0x%04x to 0x%04x\n",
		 ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid));

	/* mangle packet */
	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
				     cid_off + sizeof(struct pptp_pkt_hdr) +
				     sizeof(struct PptpControlHeader),
				     sizeof(new_callid), (char *)&new_callid,
				     sizeof(new_callid)) == 0)
		return NF_DROP;
	return NF_ACCEPT;
}
|
||||
|
||||
/* Rewrite the two GRE expectations (one per direction) created by the
 * PPTP conntrack helper so that their tuples/keys carry the NATed call
 * IDs, and record the saved originals for later restoration.
 */
static void
pptp_exp_gre(struct nf_conntrack_expect *expect_orig,
	     struct nf_conntrack_expect *expect_reply)
{
	const struct nf_conn *ct = expect_orig->master;
	struct nf_ct_pptp_master *ct_pptp_info;
	struct nf_nat_pptp *nat_pptp_info;

	ct_pptp_info = nfct_help_data(ct);
	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;

	/* save original PAC call ID in nat_info */
	nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id;

	/* alter expectation for PNS->PAC direction */
	expect_orig->saved_proto.gre.key = ct_pptp_info->pns_call_id;
	expect_orig->tuple.src.u.gre.key = nat_pptp_info->pns_call_id;
	expect_orig->tuple.dst.u.gre.key = ct_pptp_info->pac_call_id;
	expect_orig->dir = IP_CT_DIR_ORIGINAL;

	/* alter expectation for PAC->PNS direction */
	expect_reply->saved_proto.gre.key = nat_pptp_info->pns_call_id;
	expect_reply->tuple.src.u.gre.key = nat_pptp_info->pac_call_id;
	expect_reply->tuple.dst.u.gre.key = ct_pptp_info->pns_call_id;
	expect_reply->dir = IP_CT_DIR_REPLY;
}
|
||||
|
||||
/* inbound packets == from PAC to PNS */
|
||||
/* Mangle an inbound (PAC -> PNS) PPTP control message: rewrite the peer
 * call ID back to the PNS's original call ID for messages that carry
 * one.  Returns NF_ACCEPT, or NF_DROP if mangling fails.
 */
static int
pptp_inbound_pkt(struct sk_buff *skb,
		 struct nf_conn *ct,
		 enum ip_conntrack_info ctinfo,
		 unsigned int protoff,
		 struct PptpControlHeader *ctlh,
		 union pptp_ctrl_union *pptpReq)
{
	const struct nf_nat_pptp *nat_pptp_info;
	u_int16_t msg;
	__be16 new_pcid;
	unsigned int pcid_off;

	nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info;
	new_pcid = nat_pptp_info->pns_call_id;

	switch (msg = ntohs(ctlh->messageType)) {
	case PPTP_OUT_CALL_REPLY:
		pcid_off = offsetof(union pptp_ctrl_union, ocack.peersCallID);
		break;
	case PPTP_IN_CALL_CONNECT:
		pcid_off = offsetof(union pptp_ctrl_union, iccon.peersCallID);
		break;
	case PPTP_IN_CALL_REQUEST:
		/* only need to nat in case PAC is behind NAT box */
		return NF_ACCEPT;
	case PPTP_WAN_ERROR_NOTIFY:
		pcid_off = offsetof(union pptp_ctrl_union, wanerr.peersCallID);
		break;
	case PPTP_CALL_DISCONNECT_NOTIFY:
		pcid_off = offsetof(union pptp_ctrl_union, disc.callID);
		break;
	case PPTP_SET_LINK_INFO:
		pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
		break;
	default:
		pr_debug("unknown inbound packet %s\n",
			 msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
					       pptp_msg_name[0]);
		/* fall through */
	case PPTP_START_SESSION_REQUEST:
	case PPTP_START_SESSION_REPLY:
	case PPTP_STOP_SESSION_REQUEST:
	case PPTP_STOP_SESSION_REPLY:
	case PPTP_ECHO_REQUEST:
	case PPTP_ECHO_REPLY:
		/* no need to alter packet */
		return NF_ACCEPT;
	}

	/* only OUT_CALL_REPLY, IN_CALL_CONNECT, IN_CALL_REQUEST,
	 * WAN_ERROR_NOTIFY, CALL_DISCONNECT_NOTIFY pass down here */

	/* mangle packet */
	pr_debug("altering peer call id from 0x%04x to 0x%04x\n",
		 ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid));

	if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
				     pcid_off + sizeof(struct pptp_pkt_hdr) +
				     sizeof(struct PptpControlHeader),
				     sizeof(new_pcid), (char *)&new_pcid,
				     sizeof(new_pcid)) == 0)
		return NF_DROP;
	return NF_ACCEPT;
}
|
||||
|
||||
/* Module init: pull in the GRE NAT protocol helper and install the four
 * PPTP NAT hooks consumed by the conntrack PPTP helper.  The BUG_ONs
 * guard against double registration.
 */
static int __init nf_nat_helper_pptp_init(void)
{
	nf_nat_need_gre();

	BUG_ON(nf_nat_pptp_hook_outbound != NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);

	BUG_ON(nf_nat_pptp_hook_inbound != NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);

	BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);

	BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
	return 0;
}
|
||||
|
||||
/* Module exit: clear the hooks and wait for in-flight RCU readers before
 * the module text goes away.
 */
static void __exit nf_nat_helper_pptp_fini(void)
{
	RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
	RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
	synchronize_rcu();
}
|
||||
|
||||
module_init(nf_nat_helper_pptp_init);
|
||||
module_exit(nf_nat_helper_pptp_fini);
|
||||
149
net/ipv4/netfilter/nf_nat_proto_gre.c
Normal file
149
net/ipv4/netfilter/nf_nat_proto_gre.c
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
/*
|
||||
* nf_nat_proto_gre.c
|
||||
*
|
||||
* NAT protocol helper module for GRE.
|
||||
*
|
||||
* GRE is a generic encapsulation protocol, which is generally not very
|
||||
* suited for NAT, as it has no protocol-specific part as port numbers.
|
||||
*
|
||||
* It has an optional key field, which may help us distinguishing two
|
||||
* connections between the same two hosts.
|
||||
*
|
||||
* GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
|
||||
*
|
||||
* PPTP is built on top of a modified version of GRE, and has a mandatory
|
||||
* field called "CallID", which serves us for the same purpose as the key
|
||||
* field in plain GRE.
|
||||
*
|
||||
* Documentation about PPTP can be found in RFC 2637
|
||||
*
|
||||
* (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*
|
||||
* (C) 2006-2012 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_l4proto.h>
|
||||
#include <linux/netfilter/nf_conntrack_proto_gre.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
|
||||
MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
|
||||
|
||||
/* generate unique tuple ... */
|
||||
static void
|
||||
gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
|
||||
struct nf_conntrack_tuple *tuple,
|
||||
const struct nf_nat_range *range,
|
||||
enum nf_nat_manip_type maniptype,
|
||||
const struct nf_conn *ct)
|
||||
{
|
||||
static u_int16_t key;
|
||||
__be16 *keyptr;
|
||||
unsigned int min, i, range_size;
|
||||
|
||||
/* If there is no master conntrack we are not PPTP,
|
||||
do not change tuples */
|
||||
if (!ct->master)
|
||||
return;
|
||||
|
||||
if (maniptype == NF_NAT_MANIP_SRC)
|
||||
keyptr = &tuple->src.u.gre.key;
|
||||
else
|
||||
keyptr = &tuple->dst.u.gre.key;
|
||||
|
||||
if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
|
||||
pr_debug("%p: NATing GRE PPTP\n", ct);
|
||||
min = 1;
|
||||
range_size = 0xffff;
|
||||
} else {
|
||||
min = ntohs(range->min_proto.gre.key);
|
||||
range_size = ntohs(range->max_proto.gre.key) - min + 1;
|
||||
}
|
||||
|
||||
pr_debug("min = %u, range_size = %u\n", min, range_size);
|
||||
|
||||
for (i = 0; ; ++key) {
|
||||
*keyptr = htons(min + key % range_size);
|
||||
if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
|
||||
return;
|
||||
}
|
||||
|
||||
pr_debug("%p: no NAT mapping\n", ct);
|
||||
return;
|
||||
}
|
||||
|
||||
/* manipulate a GRE packet according to maniptype */
|
||||
/* manipulate a GRE packet according to maniptype */
/* Rewrite the call ID of a PPTP-GRE packet at @hdroff to match the NATed
 * destination tuple.  GREv0 packets pass through untouched; unknown GRE
 * versions are refused.  Returns false if the skb cannot be made
 * writable or the version is unsupported.
 */
static bool
gre_manip_pkt(struct sk_buff *skb,
	      const struct nf_nat_l3proto *l3proto,
	      unsigned int iphdroff, unsigned int hdroff,
	      const struct nf_conntrack_tuple *tuple,
	      enum nf_nat_manip_type maniptype)
{
	const struct gre_hdr *greh;
	struct gre_hdr_pptp *pgreh;

	/* pgreh includes two optional 32bit fields which are not required
	 * to be there. That's where the magic '8' comes from */
	if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
		return false;

	greh = (void *)skb->data + hdroff;
	pgreh = (struct gre_hdr_pptp *)greh;

	/* we only have destination manip of a packet, since 'source key'
	 * is not present in the packet itself */
	if (maniptype != NF_NAT_MANIP_DST)
		return true;
	switch (greh->version) {
	case GRE_VERSION_1701:
		/* We do not currently NAT any GREv0 packets.
		 * Try to behave like "nf_nat_proto_unknown" */
		break;
	case GRE_VERSION_PPTP:
		pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
		pgreh->call_id = tuple->dst.u.gre.key;
		break;
	default:
		pr_debug("can't nat unknown GRE version\n");
		return false;
	}
	return true;
}
|
||||
|
||||
/* GRE L4 NAT protocol descriptor registered for IPv4. */
static const struct nf_nat_l4proto gre = {
	.l4proto		= IPPROTO_GRE,
	.manip_pkt		= gre_manip_pkt,
	.in_range		= nf_nat_l4proto_in_range,
	.unique_tuple		= gre_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
#endif
};
|
||||
|
||||
/* Register the GRE L4 NAT helper for the IPv4 family. */
static int __init nf_nat_proto_gre_init(void)
{
	return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
}
|
||||
|
||||
/* Unregister the GRE L4 NAT helper on module removal. */
static void __exit nf_nat_proto_gre_fini(void)
{
	nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
}
|
||||
|
||||
module_init(nf_nat_proto_gre_init);
|
||||
module_exit(nf_nat_proto_gre_fini);
|
||||
|
||||
/* Dummy symbol referenced by the PPTP NAT helper so that this module is
 * pulled in (and the GRE protocol helper registered) before PPTP
 * sessions are NATed.  Intentionally does nothing.
 */
void nf_nat_need_gre(void)
{
}
EXPORT_SYMBOL_GPL(nf_nat_need_gre);
|
||||
83
net/ipv4/netfilter/nf_nat_proto_icmp.c
Normal file
83
net/ipv4/netfilter/nf_nat_proto_icmp.c
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/icmp.h>
|
||||
|
||||
#include <linux/netfilter.h>
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_core.h>
|
||||
#include <net/netfilter/nf_nat_l4proto.h>
|
||||
|
||||
/* Return true if the tuple's ICMP echo ID lies within [min, max]
 * (inclusive, compared in host byte order).  The manip type is
 * irrelevant because ICMP only has the one ID field.
 */
static bool
icmp_in_range(const struct nf_conntrack_tuple *tuple,
	      enum nf_nat_manip_type maniptype,
	      const union nf_conntrack_man_proto *min,
	      const union nf_conntrack_man_proto *max)
{
	return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
	       ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
}
|
||||
|
||||
static void
|
||||
icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
|
||||
struct nf_conntrack_tuple *tuple,
|
||||
const struct nf_nat_range *range,
|
||||
enum nf_nat_manip_type maniptype,
|
||||
const struct nf_conn *ct)
|
||||
{
|
||||
static u_int16_t id;
|
||||
unsigned int range_size;
|
||||
unsigned int i;
|
||||
|
||||
range_size = ntohs(range->max_proto.icmp.id) -
|
||||
ntohs(range->min_proto.icmp.id) + 1;
|
||||
/* If no range specified... */
|
||||
if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
|
||||
range_size = 0xFFFF;
|
||||
|
||||
for (i = 0; ; ++id) {
|
||||
tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
|
||||
(id % range_size));
|
||||
if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/* Rewrite the echo ID of the ICMP header at @hdroff to the tuple's ID,
 * fixing up the ICMP checksum incrementally.  Returns false only if the
 * skb cannot be made writable.
 */
static bool
icmp_manip_pkt(struct sk_buff *skb,
	       const struct nf_nat_l3proto *l3proto,
	       unsigned int iphdroff, unsigned int hdroff,
	       const struct nf_conntrack_tuple *tuple,
	       enum nf_nat_manip_type maniptype)
{
	struct icmphdr *hdr;

	if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
		return false;

	hdr = (struct icmphdr *)(skb->data + hdroff);
	/* Patch checksum before overwriting the old ID it covers. */
	inet_proto_csum_replace2(&hdr->checksum, skb,
				 hdr->un.echo.id, tuple->src.u.icmp.id, 0);
	hdr->un.echo.id = tuple->src.u.icmp.id;
	return true;
}
|
||||
|
||||
/* ICMP L4 NAT protocol descriptor (built-in, not a module). */
const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
	.l4proto		= IPPROTO_ICMP,
	.manip_pkt		= icmp_manip_pkt,
	.in_range		= icmp_in_range,
	.unique_tuple		= icmp_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
	.nlattr_to_range	= nf_nat_l4proto_nlattr_to_range,
#endif
};
|
||||
1313
net/ipv4/netfilter/nf_nat_snmp_basic.c
Normal file
1313
net/ipv4/netfilter/nf_nat_snmp_basic.c
Normal file
File diff suppressed because it is too large
Load diff
166
net/ipv4/netfilter/nf_reject_ipv4.c
Normal file
166
net/ipv4/netfilter/nf_reject_ipv4.c
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
/* (C) 1999-2001 Paul `Rusty' Russell
|
||||
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/route.h>
|
||||
#include <net/dst.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/netfilter/ipv4/nf_reject.h>
|
||||
|
||||
/* Validate @oldskb as a TCP packet we may answer with a RST and return a
 * pointer to its TCP header (copied into @_oth if nonlinear).  Returns
 * NULL for fragments, truncated headers, RST packets (never RST a RST)
 * or bad checksums.
 */
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
					     struct tcphdr *_oth, int hook)
{
	const struct tcphdr *oth;

	/* IP header checks: fragment. */
	if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
		return NULL;

	oth = skb_header_pointer(oldskb, ip_hdrlen(oldskb),
				 sizeof(struct tcphdr), _oth);
	if (oth == NULL)
		return NULL;

	/* No RST for RST. */
	if (oth->rst)
		return NULL;

	/* Check checksum */
	if (nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), IPPROTO_TCP))
		return NULL;

	return oth;
}
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
|
||||
|
||||
/* Append a minimal IPv4 header to @nskb that answers @oldskb: addresses
 * swapped, DF set, given protocol and TTL.  tot_len and the checksum are
 * left for the caller / output path to fill in.  Returns the new header.
 */
struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
				  const struct sk_buff *oldskb,
				  __be16 protocol, int ttl)
{
	struct iphdr *niph, *oiph = ip_hdr(oldskb);

	skb_reset_network_header(nskb);
	niph = (struct iphdr *)skb_put(nskb, sizeof(struct iphdr));
	niph->version	= 4;
	niph->ihl	= sizeof(struct iphdr) / 4;
	niph->tos	= 0;
	niph->id	= 0;
	niph->frag_off	= htons(IP_DF);
	niph->protocol	= protocol;
	niph->check	= 0;
	/* Reply goes back where the offending packet came from. */
	niph->saddr	= oiph->daddr;
	niph->daddr	= oiph->saddr;
	niph->ttl	= ttl;

	nskb->protocol = htons(ETH_P_IP);

	return niph;
}
EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
|
||||
|
||||
/* Append a TCP RST header to @nskb answering the packet whose TCP header
 * is @oth: ports swapped, seq/ack chosen per RFC 793 reset generation,
 * and checksum set up for hardware/partial completion.
 */
void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
			  const struct tcphdr *oth)
{
	struct iphdr *niph = ip_hdr(nskb);
	struct tcphdr *tcph;

	skb_reset_transport_header(nskb);
	tcph = (struct tcphdr *)skb_put(nskb, sizeof(struct tcphdr));
	memset(tcph, 0, sizeof(*tcph));
	tcph->source	= oth->dest;
	tcph->dest	= oth->source;
	tcph->doff	= sizeof(struct tcphdr) / 4;

	if (oth->ack) {
		/* RST seq = their ACK number. */
		tcph->seq = oth->ack_seq;
	} else {
		/* ACK everything they sent (SYN/FIN count as one byte). */
		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
				      oldskb->len - ip_hdrlen(oldskb) -
				      (oth->doff << 2));
		tcph->ack = 1;
	}

	tcph->rst	= 1;
	/* Pseudo-header sum only; the device/stack finishes the csum. */
	tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr,
				    niph->daddr, 0);
	nskb->ip_summed = CHECKSUM_PARTIAL;
	nskb->csum_start = (unsigned char *)tcph - nskb->head;
	nskb->csum_offset = offsetof(struct tcphdr, check);
}
EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
|
||||
|
||||
/* Send RST reply */
/* Build and transmit a TCP RST in response to @oldskb (used by the
 * REJECT target).  Silently returns if the packet is not RST-able, is
 * broadcast/multicast, allocation fails, routing fails, or the reply
 * would exceed the path MTU.
 */
void nf_send_reset(struct sk_buff *oldskb, int hook)
{
	struct sk_buff *nskb;
	const struct iphdr *oiph;
	struct iphdr *niph;
	const struct tcphdr *oth;
	struct tcphdr _oth;

	oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
	if (!oth)
		return;

	/* Never RST traffic we could not have been the endpoint of. */
	if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		return;

	oiph = ip_hdr(oldskb);

	nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
			 LL_MAX_HEADER, GFP_ATOMIC);
	if (!nskb)
		return;

	/* ip_route_me_harder expects skb->dst to be set */
	skb_dst_set_noref(nskb, skb_dst(oldskb));

	skb_reserve(nskb, LL_MAX_HEADER);
	niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
				   ip4_dst_hoplimit(skb_dst(nskb)));
	nf_reject_ip_tcphdr_put(nskb, oldskb, oth);

	if (ip_route_me_harder(nskb, RTN_UNSPEC))
		goto free_nskb;

	/* "Never happens" */
	if (nskb->len > dst_mtu(skb_dst(nskb)))
		goto free_nskb;

	nf_ct_attach(nskb, oldskb);

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	/* If we use ip_local_out for bridged traffic, the MAC source on
	 * the RST will be ours, instead of the destination's. This confuses
	 * some routers/firewalls, and they drop the packet. So we need to
	 * build the eth header using the original destination's MAC as the
	 * source, and send the RST packet directly.
	 */
	if (oldskb->nf_bridge) {
		struct ethhdr *oeth = eth_hdr(oldskb);
		nskb->dev = oldskb->nf_bridge->physindev;
		niph->tot_len = htons(nskb->len);
		ip_send_check(niph);
		if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
				    oeth->h_source, oeth->h_dest, nskb->len) < 0)
			goto free_nskb;
		dev_queue_xmit(nskb);
	} else
#endif
		ip_local_out(nskb);

	return;

 free_nskb:
	kfree_skb(nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
104
net/ipv4/netfilter/nf_tables_arp.c
Normal file
104
net/ipv4/netfilter/nf_tables_arp.c
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
/*
|
||||
* Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
|
||||
* Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/netfilter_arp.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
|
||||
/* nf_tables hook entry point for every ARP hook: wrap the skb in a
 * pktinfo and run the chain, returning the netfilter verdict.
 */
static unsigned int
nft_do_chain_arp(const struct nf_hook_ops *ops,
		  struct sk_buff *skb,
		  const struct net_device *in,
		  const struct net_device *out,
		  int (*okfn)(struct sk_buff *))
{
	struct nft_pktinfo pkt;

	nft_set_pktinfo(&pkt, ops, skb, in, out);

	return nft_do_chain(&pkt, ops);
}
|
||||
|
||||
/* Template afinfo for the ARP family; copied per netns in init_net. */
static struct nft_af_info nft_af_arp __read_mostly = {
	.family		= NFPROTO_ARP,
	.nhooks		= NF_ARP_NUMHOOKS,
	.owner		= THIS_MODULE,
	.nops		= 1,
	.hooks		= {
		[NF_ARP_IN]		= nft_do_chain_arp,
		[NF_ARP_OUT]		= nft_do_chain_arp,
		[NF_ARP_FORWARD]	= nft_do_chain_arp,
	},
};
|
||||
|
||||
static int nf_tables_arp_init_net(struct net *net)
|
||||
{
|
||||
net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
|
||||
if (net->nft.arp== NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
|
||||
|
||||
if (nft_register_afinfo(net, net->nft.arp) < 0)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
err:
|
||||
kfree(net->nft.arp);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* Per-netns teardown: unregister and free the namespace's ARP afinfo. */
static void nf_tables_arp_exit_net(struct net *net)
{
	nft_unregister_afinfo(net->nft.arp);
	kfree(net->nft.arp);
}
|
||||
|
||||
/* Per-network-namespace lifecycle hooks for this module. */
static struct pernet_operations nf_tables_arp_net_ops = {
	.init	= nf_tables_arp_init_net,
	.exit	= nf_tables_arp_exit_net,
};
|
||||
|
||||
/* "filter" base chain type usable on all three ARP hooks. */
static const struct nf_chain_type filter_arp = {
	.name		= "filter",
	.type		= NFT_CHAIN_T_DEFAULT,
	.family		= NFPROTO_ARP,
	.owner		= THIS_MODULE,
	.hook_mask	= (1 << NF_ARP_IN) |
			  (1 << NF_ARP_OUT) |
			  (1 << NF_ARP_FORWARD),
};
|
||||
|
||||
/* Module init: register the chain type, then the pernet ops; roll back
 * the chain type if pernet registration fails.
 */
static int __init nf_tables_arp_init(void)
{
	int ret;

	nft_register_chain_type(&filter_arp);
	ret = register_pernet_subsys(&nf_tables_arp_net_ops);
	if (ret < 0)
		nft_unregister_chain_type(&filter_arp);

	return ret;
}
|
||||
|
||||
/* Module exit: undo init in reverse order. */
static void __exit nf_tables_arp_exit(void)
{
	unregister_pernet_subsys(&nf_tables_arp_net_ops);
	nft_unregister_chain_type(&filter_arp);
}
|
||||
|
||||
module_init(nf_tables_arp_init);
|
||||
module_exit(nf_tables_arp_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_ALIAS_NFT_FAMILY(3); /* NFPROTO_ARP */
|
||||
129
net/ipv4/netfilter/nf_tables_ipv4.c
Normal file
129
net/ipv4/netfilter/nf_tables_ipv4.c
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
/*
|
||||
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
|
||||
* Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/netfilter/nf_tables_ipv4.h>
|
||||
|
||||
/* nf_tables hook entry point for the IPv4 hooks: fill an IPv4-aware
 * pktinfo and run the chain, returning the netfilter verdict.
 */
static unsigned int nft_do_chain_ipv4(const struct nf_hook_ops *ops,
				      struct sk_buff *skb,
				      const struct net_device *in,
				      const struct net_device *out,
				      int (*okfn)(struct sk_buff *))
{
	struct nft_pktinfo pkt;

	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);

	return nft_do_chain(&pkt, ops);
}
|
||||
|
||||
/* LOCAL_OUT variant: locally generated packets (e.g. from raw sockets)
 * may carry a truncated/bogus IP header, so sanity-check before running
 * the chain; malformed packets are accepted untouched.
 */
static unsigned int nft_ipv4_output(const struct nf_hook_ops *ops,
				    struct sk_buff *skb,
				    const struct net_device *in,
				    const struct net_device *out,
				    int (*okfn)(struct sk_buff *))
{
	if (unlikely(skb->len < sizeof(struct iphdr) ||
		     ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) {
		if (net_ratelimit())
			pr_info("nf_tables_ipv4: ignoring short SOCK_RAW "
				"packet\n");
		return NF_ACCEPT;
	}

	return nft_do_chain_ipv4(ops, skb, in, out, okfn);
}
|
||||
|
||||
/* Template afinfo for the IPv4 family; copied per netns in init_net and
 * exported for other nf_tables IPv4 modules. */
struct nft_af_info nft_af_ipv4 __read_mostly = {
	.family		= NFPROTO_IPV4,
	.nhooks		= NF_INET_NUMHOOKS,
	.owner		= THIS_MODULE,
	.nops		= 1,
	.hooks		= {
		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
		[NF_INET_LOCAL_OUT]	= nft_ipv4_output,
		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
	},
};
EXPORT_SYMBOL_GPL(nft_af_ipv4);
|
||||
|
||||
static int nf_tables_ipv4_init_net(struct net *net)
|
||||
{
|
||||
net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
|
||||
if (net->nft.ipv4 == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
|
||||
|
||||
if (nft_register_afinfo(net, net->nft.ipv4) < 0)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
err:
|
||||
kfree(net->nft.ipv4);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/* Per-namespace teardown: unregister this netns's afinfo and free the
 * copy allocated by nf_tables_ipv4_init_net().
 */
static void nf_tables_ipv4_exit_net(struct net *net)
{
	nft_unregister_afinfo(net->nft.ipv4);
	kfree(net->nft.ipv4);
}
|
||||
|
||||
/* Wire the per-namespace setup/teardown into the pernet machinery. */
static struct pernet_operations nf_tables_ipv4_net_ops = {
	.init	= nf_tables_ipv4_init_net,
	.exit	= nf_tables_ipv4_exit_net,
};
|
||||
|
||||
/* Base "filter" chain type: may be attached at any IPv4 hook point. */
static const struct nf_chain_type filter_ipv4 = {
	.name		= "filter",
	.type		= NFT_CHAIN_T_DEFAULT,
	.family		= NFPROTO_IPV4,
	.owner		= THIS_MODULE,
	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
			  (1 << NF_INET_LOCAL_OUT) |
			  (1 << NF_INET_FORWARD) |
			  (1 << NF_INET_PRE_ROUTING) |
			  (1 << NF_INET_POST_ROUTING),
};
|
||||
|
||||
/* Module init: register the filter chain type, then the pernet ops.
 *
 * Fix: the return value of nft_register_chain_type() was ignored; it
 * returns a negative errno on failure (see nft_chain_nat_init, which
 * checks it), so bail out early instead of registering pernet ops
 * against an unregistered chain type.
 */
static int __init nf_tables_ipv4_init(void)
{
	int ret;

	ret = nft_register_chain_type(&filter_ipv4);
	if (ret < 0)
		return ret;

	ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
	if (ret < 0)
		nft_unregister_chain_type(&filter_ipv4);

	return ret;
}
|
||||
|
||||
/* Module exit: tear down in reverse registration order. */
static void __exit nf_tables_ipv4_exit(void)
{
	unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
	nft_unregister_chain_type(&filter_ipv4);
}
|
||||
|
||||
module_init(nf_tables_ipv4_init);
|
||||
module_exit(nf_tables_ipv4_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_ALIAS_NFT_FAMILY(AF_INET);
|
||||
116
net/ipv4/netfilter/nft_chain_nat_ipv4.c
Normal file
116
net/ipv4/netfilter/nft_chain_nat_ipv4.c
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
/*
|
||||
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
|
||||
* Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
|
||||
* Copyright (c) 2012 Intel Corporation
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#include <net/netfilter/nf_nat.h>
|
||||
#include <net/netfilter/nf_nat_core.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_tables_ipv4.h>
|
||||
#include <net/netfilter/nf_nat_l3proto.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
static unsigned int nft_nat_do_chain(const struct nf_hook_ops *ops,
|
||||
struct sk_buff *skb,
|
||||
const struct net_device *in,
|
||||
const struct net_device *out,
|
||||
struct nf_conn *ct)
|
||||
{
|
||||
struct nft_pktinfo pkt;
|
||||
|
||||
nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);
|
||||
|
||||
return nft_do_chain(&pkt, ops);
|
||||
}
|
||||
|
||||
/* LOCAL_IN NAT hook: delegate to the generic IPv4 NAT engine with our
 * chain runner as the rule-evaluation callback.
 */
static unsigned int nft_nat_ipv4_fn(const struct nf_hook_ops *ops,
				    struct sk_buff *skb,
				    const struct net_device *in,
				    const struct net_device *out,
				    int (*okfn)(struct sk_buff *))
{
	return nf_nat_ipv4_fn(ops, skb, in, out, nft_nat_do_chain);
}
|
||||
|
||||
/* PRE_ROUTING NAT hook (DNAT side): thin wrapper over nf_nat_ipv4_in(). */
static unsigned int nft_nat_ipv4_in(const struct nf_hook_ops *ops,
				    struct sk_buff *skb,
				    const struct net_device *in,
				    const struct net_device *out,
				    int (*okfn)(struct sk_buff *))
{
	return nf_nat_ipv4_in(ops, skb, in, out, nft_nat_do_chain);
}
|
||||
|
||||
/* POST_ROUTING NAT hook (SNAT side): thin wrapper over nf_nat_ipv4_out(). */
static unsigned int nft_nat_ipv4_out(const struct nf_hook_ops *ops,
				     struct sk_buff *skb,
				     const struct net_device *in,
				     const struct net_device *out,
				     int (*okfn)(struct sk_buff *))
{
	return nf_nat_ipv4_out(ops, skb, in, out, nft_nat_do_chain);
}
|
||||
|
||||
/* LOCAL_OUT NAT hook: thin wrapper over nf_nat_ipv4_local_fn(). */
static unsigned int nft_nat_ipv4_local_fn(const struct nf_hook_ops *ops,
					  struct sk_buff *skb,
					  const struct net_device *in,
					  const struct net_device *out,
					  int (*okfn)(struct sk_buff *))
{
	return nf_nat_ipv4_local_fn(ops, skb, in, out, nft_nat_do_chain);
}
|
||||
|
||||
/* "nat" chain type for IPv4: attachable at the four conntrack-NAT hook
 * points, each routed through the matching nf_nat_ipv4_* wrapper above.
 */
static const struct nf_chain_type nft_chain_nat_ipv4 = {
	.name		= "nat",
	.type		= NFT_CHAIN_T_NAT,
	.family		= NFPROTO_IPV4,
	.owner		= THIS_MODULE,
	.hook_mask	= (1 << NF_INET_PRE_ROUTING) |
			  (1 << NF_INET_POST_ROUTING) |
			  (1 << NF_INET_LOCAL_OUT) |
			  (1 << NF_INET_LOCAL_IN),
	.hooks		= {
		[NF_INET_PRE_ROUTING]	= nft_nat_ipv4_in,
		[NF_INET_POST_ROUTING]	= nft_nat_ipv4_out,
		[NF_INET_LOCAL_OUT]	= nft_nat_ipv4_local_fn,
		[NF_INET_LOCAL_IN]	= nft_nat_ipv4_fn,
	},
};
|
||||
|
||||
/* Register the IPv4 "nat" chain type. The registration result is
 * returned directly (0 on success, negative errno on failure), matching
 * the style of nft_chain_route_init().
 */
static int __init nft_chain_nat_init(void)
{
	return nft_register_chain_type(&nft_chain_nat_ipv4);
}
|
||||
|
||||
/* Unregister the IPv4 "nat" chain type on module unload. */
static void __exit nft_chain_nat_exit(void)
{
	nft_unregister_chain_type(&nft_chain_nat_ipv4);
}
|
||||
|
||||
module_init(nft_chain_nat_init);
|
||||
module_exit(nft_chain_nat_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
|
||||
90
net/ipv4/netfilter/nft_chain_route_ipv4.c
Normal file
90
net/ipv4/netfilter/nft_chain_route_ipv4.c
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
/*
|
||||
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
|
||||
* Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter_ipv4.h>
|
||||
#include <linux/netfilter/nfnetlink.h>
|
||||
#include <linux/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_tables_ipv4.h>
|
||||
#include <net/route.h>
|
||||
#include <net/ip.h>
|
||||
|
||||
/* LOCAL_OUT "route" chain: snapshot the routing-relevant fields before
 * running the chain, and re-route the packet afterwards if a rule
 * rewrote any of them (addresses, mark, or TOS).
 */
static unsigned int nf_route_table_hook(const struct nf_hook_ops *ops,
					struct sk_buff *skb,
					const struct net_device *in,
					const struct net_device *out,
					int (*okfn)(struct sk_buff *))
{
	unsigned int ret;
	struct nft_pktinfo pkt;
	u32 mark;
	__be32 saddr, daddr;
	u_int8_t tos;
	const struct iphdr *iph;

	/* root is playing with raw sockets. */
	if (skb->len < sizeof(struct iphdr) ||
	    ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	nft_set_pktinfo_ipv4(&pkt, ops, skb, in, out);

	/* Remember everything the routing decision depends on. */
	mark = skb->mark;
	iph = ip_hdr(skb);
	saddr = iph->saddr;
	daddr = iph->daddr;
	tos = iph->tos;

	ret = nft_do_chain(&pkt, ops);
	if (ret != NF_DROP && ret != NF_QUEUE) {
		/* Re-read the header: chain evaluation may have
		 * reallocated the skb data.
		 */
		iph = ip_hdr(skb);

		if (iph->saddr != saddr ||
		    iph->daddr != daddr ||
		    skb->mark != mark ||
		    iph->tos != tos)
			/* A routing-relevant field changed: redo the
			 * route lookup; drop if no route remains.
			 */
			if (ip_route_me_harder(skb, RTN_UNSPEC))
				ret = NF_DROP;
	}
	return ret;
}
|
||||
|
||||
/* "route" chain type for IPv4: only meaningful at LOCAL_OUT, where a
 * re-route after packet mangling is possible.
 */
static const struct nf_chain_type nft_chain_route_ipv4 = {
	.name		= "route",
	.type		= NFT_CHAIN_T_ROUTE,
	.family		= NFPROTO_IPV4,
	.owner		= THIS_MODULE,
	.hook_mask	= (1 << NF_INET_LOCAL_OUT),
	.hooks		= {
		[NF_INET_LOCAL_OUT]	= nf_route_table_hook,
	},
};
|
||||
|
||||
/* Register the IPv4 "route" chain type; returns 0 or negative errno. */
static int __init nft_chain_route_init(void)
{
	return nft_register_chain_type(&nft_chain_route_ipv4);
}
|
||||
|
||||
/* Unregister the IPv4 "route" chain type on module unload. */
static void __exit nft_chain_route_exit(void)
{
	nft_unregister_chain_type(&nft_chain_route_ipv4);
}
|
||||
|
||||
module_init(nft_chain_route_init);
|
||||
module_exit(nft_chain_route_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
|
||||
79
net/ipv4/netfilter/nft_masq_ipv4.c
Normal file
79
net/ipv4/netfilter/nft_masq_ipv4.c
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* Copyright (c) 2014 Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nft_masq.h>
|
||||
#include <net/netfilter/ipv4/nf_nat_masquerade.h>
|
||||
|
||||
static void nft_masq_ipv4_eval(const struct nft_expr *expr,
|
||||
struct nft_data data[NFT_REG_MAX + 1],
|
||||
const struct nft_pktinfo *pkt)
|
||||
{
|
||||
struct nft_masq *priv = nft_expr_priv(expr);
|
||||
struct nf_nat_range range;
|
||||
unsigned int verdict;
|
||||
|
||||
memset(&range, 0, sizeof(range));
|
||||
range.flags = priv->flags;
|
||||
|
||||
verdict = nf_nat_masquerade_ipv4(pkt->skb, pkt->ops->hooknum,
|
||||
&range, pkt->out);
|
||||
|
||||
data[NFT_REG_VERDICT].verdict = verdict;
|
||||
}
|
||||
|
||||
/* Forward declaration: the ops below reference the type defined after. */
static struct nft_expr_type nft_masq_ipv4_type;
/* Expression ops: eval is IPv4-specific, init/dump/validate come from
 * the family-independent nft_masq core.
 */
static const struct nft_expr_ops nft_masq_ipv4_ops = {
	.type		= &nft_masq_ipv4_type,
	.size		= NFT_EXPR_SIZE(sizeof(struct nft_masq)),
	.eval		= nft_masq_ipv4_eval,
	.init		= nft_masq_init,
	.dump		= nft_masq_dump,
	.validate	= nft_masq_validate,
};
|
||||
|
||||
/* "masq" expression type registration record for the IPv4 family. */
static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
	.family		= NFPROTO_IPV4,
	.name		= "masq",
	.ops		= &nft_masq_ipv4_ops,
	.policy		= nft_masq_policy,
	.maxattr	= NFTA_MASQ_MAX,
	.owner		= THIS_MODULE,
};
|
||||
|
||||
/* Module init: register the expression type, then the address-change
 * notifier that flushes masqueraded conntracks when an interface loses
 * its address.
 */
static int __init nft_masq_ipv4_module_init(void)
{
	int ret = nft_register_expr(&nft_masq_ipv4_type);

	if (ret < 0)
		return ret;

	nf_nat_masquerade_ipv4_register_notifier();
	return 0;
}
|
||||
|
||||
/* Module exit: unregister the expression and the notifier. */
static void __exit nft_masq_ipv4_module_exit(void)
{
	nft_unregister_expr(&nft_masq_ipv4_type);
	nf_nat_masquerade_ipv4_unregister_notifier();
}
|
||||
|
||||
module_init(nft_masq_ipv4_module_init);
|
||||
module_exit(nft_masq_ipv4_module_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo.borrero.glez@gmail.com>");
|
||||
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
|
||||
74
net/ipv4/netfilter/nft_reject_ipv4.c
Normal file
74
net/ipv4/netfilter/nft_reject_ipv4.c
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
|
||||
* Copyright (c) 2013 Eric Leblond <eric@regit.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* Development of this code funded by Astaro AG (http://www.astaro.com/)
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netlink.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/nf_tables.h>
|
||||
#include <net/netfilter/ipv4/nf_reject.h>
|
||||
#include <net/netfilter/nft_reject.h>
|
||||
|
||||
void nft_reject_ipv4_eval(const struct nft_expr *expr,
|
||||
struct nft_data data[NFT_REG_MAX + 1],
|
||||
const struct nft_pktinfo *pkt)
|
||||
{
|
||||
struct nft_reject *priv = nft_expr_priv(expr);
|
||||
|
||||
switch (priv->type) {
|
||||
case NFT_REJECT_ICMP_UNREACH:
|
||||
nf_send_unreach(pkt->skb, priv->icmp_code);
|
||||
break;
|
||||
case NFT_REJECT_TCP_RST:
|
||||
nf_send_reset(pkt->skb, pkt->ops->hooknum);
|
||||
break;
|
||||
}
|
||||
|
||||
data[NFT_REG_VERDICT].verdict = NF_DROP;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval);
|
||||
|
||||
/* Forward declaration: the ops below reference the type defined after. */
static struct nft_expr_type nft_reject_ipv4_type;
/* Expression ops: eval is IPv4-specific, init/dump come from the
 * family-independent nft_reject core.
 */
static const struct nft_expr_ops nft_reject_ipv4_ops = {
	.type		= &nft_reject_ipv4_type,
	.size		= NFT_EXPR_SIZE(sizeof(struct nft_reject)),
	.eval		= nft_reject_ipv4_eval,
	.init		= nft_reject_init,
	.dump		= nft_reject_dump,
};
|
||||
|
||||
/* "reject" expression type registration record for the IPv4 family. */
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
	.family		= NFPROTO_IPV4,
	.name		= "reject",
	.ops		= &nft_reject_ipv4_ops,
	.policy		= nft_reject_policy,
	.maxattr	= NFTA_REJECT_MAX,
	.owner		= THIS_MODULE,
};
|
||||
|
||||
/* Register the IPv4 "reject" expression; returns 0 or negative errno. */
static int __init nft_reject_ipv4_module_init(void)
{
	return nft_register_expr(&nft_reject_ipv4_type);
}
|
||||
|
||||
/* Unregister the IPv4 "reject" expression on module unload. */
static void __exit nft_reject_ipv4_module_exit(void)
{
	nft_unregister_expr(&nft_reject_ipv4_type);
}
|
||||
|
||||
module_init(nft_reject_ipv4_module_init);
|
||||
module_exit(nft_reject_ipv4_module_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
|
||||
1225
net/ipv4/ping.c
Normal file
1225
net/ipv4/ping.c
Normal file
File diff suppressed because it is too large
Load diff
529
net/ipv4/proc.c
Normal file
529
net/ipv4/proc.c
Normal file
|
|
@ -0,0 +1,529 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* This file implements the various access functions for the
|
||||
* PROC file system. It is mainly used for debugging and
|
||||
* statistics.
|
||||
*
|
||||
* Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
||||
* Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
|
||||
* Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
|
||||
* Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : UDP sockets show the rxqueue/txqueue
|
||||
* using hint flag for the netinfo.
|
||||
* Pauline Middelink : identd support
|
||||
* Alan Cox : Make /proc safer.
|
||||
* Erik Schoenfelder : /proc/net/snmp
|
||||
* Alan Cox : Handle dead sockets properly.
|
||||
* Gerhard Koerting : Show both timers
|
||||
* Alan Cox : Allow inode to be NULL (kernel socket)
|
||||
* Andi Kleen : Add support for open_requests and
|
||||
* split functions for more readibility.
|
||||
* Andi Kleen : Add support for /proc/net/netstat
|
||||
* Arnaldo C. Melo : Convert to seq_file
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#include <linux/types.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/protocol.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/udplite.h>
|
||||
#include <linux/bottom_half.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/proc_fs.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/export.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/raw.h>
|
||||
|
||||
/*
|
||||
* Report socket allocation statistics [mea@utu.fi]
|
||||
*/
|
||||
/* Emit /proc/net/sockstat: per-protocol socket usage for this netns.
 * Always returns 0 (seq_file success).
 */
static int sockstat_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq->private;
	unsigned int frag_mem;
	int orphans, sockets;

	/* The percpu counter sums race with softirq updates; disable BH
	 * while folding them.
	 */
	local_bh_disable();
	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
	local_bh_enable();

	socket_seq_show(seq);
	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
		   sock_prot_inuse_get(net, &tcp_prot), orphans,
		   tcp_death_row.tw_count, sockets,
		   proto_memory_allocated(&tcp_prot));
	seq_printf(seq, "UDP: inuse %d mem %ld\n",
		   sock_prot_inuse_get(net, &udp_prot),
		   proto_memory_allocated(&udp_prot));
	seq_printf(seq, "UDPLITE: inuse %d\n",
		   sock_prot_inuse_get(net, &udplite_prot));
	seq_printf(seq, "RAW: inuse %d\n",
		   sock_prot_inuse_get(net, &raw_prot));
	frag_mem = ip_frag_mem(net);
	/* "inuse" is just a 0/1 flag derived from the memory figure. */
	seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
	return 0;
}
|
||||
|
||||
/* open() handler for /proc/net/sockstat: single-shot, netns-aware. */
static int sockstat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, sockstat_seq_show);
}
|
||||
|
||||
/* File operations for /proc/net/sockstat. */
static const struct file_operations sockstat_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = sockstat_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
|
||||
|
||||
/* snmp items */
|
||||
/* snmp items */
/* IP MIB counters shown on the "Ip:" lines of /proc/net/snmp. */
static const struct snmp_mib snmp4_ipstats_list[] = {
	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INPKTS),
	SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
	SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
	SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
	SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
	SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
	SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
	SNMP_MIB_SENTINEL
};
|
||||
|
||||
/* Following items are displayed in /proc/net/netstat */
|
||||
/* Following items are displayed in /proc/net/netstat */
/* Extended IP counters ("IpExt:" lines). */
static const struct snmp_mib snmp4_ipextstats_list[] = {
	SNMP_MIB_ITEM("InNoRoutes", IPSTATS_MIB_INNOROUTES),
	SNMP_MIB_ITEM("InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS),
	SNMP_MIB_ITEM("InMcastPkts", IPSTATS_MIB_INMCASTPKTS),
	SNMP_MIB_ITEM("OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS),
	SNMP_MIB_ITEM("InBcastPkts", IPSTATS_MIB_INBCASTPKTS),
	SNMP_MIB_ITEM("OutBcastPkts", IPSTATS_MIB_OUTBCASTPKTS),
	SNMP_MIB_ITEM("InOctets", IPSTATS_MIB_INOCTETS),
	SNMP_MIB_ITEM("OutOctets", IPSTATS_MIB_OUTOCTETS),
	SNMP_MIB_ITEM("InMcastOctets", IPSTATS_MIB_INMCASTOCTETS),
	SNMP_MIB_ITEM("OutMcastOctets", IPSTATS_MIB_OUTMCASTOCTETS),
	SNMP_MIB_ITEM("InBcastOctets", IPSTATS_MIB_INBCASTOCTETS),
	SNMP_MIB_ITEM("OutBcastOctets", IPSTATS_MIB_OUTBCASTOCTETS),
	/* Non RFC4293 fields */
	SNMP_MIB_ITEM("InCsumErrors", IPSTATS_MIB_CSUMERRORS),
	SNMP_MIB_ITEM("InNoECTPkts", IPSTATS_MIB_NOECTPKTS),
	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
	SNMP_MIB_SENTINEL
};
|
||||
|
||||
/* Maps the human-readable ICMP counter name to the message type index
 * used in icmpmsg_statistics. Terminated by a NULL-name entry.
 */
static const struct {
	const char *name;
	int index;
} icmpmibmap[] = {
	{ "DestUnreachs", ICMP_DEST_UNREACH },
	{ "TimeExcds", ICMP_TIME_EXCEEDED },
	{ "ParmProbs", ICMP_PARAMETERPROB },
	{ "SrcQuenchs", ICMP_SOURCE_QUENCH },
	{ "Redirects", ICMP_REDIRECT },
	{ "Echos", ICMP_ECHO },
	{ "EchoReps", ICMP_ECHOREPLY },
	{ "Timestamps", ICMP_TIMESTAMP },
	{ "TimestampReps", ICMP_TIMESTAMPREPLY },
	{ "AddrMasks", ICMP_ADDRESS },
	{ "AddrMaskReps", ICMP_ADDRESSREPLY },
	{ NULL, 0 }
};
|
||||
|
||||
|
||||
/* TCP MIB counters shown on the "Tcp:" lines of /proc/net/snmp. */
static const struct snmp_mib snmp4_tcp_list[] = {
	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
	SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
	SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
	SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
	SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
	SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
	SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
	SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
	SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
	SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
	SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
	SNMP_MIB_SENTINEL
};
|
||||
|
||||
/* UDP MIB counters; reused unchanged for the UDP-Lite lines. */
static const struct snmp_mib snmp4_udp_list[] = {
	SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
	SNMP_MIB_ITEM("RcvbufErrors", UDP_MIB_RCVBUFERRORS),
	SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
	SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
	SNMP_MIB_SENTINEL
};
|
||||
|
||||
/* Linux-specific ("TcpExt:") counters shown in /proc/net/netstat. */
static const struct snmp_mib snmp4_net_list[] = {
	SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
	SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
	SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
	SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
	SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
	SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
	SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
	SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
	SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
	SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
	SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
	SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
	SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
	SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
	SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
	SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
	SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
	SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
	SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
	SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
	SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
	SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
	SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES),
	SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY),
	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
	SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
	SNMP_MIB_ITEM("TCPSACKDiscard", LINUX_MIB_TCPSACKDISCARD),
	SNMP_MIB_ITEM("TCPDSACKIgnoredOld", LINUX_MIB_TCPDSACKIGNOREDOLD),
	SNMP_MIB_ITEM("TCPDSACKIgnoredNoUndo", LINUX_MIB_TCPDSACKIGNOREDNOUNDO),
	SNMP_MIB_ITEM("TCPSpuriousRTOs", LINUX_MIB_TCPSPURIOUSRTOS),
	SNMP_MIB_ITEM("TCPMD5NotFound", LINUX_MIB_TCPMD5NOTFOUND),
	SNMP_MIB_ITEM("TCPMD5Unexpected", LINUX_MIB_TCPMD5UNEXPECTED),
	SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
	SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
	SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
	SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
	SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
	SNMP_MIB_ITEM("TCPDeferAcceptDrop", LINUX_MIB_TCPDEFERACCEPTDROP),
	SNMP_MIB_ITEM("IPReversePathFilter", LINUX_MIB_IPRPFILTER),
	SNMP_MIB_ITEM("TCPTimeWaitOverflow", LINUX_MIB_TCPTIMEWAITOVERFLOW),
	SNMP_MIB_ITEM("TCPReqQFullDoCookies", LINUX_MIB_TCPREQQFULLDOCOOKIES),
	SNMP_MIB_ITEM("TCPReqQFullDrop", LINUX_MIB_TCPREQQFULLDROP),
	SNMP_MIB_ITEM("TCPRetransFail", LINUX_MIB_TCPRETRANSFAIL),
	SNMP_MIB_ITEM("TCPRcvCoalesce", LINUX_MIB_TCPRCVCOALESCE),
	SNMP_MIB_ITEM("TCPOFOQueue", LINUX_MIB_TCPOFOQUEUE),
	SNMP_MIB_ITEM("TCPOFODrop", LINUX_MIB_TCPOFODROP),
	SNMP_MIB_ITEM("TCPOFOMerge", LINUX_MIB_TCPOFOMERGE),
	SNMP_MIB_ITEM("TCPChallengeACK", LINUX_MIB_TCPCHALLENGEACK),
	SNMP_MIB_ITEM("TCPSYNChallenge", LINUX_MIB_TCPSYNCHALLENGE),
	SNMP_MIB_ITEM("TCPFastOpenActive", LINUX_MIB_TCPFASTOPENACTIVE),
	SNMP_MIB_ITEM("TCPFastOpenActiveFail", LINUX_MIB_TCPFASTOPENACTIVEFAIL),
	SNMP_MIB_ITEM("TCPFastOpenPassive", LINUX_MIB_TCPFASTOPENPASSIVE),
	SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL),
	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
	SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS),
	SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING),
	SNMP_MIB_ITEM("TCPFromZeroWindowAdv", LINUX_MIB_TCPFROMZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPToZeroWindowAdv", LINUX_MIB_TCPTOZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPWantZeroWindowAdv", LINUX_MIB_TCPWANTZEROWINDOWADV),
	SNMP_MIB_ITEM("TCPSynRetrans", LINUX_MIB_TCPSYNRETRANS),
	SNMP_MIB_ITEM("TCPOrigDataSent", LINUX_MIB_TCPORIGDATASENT),
	SNMP_MIB_SENTINEL
};
|
||||
|
||||
/* Print one pair of IcmpMsg lines (names, then values) for up to
 * `count` collected counters. Bit 0x100 in the type marks an outgoing
 * counter; the low byte is the ICMP message type.
 */
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
			     unsigned short *type, int count)
{
	int j;

	if (count) {
		seq_printf(seq, "\nIcmpMsg:");
		for (j = 0; j < count; ++j)
			seq_printf(seq, " %sType%u",
				type[j] & 0x100 ? "Out" : "In",
				type[j] & 0xff);
		seq_printf(seq, "\nIcmpMsg:");
		for (j = 0; j < count; ++j)
			seq_printf(seq, " %lu", vals[j]);
	}
}
|
||||
|
||||
/* Emit the IcmpMsg section: walk the per-netns per-message counters,
 * skip zero entries, and flush in batches of PERLINE columns so lines
 * stay readable.
 */
static void icmpmsg_put(struct seq_file *seq)
{
#define PERLINE	16

	int i, count;
	unsigned short type[PERLINE];
	unsigned long vals[PERLINE], val;
	struct net *net = seq->private;

	count = 0;
	for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
		val = atomic_long_read(&net->mib.icmpmsg_statistics->mibs[i]);
		if (val) {
			type[count] = i;
			vals[count++] = val;
		}
		if (count == PERLINE) {
			icmpmsg_put_line(seq, vals, type, count);
			count = 0;
		}
	}
	/* Flush any partially filled final batch. */
	icmpmsg_put_line(seq, vals, type, count);

#undef PERLINE
}
|
||||
|
||||
/* Emit the RFC 2011 "Icmp:" header and value lines: aggregate in/out
 * message and error counts plus the per-type counters named in
 * icmpmibmap. Outgoing per-type counters live at index | 0x100.
 */
static void icmp_put(struct seq_file *seq)
{
	int i;
	struct net *net = seq->private;
	atomic_long_t *ptr = net->mib.icmpmsg_statistics->mibs;

	seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
	for (i = 0; icmpmibmap[i].name != NULL; i++)
		seq_printf(seq, " In%s", icmpmibmap[i].name);
	seq_printf(seq, " OutMsgs OutErrors");
	for (i = 0; icmpmibmap[i].name != NULL; i++)
		seq_printf(seq, " Out%s", icmpmibmap[i].name);
	seq_printf(seq, "\nIcmp: %lu %lu %lu",
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INMSGS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_INERRORS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_CSUMERRORS));
	for (i = 0; icmpmibmap[i].name != NULL; i++)
		seq_printf(seq, " %lu",
			   atomic_long_read(ptr + icmpmibmap[i].index));
	seq_printf(seq, " %lu %lu",
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
		snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
	for (i = 0; icmpmibmap[i].name != NULL; i++)
		seq_printf(seq, " %lu",
			   atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
}
|
||||
|
||||
/*
 * Called from the PROCfs module. This outputs /proc/net/snmp.
 * Sections appear in MIB order: Ip, Icmp, IcmpMsg, Tcp, Udp, UdpLite,
 * each as a name line followed by a value line.
 */
static int snmp_seq_show(struct seq_file *seq, void *v)
{
	int i;
	struct net *net = seq->private;

	seq_puts(seq, "Ip: Forwarding DefaultTTL");

	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);

	/* RFC 2011 ipForwarding: 1 = forwarding, 2 = not forwarding */
	seq_printf(seq, "\nIp: %d %d",
		   IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
		   sysctl_ip_default_ttl);

	/* snmp_fold_field64() below relies on mibs[] being at offset 0 */
	BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
		seq_printf(seq, " %llu",
			   snmp_fold_field64(net->mib.ip_statistics,
					     snmp4_ipstats_list[i].entry,
					     offsetof(struct ipstats_mib, syncp)));

	icmp_put(seq);	/* RFC 2011 compatibility */
	icmpmsg_put(seq);

	seq_puts(seq, "\nTcp:");
	for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_tcp_list[i].name);

	seq_puts(seq, "\nTcp:");
	for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
		/* MaxConn field is signed, RFC 2012 */
		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
			seq_printf(seq, " %ld",
				   snmp_fold_field(net->mib.tcp_statistics,
						   snmp4_tcp_list[i].entry));
		else
			seq_printf(seq, " %lu",
				   snmp_fold_field(net->mib.tcp_statistics,
						   snmp4_tcp_list[i].entry));
	}

	seq_puts(seq, "\nUdp:");
	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_udp_list[i].name);

	seq_puts(seq, "\nUdp:");
	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
		seq_printf(seq, " %lu",
			   snmp_fold_field(net->mib.udp_statistics,
					   snmp4_udp_list[i].entry));

	/* the UDP and UDP-Lite MIBs are the same */
	seq_puts(seq, "\nUdpLite:");
	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_udp_list[i].name);

	seq_puts(seq, "\nUdpLite:");
	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
		seq_printf(seq, " %lu",
			   snmp_fold_field(net->mib.udplite_statistics,
					   snmp4_udp_list[i].entry));

	seq_putc(seq, '\n');
	return 0;
}
|
||||
|
||||
/* open() handler for /proc/net/snmp; binds seq->private to the netns. */
static int snmp_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, snmp_seq_show);
}
|
||||
|
||||
/* File operations for the read-only /proc/net/snmp entry. */
static const struct file_operations snmp_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = snmp_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
|
||||
|
||||
|
||||
|
||||
/*
 *	Output /proc/net/netstat: the extended TcpExt and IpExt MIBs,
 *	each as a name line followed by a value line.
 */
static int netstat_seq_show(struct seq_file *seq, void *v)
{
	int i;
	struct net *net = seq->private;

	seq_puts(seq, "TcpExt:");
	for (i = 0; snmp4_net_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_net_list[i].name);

	seq_puts(seq, "\nTcpExt:");
	for (i = 0; snmp4_net_list[i].name != NULL; i++)
		seq_printf(seq, " %lu",
			   snmp_fold_field(net->mib.net_statistics,
					   snmp4_net_list[i].entry));

	seq_puts(seq, "\nIpExt:");
	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
		seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);

	seq_puts(seq, "\nIpExt:");
	/* IpExt values are 64-bit; folded with the per-cpu syncp */
	for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
		seq_printf(seq, " %llu",
			   snmp_fold_field64(net->mib.ip_statistics,
					     snmp4_ipextstats_list[i].entry,
					     offsetof(struct ipstats_mib, syncp)));

	seq_putc(seq, '\n');
	return 0;
}
|
||||
|
||||
/* open() handler for /proc/net/netstat; binds seq->private to the netns. */
static int netstat_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, netstat_seq_show);
}
|
||||
|
||||
/* File operations for the read-only /proc/net/netstat entry. */
static const struct file_operations netstat_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = netstat_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
|
||||
|
||||
/*
 * Per-netns init: create the sockstat, netstat and snmp proc entries.
 * On failure the entries created so far are unwound via the goto chain,
 * in reverse creation order. Returns 0 or -ENOMEM.
 */
static __net_init int ip_proc_init_net(struct net *net)
{
	if (!proc_create("sockstat", S_IRUGO, net->proc_net,
			 &sockstat_seq_fops))
		goto out_sockstat;
	if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
		goto out_netstat;
	if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
		goto out_snmp;

	return 0;

out_snmp:
	remove_proc_entry("netstat", net->proc_net);
out_netstat:
	remove_proc_entry("sockstat", net->proc_net);
out_sockstat:
	return -ENOMEM;
}
|
||||
|
||||
/* Per-netns teardown: remove the three entries created by ip_proc_init_net(). */
static __net_exit void ip_proc_exit_net(struct net *net)
{
	remove_proc_entry("snmp", net->proc_net);
	remove_proc_entry("netstat", net->proc_net);
	remove_proc_entry("sockstat", net->proc_net);
}
|
||||
|
||||
/* pernet hooks so each network namespace gets its own proc entries. */
static __net_initdata struct pernet_operations ip_proc_ops = {
	.init = ip_proc_init_net,
	.exit = ip_proc_exit_net,
};
|
||||
|
||||
/* Boot-time entry point: register the pernet proc operations above. */
int __init ip_misc_proc_init(void)
{
	return register_pernet_subsys(&ip_proc_ops);
}
|
||||
|
||||
79
net/ipv4/protocol.c
Normal file
79
net/ipv4/protocol.c
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
/*
|
||||
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
||||
* operating system. INET is implemented using the BSD Socket
|
||||
* interface as the means of communication with the user level.
|
||||
*
|
||||
* INET protocol dispatch tables.
|
||||
*
|
||||
* Authors: Ross Biro
|
||||
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
||||
*
|
||||
* Fixes:
|
||||
* Alan Cox : Ahah! udp icmp errors don't work because
|
||||
* udp_err is never called!
|
||||
* Alan Cox : Added new fields for init and ready for
|
||||
* proper fragmentation (_NO_ 4K limits!)
|
||||
* Richard Colella : Hang on hash collision
|
||||
* Vince Laviano : Modified inet_del_protocol() to correctly
|
||||
* maintain copy bit.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
#include <linux/cache.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <net/protocol.h>
|
||||
|
||||
/*
 * Dispatch tables indexed by IP protocol number; entries are
 * RCU-protected pointers updated with cmpxchg (see helpers below).
 */
const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
|
||||
|
||||
/*
 * Register a handler for an IP protocol number.
 * Returns 0 on success, -1 if the slot is already taken, and
 * -EINVAL if the protocol is not network-namespace aware.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
		       protocol);
		return -EINVAL;
	}

	/* cmpxchg installs only into an empty (NULL) slot, race-free */
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_protocol);
|
||||
|
||||
/*
 * Register a GSO/GRO offload handler for an IP protocol number.
 * Returns 0 on success, -1 if the slot is already taken.
 */
int inet_add_offload(const struct net_offload *prot, unsigned char protocol)
{
	return !cmpxchg((const struct net_offload **)&inet_offloads[protocol],
			NULL, prot) ? 0 : -1;
}
EXPORT_SYMBOL(inet_add_offload);
|
||||
|
||||
/*
 * Unregister a protocol handler. Only succeeds (returns 0) if @prot is
 * what is currently installed; returns -1 otherwise. synchronize_net()
 * makes sure no RCU reader still holds the old pointer on return.
 */
int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	int ret;

	ret = (cmpxchg((const struct net_protocol **)&inet_protos[protocol],
		       prot, NULL) == prot) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(inet_del_protocol);
|
||||
|
||||
/*
 * Unregister an offload handler; same contract as inet_del_protocol():
 * 0 if @prot was installed and removed, -1 otherwise, with an RCU
 * grace period before returning.
 */
int inet_del_offload(const struct net_offload *prot, unsigned char protocol)
{
	int ret;

	ret = (cmpxchg((const struct net_offload **)&inet_offloads[protocol],
		       prot, NULL) == prot) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(inet_del_offload);
|
||||
1085
net/ipv4/raw.c
Normal file
1085
net/ipv4/raw.c
Normal file
File diff suppressed because it is too large
Load diff
2790
net/ipv4/route.c
Normal file
2790
net/ipv4/route.c
Normal file
File diff suppressed because it is too large
Load diff
366
net/ipv4/syncookies.c
Normal file
366
net/ipv4/syncookies.c
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
/*
|
||||
* Syncookies implementation for the Linux kernel
|
||||
*
|
||||
* Copyright (C) 1997 Andi Kleen
|
||||
* Based on ideas by D.J.Bernstein and Eric Schenk.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/cryptohash.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/export.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/route.h>
|
||||
|
||||
/* Timestamps: lowest bits store TCP options */
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)

extern int sysctl_tcp_syncookies;

/* Two secrets (one per hash round); lazily seeded on first use. */
static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;

#define COOKIEBITS 24	/* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)

/* Per-cpu scratch buffer for the SHA transform in cookie_hash(). */
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
		      ipv4_cookie_scratch);
|
||||
|
||||
/*
 * Hash the 4-tuple plus @count with secret @c via one SHA transform.
 * The input block layout (addresses/ports/count in words 0-3, secret
 * from word 4) is fixed; do not reorder the stores below.
 */
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
		       u32 count, int c)
{
	__u32 *tmp;

	net_get_random_once(syncookie_secret, sizeof(syncookie_secret));

	tmp  = this_cpu_ptr(ipv4_cookie_scratch);
	memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c]));
	tmp[0] = (__force u32)saddr;
	tmp[1] = (__force u32)daddr;
	tmp[2] = ((__force u32)sport << 16) + (__force u32)dport;
	tmp[3] = count;
	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);

	/* one word of the digest is enough for the cookie */
	return tmp[17];
}
|
||||
|
||||
|
||||
/*
 * when syncookies are in effect and tcp timestamps are enabled we encode
 * tcp options in the lower bits of the timestamp value that will be
 * sent in the syn-ack.
 * Since subsequent timestamps use the normal tcp_time_stamp value, we
 * must make sure that the resulting initial timestamp is <= tcp_time_stamp.
 *
 * Option encoding (see cookie_check_timestamp for the decoder):
 * bits 0-3 snd_wscale (0xf = none), bit 4 SACK, bit 5 ECN.
 */
__u32 cookie_init_timestamp(struct request_sock *req)
{
	struct inet_request_sock *ireq;
	u32 ts, ts_now = tcp_time_stamp;
	u32 options = 0;

	ireq = inet_rsk(req);

	options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
	options |= ireq->sack_ok << 4;
	options |= ireq->ecn_ok << 5;

	ts = ts_now & ~TSMASK;
	ts |= options;
	/* if stuffing the options pushed us past "now", step back one tick */
	if (ts > ts_now) {
		ts >>= TSBITS;
		ts--;
		ts <<= TSBITS;
		ts |= options;
	}
	return ts;
}
|
||||
|
||||
|
||||
static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport,
				   __be16 dport, __u32 sseq, __u32 data)
{
	/*
	 * Compute the secure sequence number.
	 * The output should be:
	 *   HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
	 *      + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
	 * Where sseq is their sequence number and count increases every
	 * minute by 1.
	 * As an extra hack, we add a small "data" value that encodes the
	 * MSS into the second hash value.
	 */
	u32 count = tcp_cookie_time();
	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
		sseq + (count << COOKIEBITS) +
		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
		 & COOKIEMASK));
}
|
||||
|
||||
/*
 * This retrieves the small "data" value from the syncookie.
 * If the syncookie is bad, the data returned will be out of
 * range. This must be checked by the caller.
 *
 * The count value used to generate the cookie must be less than
 * MAX_SYNCOOKIE_AGE minutes in the past.
 * The return value (__u32)-1 if this test fails.
 */
static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr,
				  __be16 sport, __be16 dport, __u32 sseq)
{
	u32 diff, count = tcp_cookie_time();

	/* Strip away the layers from the cookie */
	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;

	/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS);
	if (diff >= MAX_SYNCOOKIE_AGE)
		return (__u32)-1;

	return (cookie -
		cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
		& COOKIEMASK;	/* Leaving the data behind */
}
|
||||
|
||||
/*
 * MSS Values are chosen based on the 2011 paper
 * 'An Analysis of TCP Maximum Segment Sizes' by S. Alcock and R. Nelson.
 * Values ..
 *  .. lower than 536 are rare (< 0.2%)
 *  .. between 537 and 1299 account for less than < 1.5% of observed values
 *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values
 *  .. exceeding 1460 are very rare (< 0.04%)
 *
 *  1460 is the single most frequently announced mss value (30 to 46% depending
 *  on monitor location).  Table must be sorted (ascending): the cookie stores
 *  an index into this table, so at most 2^TSBITS entries are representable.
 */
static __u16 const msstab[] = {
	536,
	1300,
	1440,	/* 1440, 1452: PPPoE */
	1460,
};
|
||||
|
||||
/*
 * Generate a syncookie.  mssp points to the mss, which is returned
 * rounded down to the value encoded in the cookie.
 */
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
			      u16 *mssp)
{
	int mssind;
	const __u16 mss = *mssp;

	/* walk the table backwards to find the largest entry <= mss */
	for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--)
		if (mss >= msstab[mssind])
			break;
	*mssp = msstab[mssind];

	return secure_tcp_syn_cookie(iph->saddr, iph->daddr,
				     th->source, th->dest, ntohl(th->seq),
				     mssind);
}
EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
|
||||
|
||||
/*
 * Wrapper used on the SYN path: records listener overflow, bumps the
 * SNMP counter, then derives the cookie from the packet headers.
 */
__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
			      __u16 *mssp)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct tcphdr *th = tcp_hdr(skb);

	tcp_synq_overflow(sk);
	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT);

	return __cookie_v4_init_sequence(iph, th, mssp);
}
|
||||
|
||||
/*
 * Check if a ack sequence number is a valid syncookie.
 * Return the decoded mss if it is, or 0 if not.
 */
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
		      u32 cookie)
{
	__u32 seq = ntohl(th->seq) - 1;
	__u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
					    th->source, th->dest, seq);

	/* an out-of-range index (including (__u32)-1 on age failure) maps to 0 */
	return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
EXPORT_SYMBOL_GPL(__cookie_v4_check);
|
||||
|
||||
/*
 * Create the child socket for a validated cookie ACK and queue it on
 * the listener; frees @req if child creation fails.  Returns the child
 * socket or NULL.
 */
static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
					   struct request_sock *req,
					   struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *child;

	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
	if (child)
		inet_csk_reqsk_queue_add(sk, req, child);
	else
		reqsk_free(req);

	return child;
}
|
||||
|
||||
|
||||
/*
 * when syncookies are in effect and tcp timestamps are enabled we stored
 * additional tcp options in the timestamp.
 * This extracts these options from the timestamp echo.
 *
 * The lowest 4 bits store snd_wscale.
 * next 2 bits indicate SACK and ECN support.
 *
 * return false if we decode an option that should not be.
 */
bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
			    struct net *net, bool *ecn_ok)
{
	/* echoed timestamp, lowest bits contain options */
	u32 options = tcp_opt->rcv_tsecr & TSMASK;

	/* no timestamp echoed: nothing was encoded, accept with cleared opts */
	if (!tcp_opt->saw_tstamp) {
		tcp_clear_options(tcp_opt);
		return true;
	}

	if (!sysctl_tcp_timestamps)
		return false;

	tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0;
	*ecn_ok = (options >> 5) & 1;
	if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
		return false;

	if (tcp_opt->sack_ok && !sysctl_tcp_sack)
		return false;

	/* 0xf in the wscale bits means window scaling was not negotiated */
	if ((options & 0xf) == 0xf)
		return true; /* no window scaling */

	tcp_opt->wscale_ok = 1;
	tcp_opt->snd_wscale = options & 0xf;
	return sysctl_tcp_window_scaling != 0;
}
EXPORT_SYMBOL(cookie_check_timestamp);
|
||||
|
||||
/*
 * Validate the ACK of a syncookie and, if good, rebuild the request
 * sock that was never stored, create the child socket and return it.
 * Returns the original listener @sk when cookies don't apply, NULL on
 * failure, or the new child socket.
 */
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
	struct tcp_options_received tcp_opt;
	struct inet_request_sock *ireq;
	struct tcp_request_sock *treq;
	struct tcp_sock *tp = tcp_sk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	__u32 cookie = ntohl(th->ack_seq) - 1;
	struct sock *ret = sk;
	struct request_sock *req;
	int mss;
	struct rtable *rt;
	__u8 rcv_wscale;
	bool ecn_ok = false;
	struct flowi4 fl4;

	if (!sysctl_tcp_syncookies || !th->ack || th->rst)
		goto out;

	/* only accept cookies shortly after a genuine queue overflow */
	if (tcp_synq_no_recent_overflow(sk) ||
	    (mss = __cookie_v4_check(ip_hdr(skb), th, cookie)) == 0) {
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
		goto out;
	}

	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);

	/* check for timestamp cookie support */
	memset(&tcp_opt, 0, sizeof(tcp_opt));
	tcp_parse_options(skb, &tcp_opt, 0, NULL);

	if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
		goto out;

	ret = NULL;
	req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */
	if (!req)
		goto out;

	/* Reconstruct the request sock from the ACK's headers/options. */
	ireq = inet_rsk(req);
	treq = tcp_rsk(req);
	treq->rcv_isn		= ntohl(th->seq) - 1;
	treq->snt_isn		= cookie;
	req->mss		= mss;
	ireq->ir_num		= ntohs(th->dest);
	ireq->ir_rmt_port	= th->source;
	ireq->ir_loc_addr	= ip_hdr(skb)->daddr;
	ireq->ir_rmt_addr	= ip_hdr(skb)->saddr;
	ireq->ir_mark		= inet_request_mark(sk, skb);
	ireq->ecn_ok		= ecn_ok;
	ireq->snd_wscale	= tcp_opt.snd_wscale;
	ireq->sack_ok		= tcp_opt.sack_ok;
	ireq->wscale_ok		= tcp_opt.wscale_ok;
	ireq->tstamp_ok		= tcp_opt.saw_tstamp;
	req->ts_recent		= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
	treq->snt_synack	= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
	treq->listener		= NULL;

	/* We threw the options of the initial SYN away, so we hope
	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
	 */
	ireq->opt = tcp_v4_save_options(skb);

	if (security_inet_conn_request(sk, skb, req)) {
		reqsk_free(req);
		goto out;
	}

	req->expires	= 0UL;
	req->num_retrans = 0;

	/*
	 * We need to lookup the route here to get at the correct
	 * window size. We should better make sure that the window size
	 * hasn't changed since we received the original syn, but I see
	 * no easy way to do this.
	 */
	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
			   inet_sk_flowi_flags(sk),
			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
			   ireq->ir_loc_addr, th->source, th->dest,
			   sock_i_uid(sk));
	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
	rt = ip_route_output_key(sock_net(sk), &fl4);
	if (IS_ERR(rt)) {
		reqsk_free(req);
		goto out;
	}

	/* Try to redo what tcp_v4_send_synack did. */
	req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);

	tcp_select_initial_window(tcp_full_space(sk), req->mss,
				  &req->rcv_wnd, &req->window_clamp,
				  ireq->wscale_ok, &rcv_wscale,
				  dst_metric(&rt->dst, RTAX_INITRWND));

	ireq->rcv_wscale  = rcv_wscale;

	ret = get_cookie_sock(sk, skb, req, &rt->dst);
	/* ip_queue_xmit() depends on our flow being setup
	 * Normal sockets get it right from inet_csk_route_child_sock()
	 */
	if (ret)
		inet_sk(ret)->cork.fl.u.ip4 = fl4;
out:	return ret;
}
|
||||
964
net/ipv4/sysctl_net_ipv4.c
Normal file
964
net/ipv4/sysctl_net_ipv4.c
Normal file
|
|
@ -0,0 +1,964 @@
|
|||
/*
|
||||
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
|
||||
*
|
||||
* Begun April 1, 1996, Mike Shaver.
|
||||
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sysctl.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <linux/inetdevice.h>
|
||||
#include <linux/seqlock.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/swap.h>
|
||||
#include <net/snmp.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/route.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/cipso_ipv4.h>
|
||||
#include <net/inet_frag.h>
|
||||
#include <net/ping.h>
|
||||
#include <net/tcp_memcontrol.h>
|
||||
|
||||
/* Min/max bounds shared by the proc_dointvec_minmax handlers below. */
static int zero;
static int one = 1;
static int four = 4;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
static int ip_ping_group_range_min[] = { 0, 0 };
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
|
||||
|
||||
/* Update system visible IP port range (writer side of the seqlock). */
static void set_local_port_range(struct net *net, int range[2])
{
	write_seqlock(&net->ipv4.ip_local_ports.lock);
	net->ipv4.ip_local_ports.range[0] = range[0];
	net->ipv4.ip_local_ports.range[1] = range[1];
	write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
|
||||
|
||||
/*
 * Validate changes from /proc interface (ip_local_port_range).
 * Writes go through a stack copy so a rejected range (high < low)
 * never reaches the live per-netns values.
 */
static int ipv4_local_port_range(struct ctl_table *table, int write,
				 void __user *buffer,
				 size_t *lenp, loff_t *ppos)
{
	struct net *net =
		container_of(table->data, struct net, ipv4.ip_local_ports.range);
	int ret;
	int range[2];
	struct ctl_table tmp = {
		.data = &range,
		.maxlen = sizeof(range),
		.mode = table->mode,
		.extra1 = &ip_local_port_range_min,
		.extra2 = &ip_local_port_range_max,
	};

	inet_get_local_port_range(net, &range[0], &range[1]);

	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0) {
		if (range[1] < range[0])
			ret = -EINVAL;
		else
			set_local_port_range(net, range);
	}

	return ret;
}
|
||||
|
||||
|
||||
/*
 * Read the ping group range consistently.
 * NOTE(review): the read is guarded by ip_local_ports.lock even though
 * the data is ping_group_range.range — the port-range seqlock appears to
 * be shared for both; confirm against the writer (set_ping_group_range).
 */
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
{
	kgid_t *data = table->data;
	struct net *net =
		container_of(table->data, struct net, ipv4.ping_group_range.range);
	unsigned int seq;
	do {
		seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);

		*low = data[0];
		*high = data[1];
	} while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
}
|
||||
|
||||
/* Writer side for the ping group range; pairs with the seqlock read above. */
static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
{
	kgid_t *data = table->data;
	struct net *net =
		container_of(table->data, struct net, ipv4.ping_group_range.range);
	write_seqlock(&net->ipv4.ip_local_ports.lock);
	data[0] = low;
	data[1] = high;
	write_sequnlock(&net->ipv4.ip_local_ports.lock);
}
|
||||
|
||||
/*
 * Validate changes from /proc interface (ping_group_range).
 * Invalid input collapses the range to (1, 0), i.e. nobody may ping,
 * rather than returning an error.
 */
static int ipv4_ping_group_range(struct ctl_table *table, int write,
				 void __user *buffer,
				 size_t *lenp, loff_t *ppos)
{
	struct user_namespace *user_ns = current_user_ns();
	int ret;
	gid_t urange[2];
	kgid_t low, high;
	struct ctl_table tmp = {
		.data = &urange,
		.maxlen = sizeof(urange),
		.mode = table->mode,
		.extra1 = &ip_ping_group_range_min,
		.extra2 = &ip_ping_group_range_max,
	};

	inet_get_ping_group_range_table(table, &low, &high);
	/* expose the range in the caller's user namespace */
	urange[0] = from_kgid_munged(user_ns, low);
	urange[1] = from_kgid_munged(user_ns, high);
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);

	if (write && ret == 0) {
		low = make_kgid(user_ns, urange[0]);
		high = make_kgid(user_ns, urange[1]);
		if (!gid_valid(low) || !gid_valid(high) ||
		    (urange[1] < urange[0]) || gid_lt(high, low)) {
			low = make_kgid(&init_user_ns, 1);
			high = make_kgid(&init_user_ns, 0);
		}
		set_ping_group_range(table, low, high);
	}

	return ret;
}
|
||||
|
||||
/*
 * Validate changes from /proc interface: clamp the default initial
 * receive window to [3, 100] segments by restoring the old value on
 * out-of-range writes (no error is returned to the writer).
 */
static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
				      void __user *buffer,
				      size_t *lenp, loff_t *ppos)
{
	int old_value = *(int *)ctl->data;
	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	int new_value = *(int *)ctl->data;

	if (write && ret == 0 && (new_value < 3 || new_value > 100))
		*(int *)ctl->data = old_value;

	return ret;
}
|
||||
|
||||
/*
 * sysctl handler for net.ipv4.tcp_congestion_control: reads expose the
 * current default algorithm name, writes select a new one.
 */
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
				       void __user *buffer, size_t *lenp, loff_t *ppos)
{
	char val[TCP_CA_NAME_MAX];
	struct ctl_table tbl = {
		.data = val,
		.maxlen = TCP_CA_NAME_MAX,
	};
	int ret;

	tcp_get_default_congestion_control(val);

	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = tcp_set_default_congestion_control(val);
	return ret;
}
|
||||
|
||||
/*
 * Read-only sysctl listing every registered congestion control
 * algorithm; the list is formatted into a temporary kmalloc buffer.
 */
static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
						 int write,
						 void __user *buffer, size_t *lenp,
						 loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
	int ret;

	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
	if (!tbl.data)
		return -ENOMEM;
	tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	kfree(tbl.data);
	return ret;
}
|
||||
|
||||
/*
 * sysctl handler for the list of congestion control algorithms
 * non-privileged users may select; writable, unlike the "available" list.
 */
static int proc_allowed_congestion_control(struct ctl_table *ctl,
					   int write,
					   void __user *buffer, size_t *lenp,
					   loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
	int ret;

	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
	if (!tbl.data)
		return -ENOMEM;

	tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
	if (write && ret == 0)
		ret = tcp_set_allowed_congestion_control(tbl.data);
	kfree(tbl.data);
	return ret;
}
|
||||
|
||||
/*
 * sysctl handler for net.ipv4.tcp_fastopen_key.
 * Reads render the current 16-byte key as "%08x-%08x-%08x-%08x";
 * writes parse the same format and install the new cipher key.
 */
static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write,
				 void __user *buffer, size_t *lenp,
				 loff_t *ppos)
{
	struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
	struct tcp_fastopen_context *ctxt;
	int ret;
	u32  user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */

	tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
	if (!tbl.data)
		return -ENOMEM;

	/* snapshot the current key (zeroes if none installed yet) */
	rcu_read_lock();
	ctxt = rcu_dereference(tcp_fastopen_ctx);
	if (ctxt)
		memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
	else
		memset(user_key, 0, sizeof(user_key));
	rcu_read_unlock();

	snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
		 user_key[0], user_key[1], user_key[2], user_key[3]);
	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);

	if (write && ret == 0) {
		if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
			   user_key + 2, user_key + 3) != 4) {
			ret = -EINVAL;
			goto bad_key;
		}
		/* Generate a dummy secret but don't publish it. This
		 * is needed so we don't regenerate a new key on the
		 * first invocation of tcp_fastopen_cookie_gen
		 */
		tcp_fastopen_init_key_once(false);
		tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
	}

bad_key:
	/* note: also reached on the success and read paths (fall-through) */
	pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
	       user_key[0], user_key[1], user_key[2], user_key[3],
	       (char *)tbl.data, ret);
	kfree(tbl.data);
	return ret;
}
|
||||
|
||||
static struct ctl_table ipv4_table[] = {
|
||||
{
|
||||
.procname = "tcp_timestamps",
|
||||
.data = &sysctl_tcp_timestamps,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_window_scaling",
|
||||
.data = &sysctl_tcp_window_scaling,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_sack",
|
||||
.data = &sysctl_tcp_sack,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_retrans_collapse",
|
||||
.data = &sysctl_tcp_retrans_collapse,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "ip_default_ttl",
|
||||
.data = &sysctl_ip_default_ttl,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &ip_ttl_min,
|
||||
.extra2 = &ip_ttl_max,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_syn_retries",
|
||||
.data = &sysctl_tcp_syn_retries,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &tcp_syn_retries_min,
|
||||
.extra2 = &tcp_syn_retries_max
|
||||
},
|
||||
{
|
||||
.procname = "tcp_synack_retries",
|
||||
.data = &sysctl_tcp_synack_retries,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_max_orphans",
|
||||
.data = &sysctl_tcp_max_orphans,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_max_tw_buckets",
|
||||
.data = &tcp_death_row.sysctl_max_tw_buckets,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "ip_early_demux",
|
||||
.data = &sysctl_ip_early_demux,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "ip_dynaddr",
|
||||
.data = &sysctl_ip_dynaddr,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_keepalive_time",
|
||||
.data = &sysctl_tcp_keepalive_time,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_keepalive_probes",
|
||||
.data = &sysctl_tcp_keepalive_probes,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_keepalive_intvl",
|
||||
.data = &sysctl_tcp_keepalive_intvl,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_retries1",
|
||||
.data = &sysctl_tcp_retries1,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra2 = &tcp_retr1_max
|
||||
},
|
||||
{
|
||||
.procname = "tcp_retries2",
|
||||
.data = &sysctl_tcp_retries2,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_fin_timeout",
|
||||
.data = &sysctl_tcp_fin_timeout,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
#ifdef CONFIG_SYN_COOKIES
|
||||
{
|
||||
.procname = "tcp_syncookies",
|
||||
.data = &sysctl_tcp_syncookies,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.procname = "tcp_fastopen",
|
||||
.data = &sysctl_tcp_fastopen,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_fastopen_key",
|
||||
.mode = 0600,
|
||||
.maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
|
||||
.proc_handler = proc_tcp_fastopen_key,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_tw_recycle",
|
||||
.data = &tcp_death_row.sysctl_tw_recycle,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_abort_on_overflow",
|
||||
.data = &sysctl_tcp_abort_on_overflow,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_stdurg",
|
||||
.data = &sysctl_tcp_stdurg,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_rfc1337",
|
||||
.data = &sysctl_tcp_rfc1337,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_max_syn_backlog",
|
||||
.data = &sysctl_max_syn_backlog,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "igmp_max_memberships",
|
||||
.data = &sysctl_igmp_max_memberships,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "igmp_max_msf",
|
||||
.data = &sysctl_igmp_max_msf,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
#ifdef CONFIG_IP_MULTICAST
|
||||
{
|
||||
.procname = "igmp_qrv",
|
||||
.data = &sysctl_igmp_qrv,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one
|
||||
},
|
||||
#endif
|
||||
{
|
||||
.procname = "inet_peer_threshold",
|
||||
.data = &inet_peer_threshold,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "inet_peer_minttl",
|
||||
.data = &inet_peer_minttl,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
{
|
||||
.procname = "inet_peer_maxttl",
|
||||
.data = &inet_peer_maxttl,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_jiffies,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_orphan_retries",
|
||||
.data = &sysctl_tcp_orphan_retries,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_fack",
|
||||
.data = &sysctl_tcp_fack,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_reordering",
|
||||
.data = &sysctl_tcp_reordering,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_dsack",
|
||||
.data = &sysctl_tcp_dsack,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_mem",
|
||||
.maxlen = sizeof(sysctl_tcp_mem),
|
||||
.data = &sysctl_tcp_mem,
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_doulongvec_minmax,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_wmem",
|
||||
.data = &sysctl_tcp_wmem,
|
||||
.maxlen = sizeof(sysctl_tcp_wmem),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_notsent_lowat",
|
||||
.data = &sysctl_tcp_notsent_lowat,
|
||||
.maxlen = sizeof(sysctl_tcp_notsent_lowat),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_rmem",
|
||||
.data = &sysctl_tcp_rmem,
|
||||
.maxlen = sizeof(sysctl_tcp_rmem),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_app_win",
|
||||
.data = &sysctl_tcp_app_win,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_adv_win_scale",
|
||||
.data = &sysctl_tcp_adv_win_scale,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &tcp_adv_win_scale_min,
|
||||
.extra2 = &tcp_adv_win_scale_max,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_tw_reuse",
|
||||
.data = &sysctl_tcp_tw_reuse,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_frto",
|
||||
.data = &sysctl_tcp_frto,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_low_latency",
|
||||
.data = &sysctl_tcp_low_latency,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_no_metrics_save",
|
||||
.data = &sysctl_tcp_nometrics_save,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_moderate_rcvbuf",
|
||||
.data = &sysctl_tcp_moderate_rcvbuf,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_tso_win_divisor",
|
||||
.data = &sysctl_tcp_tso_win_divisor,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_congestion_control",
|
||||
.mode = 0644,
|
||||
.maxlen = TCP_CA_NAME_MAX,
|
||||
.proc_handler = proc_tcp_congestion_control,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_mtu_probing",
|
||||
.data = &sysctl_tcp_mtu_probing,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_base_mss",
|
||||
.data = &sysctl_tcp_base_mss,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_workaround_signed_windows",
|
||||
.data = &sysctl_tcp_workaround_signed_windows,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_limit_output_bytes",
|
||||
.data = &sysctl_tcp_limit_output_bytes,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_challenge_ack_limit",
|
||||
.data = &sysctl_tcp_challenge_ack_limit,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_slow_start_after_idle",
|
||||
.data = &sysctl_tcp_slow_start_after_idle,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
#ifdef CONFIG_NETLABEL
|
||||
{
|
||||
.procname = "cipso_cache_enable",
|
||||
.data = &cipso_v4_cache_enabled,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "cipso_cache_bucket_size",
|
||||
.data = &cipso_v4_cache_bucketsize,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "cipso_rbm_optfmt",
|
||||
.data = &cipso_v4_rbm_optfmt,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
{
|
||||
.procname = "cipso_rbm_strictvalid",
|
||||
.data = &cipso_v4_rbm_strictvalid,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec,
|
||||
},
|
||||
#endif /* CONFIG_NETLABEL */
|
||||
{
|
||||
.procname = "tcp_available_congestion_control",
|
||||
.maxlen = TCP_CA_BUF_MAX,
|
||||
.mode = 0444,
|
||||
.proc_handler = proc_tcp_available_congestion_control,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_allowed_congestion_control",
|
||||
.maxlen = TCP_CA_BUF_MAX,
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_allowed_congestion_control,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_thin_linear_timeouts",
|
||||
.data = &sysctl_tcp_thin_linear_timeouts,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_thin_dupack",
|
||||
.data = &sysctl_tcp_thin_dupack,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "tcp_early_retrans",
|
||||
.data = &sysctl_tcp_early_retrans,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &four,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_min_tso_segs",
|
||||
.data = &sysctl_tcp_min_tso_segs,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &gso_max_segs,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_autocorking",
|
||||
.data = &sysctl_tcp_autocorking,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
.extra2 = &one,
|
||||
},
|
||||
{
|
||||
.procname = "tcp_default_init_rwnd",
|
||||
.data = &sysctl_tcp_default_init_rwnd,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_tcp_default_init_rwnd
|
||||
},
|
||||
{
|
||||
.procname = "icmp_msgs_per_sec",
|
||||
.data = &sysctl_icmp_msgs_per_sec,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
},
|
||||
{
|
||||
.procname = "icmp_msgs_burst",
|
||||
.data = &sysctl_icmp_msgs_burst,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &zero,
|
||||
},
|
||||
{
|
||||
.procname = "udp_mem",
|
||||
.data = &sysctl_udp_mem,
|
||||
.maxlen = sizeof(sysctl_udp_mem),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_doulongvec_minmax,
|
||||
},
|
||||
{
|
||||
.procname = "udp_rmem_min",
|
||||
.data = &sysctl_udp_rmem_min,
|
||||
.maxlen = sizeof(sysctl_udp_rmem_min),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one
|
||||
},
|
||||
{
|
||||
.procname = "udp_wmem_min",
|
||||
.data = &sysctl_udp_wmem_min,
|
||||
.maxlen = sizeof(sysctl_udp_wmem_min),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = &one
|
||||
},
|
||||
{ }
|
||||
};
|
||||
|
||||
/* Per-network-namespace sysctls under /proc/sys/net/ipv4/.
 *
 * Every .data pointer references a field of init_net; for namespaces other
 * than init_net, ipv4_sysctl_init_net() duplicates this table and rebases
 * each .data pointer into that namespace's struct net.  The table is
 * terminated by the empty sentinel entry.
 */
static struct ctl_table ipv4_net_table[] = {
	{
		.procname	= "icmp_echo_ignore_all",
		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_all,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "icmp_echo_ignore_broadcasts",
		.data		= &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "icmp_ignore_bogus_error_responses",
		.data		= &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "icmp_errors_use_inbound_ifaddr",
		.data		= &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		/* value is stored in jiffies but exposed in milliseconds */
		.procname	= "icmp_ratelimit",
		.data		= &init_net.ipv4.sysctl_icmp_ratelimit,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "icmp_ratemask",
		.data		= &init_net.ipv4.sysctl_icmp_ratemask,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		/* pair of gids; custom handler validates/applies the range */
		.procname	= "ping_group_range",
		.data		= &init_net.ipv4.ping_group_range.range,
		.maxlen		= sizeof(gid_t)*2,
		.mode		= 0644,
		.proc_handler	= ipv4_ping_group_range,
	},
	{
		.procname	= "tcp_ecn",
		.data		= &init_net.ipv4.sysctl_tcp_ecn,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "ip_local_port_range",
		.maxlen		= sizeof(init_net.ipv4.ip_local_ports.range),
		.data		= &init_net.ipv4.ip_local_ports.range,
		.mode		= 0644,
		.proc_handler	= ipv4_local_port_range,
	},
	{
		/* bitmap of all 65536 ports; see proc_do_large_bitmap */
		.procname	= "ip_local_reserved_ports",
		.data		= &init_net.ipv4.sysctl_local_reserved_ports,
		.maxlen		= 65536,
		.mode		= 0644,
		.proc_handler	= proc_do_large_bitmap,
	},
	{
		.procname	= "ip_no_pmtu_disc",
		.data		= &init_net.ipv4.sysctl_ip_no_pmtu_disc,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "ip_forward_use_pmtu",
		.data		= &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "ip_nonlocal_bind",
		.data		= &init_net.ipv4.sysctl_ip_nonlocal_bind,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "fwmark_reflect",
		.data		= &init_net.ipv4.sysctl_fwmark_reflect,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "tcp_fwmark_accept",
		.data		= &init_net.ipv4.sysctl_tcp_fwmark_accept,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* sentinel */
};
|
||||
|
||||
/*
 * Per-namespace setup: register the ipv4 sysctl table for @net and
 * allocate the reserved-ports bitmap.
 *
 * For any namespace other than init_net the table is duplicated and each
 * entry's .data pointer is shifted by the byte offset between @net and
 * init_net, so identical field layouts make the entries point into the
 * new namespace's struct net.
 *
 * Returns 0 on success, -ENOMEM on any allocation/registration failure
 * (all partially acquired resources are released via the goto chain).
 */
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
	struct ctl_table *table;

	table = ipv4_net_table;
	if (!net_eq(net, &init_net)) {
		int i;

		table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
		if (table == NULL)
			goto err_alloc;

		/* Update the variables to point into the current struct net.
		 * The "- 1" skips the empty sentinel entry at the end. */
		for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
			table[i].data += (void *)net - (void *)&init_net;
	}

	net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
	if (net->ipv4.ipv4_hdr == NULL)
		goto err_reg;

	/* one bit per port: 65536 / 8 bytes, zeroed (no ports reserved) */
	net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
	if (!net->ipv4.sysctl_local_reserved_ports)
		goto err_ports;

	return 0;

err_ports:
	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}
|
||||
|
||||
/*
 * Per-namespace teardown: free the reserved-ports bitmap and unregister
 * the sysctl table.  ctl_table_arg returns the table pointer passed to
 * register_net_sysctl(); for init_net that is the static ipv4_net_table —
 * NOTE(review): kfree() on it appears safe only because this path is not
 * taken for init_net during normal operation — confirm against
 * unregister_pernet_subsys() semantics.
 */
static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
	struct ctl_table *table;

	kfree(net->ipv4.sysctl_local_reserved_ports);
	table = net->ipv4.ipv4_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
	kfree(table);
}
|
||||
|
||||
/* Hooks invoked on every network-namespace create/destroy. */
static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
	.init = ipv4_sysctl_init_net,
	.exit = ipv4_sysctl_exit_net,
};
|
||||
|
||||
/*
 * Boot-time init: register the global (non-per-netns) ipv4_table, then the
 * per-namespace operations.  If the pernet registration fails the global
 * table is rolled back so no half-registered state remains.
 */
static __init int sysctl_ipv4_init(void)
{
	struct ctl_table_header *hdr;

	hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
	if (hdr == NULL)
		return -ENOMEM;

	if (register_pernet_subsys(&ipv4_sysctl_ops)) {
		unregister_net_sysctl_table(hdr);
		return -ENOMEM;
	}

	return 0;
}
|
||||
|
||||
__initcall(sysctl_ipv4_init);
|
||||
88
net/ipv4/sysfs_net_ipv4.c
Normal file
88
net/ipv4/sysfs_net_ipv4.c
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
/*
|
||||
* net/ipv4/sysfs_net_ipv4.c
|
||||
*
|
||||
* sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
|
||||
*
|
||||
* Copyright (C) 2008 Google, Inc.
|
||||
*
|
||||
* Robert Love <rlove@google.com>
|
||||
*
|
||||
* This software is licensed under the terms of the GNU General Public
|
||||
* License version 2, as published by the Free Software Foundation, and
|
||||
* may be copied, distributed, and modified under those terms.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*/
|
||||
|
||||
#include <linux/kobject.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/init.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/*
 * CREATE_IPV4_FILE(_name, _var) - generate a sysfs attribute for _var.
 *
 * Expands to a show handler (prints _var as a decimal int), a store
 * handler (parses a non-negative decimal int, rejecting anything else
 * with -EINVAL), and the kobj_attribute tying both to a 0644 file
 * named _name.  _var must be an int-compatible lvalue.
 */
#define CREATE_IPV4_FILE(_name, _var) \
static ssize_t _name##_show(struct kobject *kobj, \
			    struct kobj_attribute *attr, char *buf) \
{ \
	return sprintf(buf, "%d\n", _var); \
} \
static ssize_t _name##_store(struct kobject *kobj, \
			     struct kobj_attribute *attr, \
			     const char *buf, size_t count) \
{ \
	int val, ret; \
	ret = sscanf(buf, "%d", &val); \
	if (ret != 1) \
		return -EINVAL; \
	if (val < 0) \
		return -EINVAL; \
	_var = val; \
	return count; \
} \
static struct kobj_attribute _name##_attr = \
	__ATTR(_name, 0644, _name##_show, _name##_store)
|
||||
|
||||
CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);
|
||||
CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
|
||||
CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
|
||||
|
||||
CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);
|
||||
CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
|
||||
CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
|
||||
|
||||
/* NULL-terminated list of the attributes exposed under /sys/kernel/ipv4/. */
static struct attribute *ipv4_attrs[] = {
	&tcp_wmem_min_attr.attr,
	&tcp_wmem_def_attr.attr,
	&tcp_wmem_max_attr.attr,
	&tcp_rmem_min_attr.attr,
	&tcp_rmem_def_attr.attr,
	&tcp_rmem_max_attr.attr,
	NULL
};

/* Group registered as one unit in sysfs_ipv4_init(). */
static struct attribute_group ipv4_attr_group = {
	.attrs = ipv4_attrs,
};
|
||||
|
||||
/*
 * Create the /sys/kernel/ipv4 kobject and attach the tcp_{r,w}mem
 * attribute group to it.  On group-creation failure the kobject
 * reference is dropped so nothing leaks.
 *
 * Returns 0 on success, -ENOMEM or the sysfs error code on failure.
 */
static __init int sysfs_ipv4_init(void)
{
	struct kobject *ipv4_kobject;
	int ret;

	ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);
	if (!ipv4_kobject)
		return -ENOMEM;

	ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
	if (ret) {
		kobject_put(ipv4_kobject);
		return ret;
	}

	return 0;
}
|
||||
|
||||
subsys_initcall(sysfs_ipv4_init);
|
||||
3307
net/ipv4/tcp.c
Normal file
3307
net/ipv4/tcp.c
Normal file
File diff suppressed because it is too large
Load diff
239
net/ipv4/tcp_bic.c
Normal file
239
net/ipv4/tcp_bic.c
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
/*
|
||||
* Binary Increase Congestion control for TCP
|
||||
* Home page:
|
||||
* http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
|
||||
* This is from the implementation of BICTCP in
|
||||
* Lison-Xu, Kahaled Harfoush, and Injong Rhee.
|
||||
* "Binary Increase Congestion Control for Fast, Long Distance
|
||||
* Networks" in InfoComm 2004
|
||||
* Available from:
|
||||
* http://netsrv.csc.ncsu.edu/export/bitcp.pdf
|
||||
*
|
||||
* Unless BIC is enabled and congestion window is large
|
||||
* this behaves the same as the original Reno.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
|
||||
* max_cwnd = snd_cwnd * beta
|
||||
*/
|
||||
#define BICTCP_B 4 /*
|
||||
* In binary search,
|
||||
* go to point (max+min)/N
|
||||
*/
|
||||
|
||||
static int fast_convergence = 1;
|
||||
static int max_increment = 16;
|
||||
static int low_window = 14;
|
||||
static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */
|
||||
static int initial_ssthresh;
|
||||
static int smooth_part = 20;
|
||||
|
||||
module_param(fast_convergence, int, 0644);
|
||||
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
|
||||
module_param(max_increment, int, 0644);
|
||||
MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search");
|
||||
module_param(low_window, int, 0644);
|
||||
MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)");
|
||||
module_param(beta, int, 0644);
|
||||
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
|
||||
module_param(initial_ssthresh, int, 0644);
|
||||
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
|
||||
module_param(smooth_part, int, 0644);
|
||||
MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax");
|
||||
|
||||
/* BIC TCP Parameters */
|
||||
struct bictcp {
|
||||
u32 cnt; /* increase cwnd by 1 after ACKs */
|
||||
u32 last_max_cwnd; /* last maximum snd_cwnd */
|
||||
u32 loss_cwnd; /* congestion window at last loss */
|
||||
u32 last_cwnd; /* the last snd_cwnd */
|
||||
u32 last_time; /* time when updated last_cwnd */
|
||||
u32 epoch_start; /* beginning of an epoch */
|
||||
#define ACK_RATIO_SHIFT 4
|
||||
u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */
|
||||
};
|
||||
|
||||
/*
 * Reset the BIC search state to start a fresh probing epoch.
 * Note: loss_cwnd is deliberately NOT cleared here; it is managed
 * separately by bictcp_init()/bictcp_recalc_ssthresh().
 */
static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->epoch_start = 0;
	/* start from an assumed 2-packets-per-ACK ratio (<< ACK_RATIO_SHIFT) */
	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;
}
|
||||
|
||||
/*
 * Congestion-ops init hook: called when a socket starts using BIC.
 * Optionally seeds snd_ssthresh from the initial_ssthresh module param
 * (0 = leave the stack's default untouched).
 */
static void bictcp_init(struct sock *sk)
{
	struct bictcp *ca = inet_csk_ca(sk);

	bictcp_reset(ca);
	ca->loss_cwnd = 0;

	if (initial_ssthresh)
		tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
|
||||
|
||||
/*
|
||||
* Compute congestion window to use.
|
||||
*/
|
||||
/*
 * Compute congestion window to use.
 *
 * Sets ca->cnt, the number of ACKs needed before snd_cwnd grows by one
 * segment (so a larger cnt means slower growth).  Implements BIC's binary
 * search toward last_max_cwnd below the old maximum, and slow-start/linear
 * probing above it, then scales the result by the measured delayed-ACK
 * ratio.  Recomputed at most ~32 times per second per cwnd value.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
	/* throttle: skip if cwnd unchanged and updated within the last HZ/32 */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) /* record the beginning of an epoch */
		ca->epoch_start = tcp_time_stamp;

	/* start off normal: plain Reno behavior below low_window */
	if (cwnd <= low_window) {
		ca->cnt = cwnd;
		return;
	}

	/* binary increase */
	if (cwnd < ca->last_max_cwnd) {
		__u32 dist = (ca->last_max_cwnd - cwnd)
			/ BICTCP_B;

		if (dist > max_increment)
			/* linear increase */
			ca->cnt = cwnd / max_increment;
		else if (dist <= 1U)
			/* binary search increase */
			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
		else
			/* binary search increase */
			ca->cnt = cwnd / dist;
	} else {
		/* slow start AMD linear increase */
		if (cwnd < ca->last_max_cwnd + BICTCP_B)
			/* slow start */
			ca->cnt = (cwnd * smooth_part) / BICTCP_B;
		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
			/* slow start */
			ca->cnt = (cwnd * (BICTCP_B-1))
				/ (cwnd - ca->last_max_cwnd);
		else
			/* linear increase */
			ca->cnt = cwnd / max_increment;
	}

	/* if in slow start or link utilization is very low */
	if (ca->last_max_cwnd == 0) {
		if (ca->cnt > 20) /* increase cwnd 5% per RTT */
			ca->cnt = 20;
	}

	/* scale by the delayed-ACK ratio maintained in bictcp_acked() */
	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
	if (ca->cnt == 0) /* cannot be zero */
		ca->cnt = 1;
}
|
||||
|
||||
/*
 * Main congestion-avoidance hook, called for each incoming ACK.
 * Below ssthresh use standard slow start; above it, let bictcp_update()
 * pick the growth rate and apply additive increase via tcp_cong_avoid_ai().
 * Does nothing when the sender is not cwnd-limited.
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp, acked);
	else {
		bictcp_update(ca, tp->snd_cwnd);
		tcp_cong_avoid_ai(tp, ca->cnt);
	}
}
|
||||
|
||||
/*
|
||||
* behave like Reno until low_window is reached,
|
||||
* then increase congestion window slowly
|
||||
*/
|
||||
/*
 * behave like Reno until low_window is reached,
 * then increase congestion window slowly
 *
 * Loss-response hook: returns the new slow-start threshold and records
 * Wmax (last_max_cwnd) and loss_cwnd.  With fast_convergence, a loss below
 * the previous Wmax shrinks the remembered maximum so two competing BIC
 * flows converge to a fair share faster.
 */
static u32 bictcp_recalc_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	ca->epoch_start = 0;	/* end of epoch */

	/* Wmax and fast convergence */
	if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
		/* remember (1 + beta/1024)/2 of cwnd instead of cwnd itself */
		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
			/ (2 * BICTCP_BETA_SCALE);
	else
		ca->last_max_cwnd = tp->snd_cwnd;

	ca->loss_cwnd = tp->snd_cwnd;

	/* Reno-style halving below low_window, beta-scaled decrease above;
	 * floor of 2 segments in both cases */
	if (tp->snd_cwnd <= low_window)
		return max(tp->snd_cwnd >> 1U, 2U);
	else
		return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
}
|
||||
|
||||
static u32 bictcp_undo_cwnd(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
const struct bictcp *ca = inet_csk_ca(sk);
|
||||
|
||||
return max(tp->snd_cwnd, ca->loss_cwnd);
|
||||
}
|
||||
|
||||
/*
 * Congestion-state change hook: a transition into TCP_CA_Loss wipes the
 * BIC search state so probing restarts from scratch after an RTO.
 */
static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state != TCP_CA_Loss)
		return;

	bictcp_reset(inet_csk_ca(sk));
}
|
||||
|
||||
/* Track delayed acknowledgment ratio using sliding window
|
||||
* ratio = (15*ratio + sample) / 16
|
||||
*/
|
||||
/* Track delayed acknowledgment ratio using sliding window
 * ratio = (15*ratio + sample) / 16
 *
 * Only sampled in the TCP_CA_Open state, i.e. when no loss recovery is in
 * progress.  delayed_ack stores the ratio left-shifted by ACK_RATIO_SHIFT;
 * the two statements below implement the EWMA above in that fixed-point
 * form.  Consumed by bictcp_update() to scale the growth rate.
 */
static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_state == TCP_CA_Open) {
		struct bictcp *ca = inet_csk_ca(sk);

		cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
		ca->delayed_ack += cnt;
	}
}
|
||||
|
||||
/* Hook table registered with the TCP congestion-control framework. */
static struct tcp_congestion_ops bictcp __read_mostly = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.pkts_acked	= bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "bic",
};
|
||||
|
||||
/*
 * Module init: register BIC with the congestion-control framework.
 * The BUILD_BUG_ON guarantees the per-socket state fits in the space
 * reserved inside inet_connection_sock (ICSK_CA_PRIV_SIZE).
 */
static int __init bictcp_register(void)
{
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&bictcp);
}
|
||||
|
||||
/* Module exit: remove BIC from the list of available algorithms. */
static void __exit bictcp_unregister(void)
{
	tcp_unregister_congestion_control(&bictcp);
}
|
||||
|
||||
module_init(bictcp_register);
|
||||
module_exit(bictcp_unregister);
|
||||
|
||||
MODULE_AUTHOR("Stephen Hemminger");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("BIC TCP");
|
||||
355
net/ipv4/tcp_cong.c
Normal file
355
net/ipv4/tcp_cong.c
Normal file
|
|
@ -0,0 +1,355 @@
|
|||
/*
|
||||
* Plugable TCP congestion control support and newReno
|
||||
* congestion control.
|
||||
* Based on ideas from I/O scheduler support and Web100.
|
||||
*
|
||||
* Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
|
||||
*/
|
||||
|
||||
#define pr_fmt(fmt) "TCP: " fmt
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
static DEFINE_SPINLOCK(tcp_cong_list_lock);
|
||||
static LIST_HEAD(tcp_cong_list);
|
||||
|
||||
/* Simple linear search, don't expect many entries! */
|
||||
/* Simple linear search, don't expect many entries!
 *
 * Returns the registered algorithm named @name, or NULL.  Uses the RCU
 * list primitive, so callers must hold either rcu_read_lock() or
 * tcp_cong_list_lock while the returned pointer is in use.
 */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *e;

	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
		if (strcmp(e->name, name) == 0)
			return e;
	}

	return NULL;
}
|
||||
|
||||
/*
|
||||
* Attach new congestion control algorithm to the list
|
||||
* of available options.
|
||||
*/
|
||||
/*
 * Attach new congestion control algorithm to the list
 * of available options.
 *
 * Returns 0 on success, -EINVAL if @ca lacks the mandatory ssthresh or
 * cong_avoid hooks, -EEXIST if an algorithm of the same name is already
 * registered.  The list is protected by tcp_cong_list_lock.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret = 0;

	/* all algorithms must implement ssthresh and cong_avoid ops */
	if (!ca->ssthresh || !ca->cong_avoid) {
		pr_err("%s does not implement required ops\n", ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);
	if (tcp_ca_find(ca->name)) {
		pr_notice("%s already registered\n", ca->name);
		ret = -EEXIST;
	} else {
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		pr_info("%s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
|
||||
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
|
||||
|
||||
/*
|
||||
* Remove congestion control algorithm, called from
|
||||
* the module's remove function. Module ref counts are used
|
||||
* to ensure that this can't be done till all sockets using
|
||||
* that method are closed.
|
||||
*/
|
||||
/*
 * Remove congestion control algorithm, called from
 * the module's remove function. Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
|
||||
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
|
||||
|
||||
/* Assign choice of congestion control. */
|
||||
/* Assign choice of congestion control.
 *
 * Picks the first registered algorithm whose module refcount can be taken
 * (the list head is the current default; built-in reno sits at the tail
 * as the guaranteed fallback).
 *
 * NOTE(review): if the loop ever completed without a successful
 * try_module_get(), "ca" at the out: label would not point at a valid
 * entry — this relies on reno (owner = NULL built-in) always succeeding;
 * confirm that invariant holds.
 */
void tcp_assign_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (likely(try_module_get(ca->owner))) {
			icsk->icsk_ca_ops = ca;
			goto out;
		}
		/* Fallback to next available. The last really
		 * guaranteed fallback is Reno from this list.
		 */
	}
out:
	rcu_read_unlock();

	/* Clear out private data before diag gets it and
	 * the ca has not been initialized.
	 */
	if (ca->get_info)
		memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
}
|
||||
|
||||
/*
 * Run the assigned algorithm's per-socket init hook, if it has one.
 * Called once the socket's icsk_ca_ops has been chosen.
 */
void tcp_init_congestion_control(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
}
|
||||
|
||||
/* Manage refcounts on socket close. */
|
||||
/* Manage refcounts on socket close.
 *
 * Runs the algorithm's optional release hook, then drops the module
 * reference taken in tcp_assign_congestion_control().
 */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	module_put(icsk->icsk_ca_ops->owner);
}
|
||||
|
||||
/* Used by sysctl to change default congestion control */
|
||||
/* Used by sysctl to change default congestion control
 *
 * Moves the named algorithm to the head of tcp_cong_list (the head is the
 * default) and marks it non-restricted.  If the algorithm is not loaded
 * and the caller has CAP_NET_ADMIN, the lock is dropped to attempt
 * request_module("tcp_<name>") and the lookup is retried.
 *
 * Returns 0 on success, -ENOENT if no such algorithm exists.
 */
int tcp_set_default_congestion_control(const char *name)
{
	struct tcp_congestion_ops *ca;
	int ret = -ENOENT;

	spin_lock(&tcp_cong_list_lock);
	ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
	if (!ca && capable(CAP_NET_ADMIN)) {
		/* request_module() may sleep; cannot hold the spinlock */
		spin_unlock(&tcp_cong_list_lock);

		request_module("tcp_%s", name);
		spin_lock(&tcp_cong_list_lock);
		ca = tcp_ca_find(name);
	}
#endif

	if (ca) {
		ca->flags |= TCP_CONG_NON_RESTRICTED;	/* default is always allowed */
		list_move(&ca->list, &tcp_cong_list);
		ret = 0;
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
|
||||
|
||||
/* Set default value from kernel configuration at bootup */
|
||||
/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
	return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
|
||||
late_initcall(tcp_congestion_default);
|
||||
|
||||
/* Build string with list of available congestion control values */
|
||||
/* Build string with list of available congestion control values
 *
 * Writes a space-separated list of every registered algorithm name into
 * @buf, truncating at @maxlen (snprintf bounds each append).
 */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);
	}
	rcu_read_unlock();
}
|
||||
|
||||
/* Get current default congestion control */
|
||||
void tcp_get_default_congestion_control(char *name)
|
||||
{
|
||||
struct tcp_congestion_ops *ca;
|
||||
/* We will always have reno... */
|
||||
BUG_ON(list_empty(&tcp_cong_list));
|
||||
|
||||
rcu_read_lock();
|
||||
ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
|
||||
strncpy(name, ca->name, TCP_CA_NAME_MAX);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* Built list of non-restricted congestion control values */
|
||||
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
|
||||
{
|
||||
struct tcp_congestion_ops *ca;
|
||||
size_t offs = 0;
|
||||
|
||||
*buf = '\0';
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
|
||||
if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
|
||||
continue;
|
||||
offs += snprintf(buf + offs, maxlen - offs,
|
||||
"%s%s",
|
||||
offs == 0 ? "" : " ", ca->name);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* Change list of non-restricted congestion control.
 * @val: space-separated algorithm names (consumed by strsep in pass 3).
 * Returns 0 on success, -ENOENT on an unknown name, -ENOMEM on OOM.
 */
int tcp_set_allowed_congestion_control(char *val)
{
	struct tcp_congestion_ops *ca;
	char *saved_clone, *clone, *name;
	int ret = 0;

	/* strsep() destroys its argument, so validate on a copy and keep
	 * the original pointer for kfree().
	 */
	saved_clone = clone = kstrdup(val, GFP_USER);
	if (!clone)
		return -ENOMEM;

	spin_lock(&tcp_cong_list_lock);
	/* pass 1 check for bad entries */
	while ((name = strsep(&clone, " ")) && *name) {
		ca = tcp_ca_find(name);
		if (!ca) {
			ret = -ENOENT;
			goto out;
		}
	}

	/* pass 2 clear old values */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
		ca->flags &= ~TCP_CONG_NON_RESTRICTED;

	/* pass 3 mark as allowed; re-parses the caller's buffer, which
	 * pass 1 has already validated (hence WARN_ON, not an error).
	 */
	while ((name = strsep(&val, " ")) && *name) {
		ca = tcp_ca_find(name);
		WARN_ON(!ca);
		if (ca)
			ca->flags |= TCP_CONG_NON_RESTRICTED;
	}
out:
	spin_unlock(&tcp_cong_list_lock);
	kfree(saved_clone);

	return ret;
}
|
||||
|
||||
/* Change congestion control for socket.
 * Returns 0 on success; -ENOENT for an unknown algorithm, -EPERM if it
 * is restricted and the caller lacks CAP_NET_ADMIN in the socket's
 * user namespace, -EBUSY if the owning module is going away.
 */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;
	int err = 0;

	rcu_read_lock();
	ca = tcp_ca_find(name);

	/* no change asking for existing value */
	if (ca == icsk->icsk_ca_ops)
		goto out;

#ifdef CONFIG_MODULES
	/* not found attempt to autoload module; request_module() may
	 * sleep, so the RCU read-side section is left and re-entered
	 * around it.
	 */
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	if (!ca)
		err = -ENOENT;

	/* restricted algorithms need CAP_NET_ADMIN in the socket's netns */
	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
		err = -EPERM;

	else if (!try_module_get(ca->owner))
		err = -EBUSY;

	else {
		/* Release the old algorithm (and drop its module ref),
		 * then attach the new one; run its init hook only on a
		 * socket that is not closed.
		 */
		tcp_cleanup_congestion_control(sk);
		icsk->icsk_ca_ops = ca;

		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
			icsk->icsk_ca_ops->init(sk);
	}
out:
	rcu_read_unlock();
	return err;
}
|
||||
|
||||
/* Slow start is used when congestion window is no greater than the slow start
|
||||
* threshold. We base on RFC2581 and also handle stretch ACKs properly.
|
||||
* We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but
|
||||
* something better;) a packet is only considered (s)acked in its entirety to
|
||||
* defend the ACK attacks described in the RFC. Slow start processes a stretch
|
||||
* ACK of degree N as if N acks of degree 1 are received back to back except
|
||||
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
|
||||
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
|
||||
*/
|
||||
void tcp_slow_start(struct tcp_sock *tp, u32 acked)
|
||||
{
|
||||
u32 cwnd = tp->snd_cwnd + acked;
|
||||
|
||||
if (cwnd > tp->snd_ssthresh)
|
||||
cwnd = tp->snd_ssthresh + 1;
|
||||
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tcp_slow_start);
|
||||
|
||||
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
|
||||
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
|
||||
{
|
||||
if (tp->snd_cwnd_cnt >= w) {
|
||||
if (tp->snd_cwnd < tp->snd_cwnd_clamp)
|
||||
tp->snd_cwnd++;
|
||||
tp->snd_cwnd_cnt = 0;
|
||||
} else {
|
||||
tp->snd_cwnd_cnt++;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
|
||||
|
||||
/*
|
||||
* TCP Reno congestion control
|
||||
* This is special case used for fallback as well.
|
||||
*/
|
||||
/* This is Jacobson's slow start and congestion avoidance.
|
||||
* SIGCOMM '88, p. 328.
|
||||
*/
|
||||
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
if (!tcp_is_cwnd_limited(sk))
|
||||
return;
|
||||
|
||||
/* In "safe" area, increase. */
|
||||
if (tp->snd_cwnd <= tp->snd_ssthresh)
|
||||
tcp_slow_start(tp, acked);
|
||||
/* In dangerous area, increase slowly. */
|
||||
else
|
||||
tcp_cong_avoid_ai(tp, tp->snd_cwnd);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
|
||||
|
||||
/* Slow start threshold is half the congestion window (min 2) */
|
||||
u32 tcp_reno_ssthresh(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
return max(tp->snd_cwnd >> 1U, 2U);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
|
||||
|
||||
/* Registration record for classic Reno, the always-present fallback.
 * Non-restricted so any user may select it.
 */
struct tcp_congestion_ops tcp_reno = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
};
|
||||
494
net/ipv4/tcp_cubic.c
Normal file
494
net/ipv4/tcp_cubic.c
Normal file
|
|
@ -0,0 +1,494 @@
|
|||
/*
|
||||
* TCP CUBIC: Binary Increase Congestion control for TCP v2.3
|
||||
* Home page:
|
||||
* http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
|
||||
* This is from the implementation of CUBIC TCP in
|
||||
* Sangtae Ha, Injong Rhee and Lisong Xu,
|
||||
* "CUBIC: A New TCP-Friendly High-Speed TCP Variant"
|
||||
* in ACM SIGOPS Operating System Review, July 2008.
|
||||
* Available from:
|
||||
* http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
|
||||
*
|
||||
* CUBIC integrates a new slow start algorithm, called HyStart.
|
||||
* The details of HyStart are presented in
|
||||
* Sangtae Ha and Injong Rhee,
|
||||
* "Taming the Elephants: New TCP Slow Start", NCSU TechReport 2008.
|
||||
* Available from:
|
||||
* http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
|
||||
*
|
||||
* All testing results are available from:
|
||||
* http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
|
||||
*
|
||||
* Unless CUBIC is enabled and congestion window is large
|
||||
* this behaves the same as the original Reno.
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/math64.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
					 * max_cwnd = snd_cwnd * beta
					 */
#define	BICTCP_HZ		10	/* BIC HZ 2^10 = 1024 */

/* Two methods of hybrid slow start */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8
#define HYSTART_DELAY_MIN	(4U<<3)		/* 4 ms, in msec << 3 units */
#define HYSTART_DELAY_MAX	(16U<<3)	/* 16 ms, in msec << 3 units */
#define HYSTART_DELAY_THRESH(x)	clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)

/* Tunables, exposed as module parameters (see MODULE_PARM_DESC below). */
static int fast_convergence __read_mostly = 1;
static int beta __read_mostly = 717;	/* = 717/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
static int hystart_ack_delta __read_mostly = 2;

/* Derived constants, precomputed once in cubictcp_register(). */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;

/* Note parameters that are used for precomputing scale factors are read-only */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence");
module_param(beta, int, 0644);
MODULE_PARM_DESC(beta, "beta for multiplicative increase");
module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold");
module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)");
module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness");
module_param(hystart, int, 0644);
MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm");
module_param(hystart_detect, int, 0644);
MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms"
		 " 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
module_param(hystart_ack_delta, int, 0644);
MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
|
||||
|
||||
/* BIC TCP Parameters (per-socket CUBIC state, kept in icsk_ca_priv) */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after this many ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd (Wmax) */
	u32	loss_cwnd;	/* congestion window at last loss */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point
				   from the beginning of the current epoch */
	u32	delay_min;	/* min delay (msec << 3) */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp (Reno-equivalent) cwnd */
#define ACK_RATIO_SHIFT	4
#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT)
	u16	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
	u8	sample_cnt;	/* number of samples to decide curr_rtt */
	u8	found;		/* the exit point is found? */
	u32	round_start;	/* beginning of each round */
	u32	end_seq;	/* end_seq of the round */
	u32	last_ack;	/* last time when the ACK spacing is close */
	u32	curr_rtt;	/* the minimum rtt of current round */
};
|
||||
|
||||
/* Reset the cubic epoch state.  Note: loss_cwnd and the hystart round
 * fields (round_start, end_seq, last_ack, curr_rtt, sample_cnt) are
 * deliberately left untouched here.
 */
static inline void bictcp_reset(struct bictcp *ca)
{
	ca->cnt = 0;
	ca->last_max_cwnd = 0;
	ca->last_cwnd = 0;
	ca->last_time = 0;
	ca->bic_origin_point = 0;
	ca->bic_K = 0;
	ca->delay_min = 0;
	ca->epoch_start = 0;
	ca->delayed_ack = 2 << ACK_RATIO_SHIFT;	/* assume 2 pkts per ACK */
	ca->ack_cnt = 0;
	ca->tcp_cwnd = 0;
	ca->found = 0;
}
|
||||
|
||||
/* Millisecond timestamp used for hystart bookkeeping.  With HZ below
 * 1000 jiffies are too coarse, so use the real-time clock instead.
 */
static inline u32 bictcp_clock(void)
{
#if HZ < 1000
	return ktime_to_ms(ktime_get_real());
#else
	return jiffies_to_msecs(jiffies);
#endif
}
|
||||
|
||||
static inline void bictcp_hystart_reset(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct bictcp *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->round_start = ca->last_ack = bictcp_clock();
|
||||
ca->end_seq = tp->snd_nxt;
|
||||
ca->curr_rtt = 0;
|
||||
ca->sample_cnt = 0;
|
||||
}
|
||||
|
||||
static void bictcp_init(struct sock *sk)
|
||||
{
|
||||
struct bictcp *ca = inet_csk_ca(sk);
|
||||
|
||||
bictcp_reset(ca);
|
||||
ca->loss_cwnd = 0;
|
||||
|
||||
if (hystart)
|
||||
bictcp_hystart_reset(sk);
|
||||
|
||||
if (!hystart && initial_ssthresh)
|
||||
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
|
||||
}
|
||||
|
||||
/* calculate the cubic root of x using a table lookup followed by one
 * Newton-Raphson iteration.
 * Avg err ~= 0.195%
 */
static u32 cubic_root(u64 a)
{
	u32 x, b, shift;
	/*
	 * cbrt(x) MSB values for x MSB values in [0..63].
	 * Precomputed then refined by hand - Willy Tarreau
	 *
	 * For x in [0..63],
	 *   v = cbrt(x << 18) - 1
	 *   cbrt(x) = (v[x] + 10) >> 6
	 */
	static const u8 v[] = {
		/* 0x00 */    0,   54,   54,   54,  118,  118,  118,  118,
		/* 0x08 */  123,  129,  134,  138,  143,  147,  151,  156,
		/* 0x10 */  157,  161,  164,  168,  170,  173,  176,  179,
		/* 0x18 */  181,  185,  187,  190,  192,  194,  197,  199,
		/* 0x20 */  200,  202,  204,  206,  209,  211,  213,  215,
		/* 0x28 */  217,  219,  221,  222,  224,  225,  227,  229,
		/* 0x30 */  231,  232,  234,  236,  237,  239,  240,  242,
		/* 0x38 */  244,  245,  246,  248,  250,  251,  252,  254,
	};

	b = fls64(a);
	if (b < 7) {
		/* a in [0..63]: the table alone is accurate enough */
		return ((u32)v[(u32)a] + 35) >> 6;
	}

	/* b = (b * 84) >> 8 approximates b/3 (84/256 ~= 1/3) */
	b = ((b * 84) >> 8) - 1;
	shift = (a >> (b * 3));

	/* table-based first estimate of cbrt(a) */
	x = ((u32)(((u32)v[shift] + 10) << b)) >> 6;

	/*
	 * Newton-Raphson iteration
	 *                         2
	 * x    = ( 2 * x  +  a / x  ) / 3
	 *  k+1          k         k
	 */
	x = (2 * x + (u32)div64_u64(a, (u64)x * (u64)(x - 1)));
	x = ((x * 341) >> 10);	/* 341/1024 ~= 1/3 */
	return x;
}
|
||||
|
||||
/*
 * Compute congestion window to use: evaluates the cubic growth curve
 * and stores the resulting per-ACK increment divisor in ca->cnt.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
	u32 delta, bic_target, max_cnt;
	u64 offs, t;

	ca->ack_cnt++;	/* count the number of ACKs */

	/* Rate-limit: skip recomputation if cwnd is unchanged and the
	 * last update was less than HZ/32 (~31ms) ago.
	 */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_time_stamp;	/* record beginning */
		ca->ack_cnt = 1;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* syn with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			/* already at/above Wmax: origin is here, K = 0 */
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc*/
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 *  also NOTE the unit of those veriables
	 *	  time  = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 * rtt  = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	t = (s32)(tcp_time_stamp - ca->epoch_start);
	t += msecs_to_jiffies(ca->delay_min >> 3);
	/* change the unit from HZ to bictcp_HZ */
	t <<= BICTCP_HZ;
	do_div(t, HZ);

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
	if (t < ca->bic_K)                            /* below origin*/
		bic_target = ca->bic_origin_point - delta;
	else                                          /* above origin*/
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt*/
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;              /* very small increment*/
	}

	/*
	 * The initial growth of cubic function may be too conservative
	 * when the available bandwidth is still unknown.
	 */
	if (ca->last_max_cwnd == 0 && ca->cnt > 20)
		ca->cnt = 20;	/* increase cwnd 5% per RTT */

	/* TCP Friendly: never grow slower than an estimated Reno flow
	 * over the same interval.
	 */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* scale cnt by the delayed-ACK ratio estimate */
	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
	if (ca->cnt == 0)			/* cannot be zero */
		ca->cnt = 1;
}
|
||||
|
||||
/* ca_ops->cong_avoid hook: grow cwnd via slow start (with optional
 * hystart round tracking) below ssthresh, or along the cubic curve
 * above it.
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd > tp->snd_ssthresh) {
		/* congestion avoidance along the cubic curve */
		bictcp_update(ca, tp->snd_cwnd);
		tcp_cong_avoid_ai(tp, ca->cnt);
		return;
	}

	/* slow start; start a new hystart round when the old one ends */
	if (hystart && after(ack, ca->end_seq))
		bictcp_hystart_reset(sk);
	tcp_slow_start(tp, acked);
}
|
||||
|
||||
static u32 bictcp_recalc_ssthresh(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct bictcp *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->epoch_start = 0; /* end of epoch */
|
||||
|
||||
/* Wmax and fast convergence */
|
||||
if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
|
||||
ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
|
||||
/ (2 * BICTCP_BETA_SCALE);
|
||||
else
|
||||
ca->last_max_cwnd = tp->snd_cwnd;
|
||||
|
||||
ca->loss_cwnd = tp->snd_cwnd;
|
||||
|
||||
return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
|
||||
}
|
||||
|
||||
static u32 bictcp_undo_cwnd(struct sock *sk)
|
||||
{
|
||||
struct bictcp *ca = inet_csk_ca(sk);
|
||||
|
||||
return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
|
||||
}
|
||||
|
||||
/* ca_ops->set_state hook: on entering loss recovery, start a fresh
 * cubic epoch and a fresh hystart round.
 */
static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state != TCP_CA_Loss)
		return;

	bictcp_reset(inet_csk_ca(sk));
	bictcp_hystart_reset(sk);
}
|
||||
|
||||
/* Evaluate the two HyStart exit heuristics for the current slow-start
 * round; on a hit, exit slow start by setting ssthresh = cwnd.
 * @delay: RTT sample in msec << 3.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!(ca->found & hystart_detect)) {
		u32 now = bictcp_clock();

		/* first detection parameter - ack-train detection */
		if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
			ca->last_ack = now;
			/* train length exceeds delay_min/2 (delay_min is
			 * msec << 3, so >> 4 halves it in ms)
			 */
			if ((s32)(now - ca->round_start) > ca->delay_min >> 4)
				ca->found |= HYSTART_ACK_TRAIN;
		}

		/* obtain the minimum delay of more than sampling packets */
		if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
			if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
				ca->curr_rtt = delay;

			ca->sample_cnt++;
		} else {
			/* second detection parameter - delay increase */
			if (ca->curr_rtt > ca->delay_min +
			    HYSTART_DELAY_THRESH(ca->delay_min>>4))
				ca->found |= HYSTART_DELAY;
		}
		/*
		 * Either one of two conditions are met,
		 * we exit from slow start immediately.
		 */
		if (ca->found & hystart_detect)
			tp->snd_ssthresh = tp->snd_cwnd;
	}
}
|
||||
|
||||
/* Track delayed acknowledgment ratio using sliding window
 * ratio = (15*ratio + sample) / 16
 * Also feeds RTT samples to delay_min tracking and hystart.
 * @cnt: packets acked by this ACK; @rtt_us: RTT sample, <0 if invalid.
 */
static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);
	u32 delay;

	/* only refresh the delayed-ACK estimate in the Open state */
	if (icsk->icsk_ca_state == TCP_CA_Open) {
		u32 ratio = ca->delayed_ack;

		ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT;
		ratio += cnt;

		ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT);
	}

	/* Some calls are for duplicates without timetamps */
	if (rtt_us < 0)
		return;

	/* Discard delay samples right after fast recovery */
	if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
		return;

	/* delay is kept in msec << 3; never let it be zero */
	delay = (rtt_us << 3) / USEC_PER_MSEC;
	if (delay == 0)
		delay = 1;

	/* first time call or link delay decreases */
	if (ca->delay_min == 0 || ca->delay_min > delay)
		ca->delay_min = delay;

	/* hystart triggers when cwnd is larger than some threshold */
	if (hystart && tp->snd_cwnd <= tp->snd_ssthresh &&
	    tp->snd_cwnd >= hystart_low_window)
		hystart_update(sk, delay);
}
|
||||
|
||||
/* CUBIC's tcp_congestion_ops registration record. */
static struct tcp_congestion_ops cubictcp __read_mostly = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.pkts_acked     = bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
|
||||
|
||||
/* Module init: precompute scale factors, then register the algorithm. */
static int __init cubictcp_register(void)
{
	/* per-socket state must fit in the icsk private area */
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = (bic_scale * 10);	/* 1024*c/rtt */

	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 * so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is bictcp_HZ=2^10, not HZ
	 *
	 * c = bic_scale >> 10
	 * rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 * HZ < 1,000,00  (corresponding to 10 nano-second)
	 */

	/* 1/c * 2^2*bictcp_HZ * srtt */
	cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */

	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, bic_scale * 10);

	return tcp_register_congestion_control(&cubictcp);
}
|
||||
|
||||
/* Module exit: remove CUBIC from the registered algorithm list. */
static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}
|
||||
|
||||
/* Module plumbing and metadata. */
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");
|
||||
344
net/ipv4/tcp_dctcp.c
Normal file
344
net/ipv4/tcp_dctcp.c
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
/* DataCenter TCP (DCTCP) congestion control.
|
||||
*
|
||||
* http://simula.stanford.edu/~alizade/Site/DCTCP.html
|
||||
*
|
||||
* This is an implementation of DCTCP over Reno, an enhancement to the
|
||||
* TCP congestion control algorithm designed for data centers. DCTCP
|
||||
* leverages Explicit Congestion Notification (ECN) in the network to
|
||||
* provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
|
||||
* the following three data center transport requirements:
|
||||
*
|
||||
* - High burst tolerance (incast due to partition/aggregate)
|
||||
* - Low latency (short flows, queries)
|
||||
* - High throughput (continuous data updates, large file transfers)
|
||||
* with commodity shallow buffered switches
|
||||
*
|
||||
* The algorithm is described in detail in the following two papers:
|
||||
*
|
||||
* 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
|
||||
* Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
|
||||
* "Data Center TCP (DCTCP)", Data Center Networks session
|
||||
* Proc. ACM SIGCOMM, New Delhi, 2010.
|
||||
* http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
|
||||
*
|
||||
* 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
|
||||
* "Analysis of DCTCP: Stability, Convergence, and Fairness"
|
||||
* Proc. ACM SIGMETRICS, San Jose, 2011.
|
||||
* http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
|
||||
*
|
||||
* Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
|
||||
*
|
||||
* Authors:
|
||||
*
|
||||
* Daniel Borkmann <dborkman@redhat.com>
|
||||
* Florian Westphal <fw@strlen.de>
|
||||
* Glenn Judd <glenn.judd@morganstanley.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or (at
|
||||
* your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/mm.h>
|
||||
#include <net/tcp.h>
|
||||
#include <linux/inet_diag.h>
|
||||
|
||||
#define DCTCP_MAX_ALPHA	1024U	/* alpha is a fraction scaled by 2^10 */

/* Per-socket DCTCP state (lives in icsk_ca_priv). */
struct dctcp {
	u32 acked_bytes_ecn;	/* bytes acked by ACKs carrying ECE */
	u32 acked_bytes_total;	/* all bytes acked in this window */
	u32 prior_snd_una;	/* snd_una as of the previous ACK */
	u32 prior_rcv_nxt;	/* rcv_nxt recorded at last CE transition */
	u32 dctcp_alpha;	/* EWMA of the ECE-marked fraction, << 10 */
	u32 next_seq;		/* sequence ending the measurement window */
	u32 ce_state;		/* 0/1: CE value of the last packet seen */
	u32 delayed_ack_reserved; /* nonzero while a delayed ACK is pending */
};

static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
module_param(dctcp_shift_g, uint, 0644);
MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");

static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
module_param(dctcp_alpha_on_init, uint, 0644);
MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");

static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
		 "parameter for clamping alpha on loss");

/* Fallback ops used when the peer did not negotiate ECN (defined below). */
static struct tcp_congestion_ops dctcp_reno;
|
||||
|
||||
static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
|
||||
{
|
||||
ca->next_seq = tp->snd_nxt;
|
||||
|
||||
ca->acked_bytes_ecn = 0;
|
||||
ca->acked_bytes_total = 0;
|
||||
}
|
||||
|
||||
/* ca_ops->init hook: set up DCTCP state when ECN was negotiated (or the
 * socket is still listening/closed); otherwise swap in the Reno
 * fallback ops.
 */
static void dctcp_init(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	if ((tp->ecn_flags & TCP_ECN_OK) ||
	    (sk->sk_state == TCP_LISTEN ||
	     sk->sk_state == TCP_CLOSE)) {
		struct dctcp *ca = inet_csk_ca(sk);

		ca->prior_snd_una = tp->snd_una;
		ca->prior_rcv_nxt = tp->rcv_nxt;

		/* start alpha at the configured value, capped at 1 << 10 */
		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);

		ca->delayed_ack_reserved = 0;
		ca->ce_state = 0;

		dctcp_reset(tp, ca);
		return;
	}

	/* No ECN support? Fall back to Reno. Also need to clear
	 * ECT from sk since it is set during 3WHS for DCTCP.
	 */
	inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
	INET_ECN_dontxmit(sk);
}
|
||||
|
||||
static u32 dctcp_ssthresh(struct sock *sk)
|
||||
{
|
||||
const struct dctcp *ca = inet_csk_ca(sk);
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
|
||||
}
|
||||
|
||||
/* Minimal DCTP CE state machine:
 *
 * S:	0 <- last pkt was non-CE
 *	1 <- last pkt was CE
 */

static void dctcp_ce_state_0_to_1(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* State has changed from CE=0 to CE=1 and delayed
	 * ACK has not sent yet.
	 */
	if (!ca->ce_state && ca->delayed_ack_reserved) {
		u32 tmp_rcv_nxt;

		/* Save current rcv_nxt. */
		tmp_rcv_nxt = tp->rcv_nxt;

		/* Generate previous ack with CE=0: temporarily rewind
		 * rcv_nxt so the ACK covers only the pre-CE data.
		 */
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
		tp->rcv_nxt = ca->prior_rcv_nxt;

		tcp_send_ack(sk);

		/* Recover current rcv_nxt. */
		tp->rcv_nxt = tmp_rcv_nxt;
	}

	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 1;

	/* NOTE(review): DEMAND_CWR appears to make subsequent outgoing
	 * ACKs signal congestion (ECE) — confirm against tcp ECN code.
	 */
	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
|
||||
|
||||
/* Mirror of dctcp_ce_state_0_to_1() for the CE=1 -> CE=0 transition. */
static void dctcp_ce_state_1_to_0(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* State has changed from CE=1 to CE=0 and delayed
	 * ACK has not sent yet.
	 */
	if (ca->ce_state && ca->delayed_ack_reserved) {
		u32 tmp_rcv_nxt;

		/* Save current rcv_nxt. */
		tmp_rcv_nxt = tp->rcv_nxt;

		/* Generate previous ack with CE=1: temporarily rewind
		 * rcv_nxt so the ACK covers only the CE-marked data.
		 */
		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		tp->rcv_nxt = ca->prior_rcv_nxt;

		tcp_send_ack(sk);

		/* Recover current rcv_nxt. */
		tp->rcv_nxt = tmp_rcv_nxt;
	}

	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 0;

	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
|
||||
|
||||
/* ca_ops->in_ack_event hook: accumulate acked/ECE-acked byte counts
 * and, once per measurement window (~RTT), fold the marked fraction
 * into the EWMA alpha.
 */
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct dctcp *ca = inet_csk_ca(sk);
	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;

	/* If ack did not advance snd_una, count dupack as MSS size.
	 * If ack did update window, do not count it at all.
	 */
	if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
		acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
	if (acked_bytes) {
		ca->acked_bytes_total += acked_bytes;
		ca->prior_snd_una = tp->snd_una;

		if (flags & CA_ACK_ECE)
			ca->acked_bytes_ecn += acked_bytes;
	}

	/* Expired RTT */
	if (!before(tp->snd_una, ca->next_seq)) {
		/* Avoid a zero denominator in the division below. */
		if (ca->acked_bytes_total == 0)
			ca->acked_bytes_total = 1;

		/* alpha = (1 - g) * alpha + g * F */
		ca->dctcp_alpha = ca->dctcp_alpha -
				  (ca->dctcp_alpha >> dctcp_shift_g) +
				  (ca->acked_bytes_ecn << (10U - dctcp_shift_g)) /
				  ca->acked_bytes_total;

		if (ca->dctcp_alpha > DCTCP_MAX_ALPHA)
			/* Clamp dctcp_alpha to max. */
			ca->dctcp_alpha = DCTCP_MAX_ALPHA;

		dctcp_reset(tp, ca);
	}
}
|
||||
|
||||
/* ca_ops->set_state hook.
 *
 * If the clamp-on-loss extension is enabled, treat packet loss as an
 * indicator of extreme congestion and pin dctcp_alpha to its maximum,
 * which effectively halves the window on the next reduction.
 */
static void dctcp_state(struct sock *sk, u8 new_state)
{
	struct dctcp *ca;

	if (new_state != TCP_CA_Loss || !dctcp_clamp_alpha_on_loss)
		return;

	ca = inet_csk_ca(sk);
	ca->dctcp_alpha = DCTCP_MAX_ALPHA;
}
|
||||
|
||||
static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
|
||||
{
|
||||
struct dctcp *ca = inet_csk_ca(sk);
|
||||
|
||||
switch (ev) {
|
||||
case CA_EVENT_DELAYED_ACK:
|
||||
if (!ca->delayed_ack_reserved)
|
||||
ca->delayed_ack_reserved = 1;
|
||||
break;
|
||||
case CA_EVENT_NON_DELAYED_ACK:
|
||||
if (ca->delayed_ack_reserved)
|
||||
ca->delayed_ack_reserved = 0;
|
||||
break;
|
||||
default:
|
||||
/* Don't care for the rest. */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
|
||||
{
|
||||
switch (ev) {
|
||||
case CA_EVENT_ECN_IS_CE:
|
||||
dctcp_ce_state_0_to_1(sk);
|
||||
break;
|
||||
case CA_EVENT_ECN_NO_CE:
|
||||
dctcp_ce_state_1_to_0(sk);
|
||||
break;
|
||||
case CA_EVENT_DELAYED_ACK:
|
||||
case CA_EVENT_NON_DELAYED_ACK:
|
||||
dctcp_update_ack_reserved(sk, ev);
|
||||
break;
|
||||
default:
|
||||
/* Don't care for the rest. */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Export DCTCP internal state to inet_diag (e.g. the "ss" tool) as an
 * INET_DIAG_DCTCPINFO netlink attribute.
 */
static void dctcp_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
{
	const struct dctcp *ca = inet_csk_ca(sk);

	/* Fill it also in case of VEGASINFO due to req struct limits.
	 * We can still correctly retrieve it later.
	 */
	if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
		struct tcp_dctcp_info info;

		/* Zero everything (including padding) before it is copied
		 * out to userspace. */
		memset(&info, 0, sizeof(info));
		/* dctcp_reno is the non-ECN fallback ops; only report live
		 * DCTCP state when the full algorithm is in use. */
		if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
			info.dctcp_enabled = 1;
			info.dctcp_ce_state = (u16) ca->ce_state;
			info.dctcp_alpha = ca->dctcp_alpha;
			info.dctcp_ab_ecn = ca->acked_bytes_ecn;
			info.dctcp_ab_tot = ca->acked_bytes_total;
		}

		/* Return value ignored: a full skb just truncates the dump. */
		nla_put(skb, INET_DIAG_DCTCPINFO, sizeof(info), &info);
	}
}
|
||||
|
||||
/* Full DCTCP operations; requires ECN negotiation (TCP_CONG_NEEDS_ECN). */
static struct tcp_congestion_ops dctcp __read_mostly = {
	.init		= dctcp_init,
	.in_ack_event   = dctcp_update_alpha,
	.cwnd_event	= dctcp_cwnd_event,
	.ssthresh	= dctcp_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.set_state	= dctcp_state,
	.get_info	= dctcp_get_info,
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "dctcp",
};

/* Reno-behaving fallback used when the peer did not negotiate ECN;
 * shares dctcp_get_info so diag still reports dctcp_enabled == 0.
 */
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.get_info	= dctcp_get_info,
	.owner		= THIS_MODULE,
	.name		= "dctcp-reno",
};

static int __init dctcp_register(void)
{
	/* Per-socket CA state must fit in the space inet_csk reserves. */
	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&dctcp);
}

static void __exit dctcp_unregister(void)
{
	tcp_unregister_congestion_control(&dctcp);
}

module_init(dctcp_register);
module_exit(dctcp_unregister);

MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");
|
||||
87
net/ipv4/tcp_diag.c
Normal file
87
net/ipv4/tcp_diag.c
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* tcp_diag.c Module for monitoring TCP transport protocols sockets.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/sock_diag.h>
|
||||
#include <linux/inet_diag.h>
|
||||
|
||||
#include <linux/tcp.h>
|
||||
|
||||
#include <net/tcp.h>
|
||||
|
||||
static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
|
||||
void *_info)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct tcp_info *info = _info;
|
||||
|
||||
if (sk->sk_state == TCP_LISTEN) {
|
||||
r->idiag_rqueue = sk->sk_ack_backlog;
|
||||
r->idiag_wqueue = sk->sk_max_ack_backlog;
|
||||
} else {
|
||||
r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
|
||||
r->idiag_wqueue = tp->write_seq - tp->snd_una;
|
||||
}
|
||||
if (info != NULL)
|
||||
tcp_get_info(sk, info);
|
||||
}
|
||||
|
||||
/* Dump all matching TCP sockets via the generic icsk diag walker. */
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  struct inet_diag_req_v2 *r, struct nlattr *bc)
{
	inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
}

/* Look up and report the single TCP socket identified by @req. */
static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
			     struct inet_diag_req_v2 *req)
{
	return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
|
||||
|
||||
#ifdef CONFIG_INET_DIAG_DESTROY
|
||||
static int tcp_diag_destroy(struct sk_buff *in_skb,
|
||||
struct inet_diag_req_v2 *req)
|
||||
{
|
||||
struct net *net = sock_net(in_skb->sk);
|
||||
struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
|
||||
|
||||
if (IS_ERR(sk))
|
||||
return PTR_ERR(sk);
|
||||
|
||||
return sock_diag_destroy(sk, ECONNABORTED);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* inet_diag callbacks for IPPROTO_TCP. */
static const struct inet_diag_handler tcp_diag_handler = {
	.dump		 = tcp_diag_dump,
	.dump_one	 = tcp_diag_dump_one,
	.idiag_get_info	 = tcp_diag_get_info,
	.idiag_type	 = IPPROTO_TCP,
#ifdef CONFIG_INET_DIAG_DESTROY
	.destroy	 = tcp_diag_destroy,
#endif
};

static int __init tcp_diag_init(void)
{
	return inet_diag_register(&tcp_diag_handler);
}

static void __exit tcp_diag_exit(void)
{
	inet_diag_unregister(&tcp_diag_handler);
}

module_init(tcp_diag_init);
module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
/* Autoload on NETLINK_SOCK_DIAG requests for AF_INET(2)/IPPROTO_TCP(6). */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
|
||||
311
net/ipv4/tcp_fastopen.c
Normal file
311
net/ipv4/tcp_fastopen.c
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
#include <linux/err.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/tcp.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/rculist.h>
|
||||
#include <net/inetpeer.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/* TCP Fast Open mode bits (net.ipv4.tcp_fastopen); client side on by default. */
int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;

/* Active cipher context used to mint/validate TFO cookies; RCU-protected. */
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;

/* Serializes writers that swap tcp_fastopen_ctx. */
static DEFINE_SPINLOCK(tcp_fastopen_ctx_lock);
|
||||
/* Generate the global TFO key lazily on first use.  When @publish is
 * true, the freshly generated key is also installed as the active
 * cipher context.
 */
void tcp_fastopen_init_key_once(bool publish)
{
	static u8 key[TCP_FASTOPEN_KEY_LENGTH];

	/* tcp_fastopen_reset_cipher publishes the new context
	 * atomically, so we allow this race happening here.
	 *
	 * All call sites of tcp_fastopen_cookie_gen also check
	 * for a valid cookie, so this is an acceptable risk.
	 */
	if (net_get_random_once(key, sizeof(key)) && publish)
		tcp_fastopen_reset_cipher(key, sizeof(key));
}
|
||||
|
||||
static void tcp_fastopen_ctx_free(struct rcu_head *head)
|
||||
{
|
||||
struct tcp_fastopen_context *ctx =
|
||||
container_of(head, struct tcp_fastopen_context, rcu);
|
||||
crypto_free_cipher(ctx->tfm);
|
||||
kfree(ctx);
|
||||
}
|
||||
|
||||
int tcp_fastopen_reset_cipher(void *key, unsigned int len)
|
||||
{
|
||||
int err;
|
||||
struct tcp_fastopen_context *ctx, *octx;
|
||||
|
||||
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
|
||||
if (!ctx)
|
||||
return -ENOMEM;
|
||||
ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
|
||||
|
||||
if (IS_ERR(ctx->tfm)) {
|
||||
err = PTR_ERR(ctx->tfm);
|
||||
error: kfree(ctx);
|
||||
pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
|
||||
return err;
|
||||
}
|
||||
err = crypto_cipher_setkey(ctx->tfm, key, len);
|
||||
if (err) {
|
||||
pr_err("TCP: TFO cipher key error: %d\n", err);
|
||||
crypto_free_cipher(ctx->tfm);
|
||||
goto error;
|
||||
}
|
||||
memcpy(ctx->key, key, len);
|
||||
|
||||
spin_lock(&tcp_fastopen_ctx_lock);
|
||||
|
||||
octx = rcu_dereference_protected(tcp_fastopen_ctx,
|
||||
lockdep_is_held(&tcp_fastopen_ctx_lock));
|
||||
rcu_assign_pointer(tcp_fastopen_ctx, ctx);
|
||||
spin_unlock(&tcp_fastopen_ctx_lock);
|
||||
|
||||
if (octx)
|
||||
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool __tcp_fastopen_cookie_gen(const void *path,
|
||||
struct tcp_fastopen_cookie *foc)
|
||||
{
|
||||
struct tcp_fastopen_context *ctx;
|
||||
bool ok = false;
|
||||
|
||||
tcp_fastopen_init_key_once(true);
|
||||
|
||||
rcu_read_lock();
|
||||
ctx = rcu_dereference(tcp_fastopen_ctx);
|
||||
if (ctx) {
|
||||
crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
|
||||
foc->len = TCP_FASTOPEN_COOKIE_SIZE;
|
||||
ok = true;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return ok;
|
||||
}
|
||||
|
||||
/* Generate the fastopen cookie by doing aes128 encryption on both
 * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
 * addresses. For the longer IPv6 addresses use CBC-MAC.
 *
 * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
 */
static bool tcp_fastopen_cookie_gen(struct request_sock *req,
				    struct sk_buff *syn,
				    struct tcp_fastopen_cookie *foc)
{
	if (req->rsk_ops->family == AF_INET) {
		const struct iphdr *iph = ip_hdr(syn);

		/* saddr/daddr packed into one AES block, zero-padded. */
		__be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
		return __tcp_fastopen_cookie_gen(path, foc);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (req->rsk_ops->family == AF_INET6) {
		const struct ipv6hdr *ip6h = ipv6_hdr(syn);
		struct tcp_fastopen_cookie tmp;

		/* CBC-MAC over two blocks: encrypt saddr, XOR in daddr,
		 * then encrypt the result again.
		 */
		if (__tcp_fastopen_cookie_gen(&ip6h->saddr, &tmp)) {
			struct in6_addr *buf = (struct in6_addr *) tmp.val;
			int i;

			for (i = 0; i < 4; i++)
				buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
			return __tcp_fastopen_cookie_gen(buf, foc);
		}
	}
#endif
	/* Unsupported address family. */
	return false;
}
|
||||
|
||||
/* Create a full child socket directly from a TFO SYN — before the
 * three-way handshake completes — queue any data carried in the SYN,
 * and place the child on the listener's accept queue.  Returns true on
 * success; on failure the caller falls back to a plain SYN-ACK.
 * NOTE(review): statement order here (locking, refcounting, timer
 * setup) is deliberate; do not reorder.
 */
static bool tcp_fastopen_create_child(struct sock *sk,
				      struct sk_buff *skb,
				      struct dst_entry *dst,
				      struct request_sock *req)
{
	struct tcp_sock *tp;
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	struct sock *child;
	u32 end_seq;

	req->num_retrans = 0;
	req->num_timeout = 0;
	req->sk = NULL;

	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
	if (child == NULL)
		return false;

	/* Account this pending TFO request against the listener's quota. */
	spin_lock(&queue->fastopenq->lock);
	queue->fastopenq->qlen++;
	spin_unlock(&queue->fastopenq->lock);

	/* Initialize the child socket. Have to fix some values to take
	 * into account the child is a Fast Open socket and is created
	 * only out of the bits carried in the SYN packet.
	 */
	tp = tcp_sk(child);

	tp->fastopen_rsk = req;
	/* Do a hold on the listner sk so that if the listener is being
	 * closed, the child that has been accepted can live on and still
	 * access listen_lock.
	 */
	sock_hold(sk);
	tcp_rsk(req)->listener = sk;

	/* RFC1323: The window in SYN & SYN/ACK segments is never
	 * scaled. So correct it appropriately.
	 */
	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);

	/* Activate the retrans timer so that SYNACK can be retransmitted.
	 * The request socket is not added to the SYN table of the parent
	 * because it's been added to the accept queue directly.
	 */
	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);

	/* Add the child socket directly into the accept queue */
	inet_csk_reqsk_queue_add(sk, req, child);

	/* Now finish processing the fastopen child socket. */
	inet_csk(child)->icsk_af_ops->rebuild_header(child);
	tcp_init_congestion_control(child);
	tcp_mtup_init(child);
	tcp_init_metrics(child);
	tcp_init_buffer_space(child);

	/* Queue the data carried in the SYN packet. We need to first
	 * bump skb's refcnt because the caller will attempt to free it.
	 * Note that IPv6 might also have used skb_get() trick
	 * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
	 * So we need to eventually get a clone of the packet,
	 * before inserting it in sk_receive_queue.
	 *
	 * XXX (TFO) - we honor a zero-payload TFO request for now,
	 * (any reason not to?) but no need to queue the skb since
	 * there is no data. How about SYN+FIN?
	 */
	end_seq = TCP_SKB_CB(skb)->end_seq;
	if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
		struct sk_buff *skb2;

		if (unlikely(skb_shared(skb)))
			skb2 = skb_clone(skb, GFP_ATOMIC);
		else
			skb2 = skb_get(skb);

		if (likely(skb2)) {
			skb_dst_drop(skb2);
			__skb_pull(skb2, tcp_hdrlen(skb));
			skb_set_owner_r(skb2, child);
			__skb_queue_tail(&child->sk_receive_queue, skb2);
			tp->syn_data_acked = 1;
		} else {
			/* Clone failed: drop the payload, ack only the SYN. */
			end_seq = TCP_SKB_CB(skb)->seq + 1;
		}
	}
	tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
	sk->sk_data_ready(sk);
	bh_unlock_sock(child);
	sock_put(child);
	WARN_ON(req->sk == NULL);
	return true;
}
EXPORT_SYMBOL(tcp_fastopen_create_child);
|
||||
|
||||
/* Check whether the listener may accept a Fast Open request: TFO must
 * be enabled on the listener, and when the TFO queue is full, try to
 * reap one expired RST-ed request before refusing.
 */
static bool tcp_fastopen_queue_check(struct sock *sk)
{
	struct fastopen_queue *fastopenq;

	/* Make sure the listener has enabled fastopen, and we don't
	 * exceed the max # of pending TFO requests allowed before trying
	 * to validating the cookie in order to avoid burning CPU cycles
	 * unnecessarily.
	 *
	 * XXX (TFO) - The implication of checking the max_qlen before
	 * processing a cookie request is that clients can't differentiate
	 * between qlen overflow causing Fast Open to be disabled
	 * temporarily vs a server not supporting Fast Open at all.
	 */
	fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
	if (fastopenq == NULL || fastopenq->max_qlen == 0)
		return false;

	if (fastopenq->qlen >= fastopenq->max_qlen) {
		struct request_sock *req1;
		spin_lock(&fastopenq->lock);
		req1 = fastopenq->rskq_rst_head;
		/* No reapable (expired) RST-ed request: queue is truly full. */
		if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
			spin_unlock(&fastopenq->lock);
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
			return false;
		}
		/* Evict the oldest expired entry to make room. */
		fastopenq->rskq_rst_head = req1->dl_next;
		fastopenq->qlen--;
		spin_unlock(&fastopenq->lock);
		reqsk_free(req1);
	}
	return true;
}
|
||||
|
||||
/* Returns true if we should perform Fast Open on the SYN. The cookie (foc)
 * may be updated and return the client in the SYN-ACK later. E.g., Fast Open
 * cookie request (foc->len == 0).
 */
bool tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
		      struct request_sock *req,
		      struct tcp_fastopen_cookie *foc,
		      struct dst_entry *dst)
{
	struct tcp_fastopen_cookie valid_foc = { .len = -1 };
	/* SYN carries payload iff end_seq goes beyond seq + 1 (the SYN flag). */
	bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;

	/* Bail unless server-side TFO is on, the SYN carries data or a
	 * cookie (request), and the listener's TFO queue has room. */
	if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
	      (syn_data || foc->len >= 0) &&
	      tcp_fastopen_queue_check(sk))) {
		foc->len = -1;
		return false;
	}

	/* Cookie-less mode: jumps directly to the fastopen label below. */
	if (syn_data && (sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD))
		goto fastopen;

	if (tcp_fastopen_cookie_gen(req, skb, &valid_foc) &&
	    foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
	    foc->len == valid_foc.len &&
	    !memcmp(foc->val, valid_foc.val, foc->len)) {
		/* Cookie is valid. Create a (full) child socket to accept
		 * the data in SYN before returning a SYN-ACK to ack the
		 * data. If we fail to create the socket, fall back and
		 * ack the ISN only but includes the same cookie.
		 *
		 * Note: Data-less SYN with valid cookie is allowed to send
		 * data in SYN_RECV state.
		 */
fastopen:	/* NOTE(review): label inside the if-block; entered both by
		 * fallthrough (valid cookie) and by the goto above. */
		if (tcp_fastopen_create_child(sk, skb, dst, req)) {
			foc->len = -1;
			NET_INC_STATS_BH(sock_net(sk),
					 LINUX_MIB_TCPFASTOPENPASSIVE);
			return true;
		}
	}

	/* Invalid/missing cookie, or child creation failed: report the
	 * freshly generated cookie to the client via the SYN-ACK. */
	NET_INC_STATS_BH(sock_net(sk), foc->len ?
			 LINUX_MIB_TCPFASTOPENPASSIVEFAIL :
			 LINUX_MIB_TCPFASTOPENCOOKIEREQD);
	*foc = valid_foc;
	return false;
}
EXPORT_SYMBOL(tcp_try_fastopen);
|
||||
185
net/ipv4/tcp_highspeed.c
Normal file
185
net/ipv4/tcp_highspeed.c
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
/*
|
||||
* Sally Floyd's High Speed TCP (RFC 3649) congestion control
|
||||
*
|
||||
* See http://www.icir.org/floyd/hstcp.html
|
||||
*
|
||||
* John Heffner <jheffner@psc.edu>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/* From AIMD tables from RFC 3649 appendix B,
 * with fixed-point MD scaled <<8.
 * Rows are sorted by ascending cwnd; hstcp_cong_avoid() keeps an index
 * (ca->ai) into this table so lookups are O(1) per ACK.
 */
static const struct hstcp_aimd_val {
	unsigned int cwnd;	/* upper cwnd bound for this row (packets) */
	unsigned int md;	/* multiplicative-decrease factor, <<8 */
} hstcp_aimd_vals[] = {
	{     38,  128, /*  0.50 */ },
	{    118,  112, /*  0.44 */ },
	{    221,  104, /*  0.41 */ },
	{    347,   98, /*  0.38 */ },
	{    495,   93, /*  0.37 */ },
	{    663,   89, /*  0.35 */ },
	{    851,   86, /*  0.34 */ },
	{   1058,   83, /*  0.33 */ },
	{   1284,   81, /*  0.32 */ },
	{   1529,   78, /*  0.31 */ },
	{   1793,   76, /*  0.30 */ },
	{   2076,   74, /*  0.29 */ },
	{   2378,   72, /*  0.28 */ },
	{   2699,   71, /*  0.28 */ },
	{   3039,   69, /*  0.27 */ },
	{   3399,   68, /*  0.27 */ },
	{   3778,   66, /*  0.26 */ },
	{   4177,   65, /*  0.26 */ },
	{   4596,   64, /*  0.25 */ },
	{   5036,   62, /*  0.25 */ },
	{   5497,   61, /*  0.24 */ },
	{   5979,   60, /*  0.24 */ },
	{   6483,   59, /*  0.23 */ },
	{   7009,   58, /*  0.23 */ },
	{   7558,   57, /*  0.22 */ },
	{   8130,   56, /*  0.22 */ },
	{   8726,   55, /*  0.22 */ },
	{   9346,   54, /*  0.21 */ },
	{   9991,   53, /*  0.21 */ },
	{  10661,   52, /*  0.21 */ },
	{  11358,   52, /*  0.20 */ },
	{  12082,   51, /*  0.20 */ },
	{  12834,   50, /*  0.20 */ },
	{  13614,   49, /*  0.19 */ },
	{  14424,   48, /*  0.19 */ },
	{  15265,   48, /*  0.19 */ },
	{  16137,   47, /*  0.19 */ },
	{  17042,   46, /*  0.18 */ },
	{  17981,   45, /*  0.18 */ },
	{  18955,   45, /*  0.18 */ },
	{  19965,   44, /*  0.17 */ },
	{  21013,   43, /*  0.17 */ },
	{  22101,   43, /*  0.17 */ },
	{  23230,   42, /*  0.17 */ },
	{  24402,   41, /*  0.16 */ },
	{  25618,   41, /*  0.16 */ },
	{  26881,   40, /*  0.16 */ },
	{  28193,   39, /*  0.16 */ },
	{  29557,   39, /*  0.15 */ },
	{  30975,   38, /*  0.15 */ },
	{  32450,   38, /*  0.15 */ },
	{  33986,   37, /*  0.15 */ },
	{  35586,   36, /*  0.14 */ },
	{  37253,   36, /*  0.14 */ },
	{  38992,   35, /*  0.14 */ },
	{  40808,   35, /*  0.14 */ },
	{  42707,   34, /*  0.13 */ },
	{  44694,   33, /*  0.13 */ },
	{  46776,   33, /*  0.13 */ },
	{  48961,   32, /*  0.13 */ },
	{  51258,   32, /*  0.13 */ },
	{  53677,   31, /*  0.12 */ },
	{  56230,   30, /*  0.12 */ },
	{  58932,   30, /*  0.12 */ },
	{  61799,   29, /*  0.12 */ },
	{  64851,   28, /*  0.11 */ },
	{  68113,   28, /*  0.11 */ },
	{  71617,   27, /*  0.11 */ },
	{  75401,   26, /*  0.10 */ },
	{  79517,   26, /*  0.10 */ },
	{  84035,   25, /*  0.10 */ },
	{  89053,   24, /*  0.10 */ },
};

#define HSTCP_AIMD_MAX	ARRAY_SIZE(hstcp_aimd_vals)

/* Per-socket HSTCP state. */
struct hstcp {
	u32	ai;	/* current index into hstcp_aimd_vals[] */
};
|
||||
|
||||
static void hstcp_init(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct hstcp *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->ai = 0;
|
||||
|
||||
/* Ensure the MD arithmetic works. This is somewhat pedantic,
|
||||
* since I don't think we will see a cwnd this large. :) */
|
||||
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
|
||||
}
|
||||
|
||||
/* Per-ACK cwnd growth: Reno slow start below ssthresh, otherwise the
 * HSTCP additive increase a(w) driven by the AIMD table index.
 */
static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct hstcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp, acked);
	else {
		/* Update AIMD parameters.
		 *
		 * We want to guarantee that:
		 *     hstcp_aimd_vals[ca->ai-1].cwnd <
		 *     snd_cwnd <=
		 *     hstcp_aimd_vals[ca->ai].cwnd
		 */
		if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
			/* cwnd grew past this row: walk the index up. */
			while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
			       ca->ai < HSTCP_AIMD_MAX - 1)
				ca->ai++;
		} else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
			/* cwnd shrank below the previous row: walk it down. */
			while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
				ca->ai--;
		}

		/* Do additive increase */
		if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
			/* cwnd = cwnd + a(w) / cwnd */
			tp->snd_cwnd_cnt += ca->ai + 1;
			if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
				tp->snd_cwnd_cnt -= tp->snd_cwnd;
				tp->snd_cwnd++;
			}
		}
	}
}
|
||||
|
||||
static u32 hstcp_ssthresh(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
const struct hstcp *ca = inet_csk_ca(sk);
|
||||
|
||||
/* Do multiplicative decrease */
|
||||
return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
|
||||
}
|
||||
|
||||
|
||||
/* HighSpeed TCP congestion-control operations. */
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
	.init		= hstcp_init,
	.ssthresh	= hstcp_ssthresh,
	.cong_avoid	= hstcp_cong_avoid,

	.owner		= THIS_MODULE,
	.name		= "highspeed"
};

static int __init hstcp_register(void)
{
	/* Per-socket CA state must fit in the space inet_csk reserves. */
	BUILD_BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&tcp_highspeed);
}

static void __exit hstcp_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_highspeed);
}

module_init(hstcp_register);
module_exit(hstcp_unregister);

MODULE_AUTHOR("John Heffner");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("High Speed TCP");
|
||||
317
net/ipv4/tcp_htcp.c
Normal file
317
net/ipv4/tcp_htcp.c
Normal file
|
|
@ -0,0 +1,317 @@
|
|||
/*
|
||||
* H-TCP congestion control. The algorithm is detailed in:
|
||||
* R.N.Shorten, D.J.Leith:
|
||||
* "H-TCP: TCP for high-speed and long-distance networks"
|
||||
* Proc. PFLDnet, Argonne, 2004.
|
||||
* http://www.hamilton.ie/net/htcp3.pdf
|
||||
*/
|
||||
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/* H-TCP fixed-point constants; all values carry 7 fractional bits. */
#define ALPHA_BASE	(1<<7)	/* 1.0 with shift << 7 */
#define BETA_MIN	(1<<6)	/* 0.5 with shift << 7 */
#define BETA_MAX	102	/* 0.8 with shift << 7 */

/* Scale alpha by the flow's RTT so short- and long-RTT flows compete fairly. */
static int use_rtt_scaling __read_mostly = 1;
module_param(use_rtt_scaling, int, 0644);
MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling");

/* Reset beta to BETA_MIN when measured bandwidth shifts significantly. */
static int use_bandwidth_switch __read_mostly = 1;
module_param(use_bandwidth_switch, int, 0644);
MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher");
|
||||
|
||||
/* Per-socket H-TCP state (lives in the inet_csk CA private area). */
struct htcp {
	u32	alpha;		/* Fixed point arith, << 7 */
	u8	beta;           /* Fixed point arith, << 7 */
	u8	modeswitch;	/* Delay modeswitch
				   until we had at least one congestion event */
	u16	pkts_acked;	/* pkts acked by last ACK (for additive increase) */
	u32	packetcount;	/* pkts acked in current measurement interval */
	u32	minRTT;		/* smallest RTT seen, in jiffies */
	u32	maxRTT;		/* largest RTT seen (slowly decayed), in jiffies */
	u32	last_cong;	/* Time since last congestion event end */
	u32	undo_last_cong;	/* snapshot of last_cong for undo */

	u32	undo_maxRTT;	/* snapshot of maxRTT for undo */
	u32	undo_old_maxB;	/* snapshot of old_maxB for undo */

	/* Bandwidth estimation */
	u32	minB;		/* minimum estimated bandwidth */
	u32	maxB;		/* maximum estimated bandwidth */
	u32	old_maxB;	/* maxB from the previous interval */
	u32	Bi;		/* smoothed bandwidth estimate */
	u32	lasttime;	/* start of current bandwidth interval */
};
|
||||
|
||||
/* Jiffies elapsed since the end of the last congestion event. */
static inline u32 htcp_cong_time(const struct htcp *ca)
{
	return jiffies - ca->last_cong;
}

/* Congestion-free time expressed in minRTT units.
 * NOTE(review): divides by ca->minRTT; callers appear to only use this
 * once an RTT sample exists (minRTT != 0) — confirm before new uses.
 */
static inline u32 htcp_ccount(const struct htcp *ca)
{
	return htcp_cong_time(ca) / ca->minRTT;
}
|
||||
|
||||
static inline void htcp_reset(struct htcp *ca)
|
||||
{
|
||||
ca->undo_last_cong = ca->last_cong;
|
||||
ca->undo_maxRTT = ca->maxRTT;
|
||||
ca->undo_old_maxB = ca->old_maxB;
|
||||
|
||||
ca->last_cong = jiffies;
|
||||
}
|
||||
|
||||
static u32 htcp_cwnd_undo(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct htcp *ca = inet_csk_ca(sk);
|
||||
|
||||
if (ca->undo_last_cong) {
|
||||
ca->last_cong = ca->undo_last_cong;
|
||||
ca->maxRTT = ca->undo_maxRTT;
|
||||
ca->old_maxB = ca->undo_old_maxB;
|
||||
ca->undo_last_cong = 0;
|
||||
}
|
||||
|
||||
return max(tp->snd_cwnd, (tp->snd_ssthresh << 7) / ca->beta);
|
||||
}
|
||||
|
||||
/* Fold one smoothed-RTT sample (@srtt, jiffies) into min/max tracking. */
static inline void measure_rtt(struct sock *sk, u32 srtt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct htcp *ca = inet_csk_ca(sk);

	/* keep track of minimum RTT seen so far, minRTT is zero at first */
	if (ca->minRTT > srtt || !ca->minRTT)
		ca->minRTT = srtt;

	/* max RTT */
	if (icsk->icsk_ca_state == TCP_CA_Open) {
		if (ca->maxRTT < ca->minRTT)
			ca->maxRTT = ca->minRTT;
		/* Grow maxRTT by at most ~20ms per sample so one outlier
		 * cannot inflate it. */
		if (ca->maxRTT < srtt &&
		    srtt <= ca->maxRTT + msecs_to_jiffies(20))
			ca->maxRTT = srtt;
	}
}
|
||||
|
||||
/* pkts_acked hook: record the ACK burst size, feed the RTT sample to
 * measure_rtt(), and (optionally) update the achieved-bandwidth
 * estimate used by the bandwidth switcher.  @rtt is in usec, < 0 when
 * no valid sample is available.
 */
static void measure_achieved_throughput(struct sock *sk,
					u32 pkts_acked, s32 rtt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct htcp *ca = inet_csk_ca(sk);
	u32 now = tcp_time_stamp;

	if (icsk->icsk_ca_state == TCP_CA_Open)
		ca->pkts_acked = pkts_acked;

	if (rtt > 0)
		measure_rtt(sk, usecs_to_jiffies(rtt));

	if (!use_bandwidth_switch)
		return;

	/* achieved throughput calculations */
	if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
		/* In recovery/loss: restart the measurement interval. */
		ca->packetcount = 0;
		ca->lasttime = now;
		return;
	}

	ca->packetcount += pkts_acked;

	/* Close the interval once roughly a cwnd of data was acked and at
	 * least one minRTT has elapsed. */
	if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
	    now - ca->lasttime >= ca->minRTT &&
	    ca->minRTT > 0) {
		__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);

		if (htcp_ccount(ca) <= 3) {
			/* just after backoff */
			ca->minB = ca->maxB = ca->Bi = cur_Bi;
		} else {
			/* Exponentially smooth the bandwidth estimate (3/4 old). */
			ca->Bi = (3 * ca->Bi + cur_Bi) / 4;
			if (ca->Bi > ca->maxB)
				ca->maxB = ca->Bi;
			if (ca->minB > ca->maxB)
				ca->minB = ca->maxB;
		}
		ca->packetcount = 0;
		ca->lasttime = now;
	}
}
|
||||
|
||||
/* Recompute the backoff factor beta (<<7) after a congestion event. */
static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT)
{
	if (use_bandwidth_switch) {
		u32 maxB = ca->maxB;
		u32 old_maxB = ca->old_maxB;

		ca->old_maxB = ca->maxB;
		/* Bandwidth moved by more than +-20%: fall back to the
		 * conservative beta and restart adaptive mode. */
		if (!between(5 * maxB, 4 * old_maxB, 6 * old_maxB)) {
			ca->beta = BETA_MIN;
			ca->modeswitch = 0;
			return;
		}
	}

	/* Adaptive backoff beta = minRTT/maxRTT, clamped to [0.5, 0.8];
	 * only once one full congestion event has passed (modeswitch) and
	 * RTTs are large enough to be meaningful. */
	if (ca->modeswitch && minRTT > msecs_to_jiffies(10) && maxRTT) {
		ca->beta = (minRTT << 7) / maxRTT;
		if (ca->beta < BETA_MIN)
			ca->beta = BETA_MIN;
		else if (ca->beta > BETA_MAX)
			ca->beta = BETA_MAX;
	} else {
		ca->beta = BETA_MIN;
		ca->modeswitch = 1;
	}
}
|
||||
|
||||
/* Recompute the additive-increase factor alpha (<<7) as a function of
 * the time since the last congestion event (H-TCP's defining feature),
 * optionally scaled by the flow's RTT.
 */
static inline void htcp_alpha_update(struct htcp *ca)
{
	u32 minRTT = ca->minRTT;
	u32 factor = 1;
	u32 diff = htcp_cong_time(ca);

	/* After 1s congestion-free, grow quadratically with elapsed time. */
	if (diff > HZ) {
		diff -= HZ;
		factor = 1 + (10 * diff + ((diff / 2) * (diff / 2) / HZ)) / HZ;
	}

	if (use_rtt_scaling && minRTT) {
		u32 scale = (HZ << 3) / (10 * minRTT);

		/* clamping ratio to interval [0.5,10]<<3 */
		scale = min(max(scale, 1U << 2), 10U << 3);
		factor = (factor << 3) / scale;
		if (!factor)
			factor = 1;
	}

	/* alpha = 2 * factor * (1 - beta), floored at 1.0 (<<7). */
	ca->alpha = 2 * factor * ((1 << 7) - ca->beta);
	if (!ca->alpha)
		ca->alpha = ALPHA_BASE;
}
|
||||
|
||||
/*
 * After we have the rtt data to calculate beta, we'd still prefer to wait one
 * rtt before we adjust our beta to ensure we are working from a consistent
 * data.
 *
 * This function should be called when we hit a congestion event since only at
 * that point do we really have a real sense of maxRTT (the queues en route
 * were getting just too full now).
 */
static void htcp_param_update(struct sock *sk)
{
	struct htcp *ca = inet_csk_ca(sk);
	u32 minRTT = ca->minRTT;
	u32 maxRTT = ca->maxRTT;

	htcp_beta_update(ca, minRTT, maxRTT);
	htcp_alpha_update(ca);

	/* add slowly fading memory for maxRTT to accommodate routing changes */
	if (minRTT > 0 && maxRTT > minRTT)
		ca->maxRTT = minRTT + ((maxRTT - minRTT) * 95) / 100;
}
|
||||
|
||||
static u32 htcp_recalc_ssthresh(struct sock *sk)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
const struct htcp *ca = inet_csk_ca(sk);
|
||||
|
||||
htcp_param_update(sk);
|
||||
return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
|
||||
}
|
||||
|
||||
/* Per-ACK cwnd growth: Reno slow start below ssthresh, otherwise
 * alpha-driven additive increase accumulated in snd_cwnd_cnt.
 */
static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct htcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp, acked);
	else {
		/* In dangerous area, increase slowly.
		 * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
		 */
		if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
			tp->snd_cwnd_cnt = 0;
			/* alpha depends on time since congestion: refresh it. */
			htcp_alpha_update(ca);
		} else
			tp->snd_cwnd_cnt += ca->pkts_acked;

		ca->pkts_acked = 1;
	}
}
|
||||
|
||||
static void htcp_init(struct sock *sk)
|
||||
{
|
||||
struct htcp *ca = inet_csk_ca(sk);
|
||||
|
||||
memset(ca, 0, sizeof(struct htcp));
|
||||
ca->alpha = ALPHA_BASE;
|
||||
ca->beta = BETA_MIN;
|
||||
ca->pkts_acked = 1;
|
||||
ca->last_cong = jiffies;
|
||||
}
|
||||
|
||||
/* CA state-change hook: commit a pending undo when returning to Open,
 * snapshot state (htcp_reset) when entering a congestion state.
 */
static void htcp_state(struct sock *sk, u8 new_state)
{
	struct htcp *ca = inet_csk_ca(sk);

	if (new_state == TCP_CA_Open) {
		if (ca->undo_last_cong) {
			ca->last_cong = jiffies;
			ca->undo_last_cong = 0;
		}
	} else if (new_state == TCP_CA_CWR ||
		   new_state == TCP_CA_Recovery ||
		   new_state == TCP_CA_Loss) {
		htcp_reset(ca);
	}
}
|
||||
|
||||
/* H-TCP congestion-control operations. */
static struct tcp_congestion_ops htcp __read_mostly = {
	.init		= htcp_init,
	.ssthresh	= htcp_recalc_ssthresh,
	.cong_avoid	= htcp_cong_avoid,
	.set_state	= htcp_state,
	.undo_cwnd	= htcp_cwnd_undo,
	.pkts_acked	= measure_achieved_throughput,
	.owner		= THIS_MODULE,
	.name		= "htcp",
};

static int __init htcp_register(void)
{
	/* Per-socket CA state must fit in the space inet_csk reserves. */
	BUILD_BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE);
	/* The beta clamp logic assumes BETA_MIN < BETA_MAX. */
	BUILD_BUG_ON(BETA_MIN >= BETA_MAX);
	return tcp_register_congestion_control(&htcp);
}

static void __exit htcp_unregister(void)
{
	tcp_unregister_congestion_control(&htcp);
}

module_init(htcp_register);
module_exit(htcp_unregister);

MODULE_AUTHOR("Baruch Even");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("H-TCP");
|
||||
192
net/ipv4/tcp_hybla.c
Normal file
192
net/ipv4/tcp_hybla.c
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* TCP HYBLA
|
||||
*
|
||||
* TCP-HYBLA Congestion control algorithm, based on:
|
||||
* C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement
|
||||
* for Heterogeneous Networks",
|
||||
* International Journal on satellite Communications,
|
||||
* September 2004
|
||||
* Daniele Lacamera
|
||||
* root at danielinux.net
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/* Tcp Hybla structure. */
|
||||
/* Per-socket TCP Hybla state (stored in the ICSK private area). */
struct hybla {
	bool  hybla_en;		/* use Hybla growth (only while TCP_CA_Open) */
	u32   snd_cwnd_cents;	/* Keeps increment values when it is <1, <<7 */
	u32   rho;		/* Rho parameter, integer part */
	u32   rho2;		/* Rho * Rho, integer part */
	u32   rho_3ls;		/* Rho parameter, <<3 */
	u32   rho2_7ls;		/* Rho^2, <<7 */
	u32   minrtt_us;	/* Minimum smoothed round trip time value seen */
};
|
||||
|
||||
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
|
||||
static int rtt0 = 25;
|
||||
module_param(rtt0, int, 0644);
|
||||
MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)");
|
||||
|
||||
/* This is called to refresh values for hybla parameters */
|
||||
static inline void hybla_recalc_param (struct sock *sk)
|
||||
{
|
||||
struct hybla *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->rho_3ls = max_t(u32,
|
||||
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
|
||||
8U);
|
||||
ca->rho = ca->rho_3ls >> 3;
|
||||
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
|
||||
ca->rho2 = ca->rho2_7ls >> 7;
|
||||
}
|
||||
|
||||
static void hybla_init(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct hybla *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->rho = 0;
|
||||
ca->rho2 = 0;
|
||||
ca->rho_3ls = 0;
|
||||
ca->rho2_7ls = 0;
|
||||
ca->snd_cwnd_cents = 0;
|
||||
ca->hybla_en = true;
|
||||
tp->snd_cwnd = 2;
|
||||
tp->snd_cwnd_clamp = 65535;
|
||||
|
||||
/* 1st Rho measurement based on initial srtt */
|
||||
hybla_recalc_param(sk);
|
||||
|
||||
/* set minimum rtt as this is the 1st ever seen */
|
||||
ca->minrtt_us = tp->srtt_us;
|
||||
tp->snd_cwnd = ca->rho;
|
||||
}
|
||||
|
||||
/* Hybla growth is only used while the connection is in TCP_CA_Open. */
static void hybla_state(struct sock *sk, u8 ca_state)
{
	struct hybla *ca = inet_csk_ca(sk);

	ca->hybla_en = (ca_state == TCP_CA_Open);
}
|
||||
|
||||
/*
 * Look up 2^(odds/8) in <<7 fixed point for odds in [0,7];
 * out-of-range values fall back to 128 (i.e. 1.0).
 */
static inline u32 hybla_fraction(u32 odds)
{
	static const u32 fractions[] = {
		128, 139, 152, 165, 181, 197, 215, 234,
	};

	if (odds >= ARRAY_SIZE(fractions))
		return 128;

	return fractions[odds];
}
|
||||
|
||||
/* TCP Hybla main routine.
|
||||
* This is the algorithm behavior:
|
||||
* o Recalc Hybla parameters if min_rtt has changed
|
||||
* o Give cwnd a new value based on the model proposed
|
||||
* o remember increments <1
|
||||
*/
|
||||
static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct hybla *ca = inet_csk_ca(sk);
|
||||
u32 increment, odd, rho_fractions;
|
||||
int is_slowstart = 0;
|
||||
|
||||
/* Recalculate rho only if this srtt is the lowest */
|
||||
if (tp->srtt_us < ca->minrtt_us) {
|
||||
hybla_recalc_param(sk);
|
||||
ca->minrtt_us = tp->srtt_us;
|
||||
}
|
||||
|
||||
if (!tcp_is_cwnd_limited(sk))
|
||||
return;
|
||||
|
||||
if (!ca->hybla_en) {
|
||||
tcp_reno_cong_avoid(sk, ack, acked);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ca->rho == 0)
|
||||
hybla_recalc_param(sk);
|
||||
|
||||
rho_fractions = ca->rho_3ls - (ca->rho << 3);
|
||||
|
||||
if (tp->snd_cwnd < tp->snd_ssthresh) {
|
||||
/*
|
||||
* slow start
|
||||
* INC = 2^RHO - 1
|
||||
* This is done by splitting the rho parameter
|
||||
* into 2 parts: an integer part and a fraction part.
|
||||
* Inrement<<7 is estimated by doing:
|
||||
* [2^(int+fract)]<<7
|
||||
* that is equal to:
|
||||
* (2^int) * [(2^fract) <<7]
|
||||
* 2^int is straightly computed as 1<<int,
|
||||
* while we will use hybla_slowstart_fraction_increment() to
|
||||
* calculate 2^fract in a <<7 value.
|
||||
*/
|
||||
is_slowstart = 1;
|
||||
increment = ((1 << min(ca->rho, 16U)) *
|
||||
hybla_fraction(rho_fractions)) - 128;
|
||||
} else {
|
||||
/*
|
||||
* congestion avoidance
|
||||
* INC = RHO^2 / W
|
||||
* as long as increment is estimated as (rho<<7)/window
|
||||
* it already is <<7 and we can easily count its fractions.
|
||||
*/
|
||||
increment = ca->rho2_7ls / tp->snd_cwnd;
|
||||
if (increment < 128)
|
||||
tp->snd_cwnd_cnt++;
|
||||
}
|
||||
|
||||
odd = increment % 128;
|
||||
tp->snd_cwnd += increment >> 7;
|
||||
ca->snd_cwnd_cents += odd;
|
||||
|
||||
/* check when fractions goes >=128 and increase cwnd by 1. */
|
||||
while (ca->snd_cwnd_cents >= 128) {
|
||||
tp->snd_cwnd++;
|
||||
ca->snd_cwnd_cents -= 128;
|
||||
tp->snd_cwnd_cnt = 0;
|
||||
}
|
||||
/* check when cwnd has not been incremented for a while */
|
||||
if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
|
||||
tp->snd_cwnd++;
|
||||
tp->snd_cwnd_cnt = 0;
|
||||
}
|
||||
/* clamp down slowstart cwnd to ssthresh value. */
|
||||
if (is_slowstart)
|
||||
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
|
||||
|
||||
tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
|
||||
}
|
||||
|
||||
/* TCP Hybla congestion-control operations registered with the TCP stack. */
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
	.init		= hybla_init,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= hybla_cong_avoid,
	.set_state	= hybla_state,

	.owner		= THIS_MODULE,
	.name		= "hybla"
};
|
||||
|
||||
/* Module entry: verify the private state fits, then register the ops. */
static int __init hybla_register(void)
{
	BUILD_BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE);

	return tcp_register_congestion_control(&tcp_hybla);
}
|
||||
|
||||
/* Module exit: detach the Hybla ops from the TCP stack. */
static void __exit hybla_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_hybla);
}
|
||||
|
||||
/* Module registration boilerplate. */
module_init(hybla_register);
module_exit(hybla_unregister);

MODULE_AUTHOR("Daniele Lacamera");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Hybla");
|
||||
354
net/ipv4/tcp_illinois.c
Normal file
354
net/ipv4/tcp_illinois.c
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
/*
|
||||
* TCP Illinois congestion control.
|
||||
* Home page:
|
||||
* http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
|
||||
*
|
||||
* The algorithm is described in:
|
||||
* "TCP-Illinois: A Loss and Delay-Based Congestion Control Algorithm
|
||||
* for High-Speed Networks"
|
||||
* http://www.ifp.illinois.edu/~srikant/Papers/liubassri06perf.pdf
|
||||
*
|
||||
* Implemented from description in paper and ns-2 simulation.
|
||||
* Copyright (C) 2007 Stephen Hemminger <shemminger@linux-foundation.org>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/inet_diag.h>
|
||||
#include <asm/div64.h>
|
||||
#include <net/tcp.h>
|
||||
|
||||
/* Fixed-point alpha (additive increase), 7 fractional bits. */
#define ALPHA_SHIFT	7
#define ALPHA_SCALE	(1u<<ALPHA_SHIFT)
#define ALPHA_MIN	((3*ALPHA_SCALE)/10)	/* ~0.3 */
#define ALPHA_MAX	(10*ALPHA_SCALE)	/* 10.0 */
#define ALPHA_BASE	ALPHA_SCALE		/* 1.0 */
/* Cap on individual RTT samples so sum_rtt*alpha cannot wrap. */
#define RTT_MAX		(U32_MAX / ALPHA_MAX)	/* 3.3 secs */

/* Fixed-point beta (multiplicative decrease), 6 fractional bits. */
#define BETA_SHIFT	6
#define BETA_SCALE	(1u<<BETA_SHIFT)
#define BETA_MIN	(BETA_SCALE/8)		/* 0.125 */
#define BETA_MAX	(BETA_SCALE/2)		/* 0.5 */
#define BETA_BASE	BETA_MAX
|
||||
|
||||
/* Below this cwnd, alpha/beta stay at their Reno-like base values. */
static int win_thresh __read_mostly = 15;
module_param(win_thresh, int, 0);
MODULE_PARM_DESC(win_thresh, "Window threshold for starting adaptive sizing");

/* Consecutive low-delay RTTs required before alpha may jump to max. */
static int theta __read_mostly = 5;
module_param(theta, int, 0);
MODULE_PARM_DESC(theta, "# of fast RTT's before full growth");
|
||||
|
||||
/* TCP Illinois Parameters */
|
||||
/* TCP Illinois Parameters (per-socket, kept in the ICSK private area). */
struct illinois {
	u64	sum_rtt;	/* sum of rtt's measured within last rtt */
	u16	cnt_rtt;	/* # of rtts measured within last rtt */
	u32	base_rtt;	/* min of all rtt in usec */
	u32	max_rtt;	/* max of all rtt in usec */
	u32	end_seq;	/* right edge of current RTT */
	u32	alpha;		/* Additive increase */
	u32	beta;		/* Muliplicative decrease */
	u16	acked;		/* # packets acked by current ACK */
	u8	rtt_above;	/* average rtt has gone above threshold */
	u8	rtt_low;	/* # of rtts measurements below threshold */
};
|
||||
|
||||
static void rtt_reset(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->end_seq = tp->snd_nxt;
|
||||
ca->cnt_rtt = 0;
|
||||
ca->sum_rtt = 0;
|
||||
|
||||
/* TODO: age max_rtt? */
|
||||
}
|
||||
|
||||
static void tcp_illinois_init(struct sock *sk)
|
||||
{
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->alpha = ALPHA_MAX;
|
||||
ca->beta = BETA_BASE;
|
||||
ca->base_rtt = 0x7fffffff;
|
||||
ca->max_rtt = 0;
|
||||
|
||||
ca->acked = 0;
|
||||
ca->rtt_low = 0;
|
||||
ca->rtt_above = 0;
|
||||
|
||||
rtt_reset(sk);
|
||||
}
|
||||
|
||||
/* Measure RTT for each ack. */
|
||||
static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
|
||||
{
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
ca->acked = pkts_acked;
|
||||
|
||||
/* dup ack, no rtt sample */
|
||||
if (rtt < 0)
|
||||
return;
|
||||
|
||||
/* ignore bogus values, this prevents wraparound in alpha math */
|
||||
if (rtt > RTT_MAX)
|
||||
rtt = RTT_MAX;
|
||||
|
||||
/* keep track of minimum RTT seen so far */
|
||||
if (ca->base_rtt > rtt)
|
||||
ca->base_rtt = rtt;
|
||||
|
||||
/* and max */
|
||||
if (ca->max_rtt < rtt)
|
||||
ca->max_rtt = rtt;
|
||||
|
||||
++ca->cnt_rtt;
|
||||
ca->sum_rtt += rtt;
|
||||
}
|
||||
|
||||
/* Maximum queuing delay */
|
||||
static inline u32 max_delay(const struct illinois *ca)
|
||||
{
|
||||
return ca->max_rtt - ca->base_rtt;
|
||||
}
|
||||
|
||||
/* Average queuing delay */
|
||||
static inline u32 avg_delay(const struct illinois *ca)
|
||||
{
|
||||
u64 t = ca->sum_rtt;
|
||||
|
||||
do_div(t, ca->cnt_rtt);
|
||||
return t - ca->base_rtt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute value of alpha used for additive increase.
|
||||
* If small window then use 1.0, equivalent to Reno.
|
||||
*
|
||||
* For larger windows, adjust based on average delay.
|
||||
* A. If average delay is at minimum (we are uncongested),
|
||||
* then use large alpha (10.0) to increase faster.
|
||||
* B. If average delay is at maximum (getting congested)
|
||||
* then use small alpha (0.3)
|
||||
*
|
||||
* The result is a convex window growth curve.
|
||||
*/
|
||||
static u32 alpha(struct illinois *ca, u32 da, u32 dm)
|
||||
{
|
||||
u32 d1 = dm / 100; /* Low threshold */
|
||||
|
||||
if (da <= d1) {
|
||||
/* If never got out of low delay zone, then use max */
|
||||
if (!ca->rtt_above)
|
||||
return ALPHA_MAX;
|
||||
|
||||
/* Wait for 5 good RTT's before allowing alpha to go alpha max.
|
||||
* This prevents one good RTT from causing sudden window increase.
|
||||
*/
|
||||
if (++ca->rtt_low < theta)
|
||||
return ca->alpha;
|
||||
|
||||
ca->rtt_low = 0;
|
||||
ca->rtt_above = 0;
|
||||
return ALPHA_MAX;
|
||||
}
|
||||
|
||||
ca->rtt_above = 1;
|
||||
|
||||
/*
|
||||
* Based on:
|
||||
*
|
||||
* (dm - d1) amin amax
|
||||
* k1 = -------------------
|
||||
* amax - amin
|
||||
*
|
||||
* (dm - d1) amin
|
||||
* k2 = ---------------- - d1
|
||||
* amax - amin
|
||||
*
|
||||
* k1
|
||||
* alpha = ----------
|
||||
* k2 + da
|
||||
*/
|
||||
|
||||
dm -= d1;
|
||||
da -= d1;
|
||||
return (dm * ALPHA_MAX) /
|
||||
(dm + (da * (ALPHA_MAX - ALPHA_MIN)) / ALPHA_MIN);
|
||||
}
|
||||
|
||||
/*
|
||||
* Beta used for multiplicative decrease.
|
||||
* For small window sizes returns same value as Reno (0.5)
|
||||
*
|
||||
* If delay is small (10% of max) then beta = 1/8
|
||||
* If delay is up to 80% of max then beta = 1/2
|
||||
* In between is a linear function
|
||||
*/
|
||||
static u32 beta(u32 da, u32 dm)
|
||||
{
|
||||
u32 d2, d3;
|
||||
|
||||
d2 = dm / 10;
|
||||
if (da <= d2)
|
||||
return BETA_MIN;
|
||||
|
||||
d3 = (8 * dm) / 10;
|
||||
if (da >= d3 || d3 <= d2)
|
||||
return BETA_MAX;
|
||||
|
||||
/*
|
||||
* Based on:
|
||||
*
|
||||
* bmin d3 - bmax d2
|
||||
* k3 = -------------------
|
||||
* d3 - d2
|
||||
*
|
||||
* bmax - bmin
|
||||
* k4 = -------------
|
||||
* d3 - d2
|
||||
*
|
||||
* b = k3 + k4 da
|
||||
*/
|
||||
return (BETA_MIN * d3 - BETA_MAX * d2 + (BETA_MAX - BETA_MIN) * da)
|
||||
/ (d3 - d2);
|
||||
}
|
||||
|
||||
/* Update alpha and beta values once per RTT */
|
||||
static void update_params(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
if (tp->snd_cwnd < win_thresh) {
|
||||
ca->alpha = ALPHA_BASE;
|
||||
ca->beta = BETA_BASE;
|
||||
} else if (ca->cnt_rtt > 0) {
|
||||
u32 dm = max_delay(ca);
|
||||
u32 da = avg_delay(ca);
|
||||
|
||||
ca->alpha = alpha(ca, da, dm);
|
||||
ca->beta = beta(da, dm);
|
||||
}
|
||||
|
||||
rtt_reset(sk);
|
||||
}
|
||||
|
||||
/*
|
||||
* In case of loss, reset to default values
|
||||
*/
|
||||
static void tcp_illinois_state(struct sock *sk, u8 new_state)
|
||||
{
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
if (new_state == TCP_CA_Loss) {
|
||||
ca->alpha = ALPHA_BASE;
|
||||
ca->beta = BETA_BASE;
|
||||
ca->rtt_low = 0;
|
||||
ca->rtt_above = 0;
|
||||
rtt_reset(sk);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase window in response to successful acknowledgment.
|
||||
*/
|
||||
static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
if (after(ack, ca->end_seq))
|
||||
update_params(sk);
|
||||
|
||||
/* RFC2861 only increase cwnd if fully utilized */
|
||||
if (!tcp_is_cwnd_limited(sk))
|
||||
return;
|
||||
|
||||
/* In slow start */
|
||||
if (tp->snd_cwnd <= tp->snd_ssthresh)
|
||||
tcp_slow_start(tp, acked);
|
||||
|
||||
else {
|
||||
u32 delta;
|
||||
|
||||
/* snd_cwnd_cnt is # of packets since last cwnd increment */
|
||||
tp->snd_cwnd_cnt += ca->acked;
|
||||
ca->acked = 1;
|
||||
|
||||
/* This is close approximation of:
|
||||
* tp->snd_cwnd += alpha/tp->snd_cwnd
|
||||
*/
|
||||
delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
|
||||
if (delta >= tp->snd_cwnd) {
|
||||
tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
|
||||
(u32)tp->snd_cwnd_clamp);
|
||||
tp->snd_cwnd_cnt = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static u32 tcp_illinois_ssthresh(struct sock *sk)
|
||||
{
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
/* Multiplicative decrease */
|
||||
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
|
||||
}
|
||||
|
||||
/* Extract info for Tcp socket info provided via netlink. */
|
||||
static void tcp_illinois_info(struct sock *sk, u32 ext,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
const struct illinois *ca = inet_csk_ca(sk);
|
||||
|
||||
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
|
||||
struct tcpvegas_info info = {
|
||||
.tcpv_enabled = 1,
|
||||
.tcpv_rttcnt = ca->cnt_rtt,
|
||||
.tcpv_minrtt = ca->base_rtt,
|
||||
};
|
||||
|
||||
if (info.tcpv_rttcnt > 0) {
|
||||
u64 t = ca->sum_rtt;
|
||||
|
||||
do_div(t, info.tcpv_rttcnt);
|
||||
info.tcpv_rtt = t;
|
||||
}
|
||||
nla_put(skb, INET_DIAG_VEGASINFO, sizeof(info), &info);
|
||||
}
|
||||
}
|
||||
|
||||
/* TCP Illinois congestion-control operations registered with the stack. */
static struct tcp_congestion_ops tcp_illinois __read_mostly = {
	.init		= tcp_illinois_init,
	.ssthresh	= tcp_illinois_ssthresh,
	.cong_avoid	= tcp_illinois_cong_avoid,
	.set_state	= tcp_illinois_state,
	.get_info	= tcp_illinois_info,
	.pkts_acked	= tcp_illinois_acked,

	.owner		= THIS_MODULE,
	.name		= "illinois",
};
|
||||
|
||||
/* Module entry: verify the private state fits, then register the ops. */
static int __init tcp_illinois_register(void)
{
	BUILD_BUG_ON(sizeof(struct illinois) > ICSK_CA_PRIV_SIZE);

	return tcp_register_congestion_control(&tcp_illinois);
}
|
||||
|
||||
/* Module exit: detach the Illinois ops from the TCP stack. */
static void __exit tcp_illinois_unregister(void)
{
	tcp_unregister_congestion_control(&tcp_illinois);
}
|
||||
|
||||
/* Module registration boilerplate. */
module_init(tcp_illinois_register);
module_exit(tcp_illinois_unregister);

MODULE_AUTHOR("Stephen Hemminger, Shao Liu");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("TCP Illinois");
MODULE_VERSION("1.0");
|
||||
6041
net/ipv4/tcp_input.c
Normal file
6041
net/ipv4/tcp_input.c
Normal file
File diff suppressed because it is too large
Load diff
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue