Mirror of https://github.com/AetherDroid/android_kernel_samsung_on5xelte.git, synced 2025-10-29 07:18:51 +01:00.

Commit f6dfaef42e: Fixed MTP to work with TWRP
50820 changed files with 20846062 additions and 0 deletions

net/sched/Kconfig (new file, 700 lines)
@@ -0,0 +1,700 @@
#
|
||||
# Traffic control configuration.
|
||||
#
|
||||
|
||||
menuconfig NET_SCHED
|
||||
bool "QoS and/or fair queueing"
|
||||
select NET_SCH_FIFO
|
||||
---help---
|
||||
When the kernel has several packets to send out over a network
|
||||
device, it has to decide which ones to send first, which ones to
|
||||
delay, and which ones to drop. This is the job of the queueing
|
||||
disciplines, several different algorithms for how to do this
|
||||
"fairly" have been proposed.
|
||||
|
||||
If you say N here, you will get the standard packet scheduler, which
|
||||
is a FIFO (first come, first served). If you say Y here, you will be
|
||||
able to choose from among several alternative algorithms which can
|
||||
then be attached to different network devices. This is useful for
|
||||
example if some of your network devices are real time devices that
|
||||
need a certain minimum data flow rate, or if you need to limit the
|
||||
maximum data flow rate for traffic which matches specified criteria.
|
||||
This code is considered to be experimental.
|
||||
|
||||
To administer these schedulers, you'll need the user-level utilities
|
||||
from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
|
||||
That package also contains some documentation; for more, check out
|
||||
<http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
|
||||
|
||||
This Quality of Service (QoS) support will enable you to use
|
||||
Differentiated Services (diffserv) and Resource Reservation Protocol
|
||||
(RSVP) on your Linux router if you also say Y to the corresponding
|
||||
classifiers below. Documentation and software is at
|
||||
<http://diffserv.sourceforge.net/>.
|
||||
|
||||
If you say Y here and to "/proc file system" below, you will be able
|
||||
to read status information about packet schedulers from the file
|
||||
/proc/net/psched.
|
||||
|
||||
The available schedulers are listed in the following questions; you
|
||||
can say Y to as many as you like. If unsure, say N now.
|
||||
|
||||
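The help text above notes that, with NET_SCHED and procfs support enabled, scheduler status can be read from /proc/net/psched. As a rough userspace illustration (not part of this commit, and assuming a typical kernel where that file holds a few hexadecimal words describing the scheduler clock), the file can simply be dumped:

    /* Minimal sketch (not from this commit): dump /proc/net/psched,
     * which NET_SCHED exposes when procfs support is also enabled. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/net/psched", "r");
        char line[128];

        if (!f) {
            perror("fopen /proc/net/psched");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);   /* typically a few hex words */
        fclose(f);
        return 0;
    }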
if NET_SCHED
|
||||
|
||||
comment "Queueing/Scheduling"
|
||||
|
||||
config NET_SCH_CBQ
|
||||
tristate "Class Based Queueing (CBQ)"
|
||||
---help---
|
||||
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
|
||||
scheduling algorithm. This algorithm classifies the waiting packets
|
||||
into a tree-like hierarchy of classes; the leaves of this tree are
|
||||
in turn scheduled by separate algorithms.
|
||||
|
||||
See the top of <file:net/sched/sch_cbq.c> for more details.
|
||||
|
||||
CBQ is a commonly used scheduler, so if you're unsure, you should
|
||||
say Y here. Then say Y to all the queueing algorithms below that you
|
||||
want to use as leaf disciplines.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_cbq.
|
||||
|
||||
config NET_SCH_HTB
|
||||
tristate "Hierarchical Token Bucket (HTB)"
|
||||
---help---
|
||||
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
|
||||
packet scheduling algorithm. See
|
||||
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
|
||||
in-depth articles.
|
||||
|
||||
HTB is very similar to CBQ regarding its goals, however it has
|
||||
different properties and a different algorithm.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_htb.
|
||||
|
||||
config NET_SCH_HFSC
|
||||
tristate "Hierarchical Fair Service Curve (HFSC)"
|
||||
---help---
|
||||
Say Y here if you want to use the Hierarchical Fair Service Curve
|
||||
(HFSC) packet scheduling algorithm.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_hfsc.
|
||||
|
||||
config NET_SCH_ATM
|
||||
tristate "ATM Virtual Circuits (ATM)"
|
||||
depends on ATM
|
||||
---help---
|
||||
Say Y here if you want to use the ATM pseudo-scheduler. This
|
||||
provides a framework for invoking classifiers, which in turn
|
||||
select classes of this queuing discipline. Each class maps
|
||||
the flow(s) it is handling to a given virtual circuit.
|
||||
|
||||
See the top of <file:net/sched/sch_atm.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_atm.
|
||||
|
||||
config NET_SCH_PRIO
|
||||
tristate "Multi Band Priority Queueing (PRIO)"
|
||||
---help---
|
||||
Say Y here if you want to use an n-band priority queue packet
|
||||
scheduler.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_prio.
|
||||
|
||||
config NET_SCH_MULTIQ
|
||||
tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
|
||||
---help---
|
||||
Say Y here if you want to use an n-band queue packet scheduler
|
||||
to support devices that have multiple hardware transmit queues.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_multiq.
|
||||
|
||||
config NET_SCH_RED
|
||||
tristate "Random Early Detection (RED)"
|
||||
---help---
|
||||
Say Y here if you want to use the Random Early Detection (RED)
|
||||
packet scheduling algorithm.
|
||||
|
||||
See the top of <file:net/sched/sch_red.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_red.
|
||||
|
||||
config NET_SCH_SFB
|
||||
tristate "Stochastic Fair Blue (SFB)"
|
||||
---help---
|
||||
Say Y here if you want to use the Stochastic Fair Blue (SFB)
|
||||
packet scheduling algorithm.
|
||||
|
||||
See the top of <file:net/sched/sch_sfb.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_sfb.
|
||||
|
||||
config NET_SCH_SFQ
|
||||
tristate "Stochastic Fairness Queueing (SFQ)"
|
||||
---help---
|
||||
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
|
||||
packet scheduling algorithm.
|
||||
|
||||
See the top of <file:net/sched/sch_sfq.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_sfq.
|
||||
|
||||
config NET_SCH_TEQL
|
||||
tristate "True Link Equalizer (TEQL)"
|
||||
---help---
|
||||
Say Y here if you want to use the True Link Equalizer (TEQL) packet
|
||||
scheduling algorithm. This queueing discipline allows the combination
|
||||
of several physical devices into one virtual device.
|
||||
|
||||
See the top of <file:net/sched/sch_teql.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_teql.
|
||||
|
||||
config NET_SCH_TBF
|
||||
tristate "Token Bucket Filter (TBF)"
|
||||
---help---
|
||||
Say Y here if you want to use the Token Bucket Filter (TBF) packet
|
||||
scheduling algorithm.
|
||||
|
||||
See the top of <file:net/sched/sch_tbf.c> for more details.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_tbf.
|
||||
|
||||
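The Token Bucket Filter above limits a flow to a configured rate while allowing bounded bursts. The following sketch is a hedged, userspace illustration of the underlying idea only, not the kernel's sch_tbf implementation: tokens accumulate at "rate" bytes per second up to "burst", and a packet may pass only if enough tokens are available.

    /* Conceptual token-bucket sketch; sch_tbf.c implements the real thing
     * with nanosecond timestamps and qdisc infrastructure. */
    #include <stdbool.h>

    struct tbf {
        double rate;     /* tokens (bytes) added per second */
        double burst;    /* maximum bucket depth in bytes */
        double tokens;   /* current fill level */
        double last;     /* time of the last update, in seconds */
    };

    static bool tbf_admit(struct tbf *b, double now, unsigned int pkt_len)
    {
        /* Refill the bucket for the time elapsed since the last packet. */
        b->tokens += (now - b->last) * b->rate;
        if (b->tokens > b->burst)
            b->tokens = b->burst;
        b->last = now;

        if (b->tokens < pkt_len)
            return false;    /* not enough tokens: delay or drop */
        b->tokens -= pkt_len;
        return true;         /* packet conforms, send it */
    }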
config NET_SCH_GRED
|
||||
tristate "Generic Random Early Detection (GRED)"
|
||||
---help---
|
||||
Say Y here if you want to use the Generic Random Early Detection
|
||||
(GRED) packet scheduling algorithm for some of your network devices
|
||||
(see the top of <file:net/sched/sch_red.c> for details and
|
||||
references about the algorithm).
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_gred.
|
||||
|
||||
config NET_SCH_DSMARK
|
||||
tristate "Differentiated Services marker (DSMARK)"
|
||||
---help---
|
||||
Say Y if you want to schedule packets according to the
|
||||
Differentiated Services architecture proposed in RFC 2475.
|
||||
Technical information on this method, with pointers to associated
|
||||
RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_dsmark.
|
||||
|
||||
config NET_SCH_NETEM
|
||||
tristate "Network emulator (NETEM)"
|
||||
---help---
|
||||
Say Y if you want to emulate network delay, loss, and packet
|
||||
re-ordering. This is often useful to simulate networks when
|
||||
testing applications or protocols.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_netem.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_DRR
|
||||
tristate "Deficit Round Robin scheduler (DRR)"
|
||||
help
|
||||
Say Y here if you want to use the Deficit Round Robin (DRR) packet
|
||||
scheduling algorithm.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_drr.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_MQPRIO
|
||||
tristate "Multi-queue priority scheduler (MQPRIO)"
|
||||
help
|
||||
Say Y here if you want to use the Multi-queue Priority scheduler.
|
||||
This scheduler allows QOS to be offloaded on NICs that have support
|
||||
for offloading QOS schedulers.
|
||||
|
||||
To compile this driver as a module, choose M here: the module will
|
||||
be called sch_mqprio.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_CHOKE
|
||||
tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
|
||||
help
|
||||
Say Y here if you want to use the CHOKe packet scheduler (CHOose
|
||||
and Keep for responsive flows, CHOose and Kill for unresponsive
|
||||
flows). This is a variation of RED which tries to penalize flows
|
||||
that monopolize the queue.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_choke.
|
||||
|
||||
config NET_SCH_QFQ
|
||||
tristate "Quick Fair Queueing scheduler (QFQ)"
|
||||
help
|
||||
Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
|
||||
packet scheduling algorithm.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_qfq.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_CODEL
|
||||
tristate "Controlled Delay AQM (CODEL)"
|
||||
help
|
||||
Say Y here if you want to use the Controlled Delay (CODEL)
|
||||
packet scheduling algorithm.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_codel.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_FQ_CODEL
|
||||
tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
|
||||
help
|
||||
Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
|
||||
packet scheduling algorithm.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_fq_codel.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_FQ
|
||||
tristate "Fair Queue"
|
||||
help
|
||||
Say Y here if you want to use the FQ packet scheduling algorithm.
|
||||
|
||||
FQ does flow separation, and is able to respect pacing requirements
|
||||
set by the TCP stack into sk->sk_pacing_rate (for locally generated
|
||||
traffic).
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_fq.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_HHF
|
||||
tristate "Heavy-Hitter Filter (HHF)"
|
||||
help
|
||||
Say Y here if you want to use the Heavy-Hitter Filter (HHF)
|
||||
packet scheduling algorithm.
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_hhf.
|
||||
|
||||
config NET_SCH_PIE
|
||||
tristate "Proportional Integral controller Enhanced (PIE) scheduler"
|
||||
help
|
||||
Say Y here if you want to use the Proportional Integral controller
|
||||
Enhanced scheduler packet scheduling algorithm.
|
||||
For more information, please see
|
||||
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
|
||||
|
||||
To compile this driver as a module, choose M here: the module
|
||||
will be called sch_pie.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config NET_SCH_INGRESS
|
||||
tristate "Ingress Qdisc"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here if you want to use classifiers for incoming packets.
|
||||
If unsure, say Y.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_ingress.
|
||||
|
||||
config NET_SCH_PLUG
|
||||
tristate "Plug network traffic until release (PLUG)"
|
||||
---help---
|
||||
|
||||
This queuing discipline allows userspace to plug/unplug a network
|
||||
output queue, using the netlink interface. When it receives an
|
||||
enqueue command it inserts a plug into the outbound queue that
|
||||
causes following packets to enqueue until a dequeue command arrives
|
||||
over netlink, causing the plug to be removed and resuming the normal
|
||||
packet flow.
|
||||
|
||||
This module also provides a generic "network output buffering"
|
||||
functionality (aka output commit), wherein upon arrival of a dequeue
|
||||
command, only packets up to the first plug are released for delivery.
|
||||
The Remus HA project uses this module to enable speculative execution
|
||||
of virtual machines by allowing the generated network output to be rolled
|
||||
back if needed.
|
||||
|
||||
For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
|
||||
|
||||
Say Y here if you are using this kernel for Xen dom0 and
|
||||
want to protect Xen guests with Remus.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called sch_plug.
|
||||
|
||||
comment "Classification"
|
||||
|
||||
config NET_CLS
|
||||
boolean
|
||||
|
||||
config NET_CLS_BASIC
|
||||
tristate "Elementary classification (BASIC)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets using
|
||||
only extended matches and actions.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_basic.
|
||||
|
||||
config NET_CLS_TCINDEX
|
||||
tristate "Traffic-Control Index (TCINDEX)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
traffic control indices. You will want this feature if you want
|
||||
to implement Differentiated Services together with DSMARK.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_tcindex.
|
||||
|
||||
config NET_CLS_ROUTE4
|
||||
tristate "Routing decision (ROUTE)"
|
||||
depends on INET
|
||||
select IP_ROUTE_CLASSID
|
||||
select NET_CLS
|
||||
---help---
|
||||
If you say Y here, you will be able to classify packets
|
||||
according to the route table entry they matched.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_route.
|
||||
|
||||
config NET_CLS_FW
|
||||
tristate "Netfilter mark (FW)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
If you say Y here, you will be able to classify packets
|
||||
according to netfilter/firewall marks.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_fw.
|
||||
|
||||
config NET_CLS_U32
|
||||
tristate "Universal 32bit comparisons w/ hashing (U32)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
Say Y here to be able to classify packets using a universal
|
||||
32bit piece-based comparison scheme.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_u32.
|
||||
|
||||
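The u32 classifier described above matches arbitrary 32-bit words of the packet against mask/value pairs at given offsets. A simplified illustration of that comparison follows; it is only a sketch of the matching idea, while the real cls_u32.c adds hash tables, handles, and per-key offsets into headers.

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /* One u32-style key: compare a masked 32-bit word at a byte offset. */
    struct u32_key {
        uint32_t val;    /* expected value, network byte order */
        uint32_t mask;   /* bits that participate in the comparison */
        int      off;    /* byte offset into the packet */
    };

    static bool u32_match(const struct u32_key *k,
                          const unsigned char *pkt, int pkt_len)
    {
        uint32_t word;

        if (k->off + 4 > pkt_len)
            return false;
        memcpy(&word, pkt + k->off, sizeof(word));
        return (word & k->mask) == k->val;
    }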
config CLS_U32_PERF
|
||||
bool "Performance counters support"
|
||||
depends on NET_CLS_U32
|
||||
---help---
|
||||
Say Y here to make u32 gather additional statistics useful for
|
||||
fine tuning u32 classifiers.
|
||||
|
||||
config CLS_U32_MARK
|
||||
bool "Netfilter marks support"
|
||||
depends on NET_CLS_U32
|
||||
---help---
|
||||
Say Y here to be able to use netfilter marks as u32 key.
|
||||
|
||||
config NET_CLS_RSVP
|
||||
tristate "IPv4 Resource Reservation Protocol (RSVP)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
The Resource Reservation Protocol (RSVP) permits end systems to
|
||||
request a minimum and maximum data flow rate for a connection; this
|
||||
is important for real time data such as streaming sound or video.
|
||||
|
||||
Say Y here if you want to be able to classify outgoing packets based
|
||||
on their RSVP requests.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_rsvp.
|
||||
|
||||
config NET_CLS_RSVP6
|
||||
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
|
||||
select NET_CLS
|
||||
---help---
|
||||
The Resource Reservation Protocol (RSVP) permits end systems to
|
||||
request a minimum and maximum data flow rate for a connection; this
|
||||
is important for real time data such as streaming sound or video.
|
||||
|
||||
Say Y here if you want to be able to classify outgoing packets based
|
||||
on their RSVP requests and you are using the IPv6 protocol.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_rsvp6.
|
||||
|
||||
config NET_CLS_FLOW
|
||||
tristate "Flow classifier"
|
||||
select NET_CLS
|
||||
---help---
|
||||
If you say Y here, you will be able to classify packets based on
|
||||
a configurable combination of packet keys. This is mostly useful
|
||||
in combination with SFQ.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_flow.
|
||||
|
||||
config NET_CLS_CGROUP
|
||||
tristate "Control Group Classifier"
|
||||
select NET_CLS
|
||||
select CGROUP_NET_CLASSID
|
||||
depends on CGROUPS
|
||||
---help---
|
||||
Say Y here if you want to classify packets based on the control
|
||||
cgroup of their process.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called cls_cgroup.
|
||||
|
||||
config NET_CLS_BPF
|
||||
tristate "BPF-based classifier"
|
||||
select NET_CLS
|
||||
---help---
|
||||
If you say Y here, you will be able to classify packets based on
|
||||
programmable BPF (JIT'ed) filters as an alternative to ematches.
|
||||
|
||||
To compile this code as a module, choose M here: the module will
|
||||
be called cls_bpf.
|
||||
|
||||
config NET_EMATCH
|
||||
bool "Extended Matches"
|
||||
select NET_CLS
|
||||
---help---
|
||||
Say Y here if you want to use extended matches on top of classifiers
|
||||
and select the extended matches below.
|
||||
|
||||
Extended matches are small classification helpers not worth writing
|
||||
a separate classifier for.
|
||||
|
||||
A recent version of the iproute2 package is required to use
|
||||
extended matches.
|
||||
|
||||
config NET_EMATCH_STACK
|
||||
int "Stack size"
|
||||
depends on NET_EMATCH
|
||||
default "32"
|
||||
---help---
|
||||
Size of the local stack variable used while evaluating the tree of
|
||||
ematches. Limits the depth of the tree, i.e. the number of
|
||||
encapsulated precedences. Every level requires 4 bytes of additional
|
||||
stack space.
|
||||
|
||||
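With the default of 32, the evaluation stack therefore occupies 32 * 4 = 128 bytes of local stack, and an ematch expression can nest at most 32 precedence levels deep.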
config NET_EMATCH_CMP
|
||||
tristate "Simple packet data comparison"
|
||||
depends on NET_EMATCH
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
simple packet data comparisons for 8, 16, and 32bit values.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_cmp.
|
||||
|
||||
config NET_EMATCH_NBYTE
|
||||
tristate "Multi byte comparison"
|
||||
depends on NET_EMATCH
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
multiple byte comparisons mainly useful for IPv6 address comparisons.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_nbyte.
|
||||
|
||||
config NET_EMATCH_U32
|
||||
tristate "U32 key"
|
||||
depends on NET_EMATCH
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets using
|
||||
the famous u32 key in combination with logic relations.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_u32.
|
||||
|
||||
config NET_EMATCH_META
|
||||
tristate "Metadata"
|
||||
depends on NET_EMATCH
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
metadata such as load average, netfilter attributes, socket
|
||||
attributes and routing decisions.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_meta.
|
||||
|
||||
config NET_EMATCH_TEXT
|
||||
tristate "Textsearch"
|
||||
depends on NET_EMATCH
|
||||
select TEXTSEARCH
|
||||
select TEXTSEARCH_KMP
|
||||
select TEXTSEARCH_BM
|
||||
select TEXTSEARCH_FSM
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
textsearch comparisons.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_text.
|
||||
|
||||
config NET_EMATCH_CANID
|
||||
tristate "CAN Identifier"
|
||||
depends on NET_EMATCH && (CAN=y || CAN=m)
|
||||
---help---
|
||||
Say Y here if you want to be able to classify CAN frames based
|
||||
on CAN Identifier.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_canid.
|
||||
|
||||
config NET_EMATCH_IPSET
|
||||
tristate "IPset"
|
||||
depends on NET_EMATCH && IP_SET
|
||||
---help---
|
||||
Say Y here if you want to be able to classify packets based on
|
||||
ipset membership.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called em_ipset.
|
||||
|
||||
config NET_CLS_ACT
|
||||
bool "Actions"
|
||||
---help---
|
||||
Say Y here if you want to use traffic control actions. Actions
|
||||
get attached to classifiers and are invoked after a successful
|
||||
classification. They are used to overwrite the classification
|
||||
result, instantly drop or redirect packets, etc.
|
||||
|
||||
A recent version of the iproute2 package is required to use
|
||||
extended matches.
|
||||
|
||||
config NET_ACT_POLICE
|
||||
tristate "Traffic Policing"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here if you want to do traffic policing, i.e. strict
|
||||
bandwidth limiting. This action replaces the existing policing
|
||||
module.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_police.
|
||||
|
||||
config NET_ACT_GACT
|
||||
tristate "Generic actions"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here to take generic actions such as dropping and
|
||||
accepting packets.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_gact.
|
||||
|
||||
config GACT_PROB
|
||||
bool "Probability support"
|
||||
depends on NET_ACT_GACT
|
||||
---help---
|
||||
Say Y here to use the generic action randomly or deterministically.
|
||||
|
||||
config NET_ACT_MIRRED
|
||||
tristate "Redirecting and Mirroring"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here to allow packets to be mirrored or redirected to
|
||||
other devices.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_mirred.
|
||||
|
||||
config NET_ACT_IPT
|
||||
tristate "IPtables targets"
|
||||
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
|
||||
---help---
|
||||
Say Y here to be able to invoke iptables targets after successful
|
||||
classification.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_ipt.
|
||||
|
||||
config NET_ACT_NAT
|
||||
tristate "Stateless NAT"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here to do stateless NAT on IPv4 packets. You should use
|
||||
netfilter for NAT unless you know what you are doing.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_nat.
|
||||
|
||||
config NET_ACT_PEDIT
|
||||
tristate "Packet Editing"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here if you want to mangle the content of packets.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_pedit.
|
||||
|
||||
config NET_ACT_SIMP
|
||||
tristate "Simple Example (Debug)"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here to add a simple action for demonstration purposes.
|
||||
It is meant as an example and for debugging purposes. It will
|
||||
print a configured policy string followed by the packet count
|
||||
to the console for every packet that passes by.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_simple.
|
||||
|
||||
config NET_ACT_SKBEDIT
|
||||
tristate "SKB Editing"
|
||||
depends on NET_CLS_ACT
|
||||
---help---
|
||||
Say Y here to change skb priority or queue_mapping settings.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_skbedit.
|
||||
|
||||
config NET_ACT_CSUM
|
||||
tristate "Checksum Updating"
|
||||
depends on NET_CLS_ACT && INET
|
||||
---help---
|
||||
Say Y here to update some common checksum after some direct
|
||||
packet alterations.
|
||||
|
||||
To compile this code as a module, choose M here: the
|
||||
module will be called act_csum.
|
||||
|
||||
config NET_CLS_IND
|
||||
bool "Incoming device classification"
|
||||
depends on NET_CLS_U32 || NET_CLS_FW
|
||||
---help---
|
||||
Say Y here to extend the u32 and fw classifier to support
|
||||
classification based on the incoming device. This option is
|
||||
likely to disappear in favour of the metadata ematch.
|
||||
|
||||
endif # NET_SCHED
|
||||
|
||||
config NET_SCH_FIFO
|
||||
bool
|
||||
net/sched/Makefile (new file, 63 lines)
@@ -0,0 +1,63 @@
#
|
||||
# Makefile for the Linux Traffic Control Unit.
|
||||
#
|
||||
|
||||
obj-y := sch_generic.o sch_mq.o
|
||||
|
||||
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
|
||||
obj-$(CONFIG_NET_CLS) += cls_api.o
|
||||
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
|
||||
obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
|
||||
obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
|
||||
obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
|
||||
obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
|
||||
obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
|
||||
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
|
||||
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
|
||||
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
|
||||
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
|
||||
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
|
||||
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
|
||||
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
|
||||
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
|
||||
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
|
||||
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
|
||||
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
|
||||
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
|
||||
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
|
||||
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
|
||||
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
|
||||
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
|
||||
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
|
||||
obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
|
||||
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
|
||||
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
|
||||
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
|
||||
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
|
||||
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
|
||||
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
|
||||
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
|
||||
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
|
||||
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
|
||||
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
|
||||
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
|
||||
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
|
||||
|
||||
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
|
||||
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
|
||||
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
|
||||
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
|
||||
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
|
||||
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
|
||||
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
|
||||
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
|
||||
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
|
||||
obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
|
||||
obj-$(CONFIG_NET_EMATCH) += ematch.o
|
||||
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
|
||||
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
|
||||
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
|
||||
obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
|
||||
obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
|
||||
obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o
|
||||
obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o
|
||||
net/sched/act_api.c (new file, 1094 lines): diff suppressed because it is too large.

net/sched/act_csum.c (new file, 584 lines)
@@ -0,0 +1,584 @@
/*
|
||||
* Checksum updating actions
|
||||
*
|
||||
* Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/spinlock.h>
|
||||
|
||||
#include <linux/netlink.h>
|
||||
#include <net/netlink.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
|
||||
#include <linux/skbuff.h>
|
||||
|
||||
#include <net/ip.h>
|
||||
#include <net/ipv6.h>
|
||||
#include <net/icmp.h>
|
||||
#include <linux/icmpv6.h>
|
||||
#include <linux/igmp.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
#include <net/ip6_checksum.h>
|
||||
|
||||
#include <net/act_api.h>
|
||||
|
||||
#include <linux/tc_act/tc_csum.h>
|
||||
#include <net/tc_act/tc_csum.h>
|
||||
|
||||
#define CSUM_TAB_MASK 15
|
||||
|
||||
static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
|
||||
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
|
||||
};
|
||||
|
||||
static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
|
||||
struct tc_action *a, int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_CSUM_MAX + 1];
|
||||
struct tc_csum *parm;
|
||||
struct tcf_csum *p;
|
||||
int ret = 0, err;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_CSUM_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
parm = nla_data(tb[TCA_CSUM_PARMS]);
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
if (bind) /* don't override defaults */
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
p = to_tcf_csum(a);
|
||||
spin_lock_bh(&p->tcf_lock);
|
||||
p->tcf_action = parm->action;
|
||||
p->update_flags = parm->update_flags;
|
||||
spin_unlock_bh(&p->tcf_lock);
|
||||
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcf_csum_skb_nextlayer - Get next layer pointer
|
||||
* @skb: sk_buff to use
|
||||
* @ihl: previous summed headers length
|
||||
* @ipl: complete packet length
|
||||
* @jhl: next header length
|
||||
*
|
||||
* Check the expected next layer availability in the specified sk_buff.
|
||||
* Return the next layer pointer if pass, NULL otherwise.
|
||||
*/
|
||||
static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl,
|
||||
unsigned int jhl)
|
||||
{
|
||||
int ntkoff = skb_network_offset(skb);
|
||||
int hl = ihl + jhl;
|
||||
|
||||
if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
|
||||
(skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, hl + ntkoff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
|
||||
return NULL;
|
||||
else
|
||||
return (void *)(skb_network_header(skb) + ihl);
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl)
|
||||
{
|
||||
struct icmphdr *icmph;
|
||||
|
||||
icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
|
||||
if (icmph == NULL)
|
||||
return 0;
|
||||
|
||||
icmph->checksum = 0;
|
||||
skb->csum = csum_partial(icmph, ipl - ihl, 0);
|
||||
icmph->checksum = csum_fold(skb->csum);
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl)
|
||||
{
|
||||
struct igmphdr *igmph;
|
||||
|
||||
igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
|
||||
if (igmph == NULL)
|
||||
return 0;
|
||||
|
||||
igmph->csum = 0;
|
||||
skb->csum = csum_partial(igmph, ipl - ihl, 0);
|
||||
igmph->csum = csum_fold(skb->csum);
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl)
|
||||
{
|
||||
struct icmp6hdr *icmp6h;
|
||||
const struct ipv6hdr *ip6h;
|
||||
|
||||
icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
|
||||
if (icmp6h == NULL)
|
||||
return 0;
|
||||
|
||||
ip6h = ipv6_hdr(skb);
|
||||
icmp6h->icmp6_cksum = 0;
|
||||
skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
|
||||
icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
|
||||
ipl - ihl, IPPROTO_ICMPV6,
|
||||
skb->csum);
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl)
|
||||
{
|
||||
struct tcphdr *tcph;
|
||||
const struct iphdr *iph;
|
||||
|
||||
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
|
||||
if (tcph == NULL)
|
||||
return 0;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
tcph->check = 0;
|
||||
skb->csum = csum_partial(tcph, ipl - ihl, 0);
|
||||
tcph->check = tcp_v4_check(ipl - ihl,
|
||||
iph->saddr, iph->daddr, skb->csum);
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl)
|
||||
{
|
||||
struct tcphdr *tcph;
|
||||
const struct ipv6hdr *ip6h;
|
||||
|
||||
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
|
||||
if (tcph == NULL)
|
||||
return 0;
|
||||
|
||||
ip6h = ipv6_hdr(skb);
|
||||
tcph->check = 0;
|
||||
skb->csum = csum_partial(tcph, ipl - ihl, 0);
|
||||
tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
|
||||
ipl - ihl, IPPROTO_TCP,
|
||||
skb->csum);
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv4_udp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl, int udplite)
|
||||
{
|
||||
struct udphdr *udph;
|
||||
const struct iphdr *iph;
|
||||
u16 ul;
|
||||
|
||||
/*
|
||||
* Support both UDP and UDPLITE checksum algorithms. Don't use
|
||||
* udph->len to get the real length without any protocol check,
|
||||
* UDPLITE uses udph->len for another thing,
|
||||
* Use iph->tot_len, or just ipl.
|
||||
*/
|
||||
|
||||
udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
|
||||
if (udph == NULL)
|
||||
return 0;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
ul = ntohs(udph->len);
|
||||
|
||||
if (udplite || udph->check) {
|
||||
|
||||
udph->check = 0;
|
||||
|
||||
if (udplite) {
|
||||
if (ul == 0)
|
||||
skb->csum = csum_partial(udph, ipl - ihl, 0);
|
||||
else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
|
||||
skb->csum = csum_partial(udph, ul, 0);
|
||||
else
|
||||
goto ignore_obscure_skb;
|
||||
} else {
|
||||
if (ul != ipl - ihl)
|
||||
goto ignore_obscure_skb;
|
||||
|
||||
skb->csum = csum_partial(udph, ul, 0);
|
||||
}
|
||||
|
||||
udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
|
||||
ul, iph->protocol,
|
||||
skb->csum);
|
||||
|
||||
if (!udph->check)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
}
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
ignore_obscure_skb:
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv6_udp(struct sk_buff *skb,
|
||||
unsigned int ihl, unsigned int ipl, int udplite)
|
||||
{
|
||||
struct udphdr *udph;
|
||||
const struct ipv6hdr *ip6h;
|
||||
u16 ul;
|
||||
|
||||
/*
|
||||
* Support both UDP and UDPLITE checksum algorithms. Don't use
|
||||
* udph->len to get the real length without any protocol check,
|
||||
* UDPLITE uses udph->len for another thing,
|
||||
* Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
|
||||
*/
|
||||
|
||||
udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
|
||||
if (udph == NULL)
|
||||
return 0;
|
||||
|
||||
ip6h = ipv6_hdr(skb);
|
||||
ul = ntohs(udph->len);
|
||||
|
||||
udph->check = 0;
|
||||
|
||||
if (udplite) {
|
||||
if (ul == 0)
|
||||
skb->csum = csum_partial(udph, ipl - ihl, 0);
|
||||
|
||||
else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
|
||||
skb->csum = csum_partial(udph, ul, 0);
|
||||
|
||||
else
|
||||
goto ignore_obscure_skb;
|
||||
} else {
|
||||
if (ul != ipl - ihl)
|
||||
goto ignore_obscure_skb;
|
||||
|
||||
skb->csum = csum_partial(udph, ul, 0);
|
||||
}
|
||||
|
||||
udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
|
||||
udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
|
||||
skb->csum);
|
||||
|
||||
if (!udph->check)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
|
||||
skb->ip_summed = CHECKSUM_NONE;
|
||||
|
||||
ignore_obscure_skb:
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
|
||||
{
|
||||
const struct iphdr *iph;
|
||||
int ntkoff;
|
||||
|
||||
ntkoff = skb_network_offset(skb);
|
||||
|
||||
if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
|
||||
goto fail;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
|
||||
switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
|
||||
case IPPROTO_ICMP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
|
||||
if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
|
||||
ntohs(iph->tot_len)))
|
||||
goto fail;
|
||||
break;
|
||||
case IPPROTO_IGMP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
|
||||
if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
|
||||
ntohs(iph->tot_len)))
|
||||
goto fail;
|
||||
break;
|
||||
case IPPROTO_TCP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
|
||||
if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
|
||||
ntohs(iph->tot_len)))
|
||||
goto fail;
|
||||
break;
|
||||
case IPPROTO_UDP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
|
||||
if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
|
||||
ntohs(iph->tot_len), 0))
|
||||
goto fail;
|
||||
break;
|
||||
case IPPROTO_UDPLITE:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
|
||||
if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
|
||||
ntohs(iph->tot_len), 1))
|
||||
goto fail;
|
||||
break;
|
||||
}
|
||||
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
|
||||
if (skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
|
||||
goto fail;
|
||||
|
||||
ip_send_check(ip_hdr(skb));
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
fail:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
|
||||
unsigned int ixhl, unsigned int *pl)
|
||||
{
|
||||
int off, len, optlen;
|
||||
unsigned char *xh = (void *)ip6xh;
|
||||
|
||||
off = sizeof(*ip6xh);
|
||||
len = ixhl - off;
|
||||
|
||||
while (len > 1) {
|
||||
switch (xh[off]) {
|
||||
case IPV6_TLV_PAD1:
|
||||
optlen = 1;
|
||||
break;
|
||||
case IPV6_TLV_JUMBO:
|
||||
optlen = xh[off + 1] + 2;
|
||||
if (optlen != 6 || len < 6 || (off & 3) != 2)
|
||||
/* wrong jumbo option length/alignment */
|
||||
return 0;
|
||||
*pl = ntohl(*(__be32 *)(xh + off + 2));
|
||||
goto done;
|
||||
default:
|
||||
optlen = xh[off + 1] + 2;
|
||||
if (optlen > len)
|
||||
/* ignore obscure options */
|
||||
goto done;
|
||||
break;
|
||||
}
|
||||
off += optlen;
|
||||
len -= optlen;
|
||||
}
|
||||
|
||||
done:
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
|
||||
{
|
||||
struct ipv6hdr *ip6h;
|
||||
struct ipv6_opt_hdr *ip6xh;
|
||||
unsigned int hl, ixhl;
|
||||
unsigned int pl;
|
||||
int ntkoff;
|
||||
u8 nexthdr;
|
||||
|
||||
ntkoff = skb_network_offset(skb);
|
||||
|
||||
hl = sizeof(*ip6h);
|
||||
|
||||
if (!pskb_may_pull(skb, hl + ntkoff))
|
||||
goto fail;
|
||||
|
||||
ip6h = ipv6_hdr(skb);
|
||||
|
||||
pl = ntohs(ip6h->payload_len);
|
||||
nexthdr = ip6h->nexthdr;
|
||||
|
||||
do {
|
||||
switch (nexthdr) {
|
||||
case NEXTHDR_FRAGMENT:
|
||||
goto ignore_skb;
|
||||
case NEXTHDR_ROUTING:
|
||||
case NEXTHDR_HOP:
|
||||
case NEXTHDR_DEST:
|
||||
if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
|
||||
goto fail;
|
||||
ip6xh = (void *)(skb_network_header(skb) + hl);
|
||||
ixhl = ipv6_optlen(ip6xh);
|
||||
if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
|
||||
goto fail;
|
||||
ip6xh = (void *)(skb_network_header(skb) + hl);
|
||||
if ((nexthdr == NEXTHDR_HOP) &&
|
||||
!(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
|
||||
goto fail;
|
||||
nexthdr = ip6xh->nexthdr;
|
||||
hl += ixhl;
|
||||
break;
|
||||
case IPPROTO_ICMPV6:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
|
||||
if (!tcf_csum_ipv6_icmp(skb,
|
||||
hl, pl + sizeof(*ip6h)))
|
||||
goto fail;
|
||||
goto done;
|
||||
case IPPROTO_TCP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
|
||||
if (!tcf_csum_ipv6_tcp(skb,
|
||||
hl, pl + sizeof(*ip6h)))
|
||||
goto fail;
|
||||
goto done;
|
||||
case IPPROTO_UDP:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
|
||||
if (!tcf_csum_ipv6_udp(skb, hl,
|
||||
pl + sizeof(*ip6h), 0))
|
||||
goto fail;
|
||||
goto done;
|
||||
case IPPROTO_UDPLITE:
|
||||
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
|
||||
if (!tcf_csum_ipv6_udp(skb, hl,
|
||||
pl + sizeof(*ip6h), 1))
|
||||
goto fail;
|
||||
goto done;
|
||||
default:
|
||||
goto ignore_skb;
|
||||
}
|
||||
} while (pskb_may_pull(skb, hl + 1 + ntkoff));
|
||||
|
||||
done:
|
||||
ignore_skb:
|
||||
return 1;
|
||||
|
||||
fail:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tcf_csum(struct sk_buff *skb,
|
||||
const struct tc_action *a, struct tcf_result *res)
|
||||
{
|
||||
struct tcf_csum *p = a->priv;
|
||||
int action;
|
||||
u32 update_flags;
|
||||
|
||||
spin_lock(&p->tcf_lock);
|
||||
p->tcf_tm.lastuse = jiffies;
|
||||
bstats_update(&p->tcf_bstats, skb);
|
||||
action = p->tcf_action;
|
||||
update_flags = p->update_flags;
|
||||
spin_unlock(&p->tcf_lock);
|
||||
|
||||
if (unlikely(action == TC_ACT_SHOT))
|
||||
goto drop;
|
||||
|
||||
switch (skb->protocol) {
|
||||
case cpu_to_be16(ETH_P_IP):
|
||||
if (!tcf_csum_ipv4(skb, update_flags))
|
||||
goto drop;
|
||||
break;
|
||||
case cpu_to_be16(ETH_P_IPV6):
|
||||
if (!tcf_csum_ipv6(skb, update_flags))
|
||||
goto drop;
|
||||
break;
|
||||
}
|
||||
|
||||
return action;
|
||||
|
||||
drop:
|
||||
spin_lock(&p->tcf_lock);
|
||||
p->tcf_qstats.drops++;
|
||||
spin_unlock(&p->tcf_lock);
|
||||
return TC_ACT_SHOT;
|
||||
}
|
||||
|
||||
static int tcf_csum_dump(struct sk_buff *skb,
|
||||
struct tc_action *a, int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_csum *p = a->priv;
|
||||
struct tc_csum opt = {
|
||||
.update_flags = p->update_flags,
|
||||
.index = p->tcf_index,
|
||||
.action = p->tcf_action,
|
||||
.refcnt = p->tcf_refcnt - ref,
|
||||
.bindcnt = p->tcf_bindcnt - bind,
|
||||
};
|
||||
struct tcf_t t;
|
||||
|
||||
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_csum_ops = {
|
||||
.kind = "csum",
|
||||
.type = TCA_ACT_CSUM,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_csum,
|
||||
.dump = tcf_csum_dump,
|
||||
.init = tcf_csum_init,
|
||||
};
|
||||
|
||||
MODULE_DESCRIPTION("Checksum updating actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init csum_init_module(void)
|
||||
{
|
||||
return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit csum_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_csum_ops);
|
||||
}
|
||||
|
||||
module_init(csum_init_module);
|
||||
module_exit(csum_cleanup_module);
|
||||
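act_csum.c above rebuilds Internet checksums after packet edits using the kernel helpers csum_partial() and csum_fold(). As a hedged userspace illustration of what those helpers compute together, the standard RFC 1071 one's-complement sum folded to 16 bits, and not the kernel code itself:

    #include <stdint.h>
    #include <stddef.h>

    /* One's-complement sum over a buffer, folded to 16 bits (RFC 1071).
     * The kernel splits this into csum_partial() and csum_fold(); byte-order
     * handling when storing the result into a header is omitted here. */
    static uint16_t inet_checksum(const void *data, size_t len)
    {
        const uint8_t *p = data;
        uint32_t sum = 0;

        while (len > 1) {
            sum += (uint32_t)p[0] << 8 | p[1];
            p += 2;
            len -= 2;
        }
        if (len)                       /* odd trailing byte */
            sum += (uint32_t)p[0] << 8;

        while (sum >> 16)              /* fold carries back in */
            sum = (sum & 0xffff) + (sum >> 16);

        return (uint16_t)~sum;         /* checksum is the complement */
    }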
net/sched/act_gact.c (new file, 209 lines)
@@ -0,0 +1,209 @@
/*
|
||||
* net/sched/gact.c Generic actions
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* copyright Jamal Hadi Salim (2002-4)
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <linux/tc_act/tc_gact.h>
|
||||
#include <net/tc_act/tc_gact.h>
|
||||
|
||||
#define GACT_TAB_MASK 15
|
||||
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
static int gact_net_rand(struct tcf_gact *gact)
|
||||
{
|
||||
if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
|
||||
return gact->tcf_action;
|
||||
return gact->tcfg_paction;
|
||||
}
|
||||
|
||||
static int gact_determ(struct tcf_gact *gact)
|
||||
{
|
||||
if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
|
||||
return gact->tcf_action;
|
||||
return gact->tcfg_paction;
|
||||
}
|
||||
|
||||
typedef int (*g_rand)(struct tcf_gact *gact);
|
||||
static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
|
||||
#endif /* CONFIG_GACT_PROB */
|
||||
|
||||
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
|
||||
[TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) },
|
||||
[TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
|
||||
};
|
||||
|
||||
static int tcf_gact_init(struct net *net, struct nlattr *nla,
|
||||
struct nlattr *est, struct tc_action *a,
|
||||
int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_GACT_MAX + 1];
|
||||
struct tc_gact *parm;
|
||||
struct tcf_gact *gact;
|
||||
int ret = 0;
|
||||
int err;
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
struct tc_gact_p *p_parm = NULL;
|
||||
#endif
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_GACT_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
parm = nla_data(tb[TCA_GACT_PARMS]);
|
||||
|
||||
#ifndef CONFIG_GACT_PROB
|
||||
if (tb[TCA_GACT_PROB] != NULL)
|
||||
return -EOPNOTSUPP;
|
||||
#else
|
||||
if (tb[TCA_GACT_PROB]) {
|
||||
p_parm = nla_data(tb[TCA_GACT_PROB]);
|
||||
if (p_parm->ptype >= MAX_RAND)
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
if (bind) /* don't override defaults */
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
gact = to_gact(a);
|
||||
|
||||
spin_lock_bh(&gact->tcf_lock);
|
||||
gact->tcf_action = parm->action;
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
if (p_parm) {
|
||||
gact->tcfg_paction = p_parm->paction;
|
||||
gact->tcfg_pval = p_parm->pval;
|
||||
gact->tcfg_ptype = p_parm->ptype;
|
||||
}
|
||||
#endif
|
||||
spin_unlock_bh(&gact->tcf_lock);
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_gact *gact = a->priv;
|
||||
int action = TC_ACT_SHOT;
|
||||
|
||||
spin_lock(&gact->tcf_lock);
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
if (gact->tcfg_ptype)
|
||||
action = gact_rand[gact->tcfg_ptype](gact);
|
||||
else
|
||||
action = gact->tcf_action;
|
||||
#else
|
||||
action = gact->tcf_action;
|
||||
#endif
|
||||
gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
|
||||
gact->tcf_bstats.packets++;
|
||||
if (action == TC_ACT_SHOT)
|
||||
gact->tcf_qstats.drops++;
|
||||
gact->tcf_tm.lastuse = jiffies;
|
||||
spin_unlock(&gact->tcf_lock);
|
||||
|
||||
return action;
|
||||
}
|
||||
|
||||
static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_gact *gact = a->priv;
|
||||
struct tc_gact opt = {
|
||||
.index = gact->tcf_index,
|
||||
.refcnt = gact->tcf_refcnt - ref,
|
||||
.bindcnt = gact->tcf_bindcnt - bind,
|
||||
.action = gact->tcf_action,
|
||||
};
|
||||
struct tcf_t t;
|
||||
|
||||
if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
if (gact->tcfg_ptype) {
|
||||
struct tc_gact_p p_opt = {
|
||||
.paction = gact->tcfg_paction,
|
||||
.pval = gact->tcfg_pval,
|
||||
.ptype = gact->tcfg_ptype,
|
||||
};
|
||||
|
||||
if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
#endif
|
||||
t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_gact_ops = {
|
||||
.kind = "gact",
|
||||
.type = TCA_ACT_GACT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_gact,
|
||||
.dump = tcf_gact_dump,
|
||||
.init = tcf_gact_init,
|
||||
};
|
||||
|
||||
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
|
||||
MODULE_DESCRIPTION("Generic Classifier actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init gact_init_module(void)
|
||||
{
|
||||
#ifdef CONFIG_GACT_PROB
|
||||
pr_info("GACT probability on\n");
|
||||
#else
|
||||
pr_info("GACT probability NOT on\n");
|
||||
#endif
|
||||
return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit gact_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_gact_ops);
|
||||
}
|
||||
|
||||
module_init(gact_init_module);
|
||||
module_exit(gact_cleanup_module);
|
||||
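In act_gact.c above, gact_net_rand() returns the alternate action tcfg_paction roughly once every tcfg_pval packets (whenever prandom_u32() % pval is zero), while gact_determ() makes the same choice deterministically from the packet counter. A small userspace sketch of that selection logic follows, under the assumption that rand() stands in for prandom_u32() and that the action constants are purely illustrative.

    #include <stdlib.h>

    #define TC_ACT_OK   0
    #define TC_ACT_SHOT 2   /* illustrative values only */

    /* Pick 'paction' once in every 'pval' calls on average, otherwise
     * 'action'; mirrors the structure of gact_net_rand() above. */
    static int gact_pick_random(int action, int paction, unsigned int pval)
    {
        if (!pval || rand() % pval)
            return action;
        return paction;
    }

    /* Deterministic variant: every pval-th packet gets 'paction',
     * like gact_determ() keyed on the packet counter. */
    static int gact_pick_every_nth(int action, int paction,
                                   unsigned int pval, unsigned long packets)
    {
        if (!pval || packets % pval)
            return action;
        return paction;
    }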
net/sched/act_ipt.c (new file, 311 lines)
@@ -0,0 +1,311 @@
/*
|
||||
* net/sched/ipt.c iptables target interface
|
||||
*
|
||||
*TODO: Add other tables. For now we only support the ipv4 table targets
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Copyright: Jamal Hadi Salim (2002-13)
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <linux/tc_act/tc_ipt.h>
|
||||
#include <net/tc_act/tc_ipt.h>
|
||||
|
||||
#include <linux/netfilter_ipv4/ip_tables.h>
|
||||
|
||||
|
||||
#define IPT_TAB_MASK 15
|
||||
|
||||
static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
|
||||
{
|
||||
struct xt_tgchk_param par;
|
||||
struct xt_target *target;
|
||||
int ret = 0;
|
||||
|
||||
target = xt_request_find_target(AF_INET, t->u.user.name,
|
||||
t->u.user.revision);
|
||||
if (IS_ERR(target))
|
||||
return PTR_ERR(target);
|
||||
|
||||
t->u.kernel.target = target;
|
||||
par.table = table;
|
||||
par.entryinfo = NULL;
|
||||
par.target = target;
|
||||
par.targinfo = t->data;
|
||||
par.hook_mask = hook;
|
||||
par.family = NFPROTO_IPV4;
|
||||
|
||||
ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
|
||||
if (ret < 0) {
|
||||
module_put(t->u.kernel.target->me);
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ipt_destroy_target(struct xt_entry_target *t)
|
||||
{
|
||||
struct xt_tgdtor_param par = {
|
||||
.target = t->u.kernel.target,
|
||||
.targinfo = t->data,
|
||||
};
|
||||
if (par.target->destroy != NULL)
|
||||
par.target->destroy(&par);
|
||||
module_put(par.target->me);
|
||||
}
|
||||
|
||||
static void tcf_ipt_release(struct tc_action *a, int bind)
|
||||
{
|
||||
struct tcf_ipt *ipt = to_ipt(a);
|
||||
ipt_destroy_target(ipt->tcfi_t);
|
||||
kfree(ipt->tcfi_tname);
|
||||
kfree(ipt->tcfi_t);
|
||||
}
|
||||
|
||||
static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
|
||||
[TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
|
||||
[TCA_IPT_HOOK] = { .type = NLA_U32 },
|
||||
[TCA_IPT_INDEX] = { .type = NLA_U32 },
|
||||
[TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
|
||||
};
|
||||
|
||||
static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
|
||||
struct tc_action *a, int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_IPT_MAX + 1];
|
||||
struct tcf_ipt *ipt;
|
||||
struct xt_entry_target *td, *t;
|
||||
char *tname;
|
||||
int ret = 0, err;
|
||||
u32 hook = 0;
|
||||
u32 index = 0;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_IPT_HOOK] == NULL)
|
||||
return -EINVAL;
|
||||
if (tb[TCA_IPT_TARG] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
|
||||
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
|
||||
return -EINVAL;
|
||||
|
||||
if (tb[TCA_IPT_INDEX] != NULL)
|
||||
index = nla_get_u32(tb[TCA_IPT_INDEX]);
|
||||
|
||||
if (!tcf_hash_check(index, a, bind)) {
|
||||
ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
if (bind) /* don't override defaults */
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
}
|
||||
ipt = to_ipt(a);
|
||||
|
||||
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
|
||||
|
||||
err = -ENOMEM;
|
||||
tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
|
||||
if (unlikely(!tname))
|
||||
goto err1;
|
||||
if (tb[TCA_IPT_TABLE] == NULL ||
|
||||
nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
|
||||
strcpy(tname, "mangle");
|
||||
|
||||
t = kmemdup(td, td->u.target_size, GFP_KERNEL);
|
||||
if (unlikely(!t))
|
||||
goto err2;
|
||||
|
||||
err = ipt_init_target(t, tname, hook);
|
||||
if (err < 0)
|
||||
goto err3;
|
||||
|
||||
spin_lock_bh(&ipt->tcf_lock);
|
||||
if (ret != ACT_P_CREATED) {
|
||||
ipt_destroy_target(ipt->tcfi_t);
|
||||
kfree(ipt->tcfi_tname);
|
||||
kfree(ipt->tcfi_t);
|
||||
}
|
||||
ipt->tcfi_tname = tname;
|
||||
ipt->tcfi_t = t;
|
||||
ipt->tcfi_hook = hook;
|
||||
spin_unlock_bh(&ipt->tcf_lock);
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
return ret;
|
||||
|
||||
err3:
|
||||
kfree(t);
|
||||
err2:
|
||||
kfree(tname);
|
||||
err1:
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_cleanup(a, est);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
int ret = 0, result = 0;
|
||||
struct tcf_ipt *ipt = a->priv;
|
||||
struct xt_action_param par;
|
||||
|
||||
if (skb_unclone(skb, GFP_ATOMIC))
|
||||
return TC_ACT_UNSPEC;
|
||||
|
||||
spin_lock(&ipt->tcf_lock);
|
||||
|
||||
ipt->tcf_tm.lastuse = jiffies;
|
||||
bstats_update(&ipt->tcf_bstats, skb);
|
||||
|
||||
/* yes, we have to worry about both in and out dev
|
||||
* worry later - danger - this API seems to have changed
|
||||
* from earlier kernels
|
||||
*/
|
||||
par.in = skb->dev;
|
||||
par.out = NULL;
|
||||
par.hooknum = ipt->tcfi_hook;
|
||||
par.target = ipt->tcfi_t->u.kernel.target;
|
||||
par.targinfo = ipt->tcfi_t->data;
|
||||
ret = par.target->target(skb, &par);
|
||||
|
||||
switch (ret) {
|
||||
case NF_ACCEPT:
|
||||
result = TC_ACT_OK;
|
||||
break;
|
||||
case NF_DROP:
|
||||
result = TC_ACT_SHOT;
|
||||
ipt->tcf_qstats.drops++;
|
||||
break;
|
||||
case XT_CONTINUE:
|
||||
result = TC_ACT_PIPE;
|
||||
break;
|
||||
default:
|
||||
net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
|
||||
ret);
|
||||
result = TC_POLICE_OK;
|
||||
break;
|
||||
}
|
||||
spin_unlock(&ipt->tcf_lock);
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_ipt *ipt = a->priv;
|
||||
struct xt_entry_target *t;
|
||||
struct tcf_t tm;
|
||||
struct tc_cnt c;
|
||||
|
||||
/* for simple targets kernel size == user size
|
||||
* user name = target name
|
||||
* for foolproof you need to not assume this
|
||||
*/
|
||||
|
||||
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
|
||||
if (unlikely(!t))
|
||||
goto nla_put_failure;
|
||||
|
||||
c.bindcnt = ipt->tcf_bindcnt - bind;
|
||||
c.refcnt = ipt->tcf_refcnt - ref;
|
||||
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
|
||||
|
||||
if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
|
||||
nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
|
||||
nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
|
||||
nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
|
||||
nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
|
||||
goto nla_put_failure;
|
||||
tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
|
||||
tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
|
||||
tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
|
||||
goto nla_put_failure;
|
||||
kfree(t);
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
kfree(t);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_ipt_ops = {
|
||||
.kind = "ipt",
|
||||
.type = TCA_ACT_IPT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_ipt,
|
||||
.dump = tcf_ipt_dump,
|
||||
.cleanup = tcf_ipt_release,
|
||||
.init = tcf_ipt_init,
|
||||
};
|
||||
|
||||
static struct tc_action_ops act_xt_ops = {
|
||||
.kind = "xt",
|
||||
.type = TCA_ACT_XT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_ipt,
|
||||
.dump = tcf_ipt_dump,
|
||||
.cleanup = tcf_ipt_release,
|
||||
.init = tcf_ipt_init,
|
||||
};
|
||||
|
||||
MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
|
||||
MODULE_DESCRIPTION("Iptables target actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_ALIAS("act_xt");
|
||||
|
||||
static int __init ipt_init_module(void)
|
||||
{
|
||||
int ret1, ret2;
|
||||
|
||||
ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
|
||||
if (ret1 < 0)
|
||||
printk("Failed to load xt action\n");
|
||||
ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
|
||||
if (ret2 < 0)
|
||||
printk("Failed to load ipt action\n");
|
||||
|
||||
if (ret1 < 0 && ret2 < 0) {
|
||||
return ret1;
|
||||
} else
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void __exit ipt_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_xt_ops);
|
||||
tcf_unregister_action(&act_ipt_ops);
|
||||
}
|
||||
|
||||
module_init(ipt_init_module);
|
||||
module_exit(ipt_cleanup_module);
|
||||
292
net/sched/act_mirred.c
Normal file
@@ -0,0 +1,292 @@
/*
 * net/sched/mirred.c	packet mirroring and redirect actions
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Jamal Hadi Salim (2002-4)
 *
 * TODO: Add ingress support (and socket redirect support)
 *
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>

#include <linux/if_arp.h>

#define MIRRED_TAB_MASK     7
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);

static void tcf_mirred_release(struct tc_action *a, int bind)
{
	struct tcf_mirred *m = to_mirred(a);

	/* We could be called either in a RCU callback or with RTNL lock held. */
	spin_lock_bh(&mirred_list_lock);
	list_del(&m->tcfm_list);
	spin_unlock_bh(&mirred_list_lock);
	if (m->tcfm_dev)
		dev_put(m->tcfm_dev);
}

static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
	[TCA_MIRRED_PARMS]	= { .len = sizeof(struct tc_mirred) },
};

static int tcf_mirred_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action *a, int ovr,
			   int bind)
{
	struct nlattr *tb[TCA_MIRRED_MAX + 1];
	struct tc_mirred *parm;
	struct tcf_mirred *m;
	struct net_device *dev;
	int ret, ok_push = 0;

	if (nla == NULL)
		return -EINVAL;
	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy);
	if (ret < 0)
		return ret;
	if (tb[TCA_MIRRED_PARMS] == NULL)
		return -EINVAL;
	parm = nla_data(tb[TCA_MIRRED_PARMS]);
	switch (parm->eaction) {
	case TCA_EGRESS_MIRROR:
	case TCA_EGRESS_REDIR:
	case TCA_INGRESS_REDIR:
		break;
	default:
		return -EINVAL;
	}
	if (parm->ifindex) {
		dev = __dev_get_by_index(net, parm->ifindex);
		if (dev == NULL)
			return -ENODEV;
		switch (dev->type) {
		case ARPHRD_TUNNEL:
		case ARPHRD_TUNNEL6:
		case ARPHRD_SIT:
		case ARPHRD_IPGRE:
		case ARPHRD_VOID:
		case ARPHRD_NONE:
			ok_push = 0;
			break;
		default:
			ok_push = 1;
			break;
		}
	} else {
		dev = NULL;
	}

	if (!tcf_hash_check(parm->index, a, bind)) {
		if (dev == NULL)
			return -EINVAL;
		ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
		if (ret)
			return ret;
		ret = ACT_P_CREATED;
	} else {
		if (!ovr) {
			tcf_hash_release(a, bind);
			return -EEXIST;
		}
	}
	m = to_mirred(a);

	spin_lock_bh(&m->tcf_lock);
	m->tcf_action = parm->action;
	m->tcfm_eaction = parm->eaction;
	if (dev != NULL) {
		m->tcfm_ifindex = parm->ifindex;
		if (ret != ACT_P_CREATED)
			dev_put(m->tcfm_dev);
		dev_hold(dev);
		m->tcfm_dev = dev;
		m->tcfm_ok_push = ok_push;
	}
	spin_unlock_bh(&m->tcf_lock);
	if (ret == ACT_P_CREATED) {
		spin_lock_bh(&mirred_list_lock);
		list_add(&m->tcfm_list, &mirred_list);
		spin_unlock_bh(&mirred_list_lock);
		tcf_hash_insert(a);
	}

	return ret;
}

static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
		      struct tcf_result *res)
{
	struct tcf_mirred *m = a->priv;
	struct net_device *dev;
	struct sk_buff *skb2;
	u32 at;
	int retval, err = 1;

	spin_lock(&m->tcf_lock);
	m->tcf_tm.lastuse = jiffies;
	bstats_update(&m->tcf_bstats, skb);

	dev = m->tcfm_dev;
	if (!dev) {
		printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
				       dev->name);
		goto out;
	}

	skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
	if (skb2 == NULL)
		goto out;

	if (m->tcfm_eaction == TCA_INGRESS_REDIR) {
		/* Let's _hope_ the devices are of similar type.
		 * This is rather dangerous; with changed skb_iif, we
		 * will not know the real input device, but perhaps
		 * that's the whole point of doing the ingress
		 * redirect/mirror in the first place? (Note: This
		 * can lead to bad things if two devices ingress
		 * redirect at each other. Don't do that.)
		 */
		skb2->dev = dev;
		skb2->skb_iif = skb2->dev->ifindex;
		skb2->pkt_type = PACKET_HOST;
		netif_rx(skb2);
	} else {
		at = G_TC_AT(skb->tc_verd);
		if (!(at & AT_EGRESS)) {
			if (m->tcfm_ok_push) {
				skb_push(skb2, skb2->dev->hard_header_len);
			}
		}

		/* mirror is always swallowed */
		if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
			skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);

		skb2->skb_iif = skb->dev->ifindex;
		skb2->dev = dev;
		err = dev_queue_xmit(skb2);
	}
out:
	if (err) {
		m->tcf_qstats.overlimits++;
		if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
			retval = TC_ACT_SHOT;
		else
			retval = m->tcf_action;
	} else
		retval = m->tcf_action;
	spin_unlock(&m->tcf_lock);

	return retval;
}

static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_mirred *m = a->priv;
	struct tc_mirred opt = {
		.index   = m->tcf_index,
		.action  = m->tcf_action,
		.refcnt  = m->tcf_refcnt - ref,
		.bindcnt = m->tcf_bindcnt - bind,
		.eaction = m->tcfm_eaction,
		.ifindex = m->tcfm_ifindex,
	};
	struct tcf_t t;

	if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
	t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
	t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
	if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int mirred_device_event(struct notifier_block *unused,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct tcf_mirred *m;

	if (event == NETDEV_UNREGISTER) {
		spin_lock_bh(&mirred_list_lock);
		list_for_each_entry(m, &mirred_list, tcfm_list) {
			spin_lock_bh(&m->tcf_lock);
			if (m->tcfm_dev == dev) {
				dev_put(dev);
				m->tcfm_dev = NULL;
			}
			spin_unlock_bh(&m->tcf_lock);
		}
		spin_unlock_bh(&mirred_list_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block mirred_device_notifier = {
	.notifier_call = mirred_device_event,
};

static struct tc_action_ops act_mirred_ops = {
	.kind		= "mirred",
	.type		= TCA_ACT_MIRRED,
	.owner		= THIS_MODULE,
	.act		= tcf_mirred,
	.dump		= tcf_mirred_dump,
	.cleanup	= tcf_mirred_release,
	.init		= tcf_mirred_init,
};

MODULE_AUTHOR("Jamal Hadi Salim(2002)");
MODULE_DESCRIPTION("Device Mirror/redirect actions");
MODULE_LICENSE("GPL");

static int __init mirred_init_module(void)
{
	int err = register_netdevice_notifier(&mirred_device_notifier);
	if (err)
		return err;

	pr_info("Mirror/redirect action on\n");
	return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
}

static void __exit mirred_cleanup_module(void)
{
	tcf_unregister_action(&act_mirred_ops);
	unregister_netdevice_notifier(&mirred_device_notifier);
}

module_init(mirred_init_module);
module_exit(mirred_cleanup_module);
306
net/sched/act_nat.c
Normal file
@@ -0,0 +1,306 @@
/*
|
||||
* Stateless NAT actions
|
||||
*
|
||||
* Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License as published by the Free
|
||||
* Software Foundation; either version 2 of the License, or (at your option)
|
||||
* any later version.
|
||||
*/
|
||||
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/netfilter.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/tc_act/tc_nat.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/icmp.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/tc_act/tc_nat.h>
|
||||
#include <net/tcp.h>
|
||||
#include <net/udp.h>
|
||||
|
||||
|
||||
#define NAT_TAB_MASK 15
|
||||
|
||||
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
|
||||
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
|
||||
};
|
||||
|
||||
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
|
||||
struct tc_action *a, int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_NAT_MAX + 1];
|
||||
struct tc_nat *parm;
|
||||
int ret = 0, err;
|
||||
struct tcf_nat *p;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_NAT_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
parm = nla_data(tb[TCA_NAT_PARMS]);
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
if (bind)
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
}
|
||||
p = to_tcf_nat(a);
|
||||
|
||||
spin_lock_bh(&p->tcf_lock);
|
||||
p->old_addr = parm->old_addr;
|
||||
p->new_addr = parm->new_addr;
|
||||
p->mask = parm->mask;
|
||||
p->flags = parm->flags;
|
||||
|
||||
p->tcf_action = parm->action;
|
||||
spin_unlock_bh(&p->tcf_lock);
|
||||
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_nat *p = a->priv;
|
||||
struct iphdr *iph;
|
||||
__be32 old_addr;
|
||||
__be32 new_addr;
|
||||
__be32 mask;
|
||||
__be32 addr;
|
||||
int egress;
|
||||
int action;
|
||||
int ihl;
|
||||
int noff;
|
||||
|
||||
spin_lock(&p->tcf_lock);
|
||||
|
||||
p->tcf_tm.lastuse = jiffies;
|
||||
old_addr = p->old_addr;
|
||||
new_addr = p->new_addr;
|
||||
mask = p->mask;
|
||||
egress = p->flags & TCA_NAT_FLAG_EGRESS;
|
||||
action = p->tcf_action;
|
||||
|
||||
bstats_update(&p->tcf_bstats, skb);
|
||||
|
||||
spin_unlock(&p->tcf_lock);
|
||||
|
||||
if (unlikely(action == TC_ACT_SHOT))
|
||||
goto drop;
|
||||
|
||||
noff = skb_network_offset(skb);
|
||||
if (!pskb_may_pull(skb, sizeof(*iph) + noff))
|
||||
goto drop;
|
||||
|
||||
iph = ip_hdr(skb);
|
||||
|
||||
if (egress)
|
||||
addr = iph->saddr;
|
||||
else
|
||||
addr = iph->daddr;
|
||||
|
||||
if (!((old_addr ^ addr) & mask)) {
|
||||
if (skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, sizeof(*iph) + noff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
|
||||
goto drop;
|
||||
|
||||
new_addr &= mask;
|
||||
new_addr |= addr & ~mask;
|
||||
|
||||
/* Rewrite IP header */
|
||||
iph = ip_hdr(skb);
|
||||
if (egress)
|
||||
iph->saddr = new_addr;
|
||||
else
|
||||
iph->daddr = new_addr;
|
||||
|
||||
csum_replace4(&iph->check, addr, new_addr);
|
||||
} else if ((iph->frag_off & htons(IP_OFFSET)) ||
|
||||
iph->protocol != IPPROTO_ICMP) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
ihl = iph->ihl * 4;
|
||||
|
||||
/* It would be nice to share code with stateful NAT. */
|
||||
switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
|
||||
case IPPROTO_TCP:
|
||||
{
|
||||
struct tcphdr *tcph;
|
||||
|
||||
if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
|
||||
(skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
|
||||
goto drop;
|
||||
|
||||
tcph = (void *)(skb_network_header(skb) + ihl);
|
||||
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
|
||||
break;
|
||||
}
|
||||
case IPPROTO_UDP:
|
||||
{
|
||||
struct udphdr *udph;
|
||||
|
||||
if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
|
||||
(skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
|
||||
goto drop;
|
||||
|
||||
udph = (void *)(skb_network_header(skb) + ihl);
|
||||
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
|
||||
inet_proto_csum_replace4(&udph->check, skb, addr,
|
||||
new_addr, 1);
|
||||
if (!udph->check)
|
||||
udph->check = CSUM_MANGLED_0;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case IPPROTO_ICMP:
|
||||
{
|
||||
struct icmphdr *icmph;
|
||||
|
||||
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
|
||||
goto drop;
|
||||
|
||||
icmph = (void *)(skb_network_header(skb) + ihl);
|
||||
|
||||
if ((icmph->type != ICMP_DEST_UNREACH) &&
|
||||
(icmph->type != ICMP_TIME_EXCEEDED) &&
|
||||
(icmph->type != ICMP_PARAMETERPROB))
|
||||
break;
|
||||
|
||||
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
|
||||
noff))
|
||||
goto drop;
|
||||
|
||||
icmph = (void *)(skb_network_header(skb) + ihl);
|
||||
iph = (void *)(icmph + 1);
|
||||
if (egress)
|
||||
addr = iph->daddr;
|
||||
else
|
||||
addr = iph->saddr;
|
||||
|
||||
if ((old_addr ^ addr) & mask)
|
||||
break;
|
||||
|
||||
if (skb_cloned(skb) &&
|
||||
!skb_clone_writable(skb, ihl + sizeof(*icmph) +
|
||||
sizeof(*iph) + noff) &&
|
||||
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
|
||||
goto drop;
|
||||
|
||||
icmph = (void *)(skb_network_header(skb) + ihl);
|
||||
iph = (void *)(icmph + 1);
|
||||
|
||||
new_addr &= mask;
|
||||
new_addr |= addr & ~mask;
|
||||
|
||||
/* XXX Fix up the inner checksums. */
|
||||
if (egress)
|
||||
iph->daddr = new_addr;
|
||||
else
|
||||
iph->saddr = new_addr;
|
||||
|
||||
inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
|
||||
0);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
return action;
|
||||
|
||||
drop:
|
||||
spin_lock(&p->tcf_lock);
|
||||
p->tcf_qstats.drops++;
|
||||
spin_unlock(&p->tcf_lock);
|
||||
return TC_ACT_SHOT;
|
||||
}
|
||||
|
||||
static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
|
||||
int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_nat *p = a->priv;
|
||||
struct tc_nat opt = {
|
||||
.old_addr = p->old_addr,
|
||||
.new_addr = p->new_addr,
|
||||
.mask = p->mask,
|
||||
.flags = p->flags,
|
||||
|
||||
.index = p->tcf_index,
|
||||
.action = p->tcf_action,
|
||||
.refcnt = p->tcf_refcnt - ref,
|
||||
.bindcnt = p->tcf_bindcnt - bind,
|
||||
};
|
||||
struct tcf_t t;
|
||||
|
||||
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_nat_ops = {
|
||||
.kind = "nat",
|
||||
.type = TCA_ACT_NAT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_nat,
|
||||
.dump = tcf_nat_dump,
|
||||
.init = tcf_nat_init,
|
||||
};
|
||||
|
||||
MODULE_DESCRIPTION("Stateless NAT actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init nat_init_module(void)
|
||||
{
|
||||
return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit nat_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_nat_ops);
|
||||
}
|
||||
|
||||
module_init(nat_init_module);
|
||||
module_exit(nat_cleanup_module);
|
||||
243
net/sched/act_pedit.c
Normal file
@@ -0,0 +1,243 @@
/*
|
||||
* net/sched/pedit.c Generic packet editor
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Jamal Hadi Salim (2002-4)
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <linux/tc_act/tc_pedit.h>
|
||||
#include <net/tc_act/tc_pedit.h>
|
||||
|
||||
#define PEDIT_TAB_MASK 15
|
||||
|
||||
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
|
||||
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
|
||||
};
|
||||
|
||||
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
|
||||
struct nlattr *est, struct tc_action *a,
|
||||
int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_PEDIT_MAX + 1];
|
||||
struct tc_pedit *parm;
|
||||
int ret = 0, err;
|
||||
struct tcf_pedit *p;
|
||||
struct tc_pedit_key *keys = NULL;
|
||||
int ksize;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_PEDIT_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
parm = nla_data(tb[TCA_PEDIT_PARMS]);
|
||||
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
|
||||
if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
|
||||
return -EINVAL;
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
if (!parm->nkeys)
|
||||
return -EINVAL;
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
p = to_pedit(a);
|
||||
keys = kmalloc(ksize, GFP_KERNEL);
|
||||
if (keys == NULL) {
|
||||
tcf_hash_cleanup(a, est);
|
||||
return -ENOMEM;
|
||||
}
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
p = to_pedit(a);
|
||||
tcf_hash_release(a, bind);
|
||||
if (bind)
|
||||
return 0;
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
|
||||
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
|
||||
keys = kmalloc(ksize, GFP_KERNEL);
|
||||
if (keys == NULL)
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock_bh(&p->tcf_lock);
|
||||
p->tcfp_flags = parm->flags;
|
||||
p->tcf_action = parm->action;
|
||||
if (keys) {
|
||||
kfree(p->tcfp_keys);
|
||||
p->tcfp_keys = keys;
|
||||
p->tcfp_nkeys = parm->nkeys;
|
||||
}
|
||||
memcpy(p->tcfp_keys, parm->keys, ksize);
|
||||
spin_unlock_bh(&p->tcf_lock);
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void tcf_pedit_cleanup(struct tc_action *a, int bind)
|
||||
{
|
||||
struct tcf_pedit *p = a->priv;
|
||||
struct tc_pedit_key *keys = p->tcfp_keys;
|
||||
kfree(keys);
|
||||
}
|
||||
|
||||
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_pedit *p = a->priv;
|
||||
int i, munged = 0;
|
||||
unsigned int off;
|
||||
|
||||
if (skb_unclone(skb, GFP_ATOMIC))
|
||||
return p->tcf_action;
|
||||
|
||||
off = skb_network_offset(skb);
|
||||
|
||||
spin_lock(&p->tcf_lock);
|
||||
|
||||
p->tcf_tm.lastuse = jiffies;
|
||||
|
||||
if (p->tcfp_nkeys > 0) {
|
||||
struct tc_pedit_key *tkey = p->tcfp_keys;
|
||||
|
||||
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
|
||||
u32 *ptr, _data;
|
||||
int offset = tkey->off;
|
||||
|
||||
if (tkey->offmask) {
|
||||
char *d, _d;
|
||||
|
||||
d = skb_header_pointer(skb, off + tkey->at, 1,
|
||||
&_d);
|
||||
if (!d)
|
||||
goto bad;
|
||||
offset += (*d & tkey->offmask) >> tkey->shift;
|
||||
}
|
||||
|
||||
if (offset % 4) {
|
||||
pr_info("tc filter pedit"
|
||||
" offset must be on 32 bit boundaries\n");
|
||||
goto bad;
|
||||
}
|
||||
if (offset > 0 && offset > skb->len) {
|
||||
pr_info("tc filter pedit"
|
||||
" offset %d can't exceed pkt length %d\n",
|
||||
offset, skb->len);
|
||||
goto bad;
|
||||
}
|
||||
|
||||
ptr = skb_header_pointer(skb, off + offset, 4, &_data);
|
||||
if (!ptr)
|
||||
goto bad;
|
||||
/* just do it, baby */
|
||||
*ptr = ((*ptr & tkey->mask) ^ tkey->val);
|
||||
if (ptr == &_data)
|
||||
skb_store_bits(skb, off + offset, ptr, 4);
|
||||
munged++;
|
||||
}
|
||||
|
||||
if (munged)
|
||||
skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
|
||||
goto done;
|
||||
} else
|
||||
WARN(1, "pedit BUG: index %d\n", p->tcf_index);
|
||||
|
||||
bad:
|
||||
p->tcf_qstats.overlimits++;
|
||||
done:
|
||||
bstats_update(&p->tcf_bstats, skb);
|
||||
spin_unlock(&p->tcf_lock);
|
||||
return p->tcf_action;
|
||||
}
|
||||
|
||||
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
|
||||
int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_pedit *p = a->priv;
|
||||
struct tc_pedit *opt;
|
||||
struct tcf_t t;
|
||||
int s;
|
||||
|
||||
s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
|
||||
|
||||
/* netlink spinlocks held above us - must use ATOMIC */
|
||||
opt = kzalloc(s, GFP_ATOMIC);
|
||||
if (unlikely(!opt))
|
||||
return -ENOBUFS;
|
||||
|
||||
memcpy(opt->keys, p->tcfp_keys,
|
||||
p->tcfp_nkeys * sizeof(struct tc_pedit_key));
|
||||
opt->index = p->tcf_index;
|
||||
opt->nkeys = p->tcfp_nkeys;
|
||||
opt->flags = p->tcfp_flags;
|
||||
opt->action = p->tcf_action;
|
||||
opt->refcnt = p->tcf_refcnt - ref;
|
||||
opt->bindcnt = p->tcf_bindcnt - bind;
|
||||
|
||||
if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
|
||||
goto nla_put_failure;
|
||||
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
kfree(opt);
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
kfree(opt);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_pedit_ops = {
|
||||
.kind = "pedit",
|
||||
.type = TCA_ACT_PEDIT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_pedit,
|
||||
.dump = tcf_pedit_dump,
|
||||
.cleanup = tcf_pedit_cleanup,
|
||||
.init = tcf_pedit_init,
|
||||
};
|
||||
|
||||
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
|
||||
MODULE_DESCRIPTION("Generic Packet Editor actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init pedit_init_module(void)
|
||||
{
|
||||
return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit pedit_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_pedit_ops);
|
||||
}
|
||||
|
||||
module_init(pedit_init_module);
|
||||
module_exit(pedit_cleanup_module);
|
||||
|
||||
372
net/sched/act_police.c
Normal file
@@ -0,0 +1,372 @@
/*
|
||||
* net/sched/police.c Input police filter.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* J Hadi Salim (action changes)
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/netlink.h>
|
||||
|
||||
struct tcf_police {
|
||||
struct tcf_common common;
|
||||
int tcfp_result;
|
||||
u32 tcfp_ewma_rate;
|
||||
s64 tcfp_burst;
|
||||
u32 tcfp_mtu;
|
||||
s64 tcfp_toks;
|
||||
s64 tcfp_ptoks;
|
||||
s64 tcfp_mtu_ptoks;
|
||||
s64 tcfp_t_c;
|
||||
struct psched_ratecfg rate;
|
||||
bool rate_present;
|
||||
struct psched_ratecfg peak;
|
||||
bool peak_present;
|
||||
};
|
||||
#define to_police(pc) \
|
||||
container_of(pc, struct tcf_police, common)
|
||||
|
||||
#define POL_TAB_MASK 15
|
||||
|
||||
/* old policer structure from before tc actions */
|
||||
struct tc_police_compat {
|
||||
u32 index;
|
||||
int action;
|
||||
u32 limit;
|
||||
u32 burst;
|
||||
u32 mtu;
|
||||
struct tc_ratespec rate;
|
||||
struct tc_ratespec peakrate;
|
||||
};
|
||||
|
||||
/* Each policer is serialized by its individual spinlock */
|
||||
|
||||
static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
|
||||
int type, struct tc_action *a)
|
||||
{
|
||||
struct tcf_hashinfo *hinfo = a->ops->hinfo;
|
||||
struct hlist_head *head;
|
||||
struct tcf_common *p;
|
||||
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
|
||||
struct nlattr *nest;
|
||||
|
||||
spin_lock_bh(&hinfo->lock);
|
||||
|
||||
s_i = cb->args[0];
|
||||
|
||||
for (i = 0; i < (POL_TAB_MASK + 1); i++) {
|
||||
head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
|
||||
|
||||
hlist_for_each_entry_rcu(p, head, tcfc_head) {
|
||||
index++;
|
||||
if (index < s_i)
|
||||
continue;
|
||||
a->priv = p;
|
||||
a->order = index;
|
||||
nest = nla_nest_start(skb, a->order);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
if (type == RTM_DELACTION)
|
||||
err = tcf_action_dump_1(skb, a, 0, 1);
|
||||
else
|
||||
err = tcf_action_dump_1(skb, a, 0, 0);
|
||||
if (err < 0) {
|
||||
index--;
|
||||
nla_nest_cancel(skb, nest);
|
||||
goto done;
|
||||
}
|
||||
nla_nest_end(skb, nest);
|
||||
n_i++;
|
||||
}
|
||||
}
|
||||
done:
|
||||
spin_unlock_bh(&hinfo->lock);
|
||||
if (n_i)
|
||||
cb->args[0] += n_i;
|
||||
return n_i;
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
goto done;
|
||||
}
|
||||
|
||||
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
|
||||
[TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
|
||||
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
|
||||
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
|
||||
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
|
||||
struct nlattr *est, struct tc_action *a,
|
||||
int ovr, int bind)
|
||||
{
|
||||
unsigned int h;
|
||||
int ret = 0, err;
|
||||
struct nlattr *tb[TCA_POLICE_MAX + 1];
|
||||
struct tc_police *parm;
|
||||
struct tcf_police *police;
|
||||
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
|
||||
struct tcf_hashinfo *hinfo = a->ops->hinfo;
|
||||
int size;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_POLICE_TBF] == NULL)
|
||||
return -EINVAL;
|
||||
size = nla_len(tb[TCA_POLICE_TBF]);
|
||||
if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
|
||||
return -EINVAL;
|
||||
parm = nla_data(tb[TCA_POLICE_TBF]);
|
||||
|
||||
if (parm->index) {
|
||||
if (tcf_hash_search(a, parm->index)) {
|
||||
police = to_police(a->priv);
|
||||
if (bind) {
|
||||
police->tcf_bindcnt += 1;
|
||||
police->tcf_refcnt += 1;
|
||||
return 0;
|
||||
}
|
||||
if (ovr)
|
||||
goto override;
|
||||
/* not replacing */
|
||||
return -EEXIST;
|
||||
}
|
||||
}
|
||||
|
||||
police = kzalloc(sizeof(*police), GFP_KERNEL);
|
||||
if (police == NULL)
|
||||
return -ENOMEM;
|
||||
ret = ACT_P_CREATED;
|
||||
police->tcf_refcnt = 1;
|
||||
spin_lock_init(&police->tcf_lock);
|
||||
if (bind)
|
||||
police->tcf_bindcnt = 1;
|
||||
override:
|
||||
if (parm->rate.rate) {
|
||||
err = -ENOMEM;
|
||||
R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
|
||||
if (R_tab == NULL)
|
||||
goto failure;
|
||||
|
||||
if (parm->peakrate.rate) {
|
||||
P_tab = qdisc_get_rtab(&parm->peakrate,
|
||||
tb[TCA_POLICE_PEAKRATE]);
|
||||
if (P_tab == NULL)
|
||||
goto failure;
|
||||
}
|
||||
}
|
||||
|
||||
spin_lock_bh(&police->tcf_lock);
|
||||
if (est) {
|
||||
err = gen_replace_estimator(&police->tcf_bstats, NULL,
|
||||
&police->tcf_rate_est,
|
||||
&police->tcf_lock, est);
|
||||
if (err)
|
||||
goto failure_unlock;
|
||||
} else if (tb[TCA_POLICE_AVRATE] &&
|
||||
(ret == ACT_P_CREATED ||
|
||||
!gen_estimator_active(&police->tcf_bstats,
|
||||
&police->tcf_rate_est))) {
|
||||
err = -EINVAL;
|
||||
goto failure_unlock;
|
||||
}
|
||||
|
||||
/* No failure allowed after this point */
|
||||
police->tcfp_mtu = parm->mtu;
|
||||
if (police->tcfp_mtu == 0) {
|
||||
police->tcfp_mtu = ~0;
|
||||
if (R_tab)
|
||||
police->tcfp_mtu = 255 << R_tab->rate.cell_log;
|
||||
}
|
||||
if (R_tab) {
|
||||
police->rate_present = true;
|
||||
psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
|
||||
qdisc_put_rtab(R_tab);
|
||||
} else {
|
||||
police->rate_present = false;
|
||||
}
|
||||
if (P_tab) {
|
||||
police->peak_present = true;
|
||||
psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
|
||||
qdisc_put_rtab(P_tab);
|
||||
} else {
|
||||
police->peak_present = false;
|
||||
}
|
||||
|
||||
if (tb[TCA_POLICE_RESULT])
|
||||
police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
|
||||
police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
|
||||
police->tcfp_toks = police->tcfp_burst;
|
||||
if (police->peak_present) {
|
||||
police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
|
||||
police->tcfp_mtu);
|
||||
police->tcfp_ptoks = police->tcfp_mtu_ptoks;
|
||||
}
|
||||
police->tcf_action = parm->action;
|
||||
|
||||
if (tb[TCA_POLICE_AVRATE])
|
||||
police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
|
||||
|
||||
spin_unlock_bh(&police->tcf_lock);
|
||||
if (ret != ACT_P_CREATED)
|
||||
return ret;
|
||||
|
||||
police->tcfp_t_c = ktime_get_ns();
|
||||
police->tcf_index = parm->index ? parm->index :
|
||||
tcf_hash_new_index(hinfo);
|
||||
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
|
||||
spin_lock_bh(&hinfo->lock);
|
||||
hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
|
||||
spin_unlock_bh(&hinfo->lock);
|
||||
|
||||
a->priv = police;
|
||||
return ret;
|
||||
|
||||
failure_unlock:
|
||||
spin_unlock_bh(&police->tcf_lock);
|
||||
failure:
|
||||
qdisc_put_rtab(P_tab);
|
||||
qdisc_put_rtab(R_tab);
|
||||
if (ret == ACT_P_CREATED)
|
||||
kfree(police);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_police *police = a->priv;
|
||||
s64 now;
|
||||
s64 toks;
|
||||
s64 ptoks = 0;
|
||||
|
||||
spin_lock(&police->tcf_lock);
|
||||
|
||||
bstats_update(&police->tcf_bstats, skb);
|
||||
|
||||
if (police->tcfp_ewma_rate &&
|
||||
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
|
||||
police->tcf_qstats.overlimits++;
|
||||
if (police->tcf_action == TC_ACT_SHOT)
|
||||
police->tcf_qstats.drops++;
|
||||
spin_unlock(&police->tcf_lock);
|
||||
return police->tcf_action;
|
||||
}
|
||||
|
||||
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
|
||||
if (!police->rate_present) {
|
||||
spin_unlock(&police->tcf_lock);
|
||||
return police->tcfp_result;
|
||||
}
|
||||
|
||||
now = ktime_get_ns();
|
||||
toks = min_t(s64, now - police->tcfp_t_c,
|
||||
police->tcfp_burst);
|
||||
if (police->peak_present) {
|
||||
ptoks = toks + police->tcfp_ptoks;
|
||||
if (ptoks > police->tcfp_mtu_ptoks)
|
||||
ptoks = police->tcfp_mtu_ptoks;
|
||||
ptoks -= (s64) psched_l2t_ns(&police->peak,
|
||||
qdisc_pkt_len(skb));
|
||||
}
|
||||
toks += police->tcfp_toks;
|
||||
if (toks > police->tcfp_burst)
|
||||
toks = police->tcfp_burst;
|
||||
toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
|
||||
if ((toks|ptoks) >= 0) {
|
||||
police->tcfp_t_c = now;
|
||||
police->tcfp_toks = toks;
|
||||
police->tcfp_ptoks = ptoks;
|
||||
spin_unlock(&police->tcf_lock);
|
||||
return police->tcfp_result;
|
||||
}
|
||||
}
|
||||
|
||||
police->tcf_qstats.overlimits++;
|
||||
if (police->tcf_action == TC_ACT_SHOT)
|
||||
police->tcf_qstats.drops++;
|
||||
spin_unlock(&police->tcf_lock);
|
||||
return police->tcf_action;
|
||||
}
|
||||
|
||||
static int
|
||||
tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_police *police = a->priv;
|
||||
struct tc_police opt = {
|
||||
.index = police->tcf_index,
|
||||
.action = police->tcf_action,
|
||||
.mtu = police->tcfp_mtu,
|
||||
.burst = PSCHED_NS2TICKS(police->tcfp_burst),
|
||||
.refcnt = police->tcf_refcnt - ref,
|
||||
.bindcnt = police->tcf_bindcnt - bind,
|
||||
};
|
||||
|
||||
if (police->rate_present)
|
||||
psched_ratecfg_getrate(&opt.rate, &police->rate);
|
||||
if (police->peak_present)
|
||||
psched_ratecfg_getrate(&opt.peakrate, &police->peak);
|
||||
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
if (police->tcfp_result &&
|
||||
nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
|
||||
goto nla_put_failure;
|
||||
if (police->tcfp_ewma_rate &&
|
||||
nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
MODULE_AUTHOR("Alexey Kuznetsov");
|
||||
MODULE_DESCRIPTION("Policing actions");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static struct tc_action_ops act_police_ops = {
|
||||
.kind = "police",
|
||||
.type = TCA_ID_POLICE,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_act_police,
|
||||
.dump = tcf_act_police_dump,
|
||||
.init = tcf_act_police_locate,
|
||||
.walk = tcf_act_police_walker
|
||||
};
|
||||
|
||||
static int __init
|
||||
police_init_module(void)
|
||||
{
|
||||
return tcf_register_action(&act_police_ops, POL_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit
|
||||
police_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_police_ops);
|
||||
}
|
||||
|
||||
module_init(police_init_module);
|
||||
module_exit(police_cleanup_module);
|
||||
192
net/sched/act_simple.c
Normal file
@@ -0,0 +1,192 @@
/*
|
||||
* net/sched/simp.c Simple example of an action
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Jamal Hadi Salim (2005-8)
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
#define TCA_ACT_SIMP 22
|
||||
|
||||
#include <linux/tc_act/tc_defact.h>
|
||||
#include <net/tc_act/tc_defact.h>
|
||||
|
||||
#define SIMP_TAB_MASK 7
|
||||
|
||||
#define SIMP_MAX_DATA 32
|
||||
static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_defact *d = a->priv;
|
||||
|
||||
spin_lock(&d->tcf_lock);
|
||||
d->tcf_tm.lastuse = jiffies;
|
||||
bstats_update(&d->tcf_bstats, skb);
|
||||
|
||||
/* print policy string followed by _ then packet count
|
||||
* Example if this was the 3rd packet and the string was "hello"
|
||||
* then it would look like "hello_3" (without quotes)
|
||||
*/
|
||||
pr_info("simple: %s_%d\n",
|
||||
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
|
||||
spin_unlock(&d->tcf_lock);
|
||||
return d->tcf_action;
|
||||
}
|
||||
|
||||
static void tcf_simp_release(struct tc_action *a, int bind)
|
||||
{
|
||||
struct tcf_defact *d = to_defact(a);
|
||||
kfree(d->tcfd_defdata);
|
||||
}
|
||||
|
||||
static int alloc_defdata(struct tcf_defact *d, char *defdata)
|
||||
{
|
||||
d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
|
||||
if (unlikely(!d->tcfd_defdata))
|
||||
return -ENOMEM;
|
||||
strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void reset_policy(struct tcf_defact *d, char *defdata,
|
||||
struct tc_defact *p)
|
||||
{
|
||||
spin_lock_bh(&d->tcf_lock);
|
||||
d->tcf_action = p->action;
|
||||
memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
|
||||
strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
|
||||
spin_unlock_bh(&d->tcf_lock);
|
||||
}
|
||||
|
||||
static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
|
||||
[TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
|
||||
[TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
|
||||
};
|
||||
|
||||
static int tcf_simp_init(struct net *net, struct nlattr *nla,
|
||||
struct nlattr *est, struct tc_action *a,
|
||||
int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_DEF_MAX + 1];
|
||||
struct tc_defact *parm;
|
||||
struct tcf_defact *d;
|
||||
char *defdata;
|
||||
int ret = 0, err;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_DEF_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
if (tb[TCA_DEF_DATA] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
parm = nla_data(tb[TCA_DEF_PARMS]);
|
||||
defdata = nla_data(tb[TCA_DEF_DATA]);
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
d = to_defact(a);
|
||||
ret = alloc_defdata(d, defdata);
|
||||
if (ret < 0) {
|
||||
tcf_hash_cleanup(a, est);
|
||||
return ret;
|
||||
}
|
||||
d->tcf_action = parm->action;
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
d = to_defact(a);
|
||||
|
||||
if (bind)
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
|
||||
reset_policy(d, defdata, parm);
|
||||
}
|
||||
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
|
||||
int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_defact *d = a->priv;
|
||||
struct tc_defact opt = {
|
||||
.index = d->tcf_index,
|
||||
.refcnt = d->tcf_refcnt - ref,
|
||||
.bindcnt = d->tcf_bindcnt - bind,
|
||||
.action = d->tcf_action,
|
||||
};
|
||||
struct tcf_t t;
|
||||
|
||||
if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
|
||||
nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
|
||||
goto nla_put_failure;
|
||||
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_simp_ops = {
|
||||
.kind = "simple",
|
||||
.type = TCA_ACT_SIMP,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_simp,
|
||||
.dump = tcf_simp_dump,
|
||||
.cleanup = tcf_simp_release,
|
||||
.init = tcf_simp_init,
|
||||
};
|
||||
|
||||
MODULE_AUTHOR("Jamal Hadi Salim(2005)");
|
||||
MODULE_DESCRIPTION("Simple example action");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init simp_init_module(void)
|
||||
{
|
||||
int ret;
|
||||
ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
|
||||
if (!ret)
|
||||
pr_info("Simple TC action Loaded\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit simp_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_simp_ops);
|
||||
}
|
||||
|
||||
module_init(simp_init_module);
|
||||
module_exit(simp_cleanup_module);
|
||||
199
net/sched/act_skbedit.c
Normal file
@@ -0,0 +1,199 @@
/*
|
||||
* Copyright (c) 2008, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with
|
||||
* this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
#include <linux/tc_act/tc_skbedit.h>
|
||||
#include <net/tc_act/tc_skbedit.h>
|
||||
|
||||
#define SKBEDIT_TAB_MASK 15
|
||||
|
||||
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcf_skbedit *d = a->priv;
|
||||
|
||||
spin_lock(&d->tcf_lock);
|
||||
d->tcf_tm.lastuse = jiffies;
|
||||
bstats_update(&d->tcf_bstats, skb);
|
||||
|
||||
if (d->flags & SKBEDIT_F_PRIORITY)
|
||||
skb->priority = d->priority;
|
||||
if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
|
||||
skb->dev->real_num_tx_queues > d->queue_mapping)
|
||||
skb_set_queue_mapping(skb, d->queue_mapping);
|
||||
if (d->flags & SKBEDIT_F_MARK)
|
||||
skb->mark = d->mark;
|
||||
|
||||
spin_unlock(&d->tcf_lock);
|
||||
return d->tcf_action;
|
||||
}
|
||||
|
||||
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
|
||||
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
|
||||
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
|
||||
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
|
||||
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
|
||||
};
|
||||
|
||||
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
|
||||
struct nlattr *est, struct tc_action *a,
|
||||
int ovr, int bind)
|
||||
{
|
||||
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
|
||||
struct tc_skbedit *parm;
|
||||
struct tcf_skbedit *d;
|
||||
u32 flags = 0, *priority = NULL, *mark = NULL;
|
||||
u16 *queue_mapping = NULL;
|
||||
int ret = 0, err;
|
||||
|
||||
if (nla == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_SKBEDIT_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
|
||||
flags |= SKBEDIT_F_PRIORITY;
|
||||
priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
|
||||
}
|
||||
|
||||
if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
|
||||
flags |= SKBEDIT_F_QUEUE_MAPPING;
|
||||
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
|
||||
}
|
||||
|
||||
if (tb[TCA_SKBEDIT_MARK] != NULL) {
|
||||
flags |= SKBEDIT_F_MARK;
|
||||
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
|
||||
}
|
||||
|
||||
if (!flags)
|
||||
return -EINVAL;
|
||||
|
||||
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
|
||||
|
||||
if (!tcf_hash_check(parm->index, a, bind)) {
|
||||
ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
d = to_skbedit(a);
|
||||
ret = ACT_P_CREATED;
|
||||
} else {
|
||||
d = to_skbedit(a);
|
||||
if (bind)
|
||||
return 0;
|
||||
tcf_hash_release(a, bind);
|
||||
if (!ovr)
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
spin_lock_bh(&d->tcf_lock);
|
||||
|
||||
d->flags = flags;
|
||||
if (flags & SKBEDIT_F_PRIORITY)
|
||||
d->priority = *priority;
|
||||
if (flags & SKBEDIT_F_QUEUE_MAPPING)
|
||||
d->queue_mapping = *queue_mapping;
|
||||
if (flags & SKBEDIT_F_MARK)
|
||||
d->mark = *mark;
|
||||
|
||||
d->tcf_action = parm->action;
|
||||
|
||||
spin_unlock_bh(&d->tcf_lock);
|
||||
|
||||
if (ret == ACT_P_CREATED)
|
||||
tcf_hash_insert(a);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
|
||||
int bind, int ref)
|
||||
{
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tcf_skbedit *d = a->priv;
|
||||
struct tc_skbedit opt = {
|
||||
.index = d->tcf_index,
|
||||
.refcnt = d->tcf_refcnt - ref,
|
||||
.bindcnt = d->tcf_bindcnt - bind,
|
||||
.action = d->tcf_action,
|
||||
};
|
||||
struct tcf_t t;
|
||||
|
||||
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
if ((d->flags & SKBEDIT_F_PRIORITY) &&
|
||||
nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
|
||||
&d->priority))
|
||||
goto nla_put_failure;
|
||||
if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
|
||||
nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
|
||||
sizeof(d->queue_mapping), &d->queue_mapping))
|
||||
goto nla_put_failure;
|
||||
if ((d->flags & SKBEDIT_F_MARK) &&
|
||||
nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
|
||||
&d->mark))
|
||||
goto nla_put_failure;
|
||||
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
|
||||
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
|
||||
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
|
||||
if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tc_action_ops act_skbedit_ops = {
|
||||
.kind = "skbedit",
|
||||
.type = TCA_ACT_SKBEDIT,
|
||||
.owner = THIS_MODULE,
|
||||
.act = tcf_skbedit,
|
||||
.dump = tcf_skbedit_dump,
|
||||
.init = tcf_skbedit_init,
|
||||
};
|
||||
|
||||
MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
|
||||
MODULE_DESCRIPTION("SKB Editing");
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
static int __init skbedit_init_module(void)
|
||||
{
|
||||
return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
|
||||
}
|
||||
|
||||
static void __exit skbedit_cleanup_module(void)
|
||||
{
|
||||
tcf_unregister_action(&act_skbedit_ops);
|
||||
}
|
||||
|
||||
module_init(skbedit_init_module);
|
||||
module_exit(skbedit_cleanup_module);
|
||||
623
net/sched/cls_api.c
Normal file
@@ -0,0 +1,623 @@
/*
|
||||
* net/sched/cls_api.c Packet classifier API.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* Changes:
|
||||
*
|
||||
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kmod.h>
|
||||
#include <linux/err.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/net_namespace.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
/* The list of all installed classifier types */
|
||||
static LIST_HEAD(tcf_proto_base);
|
||||
|
||||
/* Protects list of registered TC modules. It is pure SMP lock. */
|
||||
static DEFINE_RWLOCK(cls_mod_lock);
|
||||
|
||||
/* Find classifier type by string name */
|
||||
|
||||
static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
|
||||
{
|
||||
const struct tcf_proto_ops *t, *res = NULL;
|
||||
|
||||
if (kind) {
|
||||
read_lock(&cls_mod_lock);
|
||||
list_for_each_entry(t, &tcf_proto_base, head) {
|
||||
if (nla_strcmp(kind, t->kind) == 0) {
|
||||
if (try_module_get(t->owner))
|
||||
res = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
read_unlock(&cls_mod_lock);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Register(unregister) new classifier type */
|
||||
|
||||
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
|
||||
{
|
||||
struct tcf_proto_ops *t;
|
||||
int rc = -EEXIST;
|
||||
|
||||
write_lock(&cls_mod_lock);
|
||||
list_for_each_entry(t, &tcf_proto_base, head)
|
||||
if (!strcmp(ops->kind, t->kind))
|
||||
goto out;
|
||||
|
||||
list_add_tail(&ops->head, &tcf_proto_base);
|
||||
rc = 0;
|
||||
out:
|
||||
write_unlock(&cls_mod_lock);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL(register_tcf_proto_ops);
|
||||
|
||||
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
|
||||
{
|
||||
struct tcf_proto_ops *t;
|
||||
int rc = -ENOENT;
|
||||
|
||||
write_lock(&cls_mod_lock);
|
||||
list_for_each_entry(t, &tcf_proto_base, head) {
|
||||
if (t == ops) {
|
||||
list_del(&t->head);
|
||||
rc = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
write_unlock(&cls_mod_lock);
|
||||
return rc;
|
||||
}
|
||||
EXPORT_SYMBOL(unregister_tcf_proto_ops);
|
||||
|
||||
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
|
||||
struct nlmsghdr *n, struct tcf_proto *tp,
|
||||
unsigned long fh, int event);
|
||||
|
||||
|
||||
/* Select new prio value from the range, managed by kernel. */
|
||||
|
||||
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
|
||||
{
|
||||
u32 first = TC_H_MAKE(0xC0000000U, 0U);
|
||||
|
||||
if (tp)
|
||||
first = tp->prio - 1;
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
/* Add/change/delete/get a filter node */
|
||||
|
||||
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
|
||||
{
|
||||
struct net *net = sock_net(skb->sk);
|
||||
struct nlattr *tca[TCA_MAX + 1];
|
||||
struct tcmsg *t;
|
||||
u32 protocol;
|
||||
u32 prio;
|
||||
u32 nprio;
|
||||
u32 parent;
|
||||
struct net_device *dev;
|
||||
struct Qdisc *q;
|
||||
struct tcf_proto __rcu **back;
|
||||
struct tcf_proto __rcu **chain;
|
||||
struct tcf_proto *tp;
|
||||
const struct tcf_proto_ops *tp_ops;
|
||||
const struct Qdisc_class_ops *cops;
|
||||
unsigned long cl;
|
||||
unsigned long fh;
|
||||
int err;
|
||||
int tp_created = 0;
|
||||
|
||||
if ((n->nlmsg_type != RTM_GETTFILTER) &&
|
||||
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
replay:
|
||||
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
t = nlmsg_data(n);
|
||||
protocol = TC_H_MIN(t->tcm_info);
|
||||
prio = TC_H_MAJ(t->tcm_info);
|
||||
nprio = prio;
|
||||
parent = t->tcm_parent;
|
||||
cl = 0;
|
||||
|
||||
if (prio == 0) {
|
||||
/* If no priority is given, the user wants us to allocate it. */
|
||||
if (n->nlmsg_type != RTM_NEWTFILTER ||
|
||||
!(n->nlmsg_flags & NLM_F_CREATE))
|
||||
return -ENOENT;
|
||||
prio = TC_H_MAKE(0x80000000U, 0U);
|
||||
}
|
||||
|
||||
/* Find head of filter chain. */
|
||||
|
||||
/* Find link */
|
||||
dev = __dev_get_by_index(net, t->tcm_ifindex);
|
||||
if (dev == NULL)
|
||||
return -ENODEV;
|
||||
|
||||
/* Find qdisc */
|
||||
if (!parent) {
|
||||
q = dev->qdisc;
|
||||
parent = q->handle;
|
||||
} else {
|
||||
q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
|
||||
if (q == NULL)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Is it classful? */
|
||||
cops = q->ops->cl_ops;
|
||||
if (!cops)
|
||||
return -EINVAL;
|
||||
|
||||
if (cops->tcf_chain == NULL)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* Do we search for a filter attached to a class? */
|
||||
if (TC_H_MIN(parent)) {
|
||||
cl = cops->get(q, parent);
|
||||
if (cl == 0)
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/* And the last stroke */
|
||||
chain = cops->tcf_chain(q, cl);
|
||||
err = -EINVAL;
|
||||
if (chain == NULL)
|
||||
goto errout;
|
||||
|
||||
/* Check the chain for existence of proto-tcf with this priority */
|
||||
for (back = chain;
|
||||
(tp = rtnl_dereference(*back)) != NULL;
|
||||
back = &tp->next) {
|
||||
if (tp->prio >= prio) {
|
||||
if (tp->prio == prio) {
|
||||
if (!nprio ||
|
||||
(tp->protocol != protocol && protocol))
|
||||
goto errout;
|
||||
} else
|
||||
tp = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (tp == NULL) {
|
||||
/* Proto-tcf does not exist, create new one */
|
||||
|
||||
if (tca[TCA_KIND] == NULL || !protocol)
|
||||
goto errout;
|
||||
|
||||
err = -ENOENT;
|
||||
if (n->nlmsg_type != RTM_NEWTFILTER ||
|
||||
!(n->nlmsg_flags & NLM_F_CREATE))
|
||||
goto errout;
|
||||
|
||||
|
||||
/* Create new proto tcf */
|
||||
|
||||
err = -ENOBUFS;
|
||||
tp = kzalloc(sizeof(*tp), GFP_KERNEL);
|
||||
if (tp == NULL)
|
||||
goto errout;
|
||||
err = -ENOENT;
|
||||
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
|
||||
if (tp_ops == NULL) {
|
||||
#ifdef CONFIG_MODULES
|
||||
struct nlattr *kind = tca[TCA_KIND];
|
||||
char name[IFNAMSIZ];
|
||||
|
||||
if (kind != NULL &&
|
||||
nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
|
||||
rtnl_unlock();
|
||||
request_module("cls_%s", name);
|
||||
rtnl_lock();
|
||||
tp_ops = tcf_proto_lookup_ops(kind);
|
||||
/* We dropped the RTNL semaphore in order to
|
||||
* perform the module load. So, even if we
|
||||
* succeeded in loading the module we have to
|
||||
* replay the request. We indicate this using
|
||||
* -EAGAIN.
|
||||
*/
|
||||
if (tp_ops != NULL) {
|
||||
module_put(tp_ops->owner);
|
||||
err = -EAGAIN;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
kfree(tp);
|
||||
goto errout;
|
||||
}
|
||||
tp->ops = tp_ops;
|
||||
tp->protocol = protocol;
|
||||
tp->prio = nprio ? :
|
||||
TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
|
||||
tp->q = q;
|
||||
tp->classify = tp_ops->classify;
|
||||
tp->classid = parent;
|
||||
|
||||
err = tp_ops->init(tp);
|
||||
if (err != 0) {
|
||||
module_put(tp_ops->owner);
|
||||
kfree(tp);
|
||||
goto errout;
|
||||
}
|
||||
|
||||
tp_created = 1;
|
||||
|
||||
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
|
||||
goto errout;
|
||||
|
||||
fh = tp->ops->get(tp, t->tcm_handle);
|
||||
|
||||
if (fh == 0) {
|
||||
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
|
||||
struct tcf_proto *next = rtnl_dereference(tp->next);
|
||||
|
||||
RCU_INIT_POINTER(*back, next);
|
||||
|
||||
tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
|
||||
tcf_destroy(tp);
|
||||
err = 0;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
err = -ENOENT;
|
||||
if (n->nlmsg_type != RTM_NEWTFILTER ||
|
||||
!(n->nlmsg_flags & NLM_F_CREATE))
|
||||
goto errout;
|
||||
} else {
|
||||
switch (n->nlmsg_type) {
|
||||
case RTM_NEWTFILTER:
|
||||
err = -EEXIST;
|
||||
if (n->nlmsg_flags & NLM_F_EXCL) {
|
||||
if (tp_created)
|
||||
tcf_destroy(tp);
|
||||
goto errout;
|
||||
}
|
||||
break;
|
||||
case RTM_DELTFILTER:
|
||||
err = tp->ops->delete(tp, fh);
|
||||
if (err == 0)
|
||||
tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
|
||||
goto errout;
|
||||
case RTM_GETTFILTER:
|
||||
err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
|
||||
goto errout;
|
||||
default:
|
||||
err = -EINVAL;
|
||||
goto errout;
|
||||
}
|
||||
}
|
||||
|
||||
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
|
||||
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
|
||||
if (err == 0) {
|
||||
if (tp_created) {
|
||||
RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
|
||||
rcu_assign_pointer(*back, tp);
|
||||
}
|
||||
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
|
||||
} else {
|
||||
if (tp_created)
|
||||
tcf_destroy(tp);
|
||||
}
|
||||
|
||||
errout:
|
||||
if (cl)
|
||||
cops->put(q, cl);
|
||||
if (err == -EAGAIN)
|
||||
/* Replay the request. */
|
||||
goto replay;
|
||||
return err;
|
||||
}
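
For reference, tcm_info as decoded at the top of tc_ctl_tfilter() packs the
filter priority into its upper 16 bits and the (network-byte-order) protocol
into its lower 16 bits. Below is a minimal userspace sketch of that packing,
with the TC_H_* macros copied from the pkt_sched uapi header so it builds on
its own; the prio/protocol values are made up for illustration.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* htons(), ntohs() */
#include <linux/if_ether.h>	/* ETH_P_IP */

#define TC_H_MAJ(h)		((h) & 0xFFFF0000U)
#define TC_H_MIN(h)		((h) & 0x0000FFFFU)
#define TC_H_MAKE(maj, min)	(TC_H_MAJ(maj) | TC_H_MIN(min))

int main(void)
{
	uint32_t prio = 10;			/* "prio 10" */
	uint16_t protocol = htons(ETH_P_IP);	/* "protocol ip" */

	/* What userspace stores in tcm->tcm_info ... */
	uint32_t tcm_info = TC_H_MAKE(prio << 16, protocol);

	/* ... and what tc_ctl_tfilter() recovers from it. */
	printf("prio=%u protocol=0x%04x\n",
	       TC_H_MAJ(tcm_info) >> 16, ntohs(TC_H_MIN(tcm_info)));
	return 0;
}

This is also why priorities stay in their shifted form throughout the code
above: tcf_auto_prio(), for example, returns TC_H_MAKE(0xC0000000U, 0U)
rather than a small integer.
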
|
||||
|
||||
static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
|
||||
unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
|
||||
{
|
||||
struct tcmsg *tcm;
|
||||
struct nlmsghdr *nlh;
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
|
||||
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
|
||||
if (!nlh)
|
||||
goto out_nlmsg_trim;
|
||||
tcm = nlmsg_data(nlh);
|
||||
tcm->tcm_family = AF_UNSPEC;
|
||||
tcm->tcm__pad1 = 0;
|
||||
tcm->tcm__pad2 = 0;
|
||||
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
|
||||
tcm->tcm_parent = tp->classid;
|
||||
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
|
||||
if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
|
||||
goto nla_put_failure;
|
||||
tcm->tcm_handle = fh;
|
||||
if (RTM_DELTFILTER != event) {
|
||||
tcm->tcm_handle = 0;
|
||||
if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
|
||||
goto nla_put_failure;
|
||||
}
|
||||
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
|
||||
return skb->len;
|
||||
|
||||
out_nlmsg_trim:
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
|
||||
struct nlmsghdr *n, struct tcf_proto *tp,
|
||||
unsigned long fh, int event)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
|
||||
|
||||
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
|
||||
if (!skb)
|
||||
return -ENOBUFS;
|
||||
|
||||
if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
|
||||
kfree_skb(skb);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
|
||||
n->nlmsg_flags & NLM_F_ECHO);
|
||||
}
|
||||
|
||||
struct tcf_dump_args {
|
||||
struct tcf_walker w;
|
||||
struct sk_buff *skb;
|
||||
struct netlink_callback *cb;
|
||||
};
|
||||
|
||||
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
|
||||
struct tcf_walker *arg)
|
||||
{
|
||||
struct tcf_dump_args *a = (void *)arg;
|
||||
struct net *net = sock_net(a->skb->sk);
|
||||
|
||||
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
|
||||
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
|
||||
}
|
||||
|
||||
/* called with RTNL */
|
||||
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
|
||||
{
|
||||
struct net *net = sock_net(skb->sk);
|
||||
int t;
|
||||
int s_t;
|
||||
struct net_device *dev;
|
||||
struct Qdisc *q;
|
||||
struct tcf_proto *tp, __rcu **chain;
|
||||
struct tcmsg *tcm = nlmsg_data(cb->nlh);
|
||||
unsigned long cl = 0;
|
||||
const struct Qdisc_class_ops *cops;
|
||||
struct tcf_dump_args arg;
|
||||
|
||||
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
|
||||
return skb->len;
|
||||
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
|
||||
if (!dev)
|
||||
return skb->len;
|
||||
|
||||
if (!tcm->tcm_parent)
|
||||
q = dev->qdisc;
|
||||
else
|
||||
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
|
||||
if (!q)
|
||||
goto out;
|
||||
cops = q->ops->cl_ops;
|
||||
if (!cops)
|
||||
goto errout;
|
||||
if (cops->tcf_chain == NULL)
|
||||
goto errout;
|
||||
if (TC_H_MIN(tcm->tcm_parent)) {
|
||||
cl = cops->get(q, tcm->tcm_parent);
|
||||
if (cl == 0)
|
||||
goto errout;
|
||||
}
|
||||
chain = cops->tcf_chain(q, cl);
|
||||
if (chain == NULL)
|
||||
goto errout;
|
||||
|
||||
s_t = cb->args[0];
|
||||
|
||||
for (tp = rtnl_dereference(*chain), t = 0;
|
||||
tp; tp = rtnl_dereference(tp->next), t++) {
|
||||
if (t < s_t)
|
||||
continue;
|
||||
if (TC_H_MAJ(tcm->tcm_info) &&
|
||||
TC_H_MAJ(tcm->tcm_info) != tp->prio)
|
||||
continue;
|
||||
if (TC_H_MIN(tcm->tcm_info) &&
|
||||
TC_H_MIN(tcm->tcm_info) != tp->protocol)
|
||||
continue;
|
||||
if (t > s_t)
|
||||
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
|
||||
if (cb->args[1] == 0) {
|
||||
if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
|
||||
cb->nlh->nlmsg_seq, NLM_F_MULTI,
|
||||
RTM_NEWTFILTER) <= 0)
|
||||
break;
|
||||
|
||||
cb->args[1] = 1;
|
||||
}
|
||||
if (tp->ops->walk == NULL)
|
||||
continue;
|
||||
arg.w.fn = tcf_node_dump;
|
||||
arg.skb = skb;
|
||||
arg.cb = cb;
|
||||
arg.w.stop = 0;
|
||||
arg.w.skip = cb->args[1] - 1;
|
||||
arg.w.count = 0;
|
||||
tp->ops->walk(tp, &arg.w);
|
||||
cb->args[1] = arg.w.count + 1;
|
||||
if (arg.w.stop)
|
||||
break;
|
||||
}
|
||||
|
||||
cb->args[0] = t;
|
||||
|
||||
errout:
|
||||
if (cl)
|
||||
cops->put(q, cl);
|
||||
out:
|
||||
return skb->len;
|
||||
}
|
||||
|
||||
void tcf_exts_destroy(struct tcf_exts *exts)
|
||||
{
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
|
||||
INIT_LIST_HEAD(&exts->actions);
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_exts_destroy);
|
||||
|
||||
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
|
||||
struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
|
||||
{
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
{
|
||||
struct tc_action *act;
|
||||
|
||||
INIT_LIST_HEAD(&exts->actions);
|
||||
if (exts->police && tb[exts->police]) {
|
||||
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
|
||||
"police", ovr,
|
||||
TCA_ACT_BIND);
|
||||
if (IS_ERR(act))
|
||||
return PTR_ERR(act);
|
||||
|
||||
act->type = exts->type = TCA_OLD_COMPAT;
|
||||
list_add(&act->list, &exts->actions);
|
||||
} else if (exts->action && tb[exts->action]) {
|
||||
int err;
|
||||
err = tcf_action_init(net, tb[exts->action], rate_tlv,
|
||||
NULL, ovr,
|
||||
TCA_ACT_BIND, &exts->actions);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
}
|
||||
#else
|
||||
if ((exts->action && tb[exts->action]) ||
|
||||
(exts->police && tb[exts->police]))
|
||||
return -EOPNOTSUPP;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_exts_validate);
|
||||
|
||||
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
|
||||
struct tcf_exts *src)
|
||||
{
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
LIST_HEAD(tmp);
|
||||
tcf_tree_lock(tp);
|
||||
list_splice_init(&dst->actions, &tmp);
|
||||
list_splice(&src->actions, &dst->actions);
|
||||
dst->type = src->type;
|
||||
tcf_tree_unlock(tp);
|
||||
tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_exts_change);
|
||||
|
||||
#define tcf_exts_first_act(ext) \
	list_first_entry(&(ext)->actions, struct tc_action, list)
|
||||
|
||||
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
|
||||
{
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
struct nlattr *nest;
|
||||
|
||||
if (exts->action && !list_empty(&exts->actions)) {
|
||||
/*
|
||||
* again for backward compatible mode - we want
|
||||
* to work with both old and new modes of entering
|
||||
* tc data even if iproute2 was newer - jhs
|
||||
*/
|
||||
if (exts->type != TCA_OLD_COMPAT) {
|
||||
nest = nla_nest_start(skb, exts->action);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
|
||||
goto nla_put_failure;
|
||||
nla_nest_end(skb, nest);
|
||||
} else if (exts->police) {
|
||||
struct tc_action *act = tcf_exts_first_act(exts);
|
||||
nest = nla_nest_start(skb, exts->police);
|
||||
if (nest == NULL || !act)
|
||||
goto nla_put_failure;
|
||||
if (tcf_action_dump_old(skb, act, 0, 0) < 0)
|
||||
goto nla_put_failure;
|
||||
nla_nest_end(skb, nest);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
return -1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_exts_dump);
|
||||
|
||||
|
||||
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
|
||||
{
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
struct tc_action *a = tcf_exts_first_act(exts);
|
||||
if (tcf_action_copy_stats(skb, a, 1) < 0)
|
||||
return -1;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_exts_dump_stats);
|
||||
|
||||
static int __init tc_filter_init(void)
|
||||
{
|
||||
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
|
||||
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
|
||||
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
|
||||
tc_dump_tfilter, NULL);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
subsys_initcall(tc_filter_init);
317
net/sched/cls_basic.c
Normal file
@@ -0,0 +1,317 @@
/*
|
||||
* net/sched/cls_basic.c Basic Packet Classifier.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
struct basic_head {
|
||||
u32 hgenerator;
|
||||
struct list_head flist;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct basic_filter {
|
||||
u32 handle;
|
||||
struct tcf_exts exts;
|
||||
struct tcf_ematch_tree ematches;
|
||||
struct tcf_result res;
|
||||
struct tcf_proto *tp;
|
||||
struct list_head link;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
int r;
|
||||
struct basic_head *head = rcu_dereference_bh(tp->root);
|
||||
struct basic_filter *f;
|
||||
|
||||
list_for_each_entry_rcu(f, &head->flist, link) {
|
||||
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
|
||||
continue;
|
||||
*res = f->res;
|
||||
r = tcf_exts_exec(skb, &f->exts, res);
|
||||
if (r < 0)
|
||||
continue;
|
||||
return r;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
unsigned long l = 0UL;
|
||||
struct basic_head *head = rtnl_dereference(tp->root);
|
||||
struct basic_filter *f;
|
||||
|
||||
if (head == NULL)
|
||||
return 0UL;
|
||||
|
||||
list_for_each_entry(f, &head->flist, link)
|
||||
if (f->handle == handle)
|
||||
l = (unsigned long) f;
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
static void basic_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int basic_init(struct tcf_proto *tp)
|
||||
{
|
||||
struct basic_head *head;
|
||||
|
||||
head = kzalloc(sizeof(*head), GFP_KERNEL);
|
||||
if (head == NULL)
|
||||
return -ENOBUFS;
|
||||
INIT_LIST_HEAD(&head->flist);
|
||||
rcu_assign_pointer(tp->root, head);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void basic_delete_filter(struct rcu_head *head)
|
||||
{
|
||||
struct basic_filter *f = container_of(head, struct basic_filter, rcu);
|
||||
|
||||
tcf_exts_destroy(&f->exts);
|
||||
tcf_em_tree_destroy(&f->ematches);
|
||||
kfree(f);
|
||||
}
|
||||
|
||||
static void basic_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct basic_head *head = rtnl_dereference(tp->root);
|
||||
struct basic_filter *f, *n;
|
||||
|
||||
list_for_each_entry_safe(f, n, &head->flist, link) {
|
||||
list_del_rcu(&f->link);
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, basic_delete_filter);
|
||||
}
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
kfree_rcu(head, rcu);
|
||||
}
|
||||
|
||||
static int basic_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct basic_head *head = rtnl_dereference(tp->root);
|
||||
struct basic_filter *t, *f = (struct basic_filter *) arg;
|
||||
|
||||
list_for_each_entry(t, &head->flist, link)
|
||||
if (t == f) {
|
||||
list_del_rcu(&t->link);
|
||||
tcf_unbind_filter(tp, &t->res);
|
||||
call_rcu(&t->rcu, basic_delete_filter);
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
|
||||
[TCA_BASIC_CLASSID] = { .type = NLA_U32 },
|
||||
[TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
|
||||
};
|
||||
|
||||
static int basic_set_parms(struct net *net, struct tcf_proto *tp,
|
||||
struct basic_filter *f, unsigned long base,
|
||||
struct nlattr **tb,
|
||||
struct nlattr *est, bool ovr)
|
||||
{
|
||||
int err;
|
||||
struct tcf_exts e;
|
||||
struct tcf_ematch_tree t;
|
||||
|
||||
tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
if (tb[TCA_BASIC_CLASSID]) {
|
||||
f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
|
||||
tcf_bind_filter(tp, &f->res, base);
|
||||
}
|
||||
|
||||
tcf_exts_change(tp, &f->exts, &e);
|
||||
tcf_em_tree_change(tp, &f->ematches, &t);
|
||||
f->tp = tp;
|
||||
|
||||
return 0;
|
||||
errout:
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int basic_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base, u32 handle,
|
||||
struct nlattr **tca, unsigned long *arg, bool ovr)
|
||||
{
|
||||
int err;
|
||||
struct basic_head *head = rtnl_dereference(tp->root);
|
||||
struct nlattr *tb[TCA_BASIC_MAX + 1];
|
||||
struct basic_filter *fold = (struct basic_filter *) *arg;
|
||||
struct basic_filter *fnew;
|
||||
|
||||
if (tca[TCA_OPTIONS] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
|
||||
basic_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (fold != NULL) {
|
||||
if (handle && fold->handle != handle)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
err = -ENOBUFS;
|
||||
fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
|
||||
if (fnew == NULL)
|
||||
goto errout;
|
||||
|
||||
tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
|
||||
err = -EINVAL;
|
||||
if (handle) {
|
||||
fnew->handle = handle;
|
||||
} else if (fold) {
|
||||
fnew->handle = fold->handle;
|
||||
} else {
|
||||
unsigned int i = 0x80000000;
|
||||
do {
|
||||
if (++head->hgenerator == 0x7FFFFFFF)
|
||||
head->hgenerator = 1;
|
||||
} while (--i > 0 && basic_get(tp, head->hgenerator));
|
||||
|
||||
if (i <= 0) {
|
||||
pr_err("Insufficient number of handles\n");
|
||||
goto errout;
|
||||
}
|
||||
|
||||
fnew->handle = head->hgenerator;
|
||||
}
|
||||
|
||||
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
*arg = (unsigned long)fnew;
|
||||
|
||||
if (fold) {
|
||||
list_replace_rcu(&fold->link, &fnew->link);
|
||||
tcf_unbind_filter(tp, &fold->res);
|
||||
call_rcu(&fold->rcu, basic_delete_filter);
|
||||
} else {
|
||||
list_add_rcu(&fnew->link, &head->flist);
|
||||
}
|
||||
|
||||
return 0;
|
||||
errout:
|
||||
kfree(fnew);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct basic_head *head = rtnl_dereference(tp->root);
|
||||
struct basic_filter *f;
|
||||
|
||||
list_for_each_entry(f, &head->flist, link) {
|
||||
if (arg->count < arg->skip)
|
||||
goto skip;
|
||||
|
||||
if (arg->fn(tp, (unsigned long) f, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct basic_filter *f = (struct basic_filter *) fh;
|
||||
struct nlattr *nest;
|
||||
|
||||
if (f == NULL)
|
||||
return skb->len;
|
||||
|
||||
t->tcm_handle = f->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (f->res.classid &&
|
||||
nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &f->exts) < 0 ||
|
||||
tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_basic_ops __read_mostly = {
|
||||
.kind = "basic",
|
||||
.classify = basic_classify,
|
||||
.init = basic_init,
|
||||
.destroy = basic_destroy,
|
||||
.get = basic_get,
|
||||
.put = basic_put,
|
||||
.change = basic_change,
|
||||
.delete = basic_delete,
|
||||
.walk = basic_walk,
|
||||
.dump = basic_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_basic(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_basic_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_basic(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_basic_ops);
|
||||
}
|
||||
|
||||
module_init(init_basic)
|
||||
module_exit(exit_basic)
|
||||
MODULE_LICENSE("GPL");
384
net/sched/cls_bpf.c
Normal file
@@ -0,0 +1,384 @@
/*
|
||||
* Berkeley Packet Filter based traffic classifier
|
||||
*
|
||||
* Might be used to classify traffic through flexible, user-defined and
|
||||
* possibly JIT-ed BPF filters for traffic control as an alternative to
|
||||
* ematches.
|
||||
*
|
||||
* (C) 2013 Daniel Borkmann <dborkman@redhat.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2 as
|
||||
* published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/filter.h>
|
||||
#include <net/rtnetlink.h>
|
||||
#include <net/pkt_cls.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
|
||||
MODULE_DESCRIPTION("TC BPF based classifier");
|
||||
|
||||
struct cls_bpf_head {
|
||||
struct list_head plist;
|
||||
u32 hgen;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct cls_bpf_prog {
|
||||
struct bpf_prog *filter;
|
||||
struct sock_filter *bpf_ops;
|
||||
struct tcf_exts exts;
|
||||
struct tcf_result res;
|
||||
struct list_head link;
|
||||
u32 handle;
|
||||
u16 bpf_len;
|
||||
struct tcf_proto *tp;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
|
||||
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
|
||||
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
|
||||
[TCA_BPF_OPS] = { .type = NLA_BINARY,
|
||||
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
|
||||
};
|
||||
|
||||
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
|
||||
struct cls_bpf_prog *prog;
|
||||
int ret;
|
||||
|
||||
list_for_each_entry_rcu(prog, &head->plist, link) {
|
||||
int filter_res = BPF_PROG_RUN(prog->filter, skb);
|
||||
|
||||
if (filter_res == 0)
|
||||
continue;
|
||||
|
||||
*res = prog->res;
|
||||
if (filter_res != -1)
|
||||
res->classid = filter_res;
|
||||
|
||||
ret = tcf_exts_exec(skb, &prog->exts, res);
|
||||
if (ret < 0)
|
||||
continue;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int cls_bpf_init(struct tcf_proto *tp)
|
||||
{
|
||||
struct cls_bpf_head *head;
|
||||
|
||||
head = kzalloc(sizeof(*head), GFP_KERNEL);
|
||||
if (head == NULL)
|
||||
return -ENOBUFS;
|
||||
|
||||
INIT_LIST_HEAD_RCU(&head->plist);
|
||||
rcu_assign_pointer(tp->root, head);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
|
||||
{
|
||||
tcf_exts_destroy(&prog->exts);
|
||||
|
||||
bpf_prog_destroy(prog->filter);
|
||||
|
||||
kfree(prog->bpf_ops);
|
||||
kfree(prog);
|
||||
}
|
||||
|
||||
static void __cls_bpf_delete_prog(struct rcu_head *rcu)
|
||||
{
|
||||
struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
|
||||
|
||||
cls_bpf_delete_prog(prog->tp, prog);
|
||||
}
|
||||
|
||||
static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct cls_bpf_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
|
||||
|
||||
list_for_each_entry(prog, &head->plist, link) {
|
||||
if (prog == todel) {
|
||||
list_del_rcu(&prog->link);
|
||||
tcf_unbind_filter(tp, &prog->res);
|
||||
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
static void cls_bpf_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct cls_bpf_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_bpf_prog *prog, *tmp;
|
||||
|
||||
list_for_each_entry_safe(prog, tmp, &head->plist, link) {
|
||||
list_del_rcu(&prog->link);
|
||||
tcf_unbind_filter(tp, &prog->res);
|
||||
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
|
||||
}
|
||||
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
kfree_rcu(head, rcu);
|
||||
}
|
||||
|
||||
static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct cls_bpf_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_bpf_prog *prog;
|
||||
unsigned long ret = 0UL;
|
||||
|
||||
if (head == NULL)
|
||||
return 0UL;
|
||||
|
||||
list_for_each_entry_rcu(prog, &head->plist, link) {
|
||||
if (prog->handle == handle) {
|
||||
ret = (unsigned long) prog;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
|
||||
struct cls_bpf_prog *prog,
|
||||
unsigned long base, struct nlattr **tb,
|
||||
struct nlattr *est, bool ovr)
|
||||
{
|
||||
struct sock_filter *bpf_ops;
|
||||
struct tcf_exts exts;
|
||||
struct sock_fprog_kern tmp;
|
||||
struct bpf_prog *fp;
|
||||
u16 bpf_size, bpf_len;
|
||||
u32 classid;
|
||||
int ret;
|
||||
|
||||
if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
|
||||
return -EINVAL;
|
||||
|
||||
tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
|
||||
ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
|
||||
bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
|
||||
if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
|
||||
ret = -EINVAL;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
bpf_size = bpf_len * sizeof(*bpf_ops);
|
||||
bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
|
||||
if (bpf_ops == NULL) {
|
||||
ret = -ENOMEM;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
|
||||
|
||||
tmp.len = bpf_len;
|
||||
tmp.filter = bpf_ops;
|
||||
|
||||
ret = bpf_prog_create(&fp, &tmp);
|
||||
if (ret)
|
||||
goto errout_free;
|
||||
|
||||
prog->bpf_len = bpf_len;
|
||||
prog->bpf_ops = bpf_ops;
|
||||
prog->filter = fp;
|
||||
prog->res.classid = classid;
|
||||
|
||||
tcf_bind_filter(tp, &prog->res, base);
|
||||
tcf_exts_change(tp, &prog->exts, &exts);
|
||||
|
||||
return 0;
|
||||
errout_free:
|
||||
kfree(bpf_ops);
|
||||
errout:
|
||||
tcf_exts_destroy(&exts);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
|
||||
struct cls_bpf_head *head)
|
||||
{
|
||||
unsigned int i = 0x80000000;
|
||||
|
||||
do {
|
||||
if (++head->hgen == 0x7FFFFFFF)
|
||||
head->hgen = 1;
|
||||
} while (--i > 0 && cls_bpf_get(tp, head->hgen));
|
||||
if (i == 0)
|
||||
pr_err("Insufficient number of handles\n");
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle, struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct cls_bpf_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
|
||||
struct nlattr *tb[TCA_BPF_MAX + 1];
|
||||
struct cls_bpf_prog *prog;
|
||||
int ret;
|
||||
|
||||
if (tca[TCA_OPTIONS] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
prog = kzalloc(sizeof(*prog), GFP_KERNEL);
|
||||
if (!prog)
|
||||
return -ENOBUFS;
|
||||
|
||||
tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
|
||||
|
||||
if (oldprog) {
|
||||
if (handle && oldprog->handle != handle) {
|
||||
ret = -EINVAL;
|
||||
goto errout;
|
||||
}
|
||||
}
|
||||
|
||||
if (handle == 0)
|
||||
prog->handle = cls_bpf_grab_new_handle(tp, head);
|
||||
else
|
||||
prog->handle = handle;
|
||||
if (prog->handle == 0) {
|
||||
ret = -EINVAL;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
|
||||
if (ret < 0)
|
||||
goto errout;
|
||||
|
||||
if (oldprog) {
|
||||
list_replace_rcu(&oldprog->link, &prog->link);
|
||||
tcf_unbind_filter(tp, &oldprog->res);
|
||||
call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
|
||||
} else {
|
||||
list_add_rcu(&prog->link, &head->plist);
|
||||
}
|
||||
|
||||
*arg = (unsigned long) prog;
|
||||
return 0;
|
||||
errout:
|
||||
kfree(prog);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *tm)
|
||||
{
|
||||
struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
|
||||
struct nlattr *nest, *nla;
|
||||
|
||||
if (prog == NULL)
|
||||
return skb->len;
|
||||
|
||||
tm->tcm_handle = prog->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
|
||||
goto nla_put_failure;
|
||||
if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
|
||||
goto nla_put_failure;
|
||||
|
||||
nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
|
||||
sizeof(struct sock_filter));
|
||||
if (nla == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
|
||||
|
||||
if (tcf_exts_dump(skb, &prog->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct cls_bpf_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_bpf_prog *prog;
|
||||
|
||||
list_for_each_entry_rcu(prog, &head->plist, link) {
|
||||
if (arg->count < arg->skip)
|
||||
goto skip;
|
||||
if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
|
||||
.kind = "bpf",
|
||||
.owner = THIS_MODULE,
|
||||
.classify = cls_bpf_classify,
|
||||
.init = cls_bpf_init,
|
||||
.destroy = cls_bpf_destroy,
|
||||
.get = cls_bpf_get,
|
||||
.put = cls_bpf_put,
|
||||
.change = cls_bpf_change,
|
||||
.delete = cls_bpf_delete,
|
||||
.walk = cls_bpf_walk,
|
||||
.dump = cls_bpf_dump,
|
||||
};
|
||||
|
||||
static int __init cls_bpf_init_mod(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_bpf_ops);
|
||||
}
|
||||
|
||||
static void __exit cls_bpf_exit_mod(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_bpf_ops);
|
||||
}
|
||||
|
||||
module_init(cls_bpf_init_mod);
|
||||
module_exit(cls_bpf_exit_mod);
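
In cls_bpf_classify() above, the classic BPF program's return value is
interpreted as follows: 0 means "no match, try the next program", -1 (all
ones) means "match, keep the classid configured via TCA_BPF_CLASSID", and any
other value overrides the classid directly. Here is a rough userspace sketch,
using invented values, of the sock_filter array a tool such as tc would pass
in TCA_BPF_OPS / TCA_BPF_OPS_LEN; the netlink plumbing itself is omitted.

#include <stdint.h>
#include <stdio.h>
#include <linux/filter.h>	/* struct sock_filter, BPF_STMT, BPF_JUMP */
#include <linux/if_ether.h>	/* ETH_P_IP */

int main(void)
{
	/* Match IPv4 packets only: return -1 (use the configured classid)
	 * for ETH_P_IP, 0 (no match) for everything else. */
	struct sock_filter ops[] = {
		BPF_STMT(BPF_LD | BPF_H | BPF_ABS,
			 SKF_AD_OFF + SKF_AD_PROTOCOL),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, (uint32_t)-1),
		BPF_STMT(BPF_RET | BPF_K, 0),
	};

	/* These two values would go into TCA_BPF_OPS_LEN and TCA_BPF_OPS,
	 * alongside a TCA_BPF_CLASSID attribute naming the target class. */
	uint16_t ops_len = sizeof(ops) / sizeof(ops[0]);

	printf("%u instructions, %zu bytes of TCA_BPF_OPS payload\n",
	       ops_len, sizeof(ops));
	return 0;
}
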
239
net/sched/cls_cgroup.c
Normal file
@@ -0,0 +1,239 @@
/*
|
||||
* net/sched/cls_cgroup.c Control Group Classifier
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <net/rtnetlink.h>
|
||||
#include <net/pkt_cls.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/cls_cgroup.h>
|
||||
|
||||
struct cls_cgroup_head {
|
||||
u32 handle;
|
||||
struct tcf_exts exts;
|
||||
struct tcf_ematch_tree ematches;
|
||||
struct tcf_proto *tp;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
|
||||
u32 classid;
|
||||
|
||||
classid = task_cls_state(current)->classid;
|
||||
|
||||
/*
|
||||
* Due to the nature of the classifier it is required to ignore all
|
||||
* packets originating from softirq context as accessing `current'
|
||||
* would lead to false results.
|
||||
*
|
||||
* This test assumes that all callers of dev_queue_xmit() explicitly
|
||||
* disable bh. Knowing this, it is possible to detect softirq based
|
||||
* calls by looking at the number of nested bh disable calls because
|
||||
* softirqs always disable bh.
|
||||
*/
|
||||
if (in_serving_softirq()) {
|
||||
/* If there is an sk_classid we'll use that. */
|
||||
if (!skb->sk)
|
||||
return -1;
|
||||
classid = skb->sk->sk_classid;
|
||||
}
|
||||
|
||||
if (!classid)
|
||||
return -1;
|
||||
|
||||
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
|
||||
return -1;
|
||||
|
||||
res->classid = classid;
|
||||
res->class = 0;
|
||||
return tcf_exts_exec(skb, &head->exts, res);
|
||||
}
|
||||
|
||||
static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
return 0UL;
|
||||
}
|
||||
|
||||
static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int cls_cgroup_init(struct tcf_proto *tp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
|
||||
[TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
|
||||
};
|
||||
|
||||
static void cls_cgroup_destroy_rcu(struct rcu_head *root)
|
||||
{
|
||||
struct cls_cgroup_head *head = container_of(root,
|
||||
struct cls_cgroup_head,
|
||||
rcu);
|
||||
|
||||
tcf_exts_destroy(&head->exts);
|
||||
tcf_em_tree_destroy(&head->ematches);
|
||||
kfree(head);
|
||||
}
|
||||
|
||||
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle, struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct nlattr *tb[TCA_CGROUP_MAX + 1];
|
||||
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
|
||||
struct cls_cgroup_head *new;
|
||||
struct tcf_ematch_tree t;
|
||||
struct tcf_exts e;
|
||||
int err;
|
||||
|
||||
if (!tca[TCA_OPTIONS])
|
||||
return -EINVAL;
|
||||
|
||||
if (!head && !handle)
|
||||
return -EINVAL;
|
||||
|
||||
if (head && handle != head->handle)
|
||||
return -ENOENT;
|
||||
|
||||
new = kzalloc(sizeof(*head), GFP_KERNEL);
|
||||
if (!new)
|
||||
return -ENOBUFS;
|
||||
|
||||
tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
|
||||
if (head)
|
||||
new->handle = head->handle;
|
||||
else
|
||||
new->handle = handle;
|
||||
|
||||
new->tp = tp;
|
||||
err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
|
||||
cgroup_policy);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
|
||||
if (err < 0) {
|
||||
tcf_exts_destroy(&e);
|
||||
goto errout;
|
||||
}
|
||||
|
||||
tcf_exts_change(tp, &new->exts, &e);
|
||||
tcf_em_tree_change(tp, &new->ematches, &t);
|
||||
|
||||
rcu_assign_pointer(tp->root, new);
|
||||
if (head)
|
||||
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
|
||||
return 0;
|
||||
errout:
|
||||
kfree(new);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void cls_cgroup_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
|
||||
|
||||
if (head) {
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
|
||||
}
|
||||
}
|
||||
|
||||
static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
|
||||
|
||||
if (arg->count < arg->skip)
|
||||
goto skip;
|
||||
|
||||
if (arg->fn(tp, (unsigned long) head, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
return;
|
||||
}
|
||||
skip:
|
||||
arg->count++;
|
||||
}
|
||||
|
||||
static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct nlattr *nest;
|
||||
|
||||
t->tcm_handle = head->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &head->exts) < 0 ||
|
||||
tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &head->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
|
||||
.kind = "cgroup",
|
||||
.init = cls_cgroup_init,
|
||||
.change = cls_cgroup_change,
|
||||
.classify = cls_cgroup_classify,
|
||||
.destroy = cls_cgroup_destroy,
|
||||
.get = cls_cgroup_get,
|
||||
.put = cls_cgroup_put,
|
||||
.delete = cls_cgroup_delete,
|
||||
.walk = cls_cgroup_walk,
|
||||
.dump = cls_cgroup_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_cgroup_cls(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_cgroup_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_cgroup_cls(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_cgroup_ops);
|
||||
}
|
||||
|
||||
module_init(init_cgroup_cls);
|
||||
module_exit(exit_cgroup_cls);
|
||||
MODULE_LICENSE("GPL");
698
net/sched/cls_flow.c
Normal file
@@ -0,0 +1,698 @@
/*
|
||||
* net/sched/cls_flow.c Generic flow classifier
|
||||
*
|
||||
* Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License, or (at your option) any later version.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/pkt_cls.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/ip.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <net/pkt_cls.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/route.h>
|
||||
#include <net/flow_keys.h>
|
||||
|
||||
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
|
||||
#include <net/netfilter/nf_conntrack.h>
|
||||
#endif
|
||||
|
||||
struct flow_head {
|
||||
struct list_head filters;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct flow_filter {
|
||||
struct list_head list;
|
||||
struct tcf_exts exts;
|
||||
struct tcf_ematch_tree ematches;
|
||||
struct tcf_proto *tp;
|
||||
struct timer_list perturb_timer;
|
||||
u32 perturb_period;
|
||||
u32 handle;
|
||||
|
||||
u32 nkeys;
|
||||
u32 keymask;
|
||||
u32 mode;
|
||||
u32 mask;
|
||||
u32 xor;
|
||||
u32 rshift;
|
||||
u32 addend;
|
||||
u32 divisor;
|
||||
u32 baseclass;
|
||||
u32 hashrnd;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static inline u32 addr_fold(void *addr)
|
||||
{
|
||||
unsigned long a = (unsigned long)addr;
|
||||
|
||||
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
|
||||
}
|
||||
|
||||
static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
if (flow->src)
|
||||
return ntohl(flow->src);
|
||||
return addr_fold(skb->sk);
|
||||
}
|
||||
|
||||
static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
if (flow->dst)
|
||||
return ntohl(flow->dst);
|
||||
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
|
||||
}
|
||||
|
||||
static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
return flow->ip_proto;
|
||||
}
|
||||
|
||||
static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
if (flow->ports)
|
||||
return ntohs(flow->port16[0]);
|
||||
|
||||
return addr_fold(skb->sk);
|
||||
}
|
||||
|
||||
static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
if (flow->ports)
|
||||
return ntohs(flow->port16[1]);
|
||||
|
||||
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
|
||||
}
|
||||
|
||||
static u32 flow_get_iif(const struct sk_buff *skb)
|
||||
{
|
||||
return skb->skb_iif;
|
||||
}
|
||||
|
||||
static u32 flow_get_priority(const struct sk_buff *skb)
|
||||
{
|
||||
return skb->priority;
|
||||
}
|
||||
|
||||
static u32 flow_get_mark(const struct sk_buff *skb)
|
||||
{
|
||||
return skb->mark;
|
||||
}
|
||||
|
||||
static u32 flow_get_nfct(const struct sk_buff *skb)
|
||||
{
|
||||
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
|
||||
return addr_fold(skb->nfct);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
|
||||
#define CTTUPLE(skb, member) \
|
||||
({ \
|
||||
enum ip_conntrack_info ctinfo; \
|
||||
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
|
||||
if (ct == NULL) \
|
||||
goto fallback; \
|
||||
ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
|
||||
})
|
||||
#else
|
||||
#define CTTUPLE(skb, member) \
|
||||
({ \
|
||||
goto fallback; \
|
||||
0; \
|
||||
})
|
||||
#endif
|
||||
|
||||
static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
switch (skb->protocol) {
|
||||
case htons(ETH_P_IP):
|
||||
return ntohl(CTTUPLE(skb, src.u3.ip));
|
||||
case htons(ETH_P_IPV6):
|
||||
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
|
||||
}
|
||||
fallback:
|
||||
return flow_get_src(skb, flow);
|
||||
}
|
||||
|
||||
static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
switch (skb->protocol) {
|
||||
case htons(ETH_P_IP):
|
||||
return ntohl(CTTUPLE(skb, dst.u3.ip));
|
||||
case htons(ETH_P_IPV6):
|
||||
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
|
||||
}
|
||||
fallback:
|
||||
return flow_get_dst(skb, flow);
|
||||
}
|
||||
|
||||
static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
return ntohs(CTTUPLE(skb, src.u.all));
|
||||
fallback:
|
||||
return flow_get_proto_src(skb, flow);
|
||||
}
|
||||
|
||||
static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
|
||||
{
|
||||
return ntohs(CTTUPLE(skb, dst.u.all));
|
||||
fallback:
|
||||
return flow_get_proto_dst(skb, flow);
|
||||
}
|
||||
|
||||
static u32 flow_get_rtclassid(const struct sk_buff *skb)
|
||||
{
|
||||
#ifdef CONFIG_IP_ROUTE_CLASSID
|
||||
if (skb_dst(skb))
|
||||
return skb_dst(skb)->tclassid;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u32 flow_get_skuid(const struct sk_buff *skb)
|
||||
{
|
||||
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
|
||||
kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
|
||||
return from_kuid(&init_user_ns, skuid);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u32 flow_get_skgid(const struct sk_buff *skb)
|
||||
{
|
||||
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
|
||||
kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
|
||||
return from_kgid(&init_user_ns, skgid);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u32 flow_get_vlan_tag(const struct sk_buff *skb)
|
||||
{
|
||||
u16 uninitialized_var(tag);
|
||||
|
||||
if (vlan_get_tag(skb, &tag) < 0)
|
||||
return 0;
|
||||
return tag & VLAN_VID_MASK;
|
||||
}
|
||||
|
||||
static u32 flow_get_rxhash(struct sk_buff *skb)
|
||||
{
|
||||
return skb_get_hash(skb);
|
||||
}
|
||||
|
||||
static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
|
||||
{
|
||||
switch (key) {
|
||||
case FLOW_KEY_SRC:
|
||||
return flow_get_src(skb, flow);
|
||||
case FLOW_KEY_DST:
|
||||
return flow_get_dst(skb, flow);
|
||||
case FLOW_KEY_PROTO:
|
||||
return flow_get_proto(skb, flow);
|
||||
case FLOW_KEY_PROTO_SRC:
|
||||
return flow_get_proto_src(skb, flow);
|
||||
case FLOW_KEY_PROTO_DST:
|
||||
return flow_get_proto_dst(skb, flow);
|
||||
case FLOW_KEY_IIF:
|
||||
return flow_get_iif(skb);
|
||||
case FLOW_KEY_PRIORITY:
|
||||
return flow_get_priority(skb);
|
||||
case FLOW_KEY_MARK:
|
||||
return flow_get_mark(skb);
|
||||
case FLOW_KEY_NFCT:
|
||||
return flow_get_nfct(skb);
|
||||
case FLOW_KEY_NFCT_SRC:
|
||||
return flow_get_nfct_src(skb, flow);
|
||||
case FLOW_KEY_NFCT_DST:
|
||||
return flow_get_nfct_dst(skb, flow);
|
||||
case FLOW_KEY_NFCT_PROTO_SRC:
|
||||
return flow_get_nfct_proto_src(skb, flow);
|
||||
case FLOW_KEY_NFCT_PROTO_DST:
|
||||
return flow_get_nfct_proto_dst(skb, flow);
|
||||
case FLOW_KEY_RTCLASSID:
|
||||
return flow_get_rtclassid(skb);
|
||||
case FLOW_KEY_SKUID:
|
||||
return flow_get_skuid(skb);
|
||||
case FLOW_KEY_SKGID:
|
||||
return flow_get_skgid(skb);
|
||||
case FLOW_KEY_VLAN_TAG:
|
||||
return flow_get_vlan_tag(skb);
|
||||
case FLOW_KEY_RXHASH:
|
||||
return flow_get_rxhash(skb);
|
||||
default:
|
||||
WARN_ON(1);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
|
||||
(1 << FLOW_KEY_DST) | \
|
||||
(1 << FLOW_KEY_PROTO) | \
|
||||
(1 << FLOW_KEY_PROTO_SRC) | \
|
||||
(1 << FLOW_KEY_PROTO_DST) | \
|
||||
(1 << FLOW_KEY_NFCT_SRC) | \
|
||||
(1 << FLOW_KEY_NFCT_DST) | \
|
||||
(1 << FLOW_KEY_NFCT_PROTO_SRC) | \
|
||||
(1 << FLOW_KEY_NFCT_PROTO_DST))
|
||||
|
||||
static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct flow_head *head = rcu_dereference_bh(tp->root);
|
||||
struct flow_filter *f;
|
||||
u32 keymask;
|
||||
u32 classid;
|
||||
unsigned int n, key;
|
||||
int r;
|
||||
|
||||
list_for_each_entry_rcu(f, &head->filters, list) {
|
||||
u32 keys[FLOW_KEY_MAX + 1];
|
||||
struct flow_keys flow_keys;
|
||||
|
||||
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
|
||||
continue;
|
||||
|
||||
keymask = f->keymask;
|
||||
if (keymask & FLOW_KEYS_NEEDED)
|
||||
skb_flow_dissect(skb, &flow_keys);
|
||||
|
||||
for (n = 0; n < f->nkeys; n++) {
|
||||
key = ffs(keymask) - 1;
|
||||
keymask &= ~(1 << key);
|
||||
keys[n] = flow_key_get(skb, key, &flow_keys);
|
||||
}
|
||||
|
||||
if (f->mode == FLOW_MODE_HASH)
|
||||
classid = jhash2(keys, f->nkeys, f->hashrnd);
|
||||
else {
|
||||
classid = keys[0];
|
||||
classid = (classid & f->mask) ^ f->xor;
|
||||
classid = (classid >> f->rshift) + f->addend;
|
||||
}
|
||||
|
||||
if (f->divisor)
|
||||
classid %= f->divisor;
|
||||
|
||||
res->class = 0;
|
||||
res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
|
||||
|
||||
r = tcf_exts_exec(skb, &f->exts, res);
|
||||
if (r < 0)
|
||||
continue;
|
||||
return r;
|
||||
}
|
||||
return -1;
|
||||
}
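
In the non-hash branch above (FLOW_MODE_MAP), the first selected key is folded
into a class number with a mask/xor/shift/addend pipeline, optionally reduced
modulo the divisor, and finally made relative to the configured baseclass via
TC_H_MAKE(). Below is a small userspace sketch of that arithmetic with
invented parameter values; the real ones come from the TCA_FLOW_* attributes
parsed by flow_change() further down.

#include <stdint.h>
#include <stdio.h>

static uint32_t flow_map(uint32_t key, uint32_t mask, uint32_t xor,
			 uint32_t rshift, uint32_t addend, uint32_t divisor)
{
	uint32_t classid = (key & mask) ^ xor;

	classid = (classid >> rshift) + addend;
	if (divisor)
		classid %= divisor;
	return classid;
}

int main(void)
{
	/* e.g. "flow map key mark ... divisor 16": fold skb->mark 0x2a
	 * into one of 16 minor ids relative to the baseclass. */
	uint32_t off = flow_map(0x2a, ~0U, 0, 0, 0, 16);

	printf("baseclass offset: %u\n", off);	/* 42 mod 16 == 10 */
	return 0;
}
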
|
||||
|
||||
static void flow_perturbation(unsigned long arg)
|
||||
{
|
||||
struct flow_filter *f = (struct flow_filter *)arg;
|
||||
|
||||
get_random_bytes(&f->hashrnd, 4);
|
||||
if (f->perturb_period)
|
||||
mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
|
||||
}
|
||||
|
||||
static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
|
||||
[TCA_FLOW_KEYS] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_MODE] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_ADDEND] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_MASK] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_XOR] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_DIVISOR] = { .type = NLA_U32 },
|
||||
[TCA_FLOW_ACT] = { .type = NLA_NESTED },
|
||||
[TCA_FLOW_POLICE] = { .type = NLA_NESTED },
|
||||
[TCA_FLOW_EMATCHES] = { .type = NLA_NESTED },
|
||||
[TCA_FLOW_PERTURB] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static void flow_destroy_filter(struct rcu_head *head)
|
||||
{
|
||||
struct flow_filter *f = container_of(head, struct flow_filter, rcu);
|
||||
|
||||
del_timer_sync(&f->perturb_timer);
|
||||
tcf_exts_destroy(&f->exts);
|
||||
tcf_em_tree_destroy(&f->ematches);
|
||||
kfree(f);
|
||||
}
|
||||
|
||||
static int flow_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle, struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct flow_head *head = rtnl_dereference(tp->root);
|
||||
struct flow_filter *fold, *fnew;
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_FLOW_MAX + 1];
|
||||
struct tcf_exts e;
|
||||
struct tcf_ematch_tree t;
|
||||
unsigned int nkeys = 0;
|
||||
unsigned int perturb_period = 0;
|
||||
u32 baseclass = 0;
|
||||
u32 keymask = 0;
|
||||
u32 mode;
|
||||
int err;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_FLOW_BASECLASS]) {
|
||||
baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
|
||||
if (TC_H_MIN(baseclass) == 0)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (tb[TCA_FLOW_KEYS]) {
|
||||
keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
|
||||
|
||||
nkeys = hweight32(keymask);
|
||||
if (nkeys == 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (fls(keymask) - 1 > FLOW_KEY_MAX)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
|
||||
sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
|
||||
if (err < 0)
|
||||
goto err1;
|
||||
|
||||
err = -ENOBUFS;
|
||||
fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
|
||||
if (!fnew)
|
||||
goto err2;
|
||||
|
||||
fold = (struct flow_filter *)*arg;
|
||||
if (fold) {
|
||||
err = -EINVAL;
|
||||
if (fold->handle != handle && handle)
|
||||
goto err2;
|
||||
|
||||
/* Copy fold into fnew */
fnew->tp = fold->tp;
fnew->handle = fold->handle;
|
||||
fnew->nkeys = fold->nkeys;
|
||||
fnew->keymask = fold->keymask;
|
||||
fnew->mode = fold->mode;
|
||||
fnew->mask = fold->mask;
|
||||
fnew->xor = fold->xor;
|
||||
fnew->rshift = fold->rshift;
|
||||
fnew->addend = fold->addend;
|
||||
fnew->divisor = fold->divisor;
|
||||
fnew->baseclass = fold->baseclass;
|
||||
fnew->hashrnd = fold->hashrnd;
|
||||
|
||||
mode = fold->mode;
|
||||
if (tb[TCA_FLOW_MODE])
|
||||
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
|
||||
if (mode != FLOW_MODE_HASH && nkeys > 1)
|
||||
goto err2;
|
||||
|
||||
if (mode == FLOW_MODE_HASH)
|
||||
perturb_period = fold->perturb_period;
|
||||
if (tb[TCA_FLOW_PERTURB]) {
|
||||
if (mode != FLOW_MODE_HASH)
|
||||
goto err2;
|
||||
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
|
||||
}
|
||||
} else {
|
||||
err = -EINVAL;
|
||||
if (!handle)
|
||||
goto err2;
|
||||
if (!tb[TCA_FLOW_KEYS])
|
||||
goto err2;
|
||||
|
||||
mode = FLOW_MODE_MAP;
|
||||
if (tb[TCA_FLOW_MODE])
|
||||
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
|
||||
if (mode != FLOW_MODE_HASH && nkeys > 1)
|
||||
goto err2;
|
||||
|
||||
if (tb[TCA_FLOW_PERTURB]) {
|
||||
if (mode != FLOW_MODE_HASH)
|
||||
goto err2;
|
||||
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
|
||||
}
|
||||
|
||||
if (TC_H_MAJ(baseclass) == 0)
|
||||
baseclass = TC_H_MAKE(tp->q->handle, baseclass);
|
||||
if (TC_H_MIN(baseclass) == 0)
|
||||
baseclass = TC_H_MAKE(baseclass, 1);
|
||||
|
||||
fnew->handle = handle;
|
||||
fnew->mask = ~0U;
|
||||
fnew->tp = tp;
|
||||
get_random_bytes(&fnew->hashrnd, 4);
|
||||
tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
|
||||
}
|
||||
|
||||
fnew->perturb_timer.function = flow_perturbation;
|
||||
fnew->perturb_timer.data = (unsigned long)fnew;
|
||||
init_timer_deferrable(&fnew->perturb_timer);
|
||||
|
||||
tcf_exts_change(tp, &fnew->exts, &e);
|
||||
tcf_em_tree_change(tp, &fnew->ematches, &t);
|
||||
|
||||
netif_keep_dst(qdisc_dev(tp->q));
|
||||
|
||||
if (tb[TCA_FLOW_KEYS]) {
|
||||
fnew->keymask = keymask;
|
||||
fnew->nkeys = nkeys;
|
||||
}
|
||||
|
||||
fnew->mode = mode;
|
||||
|
||||
if (tb[TCA_FLOW_MASK])
|
||||
fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
|
||||
if (tb[TCA_FLOW_XOR])
|
||||
fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
|
||||
if (tb[TCA_FLOW_RSHIFT])
|
||||
fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
|
||||
if (tb[TCA_FLOW_ADDEND])
|
||||
fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
|
||||
|
||||
if (tb[TCA_FLOW_DIVISOR])
|
||||
fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
|
||||
if (baseclass)
|
||||
fnew->baseclass = baseclass;
|
||||
|
||||
fnew->perturb_period = perturb_period;
|
||||
if (perturb_period)
|
||||
mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
|
||||
|
||||
if (*arg == 0)
|
||||
list_add_tail_rcu(&fnew->list, &head->filters);
|
||||
else
|
||||
list_replace_rcu(&fold->list, &fnew->list);
|
||||
|
||||
*arg = (unsigned long)fnew;
|
||||
|
||||
if (fold)
|
||||
call_rcu(&fold->rcu, flow_destroy_filter);
|
||||
return 0;
|
||||
|
||||
err2:
|
||||
tcf_em_tree_destroy(&t);
|
||||
kfree(fnew);
|
||||
err1:
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int flow_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct flow_filter *f = (struct flow_filter *)arg;
|
||||
|
||||
list_del_rcu(&f->list);
|
||||
call_rcu(&f->rcu, flow_destroy_filter);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int flow_init(struct tcf_proto *tp)
|
||||
{
|
||||
struct flow_head *head;
|
||||
|
||||
head = kzalloc(sizeof(*head), GFP_KERNEL);
|
||||
if (head == NULL)
|
||||
return -ENOBUFS;
|
||||
INIT_LIST_HEAD(&head->filters);
|
||||
rcu_assign_pointer(tp->root, head);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void flow_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct flow_head *head = rtnl_dereference(tp->root);
|
||||
struct flow_filter *f, *next;
|
||||
|
||||
list_for_each_entry_safe(f, next, &head->filters, list) {
|
||||
list_del_rcu(&f->list);
|
||||
call_rcu(&f->rcu, flow_destroy_filter);
|
||||
}
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
kfree_rcu(head, rcu);
|
||||
}
|
||||
|
||||
static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct flow_head *head = rtnl_dereference(tp->root);
|
||||
struct flow_filter *f;
|
||||
|
||||
list_for_each_entry_rcu(f, &head->filters, list)
|
||||
if (f->handle == handle)
|
||||
return (unsigned long)f;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void flow_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct flow_filter *f = (struct flow_filter *)fh;
|
||||
struct nlattr *nest;
|
||||
|
||||
if (f == NULL)
|
||||
return skb->len;
|
||||
|
||||
t->tcm_handle = f->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
|
||||
nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (f->mask != ~0 || f->xor != 0) {
|
||||
if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
|
||||
nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
if (f->rshift &&
|
||||
nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
|
||||
goto nla_put_failure;
|
||||
if (f->addend &&
|
||||
nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (f->divisor &&
|
||||
nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
|
||||
goto nla_put_failure;
|
||||
if (f->baseclass &&
|
||||
nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (f->perturb_period &&
|
||||
nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
#ifdef CONFIG_NET_EMATCH
|
||||
if (f->ematches.hdr.nmatches &&
|
||||
tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
|
||||
goto nla_put_failure;
|
||||
#endif
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, nest);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct flow_head *head = rtnl_dereference(tp->root);
|
||||
struct flow_filter *f;
|
||||
|
||||
list_for_each_entry_rcu(f, &head->filters, list) {
|
||||
if (arg->count < arg->skip)
|
||||
goto skip;
|
||||
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_flow_ops __read_mostly = {
|
||||
.kind = "flow",
|
||||
.classify = flow_classify,
|
||||
.init = flow_init,
|
||||
.destroy = flow_destroy,
|
||||
.change = flow_change,
|
||||
.delete = flow_delete,
|
||||
.get = flow_get,
|
||||
.put = flow_put,
|
||||
.dump = flow_dump,
|
||||
.walk = flow_walk,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init cls_flow_init(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_flow_ops);
|
||||
}
|
||||
|
||||
static void __exit cls_flow_exit(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_flow_ops);
|
||||
}
|
||||
|
||||
module_init(cls_flow_init);
|
||||
module_exit(cls_flow_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
|
||||
MODULE_DESCRIPTION("TC flow classifier");
|
||||
434
net/sched/cls_fw.c
Normal file
@@ -0,0 +1,434 @@
/*
|
||||
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* Changes:
|
||||
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
|
||||
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
|
||||
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
|
||||
*
|
||||
* JHS: We should remove the CONFIG_NET_CLS_IND from here
|
||||
* eventually when the meta match extension is made available
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
#define HTSIZE 256
|
||||
|
||||
struct fw_head {
|
||||
u32 mask;
|
||||
struct fw_filter __rcu *ht[HTSIZE];
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct fw_filter {
|
||||
struct fw_filter __rcu *next;
|
||||
u32 id;
|
||||
struct tcf_result res;
|
||||
#ifdef CONFIG_NET_CLS_IND
|
||||
int ifindex;
|
||||
#endif /* CONFIG_NET_CLS_IND */
|
||||
struct tcf_exts exts;
|
||||
struct tcf_proto *tp;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static u32 fw_hash(u32 handle)
|
||||
{
|
||||
handle ^= (handle >> 16);
|
||||
handle ^= (handle >> 8);
|
||||
return handle % HTSIZE;
|
||||
}
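/* Worked example (the mark value is hypothetical, not taken from this
 * file): with skb->mark == 0x12345678 and the default all-ones mask,
 * fw_hash() folds the upper bytes into the lower ones before reducing
 * modulo HTSIZE:
 *
 *   0x12345678 ^ (0x12345678 >> 16)  ->  0x1234444c
 *   0x1234444c ^ (0x1234444c >>  8)  ->  0x12267008
 *   0x12267008 % 256                 ->  0x08   (bucket 8)
 *
 * so marks that differ only in their upper bytes still spread across
 * the 256 buckets instead of all colliding on (mark & 0xFF).
 */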
|
||||
|
||||
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct fw_head *head = rcu_dereference_bh(tp->root);
|
||||
struct fw_filter *f;
|
||||
int r;
|
||||
u32 id = skb->mark;
|
||||
|
||||
if (head != NULL) {
|
||||
id &= head->mask;
|
||||
|
||||
for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
|
||||
f = rcu_dereference_bh(f->next)) {
|
||||
if (f->id == id) {
|
||||
*res = f->res;
|
||||
#ifdef CONFIG_NET_CLS_IND
|
||||
if (!tcf_match_indev(skb, f->ifindex))
|
||||
continue;
|
||||
#endif /* CONFIG_NET_CLS_IND */
|
||||
r = tcf_exts_exec(skb, &f->exts, res);
|
||||
if (r < 0)
|
||||
continue;
|
||||
|
||||
return r;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* old method */
|
||||
if (id && (TC_H_MAJ(id) == 0 ||
|
||||
!(TC_H_MAJ(id ^ tp->q->handle)))) {
|
||||
res->classid = id;
|
||||
res->class = 0;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct fw_filter *f;
|
||||
|
||||
if (head == NULL)
|
||||
return 0;
|
||||
|
||||
f = rtnl_dereference(head->ht[fw_hash(handle)]);
|
||||
for (; f; f = rtnl_dereference(f->next)) {
|
||||
if (f->id == handle)
|
||||
return (unsigned long)f;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fw_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int fw_init(struct tcf_proto *tp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fw_delete_filter(struct rcu_head *head)
|
||||
{
|
||||
struct fw_filter *f = container_of(head, struct fw_filter, rcu);
|
||||
|
||||
tcf_exts_destroy(&f->exts);
|
||||
kfree(f);
|
||||
}
|
||||
|
||||
static void fw_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct fw_filter *f;
|
||||
int h;
|
||||
|
||||
if (head == NULL)
|
||||
return;
|
||||
|
||||
for (h = 0; h < HTSIZE; h++) {
|
||||
while ((f = rtnl_dereference(head->ht[h])) != NULL) {
|
||||
RCU_INIT_POINTER(head->ht[h],
|
||||
rtnl_dereference(f->next));
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, fw_delete_filter);
|
||||
}
|
||||
}
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
kfree_rcu(head, rcu);
|
||||
}
|
||||
|
||||
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct fw_filter *f = (struct fw_filter *)arg;
|
||||
struct fw_filter __rcu **fp;
|
||||
struct fw_filter *pfp;
|
||||
|
||||
if (head == NULL || f == NULL)
|
||||
goto out;
|
||||
|
||||
fp = &head->ht[fw_hash(f->id)];
|
||||
|
||||
for (pfp = rtnl_dereference(*fp); pfp;
|
||||
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
|
||||
if (pfp == f) {
|
||||
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, fw_delete_filter);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
out:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
|
||||
[TCA_FW_CLASSID] = { .type = NLA_U32 },
|
||||
[TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
|
||||
[TCA_FW_MASK] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int
|
||||
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
|
||||
struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct tcf_exts e;
|
||||
u32 mask;
|
||||
int err;
|
||||
|
||||
tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_FW_CLASSID]) {
|
||||
f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
|
||||
tcf_bind_filter(tp, &f->res, base);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NET_CLS_IND
|
||||
if (tb[TCA_FW_INDEV]) {
|
||||
int ret;
|
||||
ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
|
||||
if (ret < 0) {
|
||||
err = ret;
|
||||
goto errout;
|
||||
}
|
||||
f->ifindex = ret;
|
||||
}
|
||||
#endif /* CONFIG_NET_CLS_IND */
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[TCA_FW_MASK]) {
|
||||
mask = nla_get_u32(tb[TCA_FW_MASK]);
|
||||
if (mask != head->mask)
|
||||
goto errout;
|
||||
} else if (head->mask != 0xFFFFFFFF)
|
||||
goto errout;
|
||||
|
||||
tcf_exts_change(tp, &f->exts, &e);
|
||||
|
||||
return 0;
|
||||
errout:
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int fw_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle,
|
||||
struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct fw_filter *f = (struct fw_filter *) *arg;
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_FW_MAX + 1];
|
||||
int err;
|
||||
|
||||
if (!opt)
|
||||
return handle ? -EINVAL : 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (f) {
|
||||
struct fw_filter *pfp, *fnew;
|
||||
struct fw_filter __rcu **fp;
|
||||
|
||||
if (f->id != handle && handle)
|
||||
return -EINVAL;
|
||||
|
||||
fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
|
||||
if (!fnew)
|
||||
return -ENOBUFS;
|
||||
|
||||
fnew->id = f->id;
|
||||
fnew->res = f->res;
|
||||
#ifdef CONFIG_NET_CLS_IND
|
||||
fnew->ifindex = f->ifindex;
|
||||
#endif /* CONFIG_NET_CLS_IND */
|
||||
fnew->tp = f->tp;
|
||||
|
||||
tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
|
||||
|
||||
err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr);
|
||||
if (err < 0) {
|
||||
kfree(fnew);
|
||||
return err;
|
||||
}
|
||||
|
||||
fp = &head->ht[fw_hash(fnew->id)];
|
||||
for (pfp = rtnl_dereference(*fp); pfp;
|
||||
fp = &pfp->next, pfp = rtnl_dereference(*fp))
|
||||
if (pfp == f)
|
||||
break;
|
||||
|
||||
RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next));
|
||||
rcu_assign_pointer(*fp, fnew);
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, fw_delete_filter);
|
||||
|
||||
*arg = (unsigned long)fnew;
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!handle)
|
||||
return -EINVAL;
|
||||
|
||||
if (head == NULL) {
|
||||
u32 mask = 0xFFFFFFFF;
|
||||
if (tb[TCA_FW_MASK])
|
||||
mask = nla_get_u32(tb[TCA_FW_MASK]);
|
||||
|
||||
head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
|
||||
if (head == NULL)
|
||||
return -ENOBUFS;
|
||||
head->mask = mask;
|
||||
|
||||
rcu_assign_pointer(tp->root, head);
|
||||
}
|
||||
|
||||
f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
|
||||
if (f == NULL)
|
||||
return -ENOBUFS;
|
||||
|
||||
tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
|
||||
f->id = handle;
|
||||
f->tp = tp;
|
||||
|
||||
err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]);
|
||||
rcu_assign_pointer(head->ht[fw_hash(handle)], f);
|
||||
|
||||
*arg = (unsigned long)f;
|
||||
return 0;
|
||||
|
||||
errout:
|
||||
kfree(f);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
int h;
|
||||
|
||||
if (head == NULL)
|
||||
arg->stop = 1;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (h = 0; h < HTSIZE; h++) {
|
||||
struct fw_filter *f;
|
||||
|
||||
for (f = rtnl_dereference(head->ht[h]); f;
|
||||
f = rtnl_dereference(f->next)) {
|
||||
if (arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
return;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct fw_head *head = rtnl_dereference(tp->root);
|
||||
struct fw_filter *f = (struct fw_filter *)fh;
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct nlattr *nest;
|
||||
|
||||
if (f == NULL)
|
||||
return skb->len;
|
||||
|
||||
t->tcm_handle = f->id;
|
||||
|
||||
if (!f->res.classid && !tcf_exts_is_available(&f->exts))
|
||||
return skb->len;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (f->res.classid &&
|
||||
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
|
||||
goto nla_put_failure;
|
||||
#ifdef CONFIG_NET_CLS_IND
|
||||
if (f->ifindex) {
|
||||
struct net_device *dev;
|
||||
dev = __dev_get_by_index(net, f->ifindex);
|
||||
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
#endif /* CONFIG_NET_CLS_IND */
|
||||
if (head->mask != 0xFFFFFFFF &&
|
||||
nla_put_u32(skb, TCA_FW_MASK, head->mask))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
|
||||
.kind = "fw",
|
||||
.classify = fw_classify,
|
||||
.init = fw_init,
|
||||
.destroy = fw_destroy,
|
||||
.get = fw_get,
|
||||
.put = fw_put,
|
||||
.change = fw_change,
|
||||
.delete = fw_delete,
|
||||
.walk = fw_walk,
|
||||
.dump = fw_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_fw(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_fw_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_fw(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_fw_ops);
|
||||
}
|
||||
|
||||
module_init(init_fw)
|
||||
module_exit(exit_fw)
|
||||
MODULE_LICENSE("GPL");
|
||||
672
net/sched/cls_route.c
Normal file
@@ -0,0 +1,672 @@
/*
|
||||
* net/sched/cls_route.c ROUTE4 classifier.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/dst.h>
|
||||
#include <net/route.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
/*
|
||||
* 1. For now we assume that route tags < 256.
|
||||
* It allows us to use direct table lookups instead of hash tables.
|
||||
* 2. For now we assume that "from TAG" and "fromdev DEV" statements
|
||||
* are mutually exclusive.
|
||||
* 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
|
||||
*/
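/* Illustrative sketch of the 32-bit handle layout used below (the
 * concrete tag and iif values are made up; route4_set_parms() and
 * to_hash()/from_hash() hold the authoritative encoding):
 *
 *   "to 7 from 3"               -> handle 0x00030007: low word picks
 *                                  the "to" row table[7], the upper
 *                                  word's low nibble picks ht[3].
 *   "to 7 fromdev <dev>, iif 2" -> handle 0x80020007: bit 31 flags an
 *                                  iif match, hashed into ht[16 + 2].
 *   "to 7" alone                -> handle 0xFFFF0007: upper word 0xFFFF
 *                                  selects the wildcard slot ht[32].
 */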
|
||||
struct route4_fastmap {
|
||||
struct route4_filter *filter;
|
||||
u32 id;
|
||||
int iif;
|
||||
};
|
||||
|
||||
struct route4_head {
|
||||
struct route4_fastmap fastmap[16];
|
||||
struct route4_bucket __rcu *table[256 + 1];
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct route4_bucket {
|
||||
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
|
||||
struct route4_filter __rcu *ht[16 + 16 + 1];
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct route4_filter {
|
||||
struct route4_filter __rcu *next;
|
||||
u32 id;
|
||||
int iif;
|
||||
|
||||
struct tcf_result res;
|
||||
struct tcf_exts exts;
|
||||
u32 handle;
|
||||
struct route4_bucket *bkt;
|
||||
struct tcf_proto *tp;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
|
||||
|
||||
static inline int route4_fastmap_hash(u32 id, int iif)
|
||||
{
|
||||
return id & 0xF;
|
||||
}
|
||||
|
||||
static DEFINE_SPINLOCK(fastmap_lock);
|
||||
static void
|
||||
route4_reset_fastmap(struct route4_head *head)
|
||||
{
|
||||
spin_lock_bh(&fastmap_lock);
|
||||
memset(head->fastmap, 0, sizeof(head->fastmap));
|
||||
spin_unlock_bh(&fastmap_lock);
|
||||
}
|
||||
|
||||
static void
|
||||
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
|
||||
struct route4_filter *f)
|
||||
{
|
||||
int h = route4_fastmap_hash(id, iif);
|
||||
|
||||
/* fastmap updates must look atomic to align id, iif, filter */
|
||||
spin_lock_bh(&fastmap_lock);
|
||||
head->fastmap[h].id = id;
|
||||
head->fastmap[h].iif = iif;
|
||||
head->fastmap[h].filter = f;
|
||||
spin_unlock_bh(&fastmap_lock);
|
||||
}
|
||||
|
||||
static inline int route4_hash_to(u32 id)
|
||||
{
|
||||
return id & 0xFF;
|
||||
}
|
||||
|
||||
static inline int route4_hash_from(u32 id)
|
||||
{
|
||||
return (id >> 16) & 0xF;
|
||||
}
|
||||
|
||||
static inline int route4_hash_iif(int iif)
|
||||
{
|
||||
return 16 + ((iif >> 16) & 0xF);
|
||||
}
|
||||
|
||||
static inline int route4_hash_wild(void)
|
||||
{
|
||||
return 32;
|
||||
}
|
||||
|
||||
#define ROUTE4_APPLY_RESULT() \
|
||||
{ \
|
||||
*res = f->res; \
|
||||
if (tcf_exts_is_available(&f->exts)) { \
|
||||
int r = tcf_exts_exec(skb, &f->exts, res); \
|
||||
if (r < 0) { \
|
||||
dont_cache = 1; \
|
||||
continue; \
|
||||
} \
|
||||
return r; \
|
||||
} else if (!dont_cache) \
|
||||
route4_set_fastmap(head, id, iif, f); \
|
||||
return 0; \
|
||||
}
|
||||
|
||||
static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct route4_head *head = rcu_dereference_bh(tp->root);
|
||||
struct dst_entry *dst;
|
||||
struct route4_bucket *b;
|
||||
struct route4_filter *f;
|
||||
u32 id, h;
|
||||
int iif, dont_cache = 0;
|
||||
|
||||
dst = skb_dst(skb);
|
||||
if (!dst)
|
||||
goto failure;
|
||||
|
||||
id = dst->tclassid;
|
||||
if (head == NULL)
|
||||
goto old_method;
|
||||
|
||||
iif = inet_iif(skb);
|
||||
|
||||
h = route4_fastmap_hash(id, iif);
|
||||
|
||||
spin_lock(&fastmap_lock);
|
||||
if (id == head->fastmap[h].id &&
|
||||
iif == head->fastmap[h].iif &&
|
||||
(f = head->fastmap[h].filter) != NULL) {
|
||||
if (f == ROUTE4_FAILURE) {
|
||||
spin_unlock(&fastmap_lock);
|
||||
goto failure;
|
||||
}
|
||||
|
||||
*res = f->res;
|
||||
spin_unlock(&fastmap_lock);
|
||||
return 0;
|
||||
}
|
||||
spin_unlock(&fastmap_lock);
|
||||
|
||||
h = route4_hash_to(id);
|
||||
|
||||
restart:
|
||||
b = rcu_dereference_bh(head->table[h]);
|
||||
if (b) {
|
||||
for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
|
||||
f;
|
||||
f = rcu_dereference_bh(f->next))
|
||||
if (f->id == id)
|
||||
ROUTE4_APPLY_RESULT();
|
||||
|
||||
for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
|
||||
f;
|
||||
f = rcu_dereference_bh(f->next))
|
||||
if (f->iif == iif)
|
||||
ROUTE4_APPLY_RESULT();
|
||||
|
||||
for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
|
||||
f;
|
||||
f = rcu_dereference_bh(f->next))
|
||||
ROUTE4_APPLY_RESULT();
|
||||
}
|
||||
if (h < 256) {
|
||||
h = 256;
|
||||
id &= ~0xFFFF;
|
||||
goto restart;
|
||||
}
|
||||
|
||||
if (!dont_cache)
|
||||
route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
|
||||
failure:
|
||||
return -1;
|
||||
|
||||
old_method:
|
||||
if (id && (TC_H_MAJ(id) == 0 ||
|
||||
!(TC_H_MAJ(id^tp->q->handle)))) {
|
||||
res->classid = id;
|
||||
res->class = 0;
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static inline u32 to_hash(u32 id)
|
||||
{
|
||||
u32 h = id & 0xFF;
|
||||
|
||||
if (id & 0x8000)
|
||||
h += 256;
|
||||
return h;
|
||||
}
|
||||
|
||||
static inline u32 from_hash(u32 id)
|
||||
{
|
||||
id &= 0xFFFF;
|
||||
if (id == 0xFFFF)
|
||||
return 32;
|
||||
if (!(id & 0x8000)) {
|
||||
if (id > 255)
|
||||
return 256;
|
||||
return id & 0xF;
|
||||
}
|
||||
return 16 + (id & 0xF);
|
||||
}
|
||||
|
||||
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct route4_head *head = rtnl_dereference(tp->root);
|
||||
struct route4_bucket *b;
|
||||
struct route4_filter *f;
|
||||
unsigned int h1, h2;
|
||||
|
||||
if (!head)
|
||||
return 0;
|
||||
|
||||
h1 = to_hash(handle);
|
||||
if (h1 > 256)
|
||||
return 0;
|
||||
|
||||
h2 = from_hash(handle >> 16);
|
||||
if (h2 > 32)
|
||||
return 0;
|
||||
|
||||
b = rtnl_dereference(head->table[h1]);
|
||||
if (b) {
|
||||
for (f = rtnl_dereference(b->ht[h2]);
|
||||
f;
|
||||
f = rtnl_dereference(f->next))
|
||||
if (f->handle == handle)
|
||||
return (unsigned long)f;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void route4_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int route4_init(struct tcf_proto *tp)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
route4_delete_filter(struct rcu_head *head)
|
||||
{
|
||||
struct route4_filter *f = container_of(head, struct route4_filter, rcu);
|
||||
|
||||
tcf_exts_destroy(&f->exts);
|
||||
kfree(f);
|
||||
}
|
||||
|
||||
static void route4_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct route4_head *head = rtnl_dereference(tp->root);
|
||||
int h1, h2;
|
||||
|
||||
if (head == NULL)
|
||||
return;
|
||||
|
||||
for (h1 = 0; h1 <= 256; h1++) {
|
||||
struct route4_bucket *b;
|
||||
|
||||
b = rtnl_dereference(head->table[h1]);
|
||||
if (b) {
|
||||
for (h2 = 0; h2 <= 32; h2++) {
|
||||
struct route4_filter *f;
|
||||
|
||||
while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
|
||||
struct route4_filter *next;
|
||||
|
||||
next = rtnl_dereference(f->next);
|
||||
RCU_INIT_POINTER(b->ht[h2], next);
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, route4_delete_filter);
|
||||
}
|
||||
}
|
||||
RCU_INIT_POINTER(head->table[h1], NULL);
|
||||
kfree_rcu(b, rcu);
|
||||
}
|
||||
}
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
kfree_rcu(head, rcu);
|
||||
}
|
||||
|
||||
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct route4_head *head = rtnl_dereference(tp->root);
|
||||
struct route4_filter *f = (struct route4_filter *)arg;
|
||||
struct route4_filter __rcu **fp;
|
||||
struct route4_filter *nf;
|
||||
struct route4_bucket *b;
|
||||
unsigned int h = 0;
|
||||
int i;
|
||||
|
||||
if (!head || !f)
|
||||
return -EINVAL;
|
||||
|
||||
h = f->handle;
|
||||
b = f->bkt;
|
||||
|
||||
fp = &b->ht[from_hash(h >> 16)];
|
||||
for (nf = rtnl_dereference(*fp); nf;
|
||||
fp = &nf->next, nf = rtnl_dereference(*fp)) {
|
||||
if (nf == f) {
|
||||
/* unlink it */
|
||||
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
|
||||
|
||||
/* Remove any fastmap lookups that might ref filter
|
||||
* notice we unlink'd the filter so we can't get it
|
||||
* back in the fastmap.
|
||||
*/
|
||||
route4_reset_fastmap(head);
|
||||
|
||||
/* Delete it */
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
call_rcu(&f->rcu, route4_delete_filter);
|
||||
|
||||
/* Strip RTNL protected tree */
|
||||
for (i = 0; i <= 32; i++) {
|
||||
struct route4_filter *rt;
|
||||
|
||||
rt = rtnl_dereference(b->ht[i]);
|
||||
if (rt)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* OK, session has no flows */
|
||||
RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
|
||||
kfree_rcu(b, rcu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
|
||||
[TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
|
||||
[TCA_ROUTE4_TO] = { .type = NLA_U32 },
|
||||
[TCA_ROUTE4_FROM] = { .type = NLA_U32 },
|
||||
[TCA_ROUTE4_IIF] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int route4_set_parms(struct net *net, struct tcf_proto *tp,
|
||||
unsigned long base, struct route4_filter *f,
|
||||
u32 handle, struct route4_head *head,
|
||||
struct nlattr **tb, struct nlattr *est, int new,
|
||||
bool ovr)
|
||||
{
|
||||
int err;
|
||||
u32 id = 0, to = 0, nhandle = 0x8000;
|
||||
struct route4_filter *fp;
|
||||
unsigned int h1;
|
||||
struct route4_bucket *b;
|
||||
struct tcf_exts e;
|
||||
|
||||
tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[TCA_ROUTE4_TO]) {
|
||||
if (new && handle & 0x8000)
|
||||
goto errout;
|
||||
to = nla_get_u32(tb[TCA_ROUTE4_TO]);
|
||||
if (to > 0xFF)
|
||||
goto errout;
|
||||
nhandle = to;
|
||||
}
|
||||
|
||||
if (tb[TCA_ROUTE4_FROM]) {
|
||||
if (tb[TCA_ROUTE4_IIF])
|
||||
goto errout;
|
||||
id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
|
||||
if (id > 0xFF)
|
||||
goto errout;
|
||||
nhandle |= id << 16;
|
||||
} else if (tb[TCA_ROUTE4_IIF]) {
|
||||
id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
|
||||
if (id > 0x7FFF)
|
||||
goto errout;
|
||||
nhandle |= (id | 0x8000) << 16;
|
||||
} else
|
||||
nhandle |= 0xFFFF << 16;
|
||||
|
||||
if (handle && new) {
|
||||
nhandle |= handle & 0x7F00;
|
||||
if (nhandle != handle)
|
||||
goto errout;
|
||||
}
|
||||
|
||||
h1 = to_hash(nhandle);
|
||||
b = rtnl_dereference(head->table[h1]);
|
||||
if (!b) {
|
||||
err = -ENOBUFS;
|
||||
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
|
||||
if (b == NULL)
|
||||
goto errout;
|
||||
|
||||
rcu_assign_pointer(head->table[h1], b);
|
||||
} else {
|
||||
unsigned int h2 = from_hash(nhandle >> 16);
|
||||
|
||||
err = -EEXIST;
|
||||
for (fp = rtnl_dereference(b->ht[h2]);
|
||||
fp;
|
||||
fp = rtnl_dereference(fp->next))
|
||||
if (fp->handle == f->handle)
|
||||
goto errout;
|
||||
}
|
||||
|
||||
if (tb[TCA_ROUTE4_TO])
|
||||
f->id = to;
|
||||
|
||||
if (tb[TCA_ROUTE4_FROM])
|
||||
f->id = to | id<<16;
|
||||
else if (tb[TCA_ROUTE4_IIF])
|
||||
f->iif = id;
|
||||
|
||||
f->handle = nhandle;
|
||||
f->bkt = b;
|
||||
f->tp = tp;
|
||||
|
||||
if (tb[TCA_ROUTE4_CLASSID]) {
|
||||
f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
|
||||
tcf_bind_filter(tp, &f->res, base);
|
||||
}
|
||||
|
||||
tcf_exts_change(tp, &f->exts, &e);
|
||||
|
||||
return 0;
|
||||
errout:
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int route4_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle,
|
||||
struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct route4_head *head = rtnl_dereference(tp->root);
|
||||
struct route4_filter __rcu **fp;
|
||||
struct route4_filter *fold, *f1, *pfp, *f = NULL;
|
||||
struct route4_bucket *b;
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
|
||||
unsigned int h, th;
|
||||
int err;
|
||||
bool new = true;
|
||||
|
||||
if (opt == NULL)
|
||||
return handle ? -EINVAL : 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
fold = (struct route4_filter *)*arg;
|
||||
if (fold && handle && fold->handle != handle)
|
||||
return -EINVAL;
|
||||
|
||||
err = -ENOBUFS;
|
||||
if (head == NULL) {
|
||||
head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
|
||||
if (head == NULL)
|
||||
goto errout;
|
||||
rcu_assign_pointer(tp->root, head);
|
||||
}
|
||||
|
||||
f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
|
||||
if (!f)
|
||||
goto errout;
|
||||
|
||||
tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
|
||||
if (fold) {
|
||||
f->id = fold->id;
|
||||
f->iif = fold->iif;
|
||||
f->res = fold->res;
|
||||
f->handle = fold->handle;
|
||||
|
||||
f->tp = fold->tp;
|
||||
f->bkt = fold->bkt;
|
||||
new = false;
|
||||
}
|
||||
|
||||
err = route4_set_parms(net, tp, base, f, handle, head, tb,
|
||||
tca[TCA_RATE], new, ovr);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
h = from_hash(f->handle >> 16);
|
||||
fp = &f->bkt->ht[h];
|
||||
for (pfp = rtnl_dereference(*fp);
|
||||
(f1 = rtnl_dereference(*fp)) != NULL;
|
||||
fp = &f1->next)
|
||||
if (f->handle < f1->handle)
|
||||
break;
|
||||
|
||||
netif_keep_dst(qdisc_dev(tp->q));
|
||||
rcu_assign_pointer(f->next, f1);
|
||||
rcu_assign_pointer(*fp, f);
|
||||
|
||||
if (fold && fold->handle && f->handle != fold->handle) {
|
||||
th = to_hash(fold->handle);
|
||||
h = from_hash(fold->handle >> 16);
|
||||
b = rtnl_dereference(head->table[th]);
|
||||
if (b) {
|
||||
fp = &b->ht[h];
|
||||
for (pfp = rtnl_dereference(*fp); pfp;
|
||||
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
|
||||
if (pfp == f) {
|
||||
*fp = f->next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
route4_reset_fastmap(head);
|
||||
*arg = (unsigned long)f;
|
||||
if (fold) {
|
||||
tcf_unbind_filter(tp, &fold->res);
|
||||
call_rcu(&fold->rcu, route4_delete_filter);
|
||||
}
|
||||
return 0;
|
||||
|
||||
errout:
|
||||
kfree(f);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct route4_head *head = rtnl_dereference(tp->root);
|
||||
unsigned int h, h1;
|
||||
|
||||
if (head == NULL)
|
||||
arg->stop = 1;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (h = 0; h <= 256; h++) {
|
||||
struct route4_bucket *b = rtnl_dereference(head->table[h]);
|
||||
|
||||
if (b) {
|
||||
for (h1 = 0; h1 <= 32; h1++) {
|
||||
struct route4_filter *f;
|
||||
|
||||
for (f = rtnl_dereference(b->ht[h1]);
|
||||
f;
|
||||
f = rtnl_dereference(f->next)) {
|
||||
if (arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
return;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct route4_filter *f = (struct route4_filter *)fh;
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct nlattr *nest;
|
||||
u32 id;
|
||||
|
||||
if (f == NULL)
|
||||
return skb->len;
|
||||
|
||||
t->tcm_handle = f->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (!(f->handle & 0x8000)) {
|
||||
id = f->id & 0xFF;
|
||||
if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
if (f->handle & 0x80000000) {
|
||||
if ((f->handle >> 16) != 0xFFFF &&
|
||||
nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
|
||||
goto nla_put_failure;
|
||||
} else {
|
||||
id = f->id >> 16;
|
||||
if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
if (f->res.classid &&
|
||||
nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
|
||||
.kind = "route",
|
||||
.classify = route4_classify,
|
||||
.init = route4_init,
|
||||
.destroy = route4_destroy,
|
||||
.get = route4_get,
|
||||
.put = route4_put,
|
||||
.change = route4_change,
|
||||
.delete = route4_delete,
|
||||
.walk = route4_walk,
|
||||
.dump = route4_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_route4(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_route4_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_route4(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_route4_ops);
|
||||
}
|
||||
|
||||
module_init(init_route4)
|
||||
module_exit(exit_route4)
|
||||
MODULE_LICENSE("GPL");
|
||||
28
net/sched/cls_rsvp.c
Normal file
@@ -0,0 +1,28 @@
/*
|
||||
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
#define RSVP_DST_LEN 1
|
||||
#define RSVP_ID "rsvp"
|
||||
#define RSVP_OPS cls_rsvp_ops
|
||||
|
||||
#include "cls_rsvp.h"
|
||||
MODULE_LICENSE("GPL");
|
||||
730
net/sched/cls_rsvp.h
Normal file
@@ -0,0 +1,730 @@
/*
|
||||
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
/*
|
||||
Comparing to general packet classification problem,
|
||||
RSVP needs only several relatively simple rules:
|
||||
|
||||
* (dst, protocol) are always specified,
|
||||
so that we are able to hash them.
|
||||
* src may be exact, or may be wildcard, so that
|
||||
we can keep a hash table plus one wildcard entry.
|
||||
* source port (or flow label) is important only if src is given.
|
||||
|
||||
IMPLEMENTATION.
|
||||
|
||||
We use a two level hash table: The top level is keyed by
|
||||
destination address and protocol ID, every bucket contains a list
|
||||
of "rsvp sessions", identified by destination address, protocol and
|
||||
DPI(="Destination Port ID"): triple (key, mask, offset).
|
||||
|
||||
Every bucket has a smaller hash table keyed by source address
|
||||
(cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
|
||||
Every bucket is again a list of "RSVP flows", selected by
|
||||
source address and SPI(="Source Port ID" here rather than
|
||||
"security parameter index"): triple (key, mask, offset).
|
||||
|
||||
|
||||
NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
|
||||
and all fragmented packets go to the best-effort traffic class.
|
||||
|
||||
|
||||
NOTE 2. Two "port id"'s seem to be redundant, rfc2207 requires
|
||||
only one "Generalized Port Identifier". So that for classic
|
||||
ah, esp (and udp,tcp) both *pi should coincide or one of them
|
||||
should be wildcard.
|
||||
|
||||
At first sight, this redundancy is just a waste of CPU
|
||||
resources. But DPI and SPI add the possibility to assign different
|
||||
priorities to GPIs. Look also at note 4 about tunnels below.
|
||||
|
||||
|
||||
NOTE 3. One complication is the case of tunneled packets.
|
||||
We implement it as follows: if the first lookup
|
||||
matches a special session with "tunnelhdr" value not zero,
|
||||
flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
|
||||
In this case, we pull tunnelhdr bytes and restart lookup
|
||||
with tunnel ID added to the list of keys. Simple and stupid 8)8)
|
||||
It's enough for PIMREG and IPIP.
|
||||
|
||||
|
||||
NOTE 4. Two GPIs make it possible to parse even GRE packets.
|
||||
F.e. DPI can select ETH_P_IP (and necessary flags to make
|
||||
tunnelhdr correct) in GRE protocol field and SPI matches
|
||||
GRE key. Is it not nice? 8)8)
|
||||
|
||||
|
||||
Well, as a result, despite its simplicity, we get a pretty
|
||||
powerful classification engine. */
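/* A short sketch of how the pieces below fit together; the handle value
 * is hypothetical, gen_handle() and rsvp_get() define the real layout:
 *
 *   handle 0x00020a2c
 *     bits  0..7  (0x2c)  : destination/session bucket, head->ht[0x2c]
 *     bits  8..15 (0x0a)  : source bucket within the session, s->ht[0x0a]
 *                           (a value of 16 means the wildcard source slot)
 *     bits 16..31 (0x0002): generator cookie from gen_handle(), keeps
 *                           handles unique across filters in one bucket
 */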
|
||||
|
||||
|
||||
struct rsvp_head {
|
||||
u32 tmap[256/32];
|
||||
u32 hgenerator;
|
||||
u8 tgenerator;
|
||||
struct rsvp_session __rcu *ht[256];
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct rsvp_session {
|
||||
struct rsvp_session __rcu *next;
|
||||
__be32 dst[RSVP_DST_LEN];
|
||||
struct tc_rsvp_gpi dpi;
|
||||
u8 protocol;
|
||||
u8 tunnelid;
|
||||
/* 16 (src,sport) hash slots, and one wildcard source slot */
|
||||
struct rsvp_filter __rcu *ht[16 + 1];
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
|
||||
struct rsvp_filter {
|
||||
struct rsvp_filter __rcu *next;
|
||||
__be32 src[RSVP_DST_LEN];
|
||||
struct tc_rsvp_gpi spi;
|
||||
u8 tunnelhdr;
|
||||
|
||||
struct tcf_result res;
|
||||
struct tcf_exts exts;
|
||||
|
||||
u32 handle;
|
||||
struct rsvp_session *sess;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
|
||||
{
|
||||
unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
|
||||
|
||||
h ^= h>>16;
|
||||
h ^= h>>8;
|
||||
return (h ^ protocol ^ tunnelid) & 0xFF;
|
||||
}
|
||||
|
||||
static inline unsigned int hash_src(__be32 *src)
|
||||
{
|
||||
unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
|
||||
|
||||
h ^= h>>16;
|
||||
h ^= h>>8;
|
||||
h ^= h>>4;
|
||||
return h & 0xF;
|
||||
}
|
||||
|
||||
#define RSVP_APPLY_RESULT() \
|
||||
{ \
|
||||
int r = tcf_exts_exec(skb, &f->exts, res); \
|
||||
if (r < 0) \
|
||||
continue; \
|
||||
else if (r > 0) \
|
||||
return r; \
|
||||
}
|
||||
|
||||
static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct rsvp_head *head = rcu_dereference_bh(tp->root);
|
||||
struct rsvp_session *s;
|
||||
struct rsvp_filter *f;
|
||||
unsigned int h1, h2;
|
||||
__be32 *dst, *src;
|
||||
u8 protocol;
|
||||
u8 tunnelid = 0;
|
||||
u8 *xprt;
|
||||
#if RSVP_DST_LEN == 4
|
||||
struct ipv6hdr *nhptr;
|
||||
|
||||
if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
|
||||
return -1;
|
||||
nhptr = ipv6_hdr(skb);
|
||||
#else
|
||||
struct iphdr *nhptr;
|
||||
|
||||
if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
|
||||
return -1;
|
||||
nhptr = ip_hdr(skb);
|
||||
#endif
|
||||
|
||||
restart:
|
||||
|
||||
#if RSVP_DST_LEN == 4
|
||||
src = &nhptr->saddr.s6_addr32[0];
|
||||
dst = &nhptr->daddr.s6_addr32[0];
|
||||
protocol = nhptr->nexthdr;
|
||||
xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
|
||||
#else
|
||||
src = &nhptr->saddr;
|
||||
dst = &nhptr->daddr;
|
||||
protocol = nhptr->protocol;
|
||||
xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
|
||||
if (ip_is_fragment(nhptr))
|
||||
return -1;
|
||||
#endif
|
||||
|
||||
h1 = hash_dst(dst, protocol, tunnelid);
|
||||
h2 = hash_src(src);
|
||||
|
||||
for (s = rcu_dereference_bh(head->ht[h1]); s;
|
||||
s = rcu_dereference_bh(s->next)) {
|
||||
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
|
||||
protocol == s->protocol &&
|
||||
!(s->dpi.mask &
|
||||
(*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
|
||||
#if RSVP_DST_LEN == 4
|
||||
dst[0] == s->dst[0] &&
|
||||
dst[1] == s->dst[1] &&
|
||||
dst[2] == s->dst[2] &&
|
||||
#endif
|
||||
tunnelid == s->tunnelid) {
|
||||
|
||||
for (f = rcu_dereference_bh(s->ht[h2]); f;
|
||||
f = rcu_dereference_bh(f->next)) {
|
||||
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
|
||||
!(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
|
||||
#if RSVP_DST_LEN == 4
|
||||
&&
|
||||
src[0] == f->src[0] &&
|
||||
src[1] == f->src[1] &&
|
||||
src[2] == f->src[2]
|
||||
#endif
|
||||
) {
|
||||
*res = f->res;
|
||||
RSVP_APPLY_RESULT();
|
||||
|
||||
matched:
|
||||
if (f->tunnelhdr == 0)
|
||||
return 0;
|
||||
|
||||
tunnelid = f->res.classid;
|
||||
nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
|
||||
/* And wildcard bucket... */
|
||||
for (f = rcu_dereference_bh(s->ht[16]); f;
|
||||
f = rcu_dereference_bh(f->next)) {
|
||||
*res = f->res;
|
||||
RSVP_APPLY_RESULT();
|
||||
goto matched;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
|
||||
{
|
||||
struct rsvp_head *head = rtnl_dereference(tp->root);
|
||||
struct rsvp_session *s;
|
||||
struct rsvp_filter __rcu **ins;
|
||||
struct rsvp_filter *pins;
|
||||
unsigned int h1 = h & 0xFF;
|
||||
unsigned int h2 = (h >> 8) & 0xFF;
|
||||
|
||||
for (s = rtnl_dereference(head->ht[h1]); s;
|
||||
s = rtnl_dereference(s->next)) {
|
||||
for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
|
||||
ins = &pins->next, pins = rtnl_dereference(*ins)) {
|
||||
if (pins->handle == h) {
|
||||
RCU_INIT_POINTER(n->next, pins->next);
|
||||
rcu_assign_pointer(*ins, n);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Something went wrong if we are trying to replace a non-existent
* node. Might as well halt instead of silently failing.
|
||||
*/
|
||||
BUG_ON(1);
|
||||
}
|
||||
|
||||
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct rsvp_head *head = rtnl_dereference(tp->root);
|
||||
struct rsvp_session *s;
|
||||
struct rsvp_filter *f;
|
||||
unsigned int h1 = handle & 0xFF;
|
||||
unsigned int h2 = (handle >> 8) & 0xFF;
|
||||
|
||||
if (h2 > 16)
|
||||
return 0;
|
||||
|
||||
for (s = rtnl_dereference(head->ht[h1]); s;
|
||||
s = rtnl_dereference(s->next)) {
|
||||
for (f = rtnl_dereference(s->ht[h2]); f;
|
||||
f = rtnl_dereference(f->next)) {
|
||||
if (f->handle == handle)
|
||||
return (unsigned long)f;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void rsvp_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
}
|
||||
|
||||
static int rsvp_init(struct tcf_proto *tp)
|
||||
{
|
||||
struct rsvp_head *data;
|
||||
|
||||
data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
|
||||
if (data) {
|
||||
rcu_assign_pointer(tp->root, data);
|
||||
return 0;
|
||||
}
|
||||
return -ENOBUFS;
|
||||
}
|
||||
|
||||
static void
|
||||
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
|
||||
{
|
||||
tcf_unbind_filter(tp, &f->res);
|
||||
tcf_exts_destroy(&f->exts);
|
||||
kfree_rcu(f, rcu);
|
||||
}
|
||||
|
||||
static void rsvp_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct rsvp_head *data = rtnl_dereference(tp->root);
|
||||
int h1, h2;
|
||||
|
||||
if (data == NULL)
|
||||
return;
|
||||
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
|
||||
for (h1 = 0; h1 < 256; h1++) {
|
||||
struct rsvp_session *s;
|
||||
|
||||
while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
|
||||
RCU_INIT_POINTER(data->ht[h1], s->next);
|
||||
|
||||
for (h2 = 0; h2 <= 16; h2++) {
|
||||
struct rsvp_filter *f;
|
||||
|
||||
while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
|
||||
rcu_assign_pointer(s->ht[h2], f->next);
|
||||
rsvp_delete_filter(tp, f);
|
||||
}
|
||||
}
|
||||
kfree_rcu(s, rcu);
|
||||
}
|
||||
}
|
||||
kfree_rcu(data, rcu);
|
||||
}
|
||||
|
||||
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct rsvp_head *head = rtnl_dereference(tp->root);
|
||||
struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
|
||||
struct rsvp_filter __rcu **fp;
|
||||
unsigned int h = f->handle;
|
||||
struct rsvp_session __rcu **sp;
|
||||
struct rsvp_session *nsp, *s = f->sess;
|
||||
int i;
|
||||
|
||||
fp = &s->ht[(h >> 8) & 0xFF];
|
||||
for (nfp = rtnl_dereference(*fp); nfp;
|
||||
fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
|
||||
if (nfp == f) {
|
||||
RCU_INIT_POINTER(*fp, f->next);
|
||||
rsvp_delete_filter(tp, f);
|
||||
|
||||
/* Strip tree */
|
||||
|
||||
for (i = 0; i <= 16; i++)
|
||||
if (s->ht[i])
|
||||
return 0;
|
||||
|
||||
/* OK, session has no flows */
|
||||
sp = &head->ht[h & 0xFF];
|
||||
for (nsp = rtnl_dereference(*sp); nsp;
|
||||
sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
|
||||
if (nsp == s) {
|
||||
RCU_INIT_POINTER(*sp, s->next);
|
||||
kfree_rcu(s, rcu);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
|
||||
{
|
||||
struct rsvp_head *data = rtnl_dereference(tp->root);
|
||||
int i = 0xFFFF;
|
||||
|
||||
while (i-- > 0) {
|
||||
u32 h;
|
||||
|
||||
if ((data->hgenerator += 0x10000) == 0)
|
||||
data->hgenerator = 0x10000;
|
||||
h = data->hgenerator|salt;
|
||||
if (rsvp_get(tp, h) == 0)
|
||||
return h;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tunnel_bts(struct rsvp_head *data)
|
||||
{
|
||||
int n = data->tgenerator >> 5;
|
||||
u32 b = 1 << (data->tgenerator & 0x1F);
|
||||
|
||||
if (data->tmap[n] & b)
|
||||
return 0;
|
||||
data->tmap[n] |= b;
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void tunnel_recycle(struct rsvp_head *data)
|
||||
{
|
||||
struct rsvp_session __rcu **sht = data->ht;
|
||||
u32 tmap[256/32];
|
||||
int h1, h2;
|
||||
|
||||
memset(tmap, 0, sizeof(tmap));
|
||||
|
||||
for (h1 = 0; h1 < 256; h1++) {
|
||||
struct rsvp_session *s;
|
||||
for (s = rtnl_dereference(sht[h1]); s;
|
||||
s = rtnl_dereference(s->next)) {
|
||||
for (h2 = 0; h2 <= 16; h2++) {
|
||||
struct rsvp_filter *f;
|
||||
|
||||
for (f = rtnl_dereference(s->ht[h2]); f;
|
||||
f = rtnl_dereference(f->next)) {
|
||||
if (f->tunnelhdr == 0)
|
||||
continue;
|
||||
data->tgenerator = f->res.classid;
|
||||
tunnel_bts(data);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(data->tmap, tmap, sizeof(tmap));
|
||||
}
|
||||
|
||||
static u32 gen_tunnel(struct rsvp_head *data)
|
||||
{
|
||||
int i, k;
|
||||
|
||||
for (k = 0; k < 2; k++) {
|
||||
for (i = 255; i > 0; i--) {
|
||||
if (++data->tgenerator == 0)
|
||||
data->tgenerator = 1;
|
||||
if (tunnel_bts(data))
|
||||
return data->tgenerator;
|
||||
}
|
||||
tunnel_recycle(data);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
|
||||
[TCA_RSVP_CLASSID] = { .type = NLA_U32 },
|
||||
[TCA_RSVP_DST] = { .type = NLA_BINARY,
|
||||
.len = RSVP_DST_LEN * sizeof(u32) },
|
||||
[TCA_RSVP_SRC] = { .type = NLA_BINARY,
|
||||
.len = RSVP_DST_LEN * sizeof(u32) },
|
||||
[TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
|
||||
};
|
||||
|
||||
static int rsvp_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle,
|
||||
struct nlattr **tca,
|
||||
unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct rsvp_head *data = rtnl_dereference(tp->root);
|
||||
struct rsvp_filter *f, *nfp;
|
||||
struct rsvp_filter __rcu **fp;
|
||||
struct rsvp_session *nsp, *s;
|
||||
struct rsvp_session __rcu **sp;
|
||||
struct tc_rsvp_pinfo *pinfo = NULL;
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_RSVP_MAX + 1];
|
||||
struct tcf_exts e;
|
||||
unsigned int h1, h2;
|
||||
__be32 *dst;
|
||||
int err;
|
||||
|
||||
if (opt == NULL)
|
||||
return handle ? -EINVAL : 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
f = (struct rsvp_filter *)*arg;
|
||||
if (f) {
|
||||
/* Node exists: adjust only classid */
|
||||
struct rsvp_filter *n;
|
||||
|
||||
if (f->handle != handle && handle)
|
||||
goto errout2;
|
||||
|
||||
n = kmemdup(f, sizeof(*f), GFP_KERNEL);
|
||||
if (!n) {
|
||||
err = -ENOMEM;
|
||||
goto errout2;
|
||||
}
|
||||
|
||||
tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
|
||||
|
||||
if (tb[TCA_RSVP_CLASSID]) {
|
||||
n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
|
||||
tcf_bind_filter(tp, &n->res, base);
|
||||
}
|
||||
|
||||
tcf_exts_change(tp, &n->exts, &e);
|
||||
rsvp_replace(tp, n, handle);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Now more serious part... */
|
||||
err = -EINVAL;
|
||||
if (handle)
|
||||
goto errout2;
|
||||
if (tb[TCA_RSVP_DST] == NULL)
|
||||
goto errout2;
|
||||
|
||||
err = -ENOBUFS;
|
||||
f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
|
||||
if (f == NULL)
|
||||
goto errout2;
|
||||
|
||||
tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
|
||||
h2 = 16;
|
||||
if (tb[TCA_RSVP_SRC]) {
|
||||
memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
|
||||
h2 = hash_src(f->src);
|
||||
}
|
||||
if (tb[TCA_RSVP_PINFO]) {
|
||||
pinfo = nla_data(tb[TCA_RSVP_PINFO]);
|
||||
f->spi = pinfo->spi;
|
||||
f->tunnelhdr = pinfo->tunnelhdr;
|
||||
}
|
||||
if (tb[TCA_RSVP_CLASSID])
|
||||
f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
|
||||
|
||||
dst = nla_data(tb[TCA_RSVP_DST]);
|
||||
h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
|
||||
|
||||
err = -ENOMEM;
|
||||
if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
|
||||
goto errout;
|
||||
|
||||
if (f->tunnelhdr) {
|
||||
err = -EINVAL;
|
||||
if (f->res.classid > 255)
|
||||
goto errout;
|
||||
|
||||
err = -ENOMEM;
|
||||
if (f->res.classid == 0 &&
|
||||
(f->res.classid = gen_tunnel(data)) == 0)
|
||||
goto errout;
|
||||
}
|
||||
|
||||
for (sp = &data->ht[h1];
|
||||
(s = rtnl_dereference(*sp)) != NULL;
|
||||
sp = &s->next) {
|
||||
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
|
||||
pinfo && pinfo->protocol == s->protocol &&
|
||||
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
|
||||
#if RSVP_DST_LEN == 4
|
||||
dst[0] == s->dst[0] &&
|
||||
dst[1] == s->dst[1] &&
|
||||
dst[2] == s->dst[2] &&
|
||||
#endif
|
||||
pinfo->tunnelid == s->tunnelid) {
|
||||
|
||||
insert:
|
||||
/* OK, we found appropriate session */
|
||||
|
||||
fp = &s->ht[h2];
|
||||
|
||||
f->sess = s;
|
||||
if (f->tunnelhdr == 0)
|
||||
tcf_bind_filter(tp, &f->res, base);
|
||||
|
||||
tcf_exts_change(tp, &f->exts, &e);
|
||||
|
||||
fp = &s->ht[h2];
|
||||
for (nfp = rtnl_dereference(*fp); nfp;
|
||||
fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
|
||||
__u32 mask = nfp->spi.mask & f->spi.mask;
|
||||
|
||||
if (mask != f->spi.mask)
|
||||
break;
|
||||
}
|
||||
RCU_INIT_POINTER(f->next, nfp);
|
||||
rcu_assign_pointer(*fp, f);
|
||||
|
||||
*arg = (unsigned long)f;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* No session found. Create new one. */
|
||||
|
||||
err = -ENOBUFS;
|
||||
s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
|
||||
if (s == NULL)
|
||||
goto errout;
|
||||
memcpy(s->dst, dst, sizeof(s->dst));
|
||||
|
||||
if (pinfo) {
|
||||
s->dpi = pinfo->dpi;
|
||||
s->protocol = pinfo->protocol;
|
||||
s->tunnelid = pinfo->tunnelid;
|
||||
}
|
||||
sp = &data->ht[h1];
|
||||
for (nsp = rtnl_dereference(*sp); nsp;
|
||||
sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
|
||||
if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
|
||||
break;
|
||||
}
|
||||
RCU_INIT_POINTER(s->next, nsp);
|
||||
rcu_assign_pointer(*sp, s);
|
||||
|
||||
goto insert;
|
||||
|
||||
errout:
|
||||
kfree(f);
|
||||
errout2:
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
|
||||
{
|
||||
struct rsvp_head *head = rtnl_dereference(tp->root);
|
||||
unsigned int h, h1;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (h = 0; h < 256; h++) {
|
||||
struct rsvp_session *s;
|
||||
|
||||
for (s = rtnl_dereference(head->ht[h]); s;
|
||||
s = rtnl_dereference(s->next)) {
|
||||
for (h1 = 0; h1 <= 16; h1++) {
|
||||
struct rsvp_filter *f;
|
||||
|
||||
for (f = rtnl_dereference(s->ht[h1]); f;
|
||||
f = rtnl_dereference(f->next)) {
|
||||
if (arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
return;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct rsvp_filter *f = (struct rsvp_filter *)fh;
|
||||
struct rsvp_session *s;
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct nlattr *nest;
|
||||
struct tc_rsvp_pinfo pinfo;
|
||||
|
||||
if (f == NULL)
|
||||
return skb->len;
|
||||
s = f->sess;
|
||||
|
||||
t->tcm_handle = f->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
|
||||
goto nla_put_failure;
|
||||
pinfo.dpi = s->dpi;
|
||||
pinfo.spi = f->spi;
|
||||
pinfo.protocol = s->protocol;
|
||||
pinfo.tunnelid = s->tunnelid;
|
||||
pinfo.tunnelhdr = f->tunnelhdr;
|
||||
pinfo.pad = 0;
|
||||
if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
|
||||
goto nla_put_failure;
|
||||
if (f->res.classid &&
|
||||
nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
|
||||
goto nla_put_failure;
|
||||
if (((f->handle >> 8) & 0xFF) != 16 &&
|
||||
nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops RSVP_OPS __read_mostly = {
|
||||
.kind = RSVP_ID,
|
||||
.classify = rsvp_classify,
|
||||
.init = rsvp_init,
|
||||
.destroy = rsvp_destroy,
|
||||
.get = rsvp_get,
|
||||
.put = rsvp_put,
|
||||
.change = rsvp_change,
|
||||
.delete = rsvp_delete,
|
||||
.walk = rsvp_walk,
|
||||
.dump = rsvp_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_rsvp(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&RSVP_OPS);
|
||||
}
|
||||
|
||||
static void __exit exit_rsvp(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&RSVP_OPS);
|
||||
}
|
||||
|
||||
module_init(init_rsvp)
|
||||
module_exit(exit_rsvp)
|
||||
28
net/sched/cls_rsvp6.c
Normal file
@@ -0,0 +1,28 @@
/*
 * net/sched/cls_rsvp6.c	Special RSVP packet classifier for IPv6.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <net/netlink.h>

#define RSVP_DST_LEN	4
#define RSVP_ID		"rsvp6"
#define RSVP_OPS	cls_rsvp6_ops

#include "cls_rsvp.h"
MODULE_LICENSE("GPL");
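/*
 * A brief illustration of the template pattern above (a sketch, not part of
 * the classifier itself; the IPv4 counterpart is assumed to set the same
 * three macros to its own values): cls_rsvp.h is written once against
 * RSVP_DST_LEN, RSVP_ID and RSVP_OPS, so including it here effectively
 * produces
 *
 *	static struct tcf_proto_ops cls_rsvp6_ops __read_mostly = {
 *		.kind = "rsvp6",
 *		...
 *	};
 *
 * i.e. a complete classifier specialised for 4-word (128-bit) IPv6
 * destination and source addresses.
 */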
583
net/sched/cls_tcindex.c
Normal file
@ -0,0 +1,583 @@
/*
|
||||
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index
|
||||
*
|
||||
* Written 1998,1999 by Werner Almesberger, EPFL ICA
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/slab.h>
|
||||
#include <net/act_api.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
/*
|
||||
* Passing parameters to the root seems to be done more awkwardly than really
|
||||
* necessary. At least, u32 doesn't seem to use such dirty hacks. To be
|
||||
* verified. FIXME.
|
||||
*/
|
||||
|
||||
#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
|
||||
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
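/*
 * Worked example of the sizing rule above (parameter values are
 * illustrative, not from the original source): with the default mask
 * 0xffff and shift 0 the largest possible key is 65535, which is not
 * below PERFECT_HASH_THRESHOLD, so a 64-bucket imperfect hash is used.
 * With mask 0x3f and shift 0 the largest key is 63 < 64, so a 64-entry
 * perfect hash (one slot per possible key) is allocated instead.
 */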
|
||||
|
||||
|
||||
struct tcindex_filter_result {
|
||||
struct tcf_exts exts;
|
||||
struct tcf_result res;
|
||||
};
|
||||
|
||||
struct tcindex_filter {
|
||||
u16 key;
|
||||
struct tcindex_filter_result result;
|
||||
struct tcindex_filter __rcu *next;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
|
||||
struct tcindex_data {
|
||||
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
|
||||
struct tcindex_filter __rcu **h; /* imperfect hash; */
|
||||
struct tcf_proto *tp;
|
||||
u16 mask; /* AND key with mask */
|
||||
u32 shift; /* shift ANDed key to the right */
|
||||
u32 hash; /* hash table size; 0 if undefined */
|
||||
u32 alloc_hash; /* allocated size */
|
||||
u32 fall_through; /* 0: only classify if explicit match */
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
static inline int
|
||||
tcindex_filter_is_set(struct tcindex_filter_result *r)
|
||||
{
|
||||
return tcf_exts_is_predicative(&r->exts) || r->res.classid;
|
||||
}
|
||||
|
||||
static struct tcindex_filter_result *
|
||||
tcindex_lookup(struct tcindex_data *p, u16 key)
|
||||
{
|
||||
if (p->perfect) {
|
||||
struct tcindex_filter_result *f = p->perfect + key;
|
||||
|
||||
return tcindex_filter_is_set(f) ? f : NULL;
|
||||
} else if (p->h) {
|
||||
struct tcindex_filter __rcu **fp;
|
||||
struct tcindex_filter *f;
|
||||
|
||||
fp = &p->h[key % p->hash];
|
||||
for (f = rcu_dereference_bh_rtnl(*fp);
|
||||
f;
|
||||
fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
|
||||
if (f->key == key)
|
||||
return &f->result;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
|
||||
struct tcf_result *res)
|
||||
{
|
||||
struct tcindex_data *p = rcu_dereference_bh(tp->root);
|
||||
struct tcindex_filter_result *f;
|
||||
int key = (skb->tc_index & p->mask) >> p->shift;
|
||||
|
||||
pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
|
||||
skb, tp, res, p);
|
||||
|
||||
f = tcindex_lookup(p, key);
|
||||
if (!f) {
|
||||
if (!p->fall_through)
|
||||
return -1;
|
||||
res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
|
||||
res->class = 0;
|
||||
pr_debug("alg 0x%x\n", res->classid);
|
||||
return 0;
|
||||
}
|
||||
*res = f->res;
|
||||
pr_debug("map 0x%x\n", res->classid);
|
||||
|
||||
return tcf_exts_exec(skb, &f->exts, res);
|
||||
}
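/*
 * Worked example of the lookup key computed above (illustrative numbers
 * only): with p->mask = 0x00f0 and p->shift = 4, an skb carrying
 * tc_index = 0x002a yields key = (0x002a & 0x00f0) >> 4 = 0x2.  If no
 * filter is installed for key 2 and fall_through is set, the packet is
 * classified to TC_H_MAKE(major of tp->q->handle, 2) as a last resort.
 */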
|
||||
|
||||
|
||||
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
|
||||
{
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcindex_filter_result *r;
|
||||
|
||||
pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
|
||||
if (p->perfect && handle >= p->alloc_hash)
|
||||
return 0;
|
||||
r = tcindex_lookup(p, handle);
|
||||
return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL;
|
||||
}
|
||||
|
||||
|
||||
static void tcindex_put(struct tcf_proto *tp, unsigned long f)
|
||||
{
|
||||
pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f);
|
||||
}
|
||||
|
||||
|
||||
static int tcindex_init(struct tcf_proto *tp)
|
||||
{
|
||||
struct tcindex_data *p;
|
||||
|
||||
pr_debug("tcindex_init(tp %p)\n", tp);
|
||||
p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
|
||||
p->mask = 0xffff;
|
||||
p->hash = DEFAULT_HASH_SIZE;
|
||||
p->fall_through = 1;
|
||||
|
||||
rcu_assign_pointer(tp->root, p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
tcindex_delete(struct tcf_proto *tp, unsigned long arg)
|
||||
{
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
|
||||
struct tcindex_filter __rcu **walk;
|
||||
struct tcindex_filter *f = NULL;
|
||||
|
||||
pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p);
|
||||
if (p->perfect) {
|
||||
if (!r->res.class)
|
||||
return -ENOENT;
|
||||
} else {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < p->hash; i++) {
|
||||
walk = p->h + i;
|
||||
for (f = rtnl_dereference(*walk); f;
|
||||
walk = &f->next, f = rtnl_dereference(*walk)) {
|
||||
if (&f->result == r)
|
||||
goto found;
|
||||
}
|
||||
}
|
||||
return -ENOENT;
|
||||
|
||||
found:
|
||||
rcu_assign_pointer(*walk, rtnl_dereference(f->next));
|
||||
}
|
||||
tcf_unbind_filter(tp, &r->res);
|
||||
tcf_exts_destroy(&r->exts);
|
||||
if (f)
|
||||
kfree_rcu(f, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tcindex_destroy_element(struct tcf_proto *tp,
|
||||
unsigned long arg,
|
||||
struct tcf_walker *walker)
|
||||
{
|
||||
return tcindex_delete(tp, arg);
|
||||
}
|
||||
|
||||
static void __tcindex_destroy(struct rcu_head *head)
|
||||
{
|
||||
struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
|
||||
|
||||
kfree(p->perfect);
|
||||
kfree(p->h);
|
||||
kfree(p);
|
||||
}
|
||||
|
||||
static inline int
|
||||
valid_perfect_hash(struct tcindex_data *p)
|
||||
{
|
||||
return p->hash > (p->mask >> p->shift);
|
||||
}
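/*
 * The predicate above simply checks that every reachable key fits in the
 * table: the largest key is (mask >> shift), so a perfect hash needs
 * hash > (mask >> shift) entries.  For example (illustrative values),
 * mask 0xf0 with shift 4 gives keys 0..15, so any hash size of 16 or
 * more qualifies.
 */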
|
||||
|
||||
static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
|
||||
[TCA_TCINDEX_HASH] = { .type = NLA_U32 },
|
||||
[TCA_TCINDEX_MASK] = { .type = NLA_U16 },
|
||||
[TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
|
||||
[TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
|
||||
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static void tcindex_filter_result_init(struct tcindex_filter_result *r)
|
||||
{
|
||||
memset(r, 0, sizeof(*r));
|
||||
tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
|
||||
}
|
||||
|
||||
static void __tcindex_partial_destroy(struct rcu_head *head)
|
||||
{
|
||||
struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
|
||||
|
||||
kfree(p->perfect);
|
||||
kfree(p);
|
||||
}
|
||||
|
||||
static int
|
||||
tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
|
||||
u32 handle, struct tcindex_data *p,
|
||||
struct tcindex_filter_result *r, struct nlattr **tb,
|
||||
struct nlattr *est, bool ovr)
|
||||
{
|
||||
int err, balloc = 0;
|
||||
struct tcindex_filter_result new_filter_result, *old_r = r;
|
||||
struct tcindex_filter_result cr;
|
||||
struct tcindex_data *cp, *oldp;
|
||||
struct tcindex_filter *f = NULL; /* make gcc behave */
|
||||
struct tcf_exts e;
|
||||
|
||||
tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
|
||||
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = -ENOMEM;
|
||||
/* tcindex_data attributes must look atomic to classifier/lookup so
|
||||
* allocate new tcindex data and RCU assign it onto root. Keeping
|
||||
* perfect hash and hash pointers from old data.
|
||||
*/
|
||||
cp = kzalloc(sizeof(*cp), GFP_KERNEL);
|
||||
if (!cp)
|
||||
goto errout;
|
||||
|
||||
cp->mask = p->mask;
|
||||
cp->shift = p->shift;
|
||||
cp->hash = p->hash;
|
||||
cp->alloc_hash = p->alloc_hash;
|
||||
cp->fall_through = p->fall_through;
|
||||
cp->tp = tp;
|
||||
|
||||
if (p->perfect) {
|
||||
int i;
|
||||
|
||||
cp->perfect = kmemdup(p->perfect,
|
||||
sizeof(*r) * cp->hash, GFP_KERNEL);
|
||||
if (!cp->perfect)
|
||||
goto errout;
|
||||
for (i = 0; i < cp->hash; i++)
|
||||
tcf_exts_init(&cp->perfect[i].exts,
|
||||
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
|
||||
balloc = 1;
|
||||
}
|
||||
cp->h = p->h;
|
||||
|
||||
tcindex_filter_result_init(&new_filter_result);
|
||||
tcindex_filter_result_init(&cr);
|
||||
if (old_r)
|
||||
cr.res = r->res;
|
||||
|
||||
if (tb[TCA_TCINDEX_HASH])
|
||||
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
|
||||
|
||||
if (tb[TCA_TCINDEX_MASK])
|
||||
cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
|
||||
|
||||
if (tb[TCA_TCINDEX_SHIFT])
|
||||
cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
|
||||
|
||||
err = -EBUSY;
|
||||
|
||||
/* Hash already allocated, make sure that we still meet the
|
||||
* requirements for the allocated hash.
|
||||
*/
|
||||
if (cp->perfect) {
|
||||
if (!valid_perfect_hash(cp) ||
|
||||
cp->hash > cp->alloc_hash)
|
||||
goto errout_alloc;
|
||||
} else if (cp->h && cp->hash != cp->alloc_hash) {
|
||||
goto errout_alloc;
|
||||
}
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[TCA_TCINDEX_FALL_THROUGH])
|
||||
cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
|
||||
|
||||
if (!cp->hash) {
|
||||
/* Hash not specified, use perfect hash if the upper limit
|
||||
* of the hashing index is below the threshold.
|
||||
*/
|
||||
if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
|
||||
cp->hash = (cp->mask >> cp->shift) + 1;
|
||||
else
|
||||
cp->hash = DEFAULT_HASH_SIZE;
|
||||
}
|
||||
|
||||
if (!cp->perfect && !cp->h)
|
||||
cp->alloc_hash = cp->hash;
|
||||
|
||||
/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
|
||||
* but then, we'd fail handles that may become valid after some future
|
||||
* mask change. While this is extremely unlikely to ever matter,
|
||||
* the check below is safer (and also more backwards-compatible).
|
||||
*/
|
||||
if (cp->perfect || valid_perfect_hash(cp))
|
||||
if (handle >= cp->alloc_hash)
|
||||
goto errout_alloc;
|
||||
|
||||
|
||||
err = -ENOMEM;
|
||||
if (!cp->perfect && !cp->h) {
|
||||
if (valid_perfect_hash(cp)) {
|
||||
int i;
|
||||
|
||||
cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
|
||||
if (!cp->perfect)
|
||||
goto errout_alloc;
|
||||
for (i = 0; i < cp->hash; i++)
|
||||
tcf_exts_init(&cp->perfect[i].exts,
|
||||
TCA_TCINDEX_ACT,
|
||||
TCA_TCINDEX_POLICE);
|
||||
balloc = 1;
|
||||
} else {
|
||||
struct tcindex_filter __rcu **hash;
|
||||
|
||||
hash = kcalloc(cp->hash,
|
||||
sizeof(struct tcindex_filter *),
|
||||
GFP_KERNEL);
|
||||
|
||||
if (!hash)
|
||||
goto errout_alloc;
|
||||
|
||||
cp->h = hash;
|
||||
balloc = 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (cp->perfect)
|
||||
r = cp->perfect + handle;
|
||||
else
|
||||
r = tcindex_lookup(cp, handle) ? : &new_filter_result;
|
||||
|
||||
if (r == &new_filter_result) {
|
||||
f = kzalloc(sizeof(*f), GFP_KERNEL);
|
||||
if (!f)
|
||||
goto errout_alloc;
|
||||
f->key = handle;
|
||||
tcindex_filter_result_init(&f->result);
|
||||
f->next = NULL;
|
||||
}
|
||||
|
||||
if (tb[TCA_TCINDEX_CLASSID]) {
|
||||
cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
|
||||
tcf_bind_filter(tp, &cr.res, base);
|
||||
}
|
||||
|
||||
if (old_r)
|
||||
tcf_exts_change(tp, &r->exts, &e);
|
||||
else
|
||||
tcf_exts_change(tp, &cr.exts, &e);
|
||||
|
||||
if (old_r && old_r != r)
|
||||
tcindex_filter_result_init(old_r);
|
||||
|
||||
oldp = p;
|
||||
r->res = cr.res;
|
||||
rcu_assign_pointer(tp->root, cp);
|
||||
|
||||
if (r == &new_filter_result) {
|
||||
struct tcindex_filter *nfp;
|
||||
struct tcindex_filter __rcu **fp;
|
||||
|
||||
tcf_exts_change(tp, &f->result.exts, &r->exts);
|
||||
|
||||
fp = cp->h + (handle % cp->hash);
|
||||
for (nfp = rtnl_dereference(*fp);
|
||||
nfp;
|
||||
fp = &nfp->next, nfp = rtnl_dereference(*fp))
|
||||
; /* nothing */
|
||||
|
||||
rcu_assign_pointer(*fp, f);
|
||||
}
|
||||
|
||||
if (oldp)
|
||||
call_rcu(&oldp->rcu, __tcindex_partial_destroy);
|
||||
return 0;
|
||||
|
||||
errout_alloc:
|
||||
if (balloc == 1)
|
||||
kfree(cp->perfect);
|
||||
else if (balloc == 2)
|
||||
kfree(cp->h);
|
||||
errout:
|
||||
kfree(cp);
|
||||
tcf_exts_destroy(&e);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int
|
||||
tcindex_change(struct net *net, struct sk_buff *in_skb,
|
||||
struct tcf_proto *tp, unsigned long base, u32 handle,
|
||||
struct nlattr **tca, unsigned long *arg, bool ovr)
|
||||
{
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_TCINDEX_MAX + 1];
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
|
||||
int err;
|
||||
|
||||
pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
|
||||
"p %p,r %p,*arg 0x%lx\n",
|
||||
tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
|
||||
|
||||
if (!opt)
|
||||
return 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
return tcindex_set_parms(net, tp, base, handle, p, r, tb,
|
||||
tca[TCA_RATE], ovr);
|
||||
}
|
||||
|
||||
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
|
||||
{
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcindex_filter *f, *next;
|
||||
int i;
|
||||
|
||||
pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
|
||||
if (p->perfect) {
|
||||
for (i = 0; i < p->hash; i++) {
|
||||
if (!p->perfect[i].res.class)
|
||||
continue;
|
||||
if (walker->count >= walker->skip) {
|
||||
if (walker->fn(tp,
|
||||
(unsigned long) (p->perfect+i), walker)
|
||||
< 0) {
|
||||
walker->stop = 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
if (!p->h)
|
||||
return;
|
||||
for (i = 0; i < p->hash; i++) {
|
||||
for (f = rtnl_dereference(p->h[i]); f; f = next) {
|
||||
next = rtnl_dereference(f->next);
|
||||
if (walker->count >= walker->skip) {
|
||||
if (walker->fn(tp, (unsigned long) &f->result,
|
||||
walker) < 0) {
|
||||
walker->stop = 1;
|
||||
return;
|
||||
}
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void tcindex_destroy(struct tcf_proto *tp)
|
||||
{
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcf_walker walker;
|
||||
|
||||
pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
|
||||
walker.count = 0;
|
||||
walker.skip = 0;
|
||||
walker.fn = tcindex_destroy_element;
|
||||
tcindex_walk(tp, &walker);
|
||||
|
||||
RCU_INIT_POINTER(tp->root, NULL);
|
||||
call_rcu(&p->rcu, __tcindex_destroy);
|
||||
}
|
||||
|
||||
|
||||
static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
|
||||
struct sk_buff *skb, struct tcmsg *t)
|
||||
{
|
||||
struct tcindex_data *p = rtnl_dereference(tp->root);
|
||||
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct nlattr *nest;
|
||||
|
||||
pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
|
||||
tp, fh, skb, t, p, r, b);
|
||||
pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (!fh) {
|
||||
t->tcm_handle = ~0; /* whatever ... */
|
||||
if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
|
||||
nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
|
||||
nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
|
||||
nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
|
||||
goto nla_put_failure;
|
||||
nla_nest_end(skb, nest);
|
||||
} else {
|
||||
if (p->perfect) {
|
||||
t->tcm_handle = r - p->perfect;
|
||||
} else {
|
||||
struct tcindex_filter *f;
|
||||
struct tcindex_filter __rcu **fp;
|
||||
int i;
|
||||
|
||||
t->tcm_handle = 0;
|
||||
for (i = 0; !t->tcm_handle && i < p->hash; i++) {
|
||||
fp = &p->h[i];
|
||||
for (f = rtnl_dereference(*fp);
|
||||
!t->tcm_handle && f;
|
||||
fp = &f->next, f = rtnl_dereference(*fp)) {
|
||||
if (&f->result == r)
|
||||
t->tcm_handle = f->key;
|
||||
}
|
||||
}
|
||||
}
|
||||
pr_debug("handle = %d\n", t->tcm_handle);
|
||||
if (r->res.class &&
|
||||
nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (tcf_exts_dump(skb, &r->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
nla_nest_end(skb, nest);
|
||||
|
||||
if (tcf_exts_dump_stats(skb, &r->exts) < 0)
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
|
||||
.kind = "tcindex",
|
||||
.classify = tcindex_classify,
|
||||
.init = tcindex_init,
|
||||
.destroy = tcindex_destroy,
|
||||
.get = tcindex_get,
|
||||
.put = tcindex_put,
|
||||
.change = tcindex_change,
|
||||
.delete = tcindex_delete,
|
||||
.walk = tcindex_walk,
|
||||
.dump = tcindex_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init init_tcindex(void)
|
||||
{
|
||||
return register_tcf_proto_ops(&cls_tcindex_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_tcindex(void)
|
||||
{
|
||||
unregister_tcf_proto_ops(&cls_tcindex_ops);
|
||||
}
|
||||
|
||||
module_init(init_tcindex)
|
||||
module_exit(exit_tcindex)
|
||||
MODULE_LICENSE("GPL");
1057
net/sched/cls_u32.c
Normal file
File diff suppressed because it is too large
233
net/sched/em_canid.c
Normal file
@ -0,0 +1,233 @@
/*
|
||||
* em_canid.c Ematch rule to match CAN frames according to their CAN IDs
|
||||
*
|
||||
* This program is free software; you can distribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
|
||||
* Copyright: (c) 2011 Czech Technical University in Prague
|
||||
* (c) 2011 Volkswagen Group Research
|
||||
* Authors: Michal Sojka <sojkam1@fel.cvut.cz>
|
||||
* Pavel Pisa <pisa@cmp.felk.cvut.cz>
|
||||
* Rostislav Lisovy <lisovy@gmail.cz>
|
||||
* Funded by: Volkswagen Group Research
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_cls.h>
|
||||
#include <linux/can.h>
|
||||
|
||||
#define EM_CAN_RULES_MAX 500
|
||||
|
||||
struct canid_match {
|
||||
/* For each SFF CAN ID (11 bit) there is one record in this bitfield */
|
||||
DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
|
||||
|
||||
int rules_count;
|
||||
int sff_rules_count;
|
||||
int eff_rules_count;
|
||||
|
||||
/*
|
||||
* Raw rules copied from netlink message; Used for sending
|
||||
* information to userspace (when 'tc filter show' is invoked)
|
||||
* AND when matching EFF frames
|
||||
*/
|
||||
struct can_filter rules_raw[];
|
||||
};
|
||||
|
||||
/**
|
||||
* em_canid_get_id() - Extracts Can ID out of the sk_buff structure.
|
||||
*/
|
||||
static canid_t em_canid_get_id(struct sk_buff *skb)
|
||||
{
|
||||
/* CAN ID is stored within the data field */
|
||||
struct can_frame *cf = (struct can_frame *)skb->data;
|
||||
|
||||
return cf->can_id;
|
||||
}
|
||||
|
||||
static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
|
||||
u32 can_mask)
|
||||
{
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Limit can_mask and can_id to SFF range to
|
||||
* protect against write after end of array
|
||||
*/
|
||||
can_mask &= CAN_SFF_MASK;
|
||||
can_id &= can_mask;
|
||||
|
||||
/* Single frame */
|
||||
if (can_mask == CAN_SFF_MASK) {
|
||||
set_bit(can_id, cm->match_sff);
|
||||
return;
|
||||
}
|
||||
|
||||
/* All frames */
|
||||
if (can_mask == 0) {
|
||||
bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Individual frame filter.
|
||||
* Add record (set bit to 1) for each ID that
|
||||
* conforms particular rule
|
||||
*/
|
||||
for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
|
||||
if ((i & can_mask) == can_id)
|
||||
set_bit(i, cm->match_sff);
|
||||
}
|
||||
}
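/*
 * Worked example of the bitmap expansion above (the filter values are
 * illustrative, not from the original source): a rule with
 * can_id = 0x120 and can_mask = 0x7f0 matches every SFF ID whose upper
 * bits equal 0x12x, so the loop sets the 16 bits for IDs 0x120..0x12f in
 * match_sff.  Matching an incoming SFF frame is then a single test_bit()
 * in em_canid_match().
 */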
|
||||
|
||||
static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
|
||||
{
|
||||
return (struct canid_match *)m->data;
|
||||
}
|
||||
|
||||
static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
struct canid_match *cm = em_canid_priv(m);
|
||||
canid_t can_id;
|
||||
int match = 0;
|
||||
int i;
|
||||
const struct can_filter *lp;
|
||||
|
||||
can_id = em_canid_get_id(skb);
|
||||
|
||||
if (can_id & CAN_EFF_FLAG) {
|
||||
for (i = 0, lp = cm->rules_raw;
|
||||
i < cm->eff_rules_count; i++, lp++) {
|
||||
if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
|
||||
match = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else { /* SFF */
|
||||
can_id &= CAN_SFF_MASK;
|
||||
match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
|
||||
}
|
||||
|
||||
return match;
|
||||
}
|
||||
|
||||
static int em_canid_change(struct net *net, void *data, int len,
|
||||
struct tcf_ematch *m)
|
||||
{
|
||||
struct can_filter *conf = data; /* Array with rules */
|
||||
struct canid_match *cm;
|
||||
int i;
|
||||
|
||||
if (!len)
|
||||
return -EINVAL;
|
||||
|
||||
if (len % sizeof(struct can_filter))
|
||||
return -EINVAL;
|
||||
|
||||
if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
|
||||
if (!cm)
|
||||
return -ENOMEM;
|
||||
|
||||
cm->rules_count = len / sizeof(struct can_filter);
|
||||
|
||||
/*
|
||||
* We need two for() loops for copying rules into two contiguous
|
||||
* areas in rules_raw to process all eff rules with a simple loop.
|
||||
* NB: The configuration interface supports sff and eff rules.
|
||||
* We do not support filters here that match for the same can_id
|
||||
* provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
|
||||
* For this (unusual case) two filters have to be specified. The
|
||||
* SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
|
||||
*/
|
||||
|
||||
/* Fill rules_raw with EFF rules first */
|
||||
for (i = 0; i < cm->rules_count; i++) {
|
||||
if (conf[i].can_id & CAN_EFF_FLAG) {
|
||||
memcpy(cm->rules_raw + cm->eff_rules_count,
|
||||
&conf[i],
|
||||
sizeof(struct can_filter));
|
||||
|
||||
cm->eff_rules_count++;
|
||||
}
|
||||
}
|
||||
|
||||
/* append SFF frame rules */
|
||||
for (i = 0; i < cm->rules_count; i++) {
|
||||
if (!(conf[i].can_id & CAN_EFF_FLAG)) {
|
||||
memcpy(cm->rules_raw
|
||||
+ cm->eff_rules_count
|
||||
+ cm->sff_rules_count,
|
||||
&conf[i], sizeof(struct can_filter));
|
||||
|
||||
cm->sff_rules_count++;
|
||||
|
||||
em_canid_sff_match_add(cm,
|
||||
conf[i].can_id, conf[i].can_mask);
|
||||
}
|
||||
}
|
||||
|
||||
m->datalen = sizeof(struct canid_match) + len;
|
||||
m->data = (unsigned long)cm;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void em_canid_destroy(struct tcf_ematch *m)
|
||||
{
|
||||
struct canid_match *cm = em_canid_priv(m);
|
||||
|
||||
kfree(cm);
|
||||
}
|
||||
|
||||
static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
|
||||
{
|
||||
struct canid_match *cm = em_canid_priv(m);
|
||||
|
||||
/*
|
||||
* When configuring this ematch 'rules_count' is set not to exceed
|
||||
* 'rules_raw' array size
|
||||
*/
|
||||
if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
|
||||
&cm->rules_raw) < 0)
|
||||
return -EMSGSIZE;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct tcf_ematch_ops em_canid_ops = {
|
||||
.kind = TCF_EM_CANID,
|
||||
.change = em_canid_change,
|
||||
.match = em_canid_match,
|
||||
.destroy = em_canid_destroy,
|
||||
.dump = em_canid_dump,
|
||||
.owner = THIS_MODULE,
|
||||
.link = LIST_HEAD_INIT(em_canid_ops.link)
|
||||
};
|
||||
|
||||
static int __init init_em_canid(void)
|
||||
{
|
||||
return tcf_em_register(&em_canid_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_em_canid(void)
|
||||
{
|
||||
tcf_em_unregister(&em_canid_ops);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_em_canid);
|
||||
module_exit(exit_em_canid);
|
||||
|
||||
MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
99
net/sched/em_cmp.c
Normal file
@ -0,0 +1,99 @@
/*
 * net/sched/em_cmp.c	Simple packet data comparison ematch
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_cmp.h>
#include <asm/unaligned.h>
#include <net/pkt_cls.h>

static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
{
	return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
}

static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
			struct tcf_pkt_info *info)
{
	struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
	unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
	u32 val = 0;

	if (!tcf_valid_offset(skb, ptr, cmp->align))
		return 0;

	switch (cmp->align) {
	case TCF_EM_ALIGN_U8:
		val = *ptr;
		break;

	case TCF_EM_ALIGN_U16:
		val = get_unaligned_be16(ptr);

		if (cmp_needs_transformation(cmp))
			val = be16_to_cpu(val);
		break;

	case TCF_EM_ALIGN_U32:
		/* Worth checking boundaries? The branching seems
		 * to get worse. Visit again.
		 */
		val = get_unaligned_be32(ptr);

		if (cmp_needs_transformation(cmp))
			val = be32_to_cpu(val);
		break;

	default:
		return 0;
	}

	if (cmp->mask)
		val &= cmp->mask;

	switch (cmp->opnd) {
	case TCF_EM_OPND_EQ:
		return val == cmp->val;
	case TCF_EM_OPND_LT:
		return val < cmp->val;
	case TCF_EM_OPND_GT:
		return val > cmp->val;
	}

	return 0;
}

static struct tcf_ematch_ops em_cmp_ops = {
	.kind	  = TCF_EM_CMP,
	.datalen  = sizeof(struct tcf_em_cmp),
	.match	  = em_cmp_match,
	.owner	  = THIS_MODULE,
	.link	  = LIST_HEAD_INIT(em_cmp_ops.link)
};

static int __init init_em_cmp(void)
{
	return tcf_em_register(&em_cmp_ops);
}

static void __exit exit_em_cmp(void)
{
	tcf_em_unregister(&em_cmp_ops);
}

MODULE_LICENSE("GPL");

module_init(init_em_cmp);
module_exit(exit_em_cmp);

MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
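/*
 * A usage sketch for the cmp ematch above (the tc command syntax is an
 * assumption based on the iproute2 ematch frontend, shown only as an
 * illustration):
 *
 *	tc filter add dev eth0 parent 1: basic \
 *		match 'cmp(u8 at 0 layer network mask 0xf0 eq 0x40)' \
 *		classid 1:10
 *
 * This loads a u8 at offset 0 of the network header, masks it with 0xf0
 * and compares for equality with 0x40, i.e. it selects IPv4 packets by
 * their version nibble, exercising exactly the path in em_cmp_match().
 */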
135
net/sched/em_ipset.c
Normal file
@ -0,0 +1,135 @@
/*
|
||||
* net/sched/em_ipset.c ipset ematch
|
||||
*
|
||||
* Copyright (c) 2012 Florian Westphal <fw@strlen.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 2 as published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netfilter/xt_set.h>
|
||||
#include <linux/ipv6.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
static int em_ipset_change(struct net *net, void *data, int data_len,
|
||||
struct tcf_ematch *em)
|
||||
{
|
||||
struct xt_set_info *set = data;
|
||||
ip_set_id_t index;
|
||||
|
||||
if (data_len != sizeof(*set))
|
||||
return -EINVAL;
|
||||
|
||||
index = ip_set_nfnl_get_byindex(net, set->index);
|
||||
if (index == IPSET_INVALID_ID)
|
||||
return -ENOENT;
|
||||
|
||||
em->datalen = sizeof(*set);
|
||||
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
|
||||
if (em->data)
|
||||
return 0;
|
||||
|
||||
ip_set_nfnl_put(net, index);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static void em_ipset_destroy(struct tcf_ematch *em)
|
||||
{
|
||||
const struct xt_set_info *set = (const void *) em->data;
|
||||
if (set) {
|
||||
ip_set_nfnl_put(em->net, set->index);
|
||||
kfree((void *) em->data);
|
||||
}
|
||||
}
|
||||
|
||||
static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
struct ip_set_adt_opt opt;
|
||||
struct xt_action_param acpar;
|
||||
const struct xt_set_info *set = (const void *) em->data;
|
||||
struct net_device *dev, *indev = NULL;
|
||||
int ret, network_offset;
|
||||
|
||||
switch (skb->protocol) {
|
||||
case htons(ETH_P_IP):
|
||||
acpar.family = NFPROTO_IPV4;
|
||||
if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
|
||||
return 0;
|
||||
acpar.thoff = ip_hdrlen(skb);
|
||||
break;
|
||||
case htons(ETH_P_IPV6):
|
||||
acpar.family = NFPROTO_IPV6;
|
||||
if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
|
||||
return 0;
|
||||
/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
|
||||
acpar.thoff = sizeof(struct ipv6hdr);
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
acpar.hooknum = 0;
|
||||
|
||||
opt.family = acpar.family;
|
||||
opt.dim = set->dim;
|
||||
opt.flags = set->flags;
|
||||
opt.cmdflags = 0;
|
||||
opt.ext.timeout = ~0u;
|
||||
|
||||
network_offset = skb_network_offset(skb);
|
||||
skb_pull(skb, network_offset);
|
||||
|
||||
dev = skb->dev;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
if (dev && skb->skb_iif)
|
||||
indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
|
||||
|
||||
acpar.in = indev ? indev : dev;
|
||||
acpar.out = dev;
|
||||
|
||||
ret = ip_set_test(set->index, skb, &acpar, &opt);
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
skb_push(skb, network_offset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct tcf_ematch_ops em_ipset_ops = {
|
||||
.kind = TCF_EM_IPSET,
|
||||
.change = em_ipset_change,
|
||||
.destroy = em_ipset_destroy,
|
||||
.match = em_ipset_match,
|
||||
.owner = THIS_MODULE,
|
||||
.link = LIST_HEAD_INIT(em_ipset_ops.link)
|
||||
};
|
||||
|
||||
static int __init init_em_ipset(void)
|
||||
{
|
||||
return tcf_em_register(&em_ipset_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_em_ipset(void)
|
||||
{
|
||||
tcf_em_unregister(&em_ipset_ops);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
|
||||
MODULE_DESCRIPTION("TC extended match for IP sets");
|
||||
|
||||
module_init(init_em_ipset);
|
||||
module_exit(exit_em_ipset);
|
||||
|
||||
MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);
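/*
 * A usage sketch for the ipset ematch above (the set name and the tc
 * syntax are assumptions for illustration only):
 *
 *	ipset create blacklist hash:ip
 *	tc filter add dev eth0 parent 1: basic \
 *		match 'ipset(blacklist src)' classid 1:20
 *
 * em_ipset_change() resolves the set index once at configuration time and
 * em_ipset_match() calls ip_set_test() per packet, so membership changes
 * in the set take effect without reloading the filter.
 */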
966
net/sched/em_meta.c
Normal file
@ -0,0 +1,966 @@
/*
 * net/sched/em_meta.c	Metadata ematch
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * ==========================================================================
 *
 * The metadata ematch compares two meta objects where each object
 * represents either a meta value stored in the kernel or a static
 * value provided by userspace. The objects are not provided by
 * userspace itself but rather a definition providing the information
 * to build them. Every object is of a certain type which must be
 * equal to the object it is being compared to.
 *
 * The definition of an object consists of the type (meta type), an
 * identifier (meta id) and additional type specific information.
 * The meta id is either TCF_META_ID_VALUE for values provided by
 * userspace or an index into the meta operations table consisting of
 * function pointers to type specific meta data collectors returning
 * the value of the requested meta value.
 *
 *	         lvalue                                   rvalue
 *	      +-----------+                           +-----------+
 *	      | type: INT |                           | type: INT |
 *	 def  | id: DEV   |                           | id: VALUE |
 *	      | data:     |                           | data: 3   |
 *	      +-----------+                           +-----------+
 *	         |                                       |
 *	         ---> meta_ops[INT][DEV](...)            |
 *	                   |                             |
 *	         -----------                             |
 *	         V                                       V
 *	      +-----------+                           +-----------+
 *	      | type: INT |                           | type: INT |
 *	 obj  | id: DEV   |                           | id: VALUE |
 *	      | data: 2   |<--data got filled out     | data: 3   |
 *	      +-----------+                           +-----------+
 *	                 |                               |
 *	                 --------------> 2  equals  3 <--------------
 *
 * This is a simplified schema; the complexity varies depending
 * on the meta type. Obviously, the length of the data must also
 * be provided for non-numeric types.
 *
 * Additionally, type dependent modifiers such as shift operators
 * or mask may be applied to extend the functionality. As of now,
 * the variable length type supports shifting the byte string to
 * the right, eating up any number of octets and thus supporting
 * wildcard interface name comparisons such as "ppp%" matching
 * ppp0..9.
 *
 * NOTE: Certain meta values depend on other subsystems and are
 *       only available if that subsystem is enabled in the kernel.
 */
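/*
 * A concrete instance of the schema above (the tc command syntax is an
 * assumption; the constants are the ones used in this file): matching
 * "packets whose skb->priority equals 3" builds an INT/PRIORITY lvalue
 * and an INT/VALUE rvalue,
 *
 *	tc filter add dev eth0 parent 1: basic \
 *		match 'meta(priority eq 3)' classid 1:30
 *
 * At match time meta_ops[INT][PRIORITY].get() fills the lvalue with
 * skb->priority and meta_int_compare() decides the result.
 */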
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <linux/tc_ematch/tc_em_meta.h>
|
||||
#include <net/dst.h>
|
||||
#include <net/route.h>
|
||||
#include <net/pkt_cls.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
struct meta_obj {
|
||||
unsigned long value;
|
||||
unsigned int len;
|
||||
};
|
||||
|
||||
struct meta_value {
|
||||
struct tcf_meta_val hdr;
|
||||
unsigned long val;
|
||||
unsigned int len;
|
||||
};
|
||||
|
||||
struct meta_match {
|
||||
struct meta_value lvalue;
|
||||
struct meta_value rvalue;
|
||||
};
|
||||
|
||||
static inline int meta_id(struct meta_value *v)
|
||||
{
|
||||
return TCF_META_ID(v->hdr.kind);
|
||||
}
|
||||
|
||||
static inline int meta_type(struct meta_value *v)
|
||||
{
|
||||
return TCF_META_TYPE(v->hdr.kind);
|
||||
}
|
||||
|
||||
#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
|
||||
struct tcf_pkt_info *info, struct meta_value *v, \
|
||||
struct meta_obj *dst, int *err)
|
||||
|
||||
/**************************************************************************
|
||||
* System status & misc
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_random)
|
||||
{
|
||||
get_random_bytes(&dst->value, sizeof(dst->value));
|
||||
}
|
||||
|
||||
static inline unsigned long fixed_loadavg(int load)
|
||||
{
|
||||
int rnd_load = load + (FIXED_1/200);
|
||||
int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
|
||||
|
||||
return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
|
||||
}
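/*
 * Worked example of the fixed-point conversion above (FSHIFT is 11 and
 * FIXED_1 is 2048 in this kernel; the load value is illustrative): a one
 * minute load average of 0.75 is stored in avenrun[0] as
 * 0.75 * 2048 = 1536.  Then rnd_load = 1536 + 2048/200 = 1546,
 * rnd_frac = ((1546 & 2047) * 100) >> 11 = 75 and (1546 >> 11) * 100 = 0,
 * so the collector reports 75, i.e. the load scaled to hundredths.
 */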
|
||||
|
||||
META_COLLECTOR(int_loadavg_0)
|
||||
{
|
||||
dst->value = fixed_loadavg(avenrun[0]);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_loadavg_1)
|
||||
{
|
||||
dst->value = fixed_loadavg(avenrun[1]);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_loadavg_2)
|
||||
{
|
||||
dst->value = fixed_loadavg(avenrun[2]);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Device names & indices
|
||||
**************************************************************************/
|
||||
|
||||
static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
|
||||
{
|
||||
if (unlikely(dev == NULL))
|
||||
return -1;
|
||||
|
||||
dst->value = dev->ifindex;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
|
||||
{
|
||||
if (unlikely(dev == NULL))
|
||||
return -1;
|
||||
|
||||
dst->value = (unsigned long) dev->name;
|
||||
dst->len = strlen(dev->name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_dev)
|
||||
{
|
||||
*err = int_dev(skb->dev, dst);
|
||||
}
|
||||
|
||||
META_COLLECTOR(var_dev)
|
||||
{
|
||||
*err = var_dev(skb->dev, dst);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* vlan tag
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_vlan_tag)
|
||||
{
|
||||
unsigned short tag;
|
||||
|
||||
tag = vlan_tx_tag_get(skb);
|
||||
if (!tag && __vlan_get_tag(skb, &tag))
|
||||
*err = -1;
|
||||
else
|
||||
dst->value = tag;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
* skb attributes
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_priority)
|
||||
{
|
||||
dst->value = skb->priority;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_protocol)
|
||||
{
|
||||
/* Let userspace take care of the byte ordering */
|
||||
dst->value = skb->protocol;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_pkttype)
|
||||
{
|
||||
dst->value = skb->pkt_type;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_pktlen)
|
||||
{
|
||||
dst->value = skb->len;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_datalen)
|
||||
{
|
||||
dst->value = skb->data_len;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_maclen)
|
||||
{
|
||||
dst->value = skb->mac_len;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_rxhash)
|
||||
{
|
||||
dst->value = skb_get_hash(skb);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Netfilter
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_mark)
|
||||
{
|
||||
dst->value = skb->mark;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Traffic Control
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_tcindex)
|
||||
{
|
||||
dst->value = skb->tc_index;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Routing
|
||||
**************************************************************************/
|
||||
|
||||
META_COLLECTOR(int_rtclassid)
|
||||
{
|
||||
if (unlikely(skb_dst(skb) == NULL))
|
||||
*err = -1;
|
||||
else
|
||||
#ifdef CONFIG_IP_ROUTE_CLASSID
|
||||
dst->value = skb_dst(skb)->tclassid;
|
||||
#else
|
||||
dst->value = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_rtiif)
|
||||
{
|
||||
if (unlikely(skb_rtable(skb) == NULL))
|
||||
*err = -1;
|
||||
else
|
||||
dst->value = inet_iif(skb);
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Socket Attributes
|
||||
**************************************************************************/
|
||||
|
||||
#define skip_nonlocal(skb) \
|
||||
(unlikely(skb->sk == NULL))
|
||||
|
||||
META_COLLECTOR(int_sk_family)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_family;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_state)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_state;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_reuse)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_reuse;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_bound_if)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
/* No error if bound_dev_if is 0, legal userspace check */
|
||||
dst->value = skb->sk->sk_bound_dev_if;
|
||||
}
|
||||
|
||||
META_COLLECTOR(var_sk_bound_if)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
if (skb->sk->sk_bound_dev_if == 0) {
|
||||
dst->value = (unsigned long) "any";
|
||||
dst->len = 3;
|
||||
} else {
|
||||
struct net_device *dev;
|
||||
|
||||
rcu_read_lock();
|
||||
dev = dev_get_by_index_rcu(sock_net(skb->sk),
|
||||
skb->sk->sk_bound_dev_if);
|
||||
*err = var_dev(dev, dst);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_refcnt)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = atomic_read(&skb->sk->sk_refcnt);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_rcvbuf)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_rcvbuf;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_shutdown)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_shutdown;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_proto)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_protocol;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_type)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_type;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_rmem_alloc)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = sk_rmem_alloc_get(skb->sk);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_wmem_alloc)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = sk_wmem_alloc_get(skb->sk);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_omem_alloc)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = atomic_read(&skb->sk->sk_omem_alloc);
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_rcv_qlen)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_receive_queue.qlen;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_snd_qlen)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_write_queue.qlen;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_wmem_queued)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_wmem_queued;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_fwd_alloc)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_forward_alloc;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_sndbuf)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_sndbuf;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_alloc)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = (__force int) skb->sk->sk_allocation;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_hash)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_hash;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_lingertime)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_lingertime / HZ;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_err_qlen)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_error_queue.qlen;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_ack_bl)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_ack_backlog;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_max_ack_bl)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_max_ack_backlog;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_prio)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_priority;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_rcvlowat)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_rcvlowat;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_rcvtimeo)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_rcvtimeo / HZ;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_sndtimeo)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_sndtimeo / HZ;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_sendmsg_off)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_frag.offset;
|
||||
}
|
||||
|
||||
META_COLLECTOR(int_sk_write_pend)
|
||||
{
|
||||
if (skip_nonlocal(skb)) {
|
||||
*err = -1;
|
||||
return;
|
||||
}
|
||||
dst->value = skb->sk->sk_write_pending;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Meta value collectors assignment table
|
||||
**************************************************************************/
|
||||
|
||||
struct meta_ops {
|
||||
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
|
||||
struct meta_value *, struct meta_obj *, int *);
|
||||
};
|
||||
|
||||
#define META_ID(name) TCF_META_ID_##name
|
||||
#define META_FUNC(name) { .get = meta_##name }
|
||||
|
||||
/* Meta value operations table listing all meta value collectors and
|
||||
* assigns them to a type and meta id. */
|
||||
static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
|
||||
[TCF_META_TYPE_VAR] = {
|
||||
[META_ID(DEV)] = META_FUNC(var_dev),
|
||||
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
|
||||
},
|
||||
[TCF_META_TYPE_INT] = {
|
||||
[META_ID(RANDOM)] = META_FUNC(int_random),
|
||||
[META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
|
||||
[META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
|
||||
[META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
|
||||
[META_ID(DEV)] = META_FUNC(int_dev),
|
||||
[META_ID(PRIORITY)] = META_FUNC(int_priority),
|
||||
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
|
||||
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
|
||||
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
|
||||
[META_ID(DATALEN)] = META_FUNC(int_datalen),
|
||||
[META_ID(MACLEN)] = META_FUNC(int_maclen),
|
||||
[META_ID(NFMARK)] = META_FUNC(int_mark),
|
||||
[META_ID(TCINDEX)] = META_FUNC(int_tcindex),
|
||||
[META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
|
||||
[META_ID(RTIIF)] = META_FUNC(int_rtiif),
|
||||
[META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
|
||||
[META_ID(SK_STATE)] = META_FUNC(int_sk_state),
|
||||
[META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
|
||||
[META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
|
||||
[META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
|
||||
[META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
|
||||
[META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
|
||||
[META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
|
||||
[META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
|
||||
[META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
|
||||
[META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
|
||||
[META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
|
||||
[META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
|
||||
[META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
|
||||
[META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
|
||||
[META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
|
||||
[META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
|
||||
[META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
|
||||
[META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
|
||||
[META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
|
||||
[META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
|
||||
[META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
|
||||
[META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
|
||||
[META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
|
||||
[META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
|
||||
[META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
|
||||
[META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
|
||||
[META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
|
||||
[META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
|
||||
[META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
|
||||
[META_ID(RXHASH)] = META_FUNC(int_rxhash),
|
||||
}
|
||||
};
|
||||
|
||||
static inline struct meta_ops *meta_ops(struct meta_value *val)
|
||||
{
|
||||
return &__meta_ops[meta_type(val)][meta_id(val)];
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Type specific operations for TCF_META_TYPE_VAR
|
||||
**************************************************************************/
|
||||
|
||||
static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
|
||||
{
|
||||
int r = a->len - b->len;
|
||||
|
||||
if (r == 0)
|
||||
r = memcmp((void *) a->value, (void *) b->value, a->len);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
|
||||
{
|
||||
int len = nla_len(nla);
|
||||
|
||||
dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL);
|
||||
if (dst->val == 0UL)
|
||||
return -ENOMEM;
|
||||
dst->len = len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void meta_var_destroy(struct meta_value *v)
|
||||
{
|
||||
kfree((void *) v->val);
|
||||
}
|
||||
|
||||
static void meta_var_apply_extras(struct meta_value *v,
|
||||
struct meta_obj *dst)
|
||||
{
|
||||
int shift = v->hdr.shift;
|
||||
|
||||
if (shift && shift < dst->len)
|
||||
dst->len -= shift;
|
||||
}
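/*
 * Worked example of the shift modifier above (illustrative values): for a
 * device named "ppp0" the collector sets len = 4; a shift of 1 trims the
 * compared length to 3, so only "ppp" takes part in the comparison and a
 * userspace rvalue of "ppp" matches ppp0..ppp9 alike, which is the
 * wildcard behaviour described in the header comment.
 */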
|
||||
|
||||
static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
|
||||
{
|
||||
if (v->val && v->len &&
|
||||
nla_put(skb, tlv, v->len, (void *) v->val))
|
||||
goto nla_put_failure;
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Type specific operations for TCF_META_TYPE_INT
|
||||
**************************************************************************/
|
||||
|
||||
static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
|
||||
{
|
||||
/* Let gcc optimize it, the unlikely is not really based on
|
||||
* some numbers but jump free code for mismatches seems
|
||||
* more logical. */
|
||||
if (unlikely(a->value == b->value))
|
||||
return 0;
|
||||
else if (a->value < b->value)
|
||||
return -1;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
|
||||
{
|
||||
if (nla_len(nla) >= sizeof(unsigned long)) {
|
||||
dst->val = *(unsigned long *) nla_data(nla);
|
||||
dst->len = sizeof(unsigned long);
|
||||
} else if (nla_len(nla) == sizeof(u32)) {
|
||||
dst->val = nla_get_u32(nla);
|
||||
dst->len = sizeof(u32);
|
||||
} else
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void meta_int_apply_extras(struct meta_value *v,
|
||||
struct meta_obj *dst)
|
||||
{
|
||||
if (v->hdr.shift)
|
||||
dst->value >>= v->hdr.shift;
|
||||
|
||||
if (v->val)
|
||||
dst->value &= v->val;
|
||||
}
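/*
 * Worked example of the integer modifiers above (illustrative values):
 * with hdr.shift = 8 and a configured mask value of 0xff, a collected
 * value of 0x1234 becomes (0x1234 >> 8) & 0xff = 0x12 before the
 * comparison, so a rule can test a single byte out of a wider field.
 */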
|
||||
|
||||
static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
|
||||
{
|
||||
if (v->len == sizeof(unsigned long)) {
|
||||
if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
|
||||
goto nla_put_failure;
|
||||
} else if (v->len == sizeof(u32)) {
|
||||
if (nla_put_u32(skb, tlv, v->val))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Type specific operations table
|
||||
**************************************************************************/
|
||||
|
||||
struct meta_type_ops {
|
||||
void (*destroy)(struct meta_value *);
|
||||
int (*compare)(struct meta_obj *, struct meta_obj *);
|
||||
int (*change)(struct meta_value *, struct nlattr *);
|
||||
void (*apply_extras)(struct meta_value *, struct meta_obj *);
|
||||
int (*dump)(struct sk_buff *, struct meta_value *, int);
|
||||
};
|
||||
|
||||
static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
|
||||
[TCF_META_TYPE_VAR] = {
|
||||
.destroy = meta_var_destroy,
|
||||
.compare = meta_var_compare,
|
||||
.change = meta_var_change,
|
||||
.apply_extras = meta_var_apply_extras,
|
||||
.dump = meta_var_dump
|
||||
},
|
||||
[TCF_META_TYPE_INT] = {
|
||||
.compare = meta_int_compare,
|
||||
.change = meta_int_change,
|
||||
.apply_extras = meta_int_apply_extras,
|
||||
.dump = meta_int_dump
|
||||
}
|
||||
};
|
||||
|
||||
static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
|
||||
{
|
||||
return &__meta_type_ops[meta_type(v)];
|
||||
}
|
||||
|
||||
/**************************************************************************
|
||||
* Core
|
||||
**************************************************************************/
|
||||
|
||||
static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
|
||||
struct meta_value *v, struct meta_obj *dst)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (meta_id(v) == TCF_META_ID_VALUE) {
|
||||
dst->value = v->val;
|
||||
dst->len = v->len;
|
||||
return 0;
|
||||
}
|
||||
|
||||
meta_ops(v)->get(skb, info, v, dst, &err);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (meta_type_ops(v)->apply_extras)
|
||||
meta_type_ops(v)->apply_extras(v, dst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
int r;
|
||||
struct meta_match *meta = (struct meta_match *) m->data;
|
||||
struct meta_obj l_value, r_value;
|
||||
|
||||
if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
|
||||
meta_get(skb, info, &meta->rvalue, &r_value) < 0)
|
||||
return 0;
|
||||
|
||||
r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
|
||||
|
||||
switch (meta->lvalue.hdr.op) {
|
||||
case TCF_EM_OPND_EQ:
|
||||
return !r;
|
||||
case TCF_EM_OPND_LT:
|
||||
return r < 0;
|
||||
case TCF_EM_OPND_GT:
|
||||
return r > 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void meta_delete(struct meta_match *meta)
|
||||
{
|
||||
if (meta) {
|
||||
struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
|
||||
|
||||
if (ops && ops->destroy) {
|
||||
ops->destroy(&meta->lvalue);
|
||||
ops->destroy(&meta->rvalue);
|
||||
}
|
||||
}
|
||||
|
||||
kfree(meta);
|
||||
}
|
||||
|
||||
static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
|
||||
{
|
||||
if (nla) {
|
||||
if (nla_len(nla) == 0)
|
||||
return -EINVAL;
|
||||
|
||||
return meta_type_ops(dst)->change(dst, nla);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int meta_is_supported(struct meta_value *val)
|
||||
{
|
||||
return !meta_id(val) || meta_ops(val)->get;
|
||||
}
|
||||
|
||||
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
|
||||
[TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) },
|
||||
};
|
||||
|
||||
static int em_meta_change(struct net *net, void *data, int len,
|
||||
struct tcf_ematch *m)
|
||||
{
|
||||
int err;
|
||||
struct nlattr *tb[TCA_EM_META_MAX + 1];
|
||||
struct tcf_meta_hdr *hdr;
|
||||
struct meta_match *meta = NULL;
|
||||
|
||||
err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[TCA_EM_META_HDR] == NULL)
|
||||
goto errout;
|
||||
hdr = nla_data(tb[TCA_EM_META_HDR]);
|
||||
|
||||
if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
|
||||
TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
|
||||
TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
|
||||
TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
|
||||
goto errout;
|
||||
|
||||
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
|
||||
if (meta == NULL) {
|
||||
err = -ENOMEM;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
|
||||
memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
|
||||
|
||||
if (!meta_is_supported(&meta->lvalue) ||
|
||||
!meta_is_supported(&meta->rvalue)) {
|
||||
err = -EOPNOTSUPP;
|
||||
goto errout;
|
||||
}
|
||||
|
||||
if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 ||
|
||||
meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0)
|
||||
goto errout;
|
||||
|
||||
m->datalen = sizeof(*meta);
|
||||
m->data = (unsigned long) meta;
|
||||
|
||||
err = 0;
|
||||
errout:
|
||||
if (err && meta)
|
||||
meta_delete(meta);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void em_meta_destroy(struct tcf_ematch *m)
|
||||
{
|
||||
if (m)
|
||||
meta_delete((struct meta_match *) m->data);
|
||||
}
|
||||
|
||||
static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
|
||||
{
|
||||
struct meta_match *meta = (struct meta_match *) em->data;
|
||||
struct tcf_meta_hdr hdr;
|
||||
struct meta_type_ops *ops;
|
||||
|
||||
memset(&hdr, 0, sizeof(hdr));
|
||||
memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
|
||||
memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
|
||||
|
||||
if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
|
||||
goto nla_put_failure;
|
||||
|
||||
ops = meta_type_ops(&meta->lvalue);
|
||||
if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
|
||||
ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
|
||||
goto nla_put_failure;
|
||||
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_ematch_ops em_meta_ops = {
|
||||
.kind = TCF_EM_META,
|
||||
.change = em_meta_change,
|
||||
.match = em_meta_match,
|
||||
.destroy = em_meta_destroy,
|
||||
.dump = em_meta_dump,
|
||||
.owner = THIS_MODULE,
|
||||
.link = LIST_HEAD_INIT(em_meta_ops.link)
|
||||
};
|
||||
|
||||
static int __init init_em_meta(void)
|
||||
{
|
||||
return tcf_em_register(&em_meta_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_em_meta(void)
|
||||
{
|
||||
tcf_em_unregister(&em_meta_ops);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_em_meta);
|
||||
module_exit(exit_em_meta);
|
||||
|
||||
MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
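The operand handling in em_meta_match() above reduces to mapping a three-way compare result onto the relation stored in the left value's header (EQ, LT or GT). A small stand-alone sketch of that mapping, illustrative only and not taken from this tree (the enum merely mirrors the TCF_EM_OPND_* values):

#include <stdio.h>

enum opnd { OPND_EQ, OPND_LT, OPND_GT };

/* Map a memcmp()-style three-way compare result onto a match decision,
 * the same way em_meta_match() interprets meta->lvalue.hdr.op. */
static int opnd_match(int cmp, enum opnd op)
{
	switch (op) {
	case OPND_EQ:
		return cmp == 0;
	case OPND_LT:
		return cmp < 0;
	case OPND_GT:
		return cmp > 0;
	}
	return 0;
}

int main(void)
{
	/* e.g. "meta(priority lt 3)": lvalue 2, rvalue 3 -> cmp < 0 -> match */
	printf("%d\n", opnd_match(2 - 3, OPND_LT));	/* prints 1 */
	return 0;
}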
80
net/sched/em_nbyte.c
Normal file
@ -0,0 +1,80 @@
/*
|
||||
* net/sched/em_nbyte.c N-Byte ematch
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/tc_ematch/tc_em_nbyte.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
struct nbyte_data {
|
||||
struct tcf_em_nbyte hdr;
|
||||
char pattern[0];
|
||||
};
|
||||
|
||||
static int em_nbyte_change(struct net *net, void *data, int data_len,
|
||||
struct tcf_ematch *em)
|
||||
{
|
||||
struct tcf_em_nbyte *nbyte = data;
|
||||
|
||||
if (data_len < sizeof(*nbyte) ||
|
||||
data_len < (sizeof(*nbyte) + nbyte->len))
|
||||
return -EINVAL;
|
||||
|
||||
em->datalen = sizeof(*nbyte) + nbyte->len;
|
||||
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
|
||||
if (em->data == 0UL)
|
||||
return -ENOBUFS;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
|
||||
unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
|
||||
|
||||
ptr += nbyte->hdr.off;
|
||||
|
||||
if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
|
||||
return 0;
|
||||
|
||||
return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len);
|
||||
}
|
||||
|
||||
static struct tcf_ematch_ops em_nbyte_ops = {
|
||||
.kind = TCF_EM_NBYTE,
|
||||
.change = em_nbyte_change,
|
||||
.match = em_nbyte_match,
|
||||
.owner = THIS_MODULE,
|
||||
.link = LIST_HEAD_INIT(em_nbyte_ops.link)
|
||||
};
|
||||
|
||||
static int __init init_em_nbyte(void)
|
||||
{
|
||||
return tcf_em_register(&em_nbyte_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_em_nbyte(void)
|
||||
{
|
||||
tcf_em_unregister(&em_nbyte_ops);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_em_nbyte);
|
||||
module_exit(exit_em_nbyte);
|
||||
|
||||
MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
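em_nbyte_match() above boils down to comparing a stored byte pattern against the packet at a configured offset within the chosen layer. A minimal stand-alone sketch of that check, illustrative only (the sample header bytes are made up):

#include <stdio.h>
#include <string.h>

/* Same idea as em_nbyte_match(): does the pattern appear verbatim at
 * the given offset of the (already bounds-checked) header data? */
static int nbyte_match(const unsigned char *hdr, size_t hdr_len,
		       size_t off, const unsigned char *pat, size_t pat_len)
{
	if (off + pat_len > hdr_len)	/* mirrors the tcf_valid_offset() check */
		return 0;
	return !memcmp(hdr + off, pat, pat_len);
}

int main(void)
{
	/* Made-up IPv4-style header bytes; look for 0x06 (TCP) at offset 9. */
	unsigned char hdr[20] = { 0x45, 0 };
	unsigned char proto = 0x06;

	hdr[9] = 0x06;
	printf("%d\n", nbyte_match(hdr, sizeof(hdr), 9, &proto, 1));	/* prints 1 */
	return 0;
}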
158
net/sched/em_text.c
Normal file
@ -0,0 +1,158 @@
/*
|
||||
* net/sched/em_text.c Textsearch ematch
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/textsearch.h>
|
||||
#include <linux/tc_ematch/tc_em_text.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
struct text_match {
|
||||
u16 from_offset;
|
||||
u16 to_offset;
|
||||
u8 from_layer;
|
||||
u8 to_layer;
|
||||
struct ts_config *config;
|
||||
};
|
||||
|
||||
#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
|
||||
|
||||
static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
struct text_match *tm = EM_TEXT_PRIV(m);
|
||||
int from, to;
|
||||
struct ts_state state;
|
||||
|
||||
from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
|
||||
from += tm->from_offset;
|
||||
|
||||
to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
|
||||
to += tm->to_offset;
|
||||
|
||||
return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
|
||||
}
|
||||
|
||||
static int em_text_change(struct net *net, void *data, int len,
|
||||
struct tcf_ematch *m)
|
||||
{
|
||||
struct text_match *tm;
|
||||
struct tcf_em_text *conf = data;
|
||||
struct ts_config *ts_conf;
|
||||
int flags = 0;
|
||||
|
||||
if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
|
||||
return -EINVAL;
|
||||
|
||||
if (conf->from_layer > conf->to_layer)
|
||||
return -EINVAL;
|
||||
|
||||
if (conf->from_layer == conf->to_layer &&
|
||||
conf->from_offset > conf->to_offset)
|
||||
return -EINVAL;
|
||||
|
||||
retry:
|
||||
ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
|
||||
conf->pattern_len, GFP_KERNEL, flags);
|
||||
|
||||
if (flags & TS_AUTOLOAD)
|
||||
rtnl_lock();
|
||||
|
||||
if (IS_ERR(ts_conf)) {
|
||||
if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
|
||||
rtnl_unlock();
|
||||
flags |= TS_AUTOLOAD;
|
||||
goto retry;
|
||||
} else
|
||||
return PTR_ERR(ts_conf);
|
||||
} else if (flags & TS_AUTOLOAD) {
|
||||
textsearch_destroy(ts_conf);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
tm = kmalloc(sizeof(*tm), GFP_KERNEL);
|
||||
if (tm == NULL) {
|
||||
textsearch_destroy(ts_conf);
|
||||
return -ENOBUFS;
|
||||
}
|
||||
|
||||
tm->from_offset = conf->from_offset;
|
||||
tm->to_offset = conf->to_offset;
|
||||
tm->from_layer = conf->from_layer;
|
||||
tm->to_layer = conf->to_layer;
|
||||
tm->config = ts_conf;
|
||||
|
||||
m->datalen = sizeof(*tm);
|
||||
m->data = (unsigned long) tm;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void em_text_destroy(struct tcf_ematch *m)
|
||||
{
|
||||
if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
|
||||
textsearch_destroy(EM_TEXT_PRIV(m)->config);
|
||||
}
|
||||
|
||||
static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
|
||||
{
|
||||
struct text_match *tm = EM_TEXT_PRIV(m);
|
||||
struct tcf_em_text conf;
|
||||
|
||||
strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
|
||||
conf.from_offset = tm->from_offset;
|
||||
conf.to_offset = tm->to_offset;
|
||||
conf.from_layer = tm->from_layer;
|
||||
conf.to_layer = tm->to_layer;
|
||||
conf.pattern_len = textsearch_get_pattern_len(tm->config);
|
||||
conf.pad = 0;
|
||||
|
||||
if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
|
||||
goto nla_put_failure;
|
||||
if (nla_append(skb, conf.pattern_len,
|
||||
textsearch_get_pattern(tm->config)) < 0)
|
||||
goto nla_put_failure;
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct tcf_ematch_ops em_text_ops = {
|
||||
.kind = TCF_EM_TEXT,
|
||||
.change = em_text_change,
|
||||
.match = em_text_match,
|
||||
.destroy = em_text_destroy,
|
||||
.dump = em_text_dump,
|
||||
.owner = THIS_MODULE,
|
||||
.link = LIST_HEAD_INIT(em_text_ops.link)
|
||||
};
|
||||
|
||||
static int __init init_em_text(void)
|
||||
{
|
||||
return tcf_em_register(&em_text_ops);
|
||||
}
|
||||
|
||||
static void __exit exit_em_text(void)
|
||||
{
|
||||
tcf_em_unregister(&em_text_ops);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
|
||||
module_init(init_em_text);
|
||||
module_exit(exit_em_text);
|
||||
|
||||
MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
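em_text_match() above delegates to the kernel textsearch API, but the effect is simply to ask whether the configured pattern occurs anywhere between the from and to offsets of the packet. A minimal user-space sketch of that windowed search, illustrative only (a naive scan stands in for the textsearch infrastructure):

#include <stdio.h>
#include <string.h>

/* Return 1 if pat occurs within data[from..to), mirroring the way
 * em_text_match() bounds skb_find_text() with from/to offsets. */
static int window_search(const unsigned char *data, size_t from, size_t to,
			 const unsigned char *pat, size_t pat_len)
{
	size_t i;

	if (pat_len == 0 || to < from + pat_len)
		return 0;
	for (i = from; i + pat_len <= to; i++)
		if (!memcmp(data + i, pat, pat_len))
			return 1;
	return 0;
}

int main(void)
{
	const unsigned char payload[] = "GET /index.html HTTP/1.1";

	/* Search the whole (made-up) payload for "HTTP". */
	printf("%d\n", window_search(payload, 0, sizeof(payload) - 1,
				     (const unsigned char *)"HTTP", 4));	/* prints 1 */
	return 0;
}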
64
net/sched/em_u32.c
Normal file
@ -0,0 +1,64 @@
/*
 * net/sched/em_u32.c	U32 Ematch
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Based on net/sched/cls_u32.c
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
			struct tcf_pkt_info *info)
{
	struct tc_u32_key *key = (struct tc_u32_key *) em->data;
	const unsigned char *ptr = skb_network_header(skb);

	if (info) {
		if (info->ptr)
			ptr = info->ptr;
		ptr += (info->nexthdr & key->offmask);
	}

	ptr += key->off;

	if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
		return 0;

	return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
}

static struct tcf_ematch_ops em_u32_ops = {
	.kind	  = TCF_EM_U32,
	.datalen  = sizeof(struct tc_u32_key),
	.match	  = em_u32_match,
	.owner	  = THIS_MODULE,
	.link	  = LIST_HEAD_INIT(em_u32_ops.link)
};

static int __init init_em_u32(void)
{
	return tcf_em_register(&em_u32_ops);
}

static void __exit exit_em_u32(void)
{
	tcf_em_unregister(&em_u32_ops);
}

MODULE_LICENSE("GPL");

module_init(init_em_u32);
module_exit(exit_em_u32);

MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
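The heart of em_u32_match() above is a single masked 32-bit compare against the value/mask pair from struct tc_u32_key. A small stand-alone sketch of the same test, illustrative only (the sample header bytes are made up; value and mask are built from raw bytes so the example is byte-order agnostic):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same test as em_u32_match(): the packet matches when the selected
 * 32-bit word, after masking, equals the configured value. */
static int u32_key_match(const unsigned char *pkt, int off,
			 uint32_t val, uint32_t mask)
{
	uint32_t word;

	memcpy(&word, pkt + off, sizeof(word));	/* avoid unaligned reads */
	return !((word ^ val) & mask);
}

int main(void)
{
	/* Made-up 8-byte "header"; match the word at offset 4 with its final
	 * byte masked out, so pkt[7] does not matter. */
	unsigned char pkt[8] = { 0, 0, 0, 0, 0x45, 0x00, 0x00, 0x37 };
	uint32_t want, mask;

	memcpy(&want, (unsigned char[]){ 0x45, 0x00, 0x00, 0x00 }, 4);
	memcpy(&mask, (unsigned char[]){ 0xff, 0xff, 0xff, 0x00 }, 4);
	printf("%d\n", u32_key_match(pkt, 4, want, mask));	/* prints 1 */
	return 0;
}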
549
net/sched/ematch.c
Normal file
@ -0,0 +1,549 @@
/*
|
||||
* net/sched/ematch.c Extended Match API
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Thomas Graf <tgraf@suug.ch>
|
||||
*
|
||||
* ==========================================================================
|
||||
*
|
||||
* An extended match (ematch) is a small classification tool not worth
|
||||
* writing a full classifier for. Ematches can be interconnected to form
|
||||
* a logic expression and get attached to classifiers to extend their
|
||||
* functionality.
|
||||
*
|
||||
* The userspace part transforms the logic expressions into an array
|
||||
* consisting of multiple sequences of interconnected ematches separated
|
||||
* by markers. Precedence is implemented by a special ematch kind
|
||||
* referencing a sequence beyond the marker of the current sequence
|
||||
* causing the current position in the sequence to be pushed onto a stack
|
||||
* to allow the current position to be overwritten by the position referenced
|
||||
* in the special ematch. Matching continues in the new sequence until a
|
||||
* marker is reached causing the position to be restored from the stack.
|
||||
*
|
||||
* Example:
|
||||
* A AND (B1 OR B2) AND C AND D
|
||||
*
|
||||
* ------->-PUSH-------
|
||||
* -->-- / -->-- \ -->--
|
||||
* / \ / / \ \ / \
|
||||
* +-------+-------+-------+-------+-------+--------+
|
||||
* | A AND | B AND | C AND | D END | B1 OR | B2 END |
|
||||
* +-------+-------+-------+-------+-------+--------+
|
||||
* \ /
|
||||
* --------<-POP---------
|
||||
*
|
||||
* where B is a virtual ematch referencing the sequence starting with B1.
|
||||
*
|
||||
* ==========================================================================
|
||||
*
|
||||
* How to write an ematch in 60 seconds
|
||||
* ------------------------------------
|
||||
*
|
||||
* 1) Provide a matcher function:
|
||||
* static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
|
||||
* struct tcf_pkt_info *info)
|
||||
* {
|
||||
* struct mydata *d = (struct mydata *) m->data;
|
||||
*
|
||||
* if (...matching goes here...)
|
||||
* return 1;
|
||||
* else
|
||||
* return 0;
|
||||
* }
|
||||
*
|
||||
* 2) Fill out a struct tcf_ematch_ops:
|
||||
* static struct tcf_ematch_ops my_ops = {
|
||||
* .kind = unique id,
|
||||
* .datalen = sizeof(struct mydata),
|
||||
* .match = my_match,
|
||||
* .owner = THIS_MODULE,
|
||||
* };
|
||||
*
|
||||
* 3) Register/Unregister your ematch:
|
||||
* static int __init init_my_ematch(void)
|
||||
* {
|
||||
* return tcf_em_register(&my_ops);
|
||||
* }
|
||||
*
|
||||
* static void __exit exit_my_ematch(void)
|
||||
* {
|
||||
* tcf_em_unregister(&my_ops);
|
||||
* }
|
||||
*
|
||||
* module_init(init_my_ematch);
|
||||
* module_exit(exit_my_ematch);
|
||||
*
|
||||
* 4) By now you should have two more seconds left, barely enough to
|
||||
* open up a beer to watch the compilation going.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_cls.h>
|
||||
|
||||
static LIST_HEAD(ematch_ops);
|
||||
static DEFINE_RWLOCK(ematch_mod_lock);
|
||||
|
||||
static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
|
||||
{
|
||||
struct tcf_ematch_ops *e = NULL;
|
||||
|
||||
read_lock(&ematch_mod_lock);
|
||||
list_for_each_entry(e, &ematch_ops, link) {
|
||||
if (kind == e->kind) {
|
||||
if (!try_module_get(e->owner))
|
||||
e = NULL;
|
||||
read_unlock(&ematch_mod_lock);
|
||||
return e;
|
||||
}
|
||||
}
|
||||
read_unlock(&ematch_mod_lock);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* tcf_em_register - register an extended match
|
||||
*
|
||||
* @ops: ematch operations lookup table
|
||||
*
|
||||
* This function must be called by ematches to announce their presence.
|
||||
* The given @ops must have kind set to a unique identifier and the
|
||||
* callback match() must be implemented. All other callbacks are optional
|
||||
* and a fallback implementation is used instead.
|
||||
*
|
||||
* Returns -EEXIST if an ematch of the same kind has already been registered.
|
||||
*/
|
||||
int tcf_em_register(struct tcf_ematch_ops *ops)
|
||||
{
|
||||
int err = -EEXIST;
|
||||
struct tcf_ematch_ops *e;
|
||||
|
||||
if (ops->match == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
write_lock(&ematch_mod_lock);
|
||||
list_for_each_entry(e, &ematch_ops, link)
|
||||
if (ops->kind == e->kind)
|
||||
goto errout;
|
||||
|
||||
list_add_tail(&ops->link, &ematch_ops);
|
||||
err = 0;
|
||||
errout:
|
||||
write_unlock(&ematch_mod_lock);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_em_register);
|
||||
|
||||
/**
|
||||
* tcf_em_unregister - unregister an extended match
|
||||
*
|
||||
* @ops: ematch operations lookup table
|
||||
*
|
||||
* This function must be called by ematches to announce their disappearance
|
||||
* for example when the module gets unloaded. The @ops parameter must be
|
||||
* the same as the one used for registration.
|
||||
|
||||
*/
|
||||
void tcf_em_unregister(struct tcf_ematch_ops *ops)
|
||||
{
|
||||
write_lock(&ematch_mod_lock);
|
||||
list_del(&ops->link);
|
||||
write_unlock(&ematch_mod_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_em_unregister);
|
||||
|
||||
static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
|
||||
int index)
|
||||
{
|
||||
return &tree->matches[index];
|
||||
}
|
||||
|
||||
|
||||
static int tcf_em_validate(struct tcf_proto *tp,
|
||||
struct tcf_ematch_tree_hdr *tree_hdr,
|
||||
struct tcf_ematch *em, struct nlattr *nla, int idx)
|
||||
{
|
||||
int err = -EINVAL;
|
||||
struct tcf_ematch_hdr *em_hdr = nla_data(nla);
|
||||
int data_len = nla_len(nla) - sizeof(*em_hdr);
|
||||
void *data = (void *) em_hdr + sizeof(*em_hdr);
|
||||
struct net *net = dev_net(qdisc_dev(tp->q));
|
||||
|
||||
if (!TCF_EM_REL_VALID(em_hdr->flags))
|
||||
goto errout;
|
||||
|
||||
if (em_hdr->kind == TCF_EM_CONTAINER) {
|
||||
/* Special ematch called "container", carries an index
|
||||
* referencing an external ematch sequence.
|
||||
*/
|
||||
u32 ref;
|
||||
|
||||
if (data_len < sizeof(ref))
|
||||
goto errout;
|
||||
ref = *(u32 *) data;
|
||||
|
||||
if (ref >= tree_hdr->nmatches)
|
||||
goto errout;
|
||||
|
||||
/* We do not allow backward jumps to avoid loops and jumps
|
||||
* to our own position are of course illegal.
|
||||
*/
|
||||
if (ref <= idx)
|
||||
goto errout;
|
||||
|
||||
|
||||
em->data = ref;
|
||||
} else {
|
||||
/* Note: This lookup will increase the module refcnt
|
||||
* of the ematch module referenced. In case of a failure,
|
||||
* a destroy function is called by the underlying layer
|
||||
* which automatically releases the reference again, therefore
|
||||
* the module MUST not be given back under any circumstances
|
||||
* here. Be aware, the destroy function assumes that the
|
||||
* module is held if the ops field is non zero.
|
||||
*/
|
||||
em->ops = tcf_em_lookup(em_hdr->kind);
|
||||
|
||||
if (em->ops == NULL) {
|
||||
err = -ENOENT;
|
||||
#ifdef CONFIG_MODULES
|
||||
__rtnl_unlock();
|
||||
request_module("ematch-kind-%u", em_hdr->kind);
|
||||
rtnl_lock();
|
||||
em->ops = tcf_em_lookup(em_hdr->kind);
|
||||
if (em->ops) {
|
||||
/* We dropped the RTNL mutex in order to
|
||||
* perform the module load. Tell the caller
|
||||
* to replay the request.
|
||||
*/
|
||||
module_put(em->ops->owner);
|
||||
em->ops = NULL;
|
||||
err = -EAGAIN;
|
||||
}
|
||||
#endif
|
||||
goto errout;
|
||||
}
|
||||
|
||||
/* ematch module provides expected length of data, so we
|
||||
* can do a basic sanity check.
|
||||
*/
|
||||
if (em->ops->datalen && data_len < em->ops->datalen)
|
||||
goto errout;
|
||||
|
||||
if (em->ops->change) {
|
||||
err = em->ops->change(net, data, data_len, em);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
} else if (data_len > 0) {
|
||||
/* ematch module doesn't provide its own change
|
||||
* procedure and expects us to allocate and copy
|
||||
* the ematch data.
|
||||
*
|
||||
* TCF_EM_SIMPLE may be specified stating that the
|
||||
* data only consists of a u32 integer and the module
|
||||
* does not expect a memory reference but rather
|
||||
* the value carried.
|
||||
*/
|
||||
if (em_hdr->flags & TCF_EM_SIMPLE) {
|
||||
if (data_len < sizeof(u32))
|
||||
goto errout;
|
||||
em->data = *(u32 *) data;
|
||||
} else {
|
||||
void *v = kmemdup(data, data_len, GFP_KERNEL);
|
||||
if (v == NULL) {
|
||||
err = -ENOBUFS;
|
||||
goto errout;
|
||||
}
|
||||
em->data = (unsigned long) v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
em->matchid = em_hdr->matchid;
|
||||
em->flags = em_hdr->flags;
|
||||
em->datalen = data_len;
|
||||
em->net = net;
|
||||
|
||||
err = 0;
|
||||
errout:
|
||||
return err;
|
||||
}
|
||||
|
||||
static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
|
||||
[TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) },
|
||||
[TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED },
|
||||
};
|
||||
|
||||
/**
|
||||
* tcf_em_tree_validate - validate ematch config TLV and build ematch tree
|
||||
*
|
||||
* @tp: classifier kind handle
|
||||
* @nla: ematch tree configuration TLV
|
||||
* @tree: destination ematch tree variable to store the resulting
|
||||
* ematch tree.
|
||||
*
|
||||
* This function validates the given configuration TLV @nla and builds an
|
||||
* ematch tree in @tree. The resulting tree must later be copied into
|
||||
* the private classifier data using tcf_em_tree_change(). You MUST NOT
|
||||
* provide the ematch tree variable of the private classifier data directly,
|
||||
* as the changes would not be locked properly.
|
||||
*
|
||||
* Returns a negative error code if the configuration TLV contains errors.
|
||||
*/
|
||||
int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
|
||||
struct tcf_ematch_tree *tree)
|
||||
{
|
||||
int idx, list_len, matches_len, err;
|
||||
struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
|
||||
struct nlattr *rt_match, *rt_hdr, *rt_list;
|
||||
struct tcf_ematch_tree_hdr *tree_hdr;
|
||||
struct tcf_ematch *em;
|
||||
|
||||
memset(tree, 0, sizeof(*tree));
|
||||
if (!nla)
|
||||
return 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy);
|
||||
if (err < 0)
|
||||
goto errout;
|
||||
|
||||
err = -EINVAL;
|
||||
rt_hdr = tb[TCA_EMATCH_TREE_HDR];
|
||||
rt_list = tb[TCA_EMATCH_TREE_LIST];
|
||||
|
||||
if (rt_hdr == NULL || rt_list == NULL)
|
||||
goto errout;
|
||||
|
||||
tree_hdr = nla_data(rt_hdr);
|
||||
memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
|
||||
|
||||
rt_match = nla_data(rt_list);
|
||||
list_len = nla_len(rt_list);
|
||||
matches_len = tree_hdr->nmatches * sizeof(*em);
|
||||
|
||||
tree->matches = kzalloc(matches_len, GFP_KERNEL);
|
||||
if (tree->matches == NULL)
|
||||
goto errout;
|
||||
|
||||
/* We do not use nla_parse_nested here because the maximum
|
||||
* number of attributes is unknown. This saves us the allocation
|
||||
* for a tb buffer which would serve no purpose at all.
|
||||
*
|
||||
* The array of rt attributes is parsed in the order as they are
|
||||
* provided, their type must be incremental from 1 to n. Even
|
||||
* if it does not serve any real purpose, a failure to stick
|
||||
* to this policy will result in parsing failure.
|
||||
*/
|
||||
for (idx = 0; nla_ok(rt_match, list_len); idx++) {
|
||||
err = -EINVAL;
|
||||
|
||||
if (rt_match->nla_type != (idx + 1))
|
||||
goto errout_abort;
|
||||
|
||||
if (idx >= tree_hdr->nmatches)
|
||||
goto errout_abort;
|
||||
|
||||
if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr))
|
||||
goto errout_abort;
|
||||
|
||||
em = tcf_em_get_match(tree, idx);
|
||||
|
||||
err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
|
||||
if (err < 0)
|
||||
goto errout_abort;
|
||||
|
||||
rt_match = nla_next(rt_match, &list_len);
|
||||
}
|
||||
|
||||
/* Check if the number of matches provided by userspace actually
|
||||
* complies with the array of matches. The number was used for
|
||||
* the validation of references and a mismatch could lead to
|
||||
* undefined references during the matching process.
|
||||
*/
|
||||
if (idx != tree_hdr->nmatches) {
|
||||
err = -EINVAL;
|
||||
goto errout_abort;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
errout:
|
||||
return err;
|
||||
|
||||
errout_abort:
|
||||
tcf_em_tree_destroy(tree);
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_em_tree_validate);
|
||||
|
||||
/**
|
||||
* tcf_em_tree_destroy - destroy an ematch tree
|
||||
*
|
||||
|
||||
* @tree: ematch tree to be deleted
|
||||
*
|
||||
* This function destroys an ematch tree previously created by
|
||||
* tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
|
||||
* the ematch tree is not in use before calling this function.
|
||||
*/
|
||||
void tcf_em_tree_destroy(struct tcf_ematch_tree *tree)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (tree->matches == NULL)
|
||||
return;
|
||||
|
||||
for (i = 0; i < tree->hdr.nmatches; i++) {
|
||||
struct tcf_ematch *em = tcf_em_get_match(tree, i);
|
||||
|
||||
if (em->ops) {
|
||||
if (em->ops->destroy)
|
||||
em->ops->destroy(em);
|
||||
else if (!tcf_em_is_simple(em))
|
||||
kfree((void *) em->data);
|
||||
module_put(em->ops->owner);
|
||||
}
|
||||
}
|
||||
|
||||
tree->hdr.nmatches = 0;
|
||||
kfree(tree->matches);
|
||||
tree->matches = NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_em_tree_destroy);
|
||||
|
||||
/**
|
||||
* tcf_em_tree_dump - dump an ematch tree into an rtnl message
|
||||
*
|
||||
* @skb: skb holding the rtnl message
|
||||
* @tree: ematch tree to be dumped
|
||||
* @tlv: TLV type to be used to encapsulate the tree
|
||||
*
|
||||
* This function dumps an ematch tree into an rtnl message. It is valid to
|
||||
* call this function while the ematch tree is in use.
|
||||
*
|
||||
* Returns -1 if the skb tailroom is insufficient.
|
||||
*/
|
||||
int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
|
||||
{
|
||||
int i;
|
||||
u8 *tail;
|
||||
struct nlattr *top_start;
|
||||
struct nlattr *list_start;
|
||||
|
||||
top_start = nla_nest_start(skb, tlv);
|
||||
if (top_start == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
|
||||
goto nla_put_failure;
|
||||
|
||||
list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
|
||||
if (list_start == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
tail = skb_tail_pointer(skb);
|
||||
for (i = 0; i < tree->hdr.nmatches; i++) {
|
||||
struct nlattr *match_start = (struct nlattr *)tail;
|
||||
struct tcf_ematch *em = tcf_em_get_match(tree, i);
|
||||
struct tcf_ematch_hdr em_hdr = {
|
||||
.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
|
||||
.matchid = em->matchid,
|
||||
.flags = em->flags
|
||||
};
|
||||
|
||||
if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
|
||||
goto nla_put_failure;
|
||||
|
||||
if (em->ops && em->ops->dump) {
|
||||
if (em->ops->dump(skb, em) < 0)
|
||||
goto nla_put_failure;
|
||||
} else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
|
||||
u32 u = em->data;
|
||||
nla_put_nohdr(skb, sizeof(u), &u);
|
||||
} else if (em->datalen > 0)
|
||||
nla_put_nohdr(skb, em->datalen, (void *) em->data);
|
||||
|
||||
tail = skb_tail_pointer(skb);
|
||||
match_start->nla_len = tail - (u8 *)match_start;
|
||||
}
|
||||
|
||||
nla_nest_end(skb, list_start);
|
||||
nla_nest_end(skb, top_start);
|
||||
|
||||
return 0;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
EXPORT_SYMBOL(tcf_em_tree_dump);
|
||||
|
||||
static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
int r = em->ops->match(skb, em, info);
|
||||
|
||||
return tcf_em_is_inverted(em) ? !r : r;
|
||||
}
|
||||
|
||||
/* Do not use this function directly, use tcf_em_tree_match instead */
|
||||
int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
|
||||
struct tcf_pkt_info *info)
|
||||
{
|
||||
int stackp = 0, match_idx = 0, res = 0;
|
||||
struct tcf_ematch *cur_match;
|
||||
int stack[CONFIG_NET_EMATCH_STACK];
|
||||
|
||||
proceed:
|
||||
while (match_idx < tree->hdr.nmatches) {
|
||||
cur_match = tcf_em_get_match(tree, match_idx);
|
||||
|
||||
if (tcf_em_is_container(cur_match)) {
|
||||
if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
|
||||
goto stack_overflow;
|
||||
|
||||
stack[stackp++] = match_idx;
|
||||
match_idx = cur_match->data;
|
||||
goto proceed;
|
||||
}
|
||||
|
||||
res = tcf_em_match(skb, cur_match, info);
|
||||
|
||||
if (tcf_em_early_end(cur_match, res))
|
||||
break;
|
||||
|
||||
match_idx++;
|
||||
}
|
||||
|
||||
pop_stack:
|
||||
if (stackp > 0) {
|
||||
match_idx = stack[--stackp];
|
||||
cur_match = tcf_em_get_match(tree, match_idx);
|
||||
|
||||
if (tcf_em_is_inverted(cur_match))
|
||||
res = !res;
|
||||
|
||||
if (tcf_em_early_end(cur_match, res)) {
|
||||
goto pop_stack;
|
||||
} else {
|
||||
match_idx++;
|
||||
goto proceed;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
|
||||
stack_overflow:
|
||||
net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
|
||||
return -1;
|
||||
}
|
||||
EXPORT_SYMBOL(__tcf_em_tree_match);
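The "How to write an ematch in 60 seconds" comment at the top of this file spreads the steps across several fragments; stitched together they amount to a module skeleton along the following lines. This is a sketch only: the kind value, the mydata layout and the magic constant are hypothetical and not code from this tree.

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>

/* Hypothetical kind value and private data, for illustration only. */
#define TCF_EM_MYMATCH	42

struct mydata {
	u32 magic;
};

static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
		    struct tcf_pkt_info *info)
{
	struct mydata *d = (struct mydata *) m->data;

	/* ...real matching logic would inspect skb here... */
	return d->magic == 0xdecafbad;
}

static struct tcf_ematch_ops my_ops = {
	.kind	 = TCF_EM_MYMATCH,
	.datalen = sizeof(struct mydata),
	.match	 = my_match,
	.owner	 = THIS_MODULE,
	.link	 = LIST_HEAD_INIT(my_ops.link)
};

static int __init init_my_ematch(void)
{
	return tcf_em_register(&my_ops);
}

static void __exit exit_my_ematch(void)
{
	tcf_em_unregister(&my_ops);
}

module_init(init_my_ematch);
module_exit(exit_my_ematch);
MODULE_LICENSE("GPL");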
1993
net/sched/sch_api.c
Normal file
File diff suppressed because it is too large
694
net/sched/sch_atm.c
Normal file
@ -0,0 +1,694 @@
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
|
||||
|
||||
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/atmdev.h>
|
||||
#include <linux/atmclip.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/file.h> /* for fput */
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
/*
|
||||
* The ATM queuing discipline provides a framework for invoking classifiers
|
||||
* (aka "filters"), which in turn select classes of this queuing discipline.
|
||||
* Each class maps the flow(s) it is handling to a given VC. Multiple classes
|
||||
* may share the same VC.
|
||||
*
|
||||
* When creating a class, VCs are specified by passing the number of the open
|
||||
* socket descriptor by which the calling process references the VC. The kernel
|
||||
* keeps the VC open at least until all classes using it are removed.
|
||||
*
|
||||
* In this file, most functions are named atm_tc_* to avoid confusion with all
|
||||
* the atm_* in net/atm. This naming convention differs from what's used in the
|
||||
* rest of net/sched.
|
||||
*
|
||||
* Known bugs:
|
||||
* - sometimes messes up the IP stack
|
||||
* - any manipulations besides the few operations described in the README, are
|
||||
* untested and likely to crash the system
|
||||
* - should lock the flow while there is data in the queue (?)
|
||||
*/
|
||||
|
||||
#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
|
||||
|
||||
struct atm_flow_data {
|
||||
struct Qdisc *q; /* FIFO, TBF, etc. */
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
|
||||
void (*old_pop)(struct atm_vcc *vcc,
|
||||
struct sk_buff *skb); /* chaining */
|
||||
struct atm_qdisc_data *parent; /* parent qdisc */
|
||||
struct socket *sock; /* for closing */
|
||||
u32 classid; /* x:y type ID */
|
||||
int ref; /* reference count */
|
||||
struct gnet_stats_basic_packed bstats;
|
||||
struct gnet_stats_queue qstats;
|
||||
struct list_head list;
|
||||
struct atm_flow_data *excess; /* flow for excess traffic;
|
||||
NULL to set CLP instead */
|
||||
int hdr_len;
|
||||
unsigned char hdr[0]; /* header data; MUST BE LAST */
|
||||
};
|
||||
|
||||
struct atm_qdisc_data {
|
||||
struct atm_flow_data link; /* unclassified skbs go here */
|
||||
struct list_head flows; /* NB: "link" is also on this
|
||||
list */
|
||||
struct tasklet_struct task; /* dequeue tasklet */
|
||||
};
|
||||
|
||||
/* ------------------------- Class/flow operations ------------------------- */
|
||||
|
||||
static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
|
||||
list_for_each_entry(flow, &p->flows, list) {
|
||||
if (flow->classid == classid)
|
||||
return flow;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
|
||||
struct Qdisc *new, struct Qdisc **old)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
|
||||
|
||||
pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
|
||||
sch, p, flow, new, old);
|
||||
if (list_empty(&flow->list))
|
||||
return -EINVAL;
|
||||
if (!new)
|
||||
new = &noop_qdisc;
|
||||
*old = flow->q;
|
||||
flow->q = new;
|
||||
if (*old)
|
||||
qdisc_reset(*old);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
|
||||
{
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
|
||||
|
||||
pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
|
||||
return flow ? flow->q : NULL;
|
||||
}
|
||||
|
||||
static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
|
||||
pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
|
||||
flow = lookup_flow(sch, classid);
|
||||
if (flow)
|
||||
flow->ref++;
|
||||
pr_debug("atm_tc_get: flow %p\n", flow);
|
||||
return (unsigned long)flow;
|
||||
}
|
||||
|
||||
static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
|
||||
unsigned long parent, u32 classid)
|
||||
{
|
||||
return atm_tc_get(sch, classid);
|
||||
}
|
||||
|
||||
/*
|
||||
* atm_tc_put handles all destructions, including the ones that are explicitly
|
||||
* requested (atm_tc_destroy, etc.). The assumption here is that we never drop
|
||||
* anything that still seems to be in use.
|
||||
*/
|
||||
static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
|
||||
|
||||
pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
|
||||
if (--flow->ref)
|
||||
return;
|
||||
pr_debug("atm_tc_put: destroying\n");
|
||||
list_del_init(&flow->list);
|
||||
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
|
||||
qdisc_destroy(flow->q);
|
||||
tcf_destroy_chain(&flow->filter_list);
|
||||
if (flow->sock) {
|
||||
pr_debug("atm_tc_put: f_count %ld\n",
|
||||
file_count(flow->sock->file));
|
||||
flow->vcc->pop = flow->old_pop;
|
||||
sockfd_put(flow->sock);
|
||||
}
|
||||
if (flow->excess)
|
||||
atm_tc_put(sch, (unsigned long)flow->excess);
|
||||
if (flow != &p->link)
|
||||
kfree(flow);
|
||||
/*
|
||||
* If flow == &p->link, the qdisc no longer works at this point and
|
||||
* needs to be removed. (By the caller of atm_tc_put.)
|
||||
*/
|
||||
}
|
||||
|
||||
static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
|
||||
{
|
||||
struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
|
||||
|
||||
pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
|
||||
VCC2FLOW(vcc)->old_pop(vcc, skb);
|
||||
tasklet_schedule(&p->task);
|
||||
}
|
||||
|
||||
static const u8 llc_oui_ip[] = {
|
||||
0xaa, /* DSAP: non-ISO */
|
||||
0xaa, /* SSAP: non-ISO */
|
||||
0x03, /* Ctrl: Unnumbered Information Command PDU */
|
||||
0x00, /* OUI: EtherType */
|
||||
0x00, 0x00,
|
||||
0x08, 0x00
|
||||
}; /* Ethertype IP (0800) */
|
||||
|
||||
static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
|
||||
[TCA_ATM_FD] = { .type = NLA_U32 },
|
||||
[TCA_ATM_EXCESS] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
|
||||
struct nlattr **tca, unsigned long *arg)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
|
||||
struct atm_flow_data *excess = NULL;
|
||||
struct nlattr *opt = tca[TCA_OPTIONS];
|
||||
struct nlattr *tb[TCA_ATM_MAX + 1];
|
||||
struct socket *sock;
|
||||
int fd, error, hdr_len;
|
||||
void *hdr;
|
||||
|
||||
pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
|
||||
"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
|
||||
/*
|
||||
* The concept of parents doesn't apply for this qdisc.
|
||||
*/
|
||||
if (parent && parent != TC_H_ROOT && parent != sch->handle)
|
||||
return -EINVAL;
|
||||
/*
|
||||
* ATM classes cannot be changed. In order to change properties of the
|
||||
* ATM connection, that socket needs to be modified directly (via the
|
||||
* native ATM API. In order to send a flow to a different VC, the old
|
||||
* class needs to be removed and a new one added. (This may be changed
|
||||
* later.)
|
||||
*/
|
||||
if (flow)
|
||||
return -EBUSY;
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy);
|
||||
if (error < 0)
|
||||
return error;
|
||||
|
||||
if (!tb[TCA_ATM_FD])
|
||||
return -EINVAL;
|
||||
fd = nla_get_u32(tb[TCA_ATM_FD]);
|
||||
pr_debug("atm_tc_change: fd %d\n", fd);
|
||||
if (tb[TCA_ATM_HDR]) {
|
||||
hdr_len = nla_len(tb[TCA_ATM_HDR]);
|
||||
hdr = nla_data(tb[TCA_ATM_HDR]);
|
||||
} else {
|
||||
hdr_len = RFC1483LLC_LEN;
|
||||
hdr = NULL; /* default LLC/SNAP for IP */
|
||||
}
|
||||
if (!tb[TCA_ATM_EXCESS])
|
||||
excess = NULL;
|
||||
else {
|
||||
excess = (struct atm_flow_data *)
|
||||
atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
|
||||
if (!excess)
|
||||
return -ENOENT;
|
||||
}
|
||||
pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
|
||||
opt->nla_type, nla_len(opt), hdr_len);
|
||||
sock = sockfd_lookup(fd, &error);
|
||||
if (!sock)
|
||||
return error; /* f_count++ */
|
||||
pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
|
||||
if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
|
||||
error = -EPROTOTYPE;
|
||||
goto err_out;
|
||||
}
|
||||
/* @@@ should check if the socket is really operational or we'll crash
|
||||
on vcc->send */
|
||||
if (classid) {
|
||||
if (TC_H_MAJ(classid ^ sch->handle)) {
|
||||
pr_debug("atm_tc_change: classid mismatch\n");
|
||||
error = -EINVAL;
|
||||
goto err_out;
|
||||
}
|
||||
} else {
|
||||
int i;
|
||||
unsigned long cl;
|
||||
|
||||
for (i = 1; i < 0x8000; i++) {
|
||||
classid = TC_H_MAKE(sch->handle, 0x8000 | i);
|
||||
cl = atm_tc_get(sch, classid);
|
||||
if (!cl)
|
||||
break;
|
||||
atm_tc_put(sch, cl);
|
||||
}
|
||||
}
|
||||
pr_debug("atm_tc_change: new id %x\n", classid);
|
||||
flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
|
||||
pr_debug("atm_tc_change: flow %p\n", flow);
|
||||
if (!flow) {
|
||||
error = -ENOBUFS;
|
||||
goto err_out;
|
||||
}
|
||||
RCU_INIT_POINTER(flow->filter_list, NULL);
|
||||
flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
|
||||
if (!flow->q)
|
||||
flow->q = &noop_qdisc;
|
||||
pr_debug("atm_tc_change: qdisc %p\n", flow->q);
|
||||
flow->sock = sock;
|
||||
flow->vcc = ATM_SD(sock); /* speedup */
|
||||
flow->vcc->user_back = flow;
|
||||
pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
|
||||
flow->old_pop = flow->vcc->pop;
|
||||
flow->parent = p;
|
||||
flow->vcc->pop = sch_atm_pop;
|
||||
flow->classid = classid;
|
||||
flow->ref = 1;
|
||||
flow->excess = excess;
|
||||
list_add(&flow->list, &p->link.list);
|
||||
flow->hdr_len = hdr_len;
|
||||
if (hdr)
|
||||
memcpy(flow->hdr, hdr, hdr_len);
|
||||
else
|
||||
memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
|
||||
*arg = (unsigned long)flow;
|
||||
return 0;
|
||||
err_out:
|
||||
if (excess)
|
||||
atm_tc_put(sch, (unsigned long)excess);
|
||||
sockfd_put(sock);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
|
||||
|
||||
pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
|
||||
if (list_empty(&flow->list))
|
||||
return -EINVAL;
|
||||
if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
|
||||
return -EBUSY;
|
||||
/*
|
||||
* Reference count must be 2: one for "keepalive" (set at class
|
||||
* creation), and one for the reference held when calling delete.
|
||||
*/
|
||||
if (flow->ref < 2) {
|
||||
pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
|
||||
return -EINVAL;
|
||||
}
|
||||
if (flow->ref > 2)
|
||||
return -EBUSY; /* catch references via excess, etc. */
|
||||
atm_tc_put(sch, arg);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
|
||||
pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
|
||||
if (walker->stop)
|
||||
return;
|
||||
list_for_each_entry(flow, &p->flows, list) {
|
||||
if (walker->count >= walker->skip &&
|
||||
walker->fn(sch, (unsigned long)flow, walker) < 0) {
|
||||
walker->stop = 1;
|
||||
break;
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
|
||||
|
||||
pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
|
||||
return flow ? &flow->filter_list : &p->link.filter_list;
|
||||
}
|
||||
|
||||
/* --------------------------- Qdisc operations ---------------------------- */
|
||||
|
||||
static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
struct tcf_result res;
|
||||
int result;
|
||||
int ret = NET_XMIT_POLICED;
|
||||
|
||||
pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
|
||||
result = TC_POLICE_OK; /* be nice to gcc */
|
||||
flow = NULL;
|
||||
if (TC_H_MAJ(skb->priority) != sch->handle ||
|
||||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
|
||||
struct tcf_proto *fl;
|
||||
|
||||
list_for_each_entry(flow, &p->flows, list) {
|
||||
fl = rcu_dereference_bh(flow->filter_list);
|
||||
if (fl) {
|
||||
result = tc_classify_compat(skb, fl, &res);
|
||||
if (result < 0)
|
||||
continue;
|
||||
flow = (struct atm_flow_data *)res.class;
|
||||
if (!flow)
|
||||
flow = lookup_flow(sch, res.classid);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
flow = NULL;
|
||||
done:
|
||||
;
|
||||
}
|
||||
if (!flow) {
|
||||
flow = &p->link;
|
||||
} else {
|
||||
if (flow->vcc)
|
||||
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
|
||||
/*@@@ looks good ... but it's not supposed to work :-) */
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (result) {
|
||||
case TC_ACT_QUEUED:
|
||||
case TC_ACT_STOLEN:
|
||||
kfree_skb(skb);
|
||||
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
kfree_skb(skb);
|
||||
goto drop;
|
||||
case TC_POLICE_RECLASSIFY:
|
||||
if (flow->excess)
|
||||
flow = flow->excess;
|
||||
else
|
||||
ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ret = qdisc_enqueue(skb, flow->q);
|
||||
if (ret != NET_XMIT_SUCCESS) {
|
||||
drop: __maybe_unused
|
||||
if (net_xmit_drop_count(ret)) {
|
||||
qdisc_qstats_drop(sch);
|
||||
if (flow)
|
||||
flow->qstats.drops++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
/*
|
||||
* Okay, this may seem weird. We pretend we've dropped the packet if
|
||||
* it goes via ATM. The reason for this is that the outer qdisc
|
||||
* expects to be able to q->dequeue the packet later on if we return
|
||||
* success at this place. Also, sch->q.qdisc needs to reflect whether
|
||||
* there is a packet eligible for dequeuing or not. Note that the
|
||||
* statistics of the outer qdisc are necessarily wrong because of all
|
||||
* this. There's currently no correct solution for this.
|
||||
*/
|
||||
if (flow == &p->link) {
|
||||
sch->q.qlen++;
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
tasklet_schedule(&p->task);
|
||||
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dequeue packets and send them over ATM. Note that we quite deliberately
|
||||
* avoid checking net_device's flow control here, simply because sch_atm
|
||||
* uses its own channels, which have nothing to do with any CLIP, LANE, or
|
||||
* non-ATM interfaces.
|
||||
*/
|
||||
|
||||
static void sch_atm_dequeue(unsigned long data)
|
||||
{
|
||||
struct Qdisc *sch = (struct Qdisc *)data;
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
struct sk_buff *skb;
|
||||
|
||||
pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
|
||||
list_for_each_entry(flow, &p->flows, list) {
|
||||
if (flow == &p->link)
|
||||
continue;
|
||||
/*
|
||||
* If traffic is properly shaped, this won't generate nasty
|
||||
* little bursts. Otherwise, it may ... (but that's okay)
|
||||
*/
|
||||
while ((skb = flow->q->ops->peek(flow->q))) {
|
||||
if (!atm_may_send(flow->vcc, skb->truesize))
|
||||
break;
|
||||
|
||||
skb = qdisc_dequeue_peeked(flow->q);
|
||||
if (unlikely(!skb))
|
||||
break;
|
||||
|
||||
qdisc_bstats_update(sch, skb);
|
||||
bstats_update(&flow->bstats, skb);
|
||||
pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
|
||||
/* remove any LL header somebody else has attached */
|
||||
skb_pull(skb, skb_network_offset(skb));
|
||||
if (skb_headroom(skb) < flow->hdr_len) {
|
||||
struct sk_buff *new;
|
||||
|
||||
new = skb_realloc_headroom(skb, flow->hdr_len);
|
||||
dev_kfree_skb(skb);
|
||||
if (!new)
|
||||
continue;
|
||||
skb = new;
|
||||
}
|
||||
pr_debug("sch_atm_dequeue: ip %p, data %p\n",
|
||||
skb_network_header(skb), skb->data);
|
||||
ATM_SKB(skb)->vcc = flow->vcc;
|
||||
memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
|
||||
flow->hdr_len);
|
||||
atomic_add(skb->truesize,
|
||||
&sk_atm(flow->vcc)->sk_wmem_alloc);
|
||||
/* atm.atm_options are already set by atm_tc_enqueue */
|
||||
flow->vcc->send(flow->vcc, skb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
|
||||
pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
|
||||
tasklet_schedule(&p->task);
|
||||
skb = qdisc_dequeue_peeked(p->link.q);
|
||||
if (skb)
|
||||
sch->q.qlen--;
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
|
||||
pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
|
||||
|
||||
return p->link.q->ops->peek(p->link.q);
|
||||
}
|
||||
|
||||
static unsigned int atm_tc_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
unsigned int len;
|
||||
|
||||
pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
|
||||
list_for_each_entry(flow, &p->flows, list) {
|
||||
if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
|
||||
return len;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
|
||||
pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
|
||||
INIT_LIST_HEAD(&p->flows);
|
||||
INIT_LIST_HEAD(&p->link.list);
|
||||
list_add(&p->link.list, &p->flows);
|
||||
p->link.q = qdisc_create_dflt(sch->dev_queue,
|
||||
&pfifo_qdisc_ops, sch->handle);
|
||||
if (!p->link.q)
|
||||
p->link.q = &noop_qdisc;
|
||||
pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
|
||||
RCU_INIT_POINTER(p->link.filter_list, NULL);
|
||||
p->link.vcc = NULL;
|
||||
p->link.sock = NULL;
|
||||
p->link.classid = sch->handle;
|
||||
p->link.ref = 1;
|
||||
tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void atm_tc_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow;
|
||||
|
||||
pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
|
||||
list_for_each_entry(flow, &p->flows, list)
|
||||
qdisc_reset(flow->q);
|
||||
sch->q.qlen = 0;
|
||||
}
|
||||
|
||||
static void atm_tc_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow, *tmp;
|
||||
|
||||
pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
|
||||
list_for_each_entry(flow, &p->flows, list)
|
||||
tcf_destroy_chain(&flow->filter_list);
|
||||
|
||||
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
|
||||
if (flow->ref > 1)
|
||||
pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
|
||||
atm_tc_put(sch, (unsigned long)flow);
|
||||
}
|
||||
tasklet_kill(&p->task);
|
||||
}
|
||||
|
||||
static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
struct atm_qdisc_data *p = qdisc_priv(sch);
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
|
||||
struct nlattr *nest;
|
||||
|
||||
pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
|
||||
sch, p, flow, skb, tcm);
|
||||
if (list_empty(&flow->list))
|
||||
return -EINVAL;
|
||||
tcm->tcm_handle = flow->classid;
|
||||
tcm->tcm_info = flow->q->handle;
|
||||
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
|
||||
goto nla_put_failure;
|
||||
if (flow->vcc) {
|
||||
struct sockaddr_atmpvc pvc;
|
||||
int state;
|
||||
|
||||
memset(&pvc, 0, sizeof(pvc));
|
||||
pvc.sap_family = AF_ATMPVC;
|
||||
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
|
||||
pvc.sap_addr.vpi = flow->vcc->vpi;
|
||||
pvc.sap_addr.vci = flow->vcc->vci;
|
||||
if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
|
||||
goto nla_put_failure;
|
||||
state = ATM_VF2VS(flow->vcc->flags);
|
||||
if (nla_put_u32(skb, TCA_ATM_STATE, state))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
if (flow->excess) {
|
||||
if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid))
|
||||
goto nla_put_failure;
|
||||
} else {
|
||||
if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
|
||||
goto nla_put_failure;
|
||||
}
|
||||
return nla_nest_end(skb, nest);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
return -1;
|
||||
}
|
||||
static int
|
||||
atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
|
||||
|
||||
if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops atm_class_ops = {
|
||||
.graft = atm_tc_graft,
|
||||
.leaf = atm_tc_leaf,
|
||||
.get = atm_tc_get,
|
||||
.put = atm_tc_put,
|
||||
.change = atm_tc_change,
|
||||
.delete = atm_tc_delete,
|
||||
.walk = atm_tc_walk,
|
||||
.tcf_chain = atm_tc_find_tcf,
|
||||
.bind_tcf = atm_tc_bind_filter,
|
||||
.unbind_tcf = atm_tc_put,
|
||||
.dump = atm_tc_dump_class,
|
||||
.dump_stats = atm_tc_dump_class_stats,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
|
||||
.cl_ops = &atm_class_ops,
|
||||
.id = "atm",
|
||||
.priv_size = sizeof(struct atm_qdisc_data),
|
||||
.enqueue = atm_tc_enqueue,
|
||||
.dequeue = atm_tc_dequeue,
|
||||
.peek = atm_tc_peek,
|
||||
.drop = atm_tc_drop,
|
||||
.init = atm_tc_init,
|
||||
.reset = atm_tc_reset,
|
||||
.destroy = atm_tc_destroy,
|
||||
.dump = atm_tc_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init atm_init(void)
|
||||
{
|
||||
return register_qdisc(&atm_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit atm_exit(void)
|
||||
{
|
||||
unregister_qdisc(&atm_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(atm_init)
|
||||
module_exit(atm_exit)
|
||||
MODULE_LICENSE("GPL");
53
net/sched/sch_blackhole.c
Normal file
@ -0,0 +1,53 @@
/*
 * net/sched/sch_blackhole.c	Black hole queue
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors:	Thomas Graf <tgraf@suug.ch>
 *
 * Note: Quantum tunneling is not supported.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>

static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	qdisc_drop(skb, sch);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
{
	return NULL;
}

static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
	.id		= "blackhole",
	.priv_size	= 0,
	.enqueue	= blackhole_enqueue,
	.dequeue	= blackhole_dequeue,
	.peek		= blackhole_dequeue,
	.owner		= THIS_MODULE,
};

static int __init blackhole_module_init(void)
{
	return register_qdisc(&blackhole_qdisc_ops);
}

static void __exit blackhole_module_exit(void)
{
	unregister_qdisc(&blackhole_qdisc_ops);
}

module_init(blackhole_module_init)
module_exit(blackhole_module_exit)

MODULE_LICENSE("GPL");
2062
net/sched/sch_cbq.c
Normal file
File diff suppressed because it is too large
645
net/sched/sch_choke.c
Normal file
@ -0,0 +1,645 @@
/*
|
||||
* net/sched/sch_choke.c CHOKE scheduler
|
||||
*
|
||||
* Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
|
||||
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 2 as published by the Free Software Foundation.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/red.h>
|
||||
#include <net/flow_keys.h>
|
||||
|
||||
/*
|
||||
CHOKe stateless AQM for fair bandwidth allocation
|
||||
=================================================
|
||||
|
||||
CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
|
||||
unresponsive flows) is a variant of RED that penalizes misbehaving flows but
|
||||
maintains no flow state. The difference from RED is an additional step
|
||||
during the enqueuing process. If average queue size is over the
|
||||
low threshold (qmin), a packet is chosen at random from the queue.
|
||||
If both the new and chosen packet are from the same flow, both
|
||||
are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
|
||||
needs to access packets in queue randomly. It has a minimal class
|
||||
interface to allow overriding the builtin flow classifier with
|
||||
filters.
|
||||
|
||||
Source:
|
||||
R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
|
||||
Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
|
||||
IEEE INFOCOM, 2000.
|
||||
|
||||
A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
|
||||
Characteristics", IEEE/ACM Transactions on Networking, 2004
|
||||
|
||||
*/
|
||||
|
||||
/* Upper bound on size of sk_buff table (packets) */
|
||||
#define CHOKE_MAX_QUEUE (128*1024 - 1)
|
||||
|
||||
struct choke_sched_data {
|
||||
/* Parameters */
|
||||
u32 limit;
|
||||
unsigned char flags;
|
||||
|
||||
struct red_parms parms;
|
||||
|
||||
/* Variables */
|
||||
struct red_vars vars;
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
struct {
|
||||
u32 prob_drop; /* Early probability drops */
|
||||
u32 prob_mark; /* Early probability marks */
|
||||
u32 forced_drop; /* Forced drops, qavg > max_thresh */
|
||||
u32 forced_mark; /* Forced marks, qavg > max_thresh */
|
||||
u32 pdrop; /* Drops due to queue limits */
|
||||
u32 other; /* Drops due to drop() calls */
|
||||
u32 matched; /* Drops to flow match */
|
||||
} stats;
|
||||
|
||||
unsigned int head;
|
||||
unsigned int tail;
|
||||
|
||||
unsigned int tab_mask; /* size - 1 */
|
||||
|
||||
struct sk_buff **tab;
|
||||
};
|
||||
|
||||
/* number of elements in queue including holes */
|
||||
static unsigned int choke_len(const struct choke_sched_data *q)
|
||||
{
|
||||
return (q->tail - q->head) & q->tab_mask;
|
||||
}
|
||||
|
||||
/* Is ECN parameter configured */
|
||||
static int use_ecn(const struct choke_sched_data *q)
|
||||
{
|
||||
return q->flags & TC_RED_ECN;
|
||||
}
|
||||
|
||||
/* Should packets over max just be dropped (versus marked) */
|
||||
static int use_harddrop(const struct choke_sched_data *q)
|
||||
{
|
||||
return q->flags & TC_RED_HARDDROP;
|
||||
}
|
||||
|
||||
/* Move head pointer forward to skip over holes */
|
||||
static void choke_zap_head_holes(struct choke_sched_data *q)
|
||||
{
|
||||
do {
|
||||
q->head = (q->head + 1) & q->tab_mask;
|
||||
if (q->head == q->tail)
|
||||
break;
|
||||
} while (q->tab[q->head] == NULL);
|
||||
}
|
||||
|
||||
/* Move tail pointer backwards to reuse holes */
|
||||
static void choke_zap_tail_holes(struct choke_sched_data *q)
|
||||
{
|
||||
do {
|
||||
q->tail = (q->tail - 1) & q->tab_mask;
|
||||
if (q->head == q->tail)
|
||||
break;
|
||||
} while (q->tab[q->tail] == NULL);
|
||||
}
|
||||
|
||||
/* Drop packet from queue array by creating a "hole" */
|
||||
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb = q->tab[idx];
|
||||
|
||||
q->tab[idx] = NULL;
|
||||
|
||||
if (idx == q->head)
|
||||
choke_zap_head_holes(q);
|
||||
if (idx == q->tail)
|
||||
choke_zap_tail_holes(q);
|
||||
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
qdisc_drop(skb, sch);
|
||||
qdisc_tree_decrease_qlen(sch, 1);
|
||||
--sch->q.qlen;
|
||||
}
|
||||
|
||||
/* private part of skb->cb[] that a qdisc is allowed to use
|
||||
* is limited to QDISC_CB_PRIV_LEN bytes.
|
||||
* As a flow key might be too large, we store a part of it only.
|
||||
*/
|
||||
#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
|
||||
|
||||
struct choke_skb_cb {
|
||||
u16 classid;
|
||||
u8 keys_valid;
|
||||
u8 keys[QDISC_CB_PRIV_LEN - 3];
|
||||
};
|
||||
|
||||
static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
|
||||
{
|
||||
qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
|
||||
return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
|
||||
}
|
||||
|
||||
static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
|
||||
{
|
||||
choke_skb_cb(skb)->classid = classid;
|
||||
}
|
||||
|
||||
static u16 choke_get_classid(const struct sk_buff *skb)
|
||||
{
|
||||
return choke_skb_cb(skb)->classid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare flow of two packets
|
||||
* Returns true only if source and destination address and port match.
|
||||
* false for special cases
|
||||
*/
|
||||
static bool choke_match_flow(struct sk_buff *skb1,
|
||||
struct sk_buff *skb2)
|
||||
{
|
||||
struct flow_keys temp;
|
||||
|
||||
if (skb1->protocol != skb2->protocol)
|
||||
return false;
|
||||
|
||||
if (!choke_skb_cb(skb1)->keys_valid) {
|
||||
choke_skb_cb(skb1)->keys_valid = 1;
|
||||
skb_flow_dissect(skb1, &temp);
|
||||
memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
|
||||
}
|
||||
|
||||
if (!choke_skb_cb(skb2)->keys_valid) {
|
||||
choke_skb_cb(skb2)->keys_valid = 1;
|
||||
skb_flow_dissect(skb2, &temp);
|
||||
memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
|
||||
}
|
||||
|
||||
return !memcmp(&choke_skb_cb(skb1)->keys,
|
||||
&choke_skb_cb(skb2)->keys,
|
||||
CHOKE_K_LEN);
|
||||
}
|
||||
|
||||
/*
|
||||
* Classify flow using either:
|
||||
* 1. pre-existing classification result in skb
|
||||
* 2. fast internal classification
|
||||
* 3. use TC filter based classification
|
||||
*/
|
||||
static bool choke_classify(struct sk_buff *skb,
|
||||
struct Qdisc *sch, int *qerr)
|
||||
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct tcf_result res;
|
||||
struct tcf_proto *fl;
|
||||
int result;
|
||||
|
||||
fl = rcu_dereference_bh(q->filter_list);
|
||||
result = tc_classify(skb, fl, &res);
|
||||
if (result >= 0) {
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (result) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
choke_set_classid(skb, TC_H_MIN(res.classid));
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Select a packet at random from queue
|
||||
* HACK: since queue can have holes from previous deletion; retry several
|
||||
* times to find a random skb but then just give up and return the head
|
||||
* Will return NULL if queue is empty (q->head == q->tail)
|
||||
*/
|
||||
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
|
||||
unsigned int *pidx)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
int retrys = 3;
|
||||
|
||||
do {
|
||||
*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
|
||||
skb = q->tab[*pidx];
|
||||
if (skb)
|
||||
return skb;
|
||||
} while (--retrys > 0);
|
||||
|
||||
return q->tab[*pidx = q->head];
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare new packet with random packet in queue
|
||||
* returns true if matched and sets *pidx
|
||||
*/
|
||||
static bool choke_match_random(const struct choke_sched_data *q,
|
||||
struct sk_buff *nskb,
|
||||
unsigned int *pidx)
|
||||
{
|
||||
struct sk_buff *oskb;
|
||||
|
||||
if (q->head == q->tail)
|
||||
return false;
|
||||
|
||||
oskb = choke_peek_random(q, pidx);
|
||||
if (rcu_access_pointer(q->filter_list))
|
||||
return choke_get_classid(nskb) == choke_get_classid(oskb);
|
||||
|
||||
return choke_match_flow(oskb, nskb);
|
||||
}
|
||||
|
||||
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
const struct red_parms *p = &q->parms;
|
||||
|
||||
if (rcu_access_pointer(q->filter_list)) {
|
||||
/* If using external classifiers, get result and record it. */
|
||||
if (!choke_classify(skb, sch, &ret))
|
||||
goto other_drop; /* Packet was eaten by filter */
|
||||
}
|
||||
|
||||
choke_skb_cb(skb)->keys_valid = 0;
|
||||
/* Compute average queue usage (see RED) */
|
||||
q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
|
||||
if (red_is_idling(&q->vars))
|
||||
red_end_of_idle_period(&q->vars);
|
||||
|
||||
/* Is queue small? */
|
||||
if (q->vars.qavg <= p->qth_min)
|
||||
q->vars.qcount = -1;
|
||||
else {
|
||||
unsigned int idx;
|
||||
|
||||
/* Draw a packet at random from queue and compare flow */
|
||||
if (choke_match_random(q, skb, &idx)) {
|
||||
q->stats.matched++;
|
||||
choke_drop_by_idx(sch, idx);
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
/* Queue is large, always mark/drop */
|
||||
if (q->vars.qavg > p->qth_max) {
|
||||
q->vars.qcount = -1;
|
||||
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (use_harddrop(q) || !use_ecn(q) ||
|
||||
!INET_ECN_set_ce(skb)) {
|
||||
q->stats.forced_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
q->stats.forced_mark++;
|
||||
} else if (++q->vars.qcount) {
|
||||
if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
|
||||
q->vars.qcount = 0;
|
||||
q->vars.qR = red_random(p);
|
||||
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
|
||||
q->stats.prob_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
q->stats.prob_mark++;
|
||||
}
|
||||
} else
|
||||
q->vars.qR = red_random(p);
|
||||
}
|
||||
|
||||
/* Admit new packet */
|
||||
if (sch->q.qlen < q->limit) {
|
||||
q->tab[q->tail] = skb;
|
||||
q->tail = (q->tail + 1) & q->tab_mask;
|
||||
++sch->q.qlen;
|
||||
qdisc_qstats_backlog_inc(sch, skb);
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
q->stats.pdrop++;
|
||||
return qdisc_drop(skb, sch);
|
||||
|
||||
congestion_drop:
|
||||
qdisc_drop(skb, sch);
|
||||
return NET_XMIT_CN;
|
||||
|
||||
other_drop:
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
|
||||
if (q->head == q->tail) {
|
||||
if (!red_is_idling(&q->vars))
|
||||
red_start_of_idle_period(&q->vars);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
skb = q->tab[q->head];
|
||||
q->tab[q->head] = NULL;
|
||||
choke_zap_head_holes(q);
|
||||
--sch->q.qlen;
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
qdisc_bstats_update(sch, skb);
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
static unsigned int choke_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int len;
|
||||
|
||||
len = qdisc_queue_drop(sch);
|
||||
if (len > 0)
|
||||
q->stats.other++;
|
||||
else {
|
||||
if (!red_is_idling(&q->vars))
|
||||
red_start_of_idle_period(&q->vars);
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
static void choke_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
red_restart(&q->vars);
|
||||
}
|
||||
|
||||
static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
|
||||
[TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
|
||||
[TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
|
||||
[TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
|
||||
static void choke_free(void *addr)
|
||||
{
|
||||
kvfree(addr);
|
||||
}
|
||||
|
||||
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_CHOKE_MAX + 1];
|
||||
const struct tc_red_qopt *ctl;
|
||||
int err;
|
||||
struct sk_buff **old = NULL;
|
||||
unsigned int mask;
|
||||
u32 max_P;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_CHOKE_PARMS] == NULL ||
|
||||
tb[TCA_CHOKE_STAB] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
|
||||
|
||||
ctl = nla_data(tb[TCA_CHOKE_PARMS]);
|
||||
|
||||
if (ctl->limit > CHOKE_MAX_QUEUE)
|
||||
return -EINVAL;
|
||||
|
||||
mask = roundup_pow_of_two(ctl->limit + 1) - 1;
|
||||
if (mask != q->tab_mask) {
|
||||
struct sk_buff **ntab;
|
||||
|
||||
ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
|
||||
GFP_KERNEL | __GFP_NOWARN);
|
||||
if (!ntab)
|
||||
ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
|
||||
if (!ntab)
|
||||
return -ENOMEM;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
old = q->tab;
|
||||
if (old) {
|
||||
unsigned int oqlen = sch->q.qlen, tail = 0;
|
||||
|
||||
while (q->head != q->tail) {
|
||||
struct sk_buff *skb = q->tab[q->head];
|
||||
|
||||
q->head = (q->head + 1) & q->tab_mask;
|
||||
if (!skb)
|
||||
continue;
|
||||
if (tail < mask) {
|
||||
ntab[tail++] = skb;
|
||||
continue;
|
||||
}
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
--sch->q.qlen;
|
||||
qdisc_drop(skb, sch);
|
||||
}
|
||||
qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
|
||||
q->head = 0;
|
||||
q->tail = tail;
|
||||
}
|
||||
|
||||
q->tab_mask = mask;
|
||||
q->tab = ntab;
|
||||
} else
|
||||
sch_tree_lock(sch);
|
||||
|
||||
q->flags = ctl->flags;
|
||||
q->limit = ctl->limit;
|
||||
|
||||
red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
|
||||
ctl->Plog, ctl->Scell_log,
|
||||
nla_data(tb[TCA_CHOKE_STAB]),
|
||||
max_P);
|
||||
red_set_vars(&q->vars);
|
||||
|
||||
if (q->head == q->tail)
|
||||
red_end_of_idle_period(&q->vars);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
choke_free(old);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int choke_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
return choke_change(sch, opt);
|
||||
}
|
||||
|
||||
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts = NULL;
|
||||
struct tc_red_qopt opt = {
|
||||
.limit = q->limit,
|
||||
.flags = q->flags,
|
||||
.qth_min = q->parms.qth_min >> q->parms.Wlog,
|
||||
.qth_max = q->parms.qth_max >> q->parms.Wlog,
|
||||
.Wlog = q->parms.Wlog,
|
||||
.Plog = q->parms.Plog,
|
||||
.Scell_log = q->parms.Scell_log,
|
||||
};
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
|
||||
nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
|
||||
goto nla_put_failure;
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, opts);
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_choke_xstats st = {
|
||||
.early = q->stats.prob_drop + q->stats.forced_drop,
|
||||
.marked = q->stats.prob_mark + q->stats.forced_mark,
|
||||
.pdrop = q->stats.pdrop,
|
||||
.other = q->stats.other,
|
||||
.matched = q->stats.matched,
|
||||
};
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static void choke_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
choke_free(q->tab);
|
||||
}
|
||||
|
||||
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned long choke_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void choke_put(struct Qdisc *q, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
|
||||
u32 classid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
if (!arg->stop) {
|
||||
if (arg->fn(sch, 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
return;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops choke_class_ops = {
|
||||
.leaf = choke_leaf,
|
||||
.get = choke_get,
|
||||
.put = choke_put,
|
||||
.tcf_chain = choke_find_tcf,
|
||||
.bind_tcf = choke_bind,
|
||||
.unbind_tcf = choke_put,
|
||||
.dump = choke_dump_class,
|
||||
.walk = choke_walk,
|
||||
};
|
||||
|
||||
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
|
||||
{
|
||||
struct choke_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
return (q->head != q->tail) ? q->tab[q->head] : NULL;
|
||||
}
|
||||
|
||||
static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
|
||||
.id = "choke",
|
||||
.priv_size = sizeof(struct choke_sched_data),
|
||||
|
||||
.enqueue = choke_enqueue,
|
||||
.dequeue = choke_dequeue,
|
||||
.peek = choke_peek_head,
|
||||
.drop = choke_drop,
|
||||
.init = choke_init,
|
||||
.destroy = choke_destroy,
|
||||
.reset = choke_reset,
|
||||
.change = choke_change,
|
||||
.dump = choke_dump,
|
||||
.dump_stats = choke_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init choke_module_init(void)
|
||||
{
|
||||
return register_qdisc(&choke_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit choke_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&choke_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(choke_module_init)
|
||||
module_exit(choke_module_exit)
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
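Editor's note: the comment block at the top of sch_choke.c describes the extra step CHOKe adds to RED, comparing the arriving packet with one drawn at random from the queue. The following is a minimal userspace sketch of just that comparison step, not part of the kernel file; "struct pkt", its flow_key field, and the use of rand() are simplified stand-ins for the kernel's flow dissector and ring buffer.

/* Illustrative sketch only: if the randomly chosen queued packet shares a
 * flow with the arriving one, CHOKe would drop both.
 */
#include <stdbool.h>
#include <stdlib.h>

struct pkt { unsigned int flow_key; };

static bool choke_would_drop_pair(const struct pkt *ring, unsigned int head,
				  unsigned int len, unsigned int mask,
				  const struct pkt *incoming)
{
	unsigned int idx;

	if (len == 0)
		return false;
	idx = (head + (unsigned int)rand() % len) & mask;	/* random slot */
	return ring[idx].flow_key == incoming->flow_key;
}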
276
net/sched/sch_codel.c
Normal file
@@ -0,0 +1,276 @@
/*
 * Codel - The Controlled-Delay Active Queue Management algorithm
 *
 *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
 *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
 *
 *  Implemented on linux by :
 *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
 *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The names of the authors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/prefetch.h>
#include <net/pkt_sched.h>
#include <net/codel.h>

#define DEFAULT_CODEL_LIMIT 1000

struct codel_sched_data {
	struct codel_params	params;
	struct codel_vars	vars;
	struct codel_stats	stats;
	u32			drop_overlimit;
};

/* This is the specific function called from codel_dequeue()
 * to dequeue a packet from queue. Note: backlog is handled in
 * codel, we dont need to reduce it here.
 */
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
	struct sk_buff *skb = __skb_dequeue(&sch->q);

	prefetch(&skb->end); /* we'll need skb_shinfo() */
	return skb;
}

static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
{
	struct codel_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);

	/* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
	 * or HTB crashes. Defer it for next round.
	 */
	if (q->stats.drop_count && sch->q.qlen) {
		qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
		q->stats.drop_count = 0;
	}
	if (skb)
		qdisc_bstats_update(sch, skb);
	return skb;
}

static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct codel_sched_data *q;

	if (likely(qdisc_qlen(sch) < sch->limit)) {
		codel_set_enqueue_time(skb);
		return qdisc_enqueue_tail(skb, sch);
	}
	q = qdisc_priv(sch);
	q->drop_overlimit++;
	return qdisc_drop(skb, sch);
}

static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
	[TCA_CODEL_TARGET]	= { .type = NLA_U32 },
	[TCA_CODEL_LIMIT]	= { .type = NLA_U32 },
	[TCA_CODEL_INTERVAL]	= { .type = NLA_U32 },
	[TCA_CODEL_ECN]		= { .type = NLA_U32 },
};

static int codel_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CODEL_MAX + 1];
	unsigned int qlen;
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	if (tb[TCA_CODEL_TARGET]) {
		u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);

		q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_CODEL_INTERVAL]) {
		u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);

		q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
	}

	if (tb[TCA_CODEL_LIMIT])
		sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);

	if (tb[TCA_CODEL_ECN])
		q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);

	qlen = sch->q.qlen;
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = __skb_dequeue(&sch->q);

		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop(skb, sch);
	}
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	sch_tree_unlock(sch);
	return 0;
}

static int codel_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct codel_sched_data *q = qdisc_priv(sch);

	sch->limit = DEFAULT_CODEL_LIMIT;

	codel_params_init(&q->params);
	codel_vars_init(&q->vars);
	codel_stats_init(&q->stats);

	if (opt) {
		int err = codel_change(sch, opt);

		if (err)
			return err;
	}

	if (sch->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;

	return 0;
}

static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct codel_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CODEL_TARGET,
			codel_time_to_us(q->params.target)) ||
	    nla_put_u32(skb, TCA_CODEL_LIMIT,
			sch->limit) ||
	    nla_put_u32(skb, TCA_CODEL_INTERVAL,
			codel_time_to_us(q->params.interval)) ||
	    nla_put_u32(skb, TCA_CODEL_ECN,
			q->params.ecn))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -1;
}

static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	const struct codel_sched_data *q = qdisc_priv(sch);
	struct tc_codel_xstats st = {
		.maxpacket	= q->stats.maxpacket,
		.count		= q->vars.count,
		.lastcount	= q->vars.lastcount,
		.drop_overlimit = q->drop_overlimit,
		.ldelay		= codel_time_to_us(q->vars.ldelay),
		.dropping	= q->vars.dropping,
		.ecn_mark	= q->stats.ecn_mark,
	};

	if (q->vars.dropping) {
		codel_tdiff_t delta = q->vars.drop_next - codel_get_time();

		if (delta >= 0)
			st.drop_next = codel_time_to_us(delta);
		else
			st.drop_next = -codel_time_to_us(-delta);
	}

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void codel_reset(struct Qdisc *sch)
{
	struct codel_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	codel_vars_init(&q->vars);
}

static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
	.id		= "codel",
	.priv_size	= sizeof(struct codel_sched_data),

	.enqueue	= codel_qdisc_enqueue,
	.dequeue	= codel_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= codel_init,
	.reset		= codel_reset,
	.change		= codel_change,
	.dump		= codel_dump,
	.dump_stats	= codel_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init codel_module_init(void)
{
	return register_qdisc(&codel_qdisc_ops);
}

static void __exit codel_module_exit(void)
{
	unregister_qdisc(&codel_qdisc_ops);
}

module_init(codel_module_init)
module_exit(codel_module_exit)

MODULE_DESCRIPTION("Controlled Delay queue discipline");
MODULE_AUTHOR("Dave Taht");
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("Dual BSD/GPL");
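Editor's note: codel_change() above converts the microsecond values passed over netlink into CoDel's internal fixed-point time units with ((u64)us * NSEC_PER_USEC) >> CODEL_SHIFT. The sketch below shows that arithmetic in isolation; it assumes CODEL_SHIFT is 10 and NSEC_PER_USEC is 1000, as defined elsewhere in this tree, and the 5000 us value is only an illustrative input, not something this file sets.

/* Userspace sketch of the unit conversion performed in codel_change(). */
#include <stdint.h>
#include <stdio.h>

#define CODEL_SHIFT	10	/* assumed value from include/net/codel.h */
#define NSEC_PER_USEC	1000ULL

int main(void)
{
	uint32_t target_us = 5000;	/* example: a 5 ms target */
	uint64_t target = ((uint64_t)target_us * NSEC_PER_USEC) >> CODEL_SHIFT;

	printf("%u us -> %llu codel time units\n",
	       target_us, (unsigned long long)target);
	return 0;
}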
529
net/sched/sch_drr.c
Normal file
@@ -0,0 +1,529 @@
/*
 * net/sched/sch_drr.c         Deficit Round Robin scheduler
 *
 * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

struct drr_class {
	struct Qdisc_class_common	common;
	unsigned int			refcnt;
	unsigned int			filter_cnt;

	struct gnet_stats_basic_packed	bstats;
	struct gnet_stats_queue		qstats;
	struct gnet_stats_rate_est64	rate_est;
	struct list_head		alist;
	struct Qdisc			*qdisc;

	u32				quantum;
	u32				deficit;
};

struct drr_sched {
	struct list_head		active;
	struct tcf_proto __rcu		*filter_list;
	struct Qdisc_class_hash		clhash;
};

static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct drr_class, common);
}

static void drr_purge_queue(struct drr_class *cl)
{
	unsigned int len = cl->qdisc->q.qlen;

	qdisc_reset(cl->qdisc);
	qdisc_tree_decrease_qlen(cl->qdisc, len);
}

static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
	[TCA_DRR_QUANTUM]	= { .type = NLA_U32 },
};

static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl = (struct drr_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_DRR_MAX + 1];
	u32 quantum;
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
	if (err < 0)
		return err;

	if (tb[TCA_DRR_QUANTUM]) {
		quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
		if (quantum == 0)
			return -EINVAL;
	} else
		quantum = psched_mtu(qdisc_dev(sch));

	if (cl != NULL) {
		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    qdisc_root_sleeping_lock(sch),
						    tca[TCA_RATE]);
			if (err)
				return err;
		}

		sch_tree_lock(sch);
		if (tb[TCA_DRR_QUANTUM])
			cl->quantum = quantum;
		sch_tree_unlock(sch);

		return 0;
	}

	cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
	if (cl == NULL)
		return -ENOBUFS;

	cl->refcnt	   = 1;
	cl->common.classid = classid;
	cl->quantum	   = quantum;
	cl->qdisc	   = qdisc_create_dflt(sch->dev_queue,
					       &pfifo_qdisc_ops, classid);
	if (cl->qdisc == NULL)
		cl->qdisc = &noop_qdisc;

	if (tca[TCA_RATE]) {
		err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
		if (err) {
			qdisc_destroy(cl->qdisc);
			kfree(cl);
			return err;
		}
	}

	sch_tree_lock(sch);
	qdisc_class_hash_insert(&q->clhash, &cl->common);
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;
}

static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	qdisc_destroy(cl->qdisc);
	kfree(cl);
}

static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl = (struct drr_class *)arg;

	if (cl->filter_cnt > 0)
		return -EBUSY;

	sch_tree_lock(sch);

	drr_purge_queue(cl);
	qdisc_class_hash_remove(&q->clhash, &cl->common);

	BUG_ON(--cl->refcnt == 0);
	/*
	 * This shouldn't happen: we "hold" one cops->get() when called
	 * from tc_ctl_tclass; the destroy method is done from cops->put().
	 */

	sch_tree_unlock(sch);
	return 0;
}

static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
{
	struct drr_class *cl = drr_find_class(sch, classid);

	if (cl != NULL)
		cl->refcnt++;

	return (unsigned long)cl;
}

static void drr_put_class(struct Qdisc *sch, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	if (--cl->refcnt == 0)
		drr_destroy_class(sch, cl);
}

static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch,
					      unsigned long cl)
{
	struct drr_sched *q = qdisc_priv(sch);

	if (cl)
		return NULL;

	return &q->filter_list;
}

static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
				  u32 classid)
{
	struct drr_class *cl = drr_find_class(sch, classid);

	if (cl != NULL)
		cl->filter_cnt++;

	return (unsigned long)cl;
}

static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	cl->filter_cnt--;
}

static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
			   struct Qdisc *new, struct Qdisc **old)
{
	struct drr_class *cl = (struct drr_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue,
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			new = &noop_qdisc;
	}

	sch_tree_lock(sch);
	drr_purge_queue(cl);
	*old = cl->qdisc;
	cl->qdisc = new;
	sch_tree_unlock(sch);
	return 0;
}

static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	return cl->qdisc;
}

static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg)
{
	struct drr_class *cl = (struct drr_class *)arg;

	if (cl->qdisc->q.qlen == 0)
		list_del(&cl->alist);
}

static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct drr_class *cl = (struct drr_class *)arg;
	struct nlattr *nest;

	tcm->tcm_parent	= TC_H_ROOT;
	tcm->tcm_handle	= cl->common.classid;
	tcm->tcm_info	= cl->qdisc->handle;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}

static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
				struct gnet_dump *d)
{
	struct drr_class *cl = (struct drr_class *)arg;
	__u32 qlen = cl->qdisc->q.qlen;
	struct tc_drr_stats xstats;

	memset(&xstats, 0, sizeof(xstats));
	if (qlen)
		xstats.deficit = cl->deficit;

	if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
		return -1;

	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
		cl = drr_find_class(sch, skb->priority);
		if (cl != NULL)
			return cl;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	fl = rcu_dereference_bh(q->filter_list);
	result = tc_classify(skb, fl, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return NULL;
		}
#endif
		cl = (struct drr_class *)res.class;
		if (cl == NULL)
			cl = drr_find_class(sch, res.classid);
		return cl;
	}
	return NULL;
}

static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	int err = 0;

	cl = drr_classify(skb, sch, &err);
	if (cl == NULL) {
		if (err & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return err;
	}

	err = qdisc_enqueue(skb, cl->qdisc);
	if (unlikely(err != NET_XMIT_SUCCESS)) {
		if (net_xmit_drop_count(err)) {
			cl->qstats.drops++;
			qdisc_qstats_drop(sch);
		}
		return err;
	}

	if (cl->qdisc->q.qlen == 1) {
		list_add_tail(&cl->alist, &q->active);
		cl->deficit = cl->quantum;
	}

	sch->q.qlen++;
	return err;
}

static struct sk_buff *drr_dequeue(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct sk_buff *skb;
	unsigned int len;

	if (list_empty(&q->active))
		goto out;
	while (1) {
		cl = list_first_entry(&q->active, struct drr_class, alist);
		skb = cl->qdisc->ops->peek(cl->qdisc);
		if (skb == NULL) {
			qdisc_warn_nonwc(__func__, cl->qdisc);
			goto out;
		}

		len = qdisc_pkt_len(skb);
		if (len <= cl->deficit) {
			cl->deficit -= len;
			skb = qdisc_dequeue_peeked(cl->qdisc);
			if (cl->qdisc->q.qlen == 0)
				list_del(&cl->alist);

			bstats_update(&cl->bstats, skb);
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			return skb;
		}

		cl->deficit += cl->quantum;
		list_move_tail(&cl->alist, &q->active);
	}
out:
	return NULL;
}

static unsigned int drr_drop(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	unsigned int len;

	list_for_each_entry(cl, &q->active, alist) {
		if (cl->qdisc->ops->drop) {
			len = cl->qdisc->ops->drop(cl->qdisc);
			if (len > 0) {
				sch->q.qlen--;
				if (cl->qdisc->q.qlen == 0)
					list_del(&cl->alist);
				return len;
			}
		}
	}
	return 0;
}

static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
{
	struct drr_sched *q = qdisc_priv(sch);
	int err;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		return err;
	INIT_LIST_HEAD(&q->active);
	return 0;
}

static void drr_reset_qdisc(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	unsigned int i;

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
			if (cl->qdisc->q.qlen)
				list_del(&cl->alist);
			qdisc_reset(cl->qdisc);
		}
	}
	sch->q.qlen = 0;
}

static void drr_destroy_qdisc(struct Qdisc *sch)
{
	struct drr_sched *q = qdisc_priv(sch);
	struct drr_class *cl;
	struct hlist_node *next;
	unsigned int i;

	tcf_destroy_chain(&q->filter_list);

	for (i = 0; i < q->clhash.hashsize; i++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
					  common.hnode)
			drr_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}

static const struct Qdisc_class_ops drr_class_ops = {
	.change		= drr_change_class,
	.delete		= drr_delete_class,
	.get		= drr_get_class,
	.put		= drr_put_class,
	.tcf_chain	= drr_tcf_chain,
	.bind_tcf	= drr_bind_tcf,
	.unbind_tcf	= drr_unbind_tcf,
	.graft		= drr_graft_class,
	.leaf		= drr_class_leaf,
	.qlen_notify	= drr_qlen_notify,
	.dump		= drr_dump_class,
	.dump_stats	= drr_dump_class_stats,
	.walk		= drr_walk,
};

static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
	.cl_ops		= &drr_class_ops,
	.id		= "drr",
	.priv_size	= sizeof(struct drr_sched),
	.enqueue	= drr_enqueue,
	.dequeue	= drr_dequeue,
	.peek		= qdisc_peek_dequeued,
	.drop		= drr_drop,
	.init		= drr_init_qdisc,
	.reset		= drr_reset_qdisc,
	.destroy	= drr_destroy_qdisc,
	.owner		= THIS_MODULE,
};

static int __init drr_init(void)
{
	return register_qdisc(&drr_qdisc_ops);
}

static void __exit drr_exit(void)
{
	unregister_qdisc(&drr_qdisc_ops);
}

module_init(drr_init);
module_exit(drr_exit);
MODULE_LICENSE("GPL");
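Editor's note: the core of drr_dequeue() above is the deficit test: the head class may send its head packet only while its deficit covers the packet length; otherwise it earns another quantum and rotates to the back of the active list. The following is a standalone sketch of just that decision, not kernel code; "struct klass" and the explicit pkt_len argument are simplified stand-ins for drr_class and qdisc_pkt_len().

/* Illustrative DRR decision: returns 1 if the head packet may be sent now. */
struct klass {
	unsigned int quantum;	/* credit added per round */
	unsigned int deficit;	/* remaining credit this round */
};

static int drr_may_send(struct klass *cl, unsigned int pkt_len)
{
	if (pkt_len <= cl->deficit) {
		cl->deficit -= pkt_len;	/* spend credit, stay at the head */
		return 1;
	}
	cl->deficit += cl->quantum;	/* earn credit, go to the back */
	return 0;
}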
514
net/sched/sch_dsmark.c
Normal file
@@ -0,0 +1,514 @@
/* net/sched/sch_dsmark.c - Differentiated Services field marker */

/* Written 1998-2000 by Werner Almesberger, EPFL ICA */


#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/bitops.h>
#include <net/pkt_sched.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <asm/byteorder.h>

/*
 * classid	class		marking
 * -------	-----		-------
 *   n/a	  0		n/a
 *   x:0	  1		use entry [0]
 *   ...	 ...		...
 *   x:y y>0	 y+1		use entry [y]
 *   ...	 ...		...
 * x:indices-1	indices		use entry [indices-1]
 *   ...	 ...		...
 *   x:y	 y+1		use entry [y & (indices-1)]
 *   ...	 ...		...
 * 0xffff	0x10000		use entry [indices-1]
 */


#define NO_DEFAULT_INDEX	(1 << 16)

struct dsmark_qdisc_data {
	struct Qdisc		*q;
	struct tcf_proto __rcu	*filter_list;
	u8			*mask;	/* "owns" the array */
	u8			*value;
	u16			indices;
	u32			default_index;	/* index range is 0...0xffff */
	int			set_tc_index;
};

static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
{
	return index <= p->indices && index > 0;
}

/* ------------------------- Class/flow operations ------------------------- */

static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
			struct Qdisc *new, struct Qdisc **old)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);

	pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
		 __func__, sch, p, new, old);

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
					sch->handle);
		if (new == NULL)
			new = &noop_qdisc;
	}

	sch_tree_lock(sch);
	*old = p->q;
	p->q = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	return p->q;
}

static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
{
	pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
		 __func__, sch, qdisc_priv(sch), classid);

	return TC_H_MIN(classid) + 1;
}

static unsigned long dsmark_bind_filter(struct Qdisc *sch,
					unsigned long parent, u32 classid)
{
	return dsmark_get(sch, classid);
}

static void dsmark_put(struct Qdisc *sch, unsigned long cl)
{
}

static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
	[TCA_DSMARK_INDICES]		= { .type = NLA_U16 },
	[TCA_DSMARK_DEFAULT_INDEX]	= { .type = NLA_U16 },
	[TCA_DSMARK_SET_TC_INDEX]	= { .type = NLA_FLAG },
	[TCA_DSMARK_MASK]		= { .type = NLA_U8 },
	[TCA_DSMARK_VALUE]		= { .type = NLA_U8 },
};

static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
			 struct nlattr **tca, unsigned long *arg)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_DSMARK_MAX + 1];
	int err = -EINVAL;
	u8 mask = 0;

	pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
		 __func__, sch, p, classid, parent, *arg);

	if (!dsmark_valid_index(p, *arg)) {
		err = -ENOENT;
		goto errout;
	}

	if (!opt)
		goto errout;

	err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
	if (err < 0)
		goto errout;

	if (tb[TCA_DSMARK_MASK])
		mask = nla_get_u8(tb[TCA_DSMARK_MASK]);

	if (tb[TCA_DSMARK_VALUE])
		p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);

	if (tb[TCA_DSMARK_MASK])
		p->mask[*arg - 1] = mask;

	err = 0;

errout:
	return err;
}

static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);

	if (!dsmark_valid_index(p, arg))
		return -EINVAL;

	p->mask[arg - 1] = 0xff;
	p->value[arg - 1] = 0;

	return 0;
}

static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	int i;

	pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
		 __func__, sch, p, walker);

	if (walker->stop)
		return;

	for (i = 0; i < p->indices; i++) {
		if (p->mask[i] == 0xff && !p->value[i])
			goto ignore;
		if (walker->count >= walker->skip) {
			if (walker->fn(sch, i + 1, walker) < 0) {
				walker->stop = 1;
				break;
			}
		}
ignore:
		walker->count++;
	}
}

static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
						       unsigned long cl)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	return &p->filter_list;
}

/* --------------------------- Qdisc operations ---------------------------- */

static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	int err;

	pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);

	if (p->set_tc_index) {
		switch (skb->protocol) {
		case htons(ETH_P_IP):
			if (skb_cow_head(skb, sizeof(struct iphdr)))
				goto drop;

			skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
				& ~INET_ECN_MASK;
			break;

		case htons(ETH_P_IPV6):
			if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
				goto drop;

			skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
				& ~INET_ECN_MASK;
			break;
		default:
			skb->tc_index = 0;
			break;
		}
	}

	if (TC_H_MAJ(skb->priority) == sch->handle)
		skb->tc_index = TC_H_MIN(skb->priority);
	else {
		struct tcf_result res;
		struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
		int result = tc_classify(skb, fl, &res);

		pr_debug("result %d class 0x%04x\n", result, res.classid);

		switch (result) {
#ifdef CONFIG_NET_CLS_ACT
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			kfree_skb(skb);
			return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;

		case TC_ACT_SHOT:
			goto drop;
#endif
		case TC_ACT_OK:
			skb->tc_index = TC_H_MIN(res.classid);
			break;

		default:
			if (p->default_index != NO_DEFAULT_INDEX)
				skb->tc_index = p->default_index;
			break;
		}
	}

	err = qdisc_enqueue(skb, p->q);
	if (err != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(err))
			qdisc_qstats_drop(sch);
		return err;
	}

	sch->q.qlen++;

	return NET_XMIT_SUCCESS;

drop:
	qdisc_drop(skb, sch);
	return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}

static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct sk_buff *skb;
	u32 index;

	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);

	skb = p->q->ops->dequeue(p->q);
	if (skb == NULL)
		return NULL;

	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;

	index = skb->tc_index & (p->indices - 1);
	pr_debug("index %d->%d\n", skb->tc_index, index);

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
				    p->value[index]);
		break;
	case htons(ETH_P_IPV6):
		ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
				    p->value[index]);
		break;
	default:
		/*
		 * Only complain if a change was actually attempted.
		 * This way, we can send non-IP traffic through dsmark
		 * and don't need yet another qdisc as a bypass.
		 */
		if (p->mask[index] != 0xff || p->value[index])
			pr_warn("%s: unsupported protocol %d\n",
				__func__, ntohs(skb->protocol));
		break;
	}

	return skb;
}

static struct sk_buff *dsmark_peek(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);

	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);

	return p->q->ops->peek(p->q);
}

static unsigned int dsmark_drop(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	unsigned int len;

	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);

	if (p->q->ops->drop == NULL)
		return 0;

	len = p->q->ops->drop(p->q);
	if (len)
		sch->q.qlen--;

	return len;
}

static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *tb[TCA_DSMARK_MAX + 1];
	int err = -EINVAL;
	u32 default_index = NO_DEFAULT_INDEX;
	u16 indices;
	u8 *mask;

	pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);

	if (!opt)
		goto errout;

	err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);

	if (hweight32(indices) != 1)
		goto errout;

	if (tb[TCA_DSMARK_DEFAULT_INDEX])
		default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);

	mask = kmalloc(indices * 2, GFP_KERNEL);
	if (mask == NULL) {
		err = -ENOMEM;
		goto errout;
	}

	p->mask = mask;
	memset(p->mask, 0xff, indices);

	p->value = p->mask + indices;
	memset(p->value, 0, indices);

	p->indices = indices;
	p->default_index = default_index;
	p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);

	p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
	if (p->q == NULL)
		p->q = &noop_qdisc;

	pr_debug("%s: qdisc %p\n", __func__, p->q);

	err = 0;
errout:
	return err;
}

static void dsmark_reset(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);

	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
	qdisc_reset(p->q);
	sch->q.qlen = 0;
}

static void dsmark_destroy(struct Qdisc *sch)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);

	pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);

	tcf_destroy_chain(&p->filter_list);
	qdisc_destroy(p->q);
	kfree(p->mask);
}

static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
			     struct sk_buff *skb, struct tcmsg *tcm)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *opts = NULL;

	pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);

	if (!dsmark_valid_index(p, cl))
		return -EINVAL;

	tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
	tcm->tcm_info = p->q->handle;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
	    nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct dsmark_qdisc_data *p = qdisc_priv(sch);
	struct nlattr *opts = NULL;

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;
	if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
		goto nla_put_failure;

	if (p->default_index != NO_DEFAULT_INDEX &&
	    nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
		goto nla_put_failure;

	if (p->set_tc_index &&
	    nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static const struct Qdisc_class_ops dsmark_class_ops = {
	.graft		= dsmark_graft,
	.leaf		= dsmark_leaf,
	.get		= dsmark_get,
	.put		= dsmark_put,
	.change		= dsmark_change,
	.delete		= dsmark_delete,
	.walk		= dsmark_walk,
	.tcf_chain	= dsmark_find_tcf,
	.bind_tcf	= dsmark_bind_filter,
	.unbind_tcf	= dsmark_put,
	.dump		= dsmark_dump_class,
};

static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
	.next		= NULL,
	.cl_ops		= &dsmark_class_ops,
	.id		= "dsmark",
	.priv_size	= sizeof(struct dsmark_qdisc_data),
	.enqueue	= dsmark_enqueue,
	.dequeue	= dsmark_dequeue,
	.peek		= dsmark_peek,
	.drop		= dsmark_drop,
	.init		= dsmark_init,
	.reset		= dsmark_reset,
	.destroy	= dsmark_destroy,
	.change		= NULL,
	.dump		= dsmark_dump,
	.owner		= THIS_MODULE,
};

static int __init dsmark_module_init(void)
{
	return register_qdisc(&dsmark_qdisc_ops);
}

static void __exit dsmark_module_exit(void)
{
	unregister_qdisc(&dsmark_qdisc_ops);
}

module_init(dsmark_module_init)
module_exit(dsmark_module_exit)

MODULE_LICENSE("GPL");
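Editor's note: the classid/marking table at the top of sch_dsmark.c and dsmark_dequeue() above both rely on skb->tc_index being folded into the mask/value tables with a power-of-two mask. The sketch below shows that lookup on its own; it is illustrative only, and the helper name and standalone types are not part of the kernel file.

/* Illustrative dsmark table lookup: indices must be a power of two
 * (dsmark_init() enforces this via hweight32(indices) == 1).
 */
#include <stdint.h>

static uint32_t dsmark_table_index(uint16_t tc_index, uint16_t indices)
{
	return tc_index & (indices - 1);	/* e.g. indices = 64 -> slots 0..63 */
}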
180
net/sched/sch_fifo.c
Normal file
180
net/sched/sch_fifo.c
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
/*
|
||||
* net/sched/sch_fifo.c The simplest FIFO queue.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
/* 1 band FIFO pseudo-"scheduler" */
|
||||
|
||||
static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
|
||||
return qdisc_reshape_fail(skb, sch);
|
||||
}
|
||||
|
||||
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
if (likely(skb_queue_len(&sch->q) < sch->limit))
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
|
||||
return qdisc_reshape_fail(skb, sch);
|
||||
}
|
||||
|
||||
static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
if (likely(skb_queue_len(&sch->q) < sch->limit))
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
|
||||
/* queue full, remove one skb to fulfill the limit */
|
||||
__qdisc_queue_drop_head(sch, &sch->q);
|
||||
qdisc_qstats_drop(sch);
|
||||
qdisc_enqueue_tail(skb, sch);
|
||||
|
||||
return NET_XMIT_CN;
|
||||
}
|
||||
|
||||
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
bool bypass;
|
||||
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
|
||||
|
||||
if (opt == NULL) {
|
||||
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
|
||||
|
||||
if (is_bfifo)
|
||||
limit *= psched_mtu(qdisc_dev(sch));
|
||||
|
||||
sch->limit = limit;
|
||||
} else {
|
||||
struct tc_fifo_qopt *ctl = nla_data(opt);
|
||||
|
||||
if (nla_len(opt) < sizeof(*ctl))
|
||||
return -EINVAL;
|
||||
|
||||
sch->limit = ctl->limit;
|
||||
}
|
||||
|
||||
if (is_bfifo)
|
||||
bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
|
||||
else
|
||||
bypass = sch->limit >= 1;
|
||||
|
||||
if (bypass)
|
||||
sch->flags |= TCQ_F_CAN_BYPASS;
|
||||
else
|
||||
sch->flags &= ~TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
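/* For reference (a sketch; device name and values are illustrative only):
 *
 *   tc qdisc add dev eth0 root pfifo limit 100     # limit counted in packets
 *   tc qdisc add dev eth0 root bfifo limit 64kb    # limit counted in bytes
 *
 * When no TCA_OPTIONS attribute is supplied, fifo_init() above falls back to
 * the device tx_queue_len, scaled by the MTU for the byte-based bfifo.
 */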
|
||||
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct tc_fifo_qopt opt = { .limit = sch->limit };
|
||||
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
|
||||
.id = "pfifo",
|
||||
.priv_size = 0,
|
||||
.enqueue = pfifo_enqueue,
|
||||
.dequeue = qdisc_dequeue_head,
|
||||
.peek = qdisc_peek_head,
|
||||
.drop = qdisc_queue_drop,
|
||||
.init = fifo_init,
|
||||
.reset = qdisc_reset_queue,
|
||||
.change = fifo_init,
|
||||
.dump = fifo_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
EXPORT_SYMBOL(pfifo_qdisc_ops);
|
||||
|
||||
struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
|
||||
.id = "bfifo",
|
||||
.priv_size = 0,
|
||||
.enqueue = bfifo_enqueue,
|
||||
.dequeue = qdisc_dequeue_head,
|
||||
.peek = qdisc_peek_head,
|
||||
.drop = qdisc_queue_drop,
|
||||
.init = fifo_init,
|
||||
.reset = qdisc_reset_queue,
|
||||
.change = fifo_init,
|
||||
.dump = fifo_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
EXPORT_SYMBOL(bfifo_qdisc_ops);
|
||||
|
||||
struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
|
||||
.id = "pfifo_head_drop",
|
||||
.priv_size = 0,
|
||||
.enqueue = pfifo_tail_enqueue,
|
||||
.dequeue = qdisc_dequeue_head,
|
||||
.peek = qdisc_peek_head,
|
||||
.drop = qdisc_queue_drop_head,
|
||||
.init = fifo_init,
|
||||
.reset = qdisc_reset_queue,
|
||||
.change = fifo_init,
|
||||
.dump = fifo_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
/* Pass size change message down to embedded FIFO */
|
||||
int fifo_set_limit(struct Qdisc *q, unsigned int limit)
|
||||
{
|
||||
struct nlattr *nla;
|
||||
int ret = -ENOMEM;
|
||||
|
||||
/* Hack to avoid sending change message to non-FIFO */
|
||||
if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
|
||||
return 0;
|
||||
|
||||
nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
|
||||
if (nla) {
|
||||
nla->nla_type = RTM_NEWQDISC;
|
||||
nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
|
||||
((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
|
||||
|
||||
ret = q->ops->change(q, nla);
|
||||
kfree(nla);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(fifo_set_limit);
|
||||
|
||||
struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
|
||||
unsigned int limit)
|
||||
{
|
||||
struct Qdisc *q;
|
||||
int err = -ENOMEM;
|
||||
|
||||
q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
|
||||
if (q) {
|
||||
err = fifo_set_limit(q, limit);
|
||||
if (err < 0) {
|
||||
qdisc_destroy(q);
|
||||
q = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return q ? : ERR_PTR(err);
|
||||
}
|
||||
EXPORT_SYMBOL(fifo_create_dflt);
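/* A typical caller builds its inner queue with this helper, roughly like the
 * sketch below (illustrative only, based on how shaper qdiscs such as TBF use
 * a byte-limited inner FIFO; not code from this file):
 *
 *	struct Qdisc *child;
 *
 *	child = fifo_create_dflt(sch, &bfifo_qdisc_ops, limit_in_bytes);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 */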

847
net/sched/sch_fq.c
Normal file
@@ -0,0 +1,847 @@
/*
|
||||
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
|
||||
*
|
||||
* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Meant to be mostly used for locally generated traffic:
|
||||
* Fast classification depends on skb->sk being set before reaching us.
|
||||
* If not (router workload), we use rxhash as a fallback, with a 32-bit wide hash.
|
||||
* All packets belonging to a socket are considered as a 'flow'.
|
||||
*
|
||||
* Flows are dynamically allocated and stored in a hash table of RB trees
|
||||
* They are also part of one Round Robin 'queues' (new or old flows)
|
||||
*
|
||||
* Burst avoidance (aka pacing) capability :
|
||||
*
|
||||
* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
|
||||
* bunch of packets, and this packet scheduler adds delay between
|
||||
* packets to respect rate limitation.
|
||||
*
|
||||
* enqueue() :
|
||||
* - lookup one RB tree (out of 1024 or more) to find the flow.
|
||||
* If non existent flow, create it, add it to the tree.
|
||||
* Add skb to the per flow list of skb (fifo).
|
||||
* - Use a special fifo for high prio packets
|
||||
*
|
||||
* dequeue() : serves flows in Round Robin
|
||||
* Note : When a flow becomes empty, we do not immediately remove it from
|
||||
* rb trees, for performance reasons (it's expected to send additional packets,
|
||||
* or SLAB cache will reuse socket for another flow)
|
||||
*/
|
||||
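/* Typical user-space setup for this qdisc (a sketch; the device name and the
 * numeric values are illustrative only):
 *
 *   tc qdisc replace dev eth0 root fq
 *   tc qdisc replace dev eth0 root fq limit 10000 flow_limit 100 \
 *          quantum 3028 initial_quantum 15140 maxrate 1gbit
 *
 * The second form mirrors the TCA_FQ_* attributes handled in fq_change()
 * below; the first relies on the defaults set up in fq_init().
 */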
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/sock.h>
|
||||
#include <net/tcp_states.h>
|
||||
|
||||
/*
|
||||
* Per flow structure, dynamically allocated
|
||||
*/
|
||||
struct fq_flow {
|
||||
struct sk_buff *head; /* list of skbs for this flow : first skb */
|
||||
union {
|
||||
struct sk_buff *tail; /* last skb in the list */
|
||||
unsigned long age; /* jiffies when flow was emptied, for gc */
|
||||
};
|
||||
struct rb_node fq_node; /* anchor in fq_root[] trees */
|
||||
struct sock *sk;
|
||||
int qlen; /* number of packets in flow queue */
|
||||
int credit;
|
||||
u32 socket_hash; /* sk_hash */
|
||||
struct fq_flow *next; /* next pointer in RR lists, or &detached */
|
||||
|
||||
struct rb_node rate_node; /* anchor in q->delayed tree */
|
||||
u64 time_next_packet;
|
||||
};
|
||||
|
||||
struct fq_flow_head {
|
||||
struct fq_flow *first;
|
||||
struct fq_flow *last;
|
||||
};
|
||||
|
||||
struct fq_sched_data {
|
||||
struct fq_flow_head new_flows;
|
||||
|
||||
struct fq_flow_head old_flows;
|
||||
|
||||
struct rb_root delayed; /* for rate limited flows */
|
||||
u64 time_next_delayed_flow;
|
||||
|
||||
struct fq_flow internal; /* for non classified or high prio packets */
|
||||
u32 quantum;
|
||||
u32 initial_quantum;
|
||||
u32 flow_refill_delay;
|
||||
u32 flow_max_rate; /* optional max rate per flow */
|
||||
u32 flow_plimit; /* max packets per flow */
|
||||
struct rb_root *fq_root;
|
||||
u8 rate_enable;
|
||||
u8 fq_trees_log;
|
||||
|
||||
u32 flows;
|
||||
u32 inactive_flows;
|
||||
u32 throttled_flows;
|
||||
|
||||
u64 stat_gc_flows;
|
||||
u64 stat_internal_packets;
|
||||
u64 stat_tcp_retrans;
|
||||
u64 stat_throttled;
|
||||
u64 stat_flows_plimit;
|
||||
u64 stat_pkts_too_long;
|
||||
u64 stat_allocation_errors;
|
||||
struct qdisc_watchdog watchdog;
|
||||
};
|
||||
|
||||
/* special value to mark a detached flow (not on old/new list) */
|
||||
static struct fq_flow detached, throttled;
|
||||
|
||||
static void fq_flow_set_detached(struct fq_flow *f)
|
||||
{
|
||||
f->next = &detached;
|
||||
f->age = jiffies;
|
||||
}
|
||||
|
||||
static bool fq_flow_is_detached(const struct fq_flow *f)
|
||||
{
|
||||
return f->next == &detached;
|
||||
}
|
||||
|
||||
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
|
||||
{
|
||||
struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
|
||||
|
||||
while (*p) {
|
||||
struct fq_flow *aux;
|
||||
|
||||
parent = *p;
|
||||
aux = container_of(parent, struct fq_flow, rate_node);
|
||||
if (f->time_next_packet >= aux->time_next_packet)
|
||||
p = &parent->rb_right;
|
||||
else
|
||||
p = &parent->rb_left;
|
||||
}
|
||||
rb_link_node(&f->rate_node, parent, p);
|
||||
rb_insert_color(&f->rate_node, &q->delayed);
|
||||
q->throttled_flows++;
|
||||
q->stat_throttled++;
|
||||
|
||||
f->next = &throttled;
|
||||
if (q->time_next_delayed_flow > f->time_next_packet)
|
||||
q->time_next_delayed_flow = f->time_next_packet;
|
||||
}
|
||||
|
||||
|
||||
static struct kmem_cache *fq_flow_cachep __read_mostly;
|
||||
|
||||
static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
|
||||
{
|
||||
if (head->first)
|
||||
head->last->next = flow;
|
||||
else
|
||||
head->first = flow;
|
||||
head->last = flow;
|
||||
flow->next = NULL;
|
||||
}
|
||||
|
||||
/* limit number of collected flows per round */
|
||||
#define FQ_GC_MAX 8
|
||||
#define FQ_GC_AGE (3*HZ)
|
||||
|
||||
static bool fq_gc_candidate(const struct fq_flow *f)
|
||||
{
|
||||
return fq_flow_is_detached(f) &&
|
||||
time_after(jiffies, f->age + FQ_GC_AGE);
|
||||
}
|
||||
|
||||
static void fq_gc(struct fq_sched_data *q,
|
||||
struct rb_root *root,
|
||||
struct sock *sk)
|
||||
{
|
||||
struct fq_flow *f, *tofree[FQ_GC_MAX];
|
||||
struct rb_node **p, *parent;
|
||||
int fcnt = 0;
|
||||
|
||||
p = &root->rb_node;
|
||||
parent = NULL;
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
|
||||
f = container_of(parent, struct fq_flow, fq_node);
|
||||
if (f->sk == sk)
|
||||
break;
|
||||
|
||||
if (fq_gc_candidate(f)) {
|
||||
tofree[fcnt++] = f;
|
||||
if (fcnt == FQ_GC_MAX)
|
||||
break;
|
||||
}
|
||||
|
||||
if (f->sk > sk)
|
||||
p = &parent->rb_right;
|
||||
else
|
||||
p = &parent->rb_left;
|
||||
}
|
||||
|
||||
q->flows -= fcnt;
|
||||
q->inactive_flows -= fcnt;
|
||||
q->stat_gc_flows += fcnt;
|
||||
while (fcnt) {
|
||||
struct fq_flow *f = tofree[--fcnt];
|
||||
|
||||
rb_erase(&f->fq_node, root);
|
||||
kmem_cache_free(fq_flow_cachep, f);
|
||||
}
|
||||
}
|
||||
|
||||
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
|
||||
{
|
||||
struct rb_node **p, *parent;
|
||||
struct sock *sk = skb->sk;
|
||||
struct rb_root *root;
|
||||
struct fq_flow *f;
|
||||
|
||||
/* warning: no starvation prevention... */
|
||||
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
|
||||
return &q->internal;
|
||||
|
||||
if (unlikely(!sk)) {
|
||||
/* By forcing low order bit to 1, we make sure to not
|
||||
* collide with a local flow (socket pointers are word aligned)
|
||||
*/
|
||||
sk = (struct sock *)(skb_get_hash(skb) | 1L);
|
||||
}
|
||||
|
||||
root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
|
||||
|
||||
if (q->flows >= (2U << q->fq_trees_log) &&
|
||||
q->inactive_flows > q->flows/2)
|
||||
fq_gc(q, root, sk);
|
||||
|
||||
p = &root->rb_node;
|
||||
parent = NULL;
|
||||
while (*p) {
|
||||
parent = *p;
|
||||
|
||||
f = container_of(parent, struct fq_flow, fq_node);
|
||||
if (f->sk == sk) {
|
||||
/* socket might have been reallocated, so check
|
||||
* if its sk_hash is the same.
|
||||
* If not, we need to refill credit with
|
||||
* initial quantum
|
||||
*/
|
||||
if (unlikely(skb->sk &&
|
||||
f->socket_hash != sk->sk_hash)) {
|
||||
f->credit = q->initial_quantum;
|
||||
f->socket_hash = sk->sk_hash;
|
||||
f->time_next_packet = 0ULL;
|
||||
}
|
||||
return f;
|
||||
}
|
||||
if (f->sk > sk)
|
||||
p = &parent->rb_right;
|
||||
else
|
||||
p = &parent->rb_left;
|
||||
}
|
||||
|
||||
f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
|
||||
if (unlikely(!f)) {
|
||||
q->stat_allocation_errors++;
|
||||
return &q->internal;
|
||||
}
|
||||
fq_flow_set_detached(f);
|
||||
f->sk = sk;
|
||||
if (skb->sk)
|
||||
f->socket_hash = sk->sk_hash;
|
||||
f->credit = q->initial_quantum;
|
||||
|
||||
rb_link_node(&f->fq_node, parent, p);
|
||||
rb_insert_color(&f->fq_node, root);
|
||||
|
||||
q->flows++;
|
||||
q->inactive_flows++;
|
||||
return f;
|
||||
}
|
||||
|
||||
|
||||
/* remove one skb from head of flow queue */
|
||||
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
|
||||
{
|
||||
struct sk_buff *skb = flow->head;
|
||||
|
||||
if (skb) {
|
||||
flow->head = skb->next;
|
||||
skb->next = NULL;
|
||||
flow->qlen--;
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
sch->q.qlen--;
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
/* We might add in the future detection of retransmits
|
||||
* For the time being, just return false
|
||||
*/
|
||||
static bool skb_is_retransmit(struct sk_buff *skb)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/* add skb to flow queue
|
||||
* flow queue is a linked list, kind of FIFO, except for TCP retransmits
|
||||
* We special case tcp retransmits to be transmitted before other packets.
|
||||
* We rely on the fact that TCP retransmits are unlikely, so we do not waste
|
||||
* a separate queue or a pointer.
|
||||
* head-> [retrans pkt 1]
|
||||
* [retrans pkt 2]
|
||||
* [ normal pkt 1]
|
||||
* [ normal pkt 2]
|
||||
* [ normal pkt 3]
|
||||
* tail-> [ normal pkt 4]
|
||||
*/
|
||||
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
|
||||
{
|
||||
struct sk_buff *prev, *head = flow->head;
|
||||
|
||||
skb->next = NULL;
|
||||
if (!head) {
|
||||
flow->head = skb;
|
||||
flow->tail = skb;
|
||||
return;
|
||||
}
|
||||
if (likely(!skb_is_retransmit(skb))) {
|
||||
flow->tail->next = skb;
|
||||
flow->tail = skb;
|
||||
return;
|
||||
}
|
||||
|
||||
/* This skb is a tcp retransmit,
|
||||
* find the last retrans packet in the queue
|
||||
*/
|
||||
prev = NULL;
|
||||
while (skb_is_retransmit(head)) {
|
||||
prev = head;
|
||||
head = head->next;
|
||||
if (!head)
|
||||
break;
|
||||
}
|
||||
if (!prev) { /* no rtx packet in queue, become the new head */
|
||||
skb->next = flow->head;
|
||||
flow->head = skb;
|
||||
} else {
|
||||
if (prev == flow->tail)
|
||||
flow->tail = skb;
|
||||
else
|
||||
skb->next = prev->next;
|
||||
prev->next = skb;
|
||||
}
|
||||
}
|
||||
|
||||
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
struct fq_flow *f;
|
||||
|
||||
if (unlikely(sch->q.qlen >= sch->limit))
|
||||
return qdisc_drop(skb, sch);
|
||||
|
||||
f = fq_classify(skb, q);
|
||||
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
|
||||
q->stat_flows_plimit++;
|
||||
return qdisc_drop(skb, sch);
|
||||
}
|
||||
|
||||
f->qlen++;
|
||||
if (skb_is_retransmit(skb))
|
||||
q->stat_tcp_retrans++;
|
||||
qdisc_qstats_backlog_inc(sch, skb);
|
||||
if (fq_flow_is_detached(f)) {
|
||||
fq_flow_add_tail(&q->new_flows, f);
|
||||
if (time_after(jiffies, f->age + q->flow_refill_delay))
|
||||
f->credit = max_t(u32, f->credit, q->quantum);
|
||||
q->inactive_flows--;
|
||||
}
|
||||
|
||||
/* Note: this overwrites f->age */
|
||||
flow_queue_add(f, skb);
|
||||
|
||||
if (unlikely(f == &q->internal)) {
|
||||
q->stat_internal_packets++;
|
||||
}
|
||||
sch->q.qlen++;
|
||||
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
|
||||
{
|
||||
struct rb_node *p;
|
||||
|
||||
if (q->time_next_delayed_flow > now)
|
||||
return;
|
||||
|
||||
q->time_next_delayed_flow = ~0ULL;
|
||||
while ((p = rb_first(&q->delayed)) != NULL) {
|
||||
struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
|
||||
|
||||
if (f->time_next_packet > now) {
|
||||
q->time_next_delayed_flow = f->time_next_packet;
|
||||
break;
|
||||
}
|
||||
rb_erase(p, &q->delayed);
|
||||
q->throttled_flows--;
|
||||
fq_flow_add_tail(&q->old_flows, f);
|
||||
}
|
||||
}
|
||||
|
||||
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
u64 now = ktime_get_ns();
|
||||
struct fq_flow_head *head;
|
||||
struct sk_buff *skb;
|
||||
struct fq_flow *f;
|
||||
u32 rate;
|
||||
|
||||
skb = fq_dequeue_head(sch, &q->internal);
|
||||
if (skb)
|
||||
goto out;
|
||||
fq_check_throttled(q, now);
|
||||
begin:
|
||||
head = &q->new_flows;
|
||||
if (!head->first) {
|
||||
head = &q->old_flows;
|
||||
if (!head->first) {
|
||||
if (q->time_next_delayed_flow != ~0ULL)
|
||||
qdisc_watchdog_schedule_ns(&q->watchdog,
|
||||
q->time_next_delayed_flow,
|
||||
false);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
f = head->first;
|
||||
|
||||
if (f->credit <= 0) {
|
||||
f->credit += q->quantum;
|
||||
head->first = f->next;
|
||||
fq_flow_add_tail(&q->old_flows, f);
|
||||
goto begin;
|
||||
}
|
||||
|
||||
if (unlikely(f->head && now < f->time_next_packet)) {
|
||||
head->first = f->next;
|
||||
fq_flow_set_throttled(q, f);
|
||||
goto begin;
|
||||
}
|
||||
|
||||
skb = fq_dequeue_head(sch, f);
|
||||
if (!skb) {
|
||||
head->first = f->next;
|
||||
/* force a pass through old_flows to prevent starvation */
|
||||
if ((head == &q->new_flows) && q->old_flows.first) {
|
||||
fq_flow_add_tail(&q->old_flows, f);
|
||||
} else {
|
||||
fq_flow_set_detached(f);
|
||||
q->inactive_flows++;
|
||||
}
|
||||
goto begin;
|
||||
}
|
||||
prefetch(&skb->end);
|
||||
f->time_next_packet = now;
|
||||
f->credit -= qdisc_pkt_len(skb);
|
||||
|
||||
if (f->credit > 0 || !q->rate_enable)
|
||||
goto out;
|
||||
|
||||
rate = q->flow_max_rate;
|
||||
if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
|
||||
rate = min(skb->sk->sk_pacing_rate, rate);
|
||||
|
||||
if (rate != ~0U) {
|
||||
u32 plen = max(qdisc_pkt_len(skb), q->quantum);
|
||||
u64 len = (u64)plen * NSEC_PER_SEC;
|
||||
|
||||
if (likely(rate))
|
||||
do_div(len, rate);
|
||||
/* Since socket rate can change later,
|
||||
* clamp the delay to 125 ms.
|
||||
* TODO: maybe segment the too big skb, as in commit
|
||||
* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
|
||||
*/
|
||||
if (unlikely(len > 125 * NSEC_PER_MSEC)) {
|
||||
len = 125 * NSEC_PER_MSEC;
|
||||
q->stat_pkts_too_long++;
|
||||
}
|
||||
|
||||
f->time_next_packet = now + len;
|
||||
}
|
||||
out:
|
||||
qdisc_bstats_update(sch, skb);
|
||||
return skb;
|
||||
}
|
||||
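/* Worked example for the pacing computation in fq_dequeue() above (assumed
 * numbers, for illustration only): with a 1514 byte packet and
 * sk_pacing_rate = 125000000 bytes/s (1 Gbit/s),
 *
 *	len = 1514 * NSEC_PER_SEC / 125000000 = 12112 ns
 *
 * so the flow's next packet is not served before now + ~12.1 us. At very low
 * rates the computed gap would exceed 125 ms; it is then clamped and
 * stat_pkts_too_long is incremented.
 */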
|
||||
static void fq_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
struct rb_root *root;
|
||||
struct sk_buff *skb;
|
||||
struct rb_node *p;
|
||||
struct fq_flow *f;
|
||||
unsigned int idx;
|
||||
|
||||
while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
|
||||
kfree_skb(skb);
|
||||
|
||||
if (!q->fq_root)
|
||||
return;
|
||||
|
||||
for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
|
||||
root = &q->fq_root[idx];
|
||||
while ((p = rb_first(root)) != NULL) {
|
||||
f = container_of(p, struct fq_flow, fq_node);
|
||||
rb_erase(p, root);
|
||||
|
||||
while ((skb = fq_dequeue_head(sch, f)) != NULL)
|
||||
kfree_skb(skb);
|
||||
|
||||
kmem_cache_free(fq_flow_cachep, f);
|
||||
}
|
||||
}
|
||||
q->new_flows.first = NULL;
|
||||
q->old_flows.first = NULL;
|
||||
q->delayed = RB_ROOT;
|
||||
q->flows = 0;
|
||||
q->inactive_flows = 0;
|
||||
q->throttled_flows = 0;
|
||||
}
|
||||
|
||||
static void fq_rehash(struct fq_sched_data *q,
|
||||
struct rb_root *old_array, u32 old_log,
|
||||
struct rb_root *new_array, u32 new_log)
|
||||
{
|
||||
struct rb_node *op, **np, *parent;
|
||||
struct rb_root *oroot, *nroot;
|
||||
struct fq_flow *of, *nf;
|
||||
int fcnt = 0;
|
||||
u32 idx;
|
||||
|
||||
for (idx = 0; idx < (1U << old_log); idx++) {
|
||||
oroot = &old_array[idx];
|
||||
while ((op = rb_first(oroot)) != NULL) {
|
||||
rb_erase(op, oroot);
|
||||
of = container_of(op, struct fq_flow, fq_node);
|
||||
if (fq_gc_candidate(of)) {
|
||||
fcnt++;
|
||||
kmem_cache_free(fq_flow_cachep, of);
|
||||
continue;
|
||||
}
|
||||
nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
|
||||
|
||||
np = &nroot->rb_node;
|
||||
parent = NULL;
|
||||
while (*np) {
|
||||
parent = *np;
|
||||
|
||||
nf = container_of(parent, struct fq_flow, fq_node);
|
||||
BUG_ON(nf->sk == of->sk);
|
||||
|
||||
if (nf->sk > of->sk)
|
||||
np = &parent->rb_right;
|
||||
else
|
||||
np = &parent->rb_left;
|
||||
}
|
||||
|
||||
rb_link_node(&of->fq_node, parent, np);
|
||||
rb_insert_color(&of->fq_node, nroot);
|
||||
}
|
||||
}
|
||||
q->flows -= fcnt;
|
||||
q->inactive_flows -= fcnt;
|
||||
q->stat_gc_flows += fcnt;
|
||||
}
|
||||
|
||||
static void *fq_alloc_node(size_t sz, int node)
|
||||
{
|
||||
void *ptr;
|
||||
|
||||
ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
|
||||
if (!ptr)
|
||||
ptr = vmalloc_node(sz, node);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static void fq_free(void *addr)
|
||||
{
|
||||
kvfree(addr);
|
||||
}
|
||||
|
||||
static int fq_resize(struct Qdisc *sch, u32 log)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
struct rb_root *array;
|
||||
void *old_fq_root;
|
||||
u32 idx;
|
||||
|
||||
if (q->fq_root && log == q->fq_trees_log)
|
||||
return 0;
|
||||
|
||||
/* If XPS was setup, we can allocate memory on right NUMA node */
|
||||
array = fq_alloc_node(sizeof(struct rb_root) << log,
|
||||
netdev_queue_numa_node_read(sch->dev_queue));
|
||||
if (!array)
|
||||
return -ENOMEM;
|
||||
|
||||
for (idx = 0; idx < (1U << log); idx++)
|
||||
array[idx] = RB_ROOT;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
|
||||
old_fq_root = q->fq_root;
|
||||
if (old_fq_root)
|
||||
fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
|
||||
|
||||
q->fq_root = array;
|
||||
q->fq_trees_log = log;
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
fq_free(old_fq_root);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
|
||||
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
|
||||
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
|
||||
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
|
||||
[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
|
||||
[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
|
||||
[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
|
||||
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
|
||||
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
|
||||
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_FQ_MAX + 1];
|
||||
int err, drop_count = 0;
|
||||
u32 fq_log;
|
||||
|
||||
if (!opt)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
|
||||
fq_log = q->fq_trees_log;
|
||||
|
||||
if (tb[TCA_FQ_BUCKETS_LOG]) {
|
||||
u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
|
||||
|
||||
if (nval >= 1 && nval <= ilog2(256*1024))
|
||||
fq_log = nval;
|
||||
else
|
||||
err = -EINVAL;
|
||||
}
|
||||
if (tb[TCA_FQ_PLIMIT])
|
||||
sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
|
||||
|
||||
if (tb[TCA_FQ_FLOW_PLIMIT])
|
||||
q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
|
||||
|
||||
if (tb[TCA_FQ_QUANTUM])
|
||||
q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
|
||||
|
||||
if (tb[TCA_FQ_INITIAL_QUANTUM])
|
||||
q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
|
||||
|
||||
if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
|
||||
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
|
||||
nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
|
||||
|
||||
if (tb[TCA_FQ_FLOW_MAX_RATE])
|
||||
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
|
||||
|
||||
if (tb[TCA_FQ_RATE_ENABLE]) {
|
||||
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
|
||||
|
||||
if (enable <= 1)
|
||||
q->rate_enable = enable;
|
||||
else
|
||||
err = -EINVAL;
|
||||
}
|
||||
|
||||
if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
|
||||
u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]) ;
|
||||
|
||||
q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
|
||||
}
|
||||
|
||||
if (!err) {
|
||||
sch_tree_unlock(sch);
|
||||
err = fq_resize(sch, fq_log);
|
||||
sch_tree_lock(sch);
|
||||
}
|
||||
while (sch->q.qlen > sch->limit) {
|
||||
struct sk_buff *skb = fq_dequeue(sch);
|
||||
|
||||
if (!skb)
|
||||
break;
|
||||
kfree_skb(skb);
|
||||
drop_count++;
|
||||
}
|
||||
qdisc_tree_decrease_qlen(sch, drop_count);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void fq_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
fq_reset(sch);
|
||||
fq_free(q->fq_root);
|
||||
qdisc_watchdog_cancel(&q->watchdog);
|
||||
}
|
||||
|
||||
static int fq_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
int err;
|
||||
|
||||
sch->limit = 10000;
|
||||
q->flow_plimit = 100;
|
||||
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
|
||||
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
|
||||
q->flow_refill_delay = msecs_to_jiffies(40);
|
||||
q->flow_max_rate = ~0U;
|
||||
q->rate_enable = 1;
|
||||
q->new_flows.first = NULL;
|
||||
q->old_flows.first = NULL;
|
||||
q->delayed = RB_ROOT;
|
||||
q->fq_root = NULL;
|
||||
q->fq_trees_log = ilog2(1024);
|
||||
qdisc_watchdog_init(&q->watchdog, sch);
|
||||
|
||||
if (opt)
|
||||
err = fq_change(sch, opt);
|
||||
else
|
||||
err = fq_resize(sch, q->fq_trees_log);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts;
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
|
||||
|
||||
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
|
||||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
|
||||
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
|
||||
nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
|
||||
nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
|
||||
nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
|
||||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
|
||||
jiffies_to_usecs(q->flow_refill_delay)) ||
|
||||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
|
||||
goto nla_put_failure;
|
||||
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct fq_sched_data *q = qdisc_priv(sch);
|
||||
u64 now = ktime_get_ns();
|
||||
struct tc_fq_qd_stats st = {
|
||||
.gc_flows = q->stat_gc_flows,
|
||||
.highprio_packets = q->stat_internal_packets,
|
||||
.tcp_retrans = q->stat_tcp_retrans,
|
||||
.throttled = q->stat_throttled,
|
||||
.flows_plimit = q->stat_flows_plimit,
|
||||
.pkts_too_long = q->stat_pkts_too_long,
|
||||
.allocation_errors = q->stat_allocation_errors,
|
||||
.flows = q->flows,
|
||||
.inactive_flows = q->inactive_flows,
|
||||
.throttled_flows = q->throttled_flows,
|
||||
.time_next_delayed_flow = q->time_next_delayed_flow - now,
|
||||
};
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
|
||||
.id = "fq",
|
||||
.priv_size = sizeof(struct fq_sched_data),
|
||||
|
||||
.enqueue = fq_enqueue,
|
||||
.dequeue = fq_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.init = fq_init,
|
||||
.reset = fq_reset,
|
||||
.destroy = fq_destroy,
|
||||
.change = fq_change,
|
||||
.dump = fq_dump,
|
||||
.dump_stats = fq_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init fq_module_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
fq_flow_cachep = kmem_cache_create("fq_flow_cache",
|
||||
sizeof(struct fq_flow),
|
||||
0, 0, NULL);
|
||||
if (!fq_flow_cachep)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = register_qdisc(&fq_qdisc_ops);
|
||||
if (ret)
|
||||
kmem_cache_destroy(fq_flow_cachep);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __exit fq_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&fq_qdisc_ops);
|
||||
kmem_cache_destroy(fq_flow_cachep);
|
||||
}
|
||||
|
||||
module_init(fq_module_init)
|
||||
module_exit(fq_module_exit)
|
||||
MODULE_AUTHOR("Eric Dumazet");
|
||||
MODULE_LICENSE("GPL");

624
net/sched/sch_fq_codel.c
Normal file
@@ -0,0 +1,624 @@
/*
|
||||
* Fair Queue CoDel discipline
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/flow_keys.h>
|
||||
#include <net/codel.h>
|
||||
|
||||
/* Fair Queue CoDel.
|
||||
*
|
||||
* Principles :
|
||||
* Packets are classified (internal classifier or external) on flows.
|
||||
* This is a Stochastic model (as we use a hash, several flows
|
||||
* might be hashed on same slot)
|
||||
* Each flow has a CoDel managed queue.
|
||||
* Flows are linked onto two (Round Robin) lists,
|
||||
* so that new flows have priority on old ones.
|
||||
*
|
||||
* For a given flow, packets are not reordered (CoDel uses a FIFO)
|
||||
* head drops only.
|
||||
* ECN capability is on by default.
|
||||
* Low memory footprint (64 bytes per flow)
|
||||
*/
|
||||
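/* Typical user-space setup (a sketch; device name and values are illustrative
 * only):
 *
 *   tc qdisc replace dev eth0 root fq_codel
 *   tc qdisc replace dev eth0 root fq_codel limit 2048 flows 1024 \
 *          target 5ms interval 100ms ecn
 *
 * With no options, fq_codel_init() below uses 10240 packets, 1024 flows and
 * ECN enabled.
 */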
|
||||
struct fq_codel_flow {
|
||||
struct sk_buff *head;
|
||||
struct sk_buff *tail;
|
||||
struct list_head flowchain;
|
||||
int deficit;
|
||||
u32 dropped; /* number of drops (or ECN marks) on this flow */
|
||||
struct codel_vars cvars;
|
||||
}; /* please try to keep this structure <= 64 bytes */
|
||||
|
||||
struct fq_codel_sched_data {
|
||||
struct tcf_proto __rcu *filter_list; /* optional external classifier */
|
||||
struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
|
||||
u32 *backlogs; /* backlog table [flows_cnt] */
|
||||
u32 flows_cnt; /* number of flows */
|
||||
u32 perturbation; /* hash perturbation */
|
||||
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
|
||||
struct codel_params cparams;
|
||||
struct codel_stats cstats;
|
||||
u32 drop_overlimit;
|
||||
u32 new_flow_count;
|
||||
|
||||
struct list_head new_flows; /* list of new flows */
|
||||
struct list_head old_flows; /* list of old flows */
|
||||
};
|
||||
|
||||
static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
struct flow_keys keys;
|
||||
unsigned int hash;
|
||||
|
||||
skb_flow_dissect(skb, &keys);
|
||||
hash = jhash_3words((__force u32)keys.dst,
|
||||
(__force u32)keys.src ^ keys.ip_proto,
|
||||
(__force u32)keys.ports, q->perturbation);
|
||||
|
||||
return reciprocal_scale(hash, q->flows_cnt);
|
||||
}
|
||||
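/* reciprocal_scale() maps the 32-bit hash onto [0, flows_cnt) without a
 * modulo: roughly ((u64)hash * flows_cnt) >> 32. As an illustrative example,
 * hash 0x80000000 with flows_cnt = 1024 (the default set in fq_codel_init())
 * lands in bucket 512.
 */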
|
||||
static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
|
||||
int *qerr)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct tcf_proto *filter;
|
||||
struct tcf_result res;
|
||||
int result;
|
||||
|
||||
if (TC_H_MAJ(skb->priority) == sch->handle &&
|
||||
TC_H_MIN(skb->priority) > 0 &&
|
||||
TC_H_MIN(skb->priority) <= q->flows_cnt)
|
||||
return TC_H_MIN(skb->priority);
|
||||
|
||||
filter = rcu_dereference_bh(q->filter_list);
|
||||
if (!filter)
|
||||
return fq_codel_hash(q, skb) + 1;
|
||||
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
result = tc_classify(skb, filter, &res);
|
||||
if (result >= 0) {
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (result) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
if (TC_H_MIN(res.classid) <= q->flows_cnt)
|
||||
return TC_H_MIN(res.classid);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* helper functions: might be changed when/if skb uses a standard list_head */
|
||||
|
||||
/* remove one skb from head of slot queue */
|
||||
static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
|
||||
{
|
||||
struct sk_buff *skb = flow->head;
|
||||
|
||||
flow->head = skb->next;
|
||||
skb->next = NULL;
|
||||
return skb;
|
||||
}
|
||||
|
||||
/* add skb to flow queue (tail add) */
|
||||
static inline void flow_queue_add(struct fq_codel_flow *flow,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
if (flow->head == NULL)
|
||||
flow->head = skb;
|
||||
else
|
||||
flow->tail->next = skb;
|
||||
flow->tail = skb;
|
||||
skb->next = NULL;
|
||||
}
|
||||
|
||||
static unsigned int fq_codel_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
unsigned int maxbacklog = 0, idx = 0, i, len;
|
||||
struct fq_codel_flow *flow;
|
||||
|
||||
/* Queue is full! Find the fat flow and drop packet from it.
|
||||
* This might sound expensive, but with 1024 flows, we scan
|
||||
* 4KB of memory, and we don't need to handle a complex tree
|
||||
* in fast path (packet queue/enqueue) with many cache misses.
|
||||
*/
|
||||
for (i = 0; i < q->flows_cnt; i++) {
|
||||
if (q->backlogs[i] > maxbacklog) {
|
||||
maxbacklog = q->backlogs[i];
|
||||
idx = i;
|
||||
}
|
||||
}
|
||||
flow = &q->flows[idx];
|
||||
skb = dequeue_head(flow);
|
||||
len = qdisc_pkt_len(skb);
|
||||
q->backlogs[idx] -= len;
|
||||
kfree_skb(skb);
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_drop(sch);
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
flow->dropped++;
|
||||
return idx;
|
||||
}
|
||||
|
||||
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int idx;
|
||||
struct fq_codel_flow *flow;
|
||||
int uninitialized_var(ret);
|
||||
|
||||
idx = fq_codel_classify(skb, sch, &ret);
|
||||
if (idx == 0) {
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
idx--;
|
||||
|
||||
codel_set_enqueue_time(skb);
|
||||
flow = &q->flows[idx];
|
||||
flow_queue_add(flow, skb);
|
||||
q->backlogs[idx] += qdisc_pkt_len(skb);
|
||||
qdisc_qstats_backlog_inc(sch, skb);
|
||||
|
||||
if (list_empty(&flow->flowchain)) {
|
||||
list_add_tail(&flow->flowchain, &q->new_flows);
|
||||
q->new_flow_count++;
|
||||
flow->deficit = q->quantum;
|
||||
flow->dropped = 0;
|
||||
}
|
||||
if (++sch->q.qlen <= sch->limit)
|
||||
return NET_XMIT_SUCCESS;
|
||||
|
||||
q->drop_overlimit++;
|
||||
/* Return Congestion Notification only if we dropped a packet
|
||||
* from this flow.
|
||||
*/
|
||||
if (fq_codel_drop(sch) == idx)
|
||||
return NET_XMIT_CN;
|
||||
|
||||
/* As we dropped a packet, better let upper stack know this */
|
||||
qdisc_tree_decrease_qlen(sch, 1);
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
/* This is the specific function called from codel_dequeue()
|
||||
* to dequeue a packet from queue. Note: backlog is handled in
|
||||
* codel, we don't need to reduce it here.
|
||||
*/
|
||||
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct fq_codel_flow *flow;
|
||||
struct sk_buff *skb = NULL;
|
||||
|
||||
flow = container_of(vars, struct fq_codel_flow, cvars);
|
||||
if (flow->head) {
|
||||
skb = dequeue_head(flow);
|
||||
q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
|
||||
sch->q.qlen--;
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
struct fq_codel_flow *flow;
|
||||
struct list_head *head;
|
||||
u32 prev_drop_count, prev_ecn_mark;
|
||||
|
||||
begin:
|
||||
head = &q->new_flows;
|
||||
if (list_empty(head)) {
|
||||
head = &q->old_flows;
|
||||
if (list_empty(head))
|
||||
return NULL;
|
||||
}
|
||||
flow = list_first_entry(head, struct fq_codel_flow, flowchain);
|
||||
|
||||
if (flow->deficit <= 0) {
|
||||
flow->deficit += q->quantum;
|
||||
list_move_tail(&flow->flowchain, &q->old_flows);
|
||||
goto begin;
|
||||
}
|
||||
|
||||
prev_drop_count = q->cstats.drop_count;
|
||||
prev_ecn_mark = q->cstats.ecn_mark;
|
||||
|
||||
skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
|
||||
dequeue);
|
||||
|
||||
flow->dropped += q->cstats.drop_count - prev_drop_count;
|
||||
flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
|
||||
|
||||
if (!skb) {
|
||||
/* force a pass through old_flows to prevent starvation */
|
||||
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
|
||||
list_move_tail(&flow->flowchain, &q->old_flows);
|
||||
else
|
||||
list_del_init(&flow->flowchain);
|
||||
goto begin;
|
||||
}
|
||||
qdisc_bstats_update(sch, skb);
|
||||
flow->deficit -= qdisc_pkt_len(skb);
|
||||
* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
|
||||
* or HTB crashes. Defer it for next round.
|
||||
*/
|
||||
if (q->cstats.drop_count && sch->q.qlen) {
|
||||
qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
|
||||
q->cstats.drop_count = 0;
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
static void fq_codel_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
|
||||
while ((skb = fq_codel_dequeue(sch)) != NULL)
|
||||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
|
||||
[TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 },
|
||||
[TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 },
|
||||
[TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
|
||||
[TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
|
||||
[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
|
||||
[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
|
||||
int err;
|
||||
|
||||
if (!opt)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
if (tb[TCA_FQ_CODEL_FLOWS]) {
|
||||
if (q->flows)
|
||||
return -EINVAL;
|
||||
q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
|
||||
if (!q->flows_cnt ||
|
||||
q->flows_cnt > 65536)
|
||||
return -EINVAL;
|
||||
}
|
||||
sch_tree_lock(sch);
|
||||
|
||||
if (tb[TCA_FQ_CODEL_TARGET]) {
|
||||
u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
|
||||
|
||||
q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
|
||||
}
|
||||
|
||||
if (tb[TCA_FQ_CODEL_INTERVAL]) {
|
||||
u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
|
||||
|
||||
q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
|
||||
}
|
||||
|
||||
if (tb[TCA_FQ_CODEL_LIMIT])
|
||||
sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
|
||||
|
||||
if (tb[TCA_FQ_CODEL_ECN])
|
||||
q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
|
||||
|
||||
if (tb[TCA_FQ_CODEL_QUANTUM])
|
||||
q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
|
||||
|
||||
while (sch->q.qlen > sch->limit) {
|
||||
struct sk_buff *skb = fq_codel_dequeue(sch);
|
||||
|
||||
kfree_skb(skb);
|
||||
q->cstats.drop_count++;
|
||||
}
|
||||
qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
|
||||
q->cstats.drop_count = 0;
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *fq_codel_zalloc(size_t sz)
|
||||
{
|
||||
void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
|
||||
|
||||
if (!ptr)
|
||||
ptr = vzalloc(sz);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static void fq_codel_free(void *addr)
|
||||
{
|
||||
kvfree(addr);
|
||||
}
|
||||
|
||||
static void fq_codel_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
fq_codel_free(q->backlogs);
|
||||
fq_codel_free(q->flows);
|
||||
}
|
||||
|
||||
static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
sch->limit = 10*1024;
|
||||
q->flows_cnt = 1024;
|
||||
q->quantum = psched_mtu(qdisc_dev(sch));
|
||||
q->perturbation = prandom_u32();
|
||||
INIT_LIST_HEAD(&q->new_flows);
|
||||
INIT_LIST_HEAD(&q->old_flows);
|
||||
codel_params_init(&q->cparams);
|
||||
codel_stats_init(&q->cstats);
|
||||
q->cparams.ecn = true;
|
||||
|
||||
if (opt) {
|
||||
int err = fq_codel_change(sch, opt);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!q->flows) {
|
||||
q->flows = fq_codel_zalloc(q->flows_cnt *
|
||||
sizeof(struct fq_codel_flow));
|
||||
if (!q->flows)
|
||||
return -ENOMEM;
|
||||
q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
|
||||
if (!q->backlogs) {
|
||||
fq_codel_free(q->flows);
|
||||
return -ENOMEM;
|
||||
}
|
||||
for (i = 0; i < q->flows_cnt; i++) {
|
||||
struct fq_codel_flow *flow = q->flows + i;
|
||||
|
||||
INIT_LIST_HEAD(&flow->flowchain);
|
||||
codel_vars_init(&flow->cvars);
|
||||
}
|
||||
}
|
||||
if (sch->limit >= 1)
|
||||
sch->flags |= TCQ_F_CAN_BYPASS;
|
||||
else
|
||||
sch->flags &= ~TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts;
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
|
||||
codel_time_to_us(q->cparams.target)) ||
|
||||
nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
|
||||
sch->limit) ||
|
||||
nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
|
||||
codel_time_to_us(q->cparams.interval)) ||
|
||||
nla_put_u32(skb, TCA_FQ_CODEL_ECN,
|
||||
q->cparams.ecn) ||
|
||||
nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
|
||||
q->quantum) ||
|
||||
nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
|
||||
q->flows_cnt))
|
||||
goto nla_put_failure;
|
||||
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_fq_codel_xstats st = {
|
||||
.type = TCA_FQ_CODEL_XSTATS_QDISC,
|
||||
};
|
||||
struct list_head *pos;
|
||||
|
||||
st.qdisc_stats.maxpacket = q->cstats.maxpacket;
|
||||
st.qdisc_stats.drop_overlimit = q->drop_overlimit;
|
||||
st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
|
||||
st.qdisc_stats.new_flow_count = q->new_flow_count;
|
||||
|
||||
list_for_each(pos, &q->new_flows)
|
||||
st.qdisc_stats.new_flows_len++;
|
||||
|
||||
list_for_each(pos, &q->old_flows)
|
||||
st.qdisc_stats.old_flows_len++;
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
|
||||
u32 classid)
|
||||
{
|
||||
/* we cannot bypass queue discipline anymore */
|
||||
sch->flags &= ~TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fq_codel_put(struct Qdisc *q, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
u32 idx = cl - 1;
|
||||
struct gnet_stats_queue qs = { 0 };
|
||||
struct tc_fq_codel_xstats xstats;
|
||||
|
||||
if (idx < q->flows_cnt) {
|
||||
const struct fq_codel_flow *flow = &q->flows[idx];
|
||||
const struct sk_buff *skb = flow->head;
|
||||
|
||||
memset(&xstats, 0, sizeof(xstats));
|
||||
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
|
||||
xstats.class_stats.deficit = flow->deficit;
|
||||
xstats.class_stats.ldelay =
|
||||
codel_time_to_us(flow->cvars.ldelay);
|
||||
xstats.class_stats.count = flow->cvars.count;
|
||||
xstats.class_stats.lastcount = flow->cvars.lastcount;
|
||||
xstats.class_stats.dropping = flow->cvars.dropping;
|
||||
if (flow->cvars.dropping) {
|
||||
codel_tdiff_t delta = flow->cvars.drop_next -
|
||||
codel_get_time();
|
||||
|
||||
xstats.class_stats.drop_next = (delta >= 0) ?
|
||||
codel_time_to_us(delta) :
|
||||
-codel_time_to_us(-delta);
|
||||
}
|
||||
while (skb) {
|
||||
qs.qlen++;
|
||||
skb = skb->next;
|
||||
}
|
||||
qs.backlog = q->backlogs[idx];
|
||||
qs.drops = flow->dropped;
|
||||
}
|
||||
if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
|
||||
return -1;
|
||||
if (idx < q->flows_cnt)
|
||||
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
struct fq_codel_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int i;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (i = 0; i < q->flows_cnt; i++) {
|
||||
if (list_empty(&q->flows[i].flowchain) ||
|
||||
arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(sch, i + 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops fq_codel_class_ops = {
|
||||
.leaf = fq_codel_leaf,
|
||||
.get = fq_codel_get,
|
||||
.put = fq_codel_put,
|
||||
.tcf_chain = fq_codel_find_tcf,
|
||||
.bind_tcf = fq_codel_bind,
|
||||
.unbind_tcf = fq_codel_put,
|
||||
.dump = fq_codel_dump_class,
|
||||
.dump_stats = fq_codel_dump_class_stats,
|
||||
.walk = fq_codel_walk,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
|
||||
.cl_ops = &fq_codel_class_ops,
|
||||
.id = "fq_codel",
|
||||
.priv_size = sizeof(struct fq_codel_sched_data),
|
||||
.enqueue = fq_codel_enqueue,
|
||||
.dequeue = fq_codel_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.drop = fq_codel_drop,
|
||||
.init = fq_codel_init,
|
||||
.reset = fq_codel_reset,
|
||||
.destroy = fq_codel_destroy,
|
||||
.change = fq_codel_change,
|
||||
.dump = fq_codel_dump,
|
||||
.dump_stats = fq_codel_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init fq_codel_module_init(void)
|
||||
{
|
||||
return register_qdisc(&fq_codel_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit fq_codel_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&fq_codel_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(fq_codel_module_init)
|
||||
module_exit(fq_codel_module_exit)
|
||||
MODULE_AUTHOR("Eric Dumazet");
|
||||
MODULE_LICENSE("GPL");

990
net/sched/sch_generic.c
Normal file
@@ -0,0 +1,990 @@
/*
|
||||
* net/sched/sch_generic.c Generic packet scheduler routines.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
|
||||
* - Ingress support
|
||||
*/
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/rtnetlink.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/if_vlan.h>
|
||||
#include <net/sch_generic.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/dst.h>
|
||||
|
||||
/* Qdisc to use by default */
|
||||
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
|
||||
EXPORT_SYMBOL(default_qdisc_ops);
|
||||
|
||||
/* Main transmission queue. */
|
||||
|
||||
/* Modifications to data participating in scheduling must be protected with
|
||||
* qdisc_lock(qdisc) spinlock.
|
||||
*
|
||||
* The idea is the following:
|
||||
* - enqueue, dequeue are serialized via qdisc root lock
|
||||
* - ingress filtering is also serialized via qdisc root lock
|
||||
* - updates to tree and tree walking are only done under the rtnl mutex.
|
||||
*/
|
||||
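/* A minimal sketch of that rule (qdisc_restart() further down follows this
 * pattern when transmitting):
 *
 *	spinlock_t *root_lock = qdisc_lock(q);
 *
 *	spin_lock(root_lock);
 *	... queue state is only touched while this lock is held ...
 *	spin_unlock(root_lock);
 */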
|
||||
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
|
||||
{
|
||||
q->gso_skb = skb;
|
||||
q->qstats.requeues++;
|
||||
q->q.qlen++; /* it's still part of the queue */
|
||||
__netif_schedule(q);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void try_bulk_dequeue_skb(struct Qdisc *q,
|
||||
struct sk_buff *skb,
|
||||
const struct netdev_queue *txq,
|
||||
int *packets)
|
||||
{
|
||||
int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
|
||||
|
||||
while (bytelimit > 0) {
|
||||
struct sk_buff *nskb = q->dequeue(q);
|
||||
|
||||
if (!nskb)
|
||||
break;
|
||||
|
||||
bytelimit -= nskb->len; /* covers GSO len */
|
||||
skb->next = nskb;
|
||||
skb = nskb;
|
||||
(*packets)++; /* GSO counts as one pkt */
|
||||
}
|
||||
skb->next = NULL;
|
||||
}
|
||||
|
||||
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
|
||||
* A requeued skb (via q->gso_skb) can also be a SKB list.
|
||||
*/
|
||||
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
|
||||
int *packets)
|
||||
{
|
||||
struct sk_buff *skb = q->gso_skb;
|
||||
const struct netdev_queue *txq = q->dev_queue;
|
||||
|
||||
*packets = 1;
|
||||
*validate = true;
|
||||
if (unlikely(skb)) {
|
||||
/* check the reason of requeuing without tx lock first */
|
||||
txq = skb_get_tx_queue(txq->dev, skb);
|
||||
if (!netif_xmit_frozen_or_stopped(txq)) {
|
||||
q->gso_skb = NULL;
|
||||
q->q.qlen--;
|
||||
} else
|
||||
skb = NULL;
|
||||
/* skb in gso_skb were already validated */
|
||||
*validate = false;
|
||||
} else {
|
||||
if (!(q->flags & TCQ_F_ONETXQUEUE) ||
|
||||
!netif_xmit_frozen_or_stopped(txq)) {
|
||||
skb = q->dequeue(q);
|
||||
if (skb && qdisc_may_bulk(q))
|
||||
try_bulk_dequeue_skb(q, skb, txq, packets);
|
||||
}
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
|
||||
struct netdev_queue *dev_queue,
|
||||
struct Qdisc *q)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
|
||||
/*
|
||||
* Same CPU holding the lock. It may be a transient
|
||||
* configuration error, when hard_start_xmit() recurses. We
|
||||
* detect it by checking xmit owner and drop the packet when
|
||||
* deadloop is detected. Return OK to try the next skb.
|
||||
*/
|
||||
kfree_skb_list(skb);
|
||||
net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
|
||||
dev_queue->dev->name);
|
||||
ret = qdisc_qlen(q);
|
||||
} else {
|
||||
/*
|
||||
* Another cpu is holding lock, requeue & delay xmits for
|
||||
* some time.
|
||||
*/
|
||||
__this_cpu_inc(softnet_data.cpu_collision);
|
||||
ret = dev_requeue_skb(skb, q);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Transmit possibly several skbs, and handle the return status as
|
||||
* required. Holding the __QDISC___STATE_RUNNING bit guarantees that
|
||||
* only one CPU can execute this function.
|
||||
*
|
||||
* Returns to the caller:
|
||||
* 0 - queue is empty or throttled.
|
||||
* >0 - queue is not empty.
|
||||
*/
|
||||
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
|
||||
struct net_device *dev, struct netdev_queue *txq,
|
||||
spinlock_t *root_lock, bool validate)
|
||||
{
|
||||
int ret = NETDEV_TX_BUSY;
|
||||
|
||||
/* And release qdisc */
|
||||
spin_unlock(root_lock);
|
||||
|
||||
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
|
||||
if (validate)
|
||||
skb = validate_xmit_skb_list(skb, dev);
|
||||
|
||||
if (skb) {
|
||||
HARD_TX_LOCK(dev, txq, smp_processor_id());
|
||||
if (!netif_xmit_frozen_or_stopped(txq))
|
||||
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
|
||||
|
||||
HARD_TX_UNLOCK(dev, txq);
|
||||
}
|
||||
spin_lock(root_lock);
|
||||
|
||||
if (dev_xmit_complete(ret)) {
|
||||
/* Driver sent out skb successfully or skb was consumed */
|
||||
ret = qdisc_qlen(q);
|
||||
} else if (ret == NETDEV_TX_LOCKED) {
|
||||
/* Driver try lock failed */
|
||||
ret = handle_dev_cpu_collision(skb, txq, q);
|
||||
} else {
|
||||
/* Driver returned NETDEV_TX_BUSY - requeue skb */
|
||||
if (unlikely(ret != NETDEV_TX_BUSY))
|
||||
net_warn_ratelimited("BUG %s code %d qlen %d\n",
|
||||
dev->name, ret, q->q.qlen);
|
||||
|
||||
ret = dev_requeue_skb(skb, q);
|
||||
}
|
||||
|
||||
if (ret && netif_xmit_frozen_or_stopped(txq))
|
||||
ret = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
|
||||
*
|
||||
* __QDISC___STATE_RUNNING guarantees only one CPU can process
|
||||
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
|
||||
* this queue.
|
||||
*
|
||||
* netif_tx_lock serializes accesses to device driver.
|
||||
*
|
||||
* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
|
||||
* if one is grabbed, another must be free.
|
||||
*
|
||||
* Note, that this procedure can be called by a watchdog timer
|
||||
*
|
||||
* Returns to the caller:
|
||||
* 0 - queue is empty or throttled.
|
||||
* >0 - queue is not empty.
|
||||
*
|
||||
*/
|
||||
static inline int qdisc_restart(struct Qdisc *q, int *packets)
|
||||
{
|
||||
struct netdev_queue *txq;
|
||||
struct net_device *dev;
|
||||
spinlock_t *root_lock;
|
||||
struct sk_buff *skb;
|
||||
bool validate;
|
||||
|
||||
/* Dequeue packet */
|
||||
skb = dequeue_skb(q, &validate, packets);
|
||||
if (unlikely(!skb))
|
||||
return 0;
|
||||
|
||||
root_lock = qdisc_lock(q);
|
||||
dev = qdisc_dev(q);
|
||||
txq = skb_get_tx_queue(dev, skb);
|
||||
|
||||
return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
|
||||
}
|
||||
|
||||
void __qdisc_run(struct Qdisc *q)
|
||||
{
|
||||
int quota = weight_p;
|
||||
int packets;
|
||||
|
||||
while (qdisc_restart(q, &packets)) {
|
||||
/*
|
||||
* Ordered by possible occurrence: Postpone processing if
|
||||
* 1. we've exceeded the packet quota, or
|
||||
* 2. another process needs the CPU.
|
||||
*/
|
||||
quota -= packets;
|
||||
if (quota <= 0 || need_resched()) {
|
||||
__netif_schedule(q);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
qdisc_run_end(q);
|
||||
}
|
||||
|
||||
unsigned long dev_trans_start(struct net_device *dev)
|
||||
{
|
||||
unsigned long val, res;
|
||||
unsigned int i;
|
||||
|
||||
if (is_vlan_dev(dev))
|
||||
dev = vlan_dev_real_dev(dev);
|
||||
res = dev->trans_start;
|
||||
for (i = 0; i < dev->num_tx_queues; i++) {
|
||||
val = netdev_get_tx_queue(dev, i)->trans_start;
|
||||
if (val && time_after(val, res))
|
||||
res = val;
|
||||
}
|
||||
dev->trans_start = res;
|
||||
|
||||
return res;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_trans_start);
|
||||
|
||||
static void dev_watchdog(unsigned long arg)
|
||||
{
|
||||
struct net_device *dev = (struct net_device *)arg;
|
||||
|
||||
netif_tx_lock(dev);
|
||||
if (!qdisc_tx_is_noop(dev)) {
|
||||
if (netif_device_present(dev) &&
|
||||
netif_running(dev) &&
|
||||
netif_carrier_ok(dev)) {
|
||||
int some_queue_timedout = 0;
|
||||
unsigned int i;
|
||||
unsigned long trans_start;
|
||||
|
||||
for (i = 0; i < dev->num_tx_queues; i++) {
|
||||
struct netdev_queue *txq;
|
||||
|
||||
txq = netdev_get_tx_queue(dev, i);
|
||||
/*
|
||||
* old device drivers set dev->trans_start
|
||||
*/
|
||||
trans_start = txq->trans_start ? : dev->trans_start;
|
||||
if (netif_xmit_stopped(txq) &&
|
||||
time_after(jiffies, (trans_start +
|
||||
dev->watchdog_timeo))) {
|
||||
some_queue_timedout = 1;
|
||||
txq->trans_timeout++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (some_queue_timedout) {
|
||||
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
|
||||
dev->name, netdev_drivername(dev), i);
|
||||
dev->netdev_ops->ndo_tx_timeout(dev);
|
||||
}
|
||||
if (!mod_timer(&dev->watchdog_timer,
|
||||
round_jiffies(jiffies +
|
||||
dev->watchdog_timeo)))
|
||||
dev_hold(dev);
|
||||
}
|
||||
}
|
||||
netif_tx_unlock(dev);
|
||||
|
||||
dev_put(dev);
|
||||
}
|
||||
|
||||
void __netdev_watchdog_up(struct net_device *dev)
|
||||
{
|
||||
if (dev->netdev_ops->ndo_tx_timeout) {
|
||||
if (dev->watchdog_timeo <= 0)
|
||||
dev->watchdog_timeo = 5*HZ;
|
||||
if (!mod_timer(&dev->watchdog_timer,
|
||||
round_jiffies(jiffies + dev->watchdog_timeo)))
|
||||
dev_hold(dev);
|
||||
}
|
||||
}
|
||||
|
||||
static void dev_watchdog_up(struct net_device *dev)
|
||||
{
|
||||
__netdev_watchdog_up(dev);
|
||||
}
|
||||
|
||||
static void dev_watchdog_down(struct net_device *dev)
|
||||
{
|
||||
netif_tx_lock_bh(dev);
|
||||
if (del_timer(&dev->watchdog_timer))
|
||||
dev_put(dev);
|
||||
netif_tx_unlock_bh(dev);
|
||||
}
|
||||
|
||||
/**
|
||||
* netif_carrier_on - set carrier
|
||||
* @dev: network device
|
||||
*
|
||||
* Device has detected acquisition of carrier.
|
||||
*/
|
||||
void netif_carrier_on(struct net_device *dev)
|
||||
{
|
||||
if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
|
||||
if (dev->reg_state == NETREG_UNINITIALIZED)
|
||||
return;
|
||||
atomic_inc(&dev->carrier_changes);
|
||||
linkwatch_fire_event(dev);
|
||||
if (netif_running(dev))
|
||||
__netdev_watchdog_up(dev);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(netif_carrier_on);
|
||||
|
||||
/**
|
||||
* netif_carrier_off - clear carrier
|
||||
* @dev: network device
|
||||
*
|
||||
* Device has detected loss of carrier.
|
||||
*/
|
||||
void netif_carrier_off(struct net_device *dev)
|
||||
{
|
||||
if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
|
||||
if (dev->reg_state == NETREG_UNINITIALIZED)
|
||||
return;
|
||||
atomic_inc(&dev->carrier_changes);
|
||||
linkwatch_fire_event(dev);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(netif_carrier_off);
|
||||
|
||||
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
|
||||
under all circumstances. It is difficult to invent anything faster or
|
||||
cheaper.
|
||||
*/
|
||||
|
||||
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
|
||||
{
|
||||
kfree_skb(skb);
|
||||
return NET_XMIT_CN;
|
||||
}
|
||||
|
||||
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
|
||||
.id = "noop",
|
||||
.priv_size = 0,
|
||||
.enqueue = noop_enqueue,
|
||||
.dequeue = noop_dequeue,
|
||||
.peek = noop_dequeue,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct netdev_queue noop_netdev_queue = {
|
||||
.qdisc = &noop_qdisc,
|
||||
.qdisc_sleeping = &noop_qdisc,
|
||||
};
|
||||
|
||||
struct Qdisc noop_qdisc = {
|
||||
.enqueue = noop_enqueue,
|
||||
.dequeue = noop_dequeue,
|
||||
.flags = TCQ_F_BUILTIN,
|
||||
.ops = &noop_qdisc_ops,
|
||||
.list = LIST_HEAD_INIT(noop_qdisc.list),
|
||||
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
|
||||
.dev_queue = &noop_netdev_queue,
|
||||
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
|
||||
};
|
||||
EXPORT_SYMBOL(noop_qdisc);
|
||||
|
||||
static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
|
||||
.id = "noqueue",
|
||||
.priv_size = 0,
|
||||
.enqueue = noop_enqueue,
|
||||
.dequeue = noop_dequeue,
|
||||
.peek = noop_dequeue,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct Qdisc noqueue_qdisc;
|
||||
static struct netdev_queue noqueue_netdev_queue = {
|
||||
.qdisc = &noqueue_qdisc,
|
||||
.qdisc_sleeping = &noqueue_qdisc,
|
||||
};
|
||||
|
||||
static struct Qdisc noqueue_qdisc = {
|
||||
.enqueue = NULL,
|
||||
.dequeue = noop_dequeue,
|
||||
.flags = TCQ_F_BUILTIN,
|
||||
.ops = &noqueue_qdisc_ops,
|
||||
.list = LIST_HEAD_INIT(noqueue_qdisc.list),
|
||||
.q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
|
||||
.dev_queue = &noqueue_netdev_queue,
|
||||
.busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
|
||||
};
|
||||
|
||||
|
||||
static const u8 prio2band[TC_PRIO_MAX + 1] = {
|
||||
1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
|
||||
};
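/*
 * As an illustration of the mapping above: the band is looked up with
 * prio2band[skb->priority & TC_PRIO_MAX], so e.g. TC_PRIO_INTERACTIVE (6)
 * and TC_PRIO_CONTROL (7) land in band 0 (served first),
 * TC_PRIO_BESTEFFORT (0) in band 1, and TC_PRIO_BULK (2) in band 2.
 */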
|
||||
|
||||
/* 3-band FIFO queue: old style, but should be a bit faster than
|
||||
generic prio+fifo combination.
|
||||
*/
|
||||
|
||||
#define PFIFO_FAST_BANDS 3
|
||||
|
||||
/*
|
||||
* Private data for a pfifo_fast scheduler containing:
|
||||
* - queues for the three bands
|
||||
* - bitmap indicating which of the bands contain skbs
|
||||
*/
|
||||
struct pfifo_fast_priv {
|
||||
u32 bitmap;
|
||||
struct sk_buff_head q[PFIFO_FAST_BANDS];
|
||||
};
|
||||
|
||||
/*
|
||||
* Convert a bitmap to the first band number where an skb is queued, where:
|
||||
* bitmap=0 means there are no skbs on any band.
|
||||
* bitmap=1 means there is an skb on band 0.
|
||||
* bitmap=7 means there are skbs on all 3 bands, etc.
|
||||
*/
|
||||
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
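/*
 * For example, with skbs queued only on bands 1 and 2 the bitmap is
 * 0x6, and bitmap2band[6] = 1, so the dequeue path serves band 1 (the
 * lower-numbered, higher-priority band) before band 2.
 */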
|
||||
|
||||
static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
|
||||
int band)
|
||||
{
|
||||
return priv->q + band;
|
||||
}
|
||||
|
||||
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
|
||||
{
|
||||
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
|
||||
int band = prio2band[skb->priority & TC_PRIO_MAX];
|
||||
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
|
||||
struct sk_buff_head *list = band2list(priv, band);
|
||||
|
||||
priv->bitmap |= (1 << band);
|
||||
qdisc->q.qlen++;
|
||||
return __qdisc_enqueue_tail(skb, qdisc, list);
|
||||
}
|
||||
|
||||
return qdisc_drop(skb, qdisc);
|
||||
}
|
||||
|
||||
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
|
||||
{
|
||||
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
|
||||
int band = bitmap2band[priv->bitmap];
|
||||
|
||||
if (likely(band >= 0)) {
|
||||
struct sk_buff_head *list = band2list(priv, band);
|
||||
struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
|
||||
|
||||
qdisc->q.qlen--;
|
||||
if (skb_queue_empty(list))
|
||||
priv->bitmap &= ~(1 << band);
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
|
||||
{
|
||||
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
|
||||
int band = bitmap2band[priv->bitmap];
|
||||
|
||||
if (band >= 0) {
|
||||
struct sk_buff_head *list = band2list(priv, band);
|
||||
|
||||
return skb_peek(list);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void pfifo_fast_reset(struct Qdisc *qdisc)
|
||||
{
|
||||
int prio;
|
||||
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
|
||||
|
||||
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
|
||||
__qdisc_reset_queue(qdisc, band2list(priv, prio));
|
||||
|
||||
priv->bitmap = 0;
|
||||
qdisc->qstats.backlog = 0;
|
||||
qdisc->q.qlen = 0;
|
||||
}
|
||||
|
||||
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
|
||||
{
|
||||
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
|
||||
|
||||
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
|
||||
{
|
||||
int prio;
|
||||
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
|
||||
|
||||
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
|
||||
__skb_queue_head_init(band2list(priv, prio));
|
||||
|
||||
/* Can by-pass the queue discipline */
|
||||
qdisc->flags |= TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
|
||||
.id = "pfifo_fast",
|
||||
.priv_size = sizeof(struct pfifo_fast_priv),
|
||||
.enqueue = pfifo_fast_enqueue,
|
||||
.dequeue = pfifo_fast_dequeue,
|
||||
.peek = pfifo_fast_peek,
|
||||
.init = pfifo_fast_init,
|
||||
.reset = pfifo_fast_reset,
|
||||
.dump = pfifo_fast_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static struct lock_class_key qdisc_tx_busylock;
|
||||
|
||||
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
|
||||
const struct Qdisc_ops *ops)
|
||||
{
|
||||
void *p;
|
||||
struct Qdisc *sch;
|
||||
unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
|
||||
int err = -ENOBUFS;
|
||||
struct net_device *dev = dev_queue->dev;
|
||||
|
||||
p = kzalloc_node(size, GFP_KERNEL,
|
||||
netdev_queue_numa_node_read(dev_queue));
|
||||
|
||||
if (!p)
|
||||
goto errout;
|
||||
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
|
||||
/* if we got non-aligned memory, ask for more and do the alignment ourselves */
|
||||
if (sch != p) {
|
||||
kfree(p);
|
||||
p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
|
||||
netdev_queue_numa_node_read(dev_queue));
|
||||
if (!p)
|
||||
goto errout;
|
||||
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
|
||||
sch->padded = (char *) sch - (char *) p;
|
||||
}
|
||||
INIT_LIST_HEAD(&sch->list);
|
||||
skb_queue_head_init(&sch->q);
|
||||
|
||||
spin_lock_init(&sch->busylock);
|
||||
lockdep_set_class(&sch->busylock,
|
||||
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
|
||||
|
||||
sch->ops = ops;
|
||||
sch->enqueue = ops->enqueue;
|
||||
sch->dequeue = ops->dequeue;
|
||||
sch->dev_queue = dev_queue;
|
||||
dev_hold(dev);
|
||||
atomic_set(&sch->refcnt, 1);
|
||||
|
||||
return sch;
|
||||
errout:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
|
||||
const struct Qdisc_ops *ops,
|
||||
unsigned int parentid)
|
||||
{
|
||||
struct Qdisc *sch;
|
||||
|
||||
if (!try_module_get(ops->owner))
|
||||
goto errout;
|
||||
|
||||
sch = qdisc_alloc(dev_queue, ops);
|
||||
if (IS_ERR(sch))
|
||||
goto errout;
|
||||
sch->parent = parentid;
|
||||
|
||||
if (!ops->init || ops->init(sch, NULL) == 0)
|
||||
return sch;
|
||||
|
||||
qdisc_destroy(sch);
|
||||
errout:
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL(qdisc_create_dflt);
|
||||
|
||||
/* Under qdisc_lock(qdisc) and BH! */
|
||||
|
||||
void qdisc_reset(struct Qdisc *qdisc)
|
||||
{
|
||||
const struct Qdisc_ops *ops = qdisc->ops;
|
||||
|
||||
if (ops->reset)
|
||||
ops->reset(qdisc);
|
||||
|
||||
if (qdisc->gso_skb) {
|
||||
kfree_skb_list(qdisc->gso_skb);
|
||||
qdisc->gso_skb = NULL;
|
||||
qdisc->q.qlen = 0;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(qdisc_reset);
|
||||
|
||||
static void qdisc_rcu_free(struct rcu_head *head)
|
||||
{
|
||||
struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
|
||||
|
||||
if (qdisc_is_percpu_stats(qdisc))
|
||||
free_percpu(qdisc->cpu_bstats);
|
||||
|
||||
kfree((char *) qdisc - qdisc->padded);
|
||||
}
|
||||
|
||||
void qdisc_destroy(struct Qdisc *qdisc)
|
||||
{
|
||||
const struct Qdisc_ops *ops = qdisc->ops;
|
||||
|
||||
if (qdisc->flags & TCQ_F_BUILTIN ||
|
||||
!atomic_dec_and_test(&qdisc->refcnt))
|
||||
return;
|
||||
|
||||
#ifdef CONFIG_NET_SCHED
|
||||
qdisc_list_del(qdisc);
|
||||
|
||||
qdisc_put_stab(rtnl_dereference(qdisc->stab));
|
||||
#endif
|
||||
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
|
||||
if (ops->reset)
|
||||
ops->reset(qdisc);
|
||||
if (ops->destroy)
|
||||
ops->destroy(qdisc);
|
||||
|
||||
module_put(ops->owner);
|
||||
dev_put(qdisc_dev(qdisc));
|
||||
|
||||
kfree_skb_list(qdisc->gso_skb);
|
||||
/*
|
||||
* gen_estimator est_timer() might access qdisc->q.lock,
|
||||
* so wait an RCU grace period before freeing qdisc.
|
||||
*/
|
||||
call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
|
||||
}
|
||||
EXPORT_SYMBOL(qdisc_destroy);
|
||||
|
||||
/* Attach toplevel qdisc to device queue. */
|
||||
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
|
||||
struct Qdisc *qdisc)
|
||||
{
|
||||
struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
|
||||
spinlock_t *root_lock;
|
||||
|
||||
root_lock = qdisc_lock(oqdisc);
|
||||
spin_lock_bh(root_lock);
|
||||
|
||||
/* Prune old scheduler */
|
||||
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
|
||||
qdisc_reset(oqdisc);
|
||||
|
||||
/* ... and graft new one */
|
||||
if (qdisc == NULL)
|
||||
qdisc = &noop_qdisc;
|
||||
dev_queue->qdisc_sleeping = qdisc;
|
||||
rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
|
||||
|
||||
spin_unlock_bh(root_lock);
|
||||
|
||||
return oqdisc;
|
||||
}
|
||||
EXPORT_SYMBOL(dev_graft_qdisc);
|
||||
|
||||
static void attach_one_default_qdisc(struct net_device *dev,
|
||||
struct netdev_queue *dev_queue,
|
||||
void *_unused)
|
||||
{
|
||||
struct Qdisc *qdisc = &noqueue_qdisc;
|
||||
|
||||
if (dev->tx_queue_len) {
|
||||
qdisc = qdisc_create_dflt(dev_queue,
|
||||
default_qdisc_ops, TC_H_ROOT);
|
||||
if (!qdisc) {
|
||||
netdev_info(dev, "activation failed\n");
|
||||
return;
|
||||
}
|
||||
if (!netif_is_multiqueue(dev))
|
||||
qdisc->flags |= TCQ_F_ONETXQUEUE;
|
||||
}
|
||||
dev_queue->qdisc_sleeping = qdisc;
|
||||
}
|
||||
|
||||
static void attach_default_qdiscs(struct net_device *dev)
|
||||
{
|
||||
struct netdev_queue *txq;
|
||||
struct Qdisc *qdisc;
|
||||
|
||||
txq = netdev_get_tx_queue(dev, 0);
|
||||
|
||||
if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
|
||||
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
|
||||
dev->qdisc = txq->qdisc_sleeping;
|
||||
atomic_inc(&dev->qdisc->refcnt);
|
||||
} else {
|
||||
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
|
||||
if (qdisc) {
|
||||
dev->qdisc = qdisc;
|
||||
qdisc->ops->attach(qdisc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void transition_one_qdisc(struct net_device *dev,
|
||||
struct netdev_queue *dev_queue,
|
||||
void *_need_watchdog)
|
||||
{
|
||||
struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
|
||||
int *need_watchdog_p = _need_watchdog;
|
||||
|
||||
if (!(new_qdisc->flags & TCQ_F_BUILTIN))
|
||||
clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
|
||||
|
||||
rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
|
||||
if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
|
||||
dev_queue->trans_start = 0;
|
||||
*need_watchdog_p = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void dev_activate(struct net_device *dev)
|
||||
{
|
||||
int need_watchdog;
|
||||
|
||||
/* No queueing discipline is attached to device;
|
||||
* create a default one for devices that need queueing,
|
||||
* and the noqueue_qdisc for virtual interfaces.
|
||||
*/
|
||||
|
||||
if (dev->qdisc == &noop_qdisc)
|
||||
attach_default_qdiscs(dev);
|
||||
|
||||
if (!netif_carrier_ok(dev))
|
||||
/* Delay activation until next carrier-on event */
|
||||
return;
|
||||
|
||||
need_watchdog = 0;
|
||||
netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
|
||||
if (dev_ingress_queue(dev))
|
||||
transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
|
||||
|
||||
if (need_watchdog) {
|
||||
dev->trans_start = jiffies;
|
||||
dev_watchdog_up(dev);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(dev_activate);
|
||||
|
||||
static void dev_deactivate_queue(struct net_device *dev,
|
||||
struct netdev_queue *dev_queue,
|
||||
void *_qdisc_default)
|
||||
{
|
||||
struct Qdisc *qdisc_default = _qdisc_default;
|
||||
struct Qdisc *qdisc;
|
||||
|
||||
qdisc = rtnl_dereference(dev_queue->qdisc);
|
||||
if (qdisc) {
|
||||
spin_lock_bh(qdisc_lock(qdisc));
|
||||
|
||||
if (!(qdisc->flags & TCQ_F_BUILTIN))
|
||||
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
|
||||
|
||||
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
|
||||
qdisc_reset(qdisc);
|
||||
|
||||
spin_unlock_bh(qdisc_lock(qdisc));
|
||||
}
|
||||
}
|
||||
|
||||
static bool some_qdisc_is_busy(struct net_device *dev)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < dev->num_tx_queues; i++) {
|
||||
struct netdev_queue *dev_queue;
|
||||
spinlock_t *root_lock;
|
||||
struct Qdisc *q;
|
||||
int val;
|
||||
|
||||
dev_queue = netdev_get_tx_queue(dev, i);
|
||||
q = dev_queue->qdisc_sleeping;
|
||||
root_lock = qdisc_lock(q);
|
||||
|
||||
spin_lock_bh(root_lock);
|
||||
|
||||
val = (qdisc_is_running(q) ||
|
||||
test_bit(__QDISC_STATE_SCHED, &q->state));
|
||||
|
||||
spin_unlock_bh(root_lock);
|
||||
|
||||
if (val)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* dev_deactivate_many - deactivate transmissions on several devices
|
||||
* @head: list of devices to deactivate
|
||||
*
|
||||
* This function returns only when all outstanding transmissions
|
||||
* have completed, unless all devices are in dismantle phase.
|
||||
*/
|
||||
void dev_deactivate_many(struct list_head *head)
|
||||
{
|
||||
struct net_device *dev;
|
||||
bool sync_needed = false;
|
||||
|
||||
list_for_each_entry(dev, head, close_list) {
|
||||
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
|
||||
&noop_qdisc);
|
||||
if (dev_ingress_queue(dev))
|
||||
dev_deactivate_queue(dev, dev_ingress_queue(dev),
|
||||
&noop_qdisc);
|
||||
|
||||
dev_watchdog_down(dev);
|
||||
sync_needed |= !dev->dismantle;
|
||||
}
|
||||
|
||||
/* Wait for outstanding qdisc-less dev_queue_xmit calls.
|
||||
* This is avoided if all devices are in dismantle phase:
|
||||
* the caller will call synchronize_net() for us.
|
||||
*/
|
||||
if (sync_needed)
|
||||
synchronize_net();
|
||||
|
||||
/* Wait for outstanding qdisc_run calls. */
|
||||
list_for_each_entry(dev, head, close_list)
|
||||
while (some_qdisc_is_busy(dev))
|
||||
yield();
|
||||
}
|
||||
|
||||
void dev_deactivate(struct net_device *dev)
|
||||
{
|
||||
LIST_HEAD(single);
|
||||
|
||||
list_add(&dev->close_list, &single);
|
||||
dev_deactivate_many(&single);
|
||||
list_del(&single);
|
||||
}
|
||||
EXPORT_SYMBOL(dev_deactivate);
|
||||
|
||||
static void dev_init_scheduler_queue(struct net_device *dev,
|
||||
struct netdev_queue *dev_queue,
|
||||
void *_qdisc)
|
||||
{
|
||||
struct Qdisc *qdisc = _qdisc;
|
||||
|
||||
rcu_assign_pointer(dev_queue->qdisc, qdisc);
|
||||
dev_queue->qdisc_sleeping = qdisc;
|
||||
}
|
||||
|
||||
void dev_init_scheduler(struct net_device *dev)
|
||||
{
|
||||
dev->qdisc = &noop_qdisc;
|
||||
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
|
||||
if (dev_ingress_queue(dev))
|
||||
dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
|
||||
|
||||
setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
|
||||
}
|
||||
|
||||
static void shutdown_scheduler_queue(struct net_device *dev,
|
||||
struct netdev_queue *dev_queue,
|
||||
void *_qdisc_default)
|
||||
{
|
||||
struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
|
||||
struct Qdisc *qdisc_default = _qdisc_default;
|
||||
|
||||
if (qdisc) {
|
||||
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
|
||||
dev_queue->qdisc_sleeping = qdisc_default;
|
||||
|
||||
qdisc_destroy(qdisc);
|
||||
}
|
||||
}
|
||||
|
||||
void dev_shutdown(struct net_device *dev)
|
||||
{
|
||||
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
|
||||
if (dev_ingress_queue(dev))
|
||||
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
|
||||
qdisc_destroy(dev->qdisc);
|
||||
dev->qdisc = &noop_qdisc;
|
||||
|
||||
WARN_ON(timer_pending(&dev->watchdog_timer));
|
||||
}
|
||||
|
||||
void psched_ratecfg_precompute(struct psched_ratecfg *r,
|
||||
const struct tc_ratespec *conf,
|
||||
u64 rate64)
|
||||
{
|
||||
memset(r, 0, sizeof(*r));
|
||||
r->overhead = conf->overhead;
|
||||
r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
|
||||
r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
|
||||
r->mult = 1;
|
||||
/*
|
||||
* The deal here is to replace a divide by a reciprocal one
|
||||
* in fast path (a reciprocal divide is a multiply and a shift)
|
||||
*
|
||||
* Normal formula would be :
|
||||
* time_in_ns = (NSEC_PER_SEC * len) / rate_bps
|
||||
*
|
||||
* We compute mult/shift to use instead :
|
||||
* time_in_ns = (len * mult) >> shift;
|
||||
*
|
||||
* We try to get the highest possible mult value for accuracy,
|
||||
* but have to make sure no overflows will ever happen.
|
||||
*/
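/*
 * A worked example of the mult/shift trick: for a 1 Mbit/s rate,
 * rate_bytes_ps = 125000, the loop below stops at shift = 19 with
 * mult = (NSEC_PER_SEC << 19) / 125000 = 4194304000; a 1500 byte
 * packet then costs (1500 * 4194304000) >> 19 = 12000000 ns, i.e.
 * the expected 12 ms at 125000 bytes per second.
 */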
|
||||
if (r->rate_bytes_ps > 0) {
|
||||
u64 factor = NSEC_PER_SEC;
|
||||
|
||||
for (;;) {
|
||||
r->mult = div64_u64(factor, r->rate_bytes_ps);
|
||||
if (r->mult & (1U << 31) || factor & (1ULL << 63))
|
||||
break;
|
||||
factor <<= 1;
|
||||
r->shift++;
|
||||
}
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(psched_ratecfg_precompute);
|
||||
630
net/sched/sch_gred.c
Normal file
|
|
@@ -0,0 +1,630 @@
|
|||
/*
|
||||
* net/sched/sch_gred.c Generic Random Early Detection queue.
|
||||
*
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
|
||||
*
|
||||
* 991129: - Bug fix with grio mode
|
||||
* - a better single AvgQ mode with Grio (WRED)
|
||||
* - A finer-grained VQ dequeue based on a suggestion
|
||||
* from Ren Liu
|
||||
* - More error checks
|
||||
*
|
||||
* For all the glorious comments look at include/net/red.h
|
||||
*/
|
||||
|
||||
#include <linux/slab.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/red.h>
|
||||
|
||||
#define GRED_DEF_PRIO (MAX_DPs / 2)
|
||||
#define GRED_VQ_MASK (MAX_DPs - 1)
|
||||
|
||||
struct gred_sched_data;
|
||||
struct gred_sched;
|
||||
|
||||
struct gred_sched_data {
|
||||
u32 limit; /* HARD maximal queue length */
|
||||
u32 DP; /* the drop parameters */
|
||||
u32 bytesin; /* bytes seen on virtualQ so far*/
|
||||
u32 packetsin; /* packets seen on virtualQ so far*/
|
||||
u32 backlog; /* bytes on the virtualQ */
|
||||
u8 prio; /* the prio of this vq */
|
||||
|
||||
struct red_parms parms;
|
||||
struct red_vars vars;
|
||||
struct red_stats stats;
|
||||
};
|
||||
|
||||
enum {
|
||||
GRED_WRED_MODE = 1,
|
||||
GRED_RIO_MODE,
|
||||
};
|
||||
|
||||
struct gred_sched {
|
||||
struct gred_sched_data *tab[MAX_DPs];
|
||||
unsigned long flags;
|
||||
u32 red_flags;
|
||||
u32 DPs;
|
||||
u32 def;
|
||||
struct red_vars wred_set;
|
||||
};
|
||||
|
||||
static inline int gred_wred_mode(struct gred_sched *table)
|
||||
{
|
||||
return test_bit(GRED_WRED_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline void gred_enable_wred_mode(struct gred_sched *table)
|
||||
{
|
||||
__set_bit(GRED_WRED_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline void gred_disable_wred_mode(struct gred_sched *table)
|
||||
{
|
||||
__clear_bit(GRED_WRED_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline int gred_rio_mode(struct gred_sched *table)
|
||||
{
|
||||
return test_bit(GRED_RIO_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline void gred_enable_rio_mode(struct gred_sched *table)
|
||||
{
|
||||
__set_bit(GRED_RIO_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline void gred_disable_rio_mode(struct gred_sched *table)
|
||||
{
|
||||
__clear_bit(GRED_RIO_MODE, &table->flags);
|
||||
}
|
||||
|
||||
static inline int gred_wred_mode_check(struct Qdisc *sch)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
/* Really ugly O(n^2), but it shouldn't be needed too frequently. */
|
||||
for (i = 0; i < table->DPs; i++) {
|
||||
struct gred_sched_data *q = table->tab[i];
|
||||
int n;
|
||||
|
||||
if (q == NULL)
|
||||
continue;
|
||||
|
||||
for (n = i + 1; n < table->DPs; n++)
|
||||
if (table->tab[n] && table->tab[n]->prio == q->prio)
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int gred_backlog(struct gred_sched *table,
|
||||
struct gred_sched_data *q,
|
||||
struct Qdisc *sch)
|
||||
{
|
||||
if (gred_wred_mode(table))
|
||||
return sch->qstats.backlog;
|
||||
else
|
||||
return q->backlog;
|
||||
}
|
||||
|
||||
static inline u16 tc_index_to_dp(struct sk_buff *skb)
|
||||
{
|
||||
return skb->tc_index & GRED_VQ_MASK;
|
||||
}
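/*
 * For example, with the usual MAX_DPs of 16 (so GRED_VQ_MASK == 0xf),
 * a packet carrying skb->tc_index = 0x12 is steered to virtual queue
 * DP 2; gred_enqueue() falls back to the default DP when that VQ does
 * not exist.
 */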
|
||||
|
||||
static inline void gred_load_wred_set(const struct gred_sched *table,
|
||||
struct gred_sched_data *q)
|
||||
{
|
||||
q->vars.qavg = table->wred_set.qavg;
|
||||
q->vars.qidlestart = table->wred_set.qidlestart;
|
||||
}
|
||||
|
||||
static inline void gred_store_wred_set(struct gred_sched *table,
|
||||
struct gred_sched_data *q)
|
||||
{
|
||||
table->wred_set.qavg = q->vars.qavg;
|
||||
table->wred_set.qidlestart = q->vars.qidlestart;
|
||||
}
|
||||
|
||||
static inline int gred_use_ecn(struct gred_sched *t)
|
||||
{
|
||||
return t->red_flags & TC_RED_ECN;
|
||||
}
|
||||
|
||||
static inline int gred_use_harddrop(struct gred_sched *t)
|
||||
{
|
||||
return t->red_flags & TC_RED_HARDDROP;
|
||||
}
|
||||
|
||||
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct gred_sched_data *q = NULL;
|
||||
struct gred_sched *t = qdisc_priv(sch);
|
||||
unsigned long qavg = 0;
|
||||
u16 dp = tc_index_to_dp(skb);
|
||||
|
||||
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
|
||||
dp = t->def;
|
||||
|
||||
q = t->tab[dp];
|
||||
if (!q) {
|
||||
/* Pass through packets not assigned to a DP
|
||||
* if no default DP has been configured. This
|
||||
* allows for DP flows to be left untouched.
|
||||
*/
|
||||
if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
else
|
||||
goto drop;
|
||||
}
|
||||
|
||||
/* fix tc_index? --could be controversial but needed for
|
||||
requeueing */
|
||||
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
|
||||
}
|
||||
|
||||
/* sum up all the qaves of prios < ours to get the new qave */
|
||||
if (!gred_wred_mode(t) && gred_rio_mode(t)) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < t->DPs; i++) {
|
||||
if (t->tab[i] && t->tab[i]->prio < q->prio &&
|
||||
!red_is_idling(&t->tab[i]->vars))
|
||||
qavg += t->tab[i]->vars.qavg;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
q->packetsin++;
|
||||
q->bytesin += qdisc_pkt_len(skb);
|
||||
|
||||
if (gred_wred_mode(t))
|
||||
gred_load_wred_set(t, q);
|
||||
|
||||
q->vars.qavg = red_calc_qavg(&q->parms,
|
||||
&q->vars,
|
||||
gred_backlog(t, q, sch));
|
||||
|
||||
if (red_is_idling(&q->vars))
|
||||
red_end_of_idle_period(&q->vars);
|
||||
|
||||
if (gred_wred_mode(t))
|
||||
gred_store_wred_set(t, q);
|
||||
|
||||
switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
|
||||
case RED_DONT_MARK:
|
||||
break;
|
||||
|
||||
case RED_PROB_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
|
||||
q->stats.prob_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
q->stats.prob_mark++;
|
||||
break;
|
||||
|
||||
case RED_HARD_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
|
||||
!INET_ECN_set_ce(skb)) {
|
||||
q->stats.forced_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
q->stats.forced_mark++;
|
||||
break;
|
||||
}
|
||||
|
||||
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
|
||||
q->backlog += qdisc_pkt_len(skb);
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
}
|
||||
|
||||
q->stats.pdrop++;
|
||||
drop:
|
||||
return qdisc_drop(skb, sch);
|
||||
|
||||
congestion_drop:
|
||||
qdisc_drop(skb, sch);
|
||||
return NET_XMIT_CN;
|
||||
}
|
||||
|
||||
static struct sk_buff *gred_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct gred_sched *t = qdisc_priv(sch);
|
||||
|
||||
skb = qdisc_dequeue_head(sch);
|
||||
|
||||
if (skb) {
|
||||
struct gred_sched_data *q;
|
||||
u16 dp = tc_index_to_dp(skb);
|
||||
|
||||
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
|
||||
net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
|
||||
tc_index_to_dp(skb));
|
||||
} else {
|
||||
q->backlog -= qdisc_pkt_len(skb);
|
||||
|
||||
if (gred_wred_mode(t)) {
|
||||
if (!sch->qstats.backlog)
|
||||
red_start_of_idle_period(&t->wred_set);
|
||||
} else {
|
||||
if (!q->backlog)
|
||||
red_start_of_idle_period(&q->vars);
|
||||
}
|
||||
}
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned int gred_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct gred_sched *t = qdisc_priv(sch);
|
||||
|
||||
skb = qdisc_dequeue_tail(sch);
|
||||
if (skb) {
|
||||
unsigned int len = qdisc_pkt_len(skb);
|
||||
struct gred_sched_data *q;
|
||||
u16 dp = tc_index_to_dp(skb);
|
||||
|
||||
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
|
||||
net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
|
||||
tc_index_to_dp(skb));
|
||||
} else {
|
||||
q->backlog -= len;
|
||||
q->stats.other++;
|
||||
|
||||
if (gred_wred_mode(t)) {
|
||||
if (!sch->qstats.backlog)
|
||||
red_start_of_idle_period(&t->wred_set);
|
||||
} else {
|
||||
if (!q->backlog)
|
||||
red_start_of_idle_period(&q->vars);
|
||||
}
|
||||
}
|
||||
|
||||
qdisc_drop(skb, sch);
|
||||
return len;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void gred_reset(struct Qdisc *sch)
|
||||
{
|
||||
int i;
|
||||
struct gred_sched *t = qdisc_priv(sch);
|
||||
|
||||
qdisc_reset_queue(sch);
|
||||
|
||||
for (i = 0; i < t->DPs; i++) {
|
||||
struct gred_sched_data *q = t->tab[i];
|
||||
|
||||
if (!q)
|
||||
continue;
|
||||
|
||||
red_restart(&q->vars);
|
||||
q->backlog = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void gred_destroy_vq(struct gred_sched_data *q)
|
||||
{
|
||||
kfree(q);
|
||||
}
|
||||
|
||||
static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
struct tc_gred_sopt *sopt;
|
||||
int i;
|
||||
|
||||
if (dps == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
sopt = nla_data(dps);
|
||||
|
||||
if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
|
||||
return -EINVAL;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
table->DPs = sopt->DPs;
|
||||
table->def = sopt->def_DP;
|
||||
table->red_flags = sopt->flags;
|
||||
|
||||
/*
|
||||
* Every entry point to GRED is synchronized with the above code
|
||||
* and the DP is checked against DPs, i.e. shadowed VQs can no
|
||||
* longer be found so we can unlock right here.
|
||||
*/
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
if (sopt->grio) {
|
||||
gred_enable_rio_mode(table);
|
||||
gred_disable_wred_mode(table);
|
||||
if (gred_wred_mode_check(sch))
|
||||
gred_enable_wred_mode(table);
|
||||
} else {
|
||||
gred_disable_rio_mode(table);
|
||||
gred_disable_wred_mode(table);
|
||||
}
|
||||
|
||||
for (i = table->DPs; i < MAX_DPs; i++) {
|
||||
if (table->tab[i]) {
|
||||
pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
|
||||
i);
|
||||
gred_destroy_vq(table->tab[i]);
|
||||
table->tab[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int gred_change_vq(struct Qdisc *sch, int dp,
|
||||
struct tc_gred_qopt *ctl, int prio,
|
||||
u8 *stab, u32 max_P,
|
||||
struct gred_sched_data **prealloc)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
struct gred_sched_data *q = table->tab[dp];
|
||||
|
||||
if (!q) {
|
||||
table->tab[dp] = q = *prealloc;
|
||||
*prealloc = NULL;
|
||||
if (!q)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
q->DP = dp;
|
||||
q->prio = prio;
|
||||
q->limit = ctl->limit;
|
||||
|
||||
if (q->backlog == 0)
|
||||
red_end_of_idle_period(&q->vars);
|
||||
|
||||
red_set_parms(&q->parms,
|
||||
ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
|
||||
ctl->Scell_log, stab, max_P);
|
||||
red_set_vars(&q->vars);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
|
||||
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
|
||||
[TCA_GRED_STAB] = { .len = 256 },
|
||||
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
|
||||
[TCA_GRED_MAX_P] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
struct tc_gred_qopt *ctl;
|
||||
struct nlattr *tb[TCA_GRED_MAX + 1];
|
||||
int err, prio = GRED_DEF_PRIO;
|
||||
u8 *stab;
|
||||
u32 max_P;
|
||||
struct gred_sched_data *prealloc;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)
|
||||
return gred_change_table_def(sch, opt);
|
||||
|
||||
if (tb[TCA_GRED_PARMS] == NULL ||
|
||||
tb[TCA_GRED_STAB] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
|
||||
|
||||
err = -EINVAL;
|
||||
ctl = nla_data(tb[TCA_GRED_PARMS]);
|
||||
stab = nla_data(tb[TCA_GRED_STAB]);
|
||||
|
||||
if (ctl->DP >= table->DPs)
|
||||
goto errout;
|
||||
|
||||
if (gred_rio_mode(table)) {
|
||||
if (ctl->prio == 0) {
|
||||
int def_prio = GRED_DEF_PRIO;
|
||||
|
||||
if (table->tab[table->def])
|
||||
def_prio = table->tab[table->def]->prio;
|
||||
|
||||
printk(KERN_DEBUG "GRED: DP %u does not have a prio "
|
||||
"setting default to %d\n", ctl->DP, def_prio);
|
||||
|
||||
prio = def_prio;
|
||||
} else
|
||||
prio = ctl->prio;
|
||||
}
|
||||
|
||||
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
|
||||
sch_tree_lock(sch);
|
||||
|
||||
err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
|
||||
if (err < 0)
|
||||
goto errout_locked;
|
||||
|
||||
if (gred_rio_mode(table)) {
|
||||
gred_disable_wred_mode(table);
|
||||
if (gred_wred_mode_check(sch))
|
||||
gred_enable_wred_mode(table);
|
||||
}
|
||||
|
||||
err = 0;
|
||||
|
||||
errout_locked:
|
||||
sch_tree_unlock(sch);
|
||||
kfree(prealloc);
|
||||
errout:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int gred_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct nlattr *tb[TCA_GRED_MAX + 1];
|
||||
int err;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
|
||||
return -EINVAL;
|
||||
|
||||
return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
|
||||
}
|
||||
|
||||
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
struct nlattr *parms, *opts = NULL;
|
||||
int i;
|
||||
u32 max_p[MAX_DPs];
|
||||
struct tc_gred_sopt sopt = {
|
||||
.DPs = table->DPs,
|
||||
.def_DP = table->def,
|
||||
.grio = gred_rio_mode(table),
|
||||
.flags = table->red_flags,
|
||||
};
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
|
||||
goto nla_put_failure;
|
||||
|
||||
for (i = 0; i < MAX_DPs; i++) {
|
||||
struct gred_sched_data *q = table->tab[i];
|
||||
|
||||
max_p[i] = q ? q->parms.max_P : 0;
|
||||
}
|
||||
if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
|
||||
goto nla_put_failure;
|
||||
|
||||
parms = nla_nest_start(skb, TCA_GRED_PARMS);
|
||||
if (parms == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
for (i = 0; i < MAX_DPs; i++) {
|
||||
struct gred_sched_data *q = table->tab[i];
|
||||
struct tc_gred_qopt opt;
|
||||
unsigned long qavg;
|
||||
|
||||
memset(&opt, 0, sizeof(opt));
|
||||
|
||||
if (!q) {
|
||||
/* hack -- fix at some point with proper message
|
||||
This is how we indicate to tc that there is no VQ
|
||||
at this DP */
|
||||
|
||||
opt.DP = MAX_DPs + i;
|
||||
goto append_opt;
|
||||
}
|
||||
|
||||
opt.limit = q->limit;
|
||||
opt.DP = q->DP;
|
||||
opt.backlog = q->backlog;
|
||||
opt.prio = q->prio;
|
||||
opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
|
||||
opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
|
||||
opt.Wlog = q->parms.Wlog;
|
||||
opt.Plog = q->parms.Plog;
|
||||
opt.Scell_log = q->parms.Scell_log;
|
||||
opt.other = q->stats.other;
|
||||
opt.early = q->stats.prob_drop;
|
||||
opt.forced = q->stats.forced_drop;
|
||||
opt.pdrop = q->stats.pdrop;
|
||||
opt.packets = q->packetsin;
|
||||
opt.bytesin = q->bytesin;
|
||||
|
||||
if (gred_wred_mode(table))
|
||||
gred_load_wred_set(table, q);
|
||||
|
||||
qavg = red_calc_qavg(&q->parms, &q->vars,
|
||||
q->vars.qavg >> q->parms.Wlog);
|
||||
opt.qave = qavg >> q->parms.Wlog;
|
||||
|
||||
append_opt:
|
||||
if (nla_append(skb, sizeof(opt), &opt) < 0)
|
||||
goto nla_put_failure;
|
||||
}
|
||||
|
||||
nla_nest_end(skb, parms);
|
||||
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, opts);
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static void gred_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct gred_sched *table = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < table->DPs; i++) {
|
||||
if (table->tab[i])
|
||||
gred_destroy_vq(table->tab[i]);
|
||||
}
|
||||
}
|
||||
|
||||
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
|
||||
.id = "gred",
|
||||
.priv_size = sizeof(struct gred_sched),
|
||||
.enqueue = gred_enqueue,
|
||||
.dequeue = gred_dequeue,
|
||||
.peek = qdisc_peek_head,
|
||||
.drop = gred_drop,
|
||||
.init = gred_init,
|
||||
.reset = gred_reset,
|
||||
.destroy = gred_destroy,
|
||||
.change = gred_change,
|
||||
.dump = gred_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init gred_module_init(void)
|
||||
{
|
||||
return register_qdisc(&gred_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit gred_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&gred_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(gred_module_init)
|
||||
module_exit(gred_module_exit)
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
1754
net/sched/sch_hfsc.c
Normal file
File diff suppressed because it is too large
740
net/sched/sch_hhf.c
Normal file
|
|
@@ -0,0 +1,740 @@
|
|||
/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
|
||||
*
|
||||
* Copyright (C) 2013 Terry Lam <vtlam@google.com>
|
||||
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
|
||||
*/
|
||||
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <net/flow_keys.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/* Heavy-Hitter Filter (HHF)
|
||||
*
|
||||
* Principles :
|
||||
* Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
|
||||
* buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
|
||||
* as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
|
||||
* The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
|
||||
* in which the heavy-hitter bucket is served with less weight.
|
||||
* In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
|
||||
* are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have
|
||||
* a higher share of bandwidth.
|
||||
*
|
||||
* To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
|
||||
* following paper:
|
||||
* [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
|
||||
* Accounting", in ACM SIGCOMM, 2002.
|
||||
*
|
||||
* Conceptually, a multi-stage filter comprises k independent hash functions
|
||||
* and k counter arrays. Packets are indexed into k counter arrays by k hash
|
||||
* functions, respectively. The counters are then increased by the packet sizes.
|
||||
* Therefore,
|
||||
* - For a heavy-hitter flow: *all* of its k array counters must be large.
|
||||
* - For a non-heavy-hitter flow: some of its k array counters can be large
|
||||
* due to hash collision with other small flows; however, with high
|
||||
* probability, not *all* k counters are large.
|
||||
*
|
||||
* By the design of the multi-stage filter algorithm, the false negative rate
|
||||
* (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
|
||||
* susceptible to false positives (non-heavy-hitters mistakenly classified as
|
||||
* heavy-hitters).
|
||||
* Therefore, we also implement the following optimizations to reduce false
|
||||
* positives by avoiding unnecessary increment of the counter values:
|
||||
* - Optimization O1: once a heavy-hitter is identified, its bytes are not
|
||||
* accounted in the array counters. This technique is called "shielding"
|
||||
* in Section 3.3.1 of [EV02].
|
||||
* - Optimization O2: conservative update of counters
|
||||
* (Section 3.3.2 of [EV02]),
|
||||
* New counter value = max {old counter value,
|
||||
* smallest counter value + packet bytes}
|
||||
*
|
||||
* Finally, we refresh the counters periodically since otherwise the counter
|
||||
* values will keep accumulating.
|
||||
*
|
||||
* Once a flow is classified as heavy-hitter, we also save its per-flow state
|
||||
* in an exact-matching flow table so that its subsequent packets can be
|
||||
* dispatched to the heavy-hitter bucket accordingly.
|
||||
*
|
||||
*
|
||||
* At a high level, this qdisc works as follows:
|
||||
* Given a packet p:
|
||||
* - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
|
||||
* heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
|
||||
* bucket.
|
||||
* - Otherwise, forward p to the multi-stage filter, denoted filter F
|
||||
* + If F decides that p belongs to a non-heavy-hitter flow, then send p
|
||||
* to the non-heavy-hitter bucket.
|
||||
* + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
|
||||
* then set up a new flow entry for the flow-id of p in the table T and
|
||||
* send p to the heavy-hitter bucket.
|
||||
*
|
||||
* In this implementation:
|
||||
* - T is a fixed-size hash-table with 1024 entries. Hash collision is
|
||||
* resolved by linked-list chaining.
|
||||
* - F has four counter arrays, each array containing 1024 32-bit counters.
|
||||
* That means 4 * 1024 * 32 bits = 16KB of memory.
|
||||
* - Since each array in F contains 1024 counters, 10 bits are sufficient to
|
||||
* index into each array.
|
||||
* Hence, instead of having four hash functions, we chop the 32-bit
|
||||
* skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
|
||||
* computed as XOR sum of those three chunks.
|
||||
* - We need to clear the counter arrays periodically; however, directly
|
||||
* memsetting 16KB of memory can lead to cache eviction and unwanted delay.
|
||||
* So by representing each counter by a valid bit, we only need to reset
|
||||
* 4K single-bit flags (i.e. 512 bytes) instead of 16KB of memory.
|
||||
* - The Deficit Round Robin engine is taken from fq_codel implementation
|
||||
* (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
|
||||
* fq_codel_flow in fq_codel implementation.
|
||||
*
|
||||
*/
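/*
 * As a concrete sketch of the index computation used below: a 32-bit
 * skb hash h is split into
 *   filter_pos[0] = h & 0x3FF
 *   filter_pos[1] = (h >> 10) & 0x3FF
 *   filter_pos[2] = (h >> 20) & 0x3FF
 *   filter_pos[3] = filter_pos[0] ^ filter_pos[1] ^ filter_pos[2] ^ (h >> 30)
 * so each of the four counter arrays gets its own 10-bit index.
 */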
|
||||
|
||||
/* Non-configurable parameters */
|
||||
#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
|
||||
#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
|
||||
#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
|
||||
#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
|
||||
#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
|
||||
|
||||
#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
|
||||
enum wdrr_bucket_idx {
|
||||
WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
|
||||
WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
|
||||
};
|
||||
|
||||
#define hhf_time_before(a, b) \
|
||||
(typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
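/*
 * For example, around a jiffies wrap-around, with a = 0xfffffff0 and
 * b = 0x00000010, (s32)(a - b) = (s32)0xffffffe0 < 0, so a is still
 * correctly treated as "before" b.
 */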
|
||||
|
||||
/* Heavy-hitter per-flow state */
|
||||
struct hh_flow_state {
|
||||
u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
|
||||
u32 hit_timestamp; /* last time heavy-hitter was seen */
|
||||
struct list_head flowchain; /* chaining under hash collision */
|
||||
};
|
||||
|
||||
/* Weighted Deficit Round Robin (WDRR) scheduler */
|
||||
struct wdrr_bucket {
|
||||
struct sk_buff *head;
|
||||
struct sk_buff *tail;
|
||||
struct list_head bucketchain;
|
||||
int deficit;
|
||||
};
|
||||
|
||||
struct hhf_sched_data {
|
||||
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
|
||||
u32 perturbation; /* hash perturbation */
|
||||
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
|
||||
u32 drop_overlimit; /* number of times max qdisc packet
|
||||
* limit was hit
|
||||
*/
|
||||
struct list_head *hh_flows; /* table T (currently active HHs) */
|
||||
u32 hh_flows_limit; /* max active HH allocs */
|
||||
u32 hh_flows_overlimit; /* num of disallowed HH allocs */
|
||||
u32 hh_flows_total_cnt; /* total admitted HHs */
|
||||
u32 hh_flows_current_cnt; /* total current HHs */
|
||||
u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
|
||||
u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
|
||||
* was reset
|
||||
*/
|
||||
unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
|
||||
* of hhf_arrays
|
||||
*/
|
||||
/* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
|
||||
struct list_head new_buckets; /* list of new buckets */
|
||||
struct list_head old_buckets; /* list of old buckets */
|
||||
|
||||
/* Configurable HHF parameters */
|
||||
u32 hhf_reset_timeout; /* interval to reset counter
|
||||
* arrays in filter F
|
||||
* (default 40ms)
|
||||
*/
|
||||
u32 hhf_admit_bytes; /* counter thresh to classify as
|
||||
* HH (default 128KB).
|
||||
* With these default values,
|
||||
* 128KB / 40ms = 25 Mbps
|
||||
* i.e., we expect to capture HHs
|
||||
* sending > 25 Mbps.
|
||||
*/
|
||||
u32 hhf_evict_timeout; /* aging threshold to evict idle
|
||||
* HHs out of table T. This should
|
||||
* be large enough to avoid
|
||||
* reordering during HH eviction.
|
||||
* (default 1s)
|
||||
*/
|
||||
u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
|
||||
* (default 2,
|
||||
* i.e., non-HH : HH = 2 : 1)
|
||||
*/
|
||||
};
|
||||
|
||||
static u32 hhf_time_stamp(void)
|
||||
{
|
||||
return jiffies;
|
||||
}
|
||||
|
||||
static unsigned int skb_hash(const struct hhf_sched_data *q,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
struct flow_keys keys;
|
||||
unsigned int hash;
|
||||
|
||||
if (skb->sk && skb->sk->sk_hash)
|
||||
return skb->sk->sk_hash;
|
||||
|
||||
skb_flow_dissect(skb, &keys);
|
||||
hash = jhash_3words((__force u32)keys.dst,
|
||||
(__force u32)keys.src ^ keys.ip_proto,
|
||||
(__force u32)keys.ports, q->perturbation);
|
||||
return hash;
|
||||
}
|
||||
|
||||
/* Looks up a heavy-hitter flow in a chaining list of table T. */
|
||||
static struct hh_flow_state *seek_list(const u32 hash,
|
||||
struct list_head *head,
|
||||
struct hhf_sched_data *q)
|
||||
{
|
||||
struct hh_flow_state *flow, *next;
|
||||
u32 now = hhf_time_stamp();
|
||||
|
||||
if (list_empty(head))
|
||||
return NULL;
|
||||
|
||||
list_for_each_entry_safe(flow, next, head, flowchain) {
|
||||
u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
|
||||
|
||||
if (hhf_time_before(prev, now)) {
|
||||
/* Delete expired heavy-hitters, but preserve one entry
|
||||
* to avoid a kzalloc() the next time this slot is hit.
|
||||
*/
|
||||
if (list_is_last(&flow->flowchain, head))
|
||||
return NULL;
|
||||
list_del(&flow->flowchain);
|
||||
kfree(flow);
|
||||
q->hh_flows_current_cnt--;
|
||||
} else if (flow->hash_id == hash) {
|
||||
return flow;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
|
||||
* entry or dynamically allocates a new entry.
|
||||
*/
|
||||
static struct hh_flow_state *alloc_new_hh(struct list_head *head,
|
||||
struct hhf_sched_data *q)
|
||||
{
|
||||
struct hh_flow_state *flow;
|
||||
u32 now = hhf_time_stamp();
|
||||
|
||||
if (!list_empty(head)) {
|
||||
/* Find an expired heavy-hitter flow entry. */
|
||||
list_for_each_entry(flow, head, flowchain) {
|
||||
u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
|
||||
|
||||
if (hhf_time_before(prev, now))
|
||||
return flow;
|
||||
}
|
||||
}
|
||||
|
||||
if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
|
||||
q->hh_flows_overlimit++;
|
||||
return NULL;
|
||||
}
|
||||
/* Create new entry. */
|
||||
flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
q->hh_flows_current_cnt++;
|
||||
INIT_LIST_HEAD(&flow->flowchain);
|
||||
list_add_tail(&flow->flowchain, head);
|
||||
|
||||
return flow;
|
||||
}
|
||||
|
||||
/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
|
||||
* classify heavy-hitters.
|
||||
*/
|
||||
static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
u32 tmp_hash, hash;
|
||||
u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
|
||||
struct hh_flow_state *flow;
|
||||
u32 pkt_len, min_hhf_val;
|
||||
int i;
|
||||
u32 prev;
|
||||
u32 now = hhf_time_stamp();
|
||||
|
||||
/* Reset the HHF counter arrays if this is the right time. */
|
||||
prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
|
||||
if (hhf_time_before(prev, now)) {
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++)
|
||||
bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
|
||||
q->hhf_arrays_reset_timestamp = now;
|
||||
}
|
||||
|
||||
/* Get hashed flow-id of the skb. */
|
||||
hash = skb_hash(q, skb);
|
||||
|
||||
/* Check if this packet belongs to an already established HH flow. */
|
||||
flow_pos = hash & HHF_BIT_MASK;
|
||||
flow = seek_list(hash, &q->hh_flows[flow_pos], q);
|
||||
if (flow) { /* found its HH flow */
|
||||
flow->hit_timestamp = now;
|
||||
return WDRR_BUCKET_FOR_HH;
|
||||
}
|
||||
|
||||
/* Now pass the packet through the multi-stage filter. */
|
||||
tmp_hash = hash;
|
||||
xorsum = 0;
|
||||
for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
|
||||
/* Split the skb_hash into three 10-bit chunks. */
|
||||
filter_pos[i] = tmp_hash & HHF_BIT_MASK;
|
||||
xorsum ^= filter_pos[i];
|
||||
tmp_hash >>= HHF_BIT_MASK_LEN;
|
||||
}
|
||||
/* The last chunk is computed as XOR sum of other chunks. */
|
||||
filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
|
||||
|
||||
pkt_len = qdisc_pkt_len(skb);
|
||||
min_hhf_val = ~0U;
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
|
||||
u32 val;
|
||||
|
||||
if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
|
||||
q->hhf_arrays[i][filter_pos[i]] = 0;
|
||||
__set_bit(filter_pos[i], q->hhf_valid_bits[i]);
|
||||
}
|
||||
|
||||
val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
|
||||
if (min_hhf_val > val)
|
||||
min_hhf_val = val;
|
||||
}
|
||||
|
||||
/* Found a new HH iff all counter values > HH admit threshold. */
|
||||
if (min_hhf_val > q->hhf_admit_bytes) {
|
||||
/* Just captured a new heavy-hitter. */
|
||||
flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
|
||||
if (!flow) /* memory alloc problem */
|
||||
return WDRR_BUCKET_FOR_NON_HH;
|
||||
flow->hash_id = hash;
|
||||
flow->hit_timestamp = now;
|
||||
q->hh_flows_total_cnt++;
|
||||
|
||||
/* By returning without updating counters in q->hhf_arrays,
|
||||
* we implicitly implement "shielding" (see Optimization O1).
|
||||
*/
|
||||
return WDRR_BUCKET_FOR_HH;
|
||||
}
|
||||
|
||||
/* Conservative update of HHF arrays (see Optimization O2). */
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
|
||||
if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
|
||||
q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
|
||||
}
|
||||
return WDRR_BUCKET_FOR_NON_HH;
|
||||
}
|
||||
|
||||
/* Removes one skb from head of bucket. */
|
||||
static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
|
||||
{
|
||||
struct sk_buff *skb = bucket->head;
|
||||
|
||||
bucket->head = skb->next;
|
||||
skb->next = NULL;
|
||||
return skb;
|
||||
}
|
||||
|
||||
/* Tail-adds skb to bucket. */
|
||||
static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
|
||||
{
|
||||
if (bucket->head == NULL)
|
||||
bucket->head = skb;
|
||||
else
|
||||
bucket->tail->next = skb;
|
||||
bucket->tail = skb;
|
||||
skb->next = NULL;
|
||||
}
|
||||
|
||||
static unsigned int hhf_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
struct wdrr_bucket *bucket;
|
||||
|
||||
/* Always try to drop from heavy-hitters first. */
|
||||
bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
|
||||
if (!bucket->head)
|
||||
bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
|
||||
|
||||
if (bucket->head) {
|
||||
struct sk_buff *skb = dequeue_head(bucket);
|
||||
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_drop(sch);
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
/* Return id of the bucket from which the packet was dropped. */
|
||||
return bucket - q->buckets;
|
||||
}
|
||||
|
||||
static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
enum wdrr_bucket_idx idx;
|
||||
struct wdrr_bucket *bucket;
|
||||
|
||||
idx = hhf_classify(skb, sch);
|
||||
|
||||
bucket = &q->buckets[idx];
|
||||
bucket_add(bucket, skb);
|
||||
qdisc_qstats_backlog_inc(sch, skb);
|
||||
|
||||
if (list_empty(&bucket->bucketchain)) {
|
||||
unsigned int weight;
|
||||
|
||||
/* The logic of new_buckets vs. old_buckets is the same as
|
||||
* new_flows vs. old_flows in the implementation of fq_codel,
|
||||
* i.e., short bursts of non-HHs should have strict priority.
|
||||
*/
|
||||
if (idx == WDRR_BUCKET_FOR_HH) {
|
||||
/* Always move heavy-hitters to old bucket. */
|
||||
weight = 1;
|
||||
list_add_tail(&bucket->bucketchain, &q->old_buckets);
|
||||
} else {
|
||||
weight = q->hhf_non_hh_weight;
|
||||
list_add_tail(&bucket->bucketchain, &q->new_buckets);
|
||||
}
|
||||
bucket->deficit = weight * q->quantum;
|
||||
}
|
||||
if (++sch->q.qlen <= sch->limit)
|
||||
return NET_XMIT_SUCCESS;
|
||||
|
||||
q->drop_overlimit++;
|
||||
/* Return Congestion Notification only if we dropped a packet from this
|
||||
* bucket.
|
||||
*/
|
||||
if (hhf_drop(sch) == idx)
|
||||
return NET_XMIT_CN;
|
||||
|
||||
/* As we dropped a packet, better let the upper stack know this. */
|
||||
qdisc_tree_decrease_qlen(sch, 1);
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb = NULL;
|
||||
struct wdrr_bucket *bucket;
|
||||
struct list_head *head;
|
||||
|
||||
begin:
|
||||
head = &q->new_buckets;
|
||||
if (list_empty(head)) {
|
||||
head = &q->old_buckets;
|
||||
if (list_empty(head))
|
||||
return NULL;
|
||||
}
|
||||
bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
|
||||
|
||||
if (bucket->deficit <= 0) {
|
||||
int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
|
||||
1 : q->hhf_non_hh_weight;
|
||||
|
||||
bucket->deficit += weight * q->quantum;
|
||||
list_move_tail(&bucket->bucketchain, &q->old_buckets);
|
||||
goto begin;
|
||||
}
|
||||
|
||||
if (bucket->head) {
|
||||
skb = dequeue_head(bucket);
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
}
|
||||
|
||||
if (!skb) {
|
||||
/* Force a pass through old_buckets to prevent starvation. */
|
||||
if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
|
||||
list_move_tail(&bucket->bucketchain, &q->old_buckets);
|
||||
else
|
||||
list_del_init(&bucket->bucketchain);
|
||||
goto begin;
|
||||
}
|
||||
qdisc_bstats_update(sch, skb);
|
||||
bucket->deficit -= qdisc_pkt_len(skb);
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
static void hhf_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
|
||||
while ((skb = hhf_dequeue(sch)) != NULL)
|
||||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
static void *hhf_zalloc(size_t sz)
|
||||
{
|
||||
void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
|
||||
|
||||
if (!ptr)
|
||||
ptr = vzalloc(sz);
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static void hhf_free(void *addr)
|
||||
{
|
||||
kvfree(addr);
|
||||
}
|
||||
|
||||
static void hhf_destroy(struct Qdisc *sch)
|
||||
{
|
||||
int i;
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
|
||||
hhf_free(q->hhf_arrays[i]);
|
||||
hhf_free(q->hhf_valid_bits[i]);
|
||||
}
|
||||
|
||||
for (i = 0; i < HH_FLOWS_CNT; i++) {
|
||||
struct hh_flow_state *flow, *next;
|
||||
struct list_head *head = &q->hh_flows[i];
|
||||
|
||||
if (list_empty(head))
|
||||
continue;
|
||||
list_for_each_entry_safe(flow, next, head, flowchain) {
|
||||
list_del(&flow->flowchain);
|
||||
kfree(flow);
|
||||
}
|
||||
}
|
||||
hhf_free(q->hh_flows);
|
||||
}
|
||||
|
||||
static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
|
||||
[TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
|
||||
[TCA_HHF_QUANTUM] = { .type = NLA_U32 },
|
||||
[TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
|
||||
[TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
|
||||
[TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
|
||||
[TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
|
||||
[TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_HHF_MAX + 1];
|
||||
unsigned int qlen;
|
||||
int err;
|
||||
u64 non_hh_quantum;
|
||||
u32 new_quantum = q->quantum;
|
||||
u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
|
||||
|
||||
if (!opt)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_HHF_QUANTUM])
|
||||
new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
|
||||
|
||||
if (tb[TCA_HHF_NON_HH_WEIGHT])
|
||||
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
|
||||
|
||||
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
|
||||
if (non_hh_quantum > INT_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
|
||||
if (tb[TCA_HHF_BACKLOG_LIMIT])
|
||||
sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
|
||||
|
||||
q->quantum = new_quantum;
|
||||
q->hhf_non_hh_weight = new_hhf_non_hh_weight;
|
||||
|
||||
if (tb[TCA_HHF_HH_FLOWS_LIMIT])
|
||||
q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
|
||||
|
||||
if (tb[TCA_HHF_RESET_TIMEOUT]) {
|
||||
u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
|
||||
|
||||
q->hhf_reset_timeout = usecs_to_jiffies(us);
|
||||
}
|
||||
|
||||
if (tb[TCA_HHF_ADMIT_BYTES])
|
||||
q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
|
||||
|
||||
if (tb[TCA_HHF_EVICT_TIMEOUT]) {
|
||||
u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
|
||||
|
||||
q->hhf_evict_timeout = usecs_to_jiffies(us);
|
||||
}
|
||||
|
||||
qlen = sch->q.qlen;
|
||||
while (sch->q.qlen > sch->limit) {
|
||||
struct sk_buff *skb = hhf_dequeue(sch);
|
||||
|
||||
kfree_skb(skb);
|
||||
}
|
||||
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
sch->limit = 1000;
|
||||
q->quantum = psched_mtu(qdisc_dev(sch));
|
||||
q->perturbation = prandom_u32();
|
||||
INIT_LIST_HEAD(&q->new_buckets);
|
||||
INIT_LIST_HEAD(&q->old_buckets);
|
||||
|
||||
/* Configurable HHF parameters */
|
||||
q->hhf_reset_timeout = HZ / 25; /* 40 ms */
|
||||
q->hhf_admit_bytes = 131072; /* 128 KB */
|
||||
q->hhf_evict_timeout = HZ; /* 1 sec */
|
||||
q->hhf_non_hh_weight = 2;
|
||||
|
||||
if (opt) {
|
||||
int err = hhf_change(sch, opt);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
if (!q->hh_flows) {
|
||||
/* Initialize heavy-hitter flow table. */
|
||||
q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
|
||||
sizeof(struct list_head));
|
||||
if (!q->hh_flows)
|
||||
return -ENOMEM;
|
||||
for (i = 0; i < HH_FLOWS_CNT; i++)
|
||||
INIT_LIST_HEAD(&q->hh_flows[i]);
|
||||
|
||||
/* Cap max active HHs at twice len of hh_flows table. */
|
||||
q->hh_flows_limit = 2 * HH_FLOWS_CNT;
|
||||
q->hh_flows_overlimit = 0;
|
||||
q->hh_flows_total_cnt = 0;
|
||||
q->hh_flows_current_cnt = 0;
|
||||
|
||||
/* Initialize heavy-hitter filter arrays. */
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
|
||||
q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
|
||||
sizeof(u32));
|
||||
if (!q->hhf_arrays[i]) {
|
||||
hhf_destroy(sch);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
q->hhf_arrays_reset_timestamp = hhf_time_stamp();
|
||||
|
||||
/* Initialize valid bits of heavy-hitter filter arrays. */
|
||||
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
|
||||
q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
|
||||
BITS_PER_BYTE);
|
||||
if (!q->hhf_valid_bits[i]) {
|
||||
hhf_destroy(sch);
|
||||
return -ENOMEM;
|
||||
}
|
||||
}
|
||||
|
||||
/* Initialize Weighted DRR buckets. */
|
||||
for (i = 0; i < WDRR_BUCKET_CNT; i++) {
|
||||
struct wdrr_bucket *bucket = q->buckets + i;
|
||||
|
||||
INIT_LIST_HEAD(&bucket->bucketchain);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts;
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
|
||||
nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
|
||||
nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
|
||||
nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
|
||||
jiffies_to_usecs(q->hhf_reset_timeout)) ||
|
||||
nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
|
||||
nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
|
||||
jiffies_to_usecs(q->hhf_evict_timeout)) ||
|
||||
nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
|
||||
goto nla_put_failure;
|
||||
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct hhf_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_hhf_xstats st = {
|
||||
.drop_overlimit = q->drop_overlimit,
|
||||
.hh_overlimit = q->hh_flows_overlimit,
|
||||
.hh_tot_count = q->hh_flows_total_cnt,
|
||||
.hh_cur_count = q->hh_flows_current_cnt,
|
||||
};
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
|
||||
.id = "hhf",
|
||||
.priv_size = sizeof(struct hhf_sched_data),
|
||||
|
||||
.enqueue = hhf_enqueue,
|
||||
.dequeue = hhf_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.drop = hhf_drop,
|
||||
.init = hhf_init,
|
||||
.reset = hhf_reset,
|
||||
.destroy = hhf_destroy,
|
||||
.change = hhf_change,
|
||||
.dump = hhf_dump,
|
||||
.dump_stats = hhf_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init hhf_module_init(void)
|
||||
{
|
||||
return register_qdisc(&hhf_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit hhf_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&hhf_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(hhf_module_init)
|
||||
module_exit(hhf_module_exit)
|
||||
MODULE_AUTHOR("Terry Lam");
|
||||
MODULE_AUTHOR("Nandita Dukkipati");
|
||||
MODULE_LICENSE("GPL");
|
||||
1630
net/sched/sch_htb.c
Normal file
File diff suppressed because it is too large
144
net/sched/sch_ingress.c
Normal file
@ -0,0 +1,144 @@
/* net/sched/sch_ingress.c - Ingress qdisc
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors: Jamal Hadi Salim 1999
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


struct ingress_qdisc_data {
	struct tcf_proto __rcu *filter_list;
};

/* ------------------------- Class/flow operations ------------------------- */

static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
{
	return TC_H_MIN(classid) + 1;
}

static unsigned long ingress_bind_filter(struct Qdisc *sch,
					 unsigned long parent, u32 classid)
{
	return ingress_get(sch, classid);
}

static void ingress_put(struct Qdisc *sch, unsigned long cl)
{
}

static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
}

static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch,
						 unsigned long cl)
{
	struct ingress_qdisc_data *p = qdisc_priv(sch);

	return &p->filter_list;
}

/* --------------------------- Qdisc operations ---------------------------- */

static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct ingress_qdisc_data *p = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
	int result;

	result = tc_classify(skb, fl, &res);

	qdisc_bstats_update(sch, skb);
	switch (result) {
	case TC_ACT_SHOT:
		result = TC_ACT_SHOT;
		qdisc_qstats_drop(sch);
		break;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		result = TC_ACT_STOLEN;
		break;
	case TC_ACT_RECLASSIFY:
	case TC_ACT_OK:
		skb->tc_index = TC_H_MIN(res.classid);
	default:
		result = TC_ACT_OK;
		break;
	}

	return result;
}

/* ------------------------------------------------------------- */

static void ingress_destroy(struct Qdisc *sch)
{
	struct ingress_qdisc_data *p = qdisc_priv(sch);

	tcf_destroy_chain(&p->filter_list);
}

static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static const struct Qdisc_class_ops ingress_class_ops = {
	.leaf = ingress_leaf,
	.get = ingress_get,
	.put = ingress_put,
	.walk = ingress_walk,
	.tcf_chain = ingress_find_tcf,
	.bind_tcf = ingress_bind_filter,
	.unbind_tcf = ingress_put,
};

static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
	.cl_ops = &ingress_class_ops,
	.id = "ingress",
	.priv_size = sizeof(struct ingress_qdisc_data),
	.enqueue = ingress_enqueue,
	.destroy = ingress_destroy,
	.dump = ingress_dump,
	.owner = THIS_MODULE,
};

static int __init ingress_module_init(void)
{
	return register_qdisc(&ingress_qdisc_ops);
}

static void __exit ingress_module_exit(void)
{
	unregister_qdisc(&ingress_qdisc_ops);
}

module_init(ingress_module_init)
module_exit(ingress_module_exit)
MODULE_LICENSE("GPL");
246
net/sched/sch_mq.c
Normal file
@ -0,0 +1,246 @@
/*
 * net/sched/sch_mq.c Classful multiqueue dummy scheduler
 *
 * Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

struct mq_sched {
	struct Qdisc **qdiscs;
};

static void mq_destroy(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mq_sched *priv = qdisc_priv(sch);
	unsigned int ntx;

	if (!priv->qdiscs)
		return;
	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
		qdisc_destroy(priv->qdiscs[ntx]);
	kfree(priv->qdiscs);
}

static int mq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mq_sched *priv = qdisc_priv(sch);
	struct netdev_queue *dev_queue;
	struct Qdisc *qdisc;
	unsigned int ntx;

	if (sch->parent != TC_H_ROOT)
		return -EOPNOTSUPP;

	if (!netif_is_multiqueue(dev))
		return -EOPNOTSUPP;

	/* pre-allocate qdiscs, attachment can't fail */
	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
			       GFP_KERNEL);
	if (priv->qdiscs == NULL)
		return -ENOMEM;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		dev_queue = netdev_get_tx_queue(dev, ntx);
		qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
					  TC_H_MAKE(TC_H_MAJ(sch->handle),
						    TC_H_MIN(ntx + 1)));
		if (qdisc == NULL)
			goto err;
		priv->qdiscs[ntx] = qdisc;
		qdisc->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->flags |= TCQ_F_MQROOT;
	return 0;

err:
	mq_destroy(sch);
	return -ENOMEM;
}

static void mq_attach(struct Qdisc *sch)
{
	struct net_device *dev = qdisc_dev(sch);
	struct mq_sched *priv = qdisc_priv(sch);
	struct Qdisc *qdisc, *old;
	unsigned int ntx;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		qdisc = priv->qdiscs[ntx];
		old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
		if (old)
			qdisc_destroy(old);
#ifdef CONFIG_NET_SCHED
		if (ntx < dev->real_num_tx_queues)
			qdisc_list_add(qdisc);
#endif

	}
	kfree(priv->qdiscs);
	priv->qdiscs = NULL;
}

static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *qdisc;
	unsigned int ntx;

	sch->q.qlen = 0;
	memset(&sch->bstats, 0, sizeof(sch->bstats));
	memset(&sch->qstats, 0, sizeof(sch->qstats));

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
		spin_lock_bh(qdisc_lock(qdisc));
		sch->q.qlen += qdisc->q.qlen;
		sch->bstats.bytes += qdisc->bstats.bytes;
		sch->bstats.packets += qdisc->bstats.packets;
		sch->qstats.backlog += qdisc->qstats.backlog;
		sch->qstats.drops += qdisc->qstats.drops;
		sch->qstats.requeues += qdisc->qstats.requeues;
		sch->qstats.overlimits += qdisc->qstats.overlimits;
		spin_unlock_bh(qdisc_lock(qdisc));
	}
	return 0;
}

static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned long ntx = cl - 1;

	if (ntx >= dev->num_tx_queues)
		return NULL;
	return netdev_get_tx_queue(dev, ntx);
}

static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
					    struct tcmsg *tcm)
{
	unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
	struct netdev_queue *dev_queue = mq_queue_get(sch, ntx);

	if (!dev_queue) {
		struct net_device *dev = qdisc_dev(sch);

		return netdev_get_tx_queue(dev, 0);
	}
	return dev_queue;
}

static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
		    struct Qdisc **old)
{
	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
	struct net_device *dev = qdisc_dev(sch);

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	*old = dev_graft_qdisc(dev_queue, new);
	if (new)
		new->flags |= TCQ_F_ONETXQUEUE;
	if (dev->flags & IFF_UP)
		dev_activate(dev);
	return 0;
}

static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);

	return dev_queue->qdisc_sleeping;
}

static unsigned long mq_get(struct Qdisc *sch, u32 classid)
{
	unsigned int ntx = TC_H_MIN(classid);

	if (!mq_queue_get(sch, ntx))
		return 0;
	return ntx;
}

static void mq_put(struct Qdisc *sch, unsigned long cl)
{
}

static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
			 struct sk_buff *skb, struct tcmsg *tcm)
{
	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);

	tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle |= TC_H_MIN(cl);
	tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
	return 0;
}

static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
			       struct gnet_dump *d)
{
	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);

	sch = dev_queue->qdisc_sleeping;
	if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
		return -1;
	return 0;
}

static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct net_device *dev = qdisc_dev(sch);
	unsigned int ntx;

	if (arg->stop)
		return;

	arg->count = arg->skip;
	for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
		if (arg->fn(sch, ntx + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops mq_class_ops = {
	.select_queue = mq_select_queue,
	.graft = mq_graft,
	.leaf = mq_leaf,
	.get = mq_get,
	.put = mq_put,
	.walk = mq_walk,
	.dump = mq_dump_class,
	.dump_stats = mq_dump_class_stats,
};

struct Qdisc_ops mq_qdisc_ops __read_mostly = {
	.cl_ops = &mq_class_ops,
	.id = "mq",
	.priv_size = sizeof(struct mq_sched),
	.init = mq_init,
	.destroy = mq_destroy,
	.attach = mq_attach,
	.dump = mq_dump,
	.owner = THIS_MODULE,
};
428
net/sched/sch_mqprio.c
Normal file
@ -0,0 +1,428 @@
/*
|
||||
* net/sched/sch_mqprio.c
|
||||
*
|
||||
* Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 2 as published by the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/module.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/sch_generic.h>
|
||||
|
||||
struct mqprio_sched {
|
||||
struct Qdisc **qdiscs;
|
||||
int hw_owned;
|
||||
};
|
||||
|
||||
static void mqprio_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct mqprio_sched *priv = qdisc_priv(sch);
|
||||
unsigned int ntx;
|
||||
|
||||
if (priv->qdiscs) {
|
||||
for (ntx = 0;
|
||||
ntx < dev->num_tx_queues && priv->qdiscs[ntx];
|
||||
ntx++)
|
||||
qdisc_destroy(priv->qdiscs[ntx]);
|
||||
kfree(priv->qdiscs);
|
||||
}
|
||||
|
||||
if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
|
||||
dev->netdev_ops->ndo_setup_tc(dev, 0);
|
||||
else
|
||||
netdev_set_num_tc(dev, 0);
|
||||
}
|
||||
|
||||
static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
/* Verify num_tc is not out of max range */
|
||||
if (qopt->num_tc > TC_MAX_QUEUE)
|
||||
return -EINVAL;
|
||||
|
||||
/* Verify priority mapping uses valid tcs */
|
||||
for (i = 0; i < TC_BITMASK + 1; i++) {
|
||||
if (qopt->prio_tc_map[i] >= qopt->num_tc)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* net_device does not support requested operation */
|
||||
if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
|
||||
return -EINVAL;
|
||||
|
||||
/* if hw owned qcount and qoffset are taken from LLD so
|
||||
* no reason to verify them here
|
||||
*/
|
||||
if (qopt->hw)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < qopt->num_tc; i++) {
|
||||
unsigned int last = qopt->offset[i] + qopt->count[i];
|
||||
|
||||
/* Verify the queue count is in tx range being equal to the
|
||||
* real_num_tx_queues indicates the last queue is in use.
|
||||
*/
|
||||
if (qopt->offset[i] >= dev->real_num_tx_queues ||
|
||||
!qopt->count[i] ||
|
||||
last > dev->real_num_tx_queues)
|
||||
return -EINVAL;
|
||||
|
||||
/* Verify that the offset and counts do not overlap */
|
||||
for (j = i + 1; j < qopt->num_tc; j++) {
|
||||
if (last > qopt->offset[j])
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct mqprio_sched *priv = qdisc_priv(sch);
|
||||
struct netdev_queue *dev_queue;
|
||||
struct Qdisc *qdisc;
|
||||
int i, err = -EOPNOTSUPP;
|
||||
struct tc_mqprio_qopt *qopt = NULL;
|
||||
|
||||
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
|
||||
BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
|
||||
|
||||
if (sch->parent != TC_H_ROOT)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!netif_is_multiqueue(dev))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
if (!opt || nla_len(opt) < sizeof(*qopt))
|
||||
return -EINVAL;
|
||||
|
||||
qopt = nla_data(opt);
|
||||
if (mqprio_parse_opt(dev, qopt))
|
||||
return -EINVAL;
|
||||
|
||||
/* pre-allocate qdisc, attachment can't fail */
|
||||
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
|
||||
GFP_KERNEL);
|
||||
if (priv->qdiscs == NULL) {
|
||||
err = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
for (i = 0; i < dev->num_tx_queues; i++) {
|
||||
dev_queue = netdev_get_tx_queue(dev, i);
|
||||
qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
|
||||
TC_H_MAKE(TC_H_MAJ(sch->handle),
|
||||
TC_H_MIN(i + 1)));
|
||||
if (qdisc == NULL) {
|
||||
err = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
priv->qdiscs[i] = qdisc;
|
||||
qdisc->flags |= TCQ_F_ONETXQUEUE;
|
||||
}
|
||||
|
||||
/* If the mqprio options indicate that hardware should own
|
||||
* the queue mapping then run ndo_setup_tc otherwise use the
|
||||
* supplied and verified mapping
|
||||
*/
|
||||
if (qopt->hw) {
|
||||
priv->hw_owned = 1;
|
||||
err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
|
||||
if (err)
|
||||
goto err;
|
||||
} else {
|
||||
netdev_set_num_tc(dev, qopt->num_tc);
|
||||
for (i = 0; i < qopt->num_tc; i++)
|
||||
netdev_set_tc_queue(dev, i,
|
||||
qopt->count[i], qopt->offset[i]);
|
||||
}
|
||||
|
||||
/* Always use supplied priority mappings */
|
||||
for (i = 0; i < TC_BITMASK + 1; i++)
|
||||
netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
|
||||
|
||||
sch->flags |= TCQ_F_MQROOT;
|
||||
return 0;
|
||||
|
||||
err:
|
||||
mqprio_destroy(sch);
|
||||
return err;
|
||||
}
|
||||
|
||||
static void mqprio_attach(struct Qdisc *sch)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct mqprio_sched *priv = qdisc_priv(sch);
|
||||
struct Qdisc *qdisc, *old;
|
||||
unsigned int ntx;
|
||||
|
||||
/* Attach underlying qdisc */
|
||||
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
|
||||
qdisc = priv->qdiscs[ntx];
|
||||
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
|
||||
if (old)
|
||||
qdisc_destroy(old);
|
||||
if (ntx < dev->real_num_tx_queues)
|
||||
qdisc_list_add(qdisc);
|
||||
}
|
||||
kfree(priv->qdiscs);
|
||||
priv->qdiscs = NULL;
|
||||
}
|
||||
|
||||
static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
|
||||
|
||||
if (ntx >= dev->num_tx_queues)
|
||||
return NULL;
|
||||
return netdev_get_tx_queue(dev, ntx);
|
||||
}
|
||||
|
||||
static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
|
||||
|
||||
if (!dev_queue)
|
||||
return -EINVAL;
|
||||
|
||||
if (dev->flags & IFF_UP)
|
||||
dev_deactivate(dev);
|
||||
|
||||
*old = dev_graft_qdisc(dev_queue, new);
|
||||
|
||||
if (new)
|
||||
new->flags |= TCQ_F_ONETXQUEUE;
|
||||
|
||||
if (dev->flags & IFF_UP)
|
||||
dev_activate(dev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct mqprio_sched *priv = qdisc_priv(sch);
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tc_mqprio_qopt opt = { 0 };
|
||||
struct Qdisc *qdisc;
|
||||
unsigned int i;
|
||||
|
||||
sch->q.qlen = 0;
|
||||
memset(&sch->bstats, 0, sizeof(sch->bstats));
|
||||
memset(&sch->qstats, 0, sizeof(sch->qstats));
|
||||
|
||||
for (i = 0; i < dev->num_tx_queues; i++) {
|
||||
qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
|
||||
spin_lock_bh(qdisc_lock(qdisc));
|
||||
sch->q.qlen += qdisc->q.qlen;
|
||||
sch->bstats.bytes += qdisc->bstats.bytes;
|
||||
sch->bstats.packets += qdisc->bstats.packets;
|
||||
sch->qstats.backlog += qdisc->qstats.backlog;
|
||||
sch->qstats.drops += qdisc->qstats.drops;
|
||||
sch->qstats.requeues += qdisc->qstats.requeues;
|
||||
sch->qstats.overlimits += qdisc->qstats.overlimits;
|
||||
spin_unlock_bh(qdisc_lock(qdisc));
|
||||
}
|
||||
|
||||
opt.num_tc = netdev_get_num_tc(dev);
|
||||
memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
|
||||
opt.hw = priv->hw_owned;
|
||||
|
||||
for (i = 0; i < netdev_get_num_tc(dev); i++) {
|
||||
opt.count[i] = dev->tc_to_txq[i].count;
|
||||
opt.offset[i] = dev->tc_to_txq[i].offset;
|
||||
}
|
||||
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
|
||||
{
|
||||
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
|
||||
|
||||
if (!dev_queue)
|
||||
return NULL;
|
||||
|
||||
return dev_queue->qdisc_sleeping;
|
||||
}
|
||||
|
||||
static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
unsigned int ntx = TC_H_MIN(classid);
|
||||
|
||||
if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
|
||||
return 0;
|
||||
return ntx;
|
||||
}
|
||||
|
||||
static void mqprio_put(struct Qdisc *sch, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
|
||||
if (cl <= netdev_get_num_tc(dev)) {
|
||||
tcm->tcm_parent = TC_H_ROOT;
|
||||
tcm->tcm_info = 0;
|
||||
} else {
|
||||
int i;
|
||||
struct netdev_queue *dev_queue;
|
||||
|
||||
dev_queue = mqprio_queue_get(sch, cl);
|
||||
tcm->tcm_parent = 0;
|
||||
for (i = 0; i < netdev_get_num_tc(dev); i++) {
|
||||
struct netdev_tc_txq tc = dev->tc_to_txq[i];
|
||||
int q_idx = cl - netdev_get_num_tc(dev);
|
||||
|
||||
if (q_idx > tc.offset &&
|
||||
q_idx <= tc.offset + tc.count) {
|
||||
tcm->tcm_parent =
|
||||
TC_H_MAKE(TC_H_MAJ(sch->handle),
|
||||
TC_H_MIN(i + 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
|
||||
}
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct gnet_dump *d)
|
||||
__releases(d->lock)
|
||||
__acquires(d->lock)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
|
||||
if (cl <= netdev_get_num_tc(dev)) {
|
||||
int i;
|
||||
__u32 qlen = 0;
|
||||
struct Qdisc *qdisc;
|
||||
struct gnet_stats_queue qstats = {0};
|
||||
struct gnet_stats_basic_packed bstats = {0};
|
||||
struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
|
||||
|
||||
/* Drop lock here it will be reclaimed before touching
|
||||
* statistics this is required because the d->lock we
|
||||
* hold here is the look on dev_queue->qdisc_sleeping
|
||||
* also acquired below.
|
||||
*/
|
||||
spin_unlock_bh(d->lock);
|
||||
|
||||
for (i = tc.offset; i < tc.offset + tc.count; i++) {
|
||||
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
|
||||
|
||||
qdisc = rtnl_dereference(q->qdisc);
|
||||
spin_lock_bh(qdisc_lock(qdisc));
|
||||
qlen += qdisc->q.qlen;
|
||||
bstats.bytes += qdisc->bstats.bytes;
|
||||
bstats.packets += qdisc->bstats.packets;
|
||||
qstats.backlog += qdisc->qstats.backlog;
|
||||
qstats.drops += qdisc->qstats.drops;
|
||||
qstats.requeues += qdisc->qstats.requeues;
|
||||
qstats.overlimits += qdisc->qstats.overlimits;
|
||||
spin_unlock_bh(qdisc_lock(qdisc));
|
||||
}
|
||||
/* Reclaim root sleeping lock before completing stats */
|
||||
spin_lock_bh(d->lock);
|
||||
if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
|
||||
return -1;
|
||||
} else {
|
||||
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
|
||||
|
||||
sch = dev_queue->qdisc_sleeping;
|
||||
if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL,
|
||||
&sch->qstats, sch->q.qlen) < 0)
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
unsigned long ntx;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
/* Walk hierarchy with a virtual class per tc */
|
||||
arg->count = arg->skip;
|
||||
for (ntx = arg->skip;
|
||||
ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
|
||||
ntx++) {
|
||||
if (arg->fn(sch, ntx + 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops mqprio_class_ops = {
|
||||
.graft = mqprio_graft,
|
||||
.leaf = mqprio_leaf,
|
||||
.get = mqprio_get,
|
||||
.put = mqprio_put,
|
||||
.walk = mqprio_walk,
|
||||
.dump = mqprio_dump_class,
|
||||
.dump_stats = mqprio_dump_class_stats,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
|
||||
.cl_ops = &mqprio_class_ops,
|
||||
.id = "mqprio",
|
||||
.priv_size = sizeof(struct mqprio_sched),
|
||||
.init = mqprio_init,
|
||||
.destroy = mqprio_destroy,
|
||||
.attach = mqprio_attach,
|
||||
.dump = mqprio_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init mqprio_module_init(void)
|
||||
{
|
||||
return register_qdisc(&mqprio_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit mqprio_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&mqprio_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(mqprio_module_init);
|
||||
module_exit(mqprio_module_exit);
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
444
net/sched/sch_multiq.c
Normal file
@ -0,0 +1,444 @@
/*
|
||||
* Copyright (c) 2008, Intel Corporation.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with
|
||||
* this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
|
||||
struct multiq_sched_data {
|
||||
u16 bands;
|
||||
u16 max_bands;
|
||||
u16 curband;
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
struct Qdisc **queues;
|
||||
};
|
||||
|
||||
|
||||
static struct Qdisc *
|
||||
multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
u32 band;
|
||||
struct tcf_result res;
|
||||
struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
|
||||
int err;
|
||||
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
err = tc_classify(skb, fl, &res);
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (err) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
band = skb_get_queue_mapping(skb);
|
||||
|
||||
if (band >= q->bands)
|
||||
return q->queues[0];
|
||||
|
||||
return q->queues[band];
|
||||
}
|
||||
|
||||
static int
|
||||
multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct Qdisc *qdisc;
|
||||
int ret;
|
||||
|
||||
qdisc = multiq_classify(skb, sch, &ret);
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
if (qdisc == NULL) {
|
||||
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
ret = qdisc_enqueue(skb, qdisc);
|
||||
if (ret == NET_XMIT_SUCCESS) {
|
||||
sch->q.qlen++;
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
if (net_xmit_drop_count(ret))
|
||||
qdisc_qstats_drop(sch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *qdisc;
|
||||
struct sk_buff *skb;
|
||||
int band;
|
||||
|
||||
for (band = 0; band < q->bands; band++) {
|
||||
/* cycle through bands to ensure fairness */
|
||||
q->curband++;
|
||||
if (q->curband >= q->bands)
|
||||
q->curband = 0;
|
||||
|
||||
/* Check that target subqueue is available before
|
||||
* pulling an skb to avoid head-of-line blocking.
|
||||
*/
|
||||
if (!netif_xmit_stopped(
|
||||
netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
|
||||
qdisc = q->queues[q->curband];
|
||||
skb = qdisc->dequeue(qdisc);
|
||||
if (skb) {
|
||||
qdisc_bstats_update(sch, skb);
|
||||
sch->q.qlen--;
|
||||
return skb;
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
static struct sk_buff *multiq_peek(struct Qdisc *sch)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int curband = q->curband;
|
||||
struct Qdisc *qdisc;
|
||||
struct sk_buff *skb;
|
||||
int band;
|
||||
|
||||
for (band = 0; band < q->bands; band++) {
|
||||
/* cycle through bands to ensure fairness */
|
||||
curband++;
|
||||
if (curband >= q->bands)
|
||||
curband = 0;
|
||||
|
||||
/* Check that target subqueue is available before
|
||||
* pulling an skb to avoid head-of-line blocking.
|
||||
*/
|
||||
if (!netif_xmit_stopped(
|
||||
netdev_get_tx_queue(qdisc_dev(sch), curband))) {
|
||||
qdisc = q->queues[curband];
|
||||
skb = qdisc->ops->peek(qdisc);
|
||||
if (skb)
|
||||
return skb;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
static unsigned int multiq_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
int band;
|
||||
unsigned int len;
|
||||
struct Qdisc *qdisc;
|
||||
|
||||
for (band = q->bands - 1; band >= 0; band--) {
|
||||
qdisc = q->queues[band];
|
||||
if (qdisc->ops->drop) {
|
||||
len = qdisc->ops->drop(qdisc);
|
||||
if (len != 0) {
|
||||
sch->q.qlen--;
|
||||
return len;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
multiq_reset(struct Qdisc *sch)
|
||||
{
|
||||
u16 band;
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
for (band = 0; band < q->bands; band++)
|
||||
qdisc_reset(q->queues[band]);
|
||||
sch->q.qlen = 0;
|
||||
q->curband = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
multiq_destroy(struct Qdisc *sch)
|
||||
{
|
||||
int band;
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
for (band = 0; band < q->bands; band++)
|
||||
qdisc_destroy(q->queues[band]);
|
||||
|
||||
kfree(q->queues);
|
||||
}
|
||||
|
||||
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_multiq_qopt *qopt;
|
||||
int i;
|
||||
|
||||
if (!netif_is_multiqueue(qdisc_dev(sch)))
|
||||
return -EOPNOTSUPP;
|
||||
if (nla_len(opt) < sizeof(*qopt))
|
||||
return -EINVAL;
|
||||
|
||||
qopt = nla_data(opt);
|
||||
|
||||
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
q->bands = qopt->bands;
|
||||
for (i = q->bands; i < q->max_bands; i++) {
|
||||
if (q->queues[i] != &noop_qdisc) {
|
||||
struct Qdisc *child = q->queues[i];
|
||||
q->queues[i] = &noop_qdisc;
|
||||
qdisc_tree_decrease_qlen(child, child->q.qlen);
|
||||
qdisc_destroy(child);
|
||||
}
|
||||
}
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
for (i = 0; i < q->bands; i++) {
|
||||
if (q->queues[i] == &noop_qdisc) {
|
||||
struct Qdisc *child, *old;
|
||||
child = qdisc_create_dflt(sch->dev_queue,
|
||||
&pfifo_qdisc_ops,
|
||||
TC_H_MAKE(sch->handle,
|
||||
i + 1));
|
||||
if (child) {
|
||||
sch_tree_lock(sch);
|
||||
old = q->queues[i];
|
||||
q->queues[i] = child;
|
||||
|
||||
if (old != &noop_qdisc) {
|
||||
qdisc_tree_decrease_qlen(old,
|
||||
old->q.qlen);
|
||||
qdisc_destroy(old);
|
||||
}
|
||||
sch_tree_unlock(sch);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
int i, err;
|
||||
|
||||
q->queues = NULL;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
q->max_bands = qdisc_dev(sch)->num_tx_queues;
|
||||
|
||||
q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
|
||||
if (!q->queues)
|
||||
return -ENOBUFS;
|
||||
for (i = 0; i < q->max_bands; i++)
|
||||
q->queues[i] = &noop_qdisc;
|
||||
|
||||
err = multiq_tune(sch, opt);
|
||||
|
||||
if (err)
|
||||
kfree(q->queues);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tc_multiq_qopt opt;
|
||||
|
||||
opt.bands = q->bands;
|
||||
opt.max_bands = q->max_bands;
|
||||
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = arg - 1;
|
||||
|
||||
if (new == NULL)
|
||||
new = &noop_qdisc;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
*old = q->queues[band];
|
||||
q->queues[band] = new;
|
||||
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
|
||||
qdisc_reset(*old);
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *
|
||||
multiq_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = arg - 1;
|
||||
|
||||
return q->queues[band];
|
||||
}
|
||||
|
||||
static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = TC_H_MIN(classid);
|
||||
|
||||
if (band - 1 >= q->bands)
|
||||
return 0;
|
||||
return band;
|
||||
}
|
||||
|
||||
static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
|
||||
u32 classid)
|
||||
{
|
||||
return multiq_get(sch, classid);
|
||||
}
|
||||
|
||||
|
||||
static void multiq_put(struct Qdisc *q, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
tcm->tcm_info = q->queues[cl - 1]->handle;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *cl_q;
|
||||
|
||||
cl_q = q->queues[cl - 1];
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
int band;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (band = 0; band < q->bands; band++) {
|
||||
if (arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(sch, band + 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct multiq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops multiq_class_ops = {
|
||||
.graft = multiq_graft,
|
||||
.leaf = multiq_leaf,
|
||||
.get = multiq_get,
|
||||
.put = multiq_put,
|
||||
.walk = multiq_walk,
|
||||
.tcf_chain = multiq_find_tcf,
|
||||
.bind_tcf = multiq_bind,
|
||||
.unbind_tcf = multiq_put,
|
||||
.dump = multiq_dump_class,
|
||||
.dump_stats = multiq_dump_class_stats,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
|
||||
.next = NULL,
|
||||
.cl_ops = &multiq_class_ops,
|
||||
.id = "multiq",
|
||||
.priv_size = sizeof(struct multiq_sched_data),
|
||||
.enqueue = multiq_enqueue,
|
||||
.dequeue = multiq_dequeue,
|
||||
.peek = multiq_peek,
|
||||
.drop = multiq_drop,
|
||||
.init = multiq_init,
|
||||
.reset = multiq_reset,
|
||||
.destroy = multiq_destroy,
|
||||
.change = multiq_tune,
|
||||
.dump = multiq_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init multiq_module_init(void)
|
||||
{
|
||||
return register_qdisc(&multiq_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit multiq_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&multiq_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(multiq_module_init)
|
||||
module_exit(multiq_module_exit)
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
1128
net/sched/sch_netem.c
Normal file
File diff suppressed because it is too large
566
net/sched/sch_pie.c
Normal file
@ -0,0 +1,566 @@
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version 2
|
||||
* of the License.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* Author: Vijay Subramanian <vijaynsu@cisco.com>
|
||||
* Author: Mythili Prabhu <mysuryan@cisco.com>
|
||||
*
|
||||
* ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
|
||||
* University of Oslo, Norway.
|
||||
*
|
||||
* References:
|
||||
* IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
|
||||
* IEEE Conference on High Performance Switching and Routing 2013 :
|
||||
* "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/inet_ecn.h>
|
||||
|
||||
#define QUEUE_THRESHOLD 10000
|
||||
#define DQCOUNT_INVALID -1
|
||||
#define MAX_PROB 0xffffffff
|
||||
#define PIE_SCALE 8
|
||||
|
||||
/* parameters used */
|
||||
struct pie_params {
|
||||
psched_time_t target; /* user specified target delay in pschedtime */
|
||||
u32 tupdate; /* timer frequency (in jiffies) */
|
||||
u32 limit; /* number of packets that can be enqueued */
|
||||
u32 alpha; /* alpha and beta are between 0 and 32 */
|
||||
u32 beta; /* and are used for shift relative to 1 */
|
||||
bool ecn; /* true if ecn is enabled */
|
||||
bool bytemode; /* to scale drop early prob based on pkt size */
|
||||
};
|
||||
|
||||
/* variables used */
|
||||
struct pie_vars {
|
||||
u32 prob; /* probability but scaled by u32 limit. */
|
||||
psched_time_t burst_time;
|
||||
psched_time_t qdelay;
|
||||
psched_time_t qdelay_old;
|
||||
u64 dq_count; /* measured in bytes */
|
||||
psched_time_t dq_tstamp; /* drain rate */
|
||||
u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */
|
||||
u32 qlen_old; /* in bytes */
|
||||
};
|
||||
|
||||
/* statistics gathering */
|
||||
struct pie_stats {
|
||||
u32 packets_in; /* total number of packets enqueued */
|
||||
u32 dropped; /* packets dropped due to pie_action */
|
||||
u32 overlimit; /* dropped due to lack of space in queue */
|
||||
u32 maxq; /* maximum queue size */
|
||||
u32 ecn_mark; /* packets marked with ECN */
|
||||
};
|
||||
|
||||
/* private data for the Qdisc */
|
||||
struct pie_sched_data {
|
||||
struct pie_params params;
|
||||
struct pie_vars vars;
|
||||
struct pie_stats stats;
|
||||
struct timer_list adapt_timer;
|
||||
};
|
||||
|
||||
static void pie_params_init(struct pie_params *params)
|
||||
{
|
||||
params->alpha = 2;
|
||||
params->beta = 20;
|
||||
params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
|
||||
params->limit = 1000; /* default of 1000 packets */
|
||||
params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
|
||||
params->ecn = false;
|
||||
params->bytemode = false;
|
||||
}
|
||||
|
||||
static void pie_vars_init(struct pie_vars *vars)
|
||||
{
|
||||
vars->dq_count = DQCOUNT_INVALID;
|
||||
vars->avg_dq_rate = 0;
|
||||
/* default of 100 ms in pschedtime */
|
||||
vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
|
||||
}
|
||||
|
||||
static bool drop_early(struct Qdisc *sch, u32 packet_size)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
u32 rnd;
|
||||
u32 local_prob = q->vars.prob;
|
||||
u32 mtu = psched_mtu(qdisc_dev(sch));
|
||||
|
||||
/* If there is still burst allowance left skip random early drop */
|
||||
if (q->vars.burst_time > 0)
|
||||
return false;
|
||||
|
||||
/* If current delay is less than half of target, and
|
||||
* if drop prob is low already, disable early_drop
|
||||
*/
|
||||
if ((q->vars.qdelay < q->params.target / 2)
|
||||
&& (q->vars.prob < MAX_PROB / 5))
|
||||
return false;
|
||||
|
||||
/* If we have fewer than 2 mtu-sized packets, disable drop_early,
|
||||
* similar to min_th in RED
|
||||
*/
|
||||
if (sch->qstats.backlog < 2 * mtu)
|
||||
return false;
|
||||
|
||||
/* If bytemode is turned on, use packet size to compute new
|
||||
* probablity. Smaller packets will have lower drop prob in this case
|
||||
*/
|
||||
if (q->params.bytemode && packet_size <= mtu)
|
||||
local_prob = (local_prob / mtu) * packet_size;
|
||||
else
|
||||
local_prob = q->vars.prob;
|
||||
|
||||
rnd = prandom_u32();
|
||||
if (rnd < local_prob)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
bool enqueue = false;
|
||||
|
||||
if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
|
||||
q->stats.overlimit++;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!drop_early(sch, skb->len)) {
|
||||
enqueue = true;
|
||||
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
|
||||
INET_ECN_set_ce(skb)) {
|
||||
/* If packet is ecn capable, mark it if drop probability
|
||||
* is lower than 10%, else drop it.
|
||||
*/
|
||||
q->stats.ecn_mark++;
|
||||
enqueue = true;
|
||||
}
|
||||
|
||||
/* we can enqueue the packet */
|
||||
if (enqueue) {
|
||||
q->stats.packets_in++;
|
||||
if (qdisc_qlen(sch) > q->stats.maxq)
|
||||
q->stats.maxq = qdisc_qlen(sch);
|
||||
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
}
|
||||
|
||||
out:
|
||||
q->stats.dropped++;
|
||||
return qdisc_drop(skb, sch);
|
||||
}
|
||||
|
||||
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
|
||||
[TCA_PIE_TARGET] = {.type = NLA_U32},
|
||||
[TCA_PIE_LIMIT] = {.type = NLA_U32},
|
||||
[TCA_PIE_TUPDATE] = {.type = NLA_U32},
|
||||
[TCA_PIE_ALPHA] = {.type = NLA_U32},
|
||||
[TCA_PIE_BETA] = {.type = NLA_U32},
|
||||
[TCA_PIE_ECN] = {.type = NLA_U32},
|
||||
[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
|
||||
};
|
||||
|
||||
static int pie_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_PIE_MAX + 1];
|
||||
unsigned int qlen;
|
||||
int err;
|
||||
|
||||
if (!opt)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
|
||||
/* convert from microseconds to pschedtime */
|
||||
if (tb[TCA_PIE_TARGET]) {
|
||||
/* target is in us */
|
||||
u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
|
||||
|
||||
/* convert to pschedtime */
|
||||
q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
|
||||
}
|
||||
|
||||
/* tupdate is in jiffies */
|
||||
if (tb[TCA_PIE_TUPDATE])
|
||||
q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
|
||||
|
||||
if (tb[TCA_PIE_LIMIT]) {
|
||||
u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
|
||||
|
||||
q->params.limit = limit;
|
||||
sch->limit = limit;
|
||||
}
|
||||
|
||||
if (tb[TCA_PIE_ALPHA])
|
||||
q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
|
||||
|
||||
if (tb[TCA_PIE_BETA])
|
||||
q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
|
||||
|
||||
if (tb[TCA_PIE_ECN])
|
||||
q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
|
||||
|
||||
if (tb[TCA_PIE_BYTEMODE])
|
||||
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
|
||||
|
||||
/* Drop excess packets if new limit is lower */
|
||||
qlen = sch->q.qlen;
|
||||
while (sch->q.qlen > sch->limit) {
|
||||
struct sk_buff *skb = __skb_dequeue(&sch->q);
|
||||
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
qdisc_drop(skb, sch);
|
||||
}
|
||||
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
int qlen = sch->qstats.backlog; /* current queue size in bytes */
|
||||
|
||||
/* If current queue is about 10 packets or more and dq_count is unset
|
||||
* we have enough packets to calculate the drain rate. Save
|
||||
* current time as dq_tstamp and start measurement cycle.
|
||||
*/
|
||||
if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
|
||||
q->vars.dq_tstamp = psched_get_time();
|
||||
q->vars.dq_count = 0;
|
||||
}
|
||||
|
||||
/* Calculate the average drain rate from this value. If queue length
|
||||
* has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset
|
||||
* the dq_count to -1 as we don't have enough packets to calculate the
|
||||
* drain rate anymore The following if block is entered only when we
|
||||
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
|
||||
* and we calculate the drain rate for the threshold here. dq_count is
|
||||
* in bytes, time difference in psched_time, hence rate is in
|
||||
* bytes/psched_time.
|
||||
*/
|
||||
if (q->vars.dq_count != DQCOUNT_INVALID) {
|
||||
q->vars.dq_count += skb->len;
|
||||
|
||||
if (q->vars.dq_count >= QUEUE_THRESHOLD) {
|
||||
psched_time_t now = psched_get_time();
|
||||
u32 dtime = now - q->vars.dq_tstamp;
|
||||
u32 count = q->vars.dq_count << PIE_SCALE;
|
||||
|
||||
if (dtime == 0)
|
||||
return;
|
||||
|
||||
count = count / dtime;
|
||||
|
||||
if (q->vars.avg_dq_rate == 0)
|
||||
q->vars.avg_dq_rate = count;
|
||||
else
|
||||
q->vars.avg_dq_rate =
|
||||
(q->vars.avg_dq_rate -
|
||||
(q->vars.avg_dq_rate >> 3)) + (count >> 3);
|
||||
|
||||
/* If the queue has receded below the threshold, we hold
|
||||
* on to the last drain rate calculated, else we reset
|
||||
* dq_count to 0 to re-enter the if block when the next
|
||||
* packet is dequeued
|
||||
*/
|
||||
if (qlen < QUEUE_THRESHOLD)
|
||||
q->vars.dq_count = DQCOUNT_INVALID;
|
||||
else {
|
||||
q->vars.dq_count = 0;
|
||||
q->vars.dq_tstamp = psched_get_time();
|
||||
}
|
||||
|
||||
if (q->vars.burst_time > 0) {
|
||||
if (q->vars.burst_time > dtime)
|
||||
q->vars.burst_time -= dtime;
|
||||
else
|
||||
q->vars.burst_time = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculate_probability(struct Qdisc *sch)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
|
||||
psched_time_t qdelay = 0; /* in pschedtime */
|
||||
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
|
||||
s32 delta = 0; /* determines the change in probability */
|
||||
u32 oldprob;
|
||||
u32 alpha, beta;
|
||||
bool update_prob = true;
|
||||
|
||||
q->vars.qdelay_old = q->vars.qdelay;
|
||||
|
||||
if (q->vars.avg_dq_rate > 0)
|
||||
qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
|
||||
else
|
||||
qdelay = 0;
|
||||
|
||||
/* If qdelay is zero and qlen is not, it means qlen is very small, less
 * than dequeue_rate, so we do not update probability in this round.
 */
|
||||
if (qdelay == 0 && qlen != 0)
|
||||
update_prob = false;
|
||||
|
||||
/* In the algorithm, alpha and beta are between 0 and 2 with typical
 * value for alpha as 0.125. In this implementation, we use values 0-32
 * passed from user space to represent this. Also, alpha and beta have
 * unit of HZ and need to be scaled before they can be used to update
 * probability. alpha/beta are updated locally below by 1) scaling them
 * appropriately 2) scaling down by 16 to come to 0-2 range.
 * Please see paper for details.
 *
 * We scale alpha and beta differently depending on whether we are in
 * light, medium or high dropping mode.
 */
|
||||
if (q->vars.prob < MAX_PROB / 100) {
|
||||
alpha =
|
||||
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
|
||||
beta =
|
||||
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
|
||||
} else if (q->vars.prob < MAX_PROB / 10) {
|
||||
alpha =
|
||||
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
|
||||
beta =
|
||||
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
|
||||
} else {
|
||||
alpha =
|
||||
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
|
||||
beta =
|
||||
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
|
||||
}
|
||||
|
||||
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
|
||||
delta += alpha * ((qdelay - q->params.target));
|
||||
delta += beta * ((qdelay - qdelay_old));
|
||||
|
||||
oldprob = q->vars.prob;
|
||||
|
||||
/* to ensure we increase probability in steps of no more than 2% */
|
||||
if (delta > (s32) (MAX_PROB / (100 / 2)) &&
|
||||
q->vars.prob >= MAX_PROB / 10)
|
||||
delta = (MAX_PROB / 100) * 2;
|
||||
|
||||
/* Non-linear drop:
 * Tune drop probability to increase quickly for high delays (>= 250ms)
 * 250ms is derived through experiments and provides error protection
 */
|
||||
|
||||
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
|
||||
delta += MAX_PROB / (100 / 2);
|
||||
|
||||
q->vars.prob += delta;
|
||||
|
||||
if (delta > 0) {
|
||||
/* prevent overflow */
|
||||
if (q->vars.prob < oldprob) {
|
||||
q->vars.prob = MAX_PROB;
|
||||
/* Prevent normalization error. If probability is at
|
||||
* maximum value already, we normalize it here, and
|
||||
* skip the check to do a non-linear drop in the next
|
||||
* section.
|
||||
*/
|
||||
update_prob = false;
|
||||
}
|
||||
} else {
|
||||
/* prevent underflow */
|
||||
if (q->vars.prob > oldprob)
|
||||
q->vars.prob = 0;
|
||||
}
|
||||
|
||||
/* Non-linear drop in probability: Reduce drop probability quickly if
|
||||
* delay is 0 for 2 consecutive Tupdate periods.
|
||||
*/
|
||||
|
||||
if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
|
||||
q->vars.prob = (q->vars.prob * 98) / 100;
|
||||
|
||||
q->vars.qdelay = qdelay;
|
||||
q->vars.qlen_old = qlen;
|
||||
|
||||
/* We restart the measurement cycle if the following conditions are met
 * 1. If the delay has been low for 2 consecutive Tupdate periods
 * 2. Calculated drop probability is zero
 * 3. We have at least one estimate for the avg_dq_rate, i.e. it
 *    is a non-zero value
 */
|
||||
if ((q->vars.qdelay < q->params.target / 2) &&
|
||||
(q->vars.qdelay_old < q->params.target / 2) &&
|
||||
(q->vars.prob == 0) &&
|
||||
(q->vars.avg_dq_rate > 0))
|
||||
pie_vars_init(&q->vars);
|
||||
}
|
||||
|
||||
static void pie_timer(unsigned long arg)
|
||||
{
|
||||
struct Qdisc *sch = (struct Qdisc *)arg;
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
|
||||
|
||||
spin_lock(root_lock);
|
||||
calculate_probability(sch);
|
||||
|
||||
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
|
||||
if (q->params.tupdate)
|
||||
mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
|
||||
spin_unlock(root_lock);
|
||||
|
||||
}
|
||||
|
||||
static int pie_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
pie_params_init(&q->params);
|
||||
pie_vars_init(&q->vars);
|
||||
sch->limit = q->params.limit;
|
||||
|
||||
setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
|
||||
|
||||
if (opt) {
|
||||
int err = pie_change(sch, opt);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
mod_timer(&q->adapt_timer, jiffies + HZ / 2);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts;
|
||||
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
/* convert target from pschedtime to us */
|
||||
if (nla_put_u32(skb, TCA_PIE_TARGET,
|
||||
((u32) PSCHED_TICKS2NS(q->params.target)) /
|
||||
NSEC_PER_USEC) ||
|
||||
nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
|
||||
nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
|
||||
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
|
||||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
|
||||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
|
||||
nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
|
||||
goto nla_put_failure;
|
||||
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, opts);
|
||||
return -1;
|
||||
|
||||
}
|
||||
|
||||
static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_pie_xstats st = {
|
||||
.prob = q->vars.prob,
|
||||
.delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
|
||||
NSEC_PER_USEC,
|
||||
/* unscale and return dq_rate in bytes per sec */
|
||||
.avg_dq_rate = q->vars.avg_dq_rate *
|
||||
(PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
|
||||
.packets_in = q->stats.packets_in,
|
||||
.overlimit = q->stats.overlimit,
|
||||
.maxq = q->stats.maxq,
|
||||
.dropped = q->stats.dropped,
|
||||
.ecn_mark = q->stats.ecn_mark,
|
||||
};
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
skb = __qdisc_dequeue_head(sch, &sch->q);
|
||||
|
||||
if (!skb)
|
||||
return NULL;
|
||||
|
||||
pie_process_dequeue(sch, skb);
|
||||
return skb;
|
||||
}
|
||||
|
||||
static void pie_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
qdisc_reset_queue(sch);
|
||||
pie_vars_init(&q->vars);
|
||||
}
|
||||
|
||||
static void pie_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct pie_sched_data *q = qdisc_priv(sch);
|
||||
q->params.tupdate = 0;
|
||||
del_timer_sync(&q->adapt_timer);
|
||||
}
|
||||
|
||||
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
|
||||
.id = "pie",
|
||||
.priv_size = sizeof(struct pie_sched_data),
|
||||
.enqueue = pie_qdisc_enqueue,
|
||||
.dequeue = pie_qdisc_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.init = pie_init,
|
||||
.destroy = pie_destroy,
|
||||
.reset = pie_reset,
|
||||
.change = pie_change,
|
||||
.dump = pie_dump,
|
||||
.dump_stats = pie_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init pie_module_init(void)
|
||||
{
|
||||
return register_qdisc(&pie_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit pie_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&pie_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(pie_module_init);
|
||||
module_exit(pie_module_exit);
|
||||
|
||||
MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
|
||||
MODULE_AUTHOR("Vijay Subramanian");
|
||||
MODULE_AUTHOR("Mythili Prabhu");
|
||||
MODULE_LICENSE("GPL");
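Editor's note: calculate_probability() above is the PIE controller's linear update. Restated compactly (symbols follow the variable names in the code; the mode-dependent scaling of alpha and beta is omitted):

\[
\text{qdelay} = \frac{\text{backlog}}{\text{avg\_dq\_rate}}, \qquad
p \leftarrow p + \alpha\,(\text{qdelay} - \text{target}) + \beta\,(\text{qdelay} - \text{qdelay\_old}),
\]

with p clamped to [0, MAX_PROB], bumped non-linearly once qdelay exceeds 250 ms, and decayed by 2% per Tupdate while the measured delay stays at zero.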
233
net/sched/sch_plug.c
Normal file
@ -0,0 +1,233 @@
/*
|
||||
* sch_plug.c Queue traffic until an explicit release command
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* There are two ways to use this qdisc:
|
||||
* 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
|
||||
* sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
|
||||
*
|
||||
* 2. For network output buffering (a.k.a output commit) functionality.
|
||||
* Output commit property is commonly used by applications using checkpoint
|
||||
* based fault-tolerance to ensure that the checkpoint from which a system
|
||||
* is being restored is consistent w.r.t outside world.
|
||||
*
|
||||
* Consider for e.g. Remus - a Virtual Machine checkpointing system,
|
||||
* wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
|
||||
* asynchronously to the backup host, while the VM continues executing the
|
||||
* next epoch speculatively.
|
||||
*
|
||||
* The following is a typical sequence of output buffer operations:
|
||||
* 1. At epoch i, start_buffer(i)
|
||||
* 2. At end of epoch i (i.e. after 50ms):
|
||||
* 2.1 Stop VM and take checkpoint(i).
|
||||
* 2.2 start_buffer(i+1) and Resume VM
|
||||
* 3. While speculatively executing epoch(i+1), asynchronously replicate
|
||||
* checkpoint(i) to backup host.
|
||||
* 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
|
||||
* Thus, this Qdisc would receive the following sequence of commands:
|
||||
* TCQ_PLUG_BUFFER (epoch i)
|
||||
* .. TCQ_PLUG_BUFFER (epoch i+1)
|
||||
* ....TCQ_PLUG_RELEASE_ONE (epoch i)
|
||||
* ......TCQ_PLUG_BUFFER (epoch i+2)
|
||||
* ........
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
/*
|
||||
* State of the queue, when used for network output buffering:
|
||||
*
|
||||
* plug(i+1) plug(i) head
|
||||
* ------------------+--------------------+---------------->
|
||||
* | |
|
||||
* | |
|
||||
* pkts_current_epoch| pkts_last_epoch |pkts_to_release
|
||||
* ----------------->|<--------+--------->|+--------------->
|
||||
* v v
|
||||
*
|
||||
*/
|
||||
|
||||
struct plug_sched_data {
|
||||
/* If true, the dequeue function releases all packets
|
||||
* from head to end of the queue. The queue turns into
|
||||
* a pass-through queue for newly arriving packets.
|
||||
*/
|
||||
bool unplug_indefinite;
|
||||
|
||||
/* Queue Limit in bytes */
|
||||
u32 limit;
|
||||
|
||||
/* Number of packets (output) from the current speculatively
|
||||
* executing epoch.
|
||||
*/
|
||||
u32 pkts_current_epoch;
|
||||
|
||||
/* Number of packets corresponding to the recently finished
|
||||
* epoch. These will be released when we receive a
|
||||
* TCQ_PLUG_RELEASE_ONE command. This command is typically
|
||||
* issued after committing a checkpoint at the target.
|
||||
*/
|
||||
u32 pkts_last_epoch;
|
||||
|
||||
/*
|
||||
* Number of packets from the head of the queue, that can
|
||||
* be released (committed checkpoint).
|
||||
*/
|
||||
u32 pkts_to_release;
|
||||
};
|
||||
|
||||
static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct plug_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
|
||||
if (!q->unplug_indefinite)
|
||||
q->pkts_current_epoch++;
|
||||
return qdisc_enqueue_tail(skb, sch);
|
||||
}
|
||||
|
||||
return qdisc_reshape_fail(skb, sch);
|
||||
}
|
||||
|
||||
static struct sk_buff *plug_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct plug_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (qdisc_is_throttled(sch))
|
||||
return NULL;
|
||||
|
||||
if (!q->unplug_indefinite) {
|
||||
if (!q->pkts_to_release) {
|
||||
/* No more packets to dequeue. Block the queue
|
||||
* and wait for the next release command.
|
||||
*/
|
||||
qdisc_throttled(sch);
|
||||
return NULL;
|
||||
}
|
||||
q->pkts_to_release--;
|
||||
}
|
||||
|
||||
return qdisc_dequeue_head(sch);
|
||||
}
|
||||
|
||||
static int plug_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct plug_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
q->pkts_current_epoch = 0;
|
||||
q->pkts_last_epoch = 0;
|
||||
q->pkts_to_release = 0;
|
||||
q->unplug_indefinite = false;
|
||||
|
||||
if (opt == NULL) {
|
||||
/* We will set a default limit of 100 pkts (~150kB)
|
||||
* in case tx_queue_len is not available. The
|
||||
* default value is completely arbitrary.
|
||||
*/
|
||||
u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
|
||||
q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
|
||||
} else {
|
||||
struct tc_plug_qopt *ctl = nla_data(opt);
|
||||
|
||||
if (nla_len(opt) < sizeof(*ctl))
|
||||
return -EINVAL;
|
||||
|
||||
q->limit = ctl->limit;
|
||||
}
|
||||
|
||||
qdisc_throttled(sch);
|
||||
return 0;
|
||||
}
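/* Editor's note, not part of the original patch: with the 100-packet
 * fallback and a typical ~1500-byte psched MTU the default byte limit
 * computed above is the ~150kB mentioned in the comment; for a device
 * with tx_queue_len 1000 it comes to roughly 1000 * 1514 bytes, i.e.
 * about 1.5 MB of buffered output.
 */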
|
||||
|
||||
/* Receives 4 types of messages:
 * TCQ_PLUG_BUFFER: Insert a plug into the queue and
 * buffer any incoming packets
 * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
 * to beginning of the next plug.
 * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
 * Stop buffering packets until the next TCQ_PLUG_BUFFER
 * command is received (just act as a pass-thru queue).
 * TCQ_PLUG_LIMIT: Increase/decrease queue size
 */
|
||||
static int plug_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct plug_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_plug_qopt *msg;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
msg = nla_data(opt);
|
||||
if (nla_len(opt) < sizeof(*msg))
|
||||
return -EINVAL;
|
||||
|
||||
switch (msg->action) {
|
||||
case TCQ_PLUG_BUFFER:
|
||||
/* Save size of the current buffer */
|
||||
q->pkts_last_epoch = q->pkts_current_epoch;
|
||||
q->pkts_current_epoch = 0;
|
||||
if (q->unplug_indefinite)
|
||||
qdisc_throttled(sch);
|
||||
q->unplug_indefinite = false;
|
||||
break;
|
||||
case TCQ_PLUG_RELEASE_ONE:
|
||||
/* Add packets from the last complete buffer to the
|
||||
* packets to be released set.
|
||||
*/
|
||||
q->pkts_to_release += q->pkts_last_epoch;
|
||||
q->pkts_last_epoch = 0;
|
||||
qdisc_unthrottled(sch);
|
||||
netif_schedule_queue(sch->dev_queue);
|
||||
break;
|
||||
case TCQ_PLUG_RELEASE_INDEFINITE:
|
||||
q->unplug_indefinite = true;
|
||||
q->pkts_to_release = 0;
|
||||
q->pkts_last_epoch = 0;
|
||||
q->pkts_current_epoch = 0;
|
||||
qdisc_unthrottled(sch);
|
||||
netif_schedule_queue(sch->dev_queue);
|
||||
break;
|
||||
case TCQ_PLUG_LIMIT:
|
||||
/* Limit is supplied in bytes */
|
||||
q->limit = msg->limit;
|
||||
break;
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
|
||||
.id = "plug",
|
||||
.priv_size = sizeof(struct plug_sched_data),
|
||||
.enqueue = plug_enqueue,
|
||||
.dequeue = plug_dequeue,
|
||||
.peek = qdisc_peek_head,
|
||||
.init = plug_init,
|
||||
.change = plug_change,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init plug_module_init(void)
|
||||
{
|
||||
return register_qdisc(&plug_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit plug_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&plug_qdisc_ops);
|
||||
}
|
||||
module_init(plug_module_init)
|
||||
module_exit(plug_module_exit)
|
||||
MODULE_LICENSE("GPL");
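Editor's note: a minimal userspace-side sketch, not part of this patch, of the option blob that plug_change() parses. The helper name is hypothetical and the netlink plumbing that delivers it as TCA_OPTIONS is assumed and omitted.

#include <linux/pkt_sched.h>

/* Build the "release the last completed epoch" request handled by
 * plug_change(); TCQ_PLUG_RELEASE_ONE ignores the limit field.
 */
static void plug_fill_release_one(struct tc_plug_qopt *opt)
{
	opt->action = TCQ_PLUG_RELEASE_ONE;
	opt->limit = 0;
}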
426
net/sched/sch_prio.c
Normal file
@ -0,0 +1,426 @@
/*
|
||||
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
|
||||
* Init -- EINVAL when opt undefined
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
|
||||
struct prio_sched_data {
|
||||
int bands;
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
u8 prio2band[TC_PRIO_MAX+1];
|
||||
struct Qdisc *queues[TCQ_PRIO_BANDS];
|
||||
u8 enable_flow;
|
||||
};
|
||||
|
||||
|
||||
static struct Qdisc *
|
||||
prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
u32 band = skb->priority;
|
||||
struct tcf_result res;
|
||||
struct tcf_proto *fl;
|
||||
int err;
|
||||
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
if (TC_H_MAJ(skb->priority) != sch->handle) {
|
||||
fl = rcu_dereference_bh(q->filter_list);
|
||||
err = tc_classify(skb, fl, &res);
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (err) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
if (!fl || err < 0) {
|
||||
if (TC_H_MAJ(band))
|
||||
band = 0;
|
||||
return q->queues[q->prio2band[band & TC_PRIO_MAX]];
|
||||
}
|
||||
band = res.classid;
|
||||
}
|
||||
band = TC_H_MIN(band) - 1;
|
||||
if (band >= q->bands)
|
||||
return q->queues[q->prio2band[0]];
|
||||
|
||||
return q->queues[band];
|
||||
}
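/* Editor's note, not part of the original patch: a filter verdict whose
 * classid minor number is 1 selects band 0 (q->queues[0]); a minor number
 * larger than q->bands falls back to the band given by prio2band[0], and
 * unclassified traffic is mapped through prio2band[] using the low bits
 * of skb->priority.
 */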
|
||||
|
||||
static int
|
||||
prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct Qdisc *qdisc;
|
||||
int ret;
|
||||
|
||||
qdisc = prio_classify(skb, sch, &ret);
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
if (qdisc == NULL) {
|
||||
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
ret = qdisc_enqueue(skb, qdisc);
|
||||
if (ret == NET_XMIT_SUCCESS) {
|
||||
sch->q.qlen++;
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
if (net_xmit_drop_count(ret))
|
||||
qdisc_qstats_drop(sch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_buff *prio_peek(struct Qdisc *sch)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
int prio;
|
||||
if (!q->enable_flow)
|
||||
return NULL;
|
||||
for (prio = 0; prio < q->bands; prio++) {
|
||||
struct Qdisc *qdisc = q->queues[prio];
|
||||
struct sk_buff *skb = qdisc->ops->peek(qdisc);
|
||||
if (skb)
|
||||
return skb;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct sk_buff *prio_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
int prio;
|
||||
if (!q->enable_flow)
|
||||
return NULL;
|
||||
|
||||
for (prio = 0; prio < q->bands; prio++) {
|
||||
struct Qdisc *qdisc = q->queues[prio];
|
||||
struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
|
||||
if (skb) {
|
||||
qdisc_bstats_update(sch, skb);
|
||||
sch->q.qlen--;
|
||||
return skb;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
}
|
||||
|
||||
static unsigned int prio_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
int prio;
|
||||
unsigned int len;
|
||||
struct Qdisc *qdisc;
|
||||
|
||||
for (prio = q->bands-1; prio >= 0; prio--) {
|
||||
qdisc = q->queues[prio];
|
||||
if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
|
||||
sch->q.qlen--;
|
||||
return len;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
prio_reset(struct Qdisc *sch)
|
||||
{
|
||||
int prio;
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
for (prio = 0; prio < q->bands; prio++)
|
||||
qdisc_reset(q->queues[prio]);
|
||||
sch->q.qlen = 0;
|
||||
q->enable_flow = 1;
|
||||
}
|
||||
|
||||
static void
|
||||
prio_destroy(struct Qdisc *sch)
|
||||
{
|
||||
int prio;
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
for (prio = 0; prio < q->bands; prio++)
|
||||
qdisc_destroy(q->queues[prio]);
|
||||
}
|
||||
|
||||
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_prio_qopt *qopt;
|
||||
int i;
|
||||
int flow_change = 0;
|
||||
|
||||
if (nla_len(opt) < sizeof(*qopt))
|
||||
return -EINVAL;
|
||||
qopt = nla_data(opt);
|
||||
|
||||
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
|
||||
return -EINVAL;
|
||||
|
||||
for (i = 0; i <= TC_PRIO_MAX; i++) {
|
||||
if (qopt->priomap[i] >= qopt->bands)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
sch_tree_lock(sch);
|
||||
if (q->enable_flow != qopt->enable_flow) {
|
||||
q->enable_flow = qopt->enable_flow;
|
||||
flow_change = 1;
|
||||
}
|
||||
q->bands = qopt->bands;
|
||||
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
|
||||
|
||||
for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
|
||||
struct Qdisc *child = q->queues[i];
|
||||
q->queues[i] = &noop_qdisc;
|
||||
if (child != &noop_qdisc) {
|
||||
qdisc_tree_decrease_qlen(child, child->q.qlen);
|
||||
qdisc_destroy(child);
|
||||
}
|
||||
}
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
for (i = 0; i < q->bands; i++) {
|
||||
if (q->queues[i] == &noop_qdisc) {
|
||||
struct Qdisc *child, *old;
|
||||
|
||||
child = qdisc_create_dflt(sch->dev_queue,
|
||||
&pfifo_qdisc_ops,
|
||||
TC_H_MAKE(sch->handle, i + 1));
|
||||
if (child) {
|
||||
sch_tree_lock(sch);
|
||||
old = q->queues[i];
|
||||
q->queues[i] = child;
|
||||
|
||||
if (old != &noop_qdisc) {
|
||||
qdisc_tree_decrease_qlen(old,
|
||||
old->q.qlen);
|
||||
qdisc_destroy(old);
|
||||
}
|
||||
sch_tree_unlock(sch);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Schedule qdisc when flow re-enabled */
|
||||
if (flow_change && q->enable_flow) {
|
||||
if (!test_bit(__QDISC_STATE_DEACTIVATED,
|
||||
&sch->state))
|
||||
__netif_schedule(qdisc_root(sch));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < TCQ_PRIO_BANDS; i++)
|
||||
q->queues[i] = &noop_qdisc;
|
||||
|
||||
if (opt == NULL) {
|
||||
return -EINVAL;
|
||||
} else {
|
||||
int err;
|
||||
|
||||
if ((err = prio_tune(sch, opt)) != 0)
|
||||
return err;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tc_prio_qopt opt;
|
||||
|
||||
opt.bands = q->bands;
|
||||
opt.enable_flow = q->enable_flow;
|
||||
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
|
||||
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = arg - 1;
|
||||
|
||||
if (new == NULL)
|
||||
new = &noop_qdisc;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
*old = q->queues[band];
|
||||
q->queues[band] = new;
|
||||
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
|
||||
qdisc_reset(*old);
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *
|
||||
prio_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = arg - 1;
|
||||
|
||||
return q->queues[band];
|
||||
}
|
||||
|
||||
static unsigned long prio_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
unsigned long band = TC_H_MIN(classid);
|
||||
|
||||
if (band - 1 >= q->bands)
|
||||
return 0;
|
||||
return band;
|
||||
}
|
||||
|
||||
static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
|
||||
{
|
||||
return prio_get(sch, classid);
|
||||
}
|
||||
|
||||
|
||||
static void prio_put(struct Qdisc *q, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
|
||||
struct tcmsg *tcm)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
tcm->tcm_info = q->queues[cl-1]->handle;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *cl_q;
|
||||
|
||||
cl_q = q->queues[cl - 1];
|
||||
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
|
||||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
int prio;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (prio = 0; prio < q->bands; prio++) {
|
||||
if (arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(sch, prio + 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct prio_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops prio_class_ops = {
|
||||
.graft = prio_graft,
|
||||
.leaf = prio_leaf,
|
||||
.get = prio_get,
|
||||
.put = prio_put,
|
||||
.walk = prio_walk,
|
||||
.tcf_chain = prio_find_tcf,
|
||||
.bind_tcf = prio_bind,
|
||||
.unbind_tcf = prio_put,
|
||||
.dump = prio_dump_class,
|
||||
.dump_stats = prio_dump_class_stats,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
|
||||
.next = NULL,
|
||||
.cl_ops = &prio_class_ops,
|
||||
.id = "prio",
|
||||
.priv_size = sizeof(struct prio_sched_data),
|
||||
.enqueue = prio_enqueue,
|
||||
.dequeue = prio_dequeue,
|
||||
.peek = prio_peek,
|
||||
.drop = prio_drop,
|
||||
.init = prio_init,
|
||||
.reset = prio_reset,
|
||||
.destroy = prio_destroy,
|
||||
.change = prio_tune,
|
||||
.dump = prio_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init prio_module_init(void)
|
||||
{
|
||||
return register_qdisc(&prio_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit prio_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&prio_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(prio_module_init)
|
||||
module_exit(prio_module_exit)
|
||||
|
||||
MODULE_LICENSE("GPL");
1587
net/sched/sch_qfq.c
Normal file
File diff suppressed because it is too large
391
net/sched/sch_red.c
Normal file
@ -0,0 +1,391 @@
/*
|
||||
* net/sched/sch_red.c Random Early Detection queue.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*
|
||||
* Changes:
|
||||
* J Hadi Salim 980914: computation fixes
|
||||
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
|
||||
* J Hadi Salim 980816: ECN support
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/red.h>
|
||||
|
||||
|
||||
/* Parameters, settable by user:
   -----------------------------

   limit - bytes (must be > qth_max + burst)

   Hard limit on queue length, should be chosen > qth_max
   to allow packet bursts. This parameter does not
   affect the algorithm's behaviour and can be chosen
   arbitrarily high (well, less than ram size)
   Really, this limit will never be reached
   if RED works correctly.
 */
|
||||
|
||||
struct red_sched_data {
|
||||
u32 limit; /* HARD maximal queue length */
|
||||
unsigned char flags;
|
||||
struct timer_list adapt_timer;
|
||||
struct red_parms parms;
|
||||
struct red_vars vars;
|
||||
struct red_stats stats;
|
||||
struct Qdisc *qdisc;
|
||||
};
|
||||
|
||||
static inline int red_use_ecn(struct red_sched_data *q)
|
||||
{
|
||||
return q->flags & TC_RED_ECN;
|
||||
}
|
||||
|
||||
static inline int red_use_harddrop(struct red_sched_data *q)
|
||||
{
|
||||
return q->flags & TC_RED_HARDDROP;
|
||||
}
|
||||
|
||||
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
int ret;
|
||||
|
||||
q->vars.qavg = red_calc_qavg(&q->parms,
|
||||
&q->vars,
|
||||
child->qstats.backlog);
|
||||
|
||||
if (red_is_idling(&q->vars))
|
||||
red_end_of_idle_period(&q->vars);
|
||||
|
||||
switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
|
||||
case RED_DONT_MARK:
|
||||
break;
|
||||
|
||||
case RED_PROB_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
|
||||
q->stats.prob_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
q->stats.prob_mark++;
|
||||
break;
|
||||
|
||||
case RED_HARD_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (red_use_harddrop(q) || !red_use_ecn(q) ||
|
||||
!INET_ECN_set_ce(skb)) {
|
||||
q->stats.forced_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
|
||||
q->stats.forced_mark++;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = qdisc_enqueue(skb, child);
|
||||
if (likely(ret == NET_XMIT_SUCCESS)) {
|
||||
sch->q.qlen++;
|
||||
} else if (net_xmit_drop_count(ret)) {
|
||||
q->stats.pdrop++;
|
||||
qdisc_qstats_drop(sch);
|
||||
}
|
||||
return ret;
|
||||
|
||||
congestion_drop:
|
||||
qdisc_drop(skb, sch);
|
||||
return NET_XMIT_CN;
|
||||
}
|
||||
|
||||
static struct sk_buff *red_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
|
||||
skb = child->dequeue(child);
|
||||
if (skb) {
|
||||
qdisc_bstats_update(sch, skb);
|
||||
sch->q.qlen--;
|
||||
} else {
|
||||
if (!red_is_idling(&q->vars))
|
||||
red_start_of_idle_period(&q->vars);
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct sk_buff *red_peek(struct Qdisc *sch)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
|
||||
return child->ops->peek(child);
|
||||
}
|
||||
|
||||
static unsigned int red_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
unsigned int len;
|
||||
|
||||
if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
|
||||
q->stats.other++;
|
||||
qdisc_qstats_drop(sch);
|
||||
sch->q.qlen--;
|
||||
return len;
|
||||
}
|
||||
|
||||
if (!red_is_idling(&q->vars))
|
||||
red_start_of_idle_period(&q->vars);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void red_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
qdisc_reset(q->qdisc);
|
||||
sch->q.qlen = 0;
|
||||
red_restart(&q->vars);
|
||||
}
|
||||
|
||||
static void red_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
del_timer_sync(&q->adapt_timer);
|
||||
qdisc_destroy(q->qdisc);
|
||||
}
|
||||
|
||||
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
|
||||
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
|
||||
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
|
||||
[TCA_RED_MAX_P] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int red_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_RED_MAX + 1];
|
||||
struct tc_red_qopt *ctl;
|
||||
struct Qdisc *child = NULL;
|
||||
int err;
|
||||
u32 max_P;
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
if (tb[TCA_RED_PARMS] == NULL ||
|
||||
tb[TCA_RED_STAB] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
|
||||
|
||||
ctl = nla_data(tb[TCA_RED_PARMS]);
|
||||
|
||||
if (ctl->limit > 0) {
|
||||
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
|
||||
if (IS_ERR(child))
|
||||
return PTR_ERR(child);
|
||||
}
|
||||
|
||||
sch_tree_lock(sch);
|
||||
q->flags = ctl->flags;
|
||||
q->limit = ctl->limit;
|
||||
if (child) {
|
||||
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
|
||||
qdisc_destroy(q->qdisc);
|
||||
q->qdisc = child;
|
||||
}
|
||||
|
||||
red_set_parms(&q->parms,
|
||||
ctl->qth_min, ctl->qth_max, ctl->Wlog,
|
||||
ctl->Plog, ctl->Scell_log,
|
||||
nla_data(tb[TCA_RED_STAB]),
|
||||
max_P);
|
||||
red_set_vars(&q->vars);
|
||||
|
||||
del_timer(&q->adapt_timer);
|
||||
if (ctl->flags & TC_RED_ADAPTATIVE)
|
||||
mod_timer(&q->adapt_timer, jiffies + HZ/2);
|
||||
|
||||
if (!q->qdisc->q.qlen)
|
||||
red_start_of_idle_period(&q->vars);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void red_adaptative_timer(unsigned long arg)
|
||||
{
|
||||
struct Qdisc *sch = (struct Qdisc *)arg;
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
|
||||
|
||||
spin_lock(root_lock);
|
||||
red_adaptative_algo(&q->parms, &q->vars);
|
||||
mod_timer(&q->adapt_timer, jiffies + HZ/2);
|
||||
spin_unlock(root_lock);
|
||||
}
|
||||
|
||||
static int red_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
q->qdisc = &noop_qdisc;
|
||||
setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
|
||||
return red_change(sch, opt);
|
||||
}
|
||||
|
||||
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts = NULL;
|
||||
struct tc_red_qopt opt = {
|
||||
.limit = q->limit,
|
||||
.flags = q->flags,
|
||||
.qth_min = q->parms.qth_min >> q->parms.Wlog,
|
||||
.qth_max = q->parms.qth_max >> q->parms.Wlog,
|
||||
.Wlog = q->parms.Wlog,
|
||||
.Plog = q->parms.Plog,
|
||||
.Scell_log = q->parms.Scell_log,
|
||||
};
|
||||
|
||||
sch->qstats.backlog = q->qdisc->qstats.backlog;
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
|
||||
nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
|
||||
goto nla_put_failure;
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, opts);
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_red_xstats st = {
|
||||
.early = q->stats.prob_drop + q->stats.forced_drop,
|
||||
.pdrop = q->stats.pdrop,
|
||||
.other = q->stats.other,
|
||||
.marked = q->stats.prob_mark + q->stats.forced_mark,
|
||||
};
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static int red_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcm->tcm_handle |= TC_H_MIN(1);
|
||||
tcm->tcm_info = q->qdisc->handle;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (new == NULL)
|
||||
new = &noop_qdisc;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
*old = q->qdisc;
|
||||
q->qdisc = new;
|
||||
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
|
||||
qdisc_reset(*old);
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct red_sched_data *q = qdisc_priv(sch);
|
||||
return q->qdisc;
|
||||
}
|
||||
|
||||
static unsigned long red_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void red_put(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
}
|
||||
|
||||
static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
|
||||
{
|
||||
if (!walker->stop) {
|
||||
if (walker->count >= walker->skip)
|
||||
if (walker->fn(sch, 1, walker) < 0) {
|
||||
walker->stop = 1;
|
||||
return;
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops red_class_ops = {
|
||||
.graft = red_graft,
|
||||
.leaf = red_leaf,
|
||||
.get = red_get,
|
||||
.put = red_put,
|
||||
.walk = red_walk,
|
||||
.dump = red_dump_class,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops red_qdisc_ops __read_mostly = {
|
||||
.id = "red",
|
||||
.priv_size = sizeof(struct red_sched_data),
|
||||
.cl_ops = &red_class_ops,
|
||||
.enqueue = red_enqueue,
|
||||
.dequeue = red_dequeue,
|
||||
.peek = red_peek,
|
||||
.drop = red_drop,
|
||||
.init = red_init,
|
||||
.reset = red_reset,
|
||||
.destroy = red_destroy,
|
||||
.change = red_change,
|
||||
.dump = red_dump,
|
||||
.dump_stats = red_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init red_module_init(void)
|
||||
{
|
||||
return register_qdisc(&red_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit red_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&red_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(red_module_init)
|
||||
module_exit(red_module_exit)
|
||||
|
||||
MODULE_LICENSE("GPL");
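Editor's note: red_enqueue() above delegates the marking decision to red_action() from <net/red.h>. In the classical RED formulation implemented there, the average queue size is an EWMA with weight 2^-Wlog, and the early-mark probability grows linearly between the two thresholds:

\[
p_b = max\_P \cdot \frac{qavg - qth_{min}}{qth_{max} - qth_{min}}, \qquad qth_{min} \le qavg < qth_{max},
\]

with a hard mark (or drop) once qavg reaches qth_max.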
728
net/sched/sch_sfb.c
Normal file
@ -0,0 +1,728 @@
/*
|
||||
* net/sched/sch_sfb.c Stochastic Fair Blue
|
||||
*
|
||||
* Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
|
||||
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* version 2 as published by the Free Software Foundation.
|
||||
*
|
||||
* W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
|
||||
* A New Class of Active Queue Management Algorithms.
|
||||
* U. Michigan CSE-TR-387-99, April 1999.
|
||||
*
|
||||
* http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <net/ip.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/inet_ecn.h>
|
||||
#include <net/flow_keys.h>
|
||||
|
||||
/*
|
||||
* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
|
||||
* This implementation uses L = 8 and N = 16
|
||||
* This permits us to split one 32bit hash (provided per packet by rxhash or
|
||||
* external classifier) into 8 subhashes of 4 bits.
|
||||
*/
|
||||
#define SFB_BUCKET_SHIFT 4
|
||||
#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
|
||||
#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
|
||||
#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
|
||||
|
||||
/* SFB algo uses a virtual queue, named "bin" */
|
||||
struct sfb_bucket {
|
||||
u16 qlen; /* length of virtual queue */
|
||||
u16 p_mark; /* marking probability */
|
||||
};
|
||||
|
||||
/* We use double buffering right before a hash change
 * (Section 4.4 of SFB reference: moving hash functions)
 */
|
||||
struct sfb_bins {
|
||||
u32 perturbation; /* jhash perturbation */
|
||||
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
|
||||
};
|
||||
|
||||
struct sfb_sched_data {
|
||||
struct Qdisc *qdisc;
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
unsigned long rehash_interval;
|
||||
unsigned long warmup_time; /* double buffering warmup time in jiffies */
|
||||
u32 max;
|
||||
u32 bin_size; /* maximum queue length per bin */
|
||||
u32 increment; /* d1 */
|
||||
u32 decrement; /* d2 */
|
||||
u32 limit; /* HARD maximal queue length */
|
||||
u32 penalty_rate;
|
||||
u32 penalty_burst;
|
||||
u32 tokens_avail;
|
||||
unsigned long rehash_time;
|
||||
unsigned long token_time;
|
||||
|
||||
u8 slot; /* current active bins (0 or 1) */
|
||||
bool double_buffering;
|
||||
struct sfb_bins bins[2];
|
||||
|
||||
struct {
|
||||
u32 earlydrop;
|
||||
u32 penaltydrop;
|
||||
u32 bucketdrop;
|
||||
u32 queuedrop;
|
||||
u32 childdrop; /* drops in child qdisc */
|
||||
u32 marked; /* ECN mark */
|
||||
} stats;
|
||||
};
|
||||
|
||||
/*
|
||||
* Each queued skb might be hashed on one or two bins
|
||||
* We store in skb_cb the two hash values.
|
||||
* (A zero value means double buffering was not used)
|
||||
*/
|
||||
struct sfb_skb_cb {
|
||||
u32 hashes[2];
|
||||
};
|
||||
|
||||
static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
|
||||
{
|
||||
qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
|
||||
return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
|
||||
}
|
||||
|
||||
/*
|
||||
* If using 'internal' SFB flow classifier, hash comes from skb rxhash
|
||||
* If using external classifier, hash comes from the classid.
|
||||
*/
|
||||
static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
|
||||
{
|
||||
return sfb_skb_cb(skb)->hashes[slot];
|
||||
}
|
||||
|
||||
/* Probabilities are coded as Q0.16 fixed-point values,
|
||||
* with 0xFFFF representing 65535/65536 (almost 1.0)
|
||||
* Addition and subtraction are saturating in [0, 65535]
|
||||
*/
|
||||
static u32 prob_plus(u32 p1, u32 p2)
|
||||
{
|
||||
u32 res = p1 + p2;
|
||||
|
||||
return min_t(u32, res, SFB_MAX_PROB);
|
||||
}
|
||||
|
||||
static u32 prob_minus(u32 p1, u32 p2)
|
||||
{
|
||||
return p1 > p2 ? p1 - p2 : 0;
|
||||
}
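/* Editor's note, not part of the original patch: with this Q0.16 coding
 * 0x8000 stands for 0.5 and SFB_MAX_PROB (0xFFFF) for ~1.0, so
 * prob_plus(0xC000, 0xC000) saturates to SFB_MAX_PROB while
 * prob_minus(0x4000, 0x8000) clamps to 0.
 */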
|
||||
|
||||
static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
|
||||
{
|
||||
int i;
|
||||
struct sfb_bucket *b = &q->bins[slot].bins[0][0];
|
||||
|
||||
for (i = 0; i < SFB_LEVELS; i++) {
|
||||
u32 hash = sfbhash & SFB_BUCKET_MASK;
|
||||
|
||||
sfbhash >>= SFB_BUCKET_SHIFT;
|
||||
if (b[hash].qlen < 0xFFFF)
|
||||
b[hash].qlen++;
|
||||
b += SFB_NUMBUCKETS; /* next level */
|
||||
}
|
||||
}
|
||||
|
||||
static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
|
||||
{
|
||||
u32 sfbhash;
|
||||
|
||||
sfbhash = sfb_hash(skb, 0);
|
||||
if (sfbhash)
|
||||
increment_one_qlen(sfbhash, 0, q);
|
||||
|
||||
sfbhash = sfb_hash(skb, 1);
|
||||
if (sfbhash)
|
||||
increment_one_qlen(sfbhash, 1, q);
|
||||
}
|
||||
|
||||
static void decrement_one_qlen(u32 sfbhash, u32 slot,
|
||||
struct sfb_sched_data *q)
|
||||
{
|
||||
int i;
|
||||
struct sfb_bucket *b = &q->bins[slot].bins[0][0];
|
||||
|
||||
for (i = 0; i < SFB_LEVELS; i++) {
|
||||
u32 hash = sfbhash & SFB_BUCKET_MASK;
|
||||
|
||||
sfbhash >>= SFB_BUCKET_SHIFT;
|
||||
if (b[hash].qlen > 0)
|
||||
b[hash].qlen--;
|
||||
b += SFB_NUMBUCKETS; /* next level */
|
||||
}
|
||||
}
|
||||
|
||||
static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
|
||||
{
|
||||
u32 sfbhash;
|
||||
|
||||
sfbhash = sfb_hash(skb, 0);
|
||||
if (sfbhash)
|
||||
decrement_one_qlen(sfbhash, 0, q);
|
||||
|
||||
sfbhash = sfb_hash(skb, 1);
|
||||
if (sfbhash)
|
||||
decrement_one_qlen(sfbhash, 1, q);
|
||||
}
|
||||
|
||||
static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
|
||||
{
|
||||
b->p_mark = prob_minus(b->p_mark, q->decrement);
|
||||
}
|
||||
|
||||
static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
|
||||
{
|
||||
b->p_mark = prob_plus(b->p_mark, q->increment);
|
||||
}
|
||||
|
||||
static void sfb_zero_all_buckets(struct sfb_sched_data *q)
|
||||
{
|
||||
memset(&q->bins, 0, sizeof(q->bins));
|
||||
}
|
||||
|
||||
/*
|
||||
* compute max qlen, max p_mark, and avg p_mark
|
||||
*/
|
||||
static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
|
||||
{
|
||||
int i;
|
||||
u32 qlen = 0, prob = 0, totalpm = 0;
|
||||
const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
|
||||
|
||||
for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
|
||||
if (qlen < b->qlen)
|
||||
qlen = b->qlen;
|
||||
totalpm += b->p_mark;
|
||||
if (prob < b->p_mark)
|
||||
prob = b->p_mark;
|
||||
b++;
|
||||
}
|
||||
*prob_r = prob;
|
||||
*avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
|
||||
return qlen;
|
||||
}
|
||||
|
||||
|
||||
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
|
||||
{
|
||||
q->bins[slot].perturbation = prandom_u32();
|
||||
}
|
||||
|
||||
static void sfb_swap_slot(struct sfb_sched_data *q)
|
||||
{
|
||||
sfb_init_perturbation(q->slot, q);
|
||||
q->slot ^= 1;
|
||||
q->double_buffering = false;
|
||||
}
|
||||
|
||||
/* Non-elastic flows are allowed to use part of the bandwidth, expressed
 * in "penalty_rate" packets per second, with "penalty_burst" burst
 */
|
||||
static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
|
||||
{
|
||||
if (q->penalty_rate == 0 || q->penalty_burst == 0)
|
||||
return true;
|
||||
|
||||
if (q->tokens_avail < 1) {
|
||||
unsigned long age = min(10UL * HZ, jiffies - q->token_time);
|
||||
|
||||
q->tokens_avail = (age * q->penalty_rate) / HZ;
|
||||
if (q->tokens_avail > q->penalty_burst)
|
||||
q->tokens_avail = q->penalty_burst;
|
||||
q->token_time = jiffies;
|
||||
if (q->tokens_avail < 1)
|
||||
return true;
|
||||
}
|
||||
|
||||
q->tokens_avail--;
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
|
||||
int *qerr, u32 *salt)
|
||||
{
|
||||
struct tcf_result res;
|
||||
int result;
|
||||
|
||||
result = tc_classify(skb, fl, &res);
|
||||
if (result >= 0) {
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (result) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
*salt = TC_H_MIN(res.classid);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
struct tcf_proto *fl;
|
||||
int i;
|
||||
u32 p_min = ~0;
|
||||
u32 minqlen = ~0;
|
||||
u32 r, slot, salt, sfbhash;
|
||||
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
struct flow_keys keys;
|
||||
|
||||
if (unlikely(sch->q.qlen >= q->limit)) {
|
||||
qdisc_qstats_overlimit(sch);
|
||||
q->stats.queuedrop++;
|
||||
goto drop;
|
||||
}
|
||||
|
||||
if (q->rehash_interval > 0) {
|
||||
unsigned long limit = q->rehash_time + q->rehash_interval;
|
||||
|
||||
if (unlikely(time_after(jiffies, limit))) {
|
||||
sfb_swap_slot(q);
|
||||
q->rehash_time = jiffies;
|
||||
} else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
|
||||
time_after(jiffies, limit - q->warmup_time))) {
|
||||
q->double_buffering = true;
|
||||
}
|
||||
}
|
||||
|
||||
fl = rcu_dereference_bh(q->filter_list);
|
||||
if (fl) {
|
||||
/* If using external classifiers, get result and record it. */
|
||||
if (!sfb_classify(skb, fl, &ret, &salt))
|
||||
goto other_drop;
|
||||
keys.src = salt;
|
||||
keys.dst = 0;
|
||||
keys.ports = 0;
|
||||
} else {
|
||||
skb_flow_dissect(skb, &keys);
|
||||
}
|
||||
|
||||
slot = q->slot;
|
||||
|
||||
sfbhash = jhash_3words((__force u32)keys.dst,
|
||||
(__force u32)keys.src,
|
||||
(__force u32)keys.ports,
|
||||
q->bins[slot].perturbation);
|
||||
if (!sfbhash)
|
||||
sfbhash = 1;
|
||||
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
|
||||
|
||||
for (i = 0; i < SFB_LEVELS; i++) {
|
||||
u32 hash = sfbhash & SFB_BUCKET_MASK;
|
||||
struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
|
||||
|
||||
sfbhash >>= SFB_BUCKET_SHIFT;
|
||||
if (b->qlen == 0)
|
||||
decrement_prob(b, q);
|
||||
else if (b->qlen >= q->bin_size)
|
||||
increment_prob(b, q);
|
||||
if (minqlen > b->qlen)
|
||||
minqlen = b->qlen;
|
||||
if (p_min > b->p_mark)
|
||||
p_min = b->p_mark;
|
||||
}
|
||||
|
||||
slot ^= 1;
|
||||
sfb_skb_cb(skb)->hashes[slot] = 0;
|
||||
|
||||
if (unlikely(minqlen >= q->max)) {
|
||||
qdisc_qstats_overlimit(sch);
|
||||
q->stats.bucketdrop++;
|
||||
goto drop;
|
||||
}
|
||||
|
||||
if (unlikely(p_min >= SFB_MAX_PROB)) {
|
||||
/* Inelastic flow */
|
||||
if (q->double_buffering) {
|
||||
sfbhash = jhash_3words((__force u32)keys.dst,
|
||||
(__force u32)keys.src,
|
||||
(__force u32)keys.ports,
|
||||
q->bins[slot].perturbation);
|
||||
if (!sfbhash)
|
||||
sfbhash = 1;
|
||||
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
|
||||
|
||||
for (i = 0; i < SFB_LEVELS; i++) {
|
||||
u32 hash = sfbhash & SFB_BUCKET_MASK;
|
||||
struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
|
||||
|
||||
sfbhash >>= SFB_BUCKET_SHIFT;
|
||||
if (b->qlen == 0)
|
||||
decrement_prob(b, q);
|
||||
else if (b->qlen >= q->bin_size)
|
||||
increment_prob(b, q);
|
||||
}
|
||||
}
|
||||
if (sfb_rate_limit(skb, q)) {
|
||||
qdisc_qstats_overlimit(sch);
|
||||
q->stats.penaltydrop++;
|
||||
goto drop;
|
||||
}
|
||||
goto enqueue;
|
||||
}
|
||||
|
||||
r = prandom_u32() & SFB_MAX_PROB;
|
||||
|
||||
if (unlikely(r < p_min)) {
|
||||
if (unlikely(p_min > SFB_MAX_PROB / 2)) {
|
||||
/* If we're marking that many packets, then either
|
||||
* this flow is unresponsive, or we're badly congested.
|
||||
* In either case, we want to start dropping packets.
|
||||
*/
|
||||
if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
|
||||
q->stats.earlydrop++;
|
||||
goto drop;
|
||||
}
|
||||
}
|
||||
if (INET_ECN_set_ce(skb)) {
|
||||
q->stats.marked++;
|
||||
} else {
|
||||
q->stats.earlydrop++;
|
||||
goto drop;
|
||||
}
|
||||
}
|
||||
|
||||
enqueue:
|
||||
ret = qdisc_enqueue(skb, child);
|
||||
if (likely(ret == NET_XMIT_SUCCESS)) {
|
||||
sch->q.qlen++;
|
||||
increment_qlen(skb, q);
|
||||
} else if (net_xmit_drop_count(ret)) {
|
||||
q->stats.childdrop++;
|
||||
qdisc_qstats_drop(sch);
|
||||
}
|
||||
return ret;
|
||||
|
||||
drop:
|
||||
qdisc_drop(skb, sch);
|
||||
return NET_XMIT_CN;
|
||||
other_drop:
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb = child->dequeue(q->qdisc);
|
||||
|
||||
if (skb) {
|
||||
qdisc_bstats_update(sch, skb);
|
||||
sch->q.qlen--;
|
||||
decrement_qlen(skb, q);
|
||||
}
|
||||
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct sk_buff *sfb_peek(struct Qdisc *sch)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child = q->qdisc;
|
||||
|
||||
return child->ops->peek(child);
|
||||
}
|
||||
|
||||
/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
|
||||
|
||||
static void sfb_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
qdisc_reset(q->qdisc);
|
||||
sch->q.qlen = 0;
|
||||
q->slot = 0;
|
||||
q->double_buffering = false;
|
||||
sfb_zero_all_buckets(q);
|
||||
sfb_init_perturbation(0, q);
|
||||
}
|
||||
|
||||
static void sfb_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
qdisc_destroy(q->qdisc);
|
||||
}
|
||||
|
||||
static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
|
||||
[TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
|
||||
};
|
||||
|
||||
static const struct tc_sfb_qopt sfb_default_ops = {
|
||||
.rehash_interval = 600 * MSEC_PER_SEC,
|
||||
.warmup_time = 60 * MSEC_PER_SEC,
|
||||
.limit = 0,
|
||||
.max = 25,
|
||||
.bin_size = 20,
|
||||
.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
|
||||
.decrement = (SFB_MAX_PROB + 3000) / 6000,
|
||||
.penalty_rate = 10,
|
||||
.penalty_burst = 20,
|
||||
};
|
||||
|
||||
static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct Qdisc *child;
|
||||
struct nlattr *tb[TCA_SFB_MAX + 1];
|
||||
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
|
||||
u32 limit;
|
||||
int err;
|
||||
|
||||
if (opt) {
|
||||
err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
|
||||
if (err < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (tb[TCA_SFB_PARMS] == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
ctl = nla_data(tb[TCA_SFB_PARMS]);
|
||||
}
|
||||
|
||||
limit = ctl->limit;
|
||||
if (limit == 0)
|
||||
limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
|
||||
|
||||
child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
|
||||
if (IS_ERR(child))
|
||||
return PTR_ERR(child);
|
||||
|
||||
sch_tree_lock(sch);
|
||||
|
||||
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
|
||||
qdisc_destroy(q->qdisc);
|
||||
q->qdisc = child;
|
||||
|
||||
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
|
||||
q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
|
||||
q->rehash_time = jiffies;
|
||||
q->limit = limit;
|
||||
q->increment = ctl->increment;
|
||||
q->decrement = ctl->decrement;
|
||||
q->max = ctl->max;
|
||||
q->bin_size = ctl->bin_size;
|
||||
q->penalty_rate = ctl->penalty_rate;
|
||||
q->penalty_burst = ctl->penalty_burst;
|
||||
q->tokens_avail = ctl->penalty_burst;
|
||||
q->token_time = jiffies;
|
||||
|
||||
q->slot = 0;
|
||||
q->double_buffering = false;
|
||||
sfb_zero_all_buckets(q);
|
||||
sfb_init_perturbation(0, q);
|
||||
sfb_init_perturbation(1, q);
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
q->qdisc = &noop_qdisc;
|
||||
return sfb_change(sch, opt);
|
||||
}
|
||||
|
||||
static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *opts;
|
||||
struct tc_sfb_qopt opt = {
|
||||
.rehash_interval = jiffies_to_msecs(q->rehash_interval),
|
||||
.warmup_time = jiffies_to_msecs(q->warmup_time),
|
||||
.limit = q->limit,
|
||||
.max = q->max,
|
||||
.bin_size = q->bin_size,
|
||||
.increment = q->increment,
|
||||
.decrement = q->decrement,
|
||||
.penalty_rate = q->penalty_rate,
|
||||
.penalty_burst = q->penalty_burst,
|
||||
};
|
||||
|
||||
sch->qstats.backlog = q->qdisc->qstats.backlog;
|
||||
opts = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (opts == NULL)
|
||||
goto nla_put_failure;
|
||||
if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
return nla_nest_end(skb, opts);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, opts);
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_sfb_xstats st = {
|
||||
.earlydrop = q->stats.earlydrop,
|
||||
.penaltydrop = q->stats.penaltydrop,
|
||||
.bucketdrop = q->stats.bucketdrop,
|
||||
.queuedrop = q->stats.queuedrop,
|
||||
.childdrop = q->stats.childdrop,
|
||||
.marked = q->stats.marked,
|
||||
};
|
||||
|
||||
st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
|
||||
|
||||
return gnet_stats_copy_app(d, &st, sizeof(st));
|
||||
}
|
||||
|
||||
static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (new == NULL)
|
||||
new = &noop_qdisc;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
*old = q->qdisc;
|
||||
q->qdisc = new;
|
||||
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
|
||||
qdisc_reset(*old);
|
||||
sch_tree_unlock(sch);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
return q->qdisc;
|
||||
}
|
||||
|
||||
static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void sfb_put(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
}
|
||||
|
||||
static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
|
||||
struct nlattr **tca, unsigned long *arg)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static int sfb_delete(struct Qdisc *sch, unsigned long cl)
|
||||
{
|
||||
return -ENOSYS;
|
||||
}
|
||||
|
||||
static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
|
||||
{
|
||||
if (!walker->stop) {
|
||||
if (walker->count >= walker->skip)
|
||||
if (walker->fn(sch, 1, walker) < 0) {
|
||||
walker->stop = 1;
|
||||
return;
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct sfb_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
|
||||
u32 classid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static const struct Qdisc_class_ops sfb_class_ops = {
|
||||
.graft = sfb_graft,
|
||||
.leaf = sfb_leaf,
|
||||
.get = sfb_get,
|
||||
.put = sfb_put,
|
||||
.change = sfb_change_class,
|
||||
.delete = sfb_delete,
|
||||
.walk = sfb_walk,
|
||||
.tcf_chain = sfb_find_tcf,
|
||||
.bind_tcf = sfb_bind,
|
||||
.unbind_tcf = sfb_put,
|
||||
.dump = sfb_dump_class,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
|
||||
.id = "sfb",
|
||||
.priv_size = sizeof(struct sfb_sched_data),
|
||||
.cl_ops = &sfb_class_ops,
|
||||
.enqueue = sfb_enqueue,
|
||||
.dequeue = sfb_dequeue,
|
||||
.peek = sfb_peek,
|
||||
.init = sfb_init,
|
||||
.reset = sfb_reset,
|
||||
.destroy = sfb_destroy,
|
||||
.change = sfb_change,
|
||||
.dump = sfb_dump,
|
||||
.dump_stats = sfb_dump_stats,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init sfb_module_init(void)
|
||||
{
|
||||
return register_qdisc(&sfb_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit sfb_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&sfb_qdisc_ops);
|
||||
}
|
||||
|
||||
module_init(sfb_module_init)
|
||||
module_exit(sfb_module_exit)
|
||||
|
||||
MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
|
||||
MODULE_AUTHOR("Juliusz Chroboczek");
|
||||
MODULE_AUTHOR("Eric Dumazet");
|
||||
MODULE_LICENSE("GPL");
939
net/sched/sch_sfq.c
Normal file
@ -0,0 +1,939 @@
/*
|
||||
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/in.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/pkt_sched.h>
|
||||
#include <net/flow_keys.h>
|
||||
#include <net/red.h>
|
||||
|
||||
|
||||
/* Stochastic Fairness Queuing algorithm.
|
||||
=======================================
|
||||
|
||||
Source:
|
||||
Paul E. McKenney "Stochastic Fairness Queuing",
|
||||
IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
|
||||
|
||||
Paul E. McKenney "Stochastic Fairness Queuing",
|
||||
"Interworking: Research and Experience", v.2, 1991, p.113-131.
|
||||
|
||||
|
||||
See also:
|
||||
M. Shreedhar and George Varghese "Efficient Fair
|
||||
Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
|
||||
|
||||
|
||||
This is not the thing that is usually called (W)FQ nowadays.
|
||||
It does not use any timestamp mechanism, but instead
|
||||
processes queues in round-robin order.
|
||||
|
||||
ADVANTAGE:
|
||||
|
||||
- It is very cheap. Both CPU and memory requirements are minimal.
|
||||
|
||||
DRAWBACKS:
|
||||
|
||||
- "Stochastic" -> It is not 100% fair.
|
||||
When hash collisions occur, several flows are considered as one.
|
||||
|
||||
- "Round-robin" -> It introduces larger delays than virtual clock
|
||||
based schemes, and should not be used for isolating interactive
|
||||
traffic from non-interactive. This means that this scheduler
|
||||
should be used as leaf of CBQ or P3, which put interactive traffic
|
||||
to higher priority band.
|
||||
|
||||
We still need true WFQ for top level CSZ, but using WFQ
|
||||
for the best effort traffic is absolutely pointless:
|
||||
SFQ is superior for this purpose.
|
||||
|
||||
IMPLEMENTATION:
|
||||
This implementation limits :
|
||||
- maximal queue length per flow to 127 packets.
|
||||
- max mtu to 2^18-1;
|
||||
- max 65408 flows,
|
||||
- number of hash buckets to 65536.
|
||||
|
||||
It is easy to increase these values, but not in flight. */
|
||||
|
||||
#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
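/* Example: a 1514 byte frame costs SFQ_ALLOT_SIZE(1514) ==
 * DIV_ROUND_UP(1514, 8) == 190 scaled units, and even a 64KB packet
 * stays well inside the 16 bit 'allot' field once scaled.
 */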
|
||||
|
||||
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
|
||||
typedef u16 sfq_index;
|
||||
|
||||
/*
|
||||
* We don't use pointers to save space.
|
||||
* Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
|
||||
* while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
|
||||
* are 'pointers' to dep[] array
|
||||
*/
|
||||
struct sfq_head {
|
||||
sfq_index next;
|
||||
sfq_index prev;
|
||||
};
|
||||
|
||||
struct sfq_slot {
|
||||
struct sk_buff *skblist_next;
|
||||
struct sk_buff *skblist_prev;
|
||||
sfq_index qlen; /* number of skbs in skblist */
|
||||
sfq_index next; /* next slot in sfq RR chain */
|
||||
struct sfq_head dep; /* anchor in dep[] chains */
|
||||
unsigned short hash; /* hash value (index in ht[]) */
|
||||
short allot; /* credit for this slot */
|
||||
|
||||
unsigned int backlog;
|
||||
struct red_vars vars;
|
||||
};
|
||||
|
||||
struct sfq_sched_data {
|
||||
/* frequently used fields */
|
||||
int limit; /* limit of total number of packets in this qdisc */
|
||||
unsigned int divisor; /* number of slots in hash table */
|
||||
u8 headdrop;
|
||||
u8 maxdepth; /* limit of packets per flow */
|
||||
|
||||
u32 perturbation;
|
||||
u8 cur_depth; /* depth of longest slot */
|
||||
u8 flags;
|
||||
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
|
||||
struct tcf_proto __rcu *filter_list;
|
||||
sfq_index *ht; /* Hash table ('divisor' slots) */
|
||||
struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
|
||||
|
||||
struct red_parms *red_parms;
|
||||
struct tc_sfqred_stats stats;
|
||||
struct sfq_slot *tail; /* current slot in round */
|
||||
|
||||
struct sfq_head dep[SFQ_MAX_DEPTH + 1];
|
||||
/* Linked lists of slots, indexed by depth
|
||||
* dep[0] : list of unused flows
|
||||
* dep[1] : list of flows with 1 packet
|
||||
* dep[X] : list of flows with X packets
|
||||
*/
|
||||
|
||||
unsigned int maxflows; /* number of flows in flows array */
|
||||
int perturb_period;
|
||||
unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
|
||||
struct timer_list perturb_timer;
|
||||
};
|
||||
|
||||
/*
|
||||
* sfq_head are either in a sfq_slot or in dep[] array
|
||||
*/
|
||||
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
|
||||
{
|
||||
if (val < SFQ_MAX_FLOWS)
|
||||
return &q->slots[val].dep;
|
||||
return &q->dep[val - SFQ_MAX_FLOWS];
|
||||
}
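/* Example: val == 5 resolves to slots[5].dep, while val == SFQ_MAX_FLOWS + 3
 * resolves to dep[3], the anchor of the list of flows currently holding
 * exactly three packets.
 */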
|
||||
|
||||
/*
|
||||
* In order to be able to quickly rehash our queue when timer changes
|
||||
* q->perturbation, we store flow_keys in skb->cb[]
|
||||
*/
|
||||
struct sfq_skb_cb {
|
||||
struct flow_keys keys;
|
||||
};
|
||||
|
||||
static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
|
||||
{
|
||||
qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
|
||||
return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
|
||||
}
|
||||
|
||||
static unsigned int sfq_hash(const struct sfq_sched_data *q,
|
||||
const struct sk_buff *skb)
|
||||
{
|
||||
const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
|
||||
unsigned int hash;
|
||||
|
||||
hash = jhash_3words((__force u32)keys->dst,
|
||||
(__force u32)keys->src ^ keys->ip_proto,
|
||||
(__force u32)keys->ports, q->perturbation);
|
||||
return hash & (q->divisor - 1);
|
||||
}
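/* The masking above relies on q->divisor being a power of two, which
 * sfq_change() enforces, so it is equivalent to hash % q->divisor.
 */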
|
||||
|
||||
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
|
||||
int *qerr)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
struct tcf_result res;
|
||||
struct tcf_proto *fl;
|
||||
int result;
|
||||
|
||||
if (TC_H_MAJ(skb->priority) == sch->handle &&
|
||||
TC_H_MIN(skb->priority) > 0 &&
|
||||
TC_H_MIN(skb->priority) <= q->divisor)
|
||||
return TC_H_MIN(skb->priority);
|
||||
|
||||
fl = rcu_dereference_bh(q->filter_list);
|
||||
if (!fl) {
|
||||
skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
|
||||
return sfq_hash(q, skb) + 1;
|
||||
}
|
||||
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
|
||||
result = tc_classify(skb, fl, &res);
|
||||
if (result >= 0) {
|
||||
#ifdef CONFIG_NET_CLS_ACT
|
||||
switch (result) {
|
||||
case TC_ACT_STOLEN:
|
||||
case TC_ACT_QUEUED:
|
||||
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
|
||||
case TC_ACT_SHOT:
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
if (TC_H_MIN(res.classid) <= q->divisor)
|
||||
return TC_H_MIN(res.classid);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* x : slot number [0 .. SFQ_MAX_FLOWS - 1]
|
||||
*/
|
||||
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
|
||||
{
|
||||
sfq_index p, n;
|
||||
struct sfq_slot *slot = &q->slots[x];
|
||||
int qlen = slot->qlen;
|
||||
|
||||
p = qlen + SFQ_MAX_FLOWS;
|
||||
n = q->dep[qlen].next;
|
||||
|
||||
slot->dep.next = n;
|
||||
slot->dep.prev = p;
|
||||
|
||||
q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
|
||||
sfq_dep_head(q, n)->prev = x;
|
||||
}
|
||||
|
||||
#define sfq_unlink(q, x, n, p) \
|
||||
do { \
|
||||
n = q->slots[x].dep.next; \
|
||||
p = q->slots[x].dep.prev; \
|
||||
sfq_dep_head(q, p)->next = n; \
|
||||
sfq_dep_head(q, n)->prev = p; \
|
||||
} while (0)
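/* sfq_unlink() only detaches slot x from its current depth list; callers
 * such as sfq_inc()/sfq_dec() below update slots[x].qlen and then re-insert
 * the slot with sfq_link().
 */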
|
||||
|
||||
|
||||
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
|
||||
{
|
||||
sfq_index p, n;
|
||||
int d;
|
||||
|
||||
sfq_unlink(q, x, n, p);
|
||||
|
||||
d = q->slots[x].qlen--;
|
||||
if (n == p && q->cur_depth == d)
|
||||
q->cur_depth--;
|
||||
sfq_link(q, x);
|
||||
}
|
||||
|
||||
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
|
||||
{
|
||||
sfq_index p, n;
|
||||
int d;
|
||||
|
||||
sfq_unlink(q, x, n, p);
|
||||
|
||||
d = ++q->slots[x].qlen;
|
||||
if (q->cur_depth < d)
|
||||
q->cur_depth = d;
|
||||
sfq_link(q, x);
|
||||
}
|
||||
|
||||
/* helper functions : might be changed when/if skb use a standard list_head */
|
||||
|
||||
/* remove one skb from tail of slot queue */
|
||||
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
|
||||
{
|
||||
struct sk_buff *skb = slot->skblist_prev;
|
||||
|
||||
slot->skblist_prev = skb->prev;
|
||||
skb->prev->next = (struct sk_buff *)slot;
|
||||
skb->next = skb->prev = NULL;
|
||||
return skb;
|
||||
}
|
||||
|
||||
/* remove one skb from head of slot queue */
|
||||
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
|
||||
{
|
||||
struct sk_buff *skb = slot->skblist_next;
|
||||
|
||||
slot->skblist_next = skb->next;
|
||||
skb->next->prev = (struct sk_buff *)slot;
|
||||
skb->next = skb->prev = NULL;
|
||||
return skb;
|
||||
}
|
||||
|
||||
static inline void slot_queue_init(struct sfq_slot *slot)
|
||||
{
|
||||
memset(slot, 0, sizeof(*slot));
|
||||
slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
|
||||
}
|
||||
|
||||
/* add skb to slot queue (tail add) */
|
||||
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
|
||||
{
|
||||
skb->prev = slot->skblist_prev;
|
||||
skb->next = (struct sk_buff *)slot;
|
||||
slot->skblist_prev->next = skb;
|
||||
slot->skblist_prev = skb;
|
||||
}
|
||||
|
||||
static unsigned int sfq_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
sfq_index x, d = q->cur_depth;
|
||||
struct sk_buff *skb;
|
||||
unsigned int len;
|
||||
struct sfq_slot *slot;
|
||||
|
||||
/* Queue is full! Find the longest slot and drop tail packet from it */
|
||||
if (d > 1) {
|
||||
x = q->dep[d].next;
|
||||
slot = &q->slots[x];
|
||||
drop:
|
||||
skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
|
||||
len = qdisc_pkt_len(skb);
|
||||
slot->backlog -= len;
|
||||
sfq_dec(q, x);
|
||||
kfree_skb(skb);
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_drop(sch);
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
return len;
|
||||
}
|
||||
|
||||
if (d == 1) {
|
||||
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
|
||||
x = q->tail->next;
|
||||
slot = &q->slots[x];
|
||||
q->tail->next = slot->next;
|
||||
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
|
||||
goto drop;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Is ECN parameter configured */
|
||||
static int sfq_prob_mark(const struct sfq_sched_data *q)
|
||||
{
|
||||
return q->flags & TC_RED_ECN;
|
||||
}
|
||||
|
||||
/* Should packets over max threshold just be marked */
|
||||
static int sfq_hard_mark(const struct sfq_sched_data *q)
|
||||
{
|
||||
return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
|
||||
}
|
||||
|
||||
static int sfq_headdrop(const struct sfq_sched_data *q)
|
||||
{
|
||||
return q->headdrop;
|
||||
}
|
||||
|
||||
static int
|
||||
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int hash;
|
||||
sfq_index x, qlen;
|
||||
struct sfq_slot *slot;
|
||||
int uninitialized_var(ret);
|
||||
struct sk_buff *head;
|
||||
int delta;
|
||||
|
||||
hash = sfq_classify(skb, sch, &ret);
|
||||
if (hash == 0) {
|
||||
if (ret & __NET_XMIT_BYPASS)
|
||||
qdisc_qstats_drop(sch);
|
||||
kfree_skb(skb);
|
||||
return ret;
|
||||
}
|
||||
hash--;
|
||||
|
||||
x = q->ht[hash];
|
||||
slot = &q->slots[x];
|
||||
if (x == SFQ_EMPTY_SLOT) {
|
||||
x = q->dep[0].next; /* get a free slot */
|
||||
if (x >= SFQ_MAX_FLOWS)
|
||||
return qdisc_drop(skb, sch);
|
||||
q->ht[hash] = x;
|
||||
slot = &q->slots[x];
|
||||
slot->hash = hash;
|
||||
slot->backlog = 0; /* should already be 0 anyway... */
|
||||
red_set_vars(&slot->vars);
|
||||
goto enqueue;
|
||||
}
|
||||
if (q->red_parms) {
|
||||
slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
|
||||
&slot->vars,
|
||||
slot->backlog);
|
||||
switch (red_action(q->red_parms,
|
||||
&slot->vars,
|
||||
slot->vars.qavg)) {
|
||||
case RED_DONT_MARK:
|
||||
break;
|
||||
|
||||
case RED_PROB_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (sfq_prob_mark(q)) {
|
||||
/* We know we have at least one packet in queue */
|
||||
if (sfq_headdrop(q) &&
|
||||
INET_ECN_set_ce(slot->skblist_next)) {
|
||||
q->stats.prob_mark_head++;
|
||||
break;
|
||||
}
|
||||
if (INET_ECN_set_ce(skb)) {
|
||||
q->stats.prob_mark++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
q->stats.prob_drop++;
|
||||
goto congestion_drop;
|
||||
|
||||
case RED_HARD_MARK:
|
||||
qdisc_qstats_overlimit(sch);
|
||||
if (sfq_hard_mark(q)) {
|
||||
/* We know we have at least one packet in queue */
|
||||
if (sfq_headdrop(q) &&
|
||||
INET_ECN_set_ce(slot->skblist_next)) {
|
||||
q->stats.forced_mark_head++;
|
||||
break;
|
||||
}
|
||||
if (INET_ECN_set_ce(skb)) {
|
||||
q->stats.forced_mark++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
q->stats.forced_drop++;
|
||||
goto congestion_drop;
|
||||
}
|
||||
}
|
||||
|
||||
if (slot->qlen >= q->maxdepth) {
|
||||
congestion_drop:
|
||||
if (!sfq_headdrop(q))
|
||||
return qdisc_drop(skb, sch);
|
||||
|
||||
/* We know we have at least one packet in queue */
|
||||
head = slot_dequeue_head(slot);
|
||||
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
|
||||
sch->qstats.backlog -= delta;
|
||||
slot->backlog -= delta;
|
||||
qdisc_drop(head, sch);
|
||||
|
||||
slot_queue_add(slot, skb);
|
||||
return NET_XMIT_CN;
|
||||
}
|
||||
|
||||
enqueue:
|
||||
qdisc_qstats_backlog_inc(sch, skb);
|
||||
slot->backlog += qdisc_pkt_len(skb);
|
||||
slot_queue_add(slot, skb);
|
||||
sfq_inc(q, x);
|
||||
if (slot->qlen == 1) { /* The flow is new */
|
||||
if (q->tail == NULL) { /* It is the first flow */
|
||||
slot->next = x;
|
||||
} else {
|
||||
slot->next = q->tail->next;
|
||||
q->tail->next = x;
|
||||
}
|
||||
/* We put this flow at the end of our flow list.
|
||||
* This might sound unfair for a new flow to wait after old ones,
|
||||
* but we could end up servicing new flows only, and freeze old ones.
|
||||
*/
|
||||
q->tail = slot;
|
||||
/* We could use a bigger initial quantum for new flows */
|
||||
slot->allot = q->scaled_quantum;
|
||||
}
|
||||
if (++sch->q.qlen <= q->limit)
|
||||
return NET_XMIT_SUCCESS;
|
||||
|
||||
qlen = slot->qlen;
|
||||
sfq_drop(sch);
|
||||
/* Return Congestion Notification only if we dropped a packet
|
||||
* from this flow.
|
||||
*/
|
||||
if (qlen != slot->qlen)
|
||||
return NET_XMIT_CN;
|
||||
|
||||
/* As we dropped a packet, better let upper stack know this */
|
||||
qdisc_tree_decrease_qlen(sch, 1);
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
static struct sk_buff *
|
||||
sfq_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
sfq_index a, next_a;
|
||||
struct sfq_slot *slot;
|
||||
|
||||
/* No active slots */
|
||||
if (q->tail == NULL)
|
||||
return NULL;
|
||||
|
||||
next_slot:
|
||||
a = q->tail->next;
|
||||
slot = &q->slots[a];
|
||||
if (slot->allot <= 0) {
|
||||
q->tail = slot;
|
||||
slot->allot += q->scaled_quantum;
|
||||
goto next_slot;
|
||||
}
|
||||
skb = slot_dequeue_head(slot);
|
||||
sfq_dec(q, a);
|
||||
qdisc_bstats_update(sch, skb);
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
slot->backlog -= qdisc_pkt_len(skb);
|
||||
/* Is the slot empty? */
|
||||
if (slot->qlen == 0) {
|
||||
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
|
||||
next_a = slot->next;
|
||||
if (a == next_a) {
|
||||
q->tail = NULL; /* no more active slots */
|
||||
return skb;
|
||||
}
|
||||
q->tail->next = next_a;
|
||||
} else {
|
||||
slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
|
||||
}
|
||||
return skb;
|
||||
}
|
||||
|
||||
static void
|
||||
sfq_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct sk_buff *skb;
|
||||
|
||||
while ((skb = sfq_dequeue(sch)) != NULL)
|
||||
kfree_skb(skb);
|
||||
}
|
||||
|
||||
/*
|
||||
* When q->perturbation is changed, we rehash all queued skbs
|
||||
* to avoid OOO (Out Of Order) effects.
|
||||
* We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
|
||||
* counters.
|
||||
*/
|
||||
static void sfq_rehash(struct Qdisc *sch)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
int i;
|
||||
struct sfq_slot *slot;
|
||||
struct sk_buff_head list;
|
||||
int dropped = 0;
|
||||
|
||||
__skb_queue_head_init(&list);
|
||||
|
||||
for (i = 0; i < q->maxflows; i++) {
|
||||
slot = &q->slots[i];
|
||||
if (!slot->qlen)
|
||||
continue;
|
||||
while (slot->qlen) {
|
||||
skb = slot_dequeue_head(slot);
|
||||
sfq_dec(q, i);
|
||||
__skb_queue_tail(&list, skb);
|
||||
}
|
||||
slot->backlog = 0;
|
||||
red_set_vars(&slot->vars);
|
||||
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
|
||||
}
|
||||
q->tail = NULL;
|
||||
|
||||
while ((skb = __skb_dequeue(&list)) != NULL) {
|
||||
unsigned int hash = sfq_hash(q, skb);
|
||||
sfq_index x = q->ht[hash];
|
||||
|
||||
slot = &q->slots[x];
|
||||
if (x == SFQ_EMPTY_SLOT) {
|
||||
x = q->dep[0].next; /* get a free slot */
|
||||
if (x >= SFQ_MAX_FLOWS) {
|
||||
drop:
|
||||
qdisc_qstats_backlog_dec(sch, skb);
|
||||
kfree_skb(skb);
|
||||
dropped++;
|
||||
continue;
|
||||
}
|
||||
q->ht[hash] = x;
|
||||
slot = &q->slots[x];
|
||||
slot->hash = hash;
|
||||
}
|
||||
if (slot->qlen >= q->maxdepth)
|
||||
goto drop;
|
||||
slot_queue_add(slot, skb);
|
||||
if (q->red_parms)
|
||||
slot->vars.qavg = red_calc_qavg(q->red_parms,
|
||||
&slot->vars,
|
||||
slot->backlog);
|
||||
slot->backlog += qdisc_pkt_len(skb);
|
||||
sfq_inc(q, x);
|
||||
if (slot->qlen == 1) { /* The flow is new */
|
||||
if (q->tail == NULL) { /* It is the first flow */
|
||||
slot->next = x;
|
||||
} else {
|
||||
slot->next = q->tail->next;
|
||||
q->tail->next = x;
|
||||
}
|
||||
q->tail = slot;
|
||||
slot->allot = q->scaled_quantum;
|
||||
}
|
||||
}
|
||||
sch->q.qlen -= dropped;
|
||||
qdisc_tree_decrease_qlen(sch, dropped);
|
||||
}
|
||||
|
||||
static void sfq_perturbation(unsigned long arg)
|
||||
{
|
||||
struct Qdisc *sch = (struct Qdisc *)arg;
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
|
||||
|
||||
spin_lock(root_lock);
|
||||
q->perturbation = prandom_u32();
|
||||
if (!q->filter_list && q->tail)
|
||||
sfq_rehash(sch);
|
||||
spin_unlock(root_lock);
|
||||
|
||||
if (q->perturb_period)
|
||||
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
|
||||
}
|
||||
|
||||
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
struct tc_sfq_qopt *ctl = nla_data(opt);
|
||||
struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
|
||||
unsigned int qlen;
|
||||
struct red_parms *p = NULL;
|
||||
|
||||
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
|
||||
return -EINVAL;
|
||||
if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
|
||||
ctl_v1 = nla_data(opt);
|
||||
if (ctl->divisor &&
|
||||
(!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
|
||||
return -EINVAL;
|
||||
if (ctl_v1 && ctl_v1->qth_min) {
|
||||
p = kmalloc(sizeof(*p), GFP_KERNEL);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
}
|
||||
sch_tree_lock(sch);
|
||||
if (ctl->quantum) {
|
||||
q->quantum = ctl->quantum;
|
||||
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
|
||||
}
|
||||
q->perturb_period = ctl->perturb_period * HZ;
|
||||
if (ctl->flows)
|
||||
q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
|
||||
if (ctl->divisor) {
|
||||
q->divisor = ctl->divisor;
|
||||
q->maxflows = min_t(u32, q->maxflows, q->divisor);
|
||||
}
|
||||
if (ctl_v1) {
|
||||
if (ctl_v1->depth)
|
||||
q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
|
||||
if (p) {
|
||||
swap(q->red_parms, p);
|
||||
red_set_parms(q->red_parms,
|
||||
ctl_v1->qth_min, ctl_v1->qth_max,
|
||||
ctl_v1->Wlog,
|
||||
ctl_v1->Plog, ctl_v1->Scell_log,
|
||||
NULL,
|
||||
ctl_v1->max_P);
|
||||
}
|
||||
q->flags = ctl_v1->flags;
|
||||
q->headdrop = ctl_v1->headdrop;
|
||||
}
|
||||
if (ctl->limit) {
|
||||
q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
|
||||
q->maxflows = min_t(u32, q->maxflows, q->limit);
|
||||
}
|
||||
|
||||
qlen = sch->q.qlen;
|
||||
while (sch->q.qlen > q->limit)
|
||||
sfq_drop(sch);
|
||||
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
|
||||
|
||||
del_timer(&q->perturb_timer);
|
||||
if (q->perturb_period) {
|
||||
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
|
||||
q->perturbation = prandom_u32();
|
||||
}
|
||||
sch_tree_unlock(sch);
|
||||
kfree(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void *sfq_alloc(size_t sz)
|
||||
{
|
||||
void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
|
||||
|
||||
if (!ptr)
|
||||
ptr = vmalloc(sz);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
static void sfq_free(void *addr)
|
||||
{
|
||||
kvfree(addr);
|
||||
}
|
||||
|
||||
static void sfq_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcf_destroy_chain(&q->filter_list);
|
||||
q->perturb_period = 0;
|
||||
del_timer_sync(&q->perturb_timer);
|
||||
sfq_free(q->ht);
|
||||
sfq_free(q->slots);
|
||||
kfree(q->red_parms);
|
||||
}
|
||||
|
||||
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
int i;
|
||||
|
||||
q->perturb_timer.function = sfq_perturbation;
|
||||
q->perturb_timer.data = (unsigned long)sch;
|
||||
init_timer_deferrable(&q->perturb_timer);
|
||||
|
||||
for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
|
||||
q->dep[i].next = i + SFQ_MAX_FLOWS;
|
||||
q->dep[i].prev = i + SFQ_MAX_FLOWS;
|
||||
}
|
||||
|
||||
q->limit = SFQ_MAX_DEPTH;
|
||||
q->maxdepth = SFQ_MAX_DEPTH;
|
||||
q->cur_depth = 0;
|
||||
q->tail = NULL;
|
||||
q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
|
||||
q->maxflows = SFQ_DEFAULT_FLOWS;
|
||||
q->quantum = psched_mtu(qdisc_dev(sch));
|
||||
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
|
||||
q->perturb_period = 0;
|
||||
q->perturbation = prandom_u32();
|
||||
|
||||
if (opt) {
|
||||
int err = sfq_change(sch, opt);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
|
||||
q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
|
||||
if (!q->ht || !q->slots) {
|
||||
sfq_destroy(sch);
|
||||
return -ENOMEM;
|
||||
}
|
||||
for (i = 0; i < q->divisor; i++)
|
||||
q->ht[i] = SFQ_EMPTY_SLOT;
|
||||
|
||||
for (i = 0; i < q->maxflows; i++) {
|
||||
slot_queue_init(&q->slots[i]);
|
||||
sfq_link(q, i);
|
||||
}
|
||||
if (q->limit >= 1)
|
||||
sch->flags |= TCQ_F_CAN_BYPASS;
|
||||
else
|
||||
sch->flags &= ~TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned char *b = skb_tail_pointer(skb);
|
||||
struct tc_sfq_qopt_v1 opt;
|
||||
struct red_parms *p = q->red_parms;
|
||||
|
||||
memset(&opt, 0, sizeof(opt));
|
||||
opt.v0.quantum = q->quantum;
|
||||
opt.v0.perturb_period = q->perturb_period / HZ;
|
||||
opt.v0.limit = q->limit;
|
||||
opt.v0.divisor = q->divisor;
|
||||
opt.v0.flows = q->maxflows;
|
||||
opt.depth = q->maxdepth;
|
||||
opt.headdrop = q->headdrop;
|
||||
|
||||
if (p) {
|
||||
opt.qth_min = p->qth_min >> p->Wlog;
|
||||
opt.qth_max = p->qth_max >> p->Wlog;
|
||||
opt.Wlog = p->Wlog;
|
||||
opt.Plog = p->Plog;
|
||||
opt.Scell_log = p->Scell_log;
|
||||
opt.max_P = p->max_P;
|
||||
}
|
||||
memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
|
||||
opt.flags = q->flags;
|
||||
|
||||
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
|
||||
return skb->len;
|
||||
|
||||
nla_put_failure:
|
||||
nlmsg_trim(skb, b);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
|
||||
u32 classid)
|
||||
{
|
||||
/* we cannot bypass queue discipline anymore */
|
||||
sch->flags &= ~TCQ_F_CAN_BYPASS;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void sfq_put(struct Qdisc *q, unsigned long cl)
|
||||
{
|
||||
}
|
||||
|
||||
static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
|
||||
unsigned long cl)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (cl)
|
||||
return NULL;
|
||||
return &q->filter_list;
|
||||
}
|
||||
|
||||
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
tcm->tcm_handle |= TC_H_MIN(cl);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
|
||||
struct gnet_dump *d)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
sfq_index idx = q->ht[cl - 1];
|
||||
struct gnet_stats_queue qs = { 0 };
|
||||
struct tc_sfq_xstats xstats = { 0 };
|
||||
|
||||
if (idx != SFQ_EMPTY_SLOT) {
|
||||
const struct sfq_slot *slot = &q->slots[idx];
|
||||
|
||||
xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
|
||||
qs.qlen = slot->qlen;
|
||||
qs.backlog = slot->backlog;
|
||||
}
|
||||
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
|
||||
return -1;
|
||||
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
|
||||
}
|
||||
|
||||
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
|
||||
{
|
||||
struct sfq_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int i;
|
||||
|
||||
if (arg->stop)
|
||||
return;
|
||||
|
||||
for (i = 0; i < q->divisor; i++) {
|
||||
if (q->ht[i] == SFQ_EMPTY_SLOT ||
|
||||
arg->count < arg->skip) {
|
||||
arg->count++;
|
||||
continue;
|
||||
}
|
||||
if (arg->fn(sch, i + 1, arg) < 0) {
|
||||
arg->stop = 1;
|
||||
break;
|
||||
}
|
||||
arg->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops sfq_class_ops = {
|
||||
.leaf = sfq_leaf,
|
||||
.get = sfq_get,
|
||||
.put = sfq_put,
|
||||
.tcf_chain = sfq_find_tcf,
|
||||
.bind_tcf = sfq_bind,
|
||||
.unbind_tcf = sfq_put,
|
||||
.dump = sfq_dump_class,
|
||||
.dump_stats = sfq_dump_class_stats,
|
||||
.walk = sfq_walk,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
|
||||
.cl_ops = &sfq_class_ops,
|
||||
.id = "sfq",
|
||||
.priv_size = sizeof(struct sfq_sched_data),
|
||||
.enqueue = sfq_enqueue,
|
||||
.dequeue = sfq_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.drop = sfq_drop,
|
||||
.init = sfq_init,
|
||||
.reset = sfq_reset,
|
||||
.destroy = sfq_destroy,
|
||||
.change = NULL,
|
||||
.dump = sfq_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init sfq_module_init(void)
|
||||
{
|
||||
return register_qdisc(&sfq_qdisc_ops);
|
||||
}
|
||||
static void __exit sfq_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&sfq_qdisc_ops);
|
||||
}
|
||||
module_init(sfq_module_init)
|
||||
module_exit(sfq_module_exit)
|
||||
MODULE_LICENSE("GPL");
579
net/sched/sch_tbf.c
Normal file
@ -0,0 +1,579 @@
/*
|
||||
* net/sched/sch_tbf.c Token Bucket Filter queue.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
|
||||
* original idea by Martin Devera
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <net/netlink.h>
|
||||
#include <net/sch_generic.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
|
||||
/* Simple Token Bucket Filter.
|
||||
=======================================
|
||||
|
||||
SOURCE.
|
||||
-------
|
||||
|
||||
None.
|
||||
|
||||
Description.
|
||||
------------
|
||||
|
||||
A data flow obeys TBF with rate R and depth B, if for any
|
||||
time interval t_i...t_f the number of transmitted bits
|
||||
does not exceed B + R*(t_f-t_i).
|
||||
|
||||
Packetized version of this definition:
|
||||
The sequence of packets of sizes s_i served at moments t_i
|
||||
obeys TBF, if for any i<=k:
|
||||
|
||||
s_i+....+s_k <= B + R*(t_k - t_i)
|
||||
|
||||
Algorithm.
|
||||
----------
|
||||
|
||||
Let N(t_i) be B/R initially and N(t) grow continuously with time as:
|
||||
|
||||
N(t+delta) = min{B/R, N(t) + delta}
|
||||
|
||||
If the first packet in queue has length S, it may be
|
||||
transmitted only at the time t_* when S/R <= N(t_*),
|
||||
and in this case N(t) jumps:
|
||||
|
||||
N(t_* + 0) = N(t_* - 0) - S/R.
|
||||
|
||||
|
||||
|
||||
Actually, QoS requires two TBF to be applied to a data stream.
|
||||
One of them controls steady state burst size, another
|
||||
one with rate P (peak rate) and depth M (equal to link MTU)
|
||||
limits bursts at a smaller time scale.
|
||||
|
||||
It is easy to see that P>R, and B>M. If P is infinity, this double
|
||||
TBF is equivalent to a single one.
|
||||
|
||||
When TBF works in reshaping mode, latency is estimated as:
|
||||
|
||||
lat = max ((L-B)/R, (L-M)/P)
|
||||
|
||||
|
||||
NOTES.
|
||||
------
|
||||
|
||||
If TBF throttles, it starts a watchdog timer, which will wake it up
|
||||
when it is ready to transmit.
|
||||
Note that the minimal timer resolution is 1/HZ.
|
||||
If no new packets arrive during this period,
|
||||
or if the device is not awakened by EOI for some previous packet,
|
||||
TBF can stop its activity for 1/HZ.
|
||||
|
||||
|
||||
This means, that with depth B, the maximal rate is
|
||||
|
||||
R_crit = B*HZ
|
||||
|
||||
F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
|
||||
|
||||
Note that the peak rate TBF is much tougher: with MTU 1500
|
||||
P_crit = 150Kbytes/sec. So, if you need greater peak
|
||||
rates, use alpha with HZ=1000 :-)
|
||||
|
||||
With classful TBF, limit is just kept for backwards compatibility.
|
||||
It is passed to the default bfifo qdisc - if the inner qdisc is
|
||||
changed the limit is not effective anymore.
|
||||
*/
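/* Worked example of the latency bound above (illustrative numbers only):
 * with R = 125 kbyte/s, B = 10 kbyte, backlog limit L = 60 kbyte and no
 * peak rate configured, lat = (L - B)/R = (60 - 10)/125 = 0.4 seconds.
 */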
|
||||
|
||||
struct tbf_sched_data {
|
||||
/* Parameters */
|
||||
u32 limit; /* Maximal length of backlog: bytes */
|
||||
u32 max_size;
|
||||
s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
|
||||
s64 mtu;
|
||||
struct psched_ratecfg rate;
|
||||
struct psched_ratecfg peak;
|
||||
|
||||
/* Variables */
|
||||
s64 tokens; /* Current number of B tokens */
|
||||
s64 ptokens; /* Current number of P tokens */
|
||||
s64 t_c; /* Time check-point */
|
||||
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
|
||||
struct qdisc_watchdog watchdog; /* Watchdog timer */
|
||||
};
|
||||
|
||||
|
||||
/* Time to Length, convert time in ns to length in bytes
|
||||
* to determine how many bytes can be sent in a given time.
|
||||
*/
|
||||
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
|
||||
u64 time_in_ns)
|
||||
{
|
||||
/* The formula is :
|
||||
* len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
|
||||
*/
|
||||
u64 len = time_in_ns * r->rate_bytes_ps;
|
||||
|
||||
do_div(len, NSEC_PER_SEC);
|
||||
|
||||
if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
|
||||
do_div(len, 53);
|
||||
len = len * 48;
|
||||
}
|
||||
|
||||
if (len > r->overhead)
|
||||
len -= r->overhead;
|
||||
else
|
||||
len = 0;
|
||||
|
||||
return len;
|
||||
}
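/* The TC_LINKLAYER_ATM branch accounts for only 48 of every 53 bytes of an
 * ATM cell carrying payload: time worth 53000 raw bytes converts to 48000
 * bytes of usable data.
 */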
|
||||
|
||||
/*
|
||||
* Return length of individual segments of a gso packet,
|
||||
* including all headers (MAC, IP, TCP/UDP)
|
||||
*/
|
||||
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
|
||||
{
|
||||
unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
|
||||
return hdr_len + skb_gso_transport_seglen(skb);
|
||||
}
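/* tbf_enqueue() below accepts a GSO super-packet that exceeds max_size as
 * long as every individual segment (MAC header included) fits, and hands
 * it to tbf_segment() to be queued segment by segment.
 */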
|
||||
|
||||
/* GSO packet is too big, segment it so that tbf can transmit
|
||||
* each segment in time
|
||||
*/
|
||||
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *segs, *nskb;
|
||||
netdev_features_t features = netif_skb_features(skb);
|
||||
int ret, nb;
|
||||
|
||||
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
|
||||
|
||||
if (IS_ERR_OR_NULL(segs))
|
||||
return qdisc_reshape_fail(skb, sch);
|
||||
|
||||
nb = 0;
|
||||
while (segs) {
|
||||
nskb = segs->next;
|
||||
segs->next = NULL;
|
||||
qdisc_skb_cb(segs)->pkt_len = segs->len;
|
||||
ret = qdisc_enqueue(segs, q->qdisc);
|
||||
if (ret != NET_XMIT_SUCCESS) {
|
||||
if (net_xmit_drop_count(ret))
|
||||
qdisc_qstats_drop(sch);
|
||||
} else {
|
||||
nb++;
|
||||
}
|
||||
segs = nskb;
|
||||
}
|
||||
sch->q.qlen += nb;
|
||||
if (nb > 1)
|
||||
qdisc_tree_decrease_qlen(sch, 1 - nb);
|
||||
consume_skb(skb);
|
||||
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
|
||||
}
|
||||
|
||||
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
int ret;
|
||||
|
||||
if (qdisc_pkt_len(skb) > q->max_size) {
|
||||
if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
|
||||
return tbf_segment(skb, sch);
|
||||
return qdisc_reshape_fail(skb, sch);
|
||||
}
|
||||
ret = qdisc_enqueue(skb, q->qdisc);
|
||||
if (ret != NET_XMIT_SUCCESS) {
|
||||
if (net_xmit_drop_count(ret))
|
||||
qdisc_qstats_drop(sch);
|
||||
return ret;
|
||||
}
|
||||
|
||||
sch->q.qlen++;
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
static unsigned int tbf_drop(struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
unsigned int len = 0;
|
||||
|
||||
if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
|
||||
sch->q.qlen--;
|
||||
qdisc_qstats_drop(sch);
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
static bool tbf_peak_present(const struct tbf_sched_data *q)
|
||||
{
|
||||
return q->peak.rate_bytes_ps;
|
||||
}
|
||||
|
||||
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
struct sk_buff *skb;
|
||||
|
||||
skb = q->qdisc->ops->peek(q->qdisc);
|
||||
|
||||
if (skb) {
|
||||
s64 now;
|
||||
s64 toks;
|
||||
s64 ptoks = 0;
|
||||
unsigned int len = qdisc_pkt_len(skb);
|
||||
|
||||
now = ktime_get_ns();
|
||||
toks = min_t(s64, now - q->t_c, q->buffer);
|
||||
|
||||
if (tbf_peak_present(q)) {
|
||||
ptoks = toks + q->ptokens;
|
||||
if (ptoks > q->mtu)
|
||||
ptoks = q->mtu;
|
||||
ptoks -= (s64) psched_l2t_ns(&q->peak, len);
|
||||
}
|
||||
toks += q->tokens;
|
||||
if (toks > q->buffer)
|
||||
toks = q->buffer;
|
||||
toks -= (s64) psched_l2t_ns(&q->rate, len);
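		/* (toks | ptoks) is non-negative only when both token counts
		 * are non-negative, i.e. both the rate and the peak bucket
		 * can cover this packet.
		 */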
|
||||
|
||||
if ((toks|ptoks) >= 0) {
|
||||
skb = qdisc_dequeue_peeked(q->qdisc);
|
||||
if (unlikely(!skb))
|
||||
return NULL;
|
||||
|
||||
q->t_c = now;
|
||||
q->tokens = toks;
|
||||
q->ptokens = ptoks;
|
||||
sch->q.qlen--;
|
||||
qdisc_unthrottled(sch);
|
||||
qdisc_bstats_update(sch, skb);
|
||||
return skb;
|
||||
}
|
||||
|
||||
qdisc_watchdog_schedule_ns(&q->watchdog,
|
||||
now + max_t(long, -toks, -ptoks),
|
||||
true);
|
||||
|
||||
/* Maybe we have a shorter packet in the queue,
|
||||
which can be sent now. It sounds cool,
|
||||
but, however, this is wrong in principle.
|
||||
We MUST NOT reorder packets under these circumstances.
|
||||
|
||||
Really, if we split the flow into independent
|
||||
subflows, it would be a very good solution.
|
||||
This is the main idea of all FQ algorithms
|
||||
(cf. CSZ, HPFQ, HFSC)
|
||||
*/
|
||||
|
||||
qdisc_qstats_overlimit(sch);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void tbf_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
qdisc_reset(q->qdisc);
|
||||
sch->q.qlen = 0;
|
||||
q->t_c = ktime_get_ns();
|
||||
q->tokens = q->buffer;
|
||||
q->ptokens = q->mtu;
|
||||
qdisc_watchdog_cancel(&q->watchdog);
|
||||
}
|
||||
|
||||
static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
|
||||
[TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
|
||||
[TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
|
||||
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
|
||||
[TCA_TBF_RATE64] = { .type = NLA_U64 },
|
||||
[TCA_TBF_PRATE64] = { .type = NLA_U64 },
|
||||
[TCA_TBF_BURST] = { .type = NLA_U32 },
|
||||
[TCA_TBF_PBURST] = { .type = NLA_U32 },
|
||||
};
|
||||
|
||||
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
int err;
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *tb[TCA_TBF_MAX + 1];
|
||||
struct tc_tbf_qopt *qopt;
|
||||
struct Qdisc *child = NULL;
|
||||
struct psched_ratecfg rate;
|
||||
struct psched_ratecfg peak;
|
||||
u64 max_size;
|
||||
s64 buffer, mtu;
|
||||
u64 rate64 = 0, prate64 = 0;
|
||||
|
||||
err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
|
||||
if (err < 0)
|
||||
return err;
|
||||
|
||||
err = -EINVAL;
|
||||
if (tb[TCA_TBF_PARMS] == NULL)
|
||||
goto done;
|
||||
|
||||
qopt = nla_data(tb[TCA_TBF_PARMS]);
|
||||
if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
|
||||
qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
|
||||
tb[TCA_TBF_RTAB]));
|
||||
|
||||
if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
|
||||
qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
|
||||
tb[TCA_TBF_PTAB]));
|
||||
|
||||
buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
|
||||
mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
|
||||
|
||||
if (tb[TCA_TBF_RATE64])
|
||||
rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
|
||||
psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
|
||||
|
||||
if (tb[TCA_TBF_BURST]) {
|
||||
max_size = nla_get_u32(tb[TCA_TBF_BURST]);
|
||||
buffer = psched_l2t_ns(&rate, max_size);
|
||||
} else {
|
||||
max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
|
||||
}
|
||||
|
||||
if (qopt->peakrate.rate) {
|
||||
if (tb[TCA_TBF_PRATE64])
|
||||
prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
|
||||
psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
|
||||
if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
|
||||
pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
|
||||
peak.rate_bytes_ps, rate.rate_bytes_ps);
|
||||
err = -EINVAL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (tb[TCA_TBF_PBURST]) {
|
||||
u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
|
||||
max_size = min_t(u32, max_size, pburst);
|
||||
mtu = psched_l2t_ns(&peak, pburst);
|
||||
} else {
|
||||
max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
|
||||
}
|
||||
} else {
|
||||
memset(&peak, 0, sizeof(peak));
|
||||
}
|
||||
|
||||
if (max_size < psched_mtu(qdisc_dev(sch)))
|
||||
pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
|
||||
max_size, qdisc_dev(sch)->name,
|
||||
psched_mtu(qdisc_dev(sch)));
|
||||
|
||||
if (!max_size) {
|
||||
err = -EINVAL;
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (q->qdisc != &noop_qdisc) {
|
||||
err = fifo_set_limit(q->qdisc, qopt->limit);
|
||||
if (err)
|
||||
goto done;
|
||||
} else if (qopt->limit > 0) {
|
||||
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
|
||||
if (IS_ERR(child)) {
|
||||
err = PTR_ERR(child);
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
|
||||
sch_tree_lock(sch);
|
||||
if (child) {
|
||||
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
|
||||
qdisc_destroy(q->qdisc);
|
||||
q->qdisc = child;
|
||||
}
|
||||
q->limit = qopt->limit;
|
||||
if (tb[TCA_TBF_PBURST])
|
||||
q->mtu = mtu;
|
||||
else
|
||||
q->mtu = PSCHED_TICKS2NS(qopt->mtu);
|
||||
q->max_size = max_size;
|
||||
if (tb[TCA_TBF_BURST])
|
||||
q->buffer = buffer;
|
||||
else
|
||||
q->buffer = PSCHED_TICKS2NS(qopt->buffer);
|
||||
q->tokens = q->buffer;
|
||||
q->ptokens = q->mtu;
|
||||
|
||||
memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
|
||||
memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
|
||||
|
||||
sch_tree_unlock(sch);
|
||||
err = 0;
|
||||
done:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (opt == NULL)
|
||||
return -EINVAL;
|
||||
|
||||
q->t_c = ktime_get_ns();
|
||||
qdisc_watchdog_init(&q->watchdog, sch);
|
||||
q->qdisc = &noop_qdisc;
|
||||
|
||||
return tbf_change(sch, opt);
|
||||
}
|
||||
|
||||
static void tbf_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
qdisc_watchdog_cancel(&q->watchdog);
|
||||
qdisc_destroy(q->qdisc);
|
||||
}
|
||||
|
||||
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
struct nlattr *nest;
|
||||
struct tc_tbf_qopt opt;
|
||||
|
||||
sch->qstats.backlog = q->qdisc->qstats.backlog;
|
||||
nest = nla_nest_start(skb, TCA_OPTIONS);
|
||||
if (nest == NULL)
|
||||
goto nla_put_failure;
|
||||
|
||||
opt.limit = q->limit;
|
||||
psched_ratecfg_getrate(&opt.rate, &q->rate);
|
||||
if (tbf_peak_present(q))
|
||||
psched_ratecfg_getrate(&opt.peakrate, &q->peak);
|
||||
else
|
||||
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
|
||||
opt.mtu = PSCHED_NS2TICKS(q->mtu);
|
||||
opt.buffer = PSCHED_NS2TICKS(q->buffer);
|
||||
if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
|
||||
goto nla_put_failure;
|
||||
if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
|
||||
nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
|
||||
goto nla_put_failure;
|
||||
if (tbf_peak_present(q) &&
|
||||
q->peak.rate_bytes_ps >= (1ULL << 32) &&
|
||||
nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
|
||||
goto nla_put_failure;
|
||||
|
||||
return nla_nest_end(skb, nest);
|
||||
|
||||
nla_put_failure:
|
||||
nla_nest_cancel(skb, nest);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
|
||||
struct sk_buff *skb, struct tcmsg *tcm)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
tcm->tcm_handle |= TC_H_MIN(1);
|
||||
tcm->tcm_info = q->qdisc->handle;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
|
||||
struct Qdisc **old)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (new == NULL)
|
||||
new = &noop_qdisc;
|
||||
|
||||
sch_tree_lock(sch);
|
||||
*old = q->qdisc;
|
||||
q->qdisc = new;
|
||||
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
|
||||
qdisc_reset(*old);
|
||||
sch_tree_unlock(sch);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
struct tbf_sched_data *q = qdisc_priv(sch);
|
||||
return q->qdisc;
|
||||
}
|
||||
|
||||
static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void tbf_put(struct Qdisc *sch, unsigned long arg)
|
||||
{
|
||||
}
|
||||
|
||||
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
|
||||
{
|
||||
if (!walker->stop) {
|
||||
if (walker->count >= walker->skip)
|
||||
if (walker->fn(sch, 1, walker) < 0) {
|
||||
walker->stop = 1;
|
||||
return;
|
||||
}
|
||||
walker->count++;
|
||||
}
|
||||
}
|
||||
|
||||
static const struct Qdisc_class_ops tbf_class_ops = {
|
||||
.graft = tbf_graft,
|
||||
.leaf = tbf_leaf,
|
||||
.get = tbf_get,
|
||||
.put = tbf_put,
|
||||
.walk = tbf_walk,
|
||||
.dump = tbf_dump_class,
|
||||
};
|
||||
|
||||
static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
|
||||
.next = NULL,
|
||||
.cl_ops = &tbf_class_ops,
|
||||
.id = "tbf",
|
||||
.priv_size = sizeof(struct tbf_sched_data),
|
||||
.enqueue = tbf_enqueue,
|
||||
.dequeue = tbf_dequeue,
|
||||
.peek = qdisc_peek_dequeued,
|
||||
.drop = tbf_drop,
|
||||
.init = tbf_init,
|
||||
.reset = tbf_reset,
|
||||
.destroy = tbf_destroy,
|
||||
.change = tbf_change,
|
||||
.dump = tbf_dump,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init tbf_module_init(void)
|
||||
{
|
||||
return register_qdisc(&tbf_qdisc_ops);
|
||||
}
|
||||
|
||||
static void __exit tbf_module_exit(void)
|
||||
{
|
||||
unregister_qdisc(&tbf_qdisc_ops);
|
||||
}
|
||||
module_init(tbf_module_init)
|
||||
module_exit(tbf_module_exit)
|
||||
MODULE_LICENSE("GPL");
535
net/sched/sch_teql.c
Normal file
@ -0,0 +1,535 @@
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public License
|
||||
* as published by the Free Software Foundation; either version
|
||||
* 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/if_arp.h>
|
||||
#include <linux/netdevice.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/skbuff.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <net/dst.h>
|
||||
#include <net/neighbour.h>
|
||||
#include <net/pkt_sched.h>
|
||||
|
||||
/*
|
||||
How to set it up.
|
||||
----------------
|
||||
|
||||
After loading this module you will find a new device teqlN
|
||||
and new qdisc with the same name. To join a slave to the equalizer
|
||||
you should just set this qdisc on a device f.e.
|
||||
|
||||
# tc qdisc add dev eth0 root teql0
|
||||
# tc qdisc add dev eth1 root teql0
|
||||
|
||||
That's all. Full PnP 8)
|
||||
|
||||
Applicability.
|
||||
--------------
|
||||
|
||||
1. Slave devices MUST be active devices, i.e., they must raise the tbusy
|
||||
signal and generate EOI events. If you want to equalize virtual devices
|
||||
like tunnels, use a normal eql device.
|
||||
2. This device puts no limitations on physical slave characteristics
|
||||
f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
|
||||
Certainly, large difference in link speeds will make the resulting
|
||||
equalized link unusable, because of huge packet reordering.
|
||||
I estimate an upper useful difference as ~10 times.
|
||||
3. If the slave requires address resolution, only protocols using
|
||||
neighbour cache (IPv4/IPv6) will work over the equalized link.
|
||||
Other protocols are still allowed to use the slave device directly,
|
||||
which will not break load balancing, though native slave
|
||||
traffic will have the highest priority. */
|
||||
|
||||
struct teql_master {
|
||||
struct Qdisc_ops qops;
|
||||
struct net_device *dev;
|
||||
struct Qdisc *slaves;
|
||||
struct list_head master_list;
|
||||
unsigned long tx_bytes;
|
||||
unsigned long tx_packets;
|
||||
unsigned long tx_errors;
|
||||
unsigned long tx_dropped;
|
||||
};
|
||||
|
||||
struct teql_sched_data {
|
||||
struct Qdisc *next;
|
||||
struct teql_master *m;
|
||||
struct sk_buff_head q;
|
||||
};
|
||||
|
||||
#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
|
||||
|
||||
#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
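/* The slave qdiscs of one master form a circular singly linked list through
 * their private 'next' pointer (NEXT_SLAVE() above); teql_qdisc_init()
 * inserts new slaves into this ring and teql_master_xmit() walks it round
 * robin.
 */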
|
||||
|
||||
/* "teql*" qdisc routines */
|
||||
|
||||
static int
|
||||
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct teql_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (q->q.qlen < dev->tx_queue_len) {
|
||||
__skb_queue_tail(&q->q, skb);
|
||||
return NET_XMIT_SUCCESS;
|
||||
}
|
||||
|
||||
return qdisc_drop(skb, sch);
|
||||
}
|
||||
|
||||
static struct sk_buff *
|
||||
teql_dequeue(struct Qdisc *sch)
|
||||
{
|
||||
struct teql_sched_data *dat = qdisc_priv(sch);
|
||||
struct netdev_queue *dat_queue;
|
||||
struct sk_buff *skb;
|
||||
struct Qdisc *q;
|
||||
|
||||
skb = __skb_dequeue(&dat->q);
|
||||
dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
|
||||
q = rcu_dereference_bh(dat_queue->qdisc);
|
||||
|
||||
if (skb == NULL) {
|
||||
struct net_device *m = qdisc_dev(q);
|
||||
if (m) {
|
||||
dat->m->slaves = sch;
|
||||
netif_wake_queue(m);
|
||||
}
|
||||
} else {
|
||||
qdisc_bstats_update(sch, skb);
|
||||
}
|
||||
sch->q.qlen = dat->q.qlen + q->q.qlen;
|
||||
return skb;
|
||||
}
|
||||
|
||||
static struct sk_buff *
|
||||
teql_peek(struct Qdisc *sch)
|
||||
{
|
||||
/* teql is meant to be used as root qdisc */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline void
|
||||
teql_neigh_release(struct neighbour *n)
|
||||
{
|
||||
if (n)
|
||||
neigh_release(n);
|
||||
}
|
||||
|
||||
static void
|
||||
teql_reset(struct Qdisc *sch)
|
||||
{
|
||||
struct teql_sched_data *dat = qdisc_priv(sch);
|
||||
|
||||
skb_queue_purge(&dat->q);
|
||||
sch->q.qlen = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
teql_destroy(struct Qdisc *sch)
|
||||
{
|
||||
struct Qdisc *q, *prev;
|
||||
struct teql_sched_data *dat = qdisc_priv(sch);
|
||||
struct teql_master *master = dat->m;
|
||||
|
||||
prev = master->slaves;
|
||||
if (prev) {
|
||||
do {
|
||||
q = NEXT_SLAVE(prev);
|
||||
if (q == sch) {
|
||||
NEXT_SLAVE(prev) = NEXT_SLAVE(q);
|
||||
if (q == master->slaves) {
|
||||
master->slaves = NEXT_SLAVE(q);
|
||||
if (q == master->slaves) {
|
||||
struct netdev_queue *txq;
|
||||
spinlock_t *root_lock;
|
||||
|
||||
txq = netdev_get_tx_queue(master->dev, 0);
|
||||
master->slaves = NULL;
|
||||
|
||||
root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
|
||||
spin_lock_bh(root_lock);
|
||||
qdisc_reset(rtnl_dereference(txq->qdisc));
|
||||
spin_unlock_bh(root_lock);
|
||||
}
|
||||
}
|
||||
skb_queue_purge(&dat->q);
|
||||
break;
|
||||
}
|
||||
|
||||
} while ((prev = q) != master->slaves);
|
||||
}
|
||||
}
|
||||
|
||||
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
|
||||
{
|
||||
struct net_device *dev = qdisc_dev(sch);
|
||||
struct teql_master *m = (struct teql_master *)sch->ops;
|
||||
struct teql_sched_data *q = qdisc_priv(sch);
|
||||
|
||||
if (dev->hard_header_len > m->dev->hard_header_len)
|
||||
return -EINVAL;
|
||||
|
||||
if (m->dev == dev)
|
||||
return -ELOOP;
|
||||
|
||||
q->m = m;
|
||||
|
||||
skb_queue_head_init(&q->q);
|
||||
|
||||
if (m->slaves) {
|
||||
if (m->dev->flags & IFF_UP) {
|
||||
if ((m->dev->flags & IFF_POINTOPOINT &&
|
||||
!(dev->flags & IFF_POINTOPOINT)) ||
|
||||
(m->dev->flags & IFF_BROADCAST &&
|
||||
!(dev->flags & IFF_BROADCAST)) ||
|
||||
(m->dev->flags & IFF_MULTICAST &&
|
||||
!(dev->flags & IFF_MULTICAST)) ||
|
||||
dev->mtu < m->dev->mtu)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
if (!(dev->flags&IFF_POINTOPOINT))
|
||||
m->dev->flags &= ~IFF_POINTOPOINT;
|
||||
if (!(dev->flags&IFF_BROADCAST))
|
||||
m->dev->flags &= ~IFF_BROADCAST;
|
||||
if (!(dev->flags&IFF_MULTICAST))
|
||||
m->dev->flags &= ~IFF_MULTICAST;
|
||||
if (dev->mtu < m->dev->mtu)
|
||||
m->dev->mtu = dev->mtu;
|
||||
}
|
||||
q->next = NEXT_SLAVE(m->slaves);
|
||||
NEXT_SLAVE(m->slaves) = sch;
|
||||
} else {
|
||||
q->next = sch;
|
||||
m->slaves = sch;
|
||||
m->dev->mtu = dev->mtu;
|
||||
m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
	       struct net_device *dev, struct netdev_queue *txq,
	       struct dst_entry *dst)
{
	struct neighbour *n;
	int err = 0;

	n = dst_neigh_lookup_skb(dst, skb);
	if (!n)
		return -ENOENT;

	if (dst->dev != dev) {
		struct neighbour *mn;

		mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
		neigh_release(n);
		if (IS_ERR(mn))
			return PTR_ERR(mn);
		n = mn;
	}

	if (neigh_event_send(n, skb_res) == 0) {
		int err;
		char haddr[MAX_ADDR_LEN];

		neigh_ha_snapshot(haddr, n, dev);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
				      NULL, skb->len);

		if (err < 0)
			err = -EINVAL;
	} else {
		err = (skb_res == NULL) ? -EAGAIN : 1;
	}
	neigh_release(n);
	return err;
}

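/* Wrapper around __teql_resolve(): bail out with -ENODEV when the slave's
 * tx queue has been switched to the noop qdisc (the slave is going down),
 * and skip resolution entirely when the device needs no link-layer header
 * or the skb carries no dst.
 */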
static inline int teql_resolve(struct sk_buff *skb,
			       struct sk_buff *skb_res,
			       struct net_device *dev,
			       struct netdev_queue *txq)
{
	struct dst_entry *dst = skb_dst(skb);
	int res;

	if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
		return -ENODEV;

	if (!dev->header_ops || !dst)
		return 0;

	rcu_read_lock();
	res = __teql_resolve(skb, skb_res, dev, txq, dst);
	rcu_read_unlock();

	return res;
}

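/* Round-robin transmit for the master device: walk the slave list starting
 * at the last slave used, skip slaves that are stopped or not running,
 * resolve the link-layer header for the first usable slave and hand the
 * packet to its driver.  If header resolution failed everywhere, one retry
 * is made allowing the packet to be queued for neighbour resolution; if
 * every usable slave was busy, the master queue is stopped and
 * NETDEV_TX_BUSY returned; otherwise the packet is dropped.
 */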
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc *start, *q;
	int busy;
	int nores;
	int subq = skb_get_queue_mapping(skb);
	struct sk_buff *skb_res = NULL;

	start = master->slaves;

restart:
	nores = 0;
	busy = 0;

	q = start;
	if (!q)
		goto drop;

	do {
		struct net_device *slave = qdisc_dev(q);
		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);

		if (slave_txq->qdisc_sleeping != q)
			continue;
		if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
		    !netif_running(slave)) {
			busy = 1;
			continue;
		}

		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
		case 0:
			if (__netif_tx_trylock(slave_txq)) {
				unsigned int length = qdisc_pkt_len(skb);

				if (!netif_xmit_frozen_or_stopped(slave_txq) &&
				    netdev_start_xmit(skb, slave, slave_txq, false) ==
				    NETDEV_TX_OK) {
					__netif_tx_unlock(slave_txq);
					master->slaves = NEXT_SLAVE(q);
					netif_wake_queue(dev);
					master->tx_packets++;
					master->tx_bytes += length;
					return NETDEV_TX_OK;
				}
				__netif_tx_unlock(slave_txq);
			}
			if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
				busy = 1;
			break;
		case 1:
			master->slaves = NEXT_SLAVE(q);
			return NETDEV_TX_OK;
		default:
			nores = 1;
			break;
		}
		__skb_pull(skb, skb_network_offset(skb));
	} while ((q = NEXT_SLAVE(q)) != start);

	if (nores && skb_res == NULL) {
		skb_res = skb;
		goto restart;
	}

	if (busy) {
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;
	}
	master->tx_errors++;

drop:
	master->tx_dropped++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

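/* The master can only be brought up once it has at least one slave; its
 * MTU becomes the smallest slave MTU, and its link-layer flags are the
 * intersection of the slaves' flags (see the comment in the loop below).
 */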
static int teql_master_open(struct net_device *dev)
{
	struct Qdisc *q;
	struct teql_master *m = netdev_priv(dev);
	int mtu = 0xFFFE;
	unsigned int flags = IFF_NOARP | IFF_MULTICAST;

	if (m->slaves == NULL)
		return -EUNATCH;

	flags = FMASK;

	q = m->slaves;
	do {
		struct net_device *slave = qdisc_dev(q);

		if (slave == NULL)
			return -EUNATCH;

		if (slave->mtu < mtu)
			mtu = slave->mtu;
		if (slave->hard_header_len > LL_MAX_HEADER)
			return -EINVAL;

		/* If all the slaves are BROADCAST, master is BROADCAST
		   If all the slaves are PtP, master is PtP
		   Otherwise, master is NBMA.
		 */
		if (!(slave->flags & IFF_POINTOPOINT))
			flags &= ~IFF_POINTOPOINT;
		if (!(slave->flags & IFF_BROADCAST))
			flags &= ~IFF_BROADCAST;
		if (!(slave->flags & IFF_MULTICAST))
			flags &= ~IFF_MULTICAST;
	} while ((q = NEXT_SLAVE(q)) != m->slaves);

	m->dev->mtu = mtu;
	m->dev->flags = (m->dev->flags & ~FMASK) | flags;
	netif_start_queue(m->dev);
	return 0;
}

static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);
	return 0;
}

static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
						     struct rtnl_link_stats64 *stats)
{
	struct teql_master *m = netdev_priv(dev);

	stats->tx_packets = m->tx_packets;
	stats->tx_bytes   = m->tx_bytes;
	stats->tx_errors  = m->tx_errors;
	stats->tx_dropped = m->tx_dropped;
	return stats;
}

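/* The master's MTU may not drop below 68 and may never exceed the MTU of
 * any attached slave.
 */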
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
	struct teql_master *m = netdev_priv(dev);
	struct Qdisc *q;

	if (new_mtu < 68)
		return -EINVAL;

	q = m->slaves;
	if (q) {
		do {
			if (new_mtu > qdisc_dev(q)->mtu)
				return -EINVAL;
		} while ((q = NEXT_SLAVE(q)) != m->slaves);
	}

	dev->mtu = new_mtu;
	return 0;
}

static const struct net_device_ops teql_netdev_ops = {
	.ndo_open        = teql_master_open,
	.ndo_stop        = teql_master_close,
	.ndo_start_xmit  = teql_master_xmit,
	.ndo_get_stats64 = teql_master_stats64,
	.ndo_change_mtu  = teql_master_mtu,
};

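/* Each master device carries its own Qdisc_ops; registering that qdisc
 * under the master's name is what lets slave devices be attached to this
 * particular equalizer.
 */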
static __init void teql_master_setup(struct net_device *dev)
{
	struct teql_master *master = netdev_priv(dev);
	struct Qdisc_ops *ops = &master->qops;

	master->dev    = dev;
	ops->priv_size = sizeof(struct teql_sched_data);

	ops->enqueue = teql_enqueue;
	ops->dequeue = teql_dequeue;
	ops->peek    = teql_peek;
	ops->init    = teql_qdisc_init;
	ops->reset   = teql_reset;
	ops->destroy = teql_destroy;
	ops->owner   = THIS_MODULE;

	dev->netdev_ops      = &teql_netdev_ops;
	dev->type            = ARPHRD_VOID;
	dev->mtu             = 1500;
	dev->tx_queue_len    = 100;
	dev->flags           = IFF_NOARP;
	dev->hard_header_len = LL_MAX_HEADER;
	netif_keep_dst(dev);
}

static LIST_HEAD(master_dev_list);
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");

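/* Create max_equalizers master devices (teql0, teql1, ...).  For each one,
 * register the netdev and a qdisc whose id matches the device name, so a
 * slave is attached by installing that qdisc (e.g. "teql0") as the slave
 * device's root qdisc.
 */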
static int __init teql_init(void)
{
	int i;
	int err = -ENODEV;

	for (i = 0; i < max_equalizers; i++) {
		struct net_device *dev;
		struct teql_master *master;

		dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
				   NET_NAME_UNKNOWN, teql_master_setup);
		if (!dev) {
			err = -ENOMEM;
			break;
		}

		if ((err = register_netdev(dev))) {
			free_netdev(dev);
			break;
		}

		master = netdev_priv(dev);

		strlcpy(master->qops.id, dev->name, IFNAMSIZ);
		err = register_qdisc(&master->qops);

		if (err) {
			unregister_netdev(dev);
			free_netdev(dev);
			break;
		}

		list_add_tail(&master->master_list, &master_dev_list);
	}
	return i ? 0 : err;
}

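/* Undo teql_init(): unlink every master from the global list, unregister
 * its qdisc ops and net device, and free it.
 */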
static void __exit teql_exit(void)
{
	struct teql_master *master, *nxt;

	list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {

		list_del(&master->master_list);

		unregister_qdisc(&master->qops);
		unregister_netdev(master->dev);
		free_netdev(master->dev);
	}
}

module_init(teql_init);
module_exit(teql_exit);

MODULE_LICENSE("GPL");