Fixed MTP to work with TWRP

awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

700
net/sched/Kconfig Normal file

@@ -0,0 +1,700 @@
#
# Traffic control configuration.
#
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
select NET_SCH_FIFO
---help---
When the kernel has several packets to send out over a network
device, it has to decide which ones to send first, which ones to
delay, and which ones to drop. This is the job of the queueing
disciplines; several different algorithms for how to do this
"fairly" have been proposed.
If you say N here, you will get the standard packet scheduler, which
is a FIFO (first come, first served). If you say Y here, you will be
able to choose from among several alternative algorithms which can
then be attached to different network devices. This is useful for
example if some of your network devices are real time devices that
need a certain minimum data flow rate, or if you need to limit the
maximum data flow rate for traffic which matches specified criteria.
This code is considered to be experimental.
To administer these schedulers, you'll need the user-level utilities
from the package iproute2+tc at <ftp://ftp.tux.org/pub/net/ip-routing/>.
That package also contains some documentation; for more, check out
<http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2>.
This Quality of Service (QoS) support will enable you to use
Differentiated Services (diffserv) and Resource Reservation Protocol
(RSVP) on your Linux router if you also say Y to the corresponding
classifiers below. Documentation and software is at
<http://diffserv.sourceforge.net/>.
If you say Y here and to "/proc file system" below, you will be able
to read status information about packet schedulers from the file
/proc/net/psched.
The available schedulers are listed in the following questions; you
can say Y to as many as you like. If unsure, say N now.
if NET_SCHED
comment "Queueing/Scheduling"
config NET_SCH_CBQ
tristate "Class Based Queueing (CBQ)"
---help---
Say Y here if you want to use the Class-Based Queueing (CBQ) packet
scheduling algorithm. This algorithm classifies the waiting packets
into a tree-like hierarchy of classes; the leaves of this tree are
in turn scheduled by separate algorithms.
See the top of <file:net/sched/sch_cbq.c> for more details.
CBQ is a commonly used scheduler, so if you're unsure, you should
say Y here. Then say Y to all the queueing algorithms below that you
want to use as leaf disciplines.
To compile this code as a module, choose M here: the
module will be called sch_cbq.
config NET_SCH_HTB
tristate "Hierarchical Token Bucket (HTB)"
---help---
Say Y here if you want to use the Hierarchical Token Buckets (HTB)
packet scheduling algorithm. See
<http://luxik.cdi.cz/~devik/qos/htb/> for complete manual and
in-depth articles.
HTB is very similar to CBQ regarding its goals, but it has
different properties and a different algorithm.
To compile this code as a module, choose M here: the
module will be called sch_htb.
config NET_SCH_HFSC
tristate "Hierarchical Fair Service Curve (HFSC)"
---help---
Say Y here if you want to use the Hierarchical Fair Service Curve
(HFSC) packet scheduling algorithm.
To compile this code as a module, choose M here: the
module will be called sch_hfsc.
config NET_SCH_ATM
tristate "ATM Virtual Circuits (ATM)"
depends on ATM
---help---
Say Y here if you want to use the ATM pseudo-scheduler. This
provides a framework for invoking classifiers, which in turn
select classes of this queuing discipline. Each class maps
the flow(s) it is handling to a given virtual circuit.
See the top of <file:net/sched/sch_atm.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_atm.
config NET_SCH_PRIO
tristate "Multi Band Priority Queueing (PRIO)"
---help---
Say Y here if you want to use an n-band priority queue packet
scheduler.
To compile this code as a module, choose M here: the
module will be called sch_prio.
config NET_SCH_MULTIQ
tristate "Hardware Multiqueue-aware Multi Band Queuing (MULTIQ)"
---help---
Say Y here if you want to use an n-band queue packet scheduler
to support devices that have multiple hardware transmit queues.
To compile this code as a module, choose M here: the
module will be called sch_multiq.
config NET_SCH_RED
tristate "Random Early Detection (RED)"
---help---
Say Y here if you want to use the Random Early Detection (RED)
packet scheduling algorithm.
See the top of <file:net/sched/sch_red.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_red.
config NET_SCH_SFB
tristate "Stochastic Fair Blue (SFB)"
---help---
Say Y here if you want to use the Stochastic Fair Blue (SFB)
packet scheduling algorithm.
See the top of <file:net/sched/sch_sfb.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_sfb.
config NET_SCH_SFQ
tristate "Stochastic Fairness Queueing (SFQ)"
---help---
Say Y here if you want to use the Stochastic Fairness Queueing (SFQ)
packet scheduling algorithm.
See the top of <file:net/sched/sch_sfq.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_sfq.
config NET_SCH_TEQL
tristate "True Link Equalizer (TEQL)"
---help---
Say Y here if you want to use the True Link Equalizer (TEQL) packet
scheduling algorithm. This queueing discipline allows the combination
of several physical devices into one virtual device.
See the top of <file:net/sched/sch_teql.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_teql.
config NET_SCH_TBF
tristate "Token Bucket Filter (TBF)"
---help---
Say Y here if you want to use the Token Bucket Filter (TBF) packet
scheduling algorithm.
See the top of <file:net/sched/sch_tbf.c> for more details.
To compile this code as a module, choose M here: the
module will be called sch_tbf.
config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
Say Y here if you want to use the Generic Random Early Detection
(GRED) packet scheduling algorithm for some of your network devices
(see the top of <file:net/sched/sch_red.c> for details and
references about the algorithm).
To compile this code as a module, choose M here: the
module will be called sch_gred.
config NET_SCH_DSMARK
tristate "Differentiated Services marker (DSMARK)"
---help---
Say Y if you want to schedule packets according to the
Differentiated Services architecture proposed in RFC 2475.
Technical information on this method, with pointers to associated
RFCs, is available at <http://www.gta.ufrj.br/diffserv/>.
To compile this code as a module, choose M here: the
module will be called sch_dsmark.
config NET_SCH_NETEM
tristate "Network emulator (NETEM)"
---help---
Say Y if you want to emulate network delay, loss, and packet
re-ordering. This is often useful to simulate networks when
testing applications or protocols.
To compile this driver as a module, choose M here: the module
will be called sch_netem.
If unsure, say N.
config NET_SCH_DRR
tristate "Deficit Round Robin scheduler (DRR)"
help
Say Y here if you want to use the Deficit Round Robin (DRR) packet
scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_drr.
If unsure, say N.
config NET_SCH_MQPRIO
tristate "Multi-queue priority scheduler (MQPRIO)"
help
Say Y here if you want to use the Multi-queue Priority scheduler.
This scheduler allows QoS to be offloaded on NICs that support
offloading QoS schedulers.
To compile this driver as a module, choose M here: the module will
be called sch_mqprio.
If unsure, say N.
config NET_SCH_CHOKE
tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
help
Say Y here if you want to use the CHOKe packet scheduler (CHOose
and Keep for responsive flows, CHOose and Kill for unresponsive
flows). This is a variation of RED which tries to penalize flows
that monopolize the queue.
To compile this code as a module, choose M here: the
module will be called sch_choke.
config NET_SCH_QFQ
tristate "Quick Fair Queueing scheduler (QFQ)"
help
Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
packet scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_qfq.
If unsure, say N.
config NET_SCH_CODEL
tristate "Controlled Delay AQM (CODEL)"
help
Say Y here if you want to use the Controlled Delay (CODEL)
packet scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_codel.
If unsure, say N.
config NET_SCH_FQ_CODEL
tristate "Fair Queue Controlled Delay AQM (FQ_CODEL)"
help
Say Y here if you want to use the FQ Controlled Delay (FQ_CODEL)
packet scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_fq_codel.
If unsure, say N.
config NET_SCH_FQ
tristate "Fair Queue"
help
Say Y here if you want to use the FQ packet scheduling algorithm.
FQ does flow separation, and is able to respect pacing requirements
set by the TCP stack in sk->sk_pacing_rate (for locally generated
traffic).
To compile this driver as a module, choose M here: the module
will be called sch_fq.
If unsure, say N.
config NET_SCH_HHF
tristate "Heavy-Hitter Filter (HHF)"
help
Say Y here if you want to use the Heavy-Hitter Filter (HHF)
packet scheduling algorithm.
To compile this driver as a module, choose M here: the module
will be called sch_hhf.
config NET_SCH_PIE
tristate "Proportional Integral controller Enhanced (PIE) scheduler"
help
Say Y here if you want to use the Proportional Integral controller
Enhanced scheduler packet scheduling algorithm.
For more information, please see
http://tools.ietf.org/html/draft-pan-tsvwg-pie-00
To compile this driver as a module, choose M here: the module
will be called sch_pie.
If unsure, say N.
config NET_SCH_INGRESS
tristate "Ingress Qdisc"
depends on NET_CLS_ACT
---help---
Say Y here if you want to use classifiers for incoming packets.
If unsure, say Y.
To compile this code as a module, choose M here: the
module will be called sch_ingress.
config NET_SCH_PLUG
tristate "Plug network traffic until release (PLUG)"
---help---
This queuing discipline allows userspace to plug/unplug a network
output queue, using the netlink interface. When it receives an
enqueue command it inserts a plug into the outbound queue that
causes following packets to enqueue until a dequeue command arrives
over netlink, causing the plug to be removed and resuming the normal
packet flow.
This module also provides a generic "network output buffering"
functionality (aka output commit), wherein upon arrival of a dequeue
command, only packets up to the first plug are released for delivery.
The Remus HA project uses this module to enable speculative execution
of virtual machines by allowing the generated network output to be rolled
back if needed.
For more information, please refer to http://wiki.xensource.com/xenwiki/Remus
Say Y here if you are using this kernel for Xen dom0 and
want to protect Xen guests with Remus.
To compile this code as a module, choose M here: the
module will be called sch_plug.
comment "Classification"
config NET_CLS
boolean
config NET_CLS_BASIC
tristate "Elementary classification (BASIC)"
select NET_CLS
---help---
Say Y here if you want to be able to classify packets using
only extended matches and actions.
To compile this code as a module, choose M here: the
module will be called cls_basic.
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
select NET_CLS
---help---
Say Y here if you want to be able to classify packets based on
traffic control indices. You will want this feature if you want
to implement Differentiated Services together with DSMARK.
To compile this code as a module, choose M here: the
module will be called cls_tcindex.
config NET_CLS_ROUTE4
tristate "Routing decision (ROUTE)"
depends on INET
select IP_ROUTE_CLASSID
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
according to the route table entry they matched.
To compile this code as a module, choose M here: the
module will be called cls_route.
config NET_CLS_FW
tristate "Netfilter mark (FW)"
select NET_CLS
---help---
If you say Y here, you will be able to classify packets
according to netfilter/firewall marks.
To compile this code as a module, choose M here: the
module will be called cls_fw.
config NET_CLS_U32
tristate "Universal 32bit comparisons w/ hashing (U32)"
select NET_CLS
---help---
Say Y here to be able to classify packets using a universal
32-bit piece-based comparison scheme.
To compile this code as a module, choose M here: the
module will be called cls_u32.
config CLS_U32_PERF
bool "Performance counters support"
depends on NET_CLS_U32
---help---
Say Y here to make u32 gather additional statistics useful for
fine tuning u32 classifiers.
config CLS_U32_MARK
bool "Netfilter marks support"
depends on NET_CLS_U32
---help---
Say Y here to be able to use netfilter marks as a u32 key.
config NET_CLS_RSVP
tristate "IPv4 Resource Reservation Protocol (RSVP)"
select NET_CLS
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
Say Y here if you want to be able to classify outgoing packets based
on their RSVP requests.
To compile this code as a module, choose M here: the
module will be called cls_rsvp.
config NET_CLS_RSVP6
tristate "IPv6 Resource Reservation Protocol (RSVP6)"
select NET_CLS
---help---
The Resource Reservation Protocol (RSVP) permits end systems to
request a minimum and maximum data flow rate for a connection; this
is important for real time data such as streaming sound or video.
Say Y here if you want to be able to classify outgoing packets based
on their RSVP requests and you are using the IPv6 protocol.
To compile this code as a module, choose M here: the
module will be called cls_rsvp6.
config NET_CLS_FLOW
tristate "Flow classifier"
select NET_CLS
---help---
If you say Y here, you will be able to classify packets based on
a configurable combination of packet keys. This is mostly useful
in combination with SFQ.
To compile this code as a module, choose M here: the
module will be called cls_flow.
config NET_CLS_CGROUP
tristate "Control Group Classifier"
select NET_CLS
select CGROUP_NET_CLASSID
depends on CGROUPS
---help---
Say Y here if you want to classify packets based on the control
cgroup of their process.
To compile this code as a module, choose M here: the
module will be called cls_cgroup.
config NET_CLS_BPF
tristate "BPF-based classifier"
select NET_CLS
---help---
If you say Y here, you will be able to classify packets based on
programmable BPF (JIT'ed) filters as an alternative to ematches.
To compile this code as a module, choose M here: the module will
be called cls_bpf.
config NET_EMATCH
bool "Extended Matches"
select NET_CLS
---help---
Say Y here if you want to use extended matches on top of classifiers
and select the extended matches below.
Extended matches are small classification helpers not worth writing
a separate classifier for.
A recent version of the iproute2 package is required to use
extended matches.
config NET_EMATCH_STACK
int "Stack size"
depends on NET_EMATCH
default "32"
---help---
Size of the local stack variable used while evaluating the tree of
ematches. Limits the depth of the tree, i.e. the number of
encapsulated precedences. Every level requires 4 bytes of additional
stack space.
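With the default of 32, for example, this amounts to 128 bytes of
stack while the tree is evaluated.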
config NET_EMATCH_CMP
tristate "Simple packet data comparison"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
simple packet data comparisons for 8, 16, and 32bit values.
To compile this code as a module, choose M here: the
module will be called em_cmp.
config NET_EMATCH_NBYTE
tristate "Multi byte comparison"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
multiple byte comparisons, mainly useful for IPv6 address comparisons.
To compile this code as a module, choose M here: the
module will be called em_nbyte.
config NET_EMATCH_U32
tristate "U32 key"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets using
the famous u32 key in combination with logic relations.
To compile this code as a module, choose M here: the
module will be called em_u32.
config NET_EMATCH_META
tristate "Metadata"
depends on NET_EMATCH
---help---
Say Y here if you want to be able to classify packets based on
metadata such as load average, netfilter attributes, socket
attributes and routing decisions.
To compile this code as a module, choose M here: the
module will be called em_meta.
config NET_EMATCH_TEXT
tristate "Textsearch"
depends on NET_EMATCH
select TEXTSEARCH
select TEXTSEARCH_KMP
select TEXTSEARCH_BM
select TEXTSEARCH_FSM
---help---
Say Y here if you want to be able to classify packets based on
textsearch comparisons.
To compile this code as a module, choose M here: the
module will be called em_text.
config NET_EMATCH_CANID
tristate "CAN Identifier"
depends on NET_EMATCH && (CAN=y || CAN=m)
---help---
Say Y here if you want to be able to classify CAN frames based
on CAN Identifier.
To compile this code as a module, choose M here: the
module will be called em_canid.
config NET_EMATCH_IPSET
tristate "IPset"
depends on NET_EMATCH && IP_SET
---help---
Say Y here if you want to be able to classify packets based on
ipset membership.
To compile this code as a module, choose M here: the
module will be called em_ipset.
config NET_CLS_ACT
bool "Actions"
---help---
Say Y here if you want to use traffic control actions. Actions
get attached to classifiers and are invoked after a successful
classification. They are used to overwrite the classification
result, instantly drop or redirect packets, etc.
A recent version of the iproute2 package is required to use
actions.
config NET_ACT_POLICE
tristate "Traffic Policing"
depends on NET_CLS_ACT
---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
module.
To compile this code as a module, choose M here: the
module will be called act_police.
config NET_ACT_GACT
tristate "Generic actions"
depends on NET_CLS_ACT
---help---
Say Y here to take generic actions such as dropping and
accepting packets.
To compile this code as a module, choose M here: the
module will be called act_gact.
config GACT_PROB
bool "Probability support"
depends on NET_ACT_GACT
---help---
Say Y here to use the generic action randomly or deterministically.
config NET_ACT_MIRRED
tristate "Redirecting and Mirroring"
depends on NET_CLS_ACT
---help---
Say Y here to allow packets to be mirrored or redirected to
other devices.
To compile this code as a module, choose M here: the
module will be called act_mirred.
config NET_ACT_IPT
tristate "IPtables targets"
depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES
---help---
Say Y here to be able to invoke iptables targets after successful
classification.
To compile this code as a module, choose M here: the
module will be called act_ipt.
config NET_ACT_NAT
tristate "Stateless NAT"
depends on NET_CLS_ACT
---help---
Say Y here to do stateless NAT on IPv4 packets. You should use
netfilter for NAT unless you know what you are doing.
To compile this code as a module, choose M here: the
module will be called act_nat.
config NET_ACT_PEDIT
tristate "Packet Editing"
depends on NET_CLS_ACT
---help---
Say Y here if you want to mangle the content of packets.
To compile this code as a module, choose M here: the
module will be called act_pedit.
config NET_ACT_SIMP
tristate "Simple Example (Debug)"
depends on NET_CLS_ACT
---help---
Say Y here to add a simple action for demonstration purposes.
It is meant as an example and for debugging purposes. It will
print a configured policy string followed by the packet count
to the console for every packet that passes by.
If unsure, say N.
To compile this code as a module, choose M here: the
module will be called act_simple.
config NET_ACT_SKBEDIT
tristate "SKB Editing"
depends on NET_CLS_ACT
---help---
Say Y here to change skb priority or queue_mapping settings.
If unsure, say N.
To compile this code as a module, choose M here: the
module will be called act_skbedit.
config NET_ACT_CSUM
tristate "Checksum Updating"
depends on NET_CLS_ACT && INET
---help---
Say Y here to update common checksums after direct
packet alterations.
To compile this code as a module, choose M here: the
module will be called act_csum.
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
---help---
Say Y here to extend the u32 and fw classifier to support
classification based on the incoming device. This option is
likely to disappear in favour of the metadata ematch.
endif # NET_SCHED
config NET_SCH_FIFO
bool
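
Annotation (not part of this commit): the NET_SCHED help above points at
/proc/net/psched. A minimal userspace sketch of reading it is shown below;
it assumes a kernel built with NET_SCHED, and the meaning of the four hex
fields varies across kernel versions.

/* Illustrative sketch only: dump the four 32-bit hex fields that the
* packet scheduler exports in /proc/net/psched.
*/
#include <stdio.h>

int main(void)
{
	unsigned int v[4];
	FILE *f = fopen("/proc/net/psched", "r");

	if (!f) {
		perror("/proc/net/psched");
		return 1;
	}
	if (fscanf(f, "%x %x %x %x", &v[0], &v[1], &v[2], &v[3]) != 4) {
		fclose(f);
		return 1;
	}
	printf("psched: %08x %08x %08x %08x\n", v[0], v[1], v[2], v[3]);
	fclose(f);
	return 0;
}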

63
net/sched/Makefile Normal file

@@ -0,0 +1,63 @@
#
# Makefile for the Linux Traffic Control Unit.
#
obj-y := sch_generic.o sch_mq.o
obj-$(CONFIG_NET_SCHED) += sch_api.o sch_blackhole.o
obj-$(CONFIG_NET_CLS) += cls_api.o
obj-$(CONFIG_NET_CLS_ACT) += act_api.o
obj-$(CONFIG_NET_ACT_POLICE) += act_police.o
obj-$(CONFIG_NET_ACT_GACT) += act_gact.o
obj-$(CONFIG_NET_ACT_MIRRED) += act_mirred.o
obj-$(CONFIG_NET_ACT_IPT) += act_ipt.o
obj-$(CONFIG_NET_ACT_NAT) += act_nat.o
obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o
obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o
obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o
obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o
obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o
obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o
obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
obj-$(CONFIG_NET_SCH_FQ_CODEL) += sch_fq_codel.o
obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o
obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o
obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o
obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
obj-$(CONFIG_NET_CLS_BPF) += cls_bpf.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o
obj-$(CONFIG_NET_EMATCH_CANID) += em_canid.o
obj-$(CONFIG_NET_EMATCH_IPSET) += em_ipset.o

1094
net/sched/act_api.c Normal file

File diff suppressed because it is too large

584
net/sched/act_csum.c Normal file

@@ -0,0 +1,584 @@
/*
* Checksum updating actions
*
* Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#include <linux/types.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
#include <linux/igmp.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include <net/act_api.h>
#include <linux/tc_act/tc_csum.h>
#include <net/tc_act/tc_csum.h>
#define CSUM_TAB_MASK 15
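/* Annotation (not in the original commit): hash-table mask for the shared
* action index table; a mask of 15 means 16 buckets when the ops are
* registered with tcf_register_action() at the bottom of this file.
*/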
static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_CSUM_MAX + 1];
struct tc_csum *parm;
struct tcf_csum *p;
int ret = 0, err;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
if (err < 0)
return err;
if (tb[TCA_CSUM_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind) /* don't override defaults */
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
}
p = to_tcf_csum(a);
spin_lock_bh(&p->tcf_lock);
p->tcf_action = parm->action;
p->update_flags = parm->update_flags;
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
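/* Annotation (not in the original commit): tcf_csum_init() above follows
* the standard tc action lifecycle - tcf_hash_check() looks up an existing
* action by index, tcf_hash_create() allocates a new one, parameters are
* copied under tcf_lock, and tcf_hash_insert() finally publishes a freshly
* created action. The same pattern repeats in the other act_*.c files in
* this commit.
*/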
/**
* tcf_csum_skb_nextlayer - Get next layer pointer
* @skb: sk_buff to use
* @ihl: previous summed headers length
* @ipl: complete packet length
* @jhl: next header length
*
* Check the expected next layer availability in the specified sk_buff.
* Return the next layer pointer if pass, NULL otherwise.
*/
static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl,
unsigned int jhl)
{
int ntkoff = skb_network_offset(skb);
int hl = ihl + jhl;
if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
(skb_cloned(skb) &&
!skb_clone_writable(skb, hl + ntkoff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
return NULL;
else
return (void *)(skb_network_header(skb) + ihl);
}
static int tcf_csum_ipv4_icmp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct icmphdr *icmph;
icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph));
if (icmph == NULL)
return 0;
icmph->checksum = 0;
skb->csum = csum_partial(icmph, ipl - ihl, 0);
icmph->checksum = csum_fold(skb->csum);
skb->ip_summed = CHECKSUM_NONE;
return 1;
}
static int tcf_csum_ipv4_igmp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct igmphdr *igmph;
igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph));
if (igmph == NULL)
return 0;
igmph->csum = 0;
skb->csum = csum_partial(igmph, ipl - ihl, 0);
igmph->csum = csum_fold(skb->csum);
skb->ip_summed = CHECKSUM_NONE;
return 1;
}
static int tcf_csum_ipv6_icmp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct icmp6hdr *icmp6h;
const struct ipv6hdr *ip6h;
icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h));
if (icmp6h == NULL)
return 0;
ip6h = ipv6_hdr(skb);
icmp6h->icmp6_cksum = 0;
skb->csum = csum_partial(icmp6h, ipl - ihl, 0);
icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
ipl - ihl, IPPROTO_ICMPV6,
skb->csum);
skb->ip_summed = CHECKSUM_NONE;
return 1;
}
static int tcf_csum_ipv4_tcp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct tcphdr *tcph;
const struct iphdr *iph;
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
if (tcph == NULL)
return 0;
iph = ip_hdr(skb);
tcph->check = 0;
skb->csum = csum_partial(tcph, ipl - ihl, 0);
tcph->check = tcp_v4_check(ipl - ihl,
iph->saddr, iph->daddr, skb->csum);
skb->ip_summed = CHECKSUM_NONE;
return 1;
}
static int tcf_csum_ipv6_tcp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl)
{
struct tcphdr *tcph;
const struct ipv6hdr *ip6h;
tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph));
if (tcph == NULL)
return 0;
ip6h = ipv6_hdr(skb);
tcph->check = 0;
skb->csum = csum_partial(tcph, ipl - ihl, 0);
tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
ipl - ihl, IPPROTO_TCP,
skb->csum);
skb->ip_summed = CHECKSUM_NONE;
return 1;
}
static int tcf_csum_ipv4_udp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct iphdr *iph;
u16 ul;
/*
* Support both UDP and UDPLITE checksum algorithms. Don't use
* udph->len to get the real length without any protocol check;
* UDPLITE uses udph->len for another purpose.
* Use iph->tot_len, or just ipl.
*/
udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
if (udph == NULL)
return 0;
iph = ip_hdr(skb);
ul = ntohs(udph->len);
if (udplite || udph->check) {
udph->check = 0;
if (udplite) {
if (ul == 0)
skb->csum = csum_partial(udph, ipl - ihl, 0);
else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
skb->csum = csum_partial(udph, ul, 0);
else
goto ignore_obscure_skb;
} else {
if (ul != ipl - ihl)
goto ignore_obscure_skb;
skb->csum = csum_partial(udph, ul, 0);
}
udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
ul, iph->protocol,
skb->csum);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
skb->ip_summed = CHECKSUM_NONE;
ignore_obscure_skb:
return 1;
}
static int tcf_csum_ipv6_udp(struct sk_buff *skb,
unsigned int ihl, unsigned int ipl, int udplite)
{
struct udphdr *udph;
const struct ipv6hdr *ip6h;
u16 ul;
/*
* Support both UDP and UDPLITE checksum algorithms. Don't use
* udph->len to get the real length without any protocol check;
* UDPLITE uses udph->len for another purpose.
* Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl.
*/
udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph));
if (udph == NULL)
return 0;
ip6h = ipv6_hdr(skb);
ul = ntohs(udph->len);
udph->check = 0;
if (udplite) {
if (ul == 0)
skb->csum = csum_partial(udph, ipl - ihl, 0);
else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl))
skb->csum = csum_partial(udph, ul, 0);
else
goto ignore_obscure_skb;
} else {
if (ul != ipl - ihl)
goto ignore_obscure_skb;
skb->csum = csum_partial(udph, ul, 0);
}
udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul,
udplite ? IPPROTO_UDPLITE : IPPROTO_UDP,
skb->csum);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
skb->ip_summed = CHECKSUM_NONE;
ignore_obscure_skb:
return 1;
}
static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
{
const struct iphdr *iph;
int ntkoff;
ntkoff = skb_network_offset(skb);
if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff))
goto fail;
iph = ip_hdr(skb);
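/* Annotation (not in the original commit): for non-first IP fragments
* there is no transport header to fix up, so the switch below evaluates
* the protocol as 0 and skips every case; only the IPv4 header checksum
* update further down still applies.
*/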
switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
case IPPROTO_ICMP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4,
ntohs(iph->tot_len)))
goto fail;
break;
case IPPROTO_IGMP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP)
if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4,
ntohs(iph->tot_len)))
goto fail;
break;
case IPPROTO_TCP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4,
ntohs(iph->tot_len)))
goto fail;
break;
case IPPROTO_UDP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
ntohs(iph->tot_len), 0))
goto fail;
break;
case IPPROTO_UDPLITE:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4,
ntohs(iph->tot_len), 1))
goto fail;
break;
}
if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
if (skb_cloned(skb) &&
!skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
goto fail;
ip_send_check(ip_hdr(skb));
}
return 1;
fail:
return 0;
}
static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh,
unsigned int ixhl, unsigned int *pl)
{
int off, len, optlen;
unsigned char *xh = (void *)ip6xh;
off = sizeof(*ip6xh);
len = ixhl - off;
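/* Annotation (not in the original commit): walk the hop-by-hop option
* TLVs looking for a jumbo payload option; when present it carries the
* real payload length, which replaces *pl for the checksum calculations
* that follow.
*/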
while (len > 1) {
switch (xh[off]) {
case IPV6_TLV_PAD1:
optlen = 1;
break;
case IPV6_TLV_JUMBO:
optlen = xh[off + 1] + 2;
if (optlen != 6 || len < 6 || (off & 3) != 2)
/* wrong jumbo option length/alignment */
return 0;
*pl = ntohl(*(__be32 *)(xh + off + 2));
goto done;
default:
optlen = xh[off + 1] + 2;
if (optlen > len)
/* ignore obscure options */
goto done;
break;
}
off += optlen;
len -= optlen;
}
done:
return 1;
}
static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags)
{
struct ipv6hdr *ip6h;
struct ipv6_opt_hdr *ip6xh;
unsigned int hl, ixhl;
unsigned int pl;
int ntkoff;
u8 nexthdr;
ntkoff = skb_network_offset(skb);
hl = sizeof(*ip6h);
if (!pskb_may_pull(skb, hl + ntkoff))
goto fail;
ip6h = ipv6_hdr(skb);
pl = ntohs(ip6h->payload_len);
nexthdr = ip6h->nexthdr;
do {
switch (nexthdr) {
case NEXTHDR_FRAGMENT:
goto ignore_skb;
case NEXTHDR_ROUTING:
case NEXTHDR_HOP:
case NEXTHDR_DEST:
if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff))
goto fail;
ip6xh = (void *)(skb_network_header(skb) + hl);
ixhl = ipv6_optlen(ip6xh);
if (!pskb_may_pull(skb, hl + ixhl + ntkoff))
goto fail;
ip6xh = (void *)(skb_network_header(skb) + hl);
if ((nexthdr == NEXTHDR_HOP) &&
!(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl)))
goto fail;
nexthdr = ip6xh->nexthdr;
hl += ixhl;
break;
case IPPROTO_ICMPV6:
if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP)
if (!tcf_csum_ipv6_icmp(skb,
hl, pl + sizeof(*ip6h)))
goto fail;
goto done;
case IPPROTO_TCP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP)
if (!tcf_csum_ipv6_tcp(skb,
hl, pl + sizeof(*ip6h)))
goto fail;
goto done;
case IPPROTO_UDP:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP)
if (!tcf_csum_ipv6_udp(skb, hl,
pl + sizeof(*ip6h), 0))
goto fail;
goto done;
case IPPROTO_UDPLITE:
if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE)
if (!tcf_csum_ipv6_udp(skb, hl,
pl + sizeof(*ip6h), 1))
goto fail;
goto done;
default:
goto ignore_skb;
}
} while (pskb_may_pull(skb, hl + 1 + ntkoff));
done:
ignore_skb:
return 1;
fail:
return 0;
}
static int tcf_csum(struct sk_buff *skb,
const struct tc_action *a, struct tcf_result *res)
{
struct tcf_csum *p = a->priv;
int action;
u32 update_flags;
spin_lock(&p->tcf_lock);
p->tcf_tm.lastuse = jiffies;
bstats_update(&p->tcf_bstats, skb);
action = p->tcf_action;
update_flags = p->update_flags;
spin_unlock(&p->tcf_lock);
if (unlikely(action == TC_ACT_SHOT))
goto drop;
switch (skb->protocol) {
case cpu_to_be16(ETH_P_IP):
if (!tcf_csum_ipv4(skb, update_flags))
goto drop;
break;
case cpu_to_be16(ETH_P_IPV6):
if (!tcf_csum_ipv6(skb, update_flags))
goto drop;
break;
}
return action;
drop:
spin_lock(&p->tcf_lock);
p->tcf_qstats.drops++;
spin_unlock(&p->tcf_lock);
return TC_ACT_SHOT;
}
static int tcf_csum_dump(struct sk_buff *skb,
struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_csum *p = a->priv;
struct tc_csum opt = {
.update_flags = p->update_flags,
.index = p->tcf_index,
.action = p->tcf_action,
.refcnt = p->tcf_refcnt - ref,
.bindcnt = p->tcf_bindcnt - bind,
};
struct tcf_t t;
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
.owner = THIS_MODULE,
.act = tcf_csum,
.dump = tcf_csum_dump,
.init = tcf_csum_init,
};
MODULE_DESCRIPTION("Checksum updating actions");
MODULE_LICENSE("GPL");
static int __init csum_init_module(void)
{
return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
}
static void __exit csum_cleanup_module(void)
{
tcf_unregister_action(&act_csum_ops);
}
module_init(csum_init_module);
module_exit(csum_cleanup_module);

209
net/sched/act_gact.c Normal file

@@ -0,0 +1,209 @@
/*
* net/sched/gact.c Generic actions
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* copyright Jamal Hadi Salim (2002-4)
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_gact.h>
#include <net/tc_act/tc_gact.h>
#define GACT_TAB_MASK 15
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
static int gact_determ(struct tcf_gact *gact)
{
if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval)
return gact->tcf_action;
return gact->tcfg_paction;
}
typedef int (*g_rand)(struct tcf_gact *gact);
static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
#endif /* CONFIG_GACT_PROB */
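/* Annotation (not in the original commit): gact_rand[] is indexed by
* tcfg_ptype, which tcf_gact_init() validates against MAX_RAND. Slot 0 is
* NULL because tcf_gact() only dispatches through the table when
* tcfg_ptype is non-zero; slot 1 picks the paction randomly, slot 2
* deterministically every tcfg_pval-th packet.
*/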
static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
[TCA_GACT_PARMS] = { .len = sizeof(struct tc_gact) },
[TCA_GACT_PROB] = { .len = sizeof(struct tc_gact_p) },
};
static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
struct nlattr *tb[TCA_GACT_MAX + 1];
struct tc_gact *parm;
struct tcf_gact *gact;
int ret = 0;
int err;
#ifdef CONFIG_GACT_PROB
struct tc_gact_p *p_parm = NULL;
#endif
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy);
if (err < 0)
return err;
if (tb[TCA_GACT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_GACT_PARMS]);
#ifndef CONFIG_GACT_PROB
if (tb[TCA_GACT_PROB] != NULL)
return -EOPNOTSUPP;
#else
if (tb[TCA_GACT_PROB]) {
p_parm = nla_data(tb[TCA_GACT_PROB]);
if (p_parm->ptype >= MAX_RAND)
return -EINVAL;
}
#endif
if (!tcf_hash_check(parm->index, a, bind)) {
ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind) /* don't override defaults */
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
}
gact = to_gact(a);
spin_lock_bh(&gact->tcf_lock);
gact->tcf_action = parm->action;
#ifdef CONFIG_GACT_PROB
if (p_parm) {
gact->tcfg_paction = p_parm->paction;
gact->tcfg_pval = p_parm->pval;
gact->tcfg_ptype = p_parm->ptype;
}
#endif
spin_unlock_bh(&gact->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_gact *gact = a->priv;
int action = TC_ACT_SHOT;
spin_lock(&gact->tcf_lock);
#ifdef CONFIG_GACT_PROB
if (gact->tcfg_ptype)
action = gact_rand[gact->tcfg_ptype](gact);
else
action = gact->tcf_action;
#else
action = gact->tcf_action;
#endif
gact->tcf_bstats.bytes += qdisc_pkt_len(skb);
gact->tcf_bstats.packets++;
if (action == TC_ACT_SHOT)
gact->tcf_qstats.drops++;
gact->tcf_tm.lastuse = jiffies;
spin_unlock(&gact->tcf_lock);
return action;
}
static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_gact *gact = a->priv;
struct tc_gact opt = {
.index = gact->tcf_index,
.refcnt = gact->tcf_refcnt - ref,
.bindcnt = gact->tcf_bindcnt - bind,
.action = gact->tcf_action,
};
struct tcf_t t;
if (nla_put(skb, TCA_GACT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
#ifdef CONFIG_GACT_PROB
if (gact->tcfg_ptype) {
struct tc_gact_p p_opt = {
.paction = gact->tcfg_paction,
.pval = gact->tcfg_pval,
.ptype = gact->tcfg_ptype,
};
if (nla_put(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt))
goto nla_put_failure;
}
#endif
t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(gact->tcf_tm.expires);
if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
.type = TCA_ACT_GACT,
.owner = THIS_MODULE,
.act = tcf_gact,
.dump = tcf_gact_dump,
.init = tcf_gact_init,
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
MODULE_DESCRIPTION("Generic Classifier actions");
MODULE_LICENSE("GPL");
static int __init gact_init_module(void)
{
#ifdef CONFIG_GACT_PROB
pr_info("GACT probability on\n");
#else
pr_info("GACT probability NOT on\n");
#endif
return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
}
static void __exit gact_cleanup_module(void)
{
tcf_unregister_action(&act_gact_ops);
}
module_init(gact_init_module);
module_exit(gact_cleanup_module);

311
net/sched/act_ipt.c Normal file

@@ -0,0 +1,311 @@
/*
* net/sched/ipt.c iptables target interface
*
* TODO: Add other tables. For now we only support the ipv4 table targets.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Copyright: Jamal Hadi Salim (2002-13)
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_ipt.h>
#include <net/tc_act/tc_ipt.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#define IPT_TAB_MASK 15
static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
{
struct xt_tgchk_param par;
struct xt_target *target;
int ret = 0;
target = xt_request_find_target(AF_INET, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target))
return PTR_ERR(target);
t->u.kernel.target = target;
par.table = table;
par.entryinfo = NULL;
par.target = target;
par.targinfo = t->data;
par.hook_mask = hook;
par.family = NFPROTO_IPV4;
ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
if (ret < 0) {
module_put(t->u.kernel.target->me);
return ret;
}
return 0;
}
static void ipt_destroy_target(struct xt_entry_target *t)
{
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
}
static void tcf_ipt_release(struct tc_action *a, int bind)
{
struct tcf_ipt *ipt = to_ipt(a);
ipt_destroy_target(ipt->tcfi_t);
kfree(ipt->tcfi_tname);
kfree(ipt->tcfi_t);
}
static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
[TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_IPT_HOOK] = { .type = NLA_U32 },
[TCA_IPT_INDEX] = { .type = NLA_U32 },
[TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
};
static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_IPT_MAX + 1];
struct tcf_ipt *ipt;
struct xt_entry_target *td, *t;
char *tname;
int ret = 0, err;
u32 hook = 0;
u32 index = 0;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy);
if (err < 0)
return err;
if (tb[TCA_IPT_HOOK] == NULL)
return -EINVAL;
if (tb[TCA_IPT_TARG] == NULL)
return -EINVAL;
td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
return -EINVAL;
if (tb[TCA_IPT_INDEX] != NULL)
index = nla_get_u32(tb[TCA_IPT_INDEX]);
if (!tcf_hash_check(index, a, bind)) {
ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind) /* don't override defaults */
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
}
ipt = to_ipt(a);
hook = nla_get_u32(tb[TCA_IPT_HOOK]);
err = -ENOMEM;
tname = kmalloc(IFNAMSIZ, GFP_KERNEL);
if (unlikely(!tname))
goto err1;
if (tb[TCA_IPT_TABLE] == NULL ||
nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ)
strcpy(tname, "mangle");
t = kmemdup(td, td->u.target_size, GFP_KERNEL);
if (unlikely(!t))
goto err2;
err = ipt_init_target(t, tname, hook);
if (err < 0)
goto err3;
spin_lock_bh(&ipt->tcf_lock);
if (ret != ACT_P_CREATED) {
ipt_destroy_target(ipt->tcfi_t);
kfree(ipt->tcfi_tname);
kfree(ipt->tcfi_t);
}
ipt->tcfi_tname = tname;
ipt->tcfi_t = t;
ipt->tcfi_hook = hook;
spin_unlock_bh(&ipt->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
err3:
kfree(t);
err2:
kfree(tname);
err1:
if (ret == ACT_P_CREATED)
tcf_hash_cleanup(a, est);
return err;
}
static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
int ret = 0, result = 0;
struct tcf_ipt *ipt = a->priv;
struct xt_action_param par;
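/* Annotation (not in the original commit): iptables targets may rewrite
* packet data, so bail out unless we can get a private, writable copy of
* the skb first.
*/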
if (skb_unclone(skb, GFP_ATOMIC))
return TC_ACT_UNSPEC;
spin_lock(&ipt->tcf_lock);
ipt->tcf_tm.lastuse = jiffies;
bstats_update(&ipt->tcf_bstats, skb);
/* Yes, we have to worry about both in and out dev.
* Worry later - danger - this API seems to have changed
* from earlier kernels.
*/
par.in = skb->dev;
par.out = NULL;
par.hooknum = ipt->tcfi_hook;
par.target = ipt->tcfi_t->u.kernel.target;
par.targinfo = ipt->tcfi_t->data;
ret = par.target->target(skb, &par);
switch (ret) {
case NF_ACCEPT:
result = TC_ACT_OK;
break;
case NF_DROP:
result = TC_ACT_SHOT;
ipt->tcf_qstats.drops++;
break;
case XT_CONTINUE:
result = TC_ACT_PIPE;
break;
default:
net_notice_ratelimited("tc filter: Bogus netfilter code %d assume ACCEPT\n",
ret);
result = TC_POLICE_OK;
break;
}
spin_unlock(&ipt->tcf_lock);
return result;
}
static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_ipt *ipt = a->priv;
struct xt_entry_target *t;
struct tcf_t tm;
struct tc_cnt c;
/* For simple targets, kernel size == user size and
* user name == target name; to be foolproof you should
* not assume this.
*/
t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
if (unlikely(!t))
goto nla_put_failure;
c.bindcnt = ipt->tcf_bindcnt - bind;
c.refcnt = ipt->tcf_refcnt - ref;
strcpy(t->u.user.name, ipt->tcfi_t->u.kernel.target->name);
if (nla_put(skb, TCA_IPT_TARG, ipt->tcfi_t->u.user.target_size, t) ||
nla_put_u32(skb, TCA_IPT_INDEX, ipt->tcf_index) ||
nla_put_u32(skb, TCA_IPT_HOOK, ipt->tcfi_hook) ||
nla_put(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c) ||
nla_put_string(skb, TCA_IPT_TABLE, ipt->tcfi_tname))
goto nla_put_failure;
tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install);
tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse);
tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires);
if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm))
goto nla_put_failure;
kfree(t);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
kfree(t);
return -1;
}
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
.type = TCA_ACT_IPT,
.owner = THIS_MODULE,
.act = tcf_ipt,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
.init = tcf_ipt_init,
};
static struct tc_action_ops act_xt_ops = {
.kind = "xt",
.type = TCA_ACT_XT,
.owner = THIS_MODULE,
.act = tcf_ipt,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
.init = tcf_ipt_init,
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
MODULE_DESCRIPTION("Iptables target actions");
MODULE_LICENSE("GPL");
MODULE_ALIAS("act_xt");
static int __init ipt_init_module(void)
{
int ret1, ret2;
ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
if (ret1 < 0)
printk("Failed to load xt action\n");
ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
if (ret2 < 0)
printk("Failed to load ipt action\n");
if (ret1 < 0 && ret2 < 0)
	return ret1;
return 0;
}
static void __exit ipt_cleanup_module(void)
{
tcf_unregister_action(&act_xt_ops);
tcf_unregister_action(&act_ipt_ops);
}
module_init(ipt_init_module);
module_exit(ipt_cleanup_module);

292
net/sched/act_mirred.c Normal file

@@ -0,0 +1,292 @@
/*
* net/sched/mirred.c packet mirroring and redirect actions
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2002-4)
*
* TODO: Add ingress support (and socket redirect support)
*
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
#include <linux/if_arp.h>
#define MIRRED_TAB_MASK 7
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
static void tcf_mirred_release(struct tc_action *a, int bind)
{
struct tcf_mirred *m = to_mirred(a);
/* We could be called either in an RCU callback or with RTNL lock held. */
spin_lock_bh(&mirred_list_lock);
list_del(&m->tcfm_list);
spin_unlock_bh(&mirred_list_lock);
if (m->tcfm_dev)
dev_put(m->tcfm_dev);
}
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a, int ovr,
int bind)
{
struct nlattr *tb[TCA_MIRRED_MAX + 1];
struct tc_mirred *parm;
struct tcf_mirred *m;
struct net_device *dev;
int ret, ok_push = 0;
if (nla == NULL)
return -EINVAL;
ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy);
if (ret < 0)
return ret;
if (tb[TCA_MIRRED_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_MIRRED_PARMS]);
switch (parm->eaction) {
case TCA_EGRESS_MIRROR:
case TCA_EGRESS_REDIR:
case TCA_INGRESS_REDIR:
break;
default:
return -EINVAL;
}
if (parm->ifindex) {
dev = __dev_get_by_index(net, parm->ifindex);
if (dev == NULL)
return -ENODEV;
switch (dev->type) {
case ARPHRD_TUNNEL:
case ARPHRD_TUNNEL6:
case ARPHRD_SIT:
case ARPHRD_IPGRE:
case ARPHRD_VOID:
case ARPHRD_NONE:
ok_push = 0;
break;
default:
ok_push = 1;
break;
}
} else {
dev = NULL;
}
if (!tcf_hash_check(parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (!ovr) {
tcf_hash_release(a, bind);
return -EEXIST;
}
}
m = to_mirred(a);
spin_lock_bh(&m->tcf_lock);
m->tcf_action = parm->action;
m->tcfm_eaction = parm->eaction;
if (dev != NULL) {
m->tcfm_ifindex = parm->ifindex;
if (ret != ACT_P_CREATED)
dev_put(m->tcfm_dev);
dev_hold(dev);
m->tcfm_dev = dev;
m->tcfm_ok_push = ok_push;
}
spin_unlock_bh(&m->tcf_lock);
if (ret == ACT_P_CREATED) {
spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
spin_unlock_bh(&mirred_list_lock);
tcf_hash_insert(a);
}
return ret;
}
static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_mirred *m = a->priv;
struct net_device *dev;
struct sk_buff *skb2;
u32 at;
int retval, err = 1;
spin_lock(&m->tcf_lock);
m->tcf_tm.lastuse = jiffies;
bstats_update(&m->tcf_bstats, skb);
dev = m->tcfm_dev;
if (!dev) {
printk_once(KERN_NOTICE "tc mirred: target device is gone\n");
goto out;
}
if (!(dev->flags & IFF_UP)) {
net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
dev->name);
goto out;
}
skb2 = skb_act_clone(skb, GFP_ATOMIC, m->tcf_action);
if (skb2 == NULL)
goto out;
if (m->tcfm_eaction == TCA_INGRESS_REDIR) {
/* Let's _hope_ the devices are of similar type.
* This is rather dangerous; with changed skb_iif, we
* will not know the real input device, but perhaps
* that's the whole point of doing the ingress
* redirect/mirror in the first place? (Note: This
* can lead to bad things if two devices ingress
* redirect at each other. Don't do that.) */
skb2->dev = dev;
skb2->skb_iif = skb2->dev->ifindex;
skb2->pkt_type = PACKET_HOST;
netif_rx(skb2);
} else {
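/* Annotation (not in the original commit): when the packet was
* grabbed at ingress the link-layer header has already been pulled,
* so push it back on for device types where that makes sense
* (tcfm_ok_push).
*/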
at = G_TC_AT(skb->tc_verd);
if (!(at & AT_EGRESS)) {
if (m->tcfm_ok_push) {
skb_push(skb2, skb2->dev->hard_header_len);
}
}
/* mirror is always swallowed */
if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at);
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
err = dev_queue_xmit(skb2);
}
out:
if (err) {
m->tcf_qstats.overlimits++;
if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
retval = TC_ACT_SHOT;
else
retval = m->tcf_action;
} else
retval = m->tcf_action;
spin_unlock(&m->tcf_lock);
return retval;
}
static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_mirred *m = a->priv;
struct tc_mirred opt = {
.index = m->tcf_index,
.action = m->tcf_action,
.refcnt = m->tcf_refcnt - ref,
.bindcnt = m->tcf_bindcnt - bind,
.eaction = m->tcfm_eaction,
.ifindex = m->tcfm_ifindex,
};
struct tcf_t t;
if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(m->tcf_tm.expires);
if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct tcf_mirred *m;
if (event == NETDEV_UNREGISTER) {
spin_lock_bh(&mirred_list_lock);
list_for_each_entry(m, &mirred_list, tcfm_list) {
spin_lock_bh(&m->tcf_lock);
if (m->tcfm_dev == dev) {
dev_put(dev);
m->tcfm_dev = NULL;
}
spin_unlock_bh(&m->tcf_lock);
}
spin_unlock_bh(&mirred_list_lock);
}
return NOTIFY_DONE;
}
static struct notifier_block mirred_device_notifier = {
.notifier_call = mirred_device_event,
};
static struct tc_action_ops act_mirred_ops = {
.kind = "mirred",
.type = TCA_ACT_MIRRED,
.owner = THIS_MODULE,
.act = tcf_mirred,
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
};
MODULE_AUTHOR("Jamal Hadi Salim(2002)");
MODULE_DESCRIPTION("Device Mirror/redirect actions");
MODULE_LICENSE("GPL");
static int __init mirred_init_module(void)
{
int err = register_netdevice_notifier(&mirred_device_notifier);
if (err)
return err;
pr_info("Mirror/redirect action on\n");
return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
}
static void __exit mirred_cleanup_module(void)
{
tcf_unregister_action(&act_mirred_ops);
unregister_netdevice_notifier(&mirred_device_notifier);
}
module_init(mirred_init_module);
module_exit(mirred_cleanup_module);

306
net/sched/act_nat.c Normal file

@@ -0,0 +1,306 @@
/*
* Stateless NAT actions
*
* Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/tc_act/tc_nat.h>
#include <net/act_api.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/tc_act/tc_nat.h>
#include <net/tcp.h>
#include <net/udp.h>
#define NAT_TAB_MASK 15
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
};
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
struct nlattr *tb[TCA_NAT_MAX + 1];
struct tc_nat *parm;
int ret = 0, err;
struct tcf_nat *p;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy);
if (err < 0)
return err;
if (tb[TCA_NAT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
if (ret)
return ret;
ret = ACT_P_CREATED;
} else {
if (bind)
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
}
p = to_tcf_nat(a);
spin_lock_bh(&p->tcf_lock);
p->old_addr = parm->old_addr;
p->new_addr = parm->new_addr;
p->mask = parm->mask;
p->flags = parm->flags;
p->tcf_action = parm->action;
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_nat *p = a->priv;
struct iphdr *iph;
__be32 old_addr;
__be32 new_addr;
__be32 mask;
__be32 addr;
int egress;
int action;
int ihl;
int noff;
spin_lock(&p->tcf_lock);
p->tcf_tm.lastuse = jiffies;
old_addr = p->old_addr;
new_addr = p->new_addr;
mask = p->mask;
egress = p->flags & TCA_NAT_FLAG_EGRESS;
action = p->tcf_action;
bstats_update(&p->tcf_bstats, skb);
spin_unlock(&p->tcf_lock);
if (unlikely(action == TC_ACT_SHOT))
goto drop;
noff = skb_network_offset(skb);
if (!pskb_may_pull(skb, sizeof(*iph) + noff))
goto drop;
iph = ip_hdr(skb);
if (egress)
addr = iph->saddr;
else
addr = iph->daddr;
if (!((old_addr ^ addr) & mask)) {
if (skb_cloned(skb) &&
!skb_clone_writable(skb, sizeof(*iph) + noff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
goto drop;
new_addr &= mask;
new_addr |= addr & ~mask;
/* Rewrite IP header */
iph = ip_hdr(skb);
if (egress)
iph->saddr = new_addr;
else
iph->daddr = new_addr;
csum_replace4(&iph->check, addr, new_addr);
} else if ((iph->frag_off & htons(IP_OFFSET)) ||
iph->protocol != IPPROTO_ICMP) {
goto out;
}
ihl = iph->ihl * 4;
/* It would be nice to share code with stateful NAT. */
switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) {
case IPPROTO_TCP:
{
struct tcphdr *tcph;
if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
(skb_cloned(skb) &&
!skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
goto drop;
tcph = (void *)(skb_network_header(skb) + ihl);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1);
break;
}
case IPPROTO_UDP:
{
struct udphdr *udph;
if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
(skb_cloned(skb) &&
!skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
goto drop;
udph = (void *)(skb_network_header(skb) + ihl);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
new_addr, 1);
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
break;
}
case IPPROTO_ICMP:
{
struct icmphdr *icmph;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + noff))
goto drop;
icmph = (void *)(skb_network_header(skb) + ihl);
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_TIME_EXCEEDED) &&
(icmph->type != ICMP_PARAMETERPROB))
break;
if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph) +
noff))
goto drop;
icmph = (void *)(skb_network_header(skb) + ihl);
iph = (void *)(icmph + 1);
if (egress)
addr = iph->daddr;
else
addr = iph->saddr;
if ((old_addr ^ addr) & mask)
break;
if (skb_cloned(skb) &&
!skb_clone_writable(skb, ihl + sizeof(*icmph) +
sizeof(*iph) + noff) &&
pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
goto drop;
icmph = (void *)(skb_network_header(skb) + ihl);
iph = (void *)(icmph + 1);
new_addr &= mask;
new_addr |= addr & ~mask;
/* XXX Fix up the inner checksums. */
if (egress)
iph->daddr = new_addr;
else
iph->saddr = new_addr;
inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr,
0);
break;
}
default:
break;
}
out:
return action;
drop:
spin_lock(&p->tcf_lock);
p->tcf_qstats.drops++;
spin_unlock(&p->tcf_lock);
return TC_ACT_SHOT;
}
static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_nat *p = a->priv;
struct tc_nat opt = {
.old_addr = p->old_addr,
.new_addr = p->new_addr,
.mask = p->mask,
.flags = p->flags,
.index = p->tcf_index,
.action = p->tcf_action,
.refcnt = p->tcf_refcnt - ref,
.bindcnt = p->tcf_bindcnt - bind,
};
struct tcf_t t;
if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
.type = TCA_ACT_NAT,
.owner = THIS_MODULE,
.act = tcf_nat,
.dump = tcf_nat_dump,
.init = tcf_nat_init,
};
MODULE_DESCRIPTION("Stateless NAT actions");
MODULE_LICENSE("GPL");
static int __init nat_init_module(void)
{
return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
}
static void __exit nat_cleanup_module(void)
{
tcf_unregister_action(&act_nat_ops);
}
module_init(nat_init_module);
module_exit(nat_cleanup_module);
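The rewrite path above never recomputes a checksum from scratch: csum_replace4() and inet_proto_csum_replace4() fold only the difference between the old and new address into the existing one's-complement sum. A self-contained user-space sketch of that arithmetic (RFC 1624; the function name and the example checksum value are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit address change into an existing one's-complement
 * checksum: HC' = ~(~HC + ~m + m') per RFC 1624. */
static uint16_t csum_update4(uint16_t check, uint32_t from, uint32_t to)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~(from >> 16);
	sum += (uint16_t)~(from & 0xffff);
	sum += to >> 16;
	sum += to & 0xffff;
	while (sum >> 16)		/* fold carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* rewrite 10.0.0.1 -> 192.168.0.1; 0x1c46 is just an example
	 * value for the old header checksum */
	printf("new check: 0x%04x\n",
	       csum_update4(0x1c46, 0x0a000001, 0xc0a80001));
	return 0;
}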

243
net/sched/act_pedit.c Normal file

@@ -0,0 +1,243 @@
/*
* net/sched/pedit.c Generic packet editor
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2002-4)
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
#define PEDIT_TAB_MASK 15
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
};
static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
struct nlattr *tb[TCA_PEDIT_MAX + 1];
struct tc_pedit *parm;
int ret = 0, err;
struct tcf_pedit *p;
struct tc_pedit_key *keys = NULL;
int ksize;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy);
if (err < 0)
return err;
if (tb[TCA_PEDIT_PARMS] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_PEDIT_PARMS]);
ksize = parm->nkeys * sizeof(struct tc_pedit_key);
if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
return -EINVAL;
if (!tcf_hash_check(parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind);
if (ret)
return ret;
p = to_pedit(a);
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL) {
tcf_hash_cleanup(a, est);
return -ENOMEM;
}
ret = ACT_P_CREATED;
} else {
p = to_pedit(a);
tcf_hash_release(a, bind);
if (bind)
return 0;
if (!ovr)
return -EEXIST;
if (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys) {
keys = kmalloc(ksize, GFP_KERNEL);
if (keys == NULL)
return -ENOMEM;
}
}
spin_lock_bh(&p->tcf_lock);
p->tcfp_flags = parm->flags;
p->tcf_action = parm->action;
if (keys) {
kfree(p->tcfp_keys);
p->tcfp_keys = keys;
p->tcfp_nkeys = parm->nkeys;
}
memcpy(p->tcfp_keys, parm->keys, ksize);
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
static void tcf_pedit_cleanup(struct tc_action *a, int bind)
{
struct tcf_pedit *p = a->priv;
struct tc_pedit_key *keys = p->tcfp_keys;
kfree(keys);
}
static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_pedit *p = a->priv;
int i, munged = 0;
unsigned int off;
if (skb_unclone(skb, GFP_ATOMIC))
return p->tcf_action;
off = skb_network_offset(skb);
spin_lock(&p->tcf_lock);
p->tcf_tm.lastuse = jiffies;
if (p->tcfp_nkeys > 0) {
struct tc_pedit_key *tkey = p->tcfp_keys;
for (i = p->tcfp_nkeys; i > 0; i--, tkey++) {
u32 *ptr, _data;
int offset = tkey->off;
if (tkey->offmask) {
char *d, _d;
d = skb_header_pointer(skb, off + tkey->at, 1,
&_d);
if (!d)
goto bad;
offset += (*d & tkey->offmask) >> tkey->shift;
}
if (offset % 4) {
pr_info("tc filter pedit"
" offset must be on 32 bit boundaries\n");
goto bad;
}
if (offset > 0 && offset > skb->len) {
pr_info("tc filter pedit"
" offset %d can't exceed pkt length %d\n",
offset, skb->len);
goto bad;
}
ptr = skb_header_pointer(skb, off + offset, 4, &_data);
if (!ptr)
goto bad;
/* just do it, baby */
*ptr = ((*ptr & tkey->mask) ^ tkey->val);
if (ptr == &_data)
skb_store_bits(skb, off + offset, ptr, 4);
munged++;
}
if (munged)
skb->tc_verd = SET_TC_MUNGED(skb->tc_verd);
goto done;
} else
WARN(1, "pedit BUG: index %d\n", p->tcf_index);
bad:
p->tcf_qstats.overlimits++;
done:
bstats_update(&p->tcf_bstats, skb);
spin_unlock(&p->tcf_lock);
return p->tcf_action;
}
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_pedit *p = a->priv;
struct tc_pedit *opt;
struct tcf_t t;
int s;
s = sizeof(*opt) + p->tcfp_nkeys * sizeof(struct tc_pedit_key);
/* netlink spinlocks held above us - must use ATOMIC */
opt = kzalloc(s, GFP_ATOMIC);
if (unlikely(!opt))
return -ENOBUFS;
memcpy(opt->keys, p->tcfp_keys,
p->tcfp_nkeys * sizeof(struct tc_pedit_key));
opt->index = p->tcf_index;
opt->nkeys = p->tcfp_nkeys;
opt->flags = p->tcfp_flags;
opt->action = p->tcf_action;
opt->refcnt = p->tcf_refcnt - ref;
opt->bindcnt = p->tcf_bindcnt - bind;
if (nla_put(skb, TCA_PEDIT_PARMS, s, opt))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(p->tcf_tm.expires);
if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t))
goto nla_put_failure;
kfree(opt);
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
kfree(opt);
return -1;
}
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
.type = TCA_ACT_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
MODULE_DESCRIPTION("Generic Packet Editor actions");
MODULE_LICENSE("GPL");
static int __init pedit_init_module(void)
{
return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
}
static void __exit pedit_cleanup_module(void)
{
tcf_unregister_action(&act_pedit_ops);
}
module_init(pedit_init_module);
module_exit(pedit_cleanup_module);
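Each pedit key boils down to one read-modify-write on a 32-bit word: keep the bits selected by the mask, then XOR in the value. A tiny user-space sketch of that operation (hypothetical names):

#include <stdint.h>
#include <stdio.h>

/* pedit-style rewrite: bits where mask is 1 are kept, the rest are
 * replaced via XOR with val (for pure "set" semantics, val must be
 * pre-masked with ~mask so kept bits are not toggled) */
static uint32_t apply_key(uint32_t word, uint32_t mask, uint32_t val)
{
	return (word & mask) ^ val;
}

int main(void)
{
	/* overwrite the low byte of 0x11223344 with 0xff */
	printf("0x%08x\n", apply_key(0x11223344, 0xffffff00, 0x000000ff));
	return 0;
}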

372
net/sched/act_police.c Normal file

@@ -0,0 +1,372 @@
/*
* net/sched/police.c Input police filter.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* J Hadi Salim (action changes)
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/act_api.h>
#include <net/netlink.h>
struct tcf_police {
struct tcf_common common;
int tcfp_result;
u32 tcfp_ewma_rate;
s64 tcfp_burst;
u32 tcfp_mtu;
s64 tcfp_toks;
s64 tcfp_ptoks;
s64 tcfp_mtu_ptoks;
s64 tcfp_t_c;
struct psched_ratecfg rate;
bool rate_present;
struct psched_ratecfg peak;
bool peak_present;
};
#define to_police(pc) \
container_of(pc, struct tcf_police, common)
#define POL_TAB_MASK 15
/* old policer structure from before tc actions */
struct tc_police_compat {
u32 index;
int action;
u32 limit;
u32 burst;
u32 mtu;
struct tc_ratespec rate;
struct tc_ratespec peakrate;
};
/* Each policer is serialized by its individual spinlock */
static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
int type, struct tc_action *a)
{
struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct hlist_head *head;
struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
struct nlattr *nest;
spin_lock_bh(&hinfo->lock);
s_i = cb->args[0];
for (i = 0; i < (POL_TAB_MASK + 1); i++) {
head = &hinfo->htab[tcf_hash(i, POL_TAB_MASK)];
hlist_for_each_entry_rcu(p, head, tcfc_head) {
index++;
if (index < s_i)
continue;
a->priv = p;
a->order = index;
nest = nla_nest_start(skb, a->order);
if (nest == NULL)
goto nla_put_failure;
if (type == RTM_DELACTION)
err = tcf_action_dump_1(skb, a, 0, 1);
else
err = tcf_action_dump_1(skb, a, 0, 0);
if (err < 0) {
index--;
nla_nest_cancel(skb, nest);
goto done;
}
nla_nest_end(skb, nest);
n_i++;
}
}
done:
spin_unlock_bh(&hinfo->lock);
if (n_i)
cb->args[0] += n_i;
return n_i;
nla_put_failure:
nla_nest_cancel(skb, nest);
goto done;
}
static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
[TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE },
[TCA_POLICE_AVRATE] = { .type = NLA_U32 },
[TCA_POLICE_RESULT] = { .type = NLA_U32 },
};
static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
unsigned int h;
int ret = 0, err;
struct nlattr *tb[TCA_POLICE_MAX + 1];
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
struct tcf_hashinfo *hinfo = a->ops->hinfo;
int size;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy);
if (err < 0)
return err;
if (tb[TCA_POLICE_TBF] == NULL)
return -EINVAL;
size = nla_len(tb[TCA_POLICE_TBF]);
if (size != sizeof(*parm) && size != sizeof(struct tc_police_compat))
return -EINVAL;
parm = nla_data(tb[TCA_POLICE_TBF]);
if (parm->index) {
if (tcf_hash_search(a, parm->index)) {
police = to_police(a->priv);
if (bind) {
police->tcf_bindcnt += 1;
police->tcf_refcnt += 1;
return 0;
}
if (ovr)
goto override;
/* not replacing */
return -EEXIST;
}
}
police = kzalloc(sizeof(*police), GFP_KERNEL);
if (police == NULL)
return -ENOMEM;
ret = ACT_P_CREATED;
police->tcf_refcnt = 1;
spin_lock_init(&police->tcf_lock);
if (bind)
police->tcf_bindcnt = 1;
override:
if (parm->rate.rate) {
err = -ENOMEM;
R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE]);
if (R_tab == NULL)
goto failure;
if (parm->peakrate.rate) {
P_tab = qdisc_get_rtab(&parm->peakrate,
tb[TCA_POLICE_PEAKRATE]);
if (P_tab == NULL)
goto failure;
}
}
spin_lock_bh(&police->tcf_lock);
if (est) {
err = gen_replace_estimator(&police->tcf_bstats, NULL,
&police->tcf_rate_est,
&police->tcf_lock, est);
if (err)
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
!gen_estimator_active(&police->tcf_bstats,
&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
/* No failure allowed after this point */
police->tcfp_mtu = parm->mtu;
if (police->tcfp_mtu == 0) {
police->tcfp_mtu = ~0;
if (R_tab)
police->tcfp_mtu = 255 << R_tab->rate.cell_log;
}
if (R_tab) {
police->rate_present = true;
psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
qdisc_put_rtab(R_tab);
} else {
police->rate_present = false;
}
if (P_tab) {
police->peak_present = true;
psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
qdisc_put_rtab(P_tab);
} else {
police->peak_present = false;
}
if (tb[TCA_POLICE_RESULT])
police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
police->tcfp_toks = police->tcfp_burst;
if (police->peak_present) {
police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
police->tcfp_mtu);
police->tcfp_ptoks = police->tcfp_mtu_ptoks;
}
police->tcf_action = parm->action;
if (tb[TCA_POLICE_AVRATE])
police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
spin_unlock_bh(&police->tcf_lock);
if (ret != ACT_P_CREATED)
return ret;
police->tcfp_t_c = ktime_get_ns();
police->tcf_index = parm->index ? parm->index :
tcf_hash_new_index(hinfo);
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
spin_lock_bh(&hinfo->lock);
hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
spin_unlock_bh(&hinfo->lock);
a->priv = police;
return ret;
failure_unlock:
spin_unlock_bh(&police->tcf_lock);
failure:
qdisc_put_rtab(P_tab);
qdisc_put_rtab(R_tab);
if (ret == ACT_P_CREATED)
kfree(police);
return err;
}
static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_police *police = a->priv;
s64 now;
s64 toks;
s64 ptoks = 0;
spin_lock(&police->tcf_lock);
bstats_update(&police->tcf_bstats, skb);
if (police->tcfp_ewma_rate &&
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
police->tcf_qstats.overlimits++;
if (police->tcf_action == TC_ACT_SHOT)
police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
if (!police->rate_present) {
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
now = ktime_get_ns();
toks = min_t(s64, now - police->tcfp_t_c,
police->tcfp_burst);
if (police->peak_present) {
ptoks = toks + police->tcfp_ptoks;
if (ptoks > police->tcfp_mtu_ptoks)
ptoks = police->tcfp_mtu_ptoks;
ptoks -= (s64) psched_l2t_ns(&police->peak,
qdisc_pkt_len(skb));
}
toks += police->tcfp_toks;
if (toks > police->tcfp_burst)
toks = police->tcfp_burst;
toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
if ((toks|ptoks) >= 0) {
police->tcfp_t_c = now;
police->tcfp_toks = toks;
police->tcfp_ptoks = ptoks;
spin_unlock(&police->tcf_lock);
return police->tcfp_result;
}
}
police->tcf_qstats.overlimits++;
if (police->tcf_action == TC_ACT_SHOT)
police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
}
static int
tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_police *police = a->priv;
struct tc_police opt = {
.index = police->tcf_index,
.action = police->tcf_action,
.mtu = police->tcfp_mtu,
.burst = PSCHED_NS2TICKS(police->tcfp_burst),
.refcnt = police->tcf_refcnt - ref,
.bindcnt = police->tcf_bindcnt - bind,
};
if (police->rate_present)
psched_ratecfg_getrate(&opt.rate, &police->rate);
if (police->peak_present)
psched_ratecfg_getrate(&opt.peakrate, &police->peak);
if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
goto nla_put_failure;
if (police->tcfp_result &&
nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
goto nla_put_failure;
if (police->tcfp_ewma_rate &&
nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
MODULE_AUTHOR("Alexey Kuznetsov");
MODULE_DESCRIPTION("Policing actions");
MODULE_LICENSE("GPL");
static struct tc_action_ops act_police_ops = {
.kind = "police",
.type = TCA_ID_POLICE,
.owner = THIS_MODULE,
.act = tcf_act_police,
.dump = tcf_act_police_dump,
.init = tcf_act_police_locate,
.walk = tcf_act_police_walker
};
static int __init
police_init_module(void)
{
return tcf_register_action(&act_police_ops, POL_TAB_MASK);
}
static void __exit
police_cleanup_module(void)
{
tcf_unregister_action(&act_police_ops);
}
module_init(police_init_module);
module_exit(police_cleanup_module);
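tcf_act_police() above is a token bucket in which credit is measured as nanoseconds of transmission time at the configured rate, capped at the burst. A simplified single-rate sketch in user space (no peak rate or rate estimator; names are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* tokens are nanoseconds of accumulated idle time, capped at burst */
struct bucket {
	int64_t toks;		/* current credit, ns */
	int64_t burst;		/* cap, ns */
	int64_t t_c;		/* last conforming packet, ns */
	uint64_t rate;		/* bytes per second */
};

static int64_t len_to_ns(const struct bucket *b, uint32_t len)
{
	return (int64_t)(len * 1000000000ULL / b->rate);
}

/* true = conforming (state committed), false = exceeding (state kept) */
static bool police(struct bucket *b, int64_t now, uint32_t len)
{
	int64_t toks = now - b->t_c;

	if (toks > b->burst)
		toks = b->burst;
	toks += b->toks;
	if (toks > b->burst)
		toks = b->burst;
	toks -= len_to_ns(b, len);
	if (toks < 0)
		return false;
	b->t_c = now;
	b->toks = toks;
	return true;
}

int main(void)
{
	/* 1 Mbyte/s with 10 ms worth of burst */
	struct bucket b = { .toks = 10000000, .burst = 10000000,
			    .t_c = 0, .rate = 1000000 };

	printf("1500B at t=0: %s\n", police(&b, 0, 1500) ? "ok" : "drop");
	return 0;
}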

192
net/sched/act_simple.c Normal file

@@ -0,0 +1,192 @@
/*
* net/sched/simp.c Simple example of an action
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim (2005-8)
*
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#define TCA_ACT_SIMP 22
#include <linux/tc_act/tc_defact.h>
#include <net/tc_act/tc_defact.h>
#define SIMP_TAB_MASK 7
#define SIMP_MAX_DATA 32
static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_defact *d = a->priv;
spin_lock(&d->tcf_lock);
d->tcf_tm.lastuse = jiffies;
bstats_update(&d->tcf_bstats, skb);
/* print the policy string followed by _ and the packet count.
 * Example: if this was the 3rd packet and the string was "hello",
 * it would look like "hello_3" (without quotes).
 */
pr_info("simple: %s_%d\n",
(char *)d->tcfd_defdata, d->tcf_bstats.packets);
spin_unlock(&d->tcf_lock);
return d->tcf_action;
}
static void tcf_simp_release(struct tc_action *a, int bind)
{
struct tcf_defact *d = to_defact(a);
kfree(d->tcfd_defdata);
}
static int alloc_defdata(struct tcf_defact *d, char *defdata)
{
d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL);
if (unlikely(!d->tcfd_defdata))
return -ENOMEM;
strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
return 0;
}
static void reset_policy(struct tcf_defact *d, char *defdata,
struct tc_defact *p)
{
spin_lock_bh(&d->tcf_lock);
d->tcf_action = p->action;
memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
}
static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
[TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) },
[TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA },
};
static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
struct nlattr *tb[TCA_DEF_MAX + 1];
struct tc_defact *parm;
struct tcf_defact *d;
char *defdata;
int ret = 0, err;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy);
if (err < 0)
return err;
if (tb[TCA_DEF_PARMS] == NULL)
return -EINVAL;
if (tb[TCA_DEF_DATA] == NULL)
return -EINVAL;
parm = nla_data(tb[TCA_DEF_PARMS]);
defdata = nla_data(tb[TCA_DEF_DATA]);
if (!tcf_hash_check(parm->index, a, bind)) {
ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
if (ret)
return ret;
d = to_defact(a);
ret = alloc_defdata(d, defdata);
if (ret < 0) {
tcf_hash_cleanup(a, est);
return ret;
}
d->tcf_action = parm->action;
ret = ACT_P_CREATED;
} else {
d = to_defact(a);
if (bind)
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
reset_policy(d, defdata, parm);
}
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_defact *d = a->priv;
struct tc_defact opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
.bindcnt = d->tcf_bindcnt - bind,
.action = d->tcf_action,
};
struct tcf_t t;
if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) ||
nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
.type = TCA_ACT_SIMP,
.owner = THIS_MODULE,
.act = tcf_simp,
.dump = tcf_simp_dump,
.cleanup = tcf_simp_release,
.init = tcf_simp_init,
};
MODULE_AUTHOR("Jamal Hadi Salim(2005)");
MODULE_DESCRIPTION("Simple example action");
MODULE_LICENSE("GPL");
static int __init simp_init_module(void)
{
int ret;
ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
if (!ret)
pr_info("Simple TC action Loaded\n");
return ret;
}
static void __exit simp_cleanup_module(void)
{
tcf_unregister_action(&act_simp_ops);
}
module_init(simp_init_module);
module_exit(simp_cleanup_module);

199
net/sched/act_skbedit.c Normal file

@@ -0,0 +1,199 @@
/*
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_skbedit.h>
#include <net/tc_act/tc_skbedit.h>
#define SKBEDIT_TAB_MASK 15
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_skbedit *d = a->priv;
spin_lock(&d->tcf_lock);
d->tcf_tm.lastuse = jiffies;
bstats_update(&d->tcf_bstats, skb);
if (d->flags & SKBEDIT_F_PRIORITY)
skb->priority = d->priority;
if (d->flags & SKBEDIT_F_QUEUE_MAPPING &&
skb->dev->real_num_tx_queues > d->queue_mapping)
skb_set_queue_mapping(skb, d->queue_mapping);
if (d->flags & SKBEDIT_F_MARK)
skb->mark = d->mark;
spin_unlock(&d->tcf_lock);
return d->tcf_action;
}
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
[TCA_SKBEDIT_QUEUE_MAPPING] = { .len = sizeof(u16) },
[TCA_SKBEDIT_MARK] = { .len = sizeof(u32) },
};
static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
u32 flags = 0, *priority = NULL, *mark = NULL;
u16 *queue_mapping = NULL;
int ret = 0, err;
if (nla == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy);
if (err < 0)
return err;
if (tb[TCA_SKBEDIT_PARMS] == NULL)
return -EINVAL;
if (tb[TCA_SKBEDIT_PRIORITY] != NULL) {
flags |= SKBEDIT_F_PRIORITY;
priority = nla_data(tb[TCA_SKBEDIT_PRIORITY]);
}
if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) {
flags |= SKBEDIT_F_QUEUE_MAPPING;
queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]);
}
if (tb[TCA_SKBEDIT_MARK] != NULL) {
flags |= SKBEDIT_F_MARK;
mark = nla_data(tb[TCA_SKBEDIT_MARK]);
}
if (!flags)
return -EINVAL;
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
if (!tcf_hash_check(parm->index, a, bind)) {
ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind);
if (ret)
return ret;
d = to_skbedit(a);
ret = ACT_P_CREATED;
} else {
d = to_skbedit(a);
if (bind)
return 0;
tcf_hash_release(a, bind);
if (!ovr)
return -EEXIST;
}
spin_lock_bh(&d->tcf_lock);
d->flags = flags;
if (flags & SKBEDIT_F_PRIORITY)
d->priority = *priority;
if (flags & SKBEDIT_F_QUEUE_MAPPING)
d->queue_mapping = *queue_mapping;
if (flags & SKBEDIT_F_MARK)
d->mark = *mark;
d->tcf_action = parm->action;
spin_unlock_bh(&d->tcf_lock);
if (ret == ACT_P_CREATED)
tcf_hash_insert(a);
return ret;
}
static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
unsigned char *b = skb_tail_pointer(skb);
struct tcf_skbedit *d = a->priv;
struct tc_skbedit opt = {
.index = d->tcf_index,
.refcnt = d->tcf_refcnt - ref,
.bindcnt = d->tcf_bindcnt - bind,
.action = d->tcf_action,
};
struct tcf_t t;
if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_PRIORITY) &&
nla_put(skb, TCA_SKBEDIT_PRIORITY, sizeof(d->priority),
&d->priority))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_QUEUE_MAPPING) &&
nla_put(skb, TCA_SKBEDIT_QUEUE_MAPPING,
sizeof(d->queue_mapping), &d->queue_mapping))
goto nla_put_failure;
if ((d->flags & SKBEDIT_F_MARK) &&
nla_put(skb, TCA_SKBEDIT_MARK, sizeof(d->mark),
&d->mark))
goto nla_put_failure;
t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install);
t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse);
t.expires = jiffies_to_clock_t(d->tcf_tm.expires);
if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
.type = TCA_ACT_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
};
MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
MODULE_DESCRIPTION("SKB Editing");
MODULE_LICENSE("GPL");
static int __init skbedit_init_module(void)
{
return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
}
static void __exit skbedit_cleanup_module(void)
{
tcf_unregister_action(&act_skbedit_ops);
}
module_init(skbedit_init_module);
module_exit(skbedit_cleanup_module);

623
net/sched/cls_api.c Normal file

@@ -0,0 +1,623 @@
/*
* net/sched/cls_api.c Packet classifier API.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
*
* Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
/* The list of all installed classifier types */
static LIST_HEAD(tcf_proto_base);
/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
static const struct tcf_proto_ops *tcf_proto_lookup_ops(struct nlattr *kind)
{
const struct tcf_proto_ops *t, *res = NULL;
if (kind) {
read_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (nla_strcmp(kind, t->kind) == 0) {
if (try_module_get(t->owner))
res = t;
break;
}
}
read_unlock(&cls_mod_lock);
}
return res;
}
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t;
int rc = -EEXIST;
write_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head)
if (!strcmp(ops->kind, t->kind))
goto out;
list_add_tail(&ops->head, &tcf_proto_base);
rc = 0;
out:
write_unlock(&cls_mod_lock);
return rc;
}
EXPORT_SYMBOL(register_tcf_proto_ops);
int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
{
struct tcf_proto_ops *t;
int rc = -ENOENT;
write_lock(&cls_mod_lock);
list_for_each_entry(t, &tcf_proto_base, head) {
if (t == ops) {
list_del(&t->head);
rc = 0;
break;
}
}
write_unlock(&cls_mod_lock);
return rc;
}
EXPORT_SYMBOL(unregister_tcf_proto_ops);
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
unsigned long fh, int event);
/* Select a new prio value from the range managed by the kernel. */
static inline u32 tcf_auto_prio(struct tcf_proto *tp)
{
u32 first = TC_H_MAKE(0xC0000000U, 0U);
if (tp)
first = tp->prio - 1;
return first;
}
/* Add/change/delete/get a filter node */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
struct tcmsg *t;
u32 protocol;
u32 prio;
u32 nprio;
u32 parent;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto __rcu **back;
struct tcf_proto __rcu **chain;
struct tcf_proto *tp;
const struct tcf_proto_ops *tp_ops;
const struct Qdisc_class_ops *cops;
unsigned long cl;
unsigned long fh;
int err;
int tp_created = 0;
if ((n->nlmsg_type != RTM_GETTFILTER) &&
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
replay:
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
t = nlmsg_data(n);
protocol = TC_H_MIN(t->tcm_info);
prio = TC_H_MAJ(t->tcm_info);
nprio = prio;
parent = t->tcm_parent;
cl = 0;
if (prio == 0) {
/* If no priority is given, the user wants us to allocate one. */
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
return -ENOENT;
prio = TC_H_MAKE(0x80000000U, 0U);
}
/* Find head of filter chain. */
/* Find link */
dev = __dev_get_by_index(net, t->tcm_ifindex);
if (dev == NULL)
return -ENODEV;
/* Find qdisc */
if (!parent) {
q = dev->qdisc;
parent = q->handle;
} else {
q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
if (q == NULL)
return -EINVAL;
}
/* Is it classful? */
cops = q->ops->cl_ops;
if (!cops)
return -EINVAL;
if (cops->tcf_chain == NULL)
return -EOPNOTSUPP;
/* Are we searching for a filter attached to a class? */
if (TC_H_MIN(parent)) {
cl = cops->get(q, parent);
if (cl == 0)
return -ENOENT;
}
/* And the last stroke */
chain = cops->tcf_chain(q, cl);
err = -EINVAL;
if (chain == NULL)
goto errout;
/* Check the chain for existence of proto-tcf with this priority */
for (back = chain;
(tp = rtnl_dereference(*back)) != NULL;
back = &tp->next) {
if (tp->prio >= prio) {
if (tp->prio == prio) {
if (!nprio ||
(tp->protocol != protocol && protocol))
goto errout;
} else
tp = NULL;
break;
}
}
if (tp == NULL) {
/* Proto-tcf does not exist, create new one */
if (tca[TCA_KIND] == NULL || !protocol)
goto errout;
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
/* Create new proto tcf */
err = -ENOBUFS;
tp = kzalloc(sizeof(*tp), GFP_KERNEL);
if (tp == NULL)
goto errout;
err = -ENOENT;
tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
if (tp_ops == NULL) {
#ifdef CONFIG_MODULES
struct nlattr *kind = tca[TCA_KIND];
char name[IFNAMSIZ];
if (kind != NULL &&
nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
rtnl_unlock();
request_module("cls_%s", name);
rtnl_lock();
tp_ops = tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to
* perform the module load. So, even if we
* succeeded in loading the module we have to
* replay the request. We indicate this using
* -EAGAIN.
*/
if (tp_ops != NULL) {
module_put(tp_ops->owner);
err = -EAGAIN;
}
}
#endif
kfree(tp);
goto errout;
}
tp->ops = tp_ops;
tp->protocol = protocol;
tp->prio = nprio ? :
TC_H_MAJ(tcf_auto_prio(rtnl_dereference(*back)));
tp->q = q;
tp->classify = tp_ops->classify;
tp->classid = parent;
err = tp_ops->init(tp);
if (err != 0) {
module_put(tp_ops->owner);
kfree(tp);
goto errout;
}
tp_created = 1;
} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
goto errout;
fh = tp->ops->get(tp, t->tcm_handle);
if (fh == 0) {
if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
struct tcf_proto *next = rtnl_dereference(tp->next);
RCU_INIT_POINTER(*back, next);
tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
tcf_destroy(tp);
err = 0;
goto errout;
}
err = -ENOENT;
if (n->nlmsg_type != RTM_NEWTFILTER ||
!(n->nlmsg_flags & NLM_F_CREATE))
goto errout;
} else {
switch (n->nlmsg_type) {
case RTM_NEWTFILTER:
err = -EEXIST;
if (n->nlmsg_flags & NLM_F_EXCL) {
if (tp_created)
tcf_destroy(tp);
goto errout;
}
break;
case RTM_DELTFILTER:
err = tp->ops->delete(tp, fh);
if (err == 0)
tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
goto errout;
case RTM_GETTFILTER:
err = tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
goto errout;
default:
err = -EINVAL;
goto errout;
}
}
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE);
if (err == 0) {
if (tp_created) {
RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
rcu_assign_pointer(*back, tp);
}
tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
} else {
if (tp_created)
tcf_destroy(tp);
}
errout:
if (cl)
cops->put(q, cl);
if (err == -EAGAIN)
/* Replay the request. */
goto replay;
return err;
}
static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp,
unsigned long fh, u32 portid, u32 seq, u16 flags, int event)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
goto out_nlmsg_trim;
tcm = nlmsg_data(nlh);
tcm->tcm_family = AF_UNSPEC;
tcm->tcm__pad1 = 0;
tcm->tcm__pad2 = 0;
tcm->tcm_ifindex = qdisc_dev(tp->q)->ifindex;
tcm->tcm_parent = tp->classid;
tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol);
if (nla_put_string(skb, TCA_KIND, tp->ops->kind))
goto nla_put_failure;
tcm->tcm_handle = fh;
if (RTM_DELTFILTER != event) {
tcm->tcm_handle = 0;
if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm) < 0)
goto nla_put_failure;
}
nlh->nlmsg_len = skb_tail_pointer(skb) - b;
return skb->len;
out_nlmsg_trim:
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
struct nlmsghdr *n, struct tcf_proto *tp,
unsigned long fh, int event)
{
struct sk_buff *skb;
u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb)
return -ENOBUFS;
if (tcf_fill_node(net, skb, tp, fh, portid, n->nlmsg_seq, 0, event) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
n->nlmsg_flags & NLM_F_ECHO);
}
struct tcf_dump_args {
struct tcf_walker w;
struct sk_buff *skb;
struct netlink_callback *cb;
};
static int tcf_node_dump(struct tcf_proto *tp, unsigned long n,
struct tcf_walker *arg)
{
struct tcf_dump_args *a = (void *)arg;
struct net *net = sock_net(a->skb->sk);
return tcf_fill_node(net, a->skb, tp, n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER);
}
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
int t;
int s_t;
struct net_device *dev;
struct Qdisc *q;
struct tcf_proto *tp, __rcu **chain;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
unsigned long cl = 0;
const struct Qdisc_class_ops *cops;
struct tcf_dump_args arg;
if (nlmsg_len(cb->nlh) < sizeof(*tcm))
return skb->len;
dev = __dev_get_by_index(net, tcm->tcm_ifindex);
if (!dev)
return skb->len;
if (!tcm->tcm_parent)
q = dev->qdisc;
else
q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
if (!q)
goto out;
cops = q->ops->cl_ops;
if (!cops)
goto errout;
if (cops->tcf_chain == NULL)
goto errout;
if (TC_H_MIN(tcm->tcm_parent)) {
cl = cops->get(q, tcm->tcm_parent);
if (cl == 0)
goto errout;
}
chain = cops->tcf_chain(q, cl);
if (chain == NULL)
goto errout;
s_t = cb->args[0];
for (tp = rtnl_dereference(*chain), t = 0;
tp; tp = rtnl_dereference(tp->next), t++) {
if (t < s_t)
continue;
if (TC_H_MAJ(tcm->tcm_info) &&
TC_H_MAJ(tcm->tcm_info) != tp->prio)
continue;
if (TC_H_MIN(tcm->tcm_info) &&
TC_H_MIN(tcm->tcm_info) != tp->protocol)
continue;
if (t > s_t)
memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
if (cb->args[1] == 0) {
if (tcf_fill_node(net, skb, tp, 0, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
break;
cb->args[1] = 1;
}
if (tp->ops->walk == NULL)
continue;
arg.w.fn = tcf_node_dump;
arg.skb = skb;
arg.cb = cb;
arg.w.stop = 0;
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
tp->ops->walk(tp, &arg.w);
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
break;
}
cb->args[0] = t;
errout:
if (cl)
cops->put(q, cl);
out:
return skb->len;
}
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
tcf_action_destroy(&exts->actions, TCA_ACT_UNBIND);
INIT_LIST_HEAD(&exts->actions);
#endif
}
EXPORT_SYMBOL(tcf_exts_destroy);
int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr)
{
#ifdef CONFIG_NET_CLS_ACT
{
struct tc_action *act;
INIT_LIST_HEAD(&exts->actions);
if (exts->police && tb[exts->police]) {
act = tcf_action_init_1(net, tb[exts->police], rate_tlv,
"police", ovr,
TCA_ACT_BIND);
if (IS_ERR(act))
return PTR_ERR(act);
act->type = exts->type = TCA_OLD_COMPAT;
list_add(&act->list, &exts->actions);
} else if (exts->action && tb[exts->action]) {
int err;
err = tcf_action_init(net, tb[exts->action], rate_tlv,
NULL, ovr,
TCA_ACT_BIND, &exts->actions);
if (err)
return err;
}
}
#else
if ((exts->action && tb[exts->action]) ||
(exts->police && tb[exts->police]))
return -EOPNOTSUPP;
#endif
return 0;
}
EXPORT_SYMBOL(tcf_exts_validate);
void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst,
struct tcf_exts *src)
{
#ifdef CONFIG_NET_CLS_ACT
LIST_HEAD(tmp);
tcf_tree_lock(tp);
list_splice_init(&dst->actions, &tmp);
list_splice(&src->actions, &dst->actions);
dst->type = src->type;
tcf_tree_unlock(tp);
tcf_action_destroy(&tmp, TCA_ACT_UNBIND);
#endif
}
EXPORT_SYMBOL(tcf_exts_change);
#define tcf_exts_first_act(ext) \
list_first_entry(&(ext)->actions, struct tc_action, list)
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
if (exts->action && !list_empty(&exts->actions)) {
/*
* again for backward compatible mode - we want
* to work with both old and new modes of entering
* tc data even if iproute2 was newer - jhs
*/
if (exts->type != TCA_OLD_COMPAT) {
nest = nla_nest_start(skb, exts->action);
if (nest == NULL)
goto nla_put_failure;
if (tcf_action_dump(skb, &exts->actions, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
struct tc_action *act = tcf_exts_first_act(exts);
nest = nla_nest_start(skb, exts->police);
if (nest == NULL || !act)
goto nla_put_failure;
if (tcf_action_dump_old(skb, act, 0, 0) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
}
}
return 0;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
#else
return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_dump);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct tc_action *a = tcf_exts_first_act(exts);
if (tcf_action_copy_stats(skb, a, 1) < 0)
return -1;
#endif
return 0;
}
EXPORT_SYMBOL(tcf_exts_dump_stats);
static int __init tc_filter_init(void)
{
rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, NULL);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,
tc_dump_tfilter, NULL);
return 0;
}
subsys_initcall(tc_filter_init);
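The -EAGAIN convention in tc_ctl_tfilter() is worth calling out: because the handler drops the RTNL lock to load a classifier module, it cannot simply continue, so it reports -EAGAIN and the caller replays the whole request from the top. A minimal user-space sketch of that retry shape (stand-in names, not the kernel control path):

#include <errno.h>
#include <stdio.h>

static int attempts;

/* stand-in for the request body: pretend the first pass had to drop
 * its lock and load a module, so it asks to be replayed */
static int do_request(void)
{
	return ++attempts == 1 ? -EAGAIN : 0;
}

int main(void)
{
	int err;

	do {
		err = do_request();	/* replay until no module load */
	} while (err == -EAGAIN);

	printf("done after %d attempt(s)\n", attempts);
	return err;
}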

317
net/sched/cls_basic.c Normal file

@@ -0,0 +1,317 @@
/*
* net/sched/cls_basic.c Basic Packet Classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
struct basic_head {
u32 hgenerator;
struct list_head flist;
struct rcu_head rcu;
};
struct basic_filter {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
struct tcf_result res;
struct tcf_proto *tp;
struct list_head link;
struct rcu_head rcu;
};
static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
int r;
struct basic_head *head = rcu_dereference_bh(tp->root);
struct basic_filter *f;
list_for_each_entry_rcu(f, &head->flist, link) {
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
*res = f->res;
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
return r;
}
return -1;
}
static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
{
unsigned long l = 0UL;
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
if (head == NULL)
return 0UL;
list_for_each_entry(f, &head->flist, link)
if (f->handle == handle)
l = (unsigned long) f;
return l;
}
static void basic_put(struct tcf_proto *tp, unsigned long f)
{
}
static int basic_init(struct tcf_proto *tp)
{
struct basic_head *head;
head = kzalloc(sizeof(*head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
INIT_LIST_HEAD(&head->flist);
rcu_assign_pointer(tp->root, head);
return 0;
}
static void basic_delete_filter(struct rcu_head *head)
{
struct basic_filter *f = container_of(head, struct basic_filter, rcu);
tcf_exts_destroy(&f->exts);
tcf_em_tree_destroy(&f->ematches);
kfree(f);
}
static void basic_destroy(struct tcf_proto *tp)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f, *n;
list_for_each_entry_safe(f, n, &head->flist, link) {
list_del_rcu(&f->link);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, basic_delete_filter);
}
RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
}
static int basic_delete(struct tcf_proto *tp, unsigned long arg)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *t, *f = (struct basic_filter *) arg;
list_for_each_entry(t, &head->flist, link)
if (t == f) {
list_del_rcu(&t->link);
tcf_unbind_filter(tp, &t->res);
call_rcu(&t->rcu, basic_delete_filter);
return 0;
}
return -ENOENT;
}
static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
[TCA_BASIC_CLASSID] = { .type = NLA_U32 },
[TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
};
static int basic_set_parms(struct net *net, struct tcf_proto *tp,
struct basic_filter *f, unsigned long base,
struct nlattr **tb,
struct nlattr *est, bool ovr)
{
int err;
struct tcf_exts e;
struct tcf_ematch_tree t;
tcf_exts_init(&e, TCA_BASIC_ACT, TCA_BASIC_POLICE);
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES], &t);
if (err < 0)
goto errout;
if (tb[TCA_BASIC_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_BASIC_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
tcf_exts_change(tp, &f->exts, &e);
tcf_em_tree_change(tp, &f->ematches, &t);
f->tp = tp;
return 0;
errout:
tcf_exts_destroy(&e);
return err;
}
static int basic_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, unsigned long *arg, bool ovr)
{
int err;
struct basic_head *head = rtnl_dereference(tp->root);
struct nlattr *tb[TCA_BASIC_MAX + 1];
struct basic_filter *fold = (struct basic_filter *) *arg;
struct basic_filter *fnew;
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS],
basic_policy);
if (err < 0)
return err;
if (fold != NULL) {
if (handle && fold->handle != handle)
return -EINVAL;
}
err = -ENOBUFS;
fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
if (fnew == NULL)
goto errout;
tcf_exts_init(&fnew->exts, TCA_BASIC_ACT, TCA_BASIC_POLICE);
err = -EINVAL;
if (handle) {
fnew->handle = handle;
} else if (fold) {
fnew->handle = fold->handle;
} else {
unsigned int i = 0x80000000;
do {
if (++head->hgenerator == 0x7FFFFFFF)
head->hgenerator = 1;
} while (--i > 0 && basic_get(tp, head->hgenerator));
if (i <= 0) {
pr_err("Insufficient number of handles\n");
goto errout;
}
fnew->handle = head->hgenerator;
}
err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr);
if (err < 0)
goto errout;
*arg = (unsigned long)fnew;
if (fold) {
list_replace_rcu(&fold->link, &fnew->link);
tcf_unbind_filter(tp, &fold->res);
call_rcu(&fold->rcu, basic_delete_filter);
} else {
list_add_rcu(&fnew->link, &head->flist);
}
return 0;
errout:
kfree(fnew);
return err;
}
static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct basic_head *head = rtnl_dereference(tp->root);
struct basic_filter *f;
list_for_each_entry(f, &head->flist, link) {
if (arg->count < arg->skip)
goto skip;
if (arg->fn(tp, (unsigned long) f, arg) < 0) {
arg->stop = 1;
break;
}
skip:
arg->count++;
}
}
static int basic_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct basic_filter *f = (struct basic_filter *) fh;
struct nlattr *nest;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (f->res.classid &&
nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts) < 0 ||
tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static struct tcf_proto_ops cls_basic_ops __read_mostly = {
.kind = "basic",
.classify = basic_classify,
.init = basic_init,
.destroy = basic_destroy,
.get = basic_get,
.put = basic_put,
.change = basic_change,
.delete = basic_delete,
.walk = basic_walk,
.dump = basic_dump,
.owner = THIS_MODULE,
};
static int __init init_basic(void)
{
return register_tcf_proto_ops(&cls_basic_ops);
}
static void __exit exit_basic(void)
{
unregister_tcf_proto_ops(&cls_basic_ops);
}
module_init(init_basic)
module_exit(exit_basic)
MODULE_LICENSE("GPL");

384
net/sched/cls_bpf.c Normal file

@@ -0,0 +1,384 @@
/*
* Berkeley Packet Filter based traffic classifier
*
* Might be used to classify traffic through flexible, user-defined and
* possibly JIT-ed BPF filters for traffic control as an alternative to
* ematches.
*
* (C) 2013 Daniel Borkmann <dborkman@redhat.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/filter.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_DESCRIPTION("TC BPF based classifier");
struct cls_bpf_head {
struct list_head plist;
u32 hgen;
struct rcu_head rcu;
};
struct cls_bpf_prog {
struct bpf_prog *filter;
struct sock_filter *bpf_ops;
struct tcf_exts exts;
struct tcf_result res;
struct list_head link;
u32 handle;
u16 bpf_len;
struct tcf_proto *tp;
struct rcu_head rcu;
};
static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = {
[TCA_BPF_CLASSID] = { .type = NLA_U32 },
[TCA_BPF_OPS_LEN] = { .type = NLA_U16 },
[TCA_BPF_OPS] = { .type = NLA_BINARY,
.len = sizeof(struct sock_filter) * BPF_MAXINSNS },
};
static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
struct cls_bpf_prog *prog;
int ret;
list_for_each_entry_rcu(prog, &head->plist, link) {
int filter_res = BPF_PROG_RUN(prog->filter, skb);
if (filter_res == 0)
continue;
*res = prog->res;
if (filter_res != -1)
res->classid = filter_res;
ret = tcf_exts_exec(skb, &prog->exts, res);
if (ret < 0)
continue;
return ret;
}
return -1;
}
static int cls_bpf_init(struct tcf_proto *tp)
{
struct cls_bpf_head *head;
head = kzalloc(sizeof(*head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
INIT_LIST_HEAD_RCU(&head->plist);
rcu_assign_pointer(tp->root, head);
return 0;
}
static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog)
{
tcf_exts_destroy(&prog->exts);
bpf_prog_destroy(prog->filter);
kfree(prog->bpf_ops);
kfree(prog);
}
static void __cls_bpf_delete_prog(struct rcu_head *rcu)
{
struct cls_bpf_prog *prog = container_of(rcu, struct cls_bpf_prog, rcu);
cls_bpf_delete_prog(prog->tp, prog);
}
static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog, *todel = (struct cls_bpf_prog *) arg;
list_for_each_entry(prog, &head->plist, link) {
if (prog == todel) {
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
return 0;
}
}
return -ENOENT;
}
static void cls_bpf_destroy(struct tcf_proto *tp)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog, *tmp;
list_for_each_entry_safe(prog, tmp, &head->plist, link) {
list_del_rcu(&prog->link);
tcf_unbind_filter(tp, &prog->res);
call_rcu(&prog->rcu, __cls_bpf_delete_prog);
}
RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
}
static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog;
unsigned long ret = 0UL;
if (head == NULL)
return 0UL;
list_for_each_entry_rcu(prog, &head->plist, link) {
if (prog->handle == handle) {
ret = (unsigned long) prog;
break;
}
}
return ret;
}
static void cls_bpf_put(struct tcf_proto *tp, unsigned long f)
{
}
static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp,
struct cls_bpf_prog *prog,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
struct sock_filter *bpf_ops;
struct tcf_exts exts;
struct sock_fprog_kern tmp;
struct bpf_prog *fp;
u16 bpf_size, bpf_len;
u32 classid;
int ret;
if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID])
return -EINVAL;
tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE);
ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr);
if (ret < 0)
return ret;
classid = nla_get_u32(tb[TCA_BPF_CLASSID]);
bpf_len = nla_get_u16(tb[TCA_BPF_OPS_LEN]);
if (bpf_len > BPF_MAXINSNS || bpf_len == 0) {
ret = -EINVAL;
goto errout;
}
bpf_size = bpf_len * sizeof(*bpf_ops);
bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
if (bpf_ops == NULL) {
ret = -ENOMEM;
goto errout;
}
memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
tmp.len = bpf_len;
tmp.filter = bpf_ops;
ret = bpf_prog_create(&fp, &tmp);
if (ret)
goto errout_free;
prog->bpf_len = bpf_len;
prog->bpf_ops = bpf_ops;
prog->filter = fp;
prog->res.classid = classid;
tcf_bind_filter(tp, &prog->res, base);
tcf_exts_change(tp, &prog->exts, &exts);
return 0;
errout_free:
kfree(bpf_ops);
errout:
tcf_exts_destroy(&exts);
return ret;
}
static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp,
struct cls_bpf_head *head)
{
unsigned int i = 0x80000000;
do {
if (++head->hgen == 0x7FFFFFFF)
head->hgen = 1;
} while (--i > 0 && cls_bpf_get(tp, head->hgen));
if (i == 0)
pr_err("Insufficient number of handles\n");
return i;
}
static int cls_bpf_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *oldprog = (struct cls_bpf_prog *) *arg;
struct nlattr *tb[TCA_BPF_MAX + 1];
struct cls_bpf_prog *prog;
int ret;
if (tca[TCA_OPTIONS] == NULL)
return -EINVAL;
ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy);
if (ret < 0)
return ret;
prog = kzalloc(sizeof(*prog), GFP_KERNEL);
if (!prog)
return -ENOBUFS;
tcf_exts_init(&prog->exts, TCA_BPF_ACT, TCA_BPF_POLICE);
if (oldprog) {
if (handle && oldprog->handle != handle) {
ret = -EINVAL;
goto errout;
}
}
if (handle == 0)
prog->handle = cls_bpf_grab_new_handle(tp, head);
else
prog->handle = handle;
if (prog->handle == 0) {
ret = -EINVAL;
goto errout;
}
ret = cls_bpf_modify_existing(net, tp, prog, base, tb, tca[TCA_RATE], ovr);
if (ret < 0)
goto errout;
if (oldprog) {
list_replace_rcu(&prog->link, &oldprog->link);
tcf_unbind_filter(tp, &oldprog->res);
call_rcu(&oldprog->rcu, __cls_bpf_delete_prog);
} else {
list_add_rcu(&prog->link, &head->plist);
}
*arg = (unsigned long) prog;
return 0;
errout:
kfree(prog);
return ret;
}
static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *tm)
{
struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh;
struct nlattr *nest, *nla;
if (prog == NULL)
return skb->len;
tm->tcm_handle = prog->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid))
goto nla_put_failure;
if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_len))
goto nla_put_failure;
nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_len *
sizeof(struct sock_filter));
if (nla == NULL)
goto nla_put_failure;
memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla));
if (tcf_exts_dump(skb, &prog->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &prog->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_bpf_head *head = rtnl_dereference(tp->root);
struct cls_bpf_prog *prog;
list_for_each_entry_rcu(prog, &head->plist, link) {
if (arg->count < arg->skip)
goto skip;
if (arg->fn(tp, (unsigned long) prog, arg) < 0) {
arg->stop = 1;
break;
}
skip:
arg->count++;
}
}
static struct tcf_proto_ops cls_bpf_ops __read_mostly = {
.kind = "bpf",
.owner = THIS_MODULE,
.classify = cls_bpf_classify,
.init = cls_bpf_init,
.destroy = cls_bpf_destroy,
.get = cls_bpf_get,
.put = cls_bpf_put,
.change = cls_bpf_change,
.delete = cls_bpf_delete,
.walk = cls_bpf_walk,
.dump = cls_bpf_dump,
};
static int __init cls_bpf_init_mod(void)
{
return register_tcf_proto_ops(&cls_bpf_ops);
}
static void __exit cls_bpf_exit_mod(void)
{
unregister_tcf_proto_ops(&cls_bpf_ops);
}
module_init(cls_bpf_init_mod);
module_exit(cls_bpf_exit_mod);
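cls_bpf_classify() overloads the BPF program's return value: 0 means no match, -1 means a match that keeps the filter's configured classid, and any other value is taken as the classid directly. A small user-space sketch of that convention (illustrative names only):

#include <stdint.h>
#include <stdio.h>

#define NO_MATCH    0u
#define USE_DEFAULT 0xffffffffu	/* the -1 case */

/* 0 = no match (try the next program), 1 = matched and *classid set */
static int pick_classid(uint32_t filter_res, uint32_t configured,
			uint32_t *classid)
{
	if (filter_res == NO_MATCH)
		return 0;
	*classid = (filter_res == USE_DEFAULT) ? configured : filter_res;
	return 1;
}

int main(void)
{
	uint32_t cid;

	if (pick_classid(USE_DEFAULT, 0x10001u, &cid))
		printf("classid 0x%x\n", cid);
	return 0;
}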

239
net/sched/cls_cgroup.c Normal file

@@ -0,0 +1,239 @@
/*
* net/sched/cls_cgroup.c Control Group Classifier
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/rcupdate.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/cls_cgroup.h>
struct cls_cgroup_head {
u32 handle;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
struct tcf_proto *tp;
struct rcu_head rcu;
};
static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_cgroup_head *head = rcu_dereference_bh(tp->root);
u32 classid;
classid = task_cls_state(current)->classid;
/*
* Due to the nature of the classifier it is required to ignore all
* packets originating from softirq context as accessing `current'
* would lead to false results.
*
* This test assumes that all callers of dev_queue_xmit() explicitly
* disable bh. Knowing this, it is possible to detect softirq-based
* calls by looking at the number of nested bh disable calls because
* softirqs always disable bh.
*/
if (in_serving_softirq()) {
/* If there is an sk_classid we'll use that. */
if (!skb->sk)
return -1;
classid = skb->sk->sk_classid;
}
if (!classid)
return -1;
if (!tcf_em_tree_match(skb, &head->ematches, NULL))
return -1;
res->classid = classid;
res->class = 0;
return tcf_exts_exec(skb, &head->exts, res);
}
static unsigned long cls_cgroup_get(struct tcf_proto *tp, u32 handle)
{
return 0UL;
}
static void cls_cgroup_put(struct tcf_proto *tp, unsigned long f)
{
}
static int cls_cgroup_init(struct tcf_proto *tp)
{
return 0;
}
static const struct nla_policy cgroup_policy[TCA_CGROUP_MAX + 1] = {
[TCA_CGROUP_EMATCHES] = { .type = NLA_NESTED },
};
static void cls_cgroup_destroy_rcu(struct rcu_head *root)
{
struct cls_cgroup_head *head = container_of(root,
struct cls_cgroup_head,
rcu);
tcf_exts_destroy(&head->exts);
tcf_em_tree_destroy(&head->ematches);
kfree(head);
}
static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct nlattr *tb[TCA_CGROUP_MAX + 1];
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
struct cls_cgroup_head *new;
struct tcf_ematch_tree t;
struct tcf_exts e;
int err;
if (!tca[TCA_OPTIONS])
return -EINVAL;
if (!head && !handle)
return -EINVAL;
if (head && handle != head->handle)
return -ENOENT;
new = kzalloc(sizeof(*new), GFP_KERNEL);
if (!new)
return -ENOBUFS;
tcf_exts_init(&new->exts, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
if (head)
new->handle = head->handle;
else
new->handle = handle;
new->tp = tp;
err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS],
cgroup_policy);
if (err < 0)
goto errout;
tcf_exts_init(&e, TCA_CGROUP_ACT, TCA_CGROUP_POLICE);
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
goto errout;
err = tcf_em_tree_validate(tp, tb[TCA_CGROUP_EMATCHES], &t);
if (err < 0) {
tcf_exts_destroy(&e);
goto errout;
}
tcf_exts_change(tp, &new->exts, &e);
tcf_em_tree_change(tp, &new->ematches, &t);
rcu_assign_pointer(tp->root, new);
if (head)
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
return 0;
errout:
kfree(new);
return err;
}
static void cls_cgroup_destroy(struct tcf_proto *tp)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (head) {
RCU_INIT_POINTER(tp->root, NULL);
call_rcu(&head->rcu, cls_cgroup_destroy_rcu);
}
}
static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg)
{
return -EOPNOTSUPP;
}
static void cls_cgroup_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
if (arg->count < arg->skip)
goto skip;
if (arg->fn(tp, (unsigned long) head, arg) < 0) {
arg->stop = 1;
return;
}
skip:
arg->count++;
}
static int cls_cgroup_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct cls_cgroup_head *head = rtnl_dereference(tp->root);
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
t->tcm_handle = head->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (tcf_exts_dump(skb, &head->exts) < 0 ||
tcf_em_tree_dump(skb, &head->ematches, TCA_CGROUP_EMATCHES) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &head->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tcf_proto_ops cls_cgroup_ops __read_mostly = {
.kind = "cgroup",
.init = cls_cgroup_init,
.change = cls_cgroup_change,
.classify = cls_cgroup_classify,
.destroy = cls_cgroup_destroy,
.get = cls_cgroup_get,
.put = cls_cgroup_put,
.delete = cls_cgroup_delete,
.walk = cls_cgroup_walk,
.dump = cls_cgroup_dump,
.owner = THIS_MODULE,
};
static int __init init_cgroup_cls(void)
{
return register_tcf_proto_ops(&cls_cgroup_ops);
}
static void __exit exit_cgroup_cls(void)
{
unregister_tcf_proto_ops(&cls_cgroup_ops);
}
module_init(init_cgroup_cls);
module_exit(exit_cgroup_cls);
MODULE_LICENSE("GPL");

698
net/sched/cls_flow.c Normal file
View file

@ -0,0 +1,698 @@
/*
* net/sched/cls_flow.c Generic flow classifier
*
* Copyright (c) 2007, 2008 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/pkt_cls.h>
#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/flow_keys.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netfilter/nf_conntrack.h>
#endif
struct flow_head {
struct list_head filters;
struct rcu_head rcu;
};
struct flow_filter {
struct list_head list;
struct tcf_exts exts;
struct tcf_ematch_tree ematches;
struct tcf_proto *tp;
struct timer_list perturb_timer;
u32 perturb_period;
u32 handle;
u32 nkeys;
u32 keymask;
u32 mode;
u32 mask;
u32 xor;
u32 rshift;
u32 addend;
u32 divisor;
u32 baseclass;
u32 hashrnd;
struct rcu_head rcu;
};
static inline u32 addr_fold(void *addr)
{
unsigned long a = (unsigned long)addr;
return (a & 0xFFFFFFFF) ^ (BITS_PER_LONG > 32 ? a >> 32 : 0);
}
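/* Worked example (assuming BITS_PER_LONG == 64): for a pointer value of
 * 0xffff880012345678, addr_fold() returns
 * 0x12345678 ^ 0xffff8800 = 0xedcbde78, folding both halves into 32 bits.
 */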
static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
if (flow->src)
return ntohl(flow->src);
return addr_fold(skb->sk);
}
static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
if (flow->dst)
return ntohl(flow->dst);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow)
{
return flow->ip_proto;
}
static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
if (flow->ports)
return ntohs(flow->port16[0]);
return addr_fold(skb->sk);
}
static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
if (flow->ports)
return ntohs(flow->port16[1]);
return addr_fold(skb_dst(skb)) ^ (__force u16)skb->protocol;
}
static u32 flow_get_iif(const struct sk_buff *skb)
{
return skb->skb_iif;
}
static u32 flow_get_priority(const struct sk_buff *skb)
{
return skb->priority;
}
static u32 flow_get_mark(const struct sk_buff *skb)
{
return skb->mark;
}
static u32 flow_get_nfct(const struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
return addr_fold(skb->nfct);
#else
return 0;
#endif
}
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#define CTTUPLE(skb, member) \
({ \
enum ip_conntrack_info ctinfo; \
const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); \
if (ct == NULL) \
goto fallback; \
ct->tuplehash[CTINFO2DIR(ctinfo)].tuple.member; \
})
#else
#define CTTUPLE(skb, member) \
({ \
goto fallback; \
0; \
})
#endif
static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, src.u3.ip));
case htons(ETH_P_IPV6):
return ntohl(CTTUPLE(skb, src.u3.ip6[3]));
}
fallback:
return flow_get_src(skb, flow);
}
static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
switch (skb->protocol) {
case htons(ETH_P_IP):
return ntohl(CTTUPLE(skb, dst.u3.ip));
case htons(ETH_P_IPV6):
return ntohl(CTTUPLE(skb, dst.u3.ip6[3]));
}
fallback:
return flow_get_dst(skb, flow);
}
static u32 flow_get_nfct_proto_src(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, src.u.all));
fallback:
return flow_get_proto_src(skb, flow);
}
static u32 flow_get_nfct_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow)
{
return ntohs(CTTUPLE(skb, dst.u.all));
fallback:
return flow_get_proto_dst(skb, flow);
}
static u32 flow_get_rtclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
if (skb_dst(skb))
return skb_dst(skb)->tclassid;
#endif
return 0;
}
static u32 flow_get_skuid(const struct sk_buff *skb)
{
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
return from_kuid(&init_user_ns, skuid);
}
return 0;
}
static u32 flow_get_skgid(const struct sk_buff *skb)
{
if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
return from_kgid(&init_user_ns, skgid);
}
return 0;
}
static u32 flow_get_vlan_tag(const struct sk_buff *skb)
{
u16 uninitialized_var(tag);
if (vlan_get_tag(skb, &tag) < 0)
return 0;
return tag & VLAN_VID_MASK;
}
static u32 flow_get_rxhash(struct sk_buff *skb)
{
return skb_get_hash(skb);
}
static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow)
{
switch (key) {
case FLOW_KEY_SRC:
return flow_get_src(skb, flow);
case FLOW_KEY_DST:
return flow_get_dst(skb, flow);
case FLOW_KEY_PROTO:
return flow_get_proto(skb, flow);
case FLOW_KEY_PROTO_SRC:
return flow_get_proto_src(skb, flow);
case FLOW_KEY_PROTO_DST:
return flow_get_proto_dst(skb, flow);
case FLOW_KEY_IIF:
return flow_get_iif(skb);
case FLOW_KEY_PRIORITY:
return flow_get_priority(skb);
case FLOW_KEY_MARK:
return flow_get_mark(skb);
case FLOW_KEY_NFCT:
return flow_get_nfct(skb);
case FLOW_KEY_NFCT_SRC:
return flow_get_nfct_src(skb, flow);
case FLOW_KEY_NFCT_DST:
return flow_get_nfct_dst(skb, flow);
case FLOW_KEY_NFCT_PROTO_SRC:
return flow_get_nfct_proto_src(skb, flow);
case FLOW_KEY_NFCT_PROTO_DST:
return flow_get_nfct_proto_dst(skb, flow);
case FLOW_KEY_RTCLASSID:
return flow_get_rtclassid(skb);
case FLOW_KEY_SKUID:
return flow_get_skuid(skb);
case FLOW_KEY_SKGID:
return flow_get_skgid(skb);
case FLOW_KEY_VLAN_TAG:
return flow_get_vlan_tag(skb);
case FLOW_KEY_RXHASH:
return flow_get_rxhash(skb);
default:
WARN_ON(1);
return 0;
}
}
#define FLOW_KEYS_NEEDED ((1 << FLOW_KEY_SRC) | \
(1 << FLOW_KEY_DST) | \
(1 << FLOW_KEY_PROTO) | \
(1 << FLOW_KEY_PROTO_SRC) | \
(1 << FLOW_KEY_PROTO_DST) | \
(1 << FLOW_KEY_NFCT_SRC) | \
(1 << FLOW_KEY_NFCT_DST) | \
(1 << FLOW_KEY_NFCT_PROTO_SRC) | \
(1 << FLOW_KEY_NFCT_PROTO_DST))
static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct flow_head *head = rcu_dereference_bh(tp->root);
struct flow_filter *f;
u32 keymask;
u32 classid;
unsigned int n, key;
int r;
list_for_each_entry_rcu(f, &head->filters, list) {
u32 keys[FLOW_KEY_MAX + 1];
struct flow_keys flow_keys;
if (!tcf_em_tree_match(skb, &f->ematches, NULL))
continue;
keymask = f->keymask;
if (keymask & FLOW_KEYS_NEEDED)
skb_flow_dissect(skb, &flow_keys);
for (n = 0; n < f->nkeys; n++) {
key = ffs(keymask) - 1;
keymask &= ~(1 << key);
keys[n] = flow_key_get(skb, key, &flow_keys);
}
if (f->mode == FLOW_MODE_HASH)
classid = jhash2(keys, f->nkeys, f->hashrnd);
else {
classid = keys[0];
classid = (classid & f->mask) ^ f->xor;
classid = (classid >> f->rshift) + f->addend;
}
if (f->divisor)
classid %= f->divisor;
res->class = 0;
res->classid = TC_H_MAKE(f->baseclass, f->baseclass + classid);
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
return r;
}
return -1;
}
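/* Map-mode arithmetic above, worked through on illustrative numbers:
 * with mask 0xff, xor 0x0f, rshift 2, addend 1 and a key of 0x2c,
 * ((0x2c & 0xff) ^ 0x0f) = 0x23 and (0x23 >> 2) + 1 = 9; the divisor
 * (if set) is then applied modulo, and TC_H_MAKE() rebases the result
 * on baseclass.
 */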
static void flow_perturbation(unsigned long arg)
{
struct flow_filter *f = (struct flow_filter *)arg;
get_random_bytes(&f->hashrnd, 4);
if (f->perturb_period)
mod_timer(&f->perturb_timer, jiffies + f->perturb_period);
}
static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = {
[TCA_FLOW_KEYS] = { .type = NLA_U32 },
[TCA_FLOW_MODE] = { .type = NLA_U32 },
[TCA_FLOW_BASECLASS] = { .type = NLA_U32 },
[TCA_FLOW_RSHIFT] = { .type = NLA_U32 },
[TCA_FLOW_ADDEND] = { .type = NLA_U32 },
[TCA_FLOW_MASK] = { .type = NLA_U32 },
[TCA_FLOW_XOR] = { .type = NLA_U32 },
[TCA_FLOW_DIVISOR] = { .type = NLA_U32 },
[TCA_FLOW_ACT] = { .type = NLA_NESTED },
[TCA_FLOW_POLICE] = { .type = NLA_NESTED },
[TCA_FLOW_EMATCHES] = { .type = NLA_NESTED },
[TCA_FLOW_PERTURB] = { .type = NLA_U32 },
};
static void flow_destroy_filter(struct rcu_head *head)
{
struct flow_filter *f = container_of(head, struct flow_filter, rcu);
del_timer_sync(&f->perturb_timer);
tcf_exts_destroy(&f->exts);
tcf_em_tree_destroy(&f->ematches);
kfree(f);
}
static int flow_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle, struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *fold, *fnew;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FLOW_MAX + 1];
struct tcf_exts e;
struct tcf_ematch_tree t;
unsigned int nkeys = 0;
unsigned int perturb_period = 0;
u32 baseclass = 0;
u32 keymask = 0;
u32 mode;
int err;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy);
if (err < 0)
return err;
if (tb[TCA_FLOW_BASECLASS]) {
baseclass = nla_get_u32(tb[TCA_FLOW_BASECLASS]);
if (TC_H_MIN(baseclass) == 0)
return -EINVAL;
}
if (tb[TCA_FLOW_KEYS]) {
keymask = nla_get_u32(tb[TCA_FLOW_KEYS]);
nkeys = hweight32(keymask);
if (nkeys == 0)
return -EINVAL;
if (fls(keymask) - 1 > FLOW_KEY_MAX)
return -EOPNOTSUPP;
if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) &&
sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns)
return -EOPNOTSUPP;
}
tcf_exts_init(&e, TCA_FLOW_ACT, TCA_FLOW_POLICE);
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
err = tcf_em_tree_validate(tp, tb[TCA_FLOW_EMATCHES], &t);
if (err < 0)
goto err1;
err = -ENOBUFS;
fnew = kzalloc(sizeof(*fnew), GFP_KERNEL);
if (!fnew)
goto err2;
fold = (struct flow_filter *)*arg;
if (fold) {
err = -EINVAL;
if (fold->handle != handle && handle)
goto err2;
/* Copy fold into fnew */
fnew->handle = fold->handle;
fnew->keymask = fold->keymask;
fnew->tp = fold->tp;
fnew->nkeys = fold->nkeys;
fnew->mode = fold->mode;
fnew->mask = fold->mask;
fnew->xor = fold->xor;
fnew->rshift = fold->rshift;
fnew->addend = fold->addend;
fnew->divisor = fold->divisor;
fnew->baseclass = fold->baseclass;
fnew->hashrnd = fold->hashrnd;
mode = fold->mode;
if (tb[TCA_FLOW_MODE])
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
if (mode != FLOW_MODE_HASH && nkeys > 1)
goto err2;
if (mode == FLOW_MODE_HASH)
perturb_period = fold->perturb_period;
if (tb[TCA_FLOW_PERTURB]) {
if (mode != FLOW_MODE_HASH)
goto err2;
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
}
} else {
err = -EINVAL;
if (!handle)
goto err2;
if (!tb[TCA_FLOW_KEYS])
goto err2;
mode = FLOW_MODE_MAP;
if (tb[TCA_FLOW_MODE])
mode = nla_get_u32(tb[TCA_FLOW_MODE]);
if (mode != FLOW_MODE_HASH && nkeys > 1)
goto err2;
if (tb[TCA_FLOW_PERTURB]) {
if (mode != FLOW_MODE_HASH)
goto err2;
perturb_period = nla_get_u32(tb[TCA_FLOW_PERTURB]) * HZ;
}
if (TC_H_MAJ(baseclass) == 0)
baseclass = TC_H_MAKE(tp->q->handle, baseclass);
if (TC_H_MIN(baseclass) == 0)
baseclass = TC_H_MAKE(baseclass, 1);
fnew->handle = handle;
fnew->mask = ~0U;
fnew->tp = tp;
get_random_bytes(&fnew->hashrnd, 4);
tcf_exts_init(&fnew->exts, TCA_FLOW_ACT, TCA_FLOW_POLICE);
}
fnew->perturb_timer.function = flow_perturbation;
fnew->perturb_timer.data = (unsigned long)fnew;
init_timer_deferrable(&fnew->perturb_timer);
tcf_exts_change(tp, &fnew->exts, &e);
tcf_em_tree_change(tp, &fnew->ematches, &t);
netif_keep_dst(qdisc_dev(tp->q));
if (tb[TCA_FLOW_KEYS]) {
fnew->keymask = keymask;
fnew->nkeys = nkeys;
}
fnew->mode = mode;
if (tb[TCA_FLOW_MASK])
fnew->mask = nla_get_u32(tb[TCA_FLOW_MASK]);
if (tb[TCA_FLOW_XOR])
fnew->xor = nla_get_u32(tb[TCA_FLOW_XOR]);
if (tb[TCA_FLOW_RSHIFT])
fnew->rshift = nla_get_u32(tb[TCA_FLOW_RSHIFT]);
if (tb[TCA_FLOW_ADDEND])
fnew->addend = nla_get_u32(tb[TCA_FLOW_ADDEND]);
if (tb[TCA_FLOW_DIVISOR])
fnew->divisor = nla_get_u32(tb[TCA_FLOW_DIVISOR]);
if (baseclass)
fnew->baseclass = baseclass;
fnew->perturb_period = perturb_period;
if (perturb_period)
mod_timer(&fnew->perturb_timer, jiffies + perturb_period);
if (*arg == 0)
list_add_tail_rcu(&fnew->list, &head->filters);
else
list_replace_rcu(&fold->list, &fnew->list);
*arg = (unsigned long)fnew;
if (fold)
call_rcu(&fold->rcu, flow_destroy_filter);
return 0;
err2:
tcf_em_tree_destroy(&t);
kfree(fnew);
err1:
tcf_exts_destroy(&e);
return err;
}
static int flow_delete(struct tcf_proto *tp, unsigned long arg)
{
struct flow_filter *f = (struct flow_filter *)arg;
list_del_rcu(&f->list);
call_rcu(&f->rcu, flow_destroy_filter);
return 0;
}
static int flow_init(struct tcf_proto *tp)
{
struct flow_head *head;
head = kzalloc(sizeof(*head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
INIT_LIST_HEAD(&head->filters);
rcu_assign_pointer(tp->root, head);
return 0;
}
static void flow_destroy(struct tcf_proto *tp)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f, *next;
list_for_each_entry_safe(f, next, &head->filters, list) {
list_del_rcu(&f->list);
call_rcu(&f->rcu, flow_destroy_filter);
}
RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
}
static unsigned long flow_get(struct tcf_proto *tp, u32 handle)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
list_for_each_entry_rcu(f, &head->filters, list)
if (f->handle == handle)
return (unsigned long)f;
return 0;
}
static void flow_put(struct tcf_proto *tp, unsigned long f)
{
}
static int flow_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct flow_filter *f = (struct flow_filter *)fh;
struct nlattr *nest;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_FLOW_KEYS, f->keymask) ||
nla_put_u32(skb, TCA_FLOW_MODE, f->mode))
goto nla_put_failure;
if (f->mask != ~0 || f->xor != 0) {
if (nla_put_u32(skb, TCA_FLOW_MASK, f->mask) ||
nla_put_u32(skb, TCA_FLOW_XOR, f->xor))
goto nla_put_failure;
}
if (f->rshift &&
nla_put_u32(skb, TCA_FLOW_RSHIFT, f->rshift))
goto nla_put_failure;
if (f->addend &&
nla_put_u32(skb, TCA_FLOW_ADDEND, f->addend))
goto nla_put_failure;
if (f->divisor &&
nla_put_u32(skb, TCA_FLOW_DIVISOR, f->divisor))
goto nla_put_failure;
if (f->baseclass &&
nla_put_u32(skb, TCA_FLOW_BASECLASS, f->baseclass))
goto nla_put_failure;
if (f->perturb_period &&
nla_put_u32(skb, TCA_FLOW_PERTURB, f->perturb_period / HZ))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
#ifdef CONFIG_NET_EMATCH
if (f->ematches.hdr.nmatches &&
tcf_em_tree_dump(skb, &f->ematches, TCA_FLOW_EMATCHES) < 0)
goto nla_put_failure;
#endif
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct flow_head *head = rtnl_dereference(tp->root);
struct flow_filter *f;
list_for_each_entry_rcu(f, &head->filters, list) {
if (arg->count < arg->skip)
goto skip;
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
break;
}
skip:
arg->count++;
}
}
static struct tcf_proto_ops cls_flow_ops __read_mostly = {
.kind = "flow",
.classify = flow_classify,
.init = flow_init,
.destroy = flow_destroy,
.change = flow_change,
.delete = flow_delete,
.get = flow_get,
.put = flow_put,
.dump = flow_dump,
.walk = flow_walk,
.owner = THIS_MODULE,
};
static int __init cls_flow_init(void)
{
return register_tcf_proto_ops(&cls_flow_ops);
}
static void __exit cls_flow_exit(void)
{
unregister_tcf_proto_ops(&cls_flow_ops);
}
module_init(cls_flow_init);
module_exit(cls_flow_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("TC flow classifier");
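/* A usage sketch (illustrative values): hash flows over the 5-tuple
 * into 1024 buckets, as parsed by flow_change() above:
 *
 *   tc filter add dev eth0 parent 1: handle 1 flow hash \
 *      keys src,dst,proto,proto-src,proto-dst divisor 1024
 */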

434
net/sched/cls_fw.c Normal file
View file

@ -0,0 +1,434 @@
/*
* net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_walk off by one
* Karlis Peisenieks <karlis@mt.lv> : 990415 : fw_delete killed all the filter (and kernel).
* Alex <alex@pilotsoft.com> : 2004xxyy: Added Action extension
*
* JHS: We should remove the CONFIG_NET_CLS_IND from here
* eventually when the meta match extension is made available
*
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#define HTSIZE 256
struct fw_head {
u32 mask;
struct fw_filter __rcu *ht[HTSIZE];
struct rcu_head rcu;
};
struct fw_filter {
struct fw_filter __rcu *next;
u32 id;
struct tcf_result res;
#ifdef CONFIG_NET_CLS_IND
int ifindex;
#endif /* CONFIG_NET_CLS_IND */
struct tcf_exts exts;
struct tcf_proto *tp;
struct rcu_head rcu;
};
static u32 fw_hash(u32 handle)
{
handle ^= (handle >> 16);
handle ^= (handle >> 8);
return handle % HTSIZE;
}
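/* Worked example: fw_hash(0x00012345) yields
 * 0x00012345 ^ 0x00000001 = 0x00012344, then
 * 0x00012344 ^ 0x00000123 = 0x00012267, and 0x00012267 % 256 = 0x67,
 * so marks that differ only above the low byte still spread over buckets.
 */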
static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct fw_head *head = rcu_dereference_bh(tp->root);
struct fw_filter *f;
int r;
u32 id = skb->mark;
if (head != NULL) {
id &= head->mask;
for (f = rcu_dereference_bh(head->ht[fw_hash(id)]); f;
f = rcu_dereference_bh(f->next)) {
if (f->id == id) {
*res = f->res;
#ifdef CONFIG_NET_CLS_IND
if (!tcf_match_indev(skb, f->ifindex))
continue;
#endif /* CONFIG_NET_CLS_IND */
r = tcf_exts_exec(skb, &f->exts, res);
if (r < 0)
continue;
return r;
}
}
} else {
/* old method */
if (id && (TC_H_MAJ(id) == 0 ||
!(TC_H_MAJ(id ^ tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
}
}
return -1;
}
static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f;
if (head == NULL)
return 0;
f = rtnl_dereference(head->ht[fw_hash(handle)]);
for (; f; f = rtnl_dereference(f->next)) {
if (f->id == handle)
return (unsigned long)f;
}
return 0;
}
static void fw_put(struct tcf_proto *tp, unsigned long f)
{
}
static int fw_init(struct tcf_proto *tp)
{
return 0;
}
static void fw_delete_filter(struct rcu_head *head)
{
struct fw_filter *f = container_of(head, struct fw_filter, rcu);
tcf_exts_destroy(&f->exts);
kfree(f);
}
static void fw_destroy(struct tcf_proto *tp)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f;
int h;
if (head == NULL)
return;
for (h = 0; h < HTSIZE; h++) {
while ((f = rtnl_dereference(head->ht[h])) != NULL) {
RCU_INIT_POINTER(head->ht[h],
rtnl_dereference(f->next));
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fw_delete_filter);
}
}
RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
}
static int fw_delete(struct tcf_proto *tp, unsigned long arg)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *)arg;
struct fw_filter __rcu **fp;
struct fw_filter *pfp;
if (head == NULL || f == NULL)
goto out;
fp = &head->ht[fw_hash(f->id)];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
if (pfp == f) {
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fw_delete_filter);
return 0;
}
}
out:
return -EINVAL;
}
static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = {
[TCA_FW_CLASSID] = { .type = NLA_U32 },
[TCA_FW_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_FW_MASK] = { .type = NLA_U32 },
};
static int
fw_change_attrs(struct net *net, struct tcf_proto *tp, struct fw_filter *f,
struct nlattr **tb, struct nlattr **tca, unsigned long base, bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct tcf_exts e;
u32 mask;
int err;
tcf_exts_init(&e, TCA_FW_ACT, TCA_FW_POLICE);
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
if (tb[TCA_FW_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_FW_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
#ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FW_INDEV]) {
int ret;
ret = tcf_change_indev(net, tb[TCA_FW_INDEV]);
if (ret < 0) {
err = ret;
goto errout;
}
f->ifindex = ret;
}
#endif /* CONFIG_NET_CLS_IND */
err = -EINVAL;
if (tb[TCA_FW_MASK]) {
mask = nla_get_u32(tb[TCA_FW_MASK]);
if (mask != head->mask)
goto errout;
} else if (head->mask != 0xFFFFFFFF)
goto errout;
tcf_exts_change(tp, &f->exts, &e);
return 0;
errout:
tcf_exts_destroy(&e);
return err;
}
static int fw_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *) *arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_FW_MAX + 1];
int err;
if (!opt)
return handle ? -EINVAL : 0;
err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);
if (err < 0)
return err;
if (f) {
struct fw_filter *pfp, *fnew;
struct fw_filter __rcu **fp;
if (f->id != handle && handle)
return -EINVAL;
fnew = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
if (!fnew)
return -ENOBUFS;
fnew->id = f->id;
fnew->res = f->res;
#ifdef CONFIG_NET_CLS_IND
fnew->ifindex = f->ifindex;
#endif /* CONFIG_NET_CLS_IND */
fnew->tp = f->tp;
tcf_exts_init(&fnew->exts, TCA_FW_ACT, TCA_FW_POLICE);
err = fw_change_attrs(net, tp, fnew, tb, tca, base, ovr);
if (err < 0) {
kfree(fnew);
return err;
}
fp = &head->ht[fw_hash(fnew->id)];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp))
if (pfp == f)
break;
RCU_INIT_POINTER(fnew->next, rtnl_dereference(pfp->next));
rcu_assign_pointer(*fp, fnew);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fw_delete_filter);
*arg = (unsigned long)fnew;
return err;
}
if (!handle)
return -EINVAL;
if (head == NULL) {
u32 mask = 0xFFFFFFFF;
if (tb[TCA_FW_MASK])
mask = nla_get_u32(tb[TCA_FW_MASK]);
head = kzalloc(sizeof(struct fw_head), GFP_KERNEL);
if (head == NULL)
return -ENOBUFS;
head->mask = mask;
rcu_assign_pointer(tp->root, head);
}
f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL);
if (f == NULL)
return -ENOBUFS;
tcf_exts_init(&f->exts, TCA_FW_ACT, TCA_FW_POLICE);
f->id = handle;
f->tp = tp;
err = fw_change_attrs(net, tp, f, tb, tca, base, ovr);
if (err < 0)
goto errout;
RCU_INIT_POINTER(f->next, head->ht[fw_hash(handle)]);
rcu_assign_pointer(head->ht[fw_hash(handle)], f);
*arg = (unsigned long)f;
return 0;
errout:
kfree(f);
return err;
}
static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct fw_head *head = rtnl_dereference(tp->root);
int h;
if (head == NULL)
arg->stop = 1;
if (arg->stop)
return;
for (h = 0; h < HTSIZE; h++) {
struct fw_filter *f;
for (f = rtnl_dereference(head->ht[h]); f;
f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
static int fw_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct fw_head *head = rtnl_dereference(tp->root);
struct fw_filter *f = (struct fw_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
if (f == NULL)
return skb->len;
t->tcm_handle = f->id;
if (!f->res.classid && !tcf_exts_is_available(&f->exts))
return skb->len;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (f->res.classid &&
nla_put_u32(skb, TCA_FW_CLASSID, f->res.classid))
goto nla_put_failure;
#ifdef CONFIG_NET_CLS_IND
if (f->ifindex) {
struct net_device *dev;
dev = __dev_get_by_index(net, f->ifindex);
if (dev && nla_put_string(skb, TCA_FW_INDEV, dev->name))
goto nla_put_failure;
}
#endif /* CONFIG_NET_CLS_IND */
if (head->mask != 0xFFFFFFFF &&
nla_put_u32(skb, TCA_FW_MASK, head->mask))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tcf_proto_ops cls_fw_ops __read_mostly = {
.kind = "fw",
.classify = fw_classify,
.init = fw_init,
.destroy = fw_destroy,
.get = fw_get,
.put = fw_put,
.change = fw_change,
.delete = fw_delete,
.walk = fw_walk,
.dump = fw_dump,
.owner = THIS_MODULE,
};
static int __init init_fw(void)
{
return register_tcf_proto_ops(&cls_fw_ops);
}
static void __exit exit_fw(void)
{
unregister_tcf_proto_ops(&cls_fw_ops);
}
module_init(init_fw)
module_exit(exit_fw)
MODULE_LICENSE("GPL");

672
net/sched/cls_route.c Normal file
View file

@ -0,0 +1,672 @@
/*
* net/sched/cls_route.c ROUTE4 classifier.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/dst.h>
#include <net/route.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
/*
* 1. For now we assume that route tags < 256.
* This allows direct table lookups instead of hash tables.
* 2. For now we assume that "from TAG" and "fromdev DEV" statements
* are mutually exclusive.
* 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
*/
struct route4_fastmap {
struct route4_filter *filter;
u32 id;
int iif;
};
struct route4_head {
struct route4_fastmap fastmap[16];
struct route4_bucket __rcu *table[256 + 1];
struct rcu_head rcu;
};
struct route4_bucket {
/* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
struct route4_filter __rcu *ht[16 + 16 + 1];
struct rcu_head rcu;
};
struct route4_filter {
struct route4_filter __rcu *next;
u32 id;
int iif;
struct tcf_result res;
struct tcf_exts exts;
u32 handle;
struct route4_bucket *bkt;
struct tcf_proto *tp;
struct rcu_head rcu;
};
#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
static inline int route4_fastmap_hash(u32 id, int iif)
{
return id & 0xF;
}
static DEFINE_SPINLOCK(fastmap_lock);
static void
route4_reset_fastmap(struct route4_head *head)
{
spin_lock_bh(&fastmap_lock);
memset(head->fastmap, 0, sizeof(head->fastmap));
spin_unlock_bh(&fastmap_lock);
}
static void
route4_set_fastmap(struct route4_head *head, u32 id, int iif,
struct route4_filter *f)
{
int h = route4_fastmap_hash(id, iif);
/* fastmap updates must look atomic to align id, iif, filter */
spin_lock_bh(&fastmap_lock);
head->fastmap[h].id = id;
head->fastmap[h].iif = iif;
head->fastmap[h].filter = f;
spin_unlock_bh(&fastmap_lock);
}
static inline int route4_hash_to(u32 id)
{
return id & 0xFF;
}
static inline int route4_hash_from(u32 id)
{
return (id >> 16) & 0xF;
}
static inline int route4_hash_iif(int iif)
{
return 16 + ((iif >> 16) & 0xF);
}
static inline int route4_hash_wild(void)
{
return 32;
}
#define ROUTE4_APPLY_RESULT() \
{ \
*res = f->res; \
if (tcf_exts_is_available(&f->exts)) { \
int r = tcf_exts_exec(skb, &f->exts, res); \
if (r < 0) { \
dont_cache = 1; \
continue; \
} \
return r; \
} else if (!dont_cache) \
route4_set_fastmap(head, id, iif, f); \
return 0; \
}
static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct route4_head *head = rcu_dereference_bh(tp->root);
struct dst_entry *dst;
struct route4_bucket *b;
struct route4_filter *f;
u32 id, h;
int iif, dont_cache = 0;
dst = skb_dst(skb);
if (!dst)
goto failure;
id = dst->tclassid;
if (head == NULL)
goto old_method;
iif = inet_iif(skb);
h = route4_fastmap_hash(id, iif);
spin_lock(&fastmap_lock);
if (id == head->fastmap[h].id &&
iif == head->fastmap[h].iif &&
(f = head->fastmap[h].filter) != NULL) {
if (f == ROUTE4_FAILURE) {
spin_unlock(&fastmap_lock);
goto failure;
}
*res = f->res;
spin_unlock(&fastmap_lock);
return 0;
}
spin_unlock(&fastmap_lock);
h = route4_hash_to(id);
restart:
b = rcu_dereference_bh(head->table[h]);
if (b) {
for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]);
f;
f = rcu_dereference_bh(f->next))
if (f->id == id)
ROUTE4_APPLY_RESULT();
for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]);
f;
f = rcu_dereference_bh(f->next))
if (f->iif == iif)
ROUTE4_APPLY_RESULT();
for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]);
f;
f = rcu_dereference_bh(f->next))
ROUTE4_APPLY_RESULT();
}
if (h < 256) {
h = 256;
id &= ~0xFFFF;
goto restart;
}
if (!dont_cache)
route4_set_fastmap(head, id, iif, ROUTE4_FAILURE);
failure:
return -1;
old_method:
if (id && (TC_H_MAJ(id) == 0 ||
!(TC_H_MAJ(id^tp->q->handle)))) {
res->classid = id;
res->class = 0;
return 0;
}
return -1;
}
static inline u32 to_hash(u32 id)
{
u32 h = id & 0xFF;
if (id & 0x8000)
h += 256;
return h;
}
static inline u32 from_hash(u32 id)
{
id &= 0xFFFF;
if (id == 0xFFFF)
return 32;
if (!(id & 0x8000)) {
if (id > 255)
return 256;
return id & 0xF;
}
return 16 + (id & 0xF);
}
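/* Handle layout consumed by the two hashes above (assembled in
 * route4_set_parms() below): bits 0-7 carry the TO realm, bit 15 is set
 * when no TO realm was given, and bits 16-31 carry the FROM realm, the
 * IIF with 0x8000 or'ed in, or 0xFFFF for a wildcard source. E.g.
 * "to 2 from 10" yields handle 0x000a0002.
 */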
static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_bucket *b;
struct route4_filter *f;
unsigned int h1, h2;
if (!head)
return 0;
h1 = to_hash(handle);
if (h1 > 256)
return 0;
h2 = from_hash(handle >> 16);
if (h2 > 32)
return 0;
b = rtnl_dereference(head->table[h1]);
if (b) {
for (f = rtnl_dereference(b->ht[h2]);
f;
f = rtnl_dereference(f->next))
if (f->handle == handle)
return (unsigned long)f;
}
return 0;
}
static void route4_put(struct tcf_proto *tp, unsigned long f)
{
}
static int route4_init(struct tcf_proto *tp)
{
return 0;
}
static void
route4_delete_filter(struct rcu_head *head)
{
struct route4_filter *f = container_of(head, struct route4_filter, rcu);
tcf_exts_destroy(&f->exts);
kfree(f);
}
static void route4_destroy(struct tcf_proto *tp)
{
struct route4_head *head = rtnl_dereference(tp->root);
int h1, h2;
if (head == NULL)
return;
for (h1 = 0; h1 <= 256; h1++) {
struct route4_bucket *b;
b = rtnl_dereference(head->table[h1]);
if (b) {
for (h2 = 0; h2 <= 32; h2++) {
struct route4_filter *f;
while ((f = rtnl_dereference(b->ht[h2])) != NULL) {
struct route4_filter *next;
next = rtnl_dereference(f->next);
RCU_INIT_POINTER(b->ht[h2], next);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, route4_delete_filter);
}
}
RCU_INIT_POINTER(head->table[h1], NULL);
kfree_rcu(b, rcu);
}
}
RCU_INIT_POINTER(tp->root, NULL);
kfree_rcu(head, rcu);
}
static int route4_delete(struct tcf_proto *tp, unsigned long arg)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter *f = (struct route4_filter *)arg;
struct route4_filter __rcu **fp;
struct route4_filter *nf;
struct route4_bucket *b;
unsigned int h = 0;
int i;
if (!head || !f)
return -EINVAL;
h = f->handle;
b = f->bkt;
fp = &b->ht[from_hash(h >> 16)];
for (nf = rtnl_dereference(*fp); nf;
fp = &nf->next, nf = rtnl_dereference(*fp)) {
if (nf == f) {
/* unlink it */
RCU_INIT_POINTER(*fp, rtnl_dereference(f->next));
/* Remove any fastmap lookups that might reference the filter;
 * note that we just unlinked it, so it cannot find its way
 * back into the fastmap.
 */
route4_reset_fastmap(head);
/* Delete it */
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, route4_delete_filter);
/* Strip RTNL protected tree */
for (i = 0; i <= 32; i++) {
struct route4_filter *rt;
rt = rtnl_dereference(b->ht[i]);
if (rt)
return 0;
}
/* OK, session has no flows */
RCU_INIT_POINTER(head->table[to_hash(h)], NULL);
kfree_rcu(b, rcu);
return 0;
}
}
return 0;
}
static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = {
[TCA_ROUTE4_CLASSID] = { .type = NLA_U32 },
[TCA_ROUTE4_TO] = { .type = NLA_U32 },
[TCA_ROUTE4_FROM] = { .type = NLA_U32 },
[TCA_ROUTE4_IIF] = { .type = NLA_U32 },
};
static int route4_set_parms(struct net *net, struct tcf_proto *tp,
unsigned long base, struct route4_filter *f,
u32 handle, struct route4_head *head,
struct nlattr **tb, struct nlattr *est, int new,
bool ovr)
{
int err;
u32 id = 0, to = 0, nhandle = 0x8000;
struct route4_filter *fp;
unsigned int h1;
struct route4_bucket *b;
struct tcf_exts e;
tcf_exts_init(&e, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
err = -EINVAL;
if (tb[TCA_ROUTE4_TO]) {
if (new && handle & 0x8000)
goto errout;
to = nla_get_u32(tb[TCA_ROUTE4_TO]);
if (to > 0xFF)
goto errout;
nhandle = to;
}
if (tb[TCA_ROUTE4_FROM]) {
if (tb[TCA_ROUTE4_IIF])
goto errout;
id = nla_get_u32(tb[TCA_ROUTE4_FROM]);
if (id > 0xFF)
goto errout;
nhandle |= id << 16;
} else if (tb[TCA_ROUTE4_IIF]) {
id = nla_get_u32(tb[TCA_ROUTE4_IIF]);
if (id > 0x7FFF)
goto errout;
nhandle |= (id | 0x8000) << 16;
} else
nhandle |= 0xFFFF << 16;
if (handle && new) {
nhandle |= handle & 0x7F00;
if (nhandle != handle)
goto errout;
}
h1 = to_hash(nhandle);
b = rtnl_dereference(head->table[h1]);
if (!b) {
err = -ENOBUFS;
b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
if (b == NULL)
goto errout;
rcu_assign_pointer(head->table[h1], b);
} else {
unsigned int h2 = from_hash(nhandle >> 16);
err = -EEXIST;
for (fp = rtnl_dereference(b->ht[h2]);
fp;
fp = rtnl_dereference(fp->next))
if (fp->handle == f->handle)
goto errout;
}
if (tb[TCA_ROUTE4_TO])
f->id = to;
if (tb[TCA_ROUTE4_FROM])
f->id = to | id<<16;
else if (tb[TCA_ROUTE4_IIF])
f->iif = id;
f->handle = nhandle;
f->bkt = b;
f->tp = tp;
if (tb[TCA_ROUTE4_CLASSID]) {
f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]);
tcf_bind_filter(tp, &f->res, base);
}
tcf_exts_change(tp, &f->exts, &e);
return 0;
errout:
tcf_exts_destroy(&e);
return err;
}
static int route4_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct route4_head *head = rtnl_dereference(tp->root);
struct route4_filter __rcu **fp;
struct route4_filter *fold, *f1, *pfp, *f = NULL;
struct route4_bucket *b;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ROUTE4_MAX + 1];
unsigned int h, th;
int err;
bool new = true;
if (opt == NULL)
return handle ? -EINVAL : 0;
err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy);
if (err < 0)
return err;
fold = (struct route4_filter *)*arg;
if (fold && handle && fold->handle != handle)
return -EINVAL;
err = -ENOBUFS;
if (head == NULL) {
head = kzalloc(sizeof(struct route4_head), GFP_KERNEL);
if (head == NULL)
goto errout;
rcu_assign_pointer(tp->root, head);
}
f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL);
if (!f)
goto errout;
tcf_exts_init(&f->exts, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE);
if (fold) {
f->id = fold->id;
f->iif = fold->iif;
f->res = fold->res;
f->handle = fold->handle;
f->tp = fold->tp;
f->bkt = fold->bkt;
new = false;
}
err = route4_set_parms(net, tp, base, f, handle, head, tb,
tca[TCA_RATE], new, ovr);
if (err < 0)
goto errout;
h = from_hash(f->handle >> 16);
fp = &f->bkt->ht[h];
for (pfp = rtnl_dereference(*fp);
(f1 = rtnl_dereference(*fp)) != NULL;
fp = &f1->next)
if (f->handle < f1->handle)
break;
netif_keep_dst(qdisc_dev(tp->q));
rcu_assign_pointer(f->next, f1);
rcu_assign_pointer(*fp, f);
if (fold && fold->handle && f->handle != fold->handle) {
th = to_hash(fold->handle);
h = from_hash(fold->handle >> 16);
b = rtnl_dereference(head->table[th]);
if (b) {
fp = &b->ht[h];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
if (pfp == fold) {
RCU_INIT_POINTER(*fp, rtnl_dereference(fold->next));
break;
}
}
}
}
route4_reset_fastmap(head);
*arg = (unsigned long)f;
if (fold) {
tcf_unbind_filter(tp, &fold->res);
call_rcu(&fold->rcu, route4_delete_filter);
}
return 0;
errout:
kfree(f);
return err;
}
static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct route4_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
if (head == NULL)
arg->stop = 1;
if (arg->stop)
return;
for (h = 0; h <= 256; h++) {
struct route4_bucket *b = rtnl_dereference(head->table[h]);
if (b) {
for (h1 = 0; h1 <= 32; h1++) {
struct route4_filter *f;
for (f = rtnl_dereference(b->ht[h1]);
f;
f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
}
}
static int route4_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct route4_filter *f = (struct route4_filter *)fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
u32 id;
if (f == NULL)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (!(f->handle & 0x8000)) {
id = f->id & 0xFF;
if (nla_put_u32(skb, TCA_ROUTE4_TO, id))
goto nla_put_failure;
}
if (f->handle & 0x80000000) {
if ((f->handle >> 16) != 0xFFFF &&
nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif))
goto nla_put_failure;
} else {
id = f->id >> 16;
if (nla_put_u32(skb, TCA_ROUTE4_FROM, id))
goto nla_put_failure;
}
if (f->res.classid &&
nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tcf_proto_ops cls_route4_ops __read_mostly = {
.kind = "route",
.classify = route4_classify,
.init = route4_init,
.destroy = route4_destroy,
.get = route4_get,
.put = route4_put,
.change = route4_change,
.delete = route4_delete,
.walk = route4_walk,
.dump = route4_dump,
.owner = THIS_MODULE,
};
static int __init init_route4(void)
{
return register_tcf_proto_ops(&cls_route4_ops);
}
static void __exit exit_route4(void)
{
unregister_tcf_proto_ops(&cls_route4_ops);
}
module_init(init_route4)
module_exit(exit_route4)
MODULE_LICENSE("GPL");

28
net/sched/cls_rsvp.c Normal file
View file

@ -0,0 +1,28 @@
/*
* net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#define RSVP_DST_LEN 1
#define RSVP_ID "rsvp"
#define RSVP_OPS cls_rsvp_ops
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");

730
net/sched/cls_rsvp.h Normal file
View file

@ -0,0 +1,730 @@
/*
* net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
/*
Compared to the general packet classification problem,
RSVP needs only several relatively simple rules:
* (dst, protocol) are always specified,
so that we are able to hash them.
* src may be exact, or may be wildcard, so that
we can keep a hash table plus one wildcard entry.
* source port (or flow label) is important only if src is given.
IMPLEMENTATION.
We use a two-level hash table: the top level is keyed by
destination address and protocol ID, every bucket contains a list
of "rsvp sessions", identified by destination address, protocol and
DPI(="Destination Port ID"): triple (key, mask, offset).
Every bucket has a smaller hash table keyed by source address
(cf. RSVP flowspec) and one wildcard entry for wildcard reservations.
Every bucket is again a list of "RSVP flows", selected by
source address and SPI(="Source Port ID" here rather than
"security parameter index"): triple (key, mask, offset).
NOTE 1. All the packets with IPv6 extension headers (but AH and ESP)
and all fragmented packets go to the best-effort traffic class.
NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires
only one "Generalized Port Identifier". So that for classic
ah, esp (and udp,tcp) both *pi should coincide or one of them
should be wildcard.
At first sight, this redundancy is just a waste of CPU
resources. But DPI and SPI add the possibility to assign different
priorities to GPIs. Look also at note 4 about tunnels below.
NOTE 3. One complication is the case of tunneled packets.
We implement it as follows: if the first lookup
matches a special session with a non-zero "tunnelhdr" value,
flowid doesn't contain the true flow ID, but the tunnel ID (1...255).
In this case, we pull tunnelhdr bytes and restart lookup
with tunnel ID added to the list of keys. Simple and stupid 8)8)
It's enough for PIMREG and IPIP.
NOTE 4. Two GPIs make it possible to parse even GRE packets.
E.g. DPI can select ETH_P_IP (and the necessary flags to make
tunnelhdr correct) in the GRE protocol field, and SPI matches the
GRE key. Is it not nice? 8)8)
Well, as a result, despite its simplicity, we get a pretty
powerful classification engine. */
struct rsvp_head {
u32 tmap[256/32];
u32 hgenerator;
u8 tgenerator;
struct rsvp_session __rcu *ht[256];
struct rcu_head rcu;
};
struct rsvp_session {
struct rsvp_session __rcu *next;
__be32 dst[RSVP_DST_LEN];
struct tc_rsvp_gpi dpi;
u8 protocol;
u8 tunnelid;
/* 16 (src,sport) hash slots, and one wildcard source slot */
struct rsvp_filter __rcu *ht[16 + 1];
struct rcu_head rcu;
};
struct rsvp_filter {
struct rsvp_filter __rcu *next;
__be32 src[RSVP_DST_LEN];
struct tc_rsvp_gpi spi;
u8 tunnelhdr;
struct tcf_result res;
struct tcf_exts exts;
u32 handle;
struct rsvp_session *sess;
struct rcu_head rcu;
};
static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
{
unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
h ^= h>>16;
h ^= h>>8;
return (h ^ protocol ^ tunnelid) & 0xFF;
}
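/* Worked example: for a dst word of 0x0a000001, protocol 17, tunnelid 0:
 * 0x0a000001 ^ 0x00000a00 = 0x0a000a01, then ^ 0x000a000a = 0x0a0a0a0b,
 * and (0x0b ^ 17 ^ 0) & 0xFF = 0x1a, i.e. session bucket 26.
 */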
static inline unsigned int hash_src(__be32 *src)
{
unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
h ^= h>>16;
h ^= h>>8;
h ^= h>>4;
return h & 0xF;
}
#define RSVP_APPLY_RESULT() \
{ \
int r = tcf_exts_exec(skb, &f->exts, res); \
if (r < 0) \
continue; \
else if (r > 0) \
return r; \
}
static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct rsvp_head *head = rcu_dereference_bh(tp->root);
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned int h1, h2;
__be32 *dst, *src;
u8 protocol;
u8 tunnelid = 0;
u8 *xprt;
#if RSVP_DST_LEN == 4
struct ipv6hdr *nhptr;
if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
return -1;
nhptr = ipv6_hdr(skb);
#else
struct iphdr *nhptr;
if (!pskb_network_may_pull(skb, sizeof(*nhptr)))
return -1;
nhptr = ip_hdr(skb);
#endif
restart:
#if RSVP_DST_LEN == 4
src = &nhptr->saddr.s6_addr32[0];
dst = &nhptr->daddr.s6_addr32[0];
protocol = nhptr->nexthdr;
xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
#else
src = &nhptr->saddr;
dst = &nhptr->daddr;
protocol = nhptr->protocol;
xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
if (ip_is_fragment(nhptr))
return -1;
#endif
h1 = hash_dst(dst, protocol, tunnelid);
h2 = hash_src(src);
for (s = rcu_dereference_bh(head->ht[h1]); s;
s = rcu_dereference_bh(s->next)) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
protocol == s->protocol &&
!(s->dpi.mask &
(*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
#if RSVP_DST_LEN == 4
dst[0] == s->dst[0] &&
dst[1] == s->dst[1] &&
dst[2] == s->dst[2] &&
#endif
tunnelid == s->tunnelid) {
for (f = rcu_dereference_bh(s->ht[h2]); f;
f = rcu_dereference_bh(f->next)) {
if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
!(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
#if RSVP_DST_LEN == 4
&&
src[0] == f->src[0] &&
src[1] == f->src[1] &&
src[2] == f->src[2]
#endif
) {
*res = f->res;
RSVP_APPLY_RESULT();
matched:
if (f->tunnelhdr == 0)
return 0;
tunnelid = f->res.classid;
nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
goto restart;
}
}
/* And wildcard bucket... */
for (f = rcu_dereference_bh(s->ht[16]); f;
f = rcu_dereference_bh(f->next)) {
*res = f->res;
RSVP_APPLY_RESULT();
goto matched;
}
return -1;
}
}
return -1;
}
static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
struct rsvp_session *s;
struct rsvp_filter __rcu **ins;
struct rsvp_filter *pins;
unsigned int h1 = h & 0xFF;
unsigned int h2 = (h >> 8) & 0xFF;
for (s = rtnl_dereference(head->ht[h1]); s;
s = rtnl_dereference(s->next)) {
for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ;
ins = &pins->next, pins = rtnl_dereference(*ins)) {
if (pins->handle == h) {
RCU_INIT_POINTER(n->next, pins->next);
rcu_assign_pointer(*ins, n);
return;
}
}
}
/* Something went wrong if we are trying to replace a non-existent
 * node. Might as well halt instead of silently failing.
 */
BUG_ON(1);
}
static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
struct rsvp_session *s;
struct rsvp_filter *f;
unsigned int h1 = handle & 0xFF;
unsigned int h2 = (handle >> 8) & 0xFF;
if (h2 > 16)
return 0;
for (s = rtnl_dereference(head->ht[h1]); s;
s = rtnl_dereference(s->next)) {
for (f = rtnl_dereference(s->ht[h2]); f;
f = rtnl_dereference(f->next)) {
if (f->handle == handle)
return (unsigned long)f;
}
}
return 0;
}
static void rsvp_put(struct tcf_proto *tp, unsigned long f)
{
}
static int rsvp_init(struct tcf_proto *tp)
{
struct rsvp_head *data;
data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL);
if (data) {
rcu_assign_pointer(tp->root, data);
return 0;
}
return -ENOBUFS;
}
static void
rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
{
tcf_unbind_filter(tp, &f->res);
tcf_exts_destroy(&f->exts);
kfree_rcu(f, rcu);
}
static void rsvp_destroy(struct tcf_proto *tp)
{
struct rsvp_head *data = rtnl_dereference(tp->root);
int h1, h2;
if (data == NULL)
return;
RCU_INIT_POINTER(tp->root, NULL);
for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
while ((s = rtnl_dereference(data->ht[h1])) != NULL) {
RCU_INIT_POINTER(data->ht[h1], s->next);
for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
while ((f = rtnl_dereference(s->ht[h2])) != NULL) {
rcu_assign_pointer(s->ht[h2], f->next);
rsvp_delete_filter(tp, f);
}
}
kfree_rcu(s, rcu);
}
}
kfree_rcu(data, rcu);
}
static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg;
struct rsvp_filter __rcu **fp;
unsigned int h = f->handle;
struct rsvp_session __rcu **sp;
struct rsvp_session *nsp, *s = f->sess;
int i;
fp = &s->ht[(h >> 8) & 0xFF];
for (nfp = rtnl_dereference(*fp); nfp;
fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
if (nfp == f) {
RCU_INIT_POINTER(*fp, f->next);
rsvp_delete_filter(tp, f);
/* Strip tree */
for (i = 0; i <= 16; i++)
if (s->ht[i])
return 0;
/* OK, session has no flows */
sp = &head->ht[h & 0xFF];
for (nsp = rtnl_dereference(*sp); nsp;
sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
if (nsp == s) {
RCU_INIT_POINTER(*sp, s->next);
kfree_rcu(s, rcu);
return 0;
}
}
return 0;
}
}
return 0;
}
static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
{
struct rsvp_head *data = rtnl_dereference(tp->root);
int i = 0xFFFF;
while (i-- > 0) {
u32 h;
if ((data->hgenerator += 0x10000) == 0)
data->hgenerator = 0x10000;
h = data->hgenerator|salt;
if (rsvp_get(tp, h) == 0)
return h;
}
return 0;
}
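/* The generated handle encodes the lookup path: bits 0-7 are the session
 * hash (h1), bits 8-15 the source slot (h2, 16 for wildcard) and bits
 * 16-31 come from hgenerator, which is why rsvp_get() above can decode
 * h1/h2 straight from the handle.
 */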
static int tunnel_bts(struct rsvp_head *data)
{
int n = data->tgenerator >> 5;
u32 b = 1 << (data->tgenerator & 0x1F);
if (data->tmap[n] & b)
return 0;
data->tmap[n] |= b;
return 1;
}
static void tunnel_recycle(struct rsvp_head *data)
{
struct rsvp_session __rcu **sht = data->ht;
u32 tmap[256/32];
int h1, h2;
memset(tmap, 0, sizeof(tmap));
for (h1 = 0; h1 < 256; h1++) {
struct rsvp_session *s;
for (s = rtnl_dereference(sht[h1]); s;
s = rtnl_dereference(s->next)) {
for (h2 = 0; h2 <= 16; h2++) {
struct rsvp_filter *f;
for (f = rtnl_dereference(s->ht[h2]); f;
f = rtnl_dereference(f->next)) {
if (f->tunnelhdr == 0)
continue;
data->tgenerator = f->res.classid;
tunnel_bts(data);
}
}
}
}
memcpy(data->tmap, tmap, sizeof(tmap));
}
static u32 gen_tunnel(struct rsvp_head *data)
{
int i, k;
for (k = 0; k < 2; k++) {
for (i = 255; i > 0; i--) {
if (++data->tgenerator == 0)
data->tgenerator = 1;
if (tunnel_bts(data))
return data->tgenerator;
}
tunnel_recycle(data);
}
return 0;
}
static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = {
[TCA_RSVP_CLASSID] = { .type = NLA_U32 },
[TCA_RSVP_DST] = { .type = NLA_BINARY,
.len = RSVP_DST_LEN * sizeof(u32) },
[TCA_RSVP_SRC] = { .type = NLA_BINARY,
.len = RSVP_DST_LEN * sizeof(u32) },
[TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) },
};
static int rsvp_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base,
u32 handle,
struct nlattr **tca,
unsigned long *arg, bool ovr)
{
struct rsvp_head *data = rtnl_dereference(tp->root);
struct rsvp_filter *f, *nfp;
struct rsvp_filter __rcu **fp;
struct rsvp_session *nsp, *s;
struct rsvp_session __rcu **sp;
struct tc_rsvp_pinfo *pinfo = NULL;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_RSVP_MAX + 1];
struct tcf_exts e;
unsigned int h1, h2;
__be32 *dst;
int err;
if (opt == NULL)
return handle ? -EINVAL : 0;
err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy);
if (err < 0)
return err;
tcf_exts_init(&e, TCA_RSVP_ACT, TCA_RSVP_POLICE);
err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr);
if (err < 0)
return err;
f = (struct rsvp_filter *)*arg;
if (f) {
/* Node exists: adjust only classid */
struct rsvp_filter *n;
if (f->handle != handle && handle)
goto errout2;
n = kmemdup(f, sizeof(*f), GFP_KERNEL);
if (!n) {
err = -ENOMEM;
goto errout2;
}
tcf_exts_init(&n->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
if (tb[TCA_RSVP_CLASSID]) {
n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
tcf_bind_filter(tp, &n->res, base);
}
tcf_exts_change(tp, &n->exts, &e);
rsvp_replace(tp, n, handle);
return 0;
}
/* Now more serious part... */
err = -EINVAL;
if (handle)
goto errout2;
if (tb[TCA_RSVP_DST] == NULL)
goto errout2;
err = -ENOBUFS;
f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL);
if (f == NULL)
goto errout2;
tcf_exts_init(&f->exts, TCA_RSVP_ACT, TCA_RSVP_POLICE);
h2 = 16;
if (tb[TCA_RSVP_SRC]) {
memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src));
h2 = hash_src(f->src);
}
if (tb[TCA_RSVP_PINFO]) {
pinfo = nla_data(tb[TCA_RSVP_PINFO]);
f->spi = pinfo->spi;
f->tunnelhdr = pinfo->tunnelhdr;
}
if (tb[TCA_RSVP_CLASSID])
f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]);
dst = nla_data(tb[TCA_RSVP_DST]);
h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0);
err = -ENOMEM;
if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0)
goto errout;
if (f->tunnelhdr) {
err = -EINVAL;
if (f->res.classid > 255)
goto errout;
err = -ENOMEM;
if (f->res.classid == 0 &&
(f->res.classid = gen_tunnel(data)) == 0)
goto errout;
}
for (sp = &data->ht[h1];
(s = rtnl_dereference(*sp)) != NULL;
sp = &s->next) {
if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
pinfo && pinfo->protocol == s->protocol &&
memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
#if RSVP_DST_LEN == 4
dst[0] == s->dst[0] &&
dst[1] == s->dst[1] &&
dst[2] == s->dst[2] &&
#endif
pinfo->tunnelid == s->tunnelid) {
insert:
/* OK, we found an appropriate session */
f->sess = s;
if (f->tunnelhdr == 0)
tcf_bind_filter(tp, &f->res, base);
tcf_exts_change(tp, &f->exts, &e);
fp = &s->ht[h2];
for (nfp = rtnl_dereference(*fp); nfp;
fp = &nfp->next, nfp = rtnl_dereference(*fp)) {
__u32 mask = nfp->spi.mask & f->spi.mask;
if (mask != f->spi.mask)
break;
}
RCU_INIT_POINTER(f->next, nfp);
rcu_assign_pointer(*fp, f);
*arg = (unsigned long)f;
return 0;
}
}
/* No session found. Create new one. */
err = -ENOBUFS;
s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL);
if (s == NULL)
goto errout;
memcpy(s->dst, dst, sizeof(s->dst));
if (pinfo) {
s->dpi = pinfo->dpi;
s->protocol = pinfo->protocol;
s->tunnelid = pinfo->tunnelid;
}
sp = &data->ht[h1];
for (nsp = rtnl_dereference(*sp); nsp;
sp = &nsp->next, nsp = rtnl_dereference(*sp)) {
if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask)
break;
}
RCU_INIT_POINTER(s->next, nsp);
rcu_assign_pointer(*sp, s);
goto insert;
errout:
kfree(f);
errout2:
tcf_exts_destroy(&e);
return err;
}
static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct rsvp_head *head = rtnl_dereference(tp->root);
unsigned int h, h1;
if (arg->stop)
return;
for (h = 0; h < 256; h++) {
struct rsvp_session *s;
for (s = rtnl_dereference(head->ht[h]); s;
s = rtnl_dereference(s->next)) {
for (h1 = 0; h1 <= 16; h1++) {
struct rsvp_filter *f;
for (f = rtnl_dereference(s->ht[h1]); f;
f = rtnl_dereference(f->next)) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(tp, (unsigned long)f, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
}
}
static int rsvp_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct rsvp_filter *f = (struct rsvp_filter *)fh;
struct rsvp_session *s;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
struct tc_rsvp_pinfo pinfo;
if (f == NULL)
return skb->len;
s = f->sess;
t->tcm_handle = f->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst))
goto nla_put_failure;
pinfo.dpi = s->dpi;
pinfo.spi = f->spi;
pinfo.protocol = s->protocol;
pinfo.tunnelid = s->tunnelid;
pinfo.tunnelhdr = f->tunnelhdr;
pinfo.pad = 0;
if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo))
goto nla_put_failure;
if (f->res.classid &&
nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid))
goto nla_put_failure;
if (((f->handle >> 8) & 0xFF) != 16 &&
nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src))
goto nla_put_failure;
if (tcf_exts_dump(skb, &f->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &f->exts) < 0)
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tcf_proto_ops RSVP_OPS __read_mostly = {
.kind = RSVP_ID,
.classify = rsvp_classify,
.init = rsvp_init,
.destroy = rsvp_destroy,
.get = rsvp_get,
.put = rsvp_put,
.change = rsvp_change,
.delete = rsvp_delete,
.walk = rsvp_walk,
.dump = rsvp_dump,
.owner = THIS_MODULE,
};
static int __init init_rsvp(void)
{
return register_tcf_proto_ops(&RSVP_OPS);
}
static void __exit exit_rsvp(void)
{
unregister_tcf_proto_ops(&RSVP_OPS);
}
module_init(init_rsvp)
module_exit(exit_rsvp)

28
net/sched/cls_rsvp6.c Normal file
View file

@ -0,0 +1,28 @@
/*
* net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
#include <net/netlink.h>
#define RSVP_DST_LEN 4
#define RSVP_ID "rsvp6"
#define RSVP_OPS cls_rsvp6_ops
#include "cls_rsvp.h"
MODULE_LICENSE("GPL");

583
net/sched/cls_tcindex.c Normal file
View file

@ -0,0 +1,583 @@
/*
* net/sched/cls_tcindex.c Packet classifier for skb->tc_index
*
* Written 1998,1999 by Werner Almesberger, EPFL ICA
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
/*
* Passing parameters to the root seems to be done more awkwardly than really
* necessary. At least, u32 doesn't seem to use such dirty hacks. To be
* verified. FIXME.
*/
#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
};
struct tcindex_filter {
u16 key;
struct tcindex_filter_result result;
struct tcindex_filter __rcu *next;
struct rcu_head rcu;
};
struct tcindex_data {
struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */
struct tcindex_filter __rcu **h; /* imperfect hash; */
struct tcf_proto *tp;
u16 mask; /* AND key with mask */
u32 shift; /* shift ANDed key to the right */
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
struct rcu_head rcu;
};
static inline int
tcindex_filter_is_set(struct tcindex_filter_result *r)
{
return tcf_exts_is_predicative(&r->exts) || r->res.classid;
}
static struct tcindex_filter_result *
tcindex_lookup(struct tcindex_data *p, u16 key)
{
if (p->perfect) {
struct tcindex_filter_result *f = p->perfect + key;
return tcindex_filter_is_set(f) ? f : NULL;
} else if (p->h) {
struct tcindex_filter __rcu **fp;
struct tcindex_filter *f;
fp = &p->h[key % p->hash];
for (f = rcu_dereference_bh_rtnl(*fp);
f;
fp = &f->next, f = rcu_dereference_bh_rtnl(*fp))
if (f->key == key)
return &f->result;
}
return NULL;
}
static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct tcindex_data *p = rcu_dereference_bh(tp->root);
struct tcindex_filter_result *f;
int key = (skb->tc_index & p->mask) >> p->shift;
pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n",
skb, tp, res, p);
f = tcindex_lookup(p, key);
if (!f) {
if (!p->fall_through)
return -1;
res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key);
res->class = 0;
pr_debug("alg 0x%x\n", res->classid);
return 0;
}
*res = f->res;
pr_debug("map 0x%x\n", res->classid);
return tcf_exts_exec(skb, &f->exts, res);
}
static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r;
pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle);
if (p->perfect && handle >= p->alloc_hash)
return 0;
r = tcindex_lookup(p, handle);
return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL;
}
static void tcindex_put(struct tcf_proto *tp, unsigned long f)
{
pr_debug("tcindex_put(tp %p,f 0x%lx)\n", tp, f);
}
static int tcindex_init(struct tcf_proto *tp)
{
struct tcindex_data *p;
pr_debug("tcindex_init(tp %p)\n", tp);
p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL);
if (!p)
return -ENOMEM;
p->mask = 0xffff;
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
rcu_assign_pointer(tp->root, p);
return 0;
}
static int
tcindex_delete(struct tcf_proto *tp, unsigned long arg)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg;
struct tcindex_filter __rcu **walk;
struct tcindex_filter *f = NULL;
pr_debug("tcindex_delete(tp %p,arg 0x%lx),p %p\n", tp, arg, p);
if (p->perfect) {
if (!r->res.class)
return -ENOENT;
} else {
int i;
for (i = 0; i < p->hash; i++) {
walk = p->h + i;
for (f = rtnl_dereference(*walk); f;
walk = &f->next, f = rtnl_dereference(*walk)) {
if (&f->result == r)
goto found;
}
}
return -ENOENT;
found:
rcu_assign_pointer(*walk, rtnl_dereference(f->next));
}
tcf_unbind_filter(tp, &r->res);
tcf_exts_destroy(&r->exts);
if (f)
kfree_rcu(f, rcu);
return 0;
}
static int tcindex_destroy_element(struct tcf_proto *tp,
unsigned long arg,
struct tcf_walker *walker)
{
return tcindex_delete(tp, arg);
}
static void __tcindex_destroy(struct rcu_head *head)
{
struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
kfree(p->perfect);
kfree(p->h);
kfree(p);
}
static inline int
valid_perfect_hash(struct tcindex_data *p)
{
return p->hash > (p->mask >> p->shift);
}
static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
[TCA_TCINDEX_HASH] = { .type = NLA_U32 },
[TCA_TCINDEX_MASK] = { .type = NLA_U16 },
[TCA_TCINDEX_SHIFT] = { .type = NLA_U32 },
[TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 },
[TCA_TCINDEX_CLASSID] = { .type = NLA_U32 },
};
static void tcindex_filter_result_init(struct tcindex_filter_result *r)
{
memset(r, 0, sizeof(*r));
tcf_exts_init(&r->exts, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
}
static void __tcindex_partial_destroy(struct rcu_head *head)
{
struct tcindex_data *p = container_of(head, struct tcindex_data, rcu);
kfree(p->perfect);
kfree(p);
}
static int
tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
u32 handle, struct tcindex_data *p,
struct tcindex_filter_result *r, struct nlattr **tb,
struct nlattr *est, bool ovr)
{
int err, balloc = 0;
struct tcindex_filter_result new_filter_result, *old_r = r;
struct tcindex_filter_result cr;
struct tcindex_data *cp, *oldp;
struct tcindex_filter *f = NULL; /* make gcc behave */
struct tcf_exts e;
tcf_exts_init(&e, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
err = tcf_exts_validate(net, tp, tb, est, &e, ovr);
if (err < 0)
return err;
err = -ENOMEM;
/* tcindex_data attributes must look atomic to classifier/lookup so
* allocate new tcindex data and RCU assign it onto root. Keeping
* perfect hash and hash pointers from old data.
*/
cp = kzalloc(sizeof(*cp), GFP_KERNEL);
if (!cp)
goto errout;
cp->mask = p->mask;
cp->shift = p->shift;
cp->hash = p->hash;
cp->alloc_hash = p->alloc_hash;
cp->fall_through = p->fall_through;
cp->tp = tp;
if (p->perfect) {
int i;
cp->perfect = kmemdup(p->perfect,
sizeof(*r) * cp->hash, GFP_KERNEL);
if (!cp->perfect)
goto errout;
for (i = 0; i < cp->hash; i++)
tcf_exts_init(&cp->perfect[i].exts,
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
balloc = 1;
}
cp->h = p->h;
tcindex_filter_result_init(&new_filter_result);
tcindex_filter_result_init(&cr);
if (old_r)
cr.res = r->res;
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
if (tb[TCA_TCINDEX_MASK])
cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
if (tb[TCA_TCINDEX_SHIFT])
cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
err = -EBUSY;
/* Hash already allocated, make sure that we still meet the
* requirements for the allocated hash.
*/
if (cp->perfect) {
if (!valid_perfect_hash(cp) ||
cp->hash > cp->alloc_hash)
goto errout_alloc;
} else if (cp->h && cp->hash != cp->alloc_hash) {
goto errout_alloc;
}
err = -EINVAL;
if (tb[TCA_TCINDEX_FALL_THROUGH])
cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]);
if (!cp->hash) {
/* Hash not specified, use perfect hash if the upper limit
* of the hashing index is below the threshold.
*/
if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD)
cp->hash = (cp->mask >> cp->shift) + 1;
else
cp->hash = DEFAULT_HASH_SIZE;
}
if (!cp->perfect && !cp->h)
cp->alloc_hash = cp->hash;
/* Note: this could be as restrictive as if (handle & ~(mask >> shift))
* but then, we'd fail handles that may become valid after some future
* mask change. While this is extremely unlikely to ever matter,
* the check below is safer (and also more backwards-compatible).
*/
if (cp->perfect || valid_perfect_hash(cp))
if (handle >= cp->alloc_hash)
goto errout_alloc;
err = -ENOMEM;
if (!cp->perfect && !cp->h) {
if (valid_perfect_hash(cp)) {
int i;
cp->perfect = kcalloc(cp->hash, sizeof(*r), GFP_KERNEL);
if (!cp->perfect)
goto errout_alloc;
for (i = 0; i < cp->hash; i++)
tcf_exts_init(&cp->perfect[i].exts,
TCA_TCINDEX_ACT,
TCA_TCINDEX_POLICE);
balloc = 1;
} else {
struct tcindex_filter __rcu **hash;
hash = kcalloc(cp->hash,
sizeof(struct tcindex_filter *),
GFP_KERNEL);
if (!hash)
goto errout_alloc;
cp->h = hash;
balloc = 2;
}
}
if (cp->perfect)
r = cp->perfect + handle;
else
r = tcindex_lookup(cp, handle) ? : &new_filter_result;
if (r == &new_filter_result) {
f = kzalloc(sizeof(*f), GFP_KERNEL);
if (!f)
goto errout_alloc;
f->key = handle;
tcindex_filter_result_init(&f->result);
f->next = NULL;
}
if (tb[TCA_TCINDEX_CLASSID]) {
cr.res.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]);
tcf_bind_filter(tp, &cr.res, base);
}
if (old_r)
tcf_exts_change(tp, &r->exts, &e);
else
tcf_exts_change(tp, &cr.exts, &e);
if (old_r && old_r != r)
tcindex_filter_result_init(old_r);
oldp = p;
r->res = cr.res;
rcu_assign_pointer(tp->root, cp);
if (r == &new_filter_result) {
struct tcindex_filter *nfp;
struct tcindex_filter __rcu **fp;
tcf_exts_change(tp, &f->result.exts, &r->exts);
fp = cp->h + (handle % cp->hash);
for (nfp = rtnl_dereference(*fp);
nfp;
fp = &nfp->next, nfp = rtnl_dereference(*fp))
; /* nothing */
rcu_assign_pointer(*fp, f);
}
if (oldp)
call_rcu(&oldp->rcu, __tcindex_partial_destroy);
return 0;
errout_alloc:
if (balloc == 1)
kfree(cp->perfect);
else if (balloc == 2)
kfree(cp->h);
errout:
kfree(cp);
tcf_exts_destroy(&e);
return err;
}
static int
tcindex_change(struct net *net, struct sk_buff *in_skb,
struct tcf_proto *tp, unsigned long base, u32 handle,
struct nlattr **tca, unsigned long *arg, bool ovr)
{
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_TCINDEX_MAX + 1];
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg;
int err;
pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p,"
"p %p,r %p,*arg 0x%lx\n",
tp, handle, tca, arg, opt, p, r, arg ? *arg : 0L);
if (!opt)
return 0;
err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy);
if (err < 0)
return err;
return tcindex_set_parms(net, tp, base, handle, p, r, tb,
tca[TCA_RATE], ovr);
}
static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter *f, *next;
int i;
pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p);
if (p->perfect) {
for (i = 0; i < p->hash; i++) {
if (!p->perfect[i].res.class)
continue;
if (walker->count >= walker->skip) {
if (walker->fn(tp,
(unsigned long) (p->perfect+i), walker)
< 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
if (!p->h)
return;
for (i = 0; i < p->hash; i++) {
for (f = rtnl_dereference(p->h[i]); f; f = next) {
next = rtnl_dereference(f->next);
if (walker->count >= walker->skip) {
if (walker->fn(tp, (unsigned long) &f->result,
walker) < 0) {
walker->stop = 1;
return;
}
}
walker->count++;
}
}
}
static void tcindex_destroy(struct tcf_proto *tp)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcf_walker walker;
pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p);
walker.count = 0;
walker.skip = 0;
walker.fn = tcindex_destroy_element;
tcindex_walk(tp, &walker);
RCU_INIT_POINTER(tp->root, NULL);
call_rcu(&p->rcu, __tcindex_destroy);
}
static int tcindex_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
struct sk_buff *skb, struct tcmsg *t)
{
struct tcindex_data *p = rtnl_dereference(tp->root);
struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
pr_debug("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n",
tp, fh, skb, t, p, r, b);
pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h);
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (!fh) {
t->tcm_handle = ~0; /* whatever ... */
if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) ||
nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) ||
nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) ||
nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through))
goto nla_put_failure;
nla_nest_end(skb, nest);
} else {
if (p->perfect) {
t->tcm_handle = r - p->perfect;
} else {
struct tcindex_filter *f;
struct tcindex_filter __rcu **fp;
int i;
t->tcm_handle = 0;
for (i = 0; !t->tcm_handle && i < p->hash; i++) {
fp = &p->h[i];
for (f = rtnl_dereference(*fp);
!t->tcm_handle && f;
fp = &f->next, f = rtnl_dereference(*fp)) {
if (&f->result == r)
t->tcm_handle = f->key;
}
}
}
pr_debug("handle = %d\n", t->tcm_handle);
if (r->res.class &&
nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid))
goto nla_put_failure;
if (tcf_exts_dump(skb, &r->exts) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
if (tcf_exts_dump_stats(skb, &r->exts) < 0)
goto nla_put_failure;
}
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
.kind = "tcindex",
.classify = tcindex_classify,
.init = tcindex_init,
.destroy = tcindex_destroy,
.get = tcindex_get,
.put = tcindex_put,
.change = tcindex_change,
.delete = tcindex_delete,
.walk = tcindex_walk,
.dump = tcindex_dump,
.owner = THIS_MODULE,
};
static int __init init_tcindex(void)
{
return register_tcf_proto_ops(&cls_tcindex_ops);
}
static void __exit exit_tcindex(void)
{
unregister_tcf_proto_ops(&cls_tcindex_ops);
}
module_init(init_tcindex)
module_exit(exit_tcindex)
MODULE_LICENSE("GPL");
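tcindex derives its key as (skb->tc_index & mask) >> shift and, when no hash size is given, chooses a perfect (directly indexed) table whenever the largest possible key stays under PERFECT_HASH_THRESHOLD. A standalone sketch of that sizing decision, reusing the constants above; tcindex_plan() is an illustrative name, not kernel API:

#include <stdio.h>

#define PERFECT_HASH_THRESHOLD	64	/* same values as cls_tcindex.c */
#define DEFAULT_HASH_SIZE	64

static void tcindex_plan(unsigned int tc_index, unsigned int mask,
			 unsigned int shift)
{
	unsigned int key = (tc_index & mask) >> shift;
	unsigned int max_key = mask >> shift;
	unsigned int hash;

	if (max_key < PERFECT_HASH_THRESHOLD)
		hash = max_key + 1;	/* perfect hash: key indexes directly */
	else
		hash = DEFAULT_HASH_SIZE; /* imperfect: bucket is key % hash */

	printf("key=%u max_key=%u -> %s hash, %u entries\n",
	       key, max_key,
	       max_key < PERFECT_HASH_THRESHOLD ? "perfect" : "imperfect",
	       hash);
}

int main(void)
{
	tcindex_plan(0x2a, 0x3f, 0);	/* example values only */
	return 0;
}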

1057
net/sched/cls_u32.c Normal file

File diff suppressed because it is too large

233
net/sched/em_canid.c Normal file
View file

@ -0,0 +1,233 @@
/*
* em_canid.c Ematch rule to match CAN frames according to their CAN IDs
*
* This program is free software; you can distribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
* Copyright: (c) 2011 Czech Technical University in Prague
* (c) 2011 Volkswagen Group Research
* Authors: Michal Sojka <sojkam1@fel.cvut.cz>
* Pavel Pisa <pisa@cmp.felk.cvut.cz>
* Rostislav Lisovy <lisovy@gmail.cz>
* Funded by: Volkswagen Group Research
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
#include <linux/can.h>
#define EM_CAN_RULES_MAX 500
struct canid_match {
/* For each SFF CAN ID (11 bit) there is one record in this bitfield */
DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS));
int rules_count;
int sff_rules_count;
int eff_rules_count;
/*
* Raw rules copied from netlink message; Used for sending
* information to userspace (when 'tc filter show' is invoked)
* AND when matching EFF frames
*/
struct can_filter rules_raw[];
};
/**
* em_canid_get_id() - Extracts the CAN ID out of the sk_buff structure.
*/
static canid_t em_canid_get_id(struct sk_buff *skb)
{
/* CAN ID is stored within the data field */
struct can_frame *cf = (struct can_frame *)skb->data;
return cf->can_id;
}
static void em_canid_sff_match_add(struct canid_match *cm, u32 can_id,
u32 can_mask)
{
int i;
/*
* Limit can_mask and can_id to SFF range to
* protect against write after end of array
*/
can_mask &= CAN_SFF_MASK;
can_id &= can_mask;
/* Single frame */
if (can_mask == CAN_SFF_MASK) {
set_bit(can_id, cm->match_sff);
return;
}
/* All frames */
if (can_mask == 0) {
bitmap_fill(cm->match_sff, (1 << CAN_SFF_ID_BITS));
return;
}
/*
* Individual frame filter.
* Add a record (set bit to 1) for each ID that
* conforms to the particular rule
*/
for (i = 0; i < (1 << CAN_SFF_ID_BITS); i++) {
if ((i & can_mask) == can_id)
set_bit(i, cm->match_sff);
}
}
static inline struct canid_match *em_canid_priv(struct tcf_ematch *m)
{
return (struct canid_match *)m->data;
}
static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m,
struct tcf_pkt_info *info)
{
struct canid_match *cm = em_canid_priv(m);
canid_t can_id;
int match = 0;
int i;
const struct can_filter *lp;
can_id = em_canid_get_id(skb);
if (can_id & CAN_EFF_FLAG) {
for (i = 0, lp = cm->rules_raw;
i < cm->eff_rules_count; i++, lp++) {
if (!(((lp->can_id ^ can_id) & lp->can_mask))) {
match = 1;
break;
}
}
} else { /* SFF */
can_id &= CAN_SFF_MASK;
match = (test_bit(can_id, cm->match_sff) ? 1 : 0);
}
return match;
}
static int em_canid_change(struct net *net, void *data, int len,
struct tcf_ematch *m)
{
struct can_filter *conf = data; /* Array with rules */
struct canid_match *cm;
int i;
if (!len)
return -EINVAL;
if (len % sizeof(struct can_filter))
return -EINVAL;
if (len > sizeof(struct can_filter) * EM_CAN_RULES_MAX)
return -EINVAL;
cm = kzalloc(sizeof(struct canid_match) + len, GFP_KERNEL);
if (!cm)
return -ENOMEM;
cm->rules_count = len / sizeof(struct can_filter);
/*
* We need two for() loops for copying rules into two contiguous
* areas in rules_raw to process all eff rules with a simple loop.
* NB: The configuration interface supports sff and eff rules.
* We do not support filters here that match for the same can_id
* provided in a SFF and EFF frame (e.g. 0x123 / 0x80000123).
* For this (unusual case) two filters have to be specified. The
* SFF/EFF separation is done with the CAN_EFF_FLAG in the can_id.
*/
/* Fill rules_raw with EFF rules first */
for (i = 0; i < cm->rules_count; i++) {
if (conf[i].can_id & CAN_EFF_FLAG) {
memcpy(cm->rules_raw + cm->eff_rules_count,
&conf[i],
sizeof(struct can_filter));
cm->eff_rules_count++;
}
}
/* append SFF frame rules */
for (i = 0; i < cm->rules_count; i++) {
if (!(conf[i].can_id & CAN_EFF_FLAG)) {
memcpy(cm->rules_raw
+ cm->eff_rules_count
+ cm->sff_rules_count,
&conf[i], sizeof(struct can_filter));
cm->sff_rules_count++;
em_canid_sff_match_add(cm,
conf[i].can_id, conf[i].can_mask);
}
}
m->datalen = sizeof(struct canid_match) + len;
m->data = (unsigned long)cm;
return 0;
}
static void em_canid_destroy(struct tcf_ematch *m)
{
struct canid_match *cm = em_canid_priv(m);
kfree(cm);
}
static int em_canid_dump(struct sk_buff *skb, struct tcf_ematch *m)
{
struct canid_match *cm = em_canid_priv(m);
/*
* When configuring this ematch 'rules_count' is set not to exceed
* 'rules_raw' array size
*/
if (nla_put_nohdr(skb, sizeof(struct can_filter) * cm->rules_count,
&cm->rules_raw) < 0)
return -EMSGSIZE;
return 0;
}
static struct tcf_ematch_ops em_canid_ops = {
.kind = TCF_EM_CANID,
.change = em_canid_change,
.match = em_canid_match,
.destroy = em_canid_destroy,
.dump = em_canid_dump,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_canid_ops.link)
};
static int __init init_em_canid(void)
{
return tcf_em_register(&em_canid_ops);
}
static void __exit exit_em_canid(void)
{
tcf_em_unregister(&em_canid_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_canid);
module_exit(exit_em_canid);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_CANID);
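em_canid precomputes the SFF bitmap at configure time so the per-packet SFF path is a single test_bit(); EFF frames instead walk the raw rules with the usual CAN filter predicate. That predicate in isolation, as a userspace sketch (can_rule_match() is an illustrative name):

#include <stdio.h>

struct can_rule {
	unsigned int can_id;
	unsigned int can_mask;
};

/* Same test em_canid_match() applies per EFF rule: every bit selected
 * by the mask must agree between rule id and frame id. */
static int can_rule_match(unsigned int frame_id, const struct can_rule *r)
{
	return ((r->can_id ^ frame_id) & r->can_mask) == 0;
}

int main(void)
{
	struct can_rule r = { .can_id = 0x123, .can_mask = 0x7FF };

	printf("0x123 -> %d, 0x124 -> %d\n",
	       can_rule_match(0x123, &r), can_rule_match(0x124, &r));
	return 0;
}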

99
net/sched/em_cmp.c Normal file
View file

@ -0,0 +1,99 @@
/*
* net/sched/em_cmp.c Simple packet data comparison ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_cmp.h>
#include <asm/unaligned.h>
#include <net/pkt_cls.h>
static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp)
{
return unlikely(cmp->flags & TCF_EM_CMP_TRANS);
}
static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data;
unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off;
u32 val = 0;
if (!tcf_valid_offset(skb, ptr, cmp->align))
return 0;
switch (cmp->align) {
case TCF_EM_ALIGN_U8:
val = *ptr;
break;
case TCF_EM_ALIGN_U16:
val = get_unaligned_be16(ptr);
if (cmp_needs_transformation(cmp))
val = be16_to_cpu(val);
break;
case TCF_EM_ALIGN_U32:
/* Worth checking boundaries? The branching seems
* to get worse. Visit again.
*/
val = get_unaligned_be32(ptr);
if (cmp_needs_transformation(cmp))
val = be32_to_cpu(val);
break;
default:
return 0;
}
if (cmp->mask)
val &= cmp->mask;
switch (cmp->opnd) {
case TCF_EM_OPND_EQ:
return val == cmp->val;
case TCF_EM_OPND_LT:
return val < cmp->val;
case TCF_EM_OPND_GT:
return val > cmp->val;
}
return 0;
}
static struct tcf_ematch_ops em_cmp_ops = {
.kind = TCF_EM_CMP,
.datalen = sizeof(struct tcf_em_cmp),
.match = em_cmp_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_cmp_ops.link)
};
static int __init init_em_cmp(void)
{
return tcf_em_register(&em_cmp_ops);
}
static void __exit exit_em_cmp(void)
{
tcf_em_unregister(&em_cmp_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_cmp);
module_exit(exit_em_cmp);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_CMP);
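em_cmp reads a u8/u16/u32 at a layer-relative offset, optionally masks it, and compares it against a constant with EQ/LT/GT. The core of that, restated as a self-contained userspace sketch for the 16-bit case (names are illustrative, not kernel API):

#include <stdio.h>

enum { OPND_EQ, OPND_LT, OPND_GT };	/* mirrors TCF_EM_OPND_* */

static int cmp_eval_u16(const unsigned char *ptr, unsigned int mask,
			unsigned int cmp_val, int opnd)
{
	/* network byte order, like get_unaligned_be16() above */
	unsigned int val = (unsigned int)ptr[0] << 8 | ptr[1];

	if (mask)
		val &= mask;

	switch (opnd) {
	case OPND_EQ: return val == cmp_val;
	case OPND_LT: return val < cmp_val;
	case OPND_GT: return val > cmp_val;
	}
	return 0;
}

int main(void)
{
	unsigned char field[] = { 0x08, 0x00 };	/* e.g. an EtherType */

	printf("%d\n", cmp_eval_u16(field, 0, 0x0800, OPND_EQ));
	return 0;
}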

135
net/sched/em_ipset.c Normal file
View file

@ -0,0 +1,135 @@
/*
* net/sched/em_ipset.c ipset ematch
*
* Copyright (c) 2012 Florian Westphal <fw@strlen.de>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/netfilter/xt_set.h>
#include <linux/ipv6.h>
#include <net/ip.h>
#include <net/pkt_cls.h>
static int em_ipset_change(struct net *net, void *data, int data_len,
struct tcf_ematch *em)
{
struct xt_set_info *set = data;
ip_set_id_t index;
if (data_len != sizeof(*set))
return -EINVAL;
index = ip_set_nfnl_get_byindex(net, set->index);
if (index == IPSET_INVALID_ID)
return -ENOENT;
em->datalen = sizeof(*set);
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
if (em->data)
return 0;
ip_set_nfnl_put(net, index);
return -ENOMEM;
}
static void em_ipset_destroy(struct tcf_ematch *em)
{
const struct xt_set_info *set = (const void *) em->data;
if (set) {
ip_set_nfnl_put(em->net, set->index);
kfree((void *) em->data);
}
}
static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct ip_set_adt_opt opt;
struct xt_action_param acpar;
const struct xt_set_info *set = (const void *) em->data;
struct net_device *dev, *indev = NULL;
int ret, network_offset;
switch (skb->protocol) {
case htons(ETH_P_IP):
acpar.family = NFPROTO_IPV4;
if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
return 0;
acpar.thoff = ip_hdrlen(skb);
break;
case htons(ETH_P_IPV6):
acpar.family = NFPROTO_IPV6;
if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr)))
return 0;
/* doesn't call ipv6_find_hdr() because ipset doesn't use thoff, yet */
acpar.thoff = sizeof(struct ipv6hdr);
break;
default:
return 0;
}
acpar.hooknum = 0;
opt.family = acpar.family;
opt.dim = set->dim;
opt.flags = set->flags;
opt.cmdflags = 0;
opt.ext.timeout = ~0u;
network_offset = skb_network_offset(skb);
skb_pull(skb, network_offset);
dev = skb->dev;
rcu_read_lock();
if (dev && skb->skb_iif)
indev = dev_get_by_index_rcu(dev_net(dev), skb->skb_iif);
acpar.in = indev ? indev : dev;
acpar.out = dev;
ret = ip_set_test(set->index, skb, &acpar, &opt);
rcu_read_unlock();
skb_push(skb, network_offset);
return ret;
}
static struct tcf_ematch_ops em_ipset_ops = {
.kind = TCF_EM_IPSET,
.change = em_ipset_change,
.destroy = em_ipset_destroy,
.match = em_ipset_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_ipset_ops.link)
};
static int __init init_em_ipset(void)
{
return tcf_em_register(&em_ipset_ops);
}
static void __exit exit_em_ipset(void)
{
tcf_em_unregister(&em_ipset_ops);
}
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_DESCRIPTION("TC extended match for IP sets");
module_init(init_em_ipset);
module_exit(exit_em_ipset);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPSET);

966
net/sched/em_meta.c Normal file
View file

@ -0,0 +1,966 @@
/*
* net/sched/em_meta.c Metadata ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
*
* The metadata ematch compares two meta objects where each object
* represents either a meta value stored in the kernel or a static
* value provided by userspace. The objects are not provided by
* userspace itself but rather a definition providing the information
* to build them. Every object is of a certain type which must be
* equal to the object it is being compared to.
*
* The definition of an object consists of the type (meta type), an
* identifier (meta id) and additional type specific information.
* The meta id is either TCF_META_ID_VALUE for values provided by
* userspace or an index to the meta operations table consisting of
* function pointers to type specific meta data collectors returning
* the value of the requested meta value.
*
* lvalue rvalue
* +-----------+ +-----------+
* | type: INT | | type: INT |
* def | id: DEV | | id: VALUE |
* | data: | | data: 3 |
* +-----------+ +-----------+
* | |
* ---> meta_ops[INT][DEV](...) |
* | |
* ----------- |
* V V
* +-----------+ +-----------+
* | type: INT | | type: INT |
* obj | id: DEV | | id: VALUE |
* | data: 2 |<--data got filled out | data: 3 |
* +-----------+ +-----------+
* | |
* --------------> 2 equals 3 <--------------
*
* This is a simplified schema, the complexity varies depending
* on the meta type. Obviously, the length of the data must also
* be provided for non-numeric types.
*
* Additionally, type dependent modifiers such as shift operators
* or masks may be applied to extend the functionality. As of now,
* the variable length type supports shifting the byte string to
* the right, eating up any number of octets and thus supporting
* wildcard interface name comparisons such as "ppp%" matching
* ppp0..9.
*
* NOTE: Certain meta values depend on other subsystems and are
* only available if that subsystem is enabled in the kernel.
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <linux/if_vlan.h>
#include <linux/tc_ematch/tc_em_meta.h>
#include <net/dst.h>
#include <net/route.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
struct meta_obj {
unsigned long value;
unsigned int len;
};
struct meta_value {
struct tcf_meta_val hdr;
unsigned long val;
unsigned int len;
};
struct meta_match {
struct meta_value lvalue;
struct meta_value rvalue;
};
static inline int meta_id(struct meta_value *v)
{
return TCF_META_ID(v->hdr.kind);
}
static inline int meta_type(struct meta_value *v)
{
return TCF_META_TYPE(v->hdr.kind);
}
#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \
struct tcf_pkt_info *info, struct meta_value *v, \
struct meta_obj *dst, int *err)
/**************************************************************************
* System status & misc
**************************************************************************/
META_COLLECTOR(int_random)
{
get_random_bytes(&dst->value, sizeof(dst->value));
}
static inline unsigned long fixed_loadavg(int load)
{
int rnd_load = load + (FIXED_1/200);
int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT;
return ((rnd_load >> FSHIFT) * 100) + rnd_frac;
}
META_COLLECTOR(int_loadavg_0)
{
dst->value = fixed_loadavg(avenrun[0]);
}
META_COLLECTOR(int_loadavg_1)
{
dst->value = fixed_loadavg(avenrun[1]);
}
META_COLLECTOR(int_loadavg_2)
{
dst->value = fixed_loadavg(avenrun[2]);
}
/**************************************************************************
* Device names & indices
**************************************************************************/
static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
{
if (unlikely(dev == NULL))
return -1;
dst->value = dev->ifindex;
return 0;
}
static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
{
if (unlikely(dev == NULL))
return -1;
dst->value = (unsigned long) dev->name;
dst->len = strlen(dev->name);
return 0;
}
META_COLLECTOR(int_dev)
{
*err = int_dev(skb->dev, dst);
}
META_COLLECTOR(var_dev)
{
*err = var_dev(skb->dev, dst);
}
/**************************************************************************
* vlan tag
**************************************************************************/
META_COLLECTOR(int_vlan_tag)
{
unsigned short tag;
tag = vlan_tx_tag_get(skb);
if (!tag && __vlan_get_tag(skb, &tag))
*err = -1;
else
dst->value = tag;
}
/**************************************************************************
* skb attributes
**************************************************************************/
META_COLLECTOR(int_priority)
{
dst->value = skb->priority;
}
META_COLLECTOR(int_protocol)
{
/* Let userspace take care of the byte ordering */
dst->value = skb->protocol;
}
META_COLLECTOR(int_pkttype)
{
dst->value = skb->pkt_type;
}
META_COLLECTOR(int_pktlen)
{
dst->value = skb->len;
}
META_COLLECTOR(int_datalen)
{
dst->value = skb->data_len;
}
META_COLLECTOR(int_maclen)
{
dst->value = skb->mac_len;
}
META_COLLECTOR(int_rxhash)
{
dst->value = skb_get_hash(skb);
}
/**************************************************************************
* Netfilter
**************************************************************************/
META_COLLECTOR(int_mark)
{
dst->value = skb->mark;
}
/**************************************************************************
* Traffic Control
**************************************************************************/
META_COLLECTOR(int_tcindex)
{
dst->value = skb->tc_index;
}
/**************************************************************************
* Routing
**************************************************************************/
META_COLLECTOR(int_rtclassid)
{
if (unlikely(skb_dst(skb) == NULL))
*err = -1;
else
#ifdef CONFIG_IP_ROUTE_CLASSID
dst->value = skb_dst(skb)->tclassid;
#else
dst->value = 0;
#endif
}
META_COLLECTOR(int_rtiif)
{
if (unlikely(skb_rtable(skb) == NULL))
*err = -1;
else
dst->value = inet_iif(skb);
}
/**************************************************************************
* Socket Attributes
**************************************************************************/
#define skip_nonlocal(skb) \
(unlikely(skb->sk == NULL))
META_COLLECTOR(int_sk_family)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_family;
}
META_COLLECTOR(int_sk_state)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_state;
}
META_COLLECTOR(int_sk_reuse)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_reuse;
}
META_COLLECTOR(int_sk_bound_if)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
/* No error if bound_dev_if is 0, legal userspace check */
dst->value = skb->sk->sk_bound_dev_if;
}
META_COLLECTOR(var_sk_bound_if)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
if (skb->sk->sk_bound_dev_if == 0) {
dst->value = (unsigned long) "any";
dst->len = 3;
} else {
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_index_rcu(sock_net(skb->sk),
skb->sk->sk_bound_dev_if);
*err = var_dev(dev, dst);
rcu_read_unlock();
}
}
META_COLLECTOR(int_sk_refcnt)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = atomic_read(&skb->sk->sk_refcnt);
}
META_COLLECTOR(int_sk_rcvbuf)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_rcvbuf;
}
META_COLLECTOR(int_sk_shutdown)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_shutdown;
}
META_COLLECTOR(int_sk_proto)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_protocol;
}
META_COLLECTOR(int_sk_type)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_type;
}
META_COLLECTOR(int_sk_rmem_alloc)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = sk_rmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_wmem_alloc)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = sk_wmem_alloc_get(skb->sk);
}
META_COLLECTOR(int_sk_omem_alloc)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = atomic_read(&skb->sk->sk_omem_alloc);
}
META_COLLECTOR(int_sk_rcv_qlen)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_receive_queue.qlen;
}
META_COLLECTOR(int_sk_snd_qlen)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_write_queue.qlen;
}
META_COLLECTOR(int_sk_wmem_queued)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_wmem_queued;
}
META_COLLECTOR(int_sk_fwd_alloc)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_forward_alloc;
}
META_COLLECTOR(int_sk_sndbuf)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_sndbuf;
}
META_COLLECTOR(int_sk_alloc)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = (__force int) skb->sk->sk_allocation;
}
META_COLLECTOR(int_sk_hash)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_hash;
}
META_COLLECTOR(int_sk_lingertime)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_lingertime / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_error_queue.qlen;
}
META_COLLECTOR(int_sk_ack_bl)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_ack_backlog;
}
META_COLLECTOR(int_sk_max_ack_bl)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_max_ack_backlog;
}
META_COLLECTOR(int_sk_prio)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_priority;
}
META_COLLECTOR(int_sk_rcvlowat)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_rcvlowat;
}
META_COLLECTOR(int_sk_rcvtimeo)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_rcvtimeo / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_sndtimeo / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_frag.offset;
}
META_COLLECTOR(int_sk_write_pend)
{
if (skip_nonlocal(skb)) {
*err = -1;
return;
}
dst->value = skb->sk->sk_write_pending;
}
/**************************************************************************
* Meta value collectors assignment table
**************************************************************************/
struct meta_ops {
void (*get)(struct sk_buff *, struct tcf_pkt_info *,
struct meta_value *, struct meta_obj *, int *);
};
#define META_ID(name) TCF_META_ID_##name
#define META_FUNC(name) { .get = meta_##name }
/* Meta value operations table listing all meta value collectors,
* assigning each to a type and meta id. */
static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
[META_ID(DEV)] = META_FUNC(var_dev),
[META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
},
[TCF_META_TYPE_INT] = {
[META_ID(RANDOM)] = META_FUNC(int_random),
[META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0),
[META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1),
[META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2),
[META_ID(DEV)] = META_FUNC(int_dev),
[META_ID(PRIORITY)] = META_FUNC(int_priority),
[META_ID(PROTOCOL)] = META_FUNC(int_protocol),
[META_ID(PKTTYPE)] = META_FUNC(int_pkttype),
[META_ID(PKTLEN)] = META_FUNC(int_pktlen),
[META_ID(DATALEN)] = META_FUNC(int_datalen),
[META_ID(MACLEN)] = META_FUNC(int_maclen),
[META_ID(NFMARK)] = META_FUNC(int_mark),
[META_ID(TCINDEX)] = META_FUNC(int_tcindex),
[META_ID(RTCLASSID)] = META_FUNC(int_rtclassid),
[META_ID(RTIIF)] = META_FUNC(int_rtiif),
[META_ID(SK_FAMILY)] = META_FUNC(int_sk_family),
[META_ID(SK_STATE)] = META_FUNC(int_sk_state),
[META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse),
[META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if),
[META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt),
[META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf),
[META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf),
[META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown),
[META_ID(SK_PROTO)] = META_FUNC(int_sk_proto),
[META_ID(SK_TYPE)] = META_FUNC(int_sk_type),
[META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc),
[META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc),
[META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc),
[META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued),
[META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen),
[META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen),
[META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen),
[META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc),
[META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc),
[META_ID(SK_HASH)] = META_FUNC(int_sk_hash),
[META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime),
[META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl),
[META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl),
[META_ID(SK_PRIO)] = META_FUNC(int_sk_prio),
[META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat),
[META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo),
[META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo),
[META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off),
[META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend),
[META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag),
[META_ID(RXHASH)] = META_FUNC(int_rxhash),
}
};
static inline struct meta_ops *meta_ops(struct meta_value *val)
{
return &__meta_ops[meta_type(val)][meta_id(val)];
}
/**************************************************************************
* Type specific operations for TCF_META_TYPE_VAR
**************************************************************************/
static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
{
int r = a->len - b->len;
if (r == 0)
r = memcmp((void *) a->value, (void *) b->value, a->len);
return r;
}
static int meta_var_change(struct meta_value *dst, struct nlattr *nla)
{
int len = nla_len(nla);
dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL);
if (dst->val == 0UL)
return -ENOMEM;
dst->len = len;
return 0;
}
static void meta_var_destroy(struct meta_value *v)
{
kfree((void *) v->val);
}
static void meta_var_apply_extras(struct meta_value *v,
struct meta_obj *dst)
{
int shift = v->hdr.shift;
if (shift && shift < dst->len)
dst->len -= shift;
}
static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->val && v->len &&
nla_put(skb, tlv, v->len, (void *) v->val))
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
/**************************************************************************
* Type specific operations for TCF_META_TYPE_INT
**************************************************************************/
static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
{
/* Let gcc optimize it; the unlikely() is not based on
* measured numbers, but jump-free code for mismatches seems
* more logical. */
if (unlikely(a->value == b->value))
return 0;
else if (a->value < b->value)
return -1;
else
return 1;
}
static int meta_int_change(struct meta_value *dst, struct nlattr *nla)
{
if (nla_len(nla) >= sizeof(unsigned long)) {
dst->val = *(unsigned long *) nla_data(nla);
dst->len = sizeof(unsigned long);
} else if (nla_len(nla) == sizeof(u32)) {
dst->val = nla_get_u32(nla);
dst->len = sizeof(u32);
} else
return -EINVAL;
return 0;
}
static void meta_int_apply_extras(struct meta_value *v,
struct meta_obj *dst)
{
if (v->hdr.shift)
dst->value >>= v->hdr.shift;
if (v->val)
dst->value &= v->val;
}
static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
{
if (v->len == sizeof(unsigned long)) {
if (nla_put(skb, tlv, sizeof(unsigned long), &v->val))
goto nla_put_failure;
} else if (v->len == sizeof(u32)) {
if (nla_put_u32(skb, tlv, v->val))
goto nla_put_failure;
}
return 0;
nla_put_failure:
return -1;
}
/**************************************************************************
* Type specific operations table
**************************************************************************/
struct meta_type_ops {
void (*destroy)(struct meta_value *);
int (*compare)(struct meta_obj *, struct meta_obj *);
int (*change)(struct meta_value *, struct nlattr *);
void (*apply_extras)(struct meta_value *, struct meta_obj *);
int (*dump)(struct sk_buff *, struct meta_value *, int);
};
static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
[TCF_META_TYPE_VAR] = {
.destroy = meta_var_destroy,
.compare = meta_var_compare,
.change = meta_var_change,
.apply_extras = meta_var_apply_extras,
.dump = meta_var_dump
},
[TCF_META_TYPE_INT] = {
.compare = meta_int_compare,
.change = meta_int_change,
.apply_extras = meta_int_apply_extras,
.dump = meta_int_dump
}
};
static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
{
return &__meta_type_ops[meta_type(v)];
}
/**************************************************************************
* Core
**************************************************************************/
static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
struct meta_value *v, struct meta_obj *dst)
{
int err = 0;
if (meta_id(v) == TCF_META_ID_VALUE) {
dst->value = v->val;
dst->len = v->len;
return 0;
}
meta_ops(v)->get(skb, info, v, dst, &err);
if (err < 0)
return err;
if (meta_type_ops(v)->apply_extras)
meta_type_ops(v)->apply_extras(v, dst);
return 0;
}
static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
struct tcf_pkt_info *info)
{
int r;
struct meta_match *meta = (struct meta_match *) m->data;
struct meta_obj l_value, r_value;
if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
meta_get(skb, info, &meta->rvalue, &r_value) < 0)
return 0;
r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
switch (meta->lvalue.hdr.op) {
case TCF_EM_OPND_EQ:
return !r;
case TCF_EM_OPND_LT:
return r < 0;
case TCF_EM_OPND_GT:
return r > 0;
}
return 0;
}
static void meta_delete(struct meta_match *meta)
{
if (meta) {
struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
if (ops && ops->destroy) {
ops->destroy(&meta->lvalue);
ops->destroy(&meta->rvalue);
}
}
kfree(meta);
}
static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
{
if (nla) {
if (nla_len(nla) == 0)
return -EINVAL;
return meta_type_ops(dst)->change(dst, nla);
}
return 0;
}
static inline int meta_is_supported(struct meta_value *val)
{
return !meta_id(val) || meta_ops(val)->get;
}
static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
[TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) },
};
static int em_meta_change(struct net *net, void *data, int len,
struct tcf_ematch *m)
{
int err;
struct nlattr *tb[TCA_EM_META_MAX + 1];
struct tcf_meta_hdr *hdr;
struct meta_match *meta = NULL;
err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy);
if (err < 0)
goto errout;
err = -EINVAL;
if (tb[TCA_EM_META_HDR] == NULL)
goto errout;
hdr = nla_data(tb[TCA_EM_META_HDR]);
if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
goto errout;
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
if (meta == NULL) {
err = -ENOMEM;
goto errout;
}
memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
if (!meta_is_supported(&meta->lvalue) ||
!meta_is_supported(&meta->rvalue)) {
err = -EOPNOTSUPP;
goto errout;
}
if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 ||
meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0)
goto errout;
m->datalen = sizeof(*meta);
m->data = (unsigned long) meta;
err = 0;
errout:
if (err && meta)
meta_delete(meta);
return err;
}
static void em_meta_destroy(struct tcf_ematch *m)
{
if (m)
meta_delete((struct meta_match *) m->data);
}
static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em)
{
struct meta_match *meta = (struct meta_match *) em->data;
struct tcf_meta_hdr hdr;
struct meta_type_ops *ops;
memset(&hdr, 0, sizeof(hdr));
memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left));
memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right));
if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr))
goto nla_put_failure;
ops = meta_type_ops(&meta->lvalue);
if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 ||
ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0)
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static struct tcf_ematch_ops em_meta_ops = {
.kind = TCF_EM_META,
.change = em_meta_change,
.match = em_meta_match,
.destroy = em_meta_destroy,
.dump = em_meta_dump,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_meta_ops.link)
};
static int __init init_em_meta(void)
{
return tcf_em_register(&em_meta_ops);
}
static void __exit exit_em_meta(void)
{
tcf_em_unregister(&em_meta_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_meta);
module_exit(exit_em_meta);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
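em_meta boils down to: collect two meta_obj values (a kernel-collected value, a userspace constant, or one of each), run the type's three-way compare(), then map the result through the lvalue's operand. The integer path, condensed into a userspace sketch with illustrative names:

#include <stdio.h>

enum { OPND_EQ, OPND_LT, OPND_GT };	/* mirrors TCF_EM_OPND_* */

/* Same three-way result as meta_int_compare() above. */
static int int_compare(unsigned long a, unsigned long b)
{
	if (a == b)
		return 0;
	return a < b ? -1 : 1;
}

/* Condensed shape of em_meta_match(): compare, then apply operand. */
static int meta_match(unsigned long lvalue, unsigned long rvalue, int opnd)
{
	int r = int_compare(lvalue, rvalue);

	switch (opnd) {
	case OPND_EQ: return !r;
	case OPND_LT: return r < 0;
	case OPND_GT: return r > 0;
	}
	return 0;
}

int main(void)
{
	/* e.g. "skb->priority gt 3": lvalue collected, rvalue constant */
	printf("%d\n", meta_match(5, 3, OPND_GT));
	return 0;
}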

80
net/sched/em_nbyte.c Normal file
View file

@ -0,0 +1,80 @@
/*
* net/sched/em_nbyte.c N-Byte ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/tc_ematch/tc_em_nbyte.h>
#include <net/pkt_cls.h>
struct nbyte_data {
struct tcf_em_nbyte hdr;
char pattern[0];
};
static int em_nbyte_change(struct net *net, void *data, int data_len,
struct tcf_ematch *em)
{
struct tcf_em_nbyte *nbyte = data;
if (data_len < sizeof(*nbyte) ||
data_len < (sizeof(*nbyte) + nbyte->len))
return -EINVAL;
em->datalen = sizeof(*nbyte) + nbyte->len;
em->data = (unsigned long)kmemdup(data, em->datalen, GFP_KERNEL);
if (em->data == 0UL)
return -ENOBUFS;
return 0;
}
static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct nbyte_data *nbyte = (struct nbyte_data *) em->data;
unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer);
ptr += nbyte->hdr.off;
if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len))
return 0;
/* ptr already includes hdr.off (added above); adding it again
* would read beyond the range checked by tcf_valid_offset() */
return !memcmp(ptr, nbyte->pattern, nbyte->hdr.len);
}
static struct tcf_ematch_ops em_nbyte_ops = {
.kind = TCF_EM_NBYTE,
.change = em_nbyte_change,
.match = em_nbyte_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_nbyte_ops.link)
};
static int __init init_em_nbyte(void)
{
return tcf_em_register(&em_nbyte_ops);
}
static void __exit exit_em_nbyte(void)
{
tcf_em_unregister(&em_nbyte_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_nbyte);
module_exit(exit_em_nbyte);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_NBYTE);
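With the offset applied once, em_nbyte is simply "does this byte pattern appear verbatim at layer base + offset". The same logic as a bounds-checked userspace sketch (nbyte_match() is an illustrative name):

#include <stdio.h>
#include <string.h>

/* Standalone restatement of em_nbyte_match(): the pattern must sit
 * entirely inside the buffer at base + off, then compare verbatim. */
static int nbyte_match(const unsigned char *base, size_t len,
		       size_t off, const void *pattern, size_t plen)
{
	if (off + plen > len)	/* tcf_valid_offset() equivalent */
		return 0;
	return memcmp(base + off, pattern, plen) == 0;
}

int main(void)
{
	unsigned char pkt[] = "GET / HTTP/1.1";

	printf("%d\n", nbyte_match(pkt, sizeof(pkt) - 1, 0, "GET", 3));
	return 0;
}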

158
net/sched/em_text.c Normal file
View file

@ -0,0 +1,158 @@
/*
* net/sched/em_text.c Textsearch ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/textsearch.h>
#include <linux/tc_ematch/tc_em_text.h>
#include <net/pkt_cls.h>
struct text_match {
u16 from_offset;
u16 to_offset;
u8 from_layer;
u8 to_layer;
struct ts_config *config;
};
#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
struct tcf_pkt_info *info)
{
struct text_match *tm = EM_TEXT_PRIV(m);
int from, to;
struct ts_state state;
from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data;
from += tm->from_offset;
to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data;
to += tm->to_offset;
return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX;
}
static int em_text_change(struct net *net, void *data, int len,
struct tcf_ematch *m)
{
struct text_match *tm;
struct tcf_em_text *conf = data;
struct ts_config *ts_conf;
int flags = 0;
if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
return -EINVAL;
if (conf->from_layer > conf->to_layer)
return -EINVAL;
if (conf->from_layer == conf->to_layer &&
conf->from_offset > conf->to_offset)
return -EINVAL;
retry:
ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
conf->pattern_len, GFP_KERNEL, flags);
if (flags & TS_AUTOLOAD)
rtnl_lock();
if (IS_ERR(ts_conf)) {
if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
rtnl_unlock();
flags |= TS_AUTOLOAD;
goto retry;
} else
return PTR_ERR(ts_conf);
} else if (flags & TS_AUTOLOAD) {
textsearch_destroy(ts_conf);
return -EAGAIN;
}
tm = kmalloc(sizeof(*tm), GFP_KERNEL);
if (tm == NULL) {
textsearch_destroy(ts_conf);
return -ENOBUFS;
}
tm->from_offset = conf->from_offset;
tm->to_offset = conf->to_offset;
tm->from_layer = conf->from_layer;
tm->to_layer = conf->to_layer;
tm->config = ts_conf;
m->datalen = sizeof(*tm);
m->data = (unsigned long) tm;
return 0;
}
static void em_text_destroy(struct tcf_ematch *m)
{
if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
textsearch_destroy(EM_TEXT_PRIV(m)->config);
}
static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
{
struct text_match *tm = EM_TEXT_PRIV(m);
struct tcf_em_text conf;
strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
conf.from_offset = tm->from_offset;
conf.to_offset = tm->to_offset;
conf.from_layer = tm->from_layer;
conf.to_layer = tm->to_layer;
conf.pattern_len = textsearch_get_pattern_len(tm->config);
conf.pad = 0;
if (nla_put_nohdr(skb, sizeof(conf), &conf) < 0)
goto nla_put_failure;
if (nla_append(skb, conf.pattern_len,
textsearch_get_pattern(tm->config)) < 0)
goto nla_put_failure;
return 0;
nla_put_failure:
return -1;
}
static struct tcf_ematch_ops em_text_ops = {
.kind = TCF_EM_TEXT,
.change = em_text_change,
.match = em_text_match,
.destroy = em_text_destroy,
.dump = em_text_dump,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_text_ops.link)
};
static int __init init_em_text(void)
{
return tcf_em_register(&em_text_ops);
}
static void __exit exit_em_text(void)
{
tcf_em_unregister(&em_text_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_text);
module_exit(exit_em_text);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_TEXT);
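em_text is a thin wrapper around the kernel textsearch API: build a ts_config once at configure time, scan a window per packet, destroy it on teardown. A minimal in-kernel-style sketch of those calls, assuming the Boyer-Moore ("bm") algorithm is available; error handling is abbreviated:

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/textsearch.h>

/* Prepare a search config, scan one flat buffer, tear down again.
 * em_text keeps the config for the ematch's lifetime instead. */
static int example_scan(const void *buf, unsigned int len)
{
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	conf = textsearch_prepare("bm", "needle", 6, GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return PTR_ERR(conf);

	pos = textsearch_find_continuous(conf, &state, buf, len);
	textsearch_destroy(conf);

	return pos != UINT_MAX;	/* 1 if "needle" was found */
}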

64
net/sched/em_u32.c Normal file
View file

@ -0,0 +1,64 @@
/*
* net/sched/em_u32.c U32 Ematch
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Based on net/sched/cls_u32.c
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
struct tc_u32_key *key = (struct tc_u32_key *) em->data;
const unsigned char *ptr = skb_network_header(skb);
if (info) {
if (info->ptr)
ptr = info->ptr;
ptr += (info->nexthdr & key->offmask);
}
ptr += key->off;
if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
return 0;
return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
}
static struct tcf_ematch_ops em_u32_ops = {
.kind = TCF_EM_U32,
.datalen = sizeof(struct tc_u32_key),
.match = em_u32_match,
.owner = THIS_MODULE,
.link = LIST_HEAD_INIT(em_u32_ops.link)
};
static int __init init_em_u32(void)
{
return tcf_em_register(&em_u32_ops);
}
static void __exit exit_em_u32(void)
{
tcf_em_unregister(&em_u32_ops);
}
MODULE_LICENSE("GPL");
module_init(init_em_u32);
module_exit(exit_em_u32);
MODULE_ALIAS_TCF_EMATCH(TCF_EM_U32);
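em_u32 is a single masked 32-bit compare at a configurable offset: the packet word matches when (word ^ key->val) & key->mask is zero. Restated in userspace with example values (u32_key_match() is an illustrative name):

#include <stdio.h>

/* Same test em_u32_match() performs: every bit selected by mask must
 * equal the corresponding bit of val. */
static int u32_key_match(unsigned int word, unsigned int val,
			 unsigned int mask)
{
	return ((word ^ val) & mask) == 0;
}

int main(void)
{
	/* illustrative values only: check one byte inside a 32-bit word */
	printf("%d\n", u32_key_match(0x40060000, 0x00060000, 0x00FF0000));
	return 0;
}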

549
net/sched/ematch.c Normal file
View file

@ -0,0 +1,549 @@
/*
* net/sched/ematch.c Extended Match API
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* ==========================================================================
*
* An extended match (ematch) is a small classification tool not worth
* writing a full classifier for. Ematches can be interconnected to form
* a logic expression and get attached to classifiers to extend their
* functionality.
*
* The userspace part transforms the logic expressions into an array
* consisting of multiple sequences of interconnected ematches separated
* by markers. Precedence is implemented by a special ematch kind
* referencing a sequence beyond the marker of the current sequence
* causing the current position in the sequence to be pushed onto a stack
* to allow the current position to be overwritten by the position referenced
* in the special ematch. Matching continues in the new sequence until a
* marker is reached causing the position to be restored from the stack.
*
* Example:
* A AND (B1 OR B2) AND C AND D
*
* ------->-PUSH-------
* -->-- / -->-- \ -->--
* / \ / / \ \ / \
* +-------+-------+-------+-------+-------+--------+
* | A AND | B AND | C AND | D END | B1 OR | B2 END |
* +-------+-------+-------+-------+-------+--------+
* \ /
* --------<-POP---------
*
* where B is a virtual ematch referencing the sequence starting with B1.
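*
* An illustrative trace (editor's addition, values assumed): suppose A
* matches, B1 fails and B2 matches. The walker matches A, reaches the
* virtual ematch B, pushes its position and jumps to the B1 sequence.
* B1 fails but is OR-linked, so B2 is evaluated and succeeds at its END
* marker; the position is popped, B evaluates to true, and matching
* resumes with C and D in the outer sequence.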
*
* ==========================================================================
*
* How to write an ematch in 60 seconds
* ------------------------------------
*
* 1) Provide a matcher function:
* static int my_match(struct sk_buff *skb, struct tcf_ematch *m,
* struct tcf_pkt_info *info)
* {
* struct mydata *d = (struct mydata *) m->data;
*
* if (...matching goes here...)
* return 1;
* else
* return 0;
* }
*
* 2) Fill out a struct tcf_ematch_ops:
* static struct tcf_ematch_ops my_ops = {
* .kind = unique id,
* .datalen = sizeof(struct mydata),
* .match = my_match,
* .owner = THIS_MODULE,
* };
*
* 3) Register/Unregister your ematch:
* static int __init init_my_ematch(void)
* {
* return tcf_em_register(&my_ops);
* }
*
* static void __exit exit_my_ematch(void)
* {
* tcf_em_unregister(&my_ops);
* }
*
* module_init(init_my_ematch);
* module_exit(exit_my_ematch);
*
* 4) By now you should have two more seconds left, barely enough to
* open up a beer to watch the compilation going.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
static LIST_HEAD(ematch_ops);
static DEFINE_RWLOCK(ematch_mod_lock);
static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
{
struct tcf_ematch_ops *e = NULL;
read_lock(&ematch_mod_lock);
list_for_each_entry(e, &ematch_ops, link) {
if (kind == e->kind) {
if (!try_module_get(e->owner))
e = NULL;
read_unlock(&ematch_mod_lock);
return e;
}
}
read_unlock(&ematch_mod_lock);
return NULL;
}
/**
* tcf_em_register - register an extended match
*
* @ops: ematch operations lookup table
*
* This function must be called by ematches to announce their presence.
* The given @ops must have kind set to a unique identifier and the
* callback match() must be implemented. All other callbacks are optional
* and a fallback implementation is used instead.
*
* Returns -EEXIST if an ematch of the same kind has already been registered.
*/
int tcf_em_register(struct tcf_ematch_ops *ops)
{
int err = -EEXIST;
struct tcf_ematch_ops *e;
if (ops->match == NULL)
return -EINVAL;
write_lock(&ematch_mod_lock);
list_for_each_entry(e, &ematch_ops, link)
if (ops->kind == e->kind)
goto errout;
list_add_tail(&ops->link, &ematch_ops);
err = 0;
errout:
write_unlock(&ematch_mod_lock);
return err;
}
EXPORT_SYMBOL(tcf_em_register);
/**
* tcf_em_unregister - unregister an extended match
*
* @ops: ematch operations lookup table
*
* This function must be called by ematches to announce their disappearance
* for example when the module gets unloaded. The @ops parameter must be
* the same as the one used for registration.
*/
void tcf_em_unregister(struct tcf_ematch_ops *ops)
{
write_lock(&ematch_mod_lock);
list_del(&ops->link);
write_unlock(&ematch_mod_lock);
}
EXPORT_SYMBOL(tcf_em_unregister);
static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
int index)
{
return &tree->matches[index];
}
static int tcf_em_validate(struct tcf_proto *tp,
struct tcf_ematch_tree_hdr *tree_hdr,
struct tcf_ematch *em, struct nlattr *nla, int idx)
{
int err = -EINVAL;
struct tcf_ematch_hdr *em_hdr = nla_data(nla);
int data_len = nla_len(nla) - sizeof(*em_hdr);
void *data = (void *) em_hdr + sizeof(*em_hdr);
struct net *net = dev_net(qdisc_dev(tp->q));
if (!TCF_EM_REL_VALID(em_hdr->flags))
goto errout;
if (em_hdr->kind == TCF_EM_CONTAINER) {
/* Special ematch called "container", carries an index
* referencing an external ematch sequence.
*/
u32 ref;
if (data_len < sizeof(ref))
goto errout;
ref = *(u32 *) data;
if (ref >= tree_hdr->nmatches)
goto errout;
/* We do not allow backward jumps to avoid loops, and jumps
* to our own position are of course illegal.
*/
if (ref <= idx)
goto errout;
em->data = ref;
} else {
/* Note: This lookup will increase the module refcnt
* of the ematch module referenced. In case of a failure,
* a destroy function is called by the underlying layer
* which automatically releases the reference again, therefore
* the module MUST not be given back under any circumstances
* here. Be aware, the destroy function assumes that the
* module is held if the ops field is non zero.
*/
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops == NULL) {
err = -ENOENT;
#ifdef CONFIG_MODULES
__rtnl_unlock();
request_module("ematch-kind-%u", em_hdr->kind);
rtnl_lock();
em->ops = tcf_em_lookup(em_hdr->kind);
if (em->ops) {
/* We dropped the RTNL mutex in order to
* perform the module load. Tell the caller
* to replay the request.
*/
module_put(em->ops->owner);
em->ops = NULL;
err = -EAGAIN;
}
#endif
goto errout;
}
/* ematch module provides expected length of data, so we
* can do a basic sanity check.
*/
if (em->ops->datalen && data_len < em->ops->datalen)
goto errout;
if (em->ops->change) {
err = em->ops->change(net, data, data_len, em);
if (err < 0)
goto errout;
} else if (data_len > 0) {
/* The ematch module doesn't provide its own change
* procedure and expects us to allocate and copy
* the ematch data.
*
* TCF_EM_SIMPLE may be specified stating that the
* data only consists of a u32 integer and the module
* does not expect a memory reference but rather
* the value carried.
*/
if (em_hdr->flags & TCF_EM_SIMPLE) {
if (data_len < sizeof(u32))
goto errout;
em->data = *(u32 *) data;
} else {
void *v = kmemdup(data, data_len, GFP_KERNEL);
if (v == NULL) {
err = -ENOBUFS;
goto errout;
}
em->data = (unsigned long) v;
}
}
}
em->matchid = em_hdr->matchid;
em->flags = em_hdr->flags;
em->datalen = data_len;
em->net = net;
err = 0;
errout:
return err;
}
static const struct nla_policy em_policy[TCA_EMATCH_TREE_MAX + 1] = {
[TCA_EMATCH_TREE_HDR] = { .len = sizeof(struct tcf_ematch_tree_hdr) },
[TCA_EMATCH_TREE_LIST] = { .type = NLA_NESTED },
};
/**
* tcf_em_tree_validate - validate ematch config TLV and build ematch tree
*
* @tp: classifier kind handle
* @nla: ematch tree configuration TLV
* @tree: destination ematch tree variable to store the resulting
* ematch tree.
*
* This function validates the given configuration TLV @nla and builds an
* ematch tree in @tree. The resulting tree must later be copied into
* the private classifier data using tcf_em_tree_change(). You MUST NOT
* provide the ematch tree variable of the private classifier data directly,
* the changes would not be locked properly.
*
* Returns a negative error code if the configuration TLV contains errors.
*/
int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
struct tcf_ematch_tree *tree)
{
int idx, list_len, matches_len, err;
struct nlattr *tb[TCA_EMATCH_TREE_MAX + 1];
struct nlattr *rt_match, *rt_hdr, *rt_list;
struct tcf_ematch_tree_hdr *tree_hdr;
struct tcf_ematch *em;
memset(tree, 0, sizeof(*tree));
if (!nla)
return 0;
err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy);
if (err < 0)
goto errout;
err = -EINVAL;
rt_hdr = tb[TCA_EMATCH_TREE_HDR];
rt_list = tb[TCA_EMATCH_TREE_LIST];
if (rt_hdr == NULL || rt_list == NULL)
goto errout;
tree_hdr = nla_data(rt_hdr);
memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr));
rt_match = nla_data(rt_list);
list_len = nla_len(rt_list);
matches_len = tree_hdr->nmatches * sizeof(*em);
tree->matches = kzalloc(matches_len, GFP_KERNEL);
if (tree->matches == NULL)
goto errout;
/* We do not use nla_parse_nested here because the maximum
* number of attributes is unknown. This saves us the allocation
* for a tb buffer which would serve no purpose at all.
*
* The array of rt attributes is parsed in the order in which they
* are provided; their types must increase monotonically from 1 to n.
* Even though this serves no real purpose, failing to stick to the
* policy results in a parsing failure.
*/
for (idx = 0; nla_ok(rt_match, list_len); idx++) {
err = -EINVAL;
if (rt_match->nla_type != (idx + 1))
goto errout_abort;
if (idx >= tree_hdr->nmatches)
goto errout_abort;
if (nla_len(rt_match) < sizeof(struct tcf_ematch_hdr))
goto errout_abort;
em = tcf_em_get_match(tree, idx);
err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx);
if (err < 0)
goto errout_abort;
rt_match = nla_next(rt_match, &list_len);
}
/* Check if the number of matches provided by userspace actually
* complies with the array of matches. The number was used for
* the validation of references and a mismatch could lead to
* undefined references during the matching process.
*/
if (idx != tree_hdr->nmatches) {
err = -EINVAL;
goto errout_abort;
}
err = 0;
errout:
return err;
errout_abort:
tcf_em_tree_destroy(tree);
return err;
}
EXPORT_SYMBOL(tcf_em_tree_validate);
/**
* tcf_em_tree_destroy - destroy an ematch tree
*
* @tree: ematch tree to be deleted
*
* This function destroys an ematch tree previously created by
* tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
* the ematch tree is not in use before calling this function.
*/
void tcf_em_tree_destroy(struct tcf_ematch_tree *tree)
{
int i;
if (tree->matches == NULL)
return;
for (i = 0; i < tree->hdr.nmatches; i++) {
struct tcf_ematch *em = tcf_em_get_match(tree, i);
if (em->ops) {
if (em->ops->destroy)
em->ops->destroy(em);
else if (!tcf_em_is_simple(em))
kfree((void *) em->data);
module_put(em->ops->owner);
}
}
tree->hdr.nmatches = 0;
kfree(tree->matches);
tree->matches = NULL;
}
EXPORT_SYMBOL(tcf_em_tree_destroy);
/**
* tcf_em_tree_dump - dump an ematch tree into an rtnl message
*
* @skb: skb holding the rtnl message
* @t: ematch tree to be dumped
* @tlv: TLV type to be used to encapsulate the tree
*
* This function dumps an ematch tree into an rtnl message. It is valid to
* call this function while the ematch tree is in use.
*
* Returns -1 if the skb tailroom is insufficient.
*/
int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
{
int i;
u8 *tail;
struct nlattr *top_start;
struct nlattr *list_start;
top_start = nla_nest_start(skb, tlv);
if (top_start == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr))
goto nla_put_failure;
list_start = nla_nest_start(skb, TCA_EMATCH_TREE_LIST);
if (list_start == NULL)
goto nla_put_failure;
tail = skb_tail_pointer(skb);
for (i = 0; i < tree->hdr.nmatches; i++) {
struct nlattr *match_start = (struct nlattr *)tail;
struct tcf_ematch *em = tcf_em_get_match(tree, i);
struct tcf_ematch_hdr em_hdr = {
.kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER,
.matchid = em->matchid,
.flags = em->flags
};
if (nla_put(skb, i + 1, sizeof(em_hdr), &em_hdr))
goto nla_put_failure;
if (em->ops && em->ops->dump) {
if (em->ops->dump(skb, em) < 0)
goto nla_put_failure;
} else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) {
u32 u = em->data;
nla_put_nohdr(skb, sizeof(u), &u);
} else if (em->datalen > 0)
nla_put_nohdr(skb, em->datalen, (void *) em->data);
tail = skb_tail_pointer(skb);
match_start->nla_len = tail - (u8 *)match_start;
}
nla_nest_end(skb, list_start);
nla_nest_end(skb, top_start);
return 0;
nla_put_failure:
return -1;
}
EXPORT_SYMBOL(tcf_em_tree_dump);
static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
struct tcf_pkt_info *info)
{
int r = em->ops->match(skb, em, info);
return tcf_em_is_inverted(em) ? !r : r;
}
/* Do not use this function directly, use tcf_em_tree_match instead */
int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree,
struct tcf_pkt_info *info)
{
int stackp = 0, match_idx = 0, res = 0;
struct tcf_ematch *cur_match;
int stack[CONFIG_NET_EMATCH_STACK];
proceed:
while (match_idx < tree->hdr.nmatches) {
cur_match = tcf_em_get_match(tree, match_idx);
if (tcf_em_is_container(cur_match)) {
if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK))
goto stack_overflow;
stack[stackp++] = match_idx;
match_idx = cur_match->data;
goto proceed;
}
res = tcf_em_match(skb, cur_match, info);
if (tcf_em_early_end(cur_match, res))
break;
match_idx++;
}
pop_stack:
if (stackp > 0) {
match_idx = stack[--stackp];
cur_match = tcf_em_get_match(tree, match_idx);
if (tcf_em_is_inverted(cur_match))
res = !res;
if (tcf_em_early_end(cur_match, res)) {
goto pop_stack;
} else {
match_idx++;
goto proceed;
}
}
return res;
stack_overflow:
net_warn_ratelimited("tc ematch: local stack overflow, increase NET_EMATCH_STACK\n");
return -1;
}
EXPORT_SYMBOL(__tcf_em_tree_match);

1993
net/sched/sch_api.c Normal file

File diff suppressed because it is too large

694
net/sched/sch_atm.c Normal file

@@ -0,0 +1,694 @@
/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/atmdev.h>
#include <linux/atmclip.h>
#include <linux/rtnetlink.h>
#include <linux/file.h> /* for fput */
#include <net/netlink.h>
#include <net/pkt_sched.h>
/*
* The ATM queuing discipline provides a framework for invoking classifiers
* (aka "filters"), which in turn select classes of this queuing discipline.
* Each class maps the flow(s) it is handling to a given VC. Multiple classes
* may share the same VC.
*
* When creating a class, VCs are specified by passing the number of the open
* socket descriptor by which the calling process references the VC. The kernel
* keeps the VC open at least until all classes using it are removed.
*
* In this file, most functions are named atm_tc_* to avoid confusion with all
* the atm_* in net/atm. This naming convention differs from what's used in the
* rest of net/sched.
*
* Known bugs:
* - sometimes messes up the IP stack
* - any manipulations besides the few operations described in the README, are
* untested and likely to crash the system
* - should lock the flow while there is data in the queue (?)
*/
#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back))
struct atm_flow_data {
struct Qdisc *q; /* FIFO, TBF, etc. */
struct tcf_proto __rcu *filter_list;
struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */
void (*old_pop)(struct atm_vcc *vcc,
struct sk_buff *skb); /* chaining */
struct atm_qdisc_data *parent; /* parent qdisc */
struct socket *sock; /* for closing */
u32 classid; /* x:y type ID */
int ref; /* reference count */
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct list_head list;
struct atm_flow_data *excess; /* flow for excess traffic;
NULL to set CLP instead */
int hdr_len;
unsigned char hdr[0]; /* header data; MUST BE LAST */
};
struct atm_qdisc_data {
struct atm_flow_data link; /* unclassified skbs go here */
struct list_head flows; /* NB: "link" is also on this
list */
struct tasklet_struct task; /* dequeue tasklet */
};
/* ------------------------- Class/flow operations ------------------------- */
static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
list_for_each_entry(flow, &p->flows, list) {
if (flow->classid == classid)
return flow;
}
return NULL;
}
static int atm_tc_graft(struct Qdisc *sch, unsigned long arg,
struct Qdisc *new, struct Qdisc **old)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",
sch, p, flow, new, old);
if (list_empty(&flow->list))
return -EINVAL;
if (!new)
new = &noop_qdisc;
*old = flow->q;
flow->q = new;
if (*old)
qdisc_reset(*old);
return 0;
}
static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl)
{
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow);
return flow ? flow->q : NULL;
}
static unsigned long atm_tc_get(struct Qdisc *sch, u32 classid)
{
struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch);
struct atm_flow_data *flow;
pr_debug("atm_tc_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid);
flow = lookup_flow(sch, classid);
if (flow)
flow->ref++;
pr_debug("atm_tc_get: flow %p\n", flow);
return (unsigned long)flow;
}
static unsigned long atm_tc_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return atm_tc_get(sch, classid);
}
/*
* atm_tc_put handles all destructions, including the ones that are explicitly
* requested (atm_tc_destroy, etc.). The assumption here is that we never drop
* anything that still seems to be in use.
*/
static void atm_tc_put(struct Qdisc *sch, unsigned long cl)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (--flow->ref)
return;
pr_debug("atm_tc_put: destroying\n");
list_del_init(&flow->list);
pr_debug("atm_tc_put: qdisc %p\n", flow->q);
qdisc_destroy(flow->q);
tcf_destroy_chain(&flow->filter_list);
if (flow->sock) {
pr_debug("atm_tc_put: f_count %ld\n",
file_count(flow->sock->file));
flow->vcc->pop = flow->old_pop;
sockfd_put(flow->sock);
}
if (flow->excess)
atm_tc_put(sch, (unsigned long)flow->excess);
if (flow != &p->link)
kfree(flow);
/*
* If flow == &p->link, the qdisc no longer works at this point and
* needs to be removed. (By the caller of atm_tc_put.)
*/
}
static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent;
pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p);
VCC2FLOW(vcc)->old_pop(vcc, skb);
tasklet_schedule(&p->task);
}
static const u8 llc_oui_ip[] = {
0xaa, /* DSAP: non-ISO */
0xaa, /* SSAP: non-ISO */
0x03, /* Ctrl: Unnumbered Information Command PDU */
0x00, /* OUI: EtherType */
0x00, 0x00,
0x08, 0x00
}; /* Ethertype IP (0800) */
static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = {
[TCA_ATM_FD] = { .type = NLA_U32 },
[TCA_ATM_EXCESS] = { .type = NLA_U32 },
};
static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent,
struct nlattr **tca, unsigned long *arg)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)*arg;
struct atm_flow_data *excess = NULL;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_ATM_MAX + 1];
struct socket *sock;
int fd, error, hdr_len;
void *hdr;
pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x,"
"flow %p,opt %p)\n", sch, p, classid, parent, flow, opt);
/*
* The concept of parents doesn't apply for this qdisc.
*/
if (parent && parent != TC_H_ROOT && parent != sch->handle)
return -EINVAL;
/*
* ATM classes cannot be changed. In order to change properties of the
* ATM connection, that socket needs to be modified directly (via the
* native ATM API). In order to send a flow to a different VC, the old
* class needs to be removed and a new one added. (This may be changed
* later.)
*/
if (flow)
return -EBUSY;
if (opt == NULL)
return -EINVAL;
error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy);
if (error < 0)
return error;
if (!tb[TCA_ATM_FD])
return -EINVAL;
fd = nla_get_u32(tb[TCA_ATM_FD]);
pr_debug("atm_tc_change: fd %d\n", fd);
if (tb[TCA_ATM_HDR]) {
hdr_len = nla_len(tb[TCA_ATM_HDR]);
hdr = nla_data(tb[TCA_ATM_HDR]);
} else {
hdr_len = RFC1483LLC_LEN;
hdr = NULL; /* default LLC/SNAP for IP */
}
if (!tb[TCA_ATM_EXCESS])
excess = NULL;
else {
excess = (struct atm_flow_data *)
atm_tc_get(sch, nla_get_u32(tb[TCA_ATM_EXCESS]));
if (!excess)
return -ENOENT;
}
pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n",
opt->nla_type, nla_len(opt), hdr_len);
sock = sockfd_lookup(fd, &error);
if (!sock)
return error; /* f_count++ */
pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file));
if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) {
error = -EPROTOTYPE;
goto err_out;
}
/* @@@ should check if the socket is really operational or we'll crash
on vcc->send */
if (classid) {
if (TC_H_MAJ(classid ^ sch->handle)) {
pr_debug("atm_tc_change: classid mismatch\n");
error = -EINVAL;
goto err_out;
}
} else {
int i;
unsigned long cl;
for (i = 1; i < 0x8000; i++) {
classid = TC_H_MAKE(sch->handle, 0x8000 | i);
cl = atm_tc_get(sch, classid);
if (!cl)
break;
atm_tc_put(sch, cl);
}
}
pr_debug("atm_tc_change: new id %x\n", classid);
flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL);
pr_debug("atm_tc_change: flow %p\n", flow);
if (!flow) {
error = -ENOBUFS;
goto err_out;
}
RCU_INIT_POINTER(flow->filter_list, NULL);
flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
if (!flow->q)
flow->q = &noop_qdisc;
pr_debug("atm_tc_change: qdisc %p\n", flow->q);
flow->sock = sock;
flow->vcc = ATM_SD(sock); /* speedup */
flow->vcc->user_back = flow;
pr_debug("atm_tc_change: vcc %p\n", flow->vcc);
flow->old_pop = flow->vcc->pop;
flow->parent = p;
flow->vcc->pop = sch_atm_pop;
flow->classid = classid;
flow->ref = 1;
flow->excess = excess;
list_add(&flow->list, &p->link.list);
flow->hdr_len = hdr_len;
if (hdr)
memcpy(flow->hdr, hdr, hdr_len);
else
memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip));
*arg = (unsigned long)flow;
return 0;
err_out:
if (excess)
atm_tc_put(sch, (unsigned long)excess);
sockfd_put(sock);
return error;
}
static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
if (list_empty(&flow->list))
return -EINVAL;
if (rcu_access_pointer(flow->filter_list) || flow == &p->link)
return -EBUSY;
/*
* Reference count must be 2: one for "keepalive" (set at class
* creation), and one for the reference held when calling delete.
*/
if (flow->ref < 2) {
pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
return -EINVAL;
}
if (flow->ref > 2)
return -EBUSY; /* catch references via excess, etc. */
atm_tc_put(sch, arg);
return 0;
}
static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker);
if (walker->stop)
return;
list_for_each_entry(flow, &p->flows, list) {
if (walker->count >= walker->skip &&
walker->fn(sch, (unsigned long)flow, walker) < 0) {
walker->stop = 1;
break;
}
walker->count++;
}
}
static struct tcf_proto __rcu **atm_tc_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow);
return flow ? &flow->filter_list : &p->link.filter_list;
}
/* --------------------------- Qdisc operations ---------------------------- */
static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
struct tcf_result res;
int result;
int ret = NET_XMIT_POLICED;
pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p);
result = TC_POLICE_OK; /* be nice to gcc */
flow = NULL;
if (TC_H_MAJ(skb->priority) != sch->handle ||
!(flow = (struct atm_flow_data *)atm_tc_get(sch, skb->priority))) {
struct tcf_proto *fl;
list_for_each_entry(flow, &p->flows, list) {
fl = rcu_dereference_bh(flow->filter_list);
if (fl) {
result = tc_classify_compat(skb, fl, &res);
if (result < 0)
continue;
flow = (struct atm_flow_data *)res.class;
if (!flow)
flow = lookup_flow(sch, res.classid);
goto done;
}
}
flow = NULL;
done:
;
}
if (!flow) {
flow = &p->link;
} else {
if (flow->vcc)
ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
/*@@@ looks good ... but it's not supposed to work :-) */
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
kfree_skb(skb);
goto drop;
case TC_POLICE_RECLASSIFY:
if (flow->excess)
flow = flow->excess;
else
ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP;
break;
}
#endif
}
ret = qdisc_enqueue(skb, flow->q);
if (ret != NET_XMIT_SUCCESS) {
drop: __maybe_unused
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
if (flow)
flow->qstats.drops++;
}
return ret;
}
/*
* Okay, this may seem weird. We pretend we've dropped the packet if
* it goes via ATM. The reason for this is that the outer qdisc
* expects to be able to q->dequeue the packet later on if we return
* success at this place. Also, sch->q.qlen needs to reflect whether
* there is a packet eligible for dequeuing or not. Note that the
* statistics of the outer qdisc are necessarily wrong because of all
* this. There's currently no correct solution for this.
*/
if (flow == &p->link) {
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
tasklet_schedule(&p->task);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
/*
* Dequeue packets and send them over ATM. Note that we quite deliberately
* avoid checking net_device's flow control here, simply because sch_atm
* uses its own channels, which have nothing to do with any CLIP, LANE,
* or non-ATM interfaces.
*/
static void sch_atm_dequeue(unsigned long data)
{
struct Qdisc *sch = (struct Qdisc *)data;
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
struct sk_buff *skb;
pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p);
list_for_each_entry(flow, &p->flows, list) {
if (flow == &p->link)
continue;
/*
* If traffic is properly shaped, this won't generate nasty
* little bursts. Otherwise, it may ... (but that's okay)
*/
while ((skb = flow->q->ops->peek(flow->q))) {
if (!atm_may_send(flow->vcc, skb->truesize))
break;
skb = qdisc_dequeue_peeked(flow->q);
if (unlikely(!skb))
break;
qdisc_bstats_update(sch, skb);
bstats_update(&flow->bstats, skb);
pr_debug("atm_tc_dequeue: sending on class %p\n", flow);
/* remove any LL header somebody else has attached */
skb_pull(skb, skb_network_offset(skb));
if (skb_headroom(skb) < flow->hdr_len) {
struct sk_buff *new;
new = skb_realloc_headroom(skb, flow->hdr_len);
dev_kfree_skb(skb);
if (!new)
continue;
skb = new;
}
pr_debug("sch_atm_dequeue: ip %p, data %p\n",
skb_network_header(skb), skb->data);
ATM_SKB(skb)->vcc = flow->vcc;
memcpy(skb_push(skb, flow->hdr_len), flow->hdr,
flow->hdr_len);
atomic_add(skb->truesize,
&sk_atm(flow->vcc)->sk_wmem_alloc);
/* atm.atm_options are already set by atm_tc_enqueue */
flow->vcc->send(flow->vcc, skb);
}
}
}
static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct sk_buff *skb;
pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p);
tasklet_schedule(&p->task);
skb = qdisc_dequeue_peeked(p->link.q);
if (skb)
sch->q.qlen--;
return skb;
}
static struct sk_buff *atm_tc_peek(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p);
return p->link.q->ops->peek(p->link.q);
}
static unsigned int atm_tc_drop(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
unsigned int len;
pr_debug("atm_tc_drop(sch %p,[qdisc %p])\n", sch, p);
list_for_each_entry(flow, &p->flows, list) {
if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q)))
return len;
}
return 0;
}
static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt);
INIT_LIST_HEAD(&p->flows);
INIT_LIST_HEAD(&p->link.list);
list_add(&p->link.list, &p->flows);
p->link.q = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, sch->handle);
if (!p->link.q)
p->link.q = &noop_qdisc;
pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q);
RCU_INIT_POINTER(p->link.filter_list, NULL);
p->link.vcc = NULL;
p->link.sock = NULL;
p->link.classid = sch->handle;
p->link.ref = 1;
tasklet_init(&p->task, sch_atm_dequeue, (unsigned long)sch);
return 0;
}
static void atm_tc_reset(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow;
pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p);
list_for_each_entry(flow, &p->flows, list)
qdisc_reset(flow->q);
sch->q.qlen = 0;
}
static void atm_tc_destroy(struct Qdisc *sch)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow, *tmp;
pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
list_for_each_entry(flow, &p->flows, list)
tcf_destroy_chain(&flow->filter_list);
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
atm_tc_put(sch, (unsigned long)flow);
}
tasklet_kill(&p->task);
}
static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct atm_qdisc_data *p = qdisc_priv(sch);
struct atm_flow_data *flow = (struct atm_flow_data *)cl;
struct nlattr *nest;
pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n",
sch, p, flow, skb, tcm);
if (list_empty(&flow->list))
return -EINVAL;
tcm->tcm_handle = flow->classid;
tcm->tcm_info = flow->q->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr))
goto nla_put_failure;
if (flow->vcc) {
struct sockaddr_atmpvc pvc;
int state;
memset(&pvc, 0, sizeof(pvc));
pvc.sap_family = AF_ATMPVC;
pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1;
pvc.sap_addr.vpi = flow->vcc->vpi;
pvc.sap_addr.vci = flow->vcc->vci;
if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc))
goto nla_put_failure;
state = ATM_VF2VS(flow->vcc->flags);
if (nla_put_u32(skb, TCA_ATM_STATE, state))
goto nla_put_failure;
}
if (flow->excess) {
if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->classid))
goto nla_put_failure;
} else {
if (nla_put_u32(skb, TCA_ATM_EXCESS, 0))
goto nla_put_failure;
}
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static int
atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct atm_flow_data *flow = (struct atm_flow_data *)arg;
if (gnet_stats_copy_basic(d, NULL, &flow->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0)
return -1;
return 0;
}
static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb)
{
return 0;
}
static const struct Qdisc_class_ops atm_class_ops = {
.graft = atm_tc_graft,
.leaf = atm_tc_leaf,
.get = atm_tc_get,
.put = atm_tc_put,
.change = atm_tc_change,
.delete = atm_tc_delete,
.walk = atm_tc_walk,
.tcf_chain = atm_tc_find_tcf,
.bind_tcf = atm_tc_bind_filter,
.unbind_tcf = atm_tc_put,
.dump = atm_tc_dump_class,
.dump_stats = atm_tc_dump_class_stats,
};
static struct Qdisc_ops atm_qdisc_ops __read_mostly = {
.cl_ops = &atm_class_ops,
.id = "atm",
.priv_size = sizeof(struct atm_qdisc_data),
.enqueue = atm_tc_enqueue,
.dequeue = atm_tc_dequeue,
.peek = atm_tc_peek,
.drop = atm_tc_drop,
.init = atm_tc_init,
.reset = atm_tc_reset,
.destroy = atm_tc_destroy,
.dump = atm_tc_dump,
.owner = THIS_MODULE,
};
static int __init atm_init(void)
{
return register_qdisc(&atm_qdisc_ops);
}
static void __exit atm_exit(void)
{
unregister_qdisc(&atm_qdisc_ops);
}
module_init(atm_init)
module_exit(atm_exit)
MODULE_LICENSE("GPL");

53
net/sched/sch_blackhole.c Normal file

@@ -0,0 +1,53 @@
/*
* net/sched/sch_blackhole.c Black hole queue
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Thomas Graf <tgraf@suug.ch>
*
* Note: Quantum tunneling is not supported.
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
qdisc_drop(skb, sch);
return NET_XMIT_SUCCESS;
}
static struct sk_buff *blackhole_dequeue(struct Qdisc *sch)
{
return NULL;
}
static struct Qdisc_ops blackhole_qdisc_ops __read_mostly = {
.id = "blackhole",
.priv_size = 0,
.enqueue = blackhole_enqueue,
.dequeue = blackhole_dequeue,
.peek = blackhole_dequeue,
.owner = THIS_MODULE,
};
static int __init blackhole_module_init(void)
{
return register_qdisc(&blackhole_qdisc_ops);
}
static void __exit blackhole_module_exit(void)
{
unregister_qdisc(&blackhole_qdisc_ops);
}
module_init(blackhole_module_init)
module_exit(blackhole_module_exit)
MODULE_LICENSE("GPL");

2062
net/sched/sch_cbq.c Normal file

File diff suppressed because it is too large

645
net/sched/sch_choke.c Normal file

@@ -0,0 +1,645 @@
/*
* net/sched/sch_choke.c CHOKE scheduler
*
* Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_keys.h>
/*
CHOKe stateless AQM for fair bandwidth allocation
=================================================
CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
unresponsive flows) is a variant of RED that penalizes misbehaving flows but
maintains no flow state. The difference from RED is an additional step
during the enqueuing process. If average queue size is over the
low threshold (qmin), a packet is chosen at random from the queue.
If both the new and chosen packet are from the same flow, both
are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
needs to access packets in queue randomly. It has a minimal class
interface to allow overriding the builtin flow classifier with
filters.
Source:
R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
IEEE INFOCOM, 2000.
A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
Characteristics", IEEE/ACM Transactions on Networking, 2004
*/
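/* The enqueue decision, condensed (editor's sketch of choke_enqueue()
* below, not authoritative):
*
*	if (qavg <= qth_min)
*		admit the packet;
*	else if (a randomly drawn queued packet shares the new packet's flow)
*		drop both packets;
*	else if (qavg > qth_max)
*		mark or drop (forced, as in RED);
*	else
*		mark or drop with the RED probability;
*/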
/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE (128*1024 - 1)
struct choke_sched_data {
/* Parameters */
u32 limit;
unsigned char flags;
struct red_parms parms;
/* Variables */
struct red_vars vars;
struct tcf_proto __rcu *filter_list;
struct {
u32 prob_drop; /* Early probability drops */
u32 prob_mark; /* Early probability marks */
u32 forced_drop; /* Forced drops, qavg > max_thresh */
u32 forced_mark; /* Forced marks, qavg > max_thresh */
u32 pdrop; /* Drops due to queue limits */
u32 other; /* Drops due to drop() calls */
u32 matched; /* Drops to flow match */
} stats;
unsigned int head;
unsigned int tail;
unsigned int tab_mask; /* size - 1 */
struct sk_buff **tab;
};
/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
return (q->tail - q->head) & q->tab_mask;
}
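/* Editor's note: the table size is a power of two, so the subtraction
* wraps correctly; e.g. head = 1020, tail = 4, tab_mask = 1023 gives
* (4 - 1020) & 1023 = 8 occupied slots (holes included).
*/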
/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
return q->flags & TC_RED_ECN;
}
/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
return q->flags & TC_RED_HARDDROP;
}
/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
do {
q->head = (q->head + 1) & q->tab_mask;
if (q->head == q->tail)
break;
} while (q->tab[q->head] == NULL);
}
/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
do {
q->tail = (q->tail - 1) & q->tab_mask;
if (q->head == q->tail)
break;
} while (q->tab[q->tail] == NULL);
}
/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = q->tab[idx];
q->tab[idx] = NULL;
if (idx == q->head)
choke_zap_head_holes(q);
if (idx == q->tail)
choke_zap_tail_holes(q);
qdisc_qstats_backlog_dec(sch, skb);
qdisc_drop(skb, sch);
qdisc_tree_decrease_qlen(sch, 1);
--sch->q.qlen;
}
/* private part of skb->cb[] that a qdisc is allowed to use
* is limited to QDISC_CB_PRIV_LEN bytes.
* As a flow key might be too large, we store a part of it only.
*/
#define CHOKE_K_LEN min_t(u32, sizeof(struct flow_keys), QDISC_CB_PRIV_LEN - 3)
struct choke_skb_cb {
u16 classid;
u8 keys_valid;
u8 keys[QDISC_CB_PRIV_LEN - 3];
};
static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
choke_skb_cb(skb)->classid = classid;
}
static u16 choke_get_classid(const struct sk_buff *skb)
{
return choke_skb_cb(skb)->classid;
}
/*
* Compare the flows of two packets.
* Returns true only if source and destination addresses and ports match;
* false for special cases.
*/
static bool choke_match_flow(struct sk_buff *skb1,
struct sk_buff *skb2)
{
struct flow_keys temp;
if (skb1->protocol != skb2->protocol)
return false;
if (!choke_skb_cb(skb1)->keys_valid) {
choke_skb_cb(skb1)->keys_valid = 1;
skb_flow_dissect(skb1, &temp);
memcpy(&choke_skb_cb(skb1)->keys, &temp, CHOKE_K_LEN);
}
if (!choke_skb_cb(skb2)->keys_valid) {
choke_skb_cb(skb2)->keys_valid = 1;
skb_flow_dissect(skb2, &temp);
memcpy(&choke_skb_cb(skb2)->keys, &temp, CHOKE_K_LEN);
}
return !memcmp(&choke_skb_cb(skb1)->keys,
&choke_skb_cb(skb2)->keys,
CHOKE_K_LEN);
}
/*
* Classify flow using either:
* 1. pre-existing classification result in skb
* 2. fast internal classification
* 3. TC filter based classification
*/
static bool choke_classify(struct sk_buff *skb,
struct Qdisc *sch, int *qerr)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct tcf_result res;
struct tcf_proto *fl;
int result;
fl = rcu_dereference_bh(q->filter_list);
result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
return false;
}
#endif
choke_set_classid(skb, TC_H_MIN(res.classid));
return true;
}
return false;
}
/*
* Select a packet at random from the queue.
* HACK: since the queue can have holes from previous deletions, retry
* several times to find a random skb, then just give up and return the head.
* Will return NULL if the queue is empty (q->head == q->tail).
*/
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
unsigned int *pidx)
{
struct sk_buff *skb;
int retrys = 3;
do {
*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
skb = q->tab[*pidx];
if (skb)
return skb;
} while (--retrys > 0);
return q->tab[*pidx = q->head];
}
/*
* Compare new packet with random packet in queue
* returns true if matched and sets *pidx
*/
static bool choke_match_random(const struct choke_sched_data *q,
struct sk_buff *nskb,
unsigned int *pidx)
{
struct sk_buff *oskb;
if (q->head == q->tail)
return false;
oskb = choke_peek_random(q, pidx);
if (rcu_access_pointer(q->filter_list))
return choke_get_classid(nskb) == choke_get_classid(oskb);
return choke_match_flow(oskb, nskb);
}
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
struct choke_sched_data *q = qdisc_priv(sch);
const struct red_parms *p = &q->parms;
if (rcu_access_pointer(q->filter_list)) {
/* If using external classifiers, get result and record it. */
if (!choke_classify(skb, sch, &ret))
goto other_drop; /* Packet was eaten by filter */
}
choke_skb_cb(skb)->keys_valid = 0;
/* Compute average queue usage (see RED) */
q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
if (red_is_idling(&q->vars))
red_end_of_idle_period(&q->vars);
/* Is queue small? */
if (q->vars.qavg <= p->qth_min)
q->vars.qcount = -1;
else {
unsigned int idx;
/* Draw a packet at random from queue and compare flow */
if (choke_match_random(q, skb, &idx)) {
q->stats.matched++;
choke_drop_by_idx(sch, idx);
goto congestion_drop;
}
/* Queue is large, always mark/drop */
if (q->vars.qavg > p->qth_max) {
q->vars.qcount = -1;
qdisc_qstats_overlimit(sch);
if (use_harddrop(q) || !use_ecn(q) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
}
q->stats.forced_mark++;
} else if (++q->vars.qcount) {
if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
q->vars.qcount = 0;
q->vars.qR = red_random(p);
qdisc_qstats_overlimit(sch);
if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
q->stats.prob_mark++;
}
} else
q->vars.qR = red_random(p);
}
/* Admit new packet */
if (sch->q.qlen < q->limit) {
q->tab[q->tail] = skb;
q->tail = (q->tail + 1) & q->tab_mask;
++sch->q.qlen;
qdisc_qstats_backlog_inc(sch, skb);
return NET_XMIT_SUCCESS;
}
q->stats.pdrop++;
return qdisc_drop(skb, sch);
congestion_drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
if (q->head == q->tail) {
if (!red_is_idling(&q->vars))
red_start_of_idle_period(&q->vars);
return NULL;
}
skb = q->tab[q->head];
q->tab[q->head] = NULL;
choke_zap_head_holes(q);
--sch->q.qlen;
qdisc_qstats_backlog_dec(sch, skb);
qdisc_bstats_update(sch, skb);
return skb;
}
static unsigned int choke_drop(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
unsigned int len;
len = qdisc_queue_drop(sch);
if (len > 0)
q->stats.other++;
else {
if (!red_is_idling(&q->vars))
red_start_of_idle_period(&q->vars);
}
return len;
}
static void choke_reset(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
red_restart(&q->vars);
}
static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
[TCA_CHOKE_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_CHOKE_STAB] = { .len = RED_STAB_SIZE },
[TCA_CHOKE_MAX_P] = { .type = NLA_U32 },
};
static void choke_free(void *addr)
{
kvfree(addr);
}
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CHOKE_MAX + 1];
const struct tc_red_qopt *ctl;
int err;
struct sk_buff **old = NULL;
unsigned int mask;
u32 max_P;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
if (err < 0)
return err;
if (tb[TCA_CHOKE_PARMS] == NULL ||
tb[TCA_CHOKE_STAB] == NULL)
return -EINVAL;
max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;
ctl = nla_data(tb[TCA_CHOKE_PARMS]);
if (ctl->limit > CHOKE_MAX_QUEUE)
return -EINVAL;
mask = roundup_pow_of_two(ctl->limit + 1) - 1;
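/* Editor's note: e.g. a limit of 1000 packets yields
* roundup_pow_of_two(1001) = 1024 table slots and a mask of 0x3ff.
*/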
if (mask != q->tab_mask) {
struct sk_buff **ntab;
ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
GFP_KERNEL | __GFP_NOWARN);
if (!ntab)
ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
if (!ntab)
return -ENOMEM;
sch_tree_lock(sch);
old = q->tab;
if (old) {
unsigned int oqlen = sch->q.qlen, tail = 0;
while (q->head != q->tail) {
struct sk_buff *skb = q->tab[q->head];
q->head = (q->head + 1) & q->tab_mask;
if (!skb)
continue;
if (tail < mask) {
ntab[tail++] = skb;
continue;
}
qdisc_qstats_backlog_dec(sch, skb);
--sch->q.qlen;
qdisc_drop(skb, sch);
}
qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
q->head = 0;
q->tail = tail;
}
q->tab_mask = mask;
q->tab = ntab;
} else
sch_tree_lock(sch);
q->flags = ctl->flags;
q->limit = ctl->limit;
red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
nla_data(tb[TCA_CHOKE_STAB]),
max_P);
red_set_vars(&q->vars);
if (q->head == q->tail)
red_end_of_idle_period(&q->vars);
sch_tree_unlock(sch);
choke_free(old);
return 0;
}
static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
return choke_change(sch, opt);
}
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
.flags = q->flags,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
.Plog = q->parms.Plog,
.Scell_log = q->parms.Scell_log,
};
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct choke_sched_data *q = qdisc_priv(sch);
struct tc_choke_xstats st = {
.early = q->stats.prob_drop + q->stats.forced_drop,
.marked = q->stats.prob_mark + q->stats.forced_mark,
.pdrop = q->stats.pdrop,
.other = q->stats.other,
.matched = q->stats.matched,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static void choke_destroy(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
choke_free(q->tab);
}
static struct Qdisc *choke_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
}
static unsigned long choke_get(struct Qdisc *sch, u32 classid)
{
return 0;
}
static void choke_put(struct Qdisc *q, unsigned long cl)
{
}
static unsigned long choke_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
return 0;
}
static struct tcf_proto __rcu **choke_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct choke_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static int choke_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
tcm->tcm_handle |= TC_H_MIN(cl);
return 0;
}
static void choke_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
if (!arg->stop) {
if (arg->fn(sch, 1, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
static const struct Qdisc_class_ops choke_class_ops = {
.leaf = choke_leaf,
.get = choke_get,
.put = choke_put,
.tcf_chain = choke_find_tcf,
.bind_tcf = choke_bind,
.unbind_tcf = choke_put,
.dump = choke_dump_class,
.walk = choke_walk,
};
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
struct choke_sched_data *q = qdisc_priv(sch);
return (q->head != q->tail) ? q->tab[q->head] : NULL;
}
static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
.id = "choke",
.priv_size = sizeof(struct choke_sched_data),
.enqueue = choke_enqueue,
.dequeue = choke_dequeue,
.peek = choke_peek_head,
.drop = choke_drop,
.init = choke_init,
.destroy = choke_destroy,
.reset = choke_reset,
.change = choke_change,
.dump = choke_dump,
.dump_stats = choke_dump_stats,
.owner = THIS_MODULE,
};
static int __init choke_module_init(void)
{
return register_qdisc(&choke_qdisc_ops);
}
static void __exit choke_module_exit(void)
{
unregister_qdisc(&choke_qdisc_ops);
}
module_init(choke_module_init)
module_exit(choke_module_exit)
MODULE_LICENSE("GPL");

276
net/sched/sch_codel.c Normal file

@@ -0,0 +1,276 @@
/*
* Codel - The Controlled-Delay Active Queue Management algorithm
*
* Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
* Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
*
* Implemented on linux by :
* Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
* Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions, and the following disclaimer,
* without modification.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* Alternatively, provided that this notice is retained in full, this
* software may be distributed under the terms of the GNU General
* Public License ("GPL") version 2, in which case the provisions of the
* GPL apply INSTEAD OF those given above.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/prefetch.h>
#include <net/pkt_sched.h>
#include <net/codel.h>
#define DEFAULT_CODEL_LIMIT 1000
struct codel_sched_data {
struct codel_params params;
struct codel_vars vars;
struct codel_stats stats;
u32 drop_overlimit;
};
/* This is the specific function called from codel_dequeue()
* to dequeue a packet from the queue. Note: the backlog is handled in
* codel; we don't need to reduce it here.
*/
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
struct sk_buff *skb = __skb_dequeue(&sch->q);
prefetch(&skb->end); /* we'll need skb_shinfo() */
return skb;
}
static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
* or HTB crashes. Defer it for next round.
*/
if (q->stats.drop_count && sch->q.qlen) {
qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
q->stats.drop_count = 0;
}
if (skb)
qdisc_bstats_update(sch, skb);
return skb;
}
static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct codel_sched_data *q;
if (likely(qdisc_qlen(sch) < sch->limit)) {
codel_set_enqueue_time(skb);
return qdisc_enqueue_tail(skb, sch);
}
q = qdisc_priv(sch);
q->drop_overlimit++;
return qdisc_drop(skb, sch);
}
static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
[TCA_CODEL_TARGET] = { .type = NLA_U32 },
[TCA_CODEL_LIMIT] = { .type = NLA_U32 },
[TCA_CODEL_INTERVAL] = { .type = NLA_U32 },
[TCA_CODEL_ECN] = { .type = NLA_U32 },
};
static int codel_change(struct Qdisc *sch, struct nlattr *opt)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CODEL_MAX + 1];
unsigned int qlen;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy);
if (err < 0)
return err;
sch_tree_lock(sch);
if (tb[TCA_CODEL_TARGET]) {
u32 target = nla_get_u32(tb[TCA_CODEL_TARGET]);
q->params.target = ((u64)target * NSEC_PER_USEC) >> CODEL_SHIFT;
}
if (tb[TCA_CODEL_INTERVAL]) {
u32 interval = nla_get_u32(tb[TCA_CODEL_INTERVAL]);
q->params.interval = ((u64)interval * NSEC_PER_USEC) >> CODEL_SHIFT;
}
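/* Editor's note: userspace passes target/interval in microseconds;
* codel time units are ns >> CODEL_SHIFT (1024 ns each, assuming
* CODEL_SHIFT == 10 as in net/codel.h), so a 5 ms target becomes
* 5000 * 1000 >> 10 = 4882 units.
*/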
if (tb[TCA_CODEL_LIMIT])
sch->limit = nla_get_u32(tb[TCA_CODEL_LIMIT]);
if (tb[TCA_CODEL_ECN])
q->params.ecn = !!nla_get_u32(tb[TCA_CODEL_ECN]);
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = __skb_dequeue(&sch->q);
qdisc_qstats_backlog_dec(sch, skb);
qdisc_drop(skb, sch);
}
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
sch_tree_unlock(sch);
return 0;
}
static int codel_init(struct Qdisc *sch, struct nlattr *opt)
{
struct codel_sched_data *q = qdisc_priv(sch);
sch->limit = DEFAULT_CODEL_LIMIT;
codel_params_init(&q->params);
codel_vars_init(&q->vars);
codel_stats_init(&q->stats);
if (opt) {
int err = codel_change(sch, opt);
if (err)
return err;
}
if (sch->limit >= 1)
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_CODEL_TARGET,
codel_time_to_us(q->params.target)) ||
nla_put_u32(skb, TCA_CODEL_LIMIT,
sch->limit) ||
nla_put_u32(skb, TCA_CODEL_INTERVAL,
codel_time_to_us(q->params.interval)) ||
nla_put_u32(skb, TCA_CODEL_ECN,
q->params.ecn))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -1;
}
static int codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
const struct codel_sched_data *q = qdisc_priv(sch);
struct tc_codel_xstats st = {
.maxpacket = q->stats.maxpacket,
.count = q->vars.count,
.lastcount = q->vars.lastcount,
.drop_overlimit = q->drop_overlimit,
.ldelay = codel_time_to_us(q->vars.ldelay),
.dropping = q->vars.dropping,
.ecn_mark = q->stats.ecn_mark,
};
if (q->vars.dropping) {
codel_tdiff_t delta = q->vars.drop_next - codel_get_time();
if (delta >= 0)
st.drop_next = codel_time_to_us(delta);
else
st.drop_next = -codel_time_to_us(-delta);
}
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static void codel_reset(struct Qdisc *sch)
{
struct codel_sched_data *q = qdisc_priv(sch);
qdisc_reset_queue(sch);
codel_vars_init(&q->vars);
}
static struct Qdisc_ops codel_qdisc_ops __read_mostly = {
.id = "codel",
.priv_size = sizeof(struct codel_sched_data),
.enqueue = codel_qdisc_enqueue,
.dequeue = codel_qdisc_dequeue,
.peek = qdisc_peek_dequeued,
.init = codel_init,
.reset = codel_reset,
.change = codel_change,
.dump = codel_dump,
.dump_stats = codel_dump_stats,
.owner = THIS_MODULE,
};
static int __init codel_module_init(void)
{
return register_qdisc(&codel_qdisc_ops);
}
static void __exit codel_module_exit(void)
{
unregister_qdisc(&codel_qdisc_ops);
}
module_init(codel_module_init)
module_exit(codel_module_exit)
MODULE_DESCRIPTION("Controlled Delay queue discipline");
MODULE_AUTHOR("Dave Taht");
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("Dual BSD/GPL");

529
net/sched/sch_drr.c Normal file

@@ -0,0 +1,529 @@
/*
* net/sched/sch_drr.c Deficit Round Robin scheduler
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
struct drr_class {
struct Qdisc_class_common common;
unsigned int refcnt;
unsigned int filter_cnt;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
struct list_head alist;
struct Qdisc *qdisc;
u32 quantum;
u32 deficit;
};
struct drr_sched {
struct list_head active;
struct tcf_proto __rcu *filter_list;
struct Qdisc_class_hash clhash;
};
static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
{
struct drr_sched *q = qdisc_priv(sch);
struct Qdisc_class_common *clc;
clc = qdisc_class_find(&q->clhash, classid);
if (clc == NULL)
return NULL;
return container_of(clc, struct drr_class, common);
}
static void drr_purge_queue(struct drr_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
qdisc_reset(cl->qdisc);
qdisc_tree_decrease_qlen(cl->qdisc, len);
}
static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
[TCA_DRR_QUANTUM] = { .type = NLA_U32 },
};
static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct nlattr **tca, unsigned long *arg)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)*arg;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DRR_MAX + 1];
u32 quantum;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
if (err < 0)
return err;
if (tb[TCA_DRR_QUANTUM]) {
quantum = nla_get_u32(tb[TCA_DRR_QUANTUM]);
if (quantum == 0)
return -EINVAL;
} else
quantum = psched_mtu(qdisc_dev(sch));
if (cl != NULL) {
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL,
&cl->rate_est,
qdisc_root_sleeping_lock(sch),
tca[TCA_RATE]);
if (err)
return err;
}
sch_tree_lock(sch);
if (tb[TCA_DRR_QUANTUM])
cl->quantum = quantum;
sch_tree_unlock(sch);
return 0;
}
cl = kzalloc(sizeof(struct drr_class), GFP_KERNEL);
if (cl == NULL)
return -ENOBUFS;
cl->refcnt = 1;
cl->common.classid = classid;
cl->quantum = quantum;
cl->qdisc = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, classid);
if (cl->qdisc == NULL)
cl->qdisc = &noop_qdisc;
if (tca[TCA_RATE]) {
err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est,
qdisc_root_sleeping_lock(sch),
tca[TCA_RATE]);
if (err) {
qdisc_destroy(cl->qdisc);
kfree(cl);
return err;
}
}
sch_tree_lock(sch);
qdisc_class_hash_insert(&q->clhash, &cl->common);
sch_tree_unlock(sch);
qdisc_class_hash_grow(sch, &q->clhash);
*arg = (unsigned long)cl;
return 0;
}
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
gen_kill_estimator(&cl->bstats, &cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl = (struct drr_class *)arg;
if (cl->filter_cnt > 0)
return -EBUSY;
sch_tree_lock(sch);
drr_purge_queue(cl);
qdisc_class_hash_remove(&q->clhash, &cl->common);
BUG_ON(--cl->refcnt == 0);
/*
* This shouldn't happen: we "hold" one cops->get() when called
* from tc_ctl_tclass; the destroy method is done from cops->put().
*/
sch_tree_unlock(sch);
return 0;
}
static unsigned long drr_get_class(struct Qdisc *sch, u32 classid)
{
struct drr_class *cl = drr_find_class(sch, classid);
if (cl != NULL)
cl->refcnt++;
return (unsigned long)cl;
}
static void drr_put_class(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
if (--cl->refcnt == 0)
drr_destroy_class(sch, cl);
}
static struct tcf_proto __rcu **drr_tcf_chain(struct Qdisc *sch,
unsigned long cl)
{
struct drr_sched *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static unsigned long drr_bind_tcf(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
struct drr_class *cl = drr_find_class(sch, classid);
if (cl != NULL)
cl->filter_cnt++;
return (unsigned long)cl;
}
static void drr_unbind_tcf(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
cl->filter_cnt--;
}
static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
struct Qdisc *new, struct Qdisc **old)
{
struct drr_class *cl = (struct drr_class *)arg;
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops, cl->common.classid);
if (new == NULL)
new = &noop_qdisc;
}
sch_tree_lock(sch);
drr_purge_queue(cl);
*old = cl->qdisc;
cl->qdisc = new;
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *drr_class_leaf(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
return cl->qdisc;
}
static void drr_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
struct drr_class *cl = (struct drr_class *)arg;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
}
static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct drr_class *cl = (struct drr_class *)arg;
struct nlattr *nest;
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle = cl->common.classid;
tcm->tcm_info = cl->qdisc->handle;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_DRR_QUANTUM, cl->quantum))
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -EMSGSIZE;
}
static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
struct gnet_dump *d)
{
struct drr_class *cl = (struct drr_class *)arg;
__u32 qlen = cl->qdisc->q.qlen;
struct tc_drr_stats xstats;
memset(&xstats, 0, sizeof(xstats));
if (qlen)
xstats.deficit = cl->deficit;
if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
unsigned int i;
if (arg->stop)
return;
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
arg->stop = 1;
return;
}
arg->count++;
}
}
}
static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct tcf_result res;
struct tcf_proto *fl;
int result;
if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
cl = drr_find_class(sch, skb->priority);
if (cl != NULL)
return cl;
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
fl = rcu_dereference_bh(q->filter_list);
result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
/* fall through */
case TC_ACT_SHOT:
return NULL;
}
#endif
cl = (struct drr_class *)res.class;
if (cl == NULL)
cl = drr_find_class(sch, res.classid);
return cl;
}
return NULL;
}
static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
int err = 0;
cl = drr_classify(skb, sch, &err);
if (cl == NULL) {
if (err & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return err;
}
err = qdisc_enqueue(skb, cl->qdisc);
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
cl->qstats.drops++;
qdisc_qstats_drop(sch);
}
return err;
}
if (cl->qdisc->q.qlen == 1) {
list_add_tail(&cl->alist, &q->active);
cl->deficit = cl->quantum;
}
sch->q.qlen++;
return err;
}
static struct sk_buff *drr_dequeue(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct sk_buff *skb;
unsigned int len;
if (list_empty(&q->active))
goto out;
while (1) {
cl = list_first_entry(&q->active, struct drr_class, alist);
skb = cl->qdisc->ops->peek(cl->qdisc);
if (skb == NULL) {
qdisc_warn_nonwc(__func__, cl->qdisc);
goto out;
}
len = qdisc_pkt_len(skb);
if (len <= cl->deficit) {
cl->deficit -= len;
skb = qdisc_dequeue_peeked(cl->qdisc);
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
bstats_update(&cl->bstats, skb);
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
cl->deficit += cl->quantum;
list_move_tail(&cl->alist, &q->active);
}
out:
return NULL;
}
static unsigned int drr_drop(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
unsigned int len;
list_for_each_entry(cl, &q->active, alist) {
if (cl->qdisc->ops->drop) {
len = cl->qdisc->ops->drop(cl->qdisc);
if (len > 0) {
sch->q.qlen--;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
return len;
}
}
}
return 0;
}
static int drr_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
{
struct drr_sched *q = qdisc_priv(sch);
int err;
err = qdisc_class_hash_init(&q->clhash);
if (err < 0)
return err;
INIT_LIST_HEAD(&q->active);
return 0;
}
static void drr_reset_qdisc(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
unsigned int i;
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
if (cl->qdisc->q.qlen)
list_del(&cl->alist);
qdisc_reset(cl->qdisc);
}
}
sch->q.qlen = 0;
}
static void drr_destroy_qdisc(struct Qdisc *sch)
{
struct drr_sched *q = qdisc_priv(sch);
struct drr_class *cl;
struct hlist_node *next;
unsigned int i;
tcf_destroy_chain(&q->filter_list);
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
common.hnode)
drr_destroy_class(sch, cl);
}
qdisc_class_hash_destroy(&q->clhash);
}
static const struct Qdisc_class_ops drr_class_ops = {
.change = drr_change_class,
.delete = drr_delete_class,
.get = drr_get_class,
.put = drr_put_class,
.tcf_chain = drr_tcf_chain,
.bind_tcf = drr_bind_tcf,
.unbind_tcf = drr_unbind_tcf,
.graft = drr_graft_class,
.leaf = drr_class_leaf,
.qlen_notify = drr_qlen_notify,
.dump = drr_dump_class,
.dump_stats = drr_dump_class_stats,
.walk = drr_walk,
};
static struct Qdisc_ops drr_qdisc_ops __read_mostly = {
.cl_ops = &drr_class_ops,
.id = "drr",
.priv_size = sizeof(struct drr_sched),
.enqueue = drr_enqueue,
.dequeue = drr_dequeue,
.peek = qdisc_peek_dequeued,
.drop = drr_drop,
.init = drr_init_qdisc,
.reset = drr_reset_qdisc,
.destroy = drr_destroy_qdisc,
.owner = THIS_MODULE,
};
static int __init drr_init(void)
{
return register_qdisc(&drr_qdisc_ops);
}
static void __exit drr_exit(void)
{
unregister_qdisc(&drr_qdisc_ops);
}
module_init(drr_init);
module_exit(drr_exit);
MODULE_LICENSE("GPL");

514
net/sched/sch_dsmark.c Normal file
View file

@ -0,0 +1,514 @@
/* net/sched/sch_dsmark.c - Differentiated Services field marker */
/* Written 1998-2000 by Werner Almesberger, EPFL ICA */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/bitops.h>
#include <net/pkt_sched.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <asm/byteorder.h>
/*
 * classid        class     marking
 * -------        -----     -------
 * n/a              0       n/a
 * x:0              1       use entry [0]
 * ...             ...      ...
 * x:y y>0         y+1      use entry [y]
 * ...             ...      ...
 * x:indices-1   indices    use entry [indices-1]
 * ...             ...      ...
 * x:y             y+1      use entry [y & (indices-1)]
 * ...             ...      ...
 * 0xffff        0x10000    use entry [indices-1]
*/
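/*
 * A worked example (values chosen for illustration): with indices = 8,
 * tc class x:5 configures mask[5] and value[5]. On dequeue, a packet with
 * skb->tc_index = 0x2d uses entry [0x2d & 7] = entry [5], and its DS field
 * is rewritten to (dsfield & mask[5]) | value[5].
 */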
#define NO_DEFAULT_INDEX (1 << 16)
struct dsmark_qdisc_data {
struct Qdisc *q;
struct tcf_proto __rcu *filter_list;
u8 *mask; /* "owns" the array */
u8 *value;
u16 indices;
u32 default_index; /* index range is 0...0xffff */
int set_tc_index;
};
static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index)
{
return index <= p->indices && index > 0;
}
/* ------------------------- Class/flow operations ------------------------- */
static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
struct Qdisc *new, struct Qdisc **old)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n",
__func__, sch, p, new, old);
if (new == NULL) {
new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
sch->handle);
if (new == NULL)
new = &noop_qdisc;
}
sch_tree_lock(sch);
*old = p->q;
p->q = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
return p->q;
}
static unsigned long dsmark_get(struct Qdisc *sch, u32 classid)
{
pr_debug("%s(sch %p,[qdisc %p],classid %x)\n",
__func__, sch, qdisc_priv(sch), classid);
return TC_H_MIN(classid) + 1;
}
static unsigned long dsmark_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return dsmark_get(sch, classid);
}
static void dsmark_put(struct Qdisc *sch, unsigned long cl)
{
}
static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = {
[TCA_DSMARK_INDICES] = { .type = NLA_U16 },
[TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 },
[TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG },
[TCA_DSMARK_MASK] = { .type = NLA_U8 },
[TCA_DSMARK_VALUE] = { .type = NLA_U8 },
};
static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
struct nlattr **tca, unsigned long *arg)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_DSMARK_MAX + 1];
int err = -EINVAL;
u8 mask = 0;
pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n",
__func__, sch, p, classid, parent, *arg);
if (!dsmark_valid_index(p, *arg)) {
err = -ENOENT;
goto errout;
}
if (!opt)
goto errout;
err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
if (err < 0)
goto errout;
if (tb[TCA_DSMARK_MASK])
mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
if (tb[TCA_DSMARK_VALUE])
p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
if (tb[TCA_DSMARK_MASK])
p->mask[*arg - 1] = mask;
err = 0;
errout:
return err;
}
static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
if (!dsmark_valid_index(p, arg))
return -EINVAL;
p->mask[arg - 1] = 0xff;
p->value[arg - 1] = 0;
return 0;
}
static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int i;
pr_debug("%s(sch %p,[qdisc %p],walker %p)\n",
__func__, sch, p, walker);
if (walker->stop)
return;
for (i = 0; i < p->indices; i++) {
if (p->mask[i] == 0xff && !p->value[i])
goto ignore;
if (walker->count >= walker->skip) {
if (walker->fn(sch, i + 1, walker) < 0) {
walker->stop = 1;
break;
}
}
ignore:
walker->count++;
}
}
static inline struct tcf_proto __rcu **dsmark_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
return &p->filter_list;
}
/* --------------------------- Qdisc operations ---------------------------- */
static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
int err;
pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p);
if (p->set_tc_index) {
switch (skb->protocol) {
case htons(ETH_P_IP):
if (skb_cow_head(skb, sizeof(struct iphdr)))
goto drop;
skb->tc_index = ipv4_get_dsfield(ip_hdr(skb))
& ~INET_ECN_MASK;
break;
case htons(ETH_P_IPV6):
if (skb_cow_head(skb, sizeof(struct ipv6hdr)))
goto drop;
skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb))
& ~INET_ECN_MASK;
break;
default:
skb->tc_index = 0;
break;
}
}
if (TC_H_MAJ(skb->priority) == sch->handle)
skb->tc_index = TC_H_MIN(skb->priority);
else {
struct tcf_result res;
struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
int result = tc_classify(skb, fl, &res);
pr_debug("result %d class 0x%04x\n", result, res.classid);
switch (result) {
#ifdef CONFIG_NET_CLS_ACT
case TC_ACT_QUEUED:
case TC_ACT_STOLEN:
kfree_skb(skb);
return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
goto drop;
#endif
case TC_ACT_OK:
skb->tc_index = TC_H_MIN(res.classid);
break;
default:
if (p->default_index != NO_DEFAULT_INDEX)
skb->tc_index = p->default_index;
break;
}
}
err = qdisc_enqueue(skb, p->q);
if (err != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(err))
qdisc_qstats_drop(sch);
return err;
}
sch->q.qlen++;
return NET_XMIT_SUCCESS;
drop:
qdisc_drop(skb, sch);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct sk_buff *skb;
u32 index;
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
skb = p->q->ops->dequeue(p->q);
if (skb == NULL)
return NULL;
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
pr_debug("index %d->%d\n", skb->tc_index, index);
switch (skb->protocol) {
case htons(ETH_P_IP):
ipv4_change_dsfield(ip_hdr(skb), p->mask[index],
p->value[index]);
break;
case htons(ETH_P_IPV6):
ipv6_change_dsfield(ipv6_hdr(skb), p->mask[index],
p->value[index]);
break;
default:
/*
* Only complain if a change was actually attempted.
* This way, we can send non-IP traffic through dsmark
* and don't need yet another qdisc as a bypass.
*/
if (p->mask[index] != 0xff || p->value[index])
pr_warn("%s: unsupported protocol %d\n",
__func__, ntohs(skb->protocol));
break;
}
return skb;
}
static struct sk_buff *dsmark_peek(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
return p->q->ops->peek(p->q);
}
static unsigned int dsmark_drop(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
unsigned int len;
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
if (p->q->ops->drop == NULL)
return 0;
len = p->q->ops->drop(p->q);
if (len)
sch->q.qlen--;
return len;
}
static int dsmark_init(struct Qdisc *sch, struct nlattr *opt)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *tb[TCA_DSMARK_MAX + 1];
int err = -EINVAL;
u32 default_index = NO_DEFAULT_INDEX;
u16 indices;
u8 *mask;
pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt);
if (!opt)
goto errout;
err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy);
if (err < 0)
goto errout;
err = -EINVAL;
if (!tb[TCA_DSMARK_INDICES])
	goto errout;
indices = nla_get_u16(tb[TCA_DSMARK_INDICES]);
if (hweight32(indices) != 1)
goto errout;
if (tb[TCA_DSMARK_DEFAULT_INDEX])
default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]);
mask = kmalloc(indices * 2, GFP_KERNEL);
if (mask == NULL) {
err = -ENOMEM;
goto errout;
}
p->mask = mask;
memset(p->mask, 0xff, indices);
p->value = p->mask + indices;
memset(p->value, 0, indices);
p->indices = indices;
p->default_index = default_index;
p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]);
p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle);
if (p->q == NULL)
p->q = &noop_qdisc;
pr_debug("%s: qdisc %p\n", __func__, p->q);
err = 0;
errout:
return err;
}
static void dsmark_reset(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
qdisc_reset(p->q);
sch->q.qlen = 0;
}
static void dsmark_destroy(struct Qdisc *sch)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
tcf_destroy_chain(&p->filter_list);
qdisc_destroy(p->q);
kfree(p->mask);
}
static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *opts = NULL;
pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl);
if (!dsmark_valid_index(p, cl))
return -EINVAL;
tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
tcm->tcm_info = p->q->handle;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]) ||
nla_put_u8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct dsmark_qdisc_data *p = qdisc_priv(sch);
struct nlattr *opts = NULL;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices))
goto nla_put_failure;
if (p->default_index != NO_DEFAULT_INDEX &&
nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index))
goto nla_put_failure;
if (p->set_tc_index &&
nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static const struct Qdisc_class_ops dsmark_class_ops = {
.graft = dsmark_graft,
.leaf = dsmark_leaf,
.get = dsmark_get,
.put = dsmark_put,
.change = dsmark_change,
.delete = dsmark_delete,
.walk = dsmark_walk,
.tcf_chain = dsmark_find_tcf,
.bind_tcf = dsmark_bind_filter,
.unbind_tcf = dsmark_put,
.dump = dsmark_dump_class,
};
static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &dsmark_class_ops,
.id = "dsmark",
.priv_size = sizeof(struct dsmark_qdisc_data),
.enqueue = dsmark_enqueue,
.dequeue = dsmark_dequeue,
.peek = dsmark_peek,
.drop = dsmark_drop,
.init = dsmark_init,
.reset = dsmark_reset,
.destroy = dsmark_destroy,
.change = NULL,
.dump = dsmark_dump,
.owner = THIS_MODULE,
};
static int __init dsmark_module_init(void)
{
return register_qdisc(&dsmark_qdisc_ops);
}
static void __exit dsmark_module_exit(void)
{
unregister_qdisc(&dsmark_qdisc_ops);
}
module_init(dsmark_module_init)
module_exit(dsmark_module_exit)
MODULE_LICENSE("GPL");

180
net/sched/sch_fifo.c Normal file
View file

@ -0,0 +1,180 @@
/*
* net/sched/sch_fifo.c The simplest FIFO queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/* 1 band FIFO pseudo-"scheduler" */
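/*
 * The three variants below differ only in their notion of "full":
 * bfifo counts bytes (sch->qstats.backlog against a byte limit),
 * pfifo counts packets, and pfifo_head_drop also counts packets but
 * drops from the head instead of rejecting the new arrival.
 */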
static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
if (likely(sch->qstats.backlog + qdisc_pkt_len(skb) <= sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
if (likely(skb_queue_len(&sch->q) < sch->limit))
return qdisc_enqueue_tail(skb, sch);
/* queue full, remove one skb from the head to stay within the limit */
__qdisc_queue_drop_head(sch, &sch->q);
qdisc_qstats_drop(sch);
qdisc_enqueue_tail(skb, sch);
return NET_XMIT_CN;
}
static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
{
bool bypass;
bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
if (opt == NULL) {
u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
if (is_bfifo)
limit *= psched_mtu(qdisc_dev(sch));
sch->limit = limit;
} else {
struct tc_fifo_qopt *ctl = nla_data(opt);
if (nla_len(opt) < sizeof(*ctl))
return -EINVAL;
sch->limit = ctl->limit;
}
if (is_bfifo)
bypass = sch->limit >= psched_mtu(qdisc_dev(sch));
else
bypass = sch->limit >= 1;
if (bypass)
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tc_fifo_qopt opt = { .limit = sch->limit };
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
return -1;
}
struct Qdisc_ops pfifo_qdisc_ops __read_mostly = {
.id = "pfifo",
.priv_size = 0,
.enqueue = pfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.drop = qdisc_queue_drop,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
.owner = THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_qdisc_ops);
struct Qdisc_ops bfifo_qdisc_ops __read_mostly = {
.id = "bfifo",
.priv_size = 0,
.enqueue = bfifo_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.drop = qdisc_queue_drop,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
.owner = THIS_MODULE,
};
EXPORT_SYMBOL(bfifo_qdisc_ops);
struct Qdisc_ops pfifo_head_drop_qdisc_ops __read_mostly = {
.id = "pfifo_head_drop",
.priv_size = 0,
.enqueue = pfifo_tail_enqueue,
.dequeue = qdisc_dequeue_head,
.peek = qdisc_peek_head,
.drop = qdisc_queue_drop_head,
.init = fifo_init,
.reset = qdisc_reset_queue,
.change = fifo_init,
.dump = fifo_dump,
.owner = THIS_MODULE,
};
/* Pass size change message down to embedded FIFO */
int fifo_set_limit(struct Qdisc *q, unsigned int limit)
{
struct nlattr *nla;
int ret = -ENOMEM;
/* Hack to avoid sending change message to non-FIFO: matching "fifo"
 * at offset 1 covers "pfifo", "bfifo" and "pfifo_head_drop".
 */
if (strncmp(q->ops->id + 1, "fifo", 4) != 0)
return 0;
nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
if (nla) {
nla->nla_type = RTM_NEWQDISC;
nla->nla_len = nla_attr_size(sizeof(struct tc_fifo_qopt));
((struct tc_fifo_qopt *)nla_data(nla))->limit = limit;
ret = q->ops->change(q, nla);
kfree(nla);
}
return ret;
}
EXPORT_SYMBOL(fifo_set_limit);
struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops,
unsigned int limit)
{
struct Qdisc *q;
int err = -ENOMEM;
q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1));
if (q) {
err = fifo_set_limit(q, limit);
if (err < 0) {
qdisc_destroy(q);
q = NULL;
}
}
return q ? : ERR_PTR(err);
}
EXPORT_SYMBOL(fifo_create_dflt);
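/*
 * A rough usage sketch for the two exported helpers above, in the style
 * of a shaper qdisc that embeds a bfifo as its inner queue (the names
 * "child" and "new_byte_limit" are illustrative, not taken from a
 * specific caller):
 *
 *	child = fifo_create_dflt(sch, &bfifo_qdisc_ops, new_byte_limit);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 *
 * and, to resize an already attached FIFO:
 *
 *	err = fifo_set_limit(child, new_byte_limit);
 */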

847
net/sched/sch_fq.c Normal file
View file

@ -0,0 +1,847 @@
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
* Copyright (C) 2013 Eric Dumazet <edumazet@google.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
 * Meant to be used mostly for locally generated traffic:
 * fast classification depends on skb->sk being set before reaching us.
 * If not (router workload), we use rxhash as a fallback, with a 32-bit wide hash.
 * All packets belonging to a socket are considered as a 'flow'.
 *
 * Flows are dynamically allocated and stored in a hash table of RB trees.
 * They are also part of one Round Robin 'queue' (new or old flows).
 *
 * Burst avoidance (aka pacing) capability:
 *
 * Transport (e.g. TCP) can set a rate in sk->sk_pacing_rate, enqueue a
 * bunch of packets, and this packet scheduler adds delay between
 * packets to respect the rate limitation.
 *
 * enqueue():
 * - lookup one RB tree (out of 1024 or more) to find the flow.
 *   If the flow does not exist, create it and add it to the tree.
 *   Add the skb to the per-flow list of skbs (a FIFO).
 * - Use a special FIFO for high priority packets.
 *
 * dequeue(): serves flows in Round Robin.
 * Note: when a flow becomes empty, we do not immediately remove it from
 * the RB trees, for performance reasons (it's expected to send additional
 * packets, or the SLAB cache will reuse the socket for another flow).
*/
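/*
 * Pacing math used by fq_dequeue() below, on a worked example: the gap
 * before a flow's next packet is
 *	len = max(pkt_len, quantum) * NSEC_PER_SEC / rate,
 * clamped to 125 ms. With the illustrative values pkt_len = 1514,
 * quantum = 3028 (2 * MTU) and sk_pacing_rate = 1250000 bytes/s
 * (10 Mbit/s), len = 3028 * 1e9 / 1250000 ~= 2.4 ms between packets.
 */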
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
/*
* Per flow structure, dynamically allocated
*/
struct fq_flow {
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
unsigned long age; /* jiffies when flow was emptied, for gc */
};
struct rb_node fq_node; /* anchor in fq_root[] trees */
struct sock *sk;
int qlen; /* number of packets in flow queue */
int credit;
u32 socket_hash; /* sk_hash */
struct fq_flow *next; /* next pointer in RR lists, or &detached */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
};
struct fq_flow_head {
struct fq_flow *first;
struct fq_flow *last;
};
struct fq_sched_data {
struct fq_flow_head new_flows;
struct fq_flow_head old_flows;
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
u32 flow_max_rate; /* optional max rate per flow */
u32 flow_plimit; /* max packets per flow */
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
u32 flows;
u32 inactive_flows;
u32 throttled_flows;
u64 stat_gc_flows;
u64 stat_internal_packets;
u64 stat_tcp_retrans;
u64 stat_throttled;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
struct qdisc_watchdog watchdog;
};
/* special value to mark a detached flow (not on old/new list) */
static struct fq_flow detached, throttled;
static void fq_flow_set_detached(struct fq_flow *f)
{
f->next = &detached;
f->age = jiffies;
}
static bool fq_flow_is_detached(const struct fq_flow *f)
{
return f->next == &detached;
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
while (*p) {
struct fq_flow *aux;
parent = *p;
aux = container_of(parent, struct fq_flow, rate_node);
if (f->time_next_packet >= aux->time_next_packet)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&f->rate_node, parent, p);
rb_insert_color(&f->rate_node, &q->delayed);
q->throttled_flows++;
q->stat_throttled++;
f->next = &throttled;
if (q->time_next_delayed_flow > f->time_next_packet)
q->time_next_delayed_flow = f->time_next_packet;
}
static struct kmem_cache *fq_flow_cachep __read_mostly;
static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
if (head->first)
head->last->next = flow;
else
head->first = flow;
head->last = flow;
flow->next = NULL;
}
/* limit number of collected flows per round */
#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)
static bool fq_gc_candidate(const struct fq_flow *f)
{
return fq_flow_is_detached(f) &&
time_after(jiffies, f->age + FQ_GC_AGE);
}
static void fq_gc(struct fq_sched_data *q,
struct rb_root *root,
struct sock *sk)
{
struct fq_flow *f, *tofree[FQ_GC_MAX];
struct rb_node **p, *parent;
int fcnt = 0;
p = &root->rb_node;
parent = NULL;
while (*p) {
parent = *p;
f = container_of(parent, struct fq_flow, fq_node);
if (f->sk == sk)
break;
if (fq_gc_candidate(f)) {
tofree[fcnt++] = f;
if (fcnt == FQ_GC_MAX)
break;
}
if (f->sk > sk)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
while (fcnt) {
struct fq_flow *f = tofree[--fcnt];
rb_erase(&f->fq_node, root);
kmem_cache_free(fq_flow_cachep, f);
}
}
static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
{
struct rb_node **p, *parent;
struct sock *sk = skb->sk;
struct rb_root *root;
struct fq_flow *f;
/* warning: no starvation prevention... */
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
return &q->internal;
if (unlikely(!sk)) {
/* By forcing the low order bit to 1, we make sure not to
 * collide with a local flow (socket pointers are word aligned)
*/
sk = (struct sock *)(skb_get_hash(skb) | 1L);
}
root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
if (q->flows >= (2U << q->fq_trees_log) &&
q->inactive_flows > q->flows/2)
fq_gc(q, root, sk);
p = &root->rb_node;
parent = NULL;
while (*p) {
parent = *p;
f = container_of(parent, struct fq_flow, fq_node);
if (f->sk == sk) {
/* socket might have been reallocated, so check
* if its sk_hash is the same.
 * If not, we need to refill credit with
 * the initial quantum.
*/
if (unlikely(skb->sk &&
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
f->time_next_packet = 0ULL;
}
return f;
}
if (f->sk > sk)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
f = kmem_cache_zalloc(fq_flow_cachep, GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!f)) {
q->stat_allocation_errors++;
return &q->internal;
}
fq_flow_set_detached(f);
f->sk = sk;
if (skb->sk)
f->socket_hash = sk->sk_hash;
f->credit = q->initial_quantum;
rb_link_node(&f->fq_node, parent, p);
rb_insert_color(&f->fq_node, root);
q->flows++;
q->inactive_flows++;
return f;
}
/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
struct sk_buff *skb = flow->head;
if (skb) {
flow->head = skb->next;
skb->next = NULL;
flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
}
return skb;
}
/* We might add detection of retransmits in the future.
 * For the time being, just return false.
*/
static bool skb_is_retransmit(struct sk_buff *skb)
{
return false;
}
/* add skb to flow queue
 * flow queue is a linked list, kind of a FIFO, except for TCP retransmits.
 * We special-case TCP retransmits to be transmitted before other packets.
 * We rely on the fact that TCP retransmits are unlikely, so we do not waste
 * a separate queue or a pointer.
* head-> [retrans pkt 1]
* [retrans pkt 2]
* [ normal pkt 1]
* [ normal pkt 2]
* [ normal pkt 3]
* tail-> [ normal pkt 4]
*/
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
struct sk_buff *prev, *head = flow->head;
skb->next = NULL;
if (!head) {
flow->head = skb;
flow->tail = skb;
return;
}
if (likely(!skb_is_retransmit(skb))) {
flow->tail->next = skb;
flow->tail = skb;
return;
}
/* This skb is a tcp retransmit,
* find the last retrans packet in the queue
*/
prev = NULL;
while (skb_is_retransmit(head)) {
prev = head;
head = head->next;
if (!head)
break;
}
if (!prev) { /* no rtx packet in queue, become the new head */
skb->next = flow->head;
flow->head = skb;
} else {
if (prev == flow->tail)
flow->tail = skb;
else
skb->next = prev->next;
prev->next = skb;
}
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch);
f = fq_classify(skb, q);
if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
q->stat_flows_plimit++;
return qdisc_drop(skb, sch);
}
f->qlen++;
if (skb_is_retransmit(skb))
q->stat_tcp_retrans++;
qdisc_qstats_backlog_inc(sch, skb);
if (fq_flow_is_detached(f)) {
fq_flow_add_tail(&q->new_flows, f);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
q->inactive_flows--;
}
/* Note: this overwrites f->age */
flow_queue_add(f, skb);
if (unlikely(f == &q->internal)) {
q->stat_internal_packets++;
}
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
struct rb_node *p;
if (q->time_next_delayed_flow > now)
return;
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
struct fq_flow *f = container_of(p, struct fq_flow, rate_node);
if (f->time_next_packet > now) {
q->time_next_delayed_flow = f->time_next_packet;
break;
}
rb_erase(p, &q->delayed);
q->throttled_flows--;
fq_flow_add_tail(&q->old_flows, f);
}
}
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 now = ktime_get_ns();
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
u32 rate;
skb = fq_dequeue_head(sch, &q->internal);
if (skb)
goto out;
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
if (!head->first) {
head = &q->old_flows;
if (!head->first) {
if (q->time_next_delayed_flow != ~0ULL)
qdisc_watchdog_schedule_ns(&q->watchdog,
q->time_next_delayed_flow,
false);
return NULL;
}
}
f = head->first;
if (f->credit <= 0) {
f->credit += q->quantum;
head->first = f->next;
fq_flow_add_tail(&q->old_flows, f);
goto begin;
}
if (unlikely(f->head && now < f->time_next_packet)) {
head->first = f->next;
fq_flow_set_throttled(q, f);
goto begin;
}
skb = fq_dequeue_head(sch, f);
if (!skb) {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && q->old_flows.first) {
fq_flow_add_tail(&q->old_flows, f);
} else {
fq_flow_set_detached(f);
q->inactive_flows++;
}
goto begin;
}
prefetch(&skb->end);
f->time_next_packet = now;
f->credit -= qdisc_pkt_len(skb);
if (f->credit > 0 || !q->rate_enable)
goto out;
rate = q->flow_max_rate;
if (skb->sk && skb->sk->sk_state != TCP_TIME_WAIT)
rate = min(skb->sk->sk_pacing_rate, rate);
if (rate != ~0U) {
u32 plen = max(qdisc_pkt_len(skb), q->quantum);
u64 len = (u64)plen * NSEC_PER_SEC;
if (likely(rate))
do_div(len, rate);
/* Since socket rate can change later,
* clamp the delay to 125 ms.
* TODO: maybe segment the too big skb, as in commit
* e43ac79a4bc ("sch_tbf: segment too big GSO packets")
*/
if (unlikely(len > 125 * NSEC_PER_MSEC)) {
len = 125 * NSEC_PER_MSEC;
q->stat_pkts_too_long++;
}
f->time_next_packet = now + len;
}
out:
qdisc_bstats_update(sch, skb);
return skb;
}
static void fq_reset(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_root *root;
struct sk_buff *skb;
struct rb_node *p;
struct fq_flow *f;
unsigned int idx;
while ((skb = fq_dequeue_head(sch, &q->internal)) != NULL)
kfree_skb(skb);
if (!q->fq_root)
return;
for (idx = 0; idx < (1U << q->fq_trees_log); idx++) {
root = &q->fq_root[idx];
while ((p = rb_first(root)) != NULL) {
f = container_of(p, struct fq_flow, fq_node);
rb_erase(p, root);
while ((skb = fq_dequeue_head(sch, f)) != NULL)
kfree_skb(skb);
kmem_cache_free(fq_flow_cachep, f);
}
}
q->new_flows.first = NULL;
q->old_flows.first = NULL;
q->delayed = RB_ROOT;
q->flows = 0;
q->inactive_flows = 0;
q->throttled_flows = 0;
}
static void fq_rehash(struct fq_sched_data *q,
struct rb_root *old_array, u32 old_log,
struct rb_root *new_array, u32 new_log)
{
struct rb_node *op, **np, *parent;
struct rb_root *oroot, *nroot;
struct fq_flow *of, *nf;
int fcnt = 0;
u32 idx;
for (idx = 0; idx < (1U << old_log); idx++) {
oroot = &old_array[idx];
while ((op = rb_first(oroot)) != NULL) {
rb_erase(op, oroot);
of = container_of(op, struct fq_flow, fq_node);
if (fq_gc_candidate(of)) {
fcnt++;
kmem_cache_free(fq_flow_cachep, of);
continue;
}
nroot = &new_array[hash_32((u32)(long)of->sk, new_log)];
np = &nroot->rb_node;
parent = NULL;
while (*np) {
parent = *np;
nf = container_of(parent, struct fq_flow, fq_node);
BUG_ON(nf->sk == of->sk);
if (nf->sk > of->sk)
np = &parent->rb_right;
else
np = &parent->rb_left;
}
rb_link_node(&of->fq_node, parent, np);
rb_insert_color(&of->fq_node, nroot);
}
}
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
}
static void *fq_alloc_node(size_t sz, int node)
{
void *ptr;
ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node);
if (!ptr)
ptr = vmalloc_node(sz, node);
return ptr;
}
static void fq_free(void *addr)
{
kvfree(addr);
}
static int fq_resize(struct Qdisc *sch, u32 log)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_root *array;
void *old_fq_root;
u32 idx;
if (q->fq_root && log == q->fq_trees_log)
return 0;
/* If XPS was set up, we can allocate memory on the right NUMA node */
array = fq_alloc_node(sizeof(struct rb_root) << log,
netdev_queue_numa_node_read(sch->dev_queue));
if (!array)
return -ENOMEM;
for (idx = 0; idx < (1U << log); idx++)
array[idx] = RB_ROOT;
sch_tree_lock(sch);
old_fq_root = q->fq_root;
if (old_fq_root)
fq_rehash(q, old_fq_root, q->fq_trees_log, array, log);
q->fq_root = array;
q->fq_trees_log = log;
sch_tree_unlock(sch);
fq_free(old_fq_root);
return 0;
}
static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_FLOW_PLIMIT] = { .type = NLA_U32 },
[TCA_FQ_QUANTUM] = { .type = NLA_U32 },
[TCA_FQ_INITIAL_QUANTUM] = { .type = NLA_U32 },
[TCA_FQ_RATE_ENABLE] = { .type = NLA_U32 },
[TCA_FQ_FLOW_DEFAULT_RATE] = { .type = NLA_U32 },
[TCA_FQ_FLOW_MAX_RATE] = { .type = NLA_U32 },
[TCA_FQ_BUCKETS_LOG] = { .type = NLA_U32 },
[TCA_FQ_FLOW_REFILL_DELAY] = { .type = NLA_U32 },
};
static int fq_change(struct Qdisc *sch, struct nlattr *opt)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_MAX + 1];
int err, drop_count = 0;
u32 fq_log;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy);
if (err < 0)
return err;
sch_tree_lock(sch);
fq_log = q->fq_trees_log;
if (tb[TCA_FQ_BUCKETS_LOG]) {
u32 nval = nla_get_u32(tb[TCA_FQ_BUCKETS_LOG]);
if (nval >= 1 && nval <= ilog2(256*1024))
fq_log = nval;
else
err = -EINVAL;
}
if (tb[TCA_FQ_PLIMIT])
sch->limit = nla_get_u32(tb[TCA_FQ_PLIMIT]);
if (tb[TCA_FQ_FLOW_PLIMIT])
q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]);
if (tb[TCA_FQ_QUANTUM])
q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]);
if (tb[TCA_FQ_INITIAL_QUANTUM])
q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]);
if (tb[TCA_FQ_FLOW_DEFAULT_RATE])
pr_warn_ratelimited("sch_fq: defrate %u ignored.\n",
nla_get_u32(tb[TCA_FQ_FLOW_DEFAULT_RATE]));
if (tb[TCA_FQ_FLOW_MAX_RATE])
q->flow_max_rate = nla_get_u32(tb[TCA_FQ_FLOW_MAX_RATE]);
if (tb[TCA_FQ_RATE_ENABLE]) {
u32 enable = nla_get_u32(tb[TCA_FQ_RATE_ENABLE]);
if (enable <= 1)
q->rate_enable = enable;
else
err = -EINVAL;
}
if (tb[TCA_FQ_FLOW_REFILL_DELAY]) {
u32 usecs_delay = nla_get_u32(tb[TCA_FQ_FLOW_REFILL_DELAY]);
q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
}
if (!err) {
sch_tree_unlock(sch);
err = fq_resize(sch, fq_log);
sch_tree_lock(sch);
}
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = fq_dequeue(sch);
if (!skb)
break;
kfree_skb(skb);
drop_count++;
}
qdisc_tree_decrease_qlen(sch, drop_count);
sch_tree_unlock(sch);
return err;
}
static void fq_destroy(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
fq_reset(sch);
fq_free(q->fq_root);
qdisc_watchdog_cancel(&q->watchdog);
}
static int fq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct fq_sched_data *q = qdisc_priv(sch);
int err;
sch->limit = 10000;
q->flow_plimit = 100;
q->quantum = 2 * psched_mtu(qdisc_dev(sch));
q->initial_quantum = 10 * psched_mtu(qdisc_dev(sch));
q->flow_refill_delay = msecs_to_jiffies(40);
q->flow_max_rate = ~0U;
q->rate_enable = 1;
q->new_flows.first = NULL;
q->old_flows.first = NULL;
q->delayed = RB_ROOT;
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
qdisc_watchdog_init(&q->watchdog, sch);
if (opt)
err = fq_change(sch, opt);
else
err = fq_resize(sch, q->fq_trees_log);
return err;
}
static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
/* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */
if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) ||
nla_put_u32(skb, TCA_FQ_QUANTUM, q->quantum) ||
nla_put_u32(skb, TCA_FQ_INITIAL_QUANTUM, q->initial_quantum) ||
nla_put_u32(skb, TCA_FQ_RATE_ENABLE, q->rate_enable) ||
nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
jiffies_to_usecs(q->flow_refill_delay)) ||
nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
return -1;
}
static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 now = ktime_get_ns();
struct tc_fq_qd_stats st = {
.gc_flows = q->stat_gc_flows,
.highprio_packets = q->stat_internal_packets,
.tcp_retrans = q->stat_tcp_retrans,
.throttled = q->stat_throttled,
.flows_plimit = q->stat_flows_plimit,
.pkts_too_long = q->stat_pkts_too_long,
.allocation_errors = q->stat_allocation_errors,
.flows = q->flows,
.inactive_flows = q->inactive_flows,
.throttled_flows = q->throttled_flows,
.time_next_delayed_flow = q->time_next_delayed_flow - now,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc_ops fq_qdisc_ops __read_mostly = {
.id = "fq",
.priv_size = sizeof(struct fq_sched_data),
.enqueue = fq_enqueue,
.dequeue = fq_dequeue,
.peek = qdisc_peek_dequeued,
.init = fq_init,
.reset = fq_reset,
.destroy = fq_destroy,
.change = fq_change,
.dump = fq_dump,
.dump_stats = fq_dump_stats,
.owner = THIS_MODULE,
};
static int __init fq_module_init(void)
{
int ret;
fq_flow_cachep = kmem_cache_create("fq_flow_cache",
sizeof(struct fq_flow),
0, 0, NULL);
if (!fq_flow_cachep)
return -ENOMEM;
ret = register_qdisc(&fq_qdisc_ops);
if (ret)
kmem_cache_destroy(fq_flow_cachep);
return ret;
}
static void __exit fq_module_exit(void)
{
unregister_qdisc(&fq_qdisc_ops);
kmem_cache_destroy(fq_flow_cachep);
}
module_init(fq_module_init)
module_exit(fq_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");

624
net/sched/sch_fq_codel.c Normal file
View file

@ -0,0 +1,624 @@
/*
* Fair Queue CoDel discipline
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/codel.h>
/* Fair Queue CoDel.
*
 * Principles:
 * Packets are classified (by an internal or external classifier) into flows.
 * This is a stochastic model (as we use a hash, several flows
 * might be hashed to the same slot).
 * Each flow has a CoDel managed queue.
 * Flows are linked onto two (Round Robin) lists,
 * so that new flows have priority over old ones.
 *
 * For a given flow, packets are not reordered (CoDel uses a FIFO,
 * with head drops only).
 * ECN capability is on by default.
 * Low memory footprint (64 bytes per flow).
*/
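/*
 * Classification example (numbers for illustration only): with the default
 * flows_cnt = 1024, fq_codel_hash() mixes (dst, src ^ ip_proto, ports) via
 * jhash_3words() and a random perturbation, then maps the 32-bit result h
 * into a slot with reciprocal_scale(): slot = ((u64)h * 1024) >> 32, e.g.
 * h = 0x9e3779b9 lands in slot 632. Two flows hashing to the same slot
 * simply share one CoDel queue.
 */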
struct fq_codel_flow {
struct sk_buff *head;
struct sk_buff *tail;
struct list_head flowchain;
int deficit;
u32 dropped; /* number of drops (or ECN marks) on this flow */
struct codel_vars cvars;
}; /* please try to keep this structure <= 64 bytes */
struct fq_codel_sched_data {
struct tcf_proto __rcu *filter_list; /* optional external classifier */
struct fq_codel_flow *flows; /* Flows table [flows_cnt] */
u32 *backlogs; /* backlog table [flows_cnt] */
u32 flows_cnt; /* number of flows */
u32 perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
struct codel_params cparams;
struct codel_stats cstats;
u32 drop_overlimit;
u32 new_flow_count;
struct list_head new_flows; /* list of new flows */
struct list_head old_flows; /* list of old flows */
};
static unsigned int fq_codel_hash(const struct fq_codel_sched_data *q,
const struct sk_buff *skb)
{
struct flow_keys keys;
unsigned int hash;
skb_flow_dissect(skb, &keys);
hash = jhash_3words((__force u32)keys.dst,
(__force u32)keys.src ^ keys.ip_proto,
(__force u32)keys.ports, q->perturbation);
return reciprocal_scale(hash, q->flows_cnt);
}
static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct tcf_proto *filter;
struct tcf_result res;
int result;
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
TC_H_MIN(skb->priority) <= q->flows_cnt)
return TC_H_MIN(skb->priority);
filter = rcu_dereference_bh(q->filter_list);
if (!filter)
return fq_codel_hash(q, skb) + 1;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, filter, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
/* fall through */
case TC_ACT_SHOT:
return 0;
}
#endif
if (TC_H_MIN(res.classid) <= q->flows_cnt)
return TC_H_MIN(res.classid);
}
return 0;
}
/* helper functions: might be changed when/if skb uses a standard list_head */
/* remove one skb from head of slot queue */
static inline struct sk_buff *dequeue_head(struct fq_codel_flow *flow)
{
struct sk_buff *skb = flow->head;
flow->head = skb->next;
skb->next = NULL;
return skb;
}
/* add skb to flow queue (tail add) */
static inline void flow_queue_add(struct fq_codel_flow *flow,
struct sk_buff *skb)
{
if (flow->head == NULL)
flow->head = skb;
else
flow->tail->next = skb;
flow->tail = skb;
skb->next = NULL;
}
static unsigned int fq_codel_drop(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
unsigned int maxbacklog = 0, idx = 0, i, len;
struct fq_codel_flow *flow;
/* Queue is full! Find the fat flow and drop packet from it.
* This might sound expensive, but with 1024 flows, we scan
 * 4KB of memory, and we don't need to handle a complex tree
 * in the fast path (packet enqueue/dequeue) with many cache misses.
*/
for (i = 0; i < q->flows_cnt; i++) {
if (q->backlogs[i] > maxbacklog) {
maxbacklog = q->backlogs[i];
idx = i;
}
}
flow = &q->flows[idx];
skb = dequeue_head(flow);
len = qdisc_pkt_len(skb);
q->backlogs[idx] -= len;
kfree_skb(skb);
sch->q.qlen--;
qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
flow->dropped++;
return idx;
}
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int idx;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
idx = fq_codel_classify(skb, sch, &ret);
if (idx == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
idx--;
codel_set_enqueue_time(skb);
flow = &q->flows[idx];
flow_queue_add(flow, skb);
q->backlogs[idx] += qdisc_pkt_len(skb);
qdisc_qstats_backlog_inc(sch, skb);
if (list_empty(&flow->flowchain)) {
list_add_tail(&flow->flowchain, &q->new_flows);
q->new_flow_count++;
flow->deficit = q->quantum;
flow->dropped = 0;
}
if (++sch->q.qlen <= sch->limit)
return NET_XMIT_SUCCESS;
q->drop_overlimit++;
/* Return Congestion Notification only if we dropped a packet
* from this flow.
*/
if (fq_codel_drop(sch) == idx)
return NET_XMIT_CN;
/* As we dropped a packet, better let the upper stack know this */
qdisc_tree_decrease_qlen(sch, 1);
return NET_XMIT_SUCCESS;
}
/* This is the specific function called from codel_dequeue()
 * to dequeue a packet from the queue. Note: backlog is handled in
 * codel, we don't need to reduce it here.
*/
static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct fq_codel_flow *flow;
struct sk_buff *skb = NULL;
flow = container_of(vars, struct fq_codel_flow, cvars);
if (flow->head) {
skb = dequeue_head(flow);
q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb);
sch->q.qlen--;
}
return skb;
}
static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
struct fq_codel_flow *flow;
struct list_head *head;
u32 prev_drop_count, prev_ecn_mark;
begin:
head = &q->new_flows;
if (list_empty(head)) {
head = &q->old_flows;
if (list_empty(head))
return NULL;
}
flow = list_first_entry(head, struct fq_codel_flow, flowchain);
if (flow->deficit <= 0) {
flow->deficit += q->quantum;
list_move_tail(&flow->flowchain, &q->old_flows);
goto begin;
}
prev_drop_count = q->cstats.drop_count;
prev_ecn_mark = q->cstats.ecn_mark;
skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
dequeue);
flow->dropped += q->cstats.drop_count - prev_drop_count;
flow->dropped += q->cstats.ecn_mark - prev_ecn_mark;
if (!skb) {
/* force a pass through old_flows to prevent starvation */
if ((head == &q->new_flows) && !list_empty(&q->old_flows))
list_move_tail(&flow->flowchain, &q->old_flows);
else
list_del_init(&flow->flowchain);
goto begin;
}
qdisc_bstats_update(sch, skb);
flow->deficit -= qdisc_pkt_len(skb);
/* We can't call qdisc_tree_decrease_qlen() if our qlen is 0,
 * or HTB crashes. Defer it for the next round.
*/
if (q->cstats.drop_count && sch->q.qlen) {
qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
q->cstats.drop_count = 0;
}
return skb;
}
static void fq_codel_reset(struct Qdisc *sch)
{
struct sk_buff *skb;
while ((skb = fq_codel_dequeue(sch)) != NULL)
kfree_skb(skb);
}
static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = {
[TCA_FQ_CODEL_TARGET] = { .type = NLA_U32 },
[TCA_FQ_CODEL_LIMIT] = { .type = NLA_U32 },
[TCA_FQ_CODEL_INTERVAL] = { .type = NLA_U32 },
[TCA_FQ_CODEL_ECN] = { .type = NLA_U32 },
[TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 },
[TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 },
};
static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_CODEL_MAX + 1];
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy);
if (err < 0)
return err;
if (tb[TCA_FQ_CODEL_FLOWS]) {
if (q->flows)
return -EINVAL;
q->flows_cnt = nla_get_u32(tb[TCA_FQ_CODEL_FLOWS]);
if (!q->flows_cnt ||
q->flows_cnt > 65536)
return -EINVAL;
}
sch_tree_lock(sch);
if (tb[TCA_FQ_CODEL_TARGET]) {
u64 target = nla_get_u32(tb[TCA_FQ_CODEL_TARGET]);
q->cparams.target = (target * NSEC_PER_USEC) >> CODEL_SHIFT;
}
if (tb[TCA_FQ_CODEL_INTERVAL]) {
u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]);
q->cparams.interval = (interval * NSEC_PER_USEC) >> CODEL_SHIFT;
}
if (tb[TCA_FQ_CODEL_LIMIT])
sch->limit = nla_get_u32(tb[TCA_FQ_CODEL_LIMIT]);
if (tb[TCA_FQ_CODEL_ECN])
q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]);
if (tb[TCA_FQ_CODEL_QUANTUM])
q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM]));
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = fq_codel_dequeue(sch);
kfree_skb(skb);
q->cstats.drop_count++;
}
qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
q->cstats.drop_count = 0;
sch_tree_unlock(sch);
return 0;
}
static void *fq_codel_zalloc(size_t sz)
{
void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
if (!ptr)
ptr = vzalloc(sz);
return ptr;
}
static void fq_codel_free(void *addr)
{
kvfree(addr);
}
static void fq_codel_destroy(struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
fq_codel_free(q->backlogs);
fq_codel_free(q->flows);
}
static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
int i;
sch->limit = 10*1024;
q->flows_cnt = 1024;
q->quantum = psched_mtu(qdisc_dev(sch));
q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_flows);
INIT_LIST_HEAD(&q->old_flows);
codel_params_init(&q->cparams);
codel_stats_init(&q->cstats);
q->cparams.ecn = true;
if (opt) {
int err = fq_codel_change(sch, opt);
if (err)
return err;
}
if (!q->flows) {
q->flows = fq_codel_zalloc(q->flows_cnt *
sizeof(struct fq_codel_flow));
if (!q->flows)
return -ENOMEM;
q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32));
if (!q->backlogs) {
fq_codel_free(q->flows);
return -ENOMEM;
}
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
INIT_LIST_HEAD(&flow->flowchain);
codel_vars_init(&flow->cvars);
}
}
if (sch->limit >= 1)
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_FQ_CODEL_TARGET,
codel_time_to_us(q->cparams.target)) ||
nla_put_u32(skb, TCA_FQ_CODEL_LIMIT,
sch->limit) ||
nla_put_u32(skb, TCA_FQ_CODEL_INTERVAL,
codel_time_to_us(q->cparams.interval)) ||
nla_put_u32(skb, TCA_FQ_CODEL_ECN,
q->cparams.ecn) ||
nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM,
q->quantum) ||
nla_put_u32(skb, TCA_FQ_CODEL_FLOWS,
q->flows_cnt))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
return -1;
}
static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
struct tc_fq_codel_xstats st = {
.type = TCA_FQ_CODEL_XSTATS_QDISC,
};
struct list_head *pos;
st.qdisc_stats.maxpacket = q->cstats.maxpacket;
st.qdisc_stats.drop_overlimit = q->drop_overlimit;
st.qdisc_stats.ecn_mark = q->cstats.ecn_mark;
st.qdisc_stats.new_flow_count = q->new_flow_count;
list_for_each(pos, &q->new_flows)
st.qdisc_stats.new_flows_len++;
list_for_each(pos, &q->old_flows)
st.qdisc_stats.old_flows_len++;
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc *fq_codel_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
}
static unsigned long fq_codel_get(struct Qdisc *sch, u32 classid)
{
return 0;
}
static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
/* we cannot bypass queue discipline anymore */
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static void fq_codel_put(struct Qdisc *q, unsigned long cl)
{
}
static struct tcf_proto __rcu **fq_codel_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static int fq_codel_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
tcm->tcm_handle |= TC_H_MIN(cl);
return 0;
}
static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
u32 idx = cl - 1;
struct gnet_stats_queue qs = { 0 };
struct tc_fq_codel_xstats xstats;
if (idx < q->flows_cnt) {
const struct fq_codel_flow *flow = &q->flows[idx];
const struct sk_buff *skb = flow->head;
memset(&xstats, 0, sizeof(xstats));
xstats.type = TCA_FQ_CODEL_XSTATS_CLASS;
xstats.class_stats.deficit = flow->deficit;
xstats.class_stats.ldelay =
codel_time_to_us(flow->cvars.ldelay);
xstats.class_stats.count = flow->cvars.count;
xstats.class_stats.lastcount = flow->cvars.lastcount;
xstats.class_stats.dropping = flow->cvars.dropping;
if (flow->cvars.dropping) {
codel_tdiff_t delta = flow->cvars.drop_next -
codel_get_time();
xstats.class_stats.drop_next = (delta >= 0) ?
codel_time_to_us(delta) :
-codel_time_to_us(-delta);
}
while (skb) {
qs.qlen++;
skb = skb->next;
}
qs.backlog = q->backlogs[idx];
qs.drops = flow->dropped;
}
if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0)
return -1;
if (idx < q->flows_cnt)
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
return 0;
}
static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
unsigned int i;
if (arg->stop)
return;
for (i = 0; i < q->flows_cnt; i++) {
if (list_empty(&q->flows[i].flowchain) ||
arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, i + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static const struct Qdisc_class_ops fq_codel_class_ops = {
.leaf = fq_codel_leaf,
.get = fq_codel_get,
.put = fq_codel_put,
.tcf_chain = fq_codel_find_tcf,
.bind_tcf = fq_codel_bind,
.unbind_tcf = fq_codel_put,
.dump = fq_codel_dump_class,
.dump_stats = fq_codel_dump_class_stats,
.walk = fq_codel_walk,
};
static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = {
.cl_ops = &fq_codel_class_ops,
.id = "fq_codel",
.priv_size = sizeof(struct fq_codel_sched_data),
.enqueue = fq_codel_enqueue,
.dequeue = fq_codel_dequeue,
.peek = qdisc_peek_dequeued,
.drop = fq_codel_drop,
.init = fq_codel_init,
.reset = fq_codel_reset,
.destroy = fq_codel_destroy,
.change = fq_codel_change,
.dump = fq_codel_dump,
.dump_stats = fq_codel_dump_stats,
.owner = THIS_MODULE,
};
static int __init fq_codel_module_init(void)
{
return register_qdisc(&fq_codel_qdisc_ops);
}
static void __exit fq_codel_module_exit(void)
{
unregister_qdisc(&fq_codel_qdisc_ops);
}
module_init(fq_codel_module_init)
module_exit(fq_codel_module_exit)
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");

990
net/sched/sch_generic.c Normal file
View file

@ -0,0 +1,990 @@
/*
* net/sched/sch_generic.c Generic packet scheduler routines.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Jamal Hadi Salim, <hadi@cyberus.ca> 990601
* - Ingress support
*/
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);
/* Main transmission queue. */
/* Modifications to data participating in scheduling must be protected with
* qdisc_lock(qdisc) spinlock.
*
* The idea is the following:
* - enqueue, dequeue are serialized via qdisc root lock
* - ingress filtering is also serialized via qdisc root lock
* - updates to tree and tree walking are only done under the rtnl mutex.
*/
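/*
 * Illustrative caller pattern (a sketch mirroring __dev_xmit_skb() in
 * net/core/dev.c; bypass and busylock details omitted). Enqueue and the
 * dequeue/transmit loop both run under the same root lock:
 *
 *	spinlock_t *root_lock = qdisc_lock(q);
 *
 *	spin_lock(root_lock);
 *	rc = q->enqueue(skb, q) & NET_XMIT_MASK;
 *	if (qdisc_run_begin(q))
 *		__qdisc_run(q);
 *	spin_unlock(root_lock);
 */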
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
q->gso_skb = skb;
q->qstats.requeues++;
q->q.qlen++; /* it's still part of the queue */
__netif_schedule(q);
return 0;
}
static void try_bulk_dequeue_skb(struct Qdisc *q,
struct sk_buff *skb,
const struct netdev_queue *txq,
int *packets)
{
int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
while (bytelimit > 0) {
struct sk_buff *nskb = q->dequeue(q);
if (!nskb)
break;
bytelimit -= nskb->len; /* covers GSO len */
skb->next = nskb;
skb = nskb;
(*packets)++; /* GSO counts as one pkt */
}
skb->next = NULL;
}
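/*
 * Worked example: if qdisc_avail_bulklimit(txq) reports 3000 bytes of BQL
 * budget and the first dequeued skb is 1500 bytes, bytelimit starts at
 * 1500, exactly one more 1500-byte skb is chained (bytelimit drops to 0),
 * and the two-skb list is handed to the driver in one sch_direct_xmit()
 * call.
 */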
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
* A requeued skb (via q->gso_skb) can also be a SKB list.
*/
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
int *packets)
{
struct sk_buff *skb = q->gso_skb;
const struct netdev_queue *txq = q->dev_queue;
*packets = 1;
*validate = true;
if (unlikely(skb)) {
/* check the reason for requeueing without the tx lock first */
txq = skb_get_tx_queue(txq->dev, skb);
if (!netif_xmit_frozen_or_stopped(txq)) {
q->gso_skb = NULL;
q->q.qlen--;
} else
skb = NULL;
/* skbs in gso_skb were already validated */
*validate = false;
} else {
if (!(q->flags & TCQ_F_ONETXQUEUE) ||
!netif_xmit_frozen_or_stopped(txq)) {
skb = q->dequeue(q);
if (skb && qdisc_may_bulk(q))
try_bulk_dequeue_skb(q, skb, txq, packets);
}
}
return skb;
}
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
struct netdev_queue *dev_queue,
struct Qdisc *q)
{
int ret;
if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
/*
* Same CPU holding the lock. It may be a transient
* configuration error, when hard_start_xmit() recurses. We
* detect it by checking xmit owner and drop the packet when
* deadloop is detected. Return OK to try the next skb.
*/
kfree_skb_list(skb);
net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
dev_queue->dev->name);
ret = qdisc_qlen(q);
} else {
/*
* Another cpu is holding lock, requeue & delay xmits for
* some time.
*/
__this_cpu_inc(softnet_data.cpu_collision);
ret = dev_requeue_skb(skb, q);
}
return ret;
}
/*
* Transmit possibly several skbs, and handle the return status as
* required. Holding the __QDISC___STATE_RUNNING bit guarantees that
* only one CPU can execute this function.
*
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*/
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock, bool validate)
{
int ret = NETDEV_TX_BUSY;
/* And release qdisc */
spin_unlock(root_lock);
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)
skb = validate_xmit_skb_list(skb, dev);
if (skb) {
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
skb = dev_hard_start_xmit(skb, dev, txq, &ret);
HARD_TX_UNLOCK(dev, txq);
}
spin_lock(root_lock);
if (dev_xmit_complete(ret)) {
/* Driver sent out skb successfully or skb was consumed */
ret = qdisc_qlen(q);
} else if (ret == NETDEV_TX_LOCKED) {
/* Driver try lock failed */
ret = handle_dev_cpu_collision(skb, txq, q);
} else {
/* Driver returned NETDEV_TX_BUSY - requeue skb */
if (unlikely(ret != NETDEV_TX_BUSY))
net_warn_ratelimited("BUG %s code %d qlen %d\n",
dev->name, ret, q->q.qlen);
ret = dev_requeue_skb(skb, q);
}
if (ret && netif_xmit_frozen_or_stopped(txq))
ret = 0;
return ret;
}
/*
* NOTE: Called under qdisc_lock(q) with locally disabled BH.
*
* __QDISC___STATE_RUNNING guarantees only one CPU can process
* this qdisc at a time. qdisc_lock(q) serializes queue accesses for
* this queue.
*
* netif_tx_lock serializes accesses to device driver.
*
* qdisc_lock(q) and netif_tx_lock are mutually exclusive,
* if one is grabbed, another must be free.
*
* Note that this procedure can be called by a watchdog timer
*
* Returns to the caller:
* 0 - queue is empty or throttled.
* >0 - queue is not empty.
*
*/
static inline int qdisc_restart(struct Qdisc *q, int *packets)
{
struct netdev_queue *txq;
struct net_device *dev;
spinlock_t *root_lock;
struct sk_buff *skb;
bool validate;
/* Dequeue packet */
skb = dequeue_skb(q, &validate, packets);
if (unlikely(!skb))
return 0;
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);
return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}
void __qdisc_run(struct Qdisc *q)
{
int quota = weight_p;
int packets;
while (qdisc_restart(q, &packets)) {
/*
* Ordered by possible occurrence: Postpone processing if
* 1. we've exceeded packet quota
* 2. another process needs the CPU;
*/
quota -= packets;
if (quota <= 0 || need_resched()) {
__netif_schedule(q);
break;
}
}
qdisc_run_end(q);
}
unsigned long dev_trans_start(struct net_device *dev)
{
unsigned long val, res;
unsigned int i;
if (is_vlan_dev(dev))
dev = vlan_dev_real_dev(dev);
res = dev->trans_start;
for (i = 0; i < dev->num_tx_queues; i++) {
val = netdev_get_tx_queue(dev, i)->trans_start;
if (val && time_after(val, res))
res = val;
}
dev->trans_start = res;
return res;
}
EXPORT_SYMBOL(dev_trans_start);
static void dev_watchdog(unsigned long arg)
{
struct net_device *dev = (struct net_device *)arg;
netif_tx_lock(dev);
if (!qdisc_tx_is_noop(dev)) {
if (netif_device_present(dev) &&
netif_running(dev) &&
netif_carrier_ok(dev)) {
int some_queue_timedout = 0;
unsigned int i;
unsigned long trans_start;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq;
txq = netdev_get_tx_queue(dev, i);
/*
* old device drivers set dev->trans_start
*/
trans_start = txq->trans_start ? : dev->trans_start;
if (netif_xmit_stopped(txq) &&
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
txq->trans_timeout++;
break;
}
}
if (some_queue_timedout) {
WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
dev->name, netdev_drivername(dev), i);
dev->netdev_ops->ndo_tx_timeout(dev);
}
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies +
dev->watchdog_timeo)))
dev_hold(dev);
}
}
netif_tx_unlock(dev);
dev_put(dev);
}
void __netdev_watchdog_up(struct net_device *dev)
{
if (dev->netdev_ops->ndo_tx_timeout) {
if (dev->watchdog_timeo <= 0)
dev->watchdog_timeo = 5*HZ;
if (!mod_timer(&dev->watchdog_timer,
round_jiffies(jiffies + dev->watchdog_timeo)))
dev_hold(dev);
}
}
static void dev_watchdog_up(struct net_device *dev)
{
__netdev_watchdog_up(dev);
}
static void dev_watchdog_down(struct net_device *dev)
{
netif_tx_lock_bh(dev);
if (del_timer(&dev->watchdog_timer))
dev_put(dev);
netif_tx_unlock_bh(dev);
}
/**
* netif_carrier_on - set carrier
* @dev: network device
*
* Device has detected acquisition of carrier.
*/
void netif_carrier_on(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
if (netif_running(dev))
__netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_carrier_on);
/**
* netif_carrier_off - clear carrier
* @dev: network device
*
* Device has detected loss of carrier.
*/
void netif_carrier_off(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
}
}
EXPORT_SYMBOL(netif_carrier_off);
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
under all circumstances. It is difficult to invent anything faster or
cheaper.
*/
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
kfree_skb(skb);
return NET_XMIT_CN;
}
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
return NULL;
}
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
.id = "noop",
.priv_size = 0,
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
.peek = noop_dequeue,
.owner = THIS_MODULE,
};
static struct netdev_queue noop_netdev_queue = {
.qdisc = &noop_qdisc,
.qdisc_sleeping = &noop_qdisc,
};
struct Qdisc noop_qdisc = {
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
.flags = TCQ_F_BUILTIN,
.ops = &noop_qdisc_ops,
.list = LIST_HEAD_INIT(noop_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.id = "noqueue",
.priv_size = 0,
.enqueue = noop_enqueue,
.dequeue = noop_dequeue,
.peek = noop_dequeue,
.owner = THIS_MODULE,
};
static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
.qdisc = &noqueue_qdisc,
.qdisc_sleeping = &noqueue_qdisc,
};
static struct Qdisc noqueue_qdisc = {
.enqueue = NULL,
.dequeue = noop_dequeue,
.flags = TCQ_F_BUILTIN,
.ops = &noqueue_qdisc_ops,
.list = LIST_HEAD_INIT(noqueue_qdisc.list),
.q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
.dev_queue = &noqueue_netdev_queue,
.busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
};
static const u8 prio2band[TC_PRIO_MAX + 1] = {
1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
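/*
 * Example mapping: skb->priority TC_PRIO_BESTEFFORT (0) lands in band 1,
 * TC_PRIO_BULK (2) in band 2, and TC_PRIO_INTERACTIVE (6) in band 0,
 * which is dequeued first.
 */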
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
*/
#define PFIFO_FAST_BANDS 3
/*
* Private data for a pfifo_fast scheduler containing:
* - queues for the three bands
* - bitmap indicating which of the bands contain skbs
*/
struct pfifo_fast_priv {
u32 bitmap;
struct sk_buff_head q[PFIFO_FAST_BANDS];
};
/*
* Convert a bitmap to the first band number where an skb is queued, where:
* bitmap=0 means there are no skbs on any band.
* bitmap=1 means there is an skb on band 0.
* bitmap=7 means there are skbs on all 3 bands, etc.
*/
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
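/*
 * Worked example: with skbs queued on bands 1 and 2 only,
 * bitmap = (1 << 1) | (1 << 2) = 6 and bitmap2band[6] = 1, so the
 * lowest-numbered (highest-priority) non-empty band is served first.
 */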
static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
int band)
{
return priv->q + band;
}
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
int band = prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct sk_buff_head *list = band2list(priv, band);
priv->bitmap |= (1 << band);
qdisc->q.qlen++;
return __qdisc_enqueue_tail(skb, qdisc, list);
}
return qdisc_drop(skb, qdisc);
}
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
if (likely(band >= 0)) {
struct sk_buff_head *list = band2list(priv, band);
struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
qdisc->q.qlen--;
if (skb_queue_empty(list))
priv->bitmap &= ~(1 << band);
return skb;
}
return NULL;
}
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
int band = bitmap2band[priv->bitmap];
if (band >= 0) {
struct sk_buff_head *list = band2list(priv, band);
return skb_peek(list);
}
return NULL;
}
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
int prio;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
__qdisc_reset_queue(qdisc, band2list(priv, prio));
priv->bitmap = 0;
qdisc->qstats.backlog = 0;
qdisc->q.qlen = 0;
}
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
return -1;
}
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
int prio;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
__skb_queue_head_init(band2list(priv, prio));
/* Can bypass the queue discipline */
qdisc->flags |= TCQ_F_CAN_BYPASS;
return 0;
}
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.id = "pfifo_fast",
.priv_size = sizeof(struct pfifo_fast_priv),
.enqueue = pfifo_fast_enqueue,
.dequeue = pfifo_fast_dequeue,
.peek = pfifo_fast_peek,
.init = pfifo_fast_init,
.reset = pfifo_fast_reset,
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
static struct lock_class_key qdisc_tx_busylock;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops)
{
void *p;
struct Qdisc *sch;
unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
int err = -ENOBUFS;
struct net_device *dev = dev_queue->dev;
p = kzalloc_node(size, GFP_KERNEL,
netdev_queue_numa_node_read(dev_queue));
if (!p)
goto errout;
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
/* if we got non-aligned memory, ask for more and do the alignment ourselves */
if (sch != p) {
kfree(p);
p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
netdev_queue_numa_node_read(dev_queue));
if (!p)
goto errout;
sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
sch->padded = (char *) sch - (char *) p;
}
INIT_LIST_HEAD(&sch->list);
skb_queue_head_init(&sch->q);
spin_lock_init(&sch->busylock);
lockdep_set_class(&sch->busylock,
dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
sch->ops = ops;
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
dev_hold(dev);
atomic_set(&sch->refcnt, 1);
return sch;
errout:
return ERR_PTR(err);
}
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
const struct Qdisc_ops *ops,
unsigned int parentid)
{
struct Qdisc *sch;
if (!try_module_get(ops->owner))
goto errout;
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
goto errout;
sch->parent = parentid;
if (!ops->init || ops->init(sch, NULL) == 0)
return sch;
qdisc_destroy(sch);
errout:
return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
/* Under qdisc_lock(qdisc) and BH! */
void qdisc_reset(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
if (ops->reset)
ops->reset(qdisc);
if (qdisc->gso_skb) {
kfree_skb_list(qdisc->gso_skb);
qdisc->gso_skb = NULL;
qdisc->q.qlen = 0;
}
}
EXPORT_SYMBOL(qdisc_reset);
static void qdisc_rcu_free(struct rcu_head *head)
{
struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
if (qdisc_is_percpu_stats(qdisc))
free_percpu(qdisc->cpu_bstats);
kfree((char *) qdisc - qdisc->padded);
}
void qdisc_destroy(struct Qdisc *qdisc)
{
const struct Qdisc_ops *ops = qdisc->ops;
if (qdisc->flags & TCQ_F_BUILTIN ||
!atomic_dec_and_test(&qdisc->refcnt))
return;
#ifdef CONFIG_NET_SCHED
qdisc_list_del(qdisc);
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
ops->destroy(qdisc);
module_put(ops->owner);
dev_put(qdisc_dev(qdisc));
kfree_skb_list(qdisc->gso_skb);
/*
* gen_estimator est_timer() might access qdisc->q.lock,
* so wait an RCU grace period before freeing the qdisc.
*/
call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
}
EXPORT_SYMBOL(qdisc_destroy);
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc)
{
struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
spinlock_t *root_lock;
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
/* Prune old scheduler */
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
qdisc_reset(oqdisc);
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
dev_queue->qdisc_sleeping = qdisc;
rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
spin_unlock_bh(root_lock);
return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
static void attach_one_default_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_unused)
{
struct Qdisc *qdisc = &noqueue_qdisc;
if (dev->tx_queue_len) {
qdisc = qdisc_create_dflt(dev_queue,
default_qdisc_ops, TC_H_ROOT);
if (!qdisc) {
netdev_info(dev, "activation failed\n");
return;
}
if (!netif_is_multiqueue(dev))
qdisc->flags |= TCQ_F_ONETXQUEUE;
}
dev_queue->qdisc_sleeping = qdisc;
}
static void attach_default_qdiscs(struct net_device *dev)
{
struct netdev_queue *txq;
struct Qdisc *qdisc;
txq = netdev_get_tx_queue(dev, 0);
if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
dev->qdisc = txq->qdisc_sleeping;
atomic_inc(&dev->qdisc->refcnt);
} else {
qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
if (qdisc) {
dev->qdisc = qdisc;
qdisc->ops->attach(qdisc);
}
}
}
static void transition_one_qdisc(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_need_watchdog)
{
struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
int *need_watchdog_p = _need_watchdog;
if (!(new_qdisc->flags & TCQ_F_BUILTIN))
clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);
rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
dev_queue->trans_start = 0;
*need_watchdog_p = 1;
}
}
void dev_activate(struct net_device *dev)
{
int need_watchdog;
/* No queueing discipline is attached to the device;
* create a default one for devices which need queueing,
* and noqueue_qdisc for virtual interfaces
*/
if (dev->qdisc == &noop_qdisc)
attach_default_qdiscs(dev);
if (!netif_carrier_ok(dev))
/* Delay activation until next carrier-on event */
return;
need_watchdog = 0;
netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
if (dev_ingress_queue(dev))
transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
if (need_watchdog) {
dev->trans_start = jiffies;
dev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(dev_activate);
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_qdisc_default)
{
struct Qdisc *qdisc_default = _qdisc_default;
struct Qdisc *qdisc;
qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
spin_lock_bh(qdisc_lock(qdisc));
if (!(qdisc->flags & TCQ_F_BUILTIN))
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
qdisc_reset(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
}
}
static bool some_qdisc_is_busy(struct net_device *dev)
{
unsigned int i;
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *dev_queue;
spinlock_t *root_lock;
struct Qdisc *q;
int val;
dev_queue = netdev_get_tx_queue(dev, i);
q = dev_queue->qdisc_sleeping;
root_lock = qdisc_lock(q);
spin_lock_bh(root_lock);
val = (qdisc_is_running(q) ||
test_bit(__QDISC_STATE_SCHED, &q->state));
spin_unlock_bh(root_lock);
if (val)
return true;
}
return false;
}
/**
* dev_deactivate_many - deactivate transmissions on several devices
* @head: list of devices to deactivate
*
* This function returns only when all outstanding transmissions
* have completed, unless all devices are in dismantle phase.
*/
void dev_deactivate_many(struct list_head *head)
{
struct net_device *dev;
bool sync_needed = false;
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
&noop_qdisc);
if (dev_ingress_queue(dev))
dev_deactivate_queue(dev, dev_ingress_queue(dev),
&noop_qdisc);
dev_watchdog_down(dev);
sync_needed |= !dev->dismantle;
}
/* Wait for outstanding qdisc-less dev_queue_xmit calls.
* This is avoided if all devices are in the dismantle phase:
* the caller will call synchronize_net() for us.
*/
if (sync_needed)
synchronize_net();
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list)
while (some_qdisc_is_busy(dev))
yield();
}
void dev_deactivate(struct net_device *dev)
{
LIST_HEAD(single);
list_add(&dev->close_list, &single);
dev_deactivate_many(&single);
list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);
static void dev_init_scheduler_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_qdisc)
{
struct Qdisc *qdisc = _qdisc;
rcu_assign_pointer(dev_queue->qdisc, qdisc);
dev_queue->qdisc_sleeping = qdisc;
}
void dev_init_scheduler(struct net_device *dev)
{
dev->qdisc = &noop_qdisc;
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}
static void shutdown_scheduler_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
void *_qdisc_default)
{
struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
struct Qdisc *qdisc_default = _qdisc_default;
if (qdisc) {
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
dev_queue->qdisc_sleeping = qdisc_default;
qdisc_destroy(qdisc);
}
}
void dev_shutdown(struct net_device *dev)
{
netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
if (dev_ingress_queue(dev))
shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
qdisc_destroy(dev->qdisc);
dev->qdisc = &noop_qdisc;
WARN_ON(timer_pending(&dev->watchdog_timer));
}
void psched_ratecfg_precompute(struct psched_ratecfg *r,
const struct tc_ratespec *conf,
u64 rate64)
{
memset(r, 0, sizeof(*r));
r->overhead = conf->overhead;
r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
r->mult = 1;
/*
* The deal here is to replace a divide by a reciprocal one
* in fast path (a reciprocal divide is a multiply and a shift)
*
* Normal formula would be:
* time_in_ns = (NSEC_PER_SEC * len) / rate_bps
*
* We compute mult/shift to use instead:
* time_in_ns = (len * mult) >> shift;
*
* We try to get the highest possible mult value for accuracy,
* but have to make sure no overflows will ever happen.
*/
if (r->rate_bytes_ps > 0) {
u64 factor = NSEC_PER_SEC;
for (;;) {
r->mult = div64_u64(factor, r->rate_bytes_ps);
if (r->mult & (1U << 31) || factor & (1ULL << 63))
break;
factor <<= 1;
r->shift++;
}
}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
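/*
 * Worked example of the precomputation above (the consumer is expected to
 * be psched_l2t_ns() in <net/sch_generic.h>): for rate64 = 125000000
 * bytes/s (1 Gbit/s) the loop settles on mult = 1U << 31 and shift = 28,
 * so a 1500-byte packet costs (1500ULL * mult) >> shift = 12000 ns,
 * which matches NSEC_PER_SEC * 1500 / 125000000.
 */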

630
net/sched/sch_gred.c Normal file
View file

@ -0,0 +1,630 @@
/*
* net/sched/sch_gred.c Generic Random Early Detection queue.
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002
*
* 991129: - Bug fix with grio mode
* - a better single AvgQ mode with Grio (WRED)
* - A finer-grained VQ dequeue based on a suggestion
* from Ren Liu
* - More error checks
*
* For all the glorious comments look at include/net/red.h
*/
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/red.h>
#define GRED_DEF_PRIO (MAX_DPs / 2)
#define GRED_VQ_MASK (MAX_DPs - 1)
struct gred_sched_data;
struct gred_sched;
struct gred_sched_data {
u32 limit; /* HARD maximal queue length */
u32 DP; /* the drop parameters */
u32 bytesin; /* bytes seen on virtualQ so far*/
u32 packetsin; /* packets seen on virtualQ so far*/
u32 backlog; /* bytes on the virtualQ */
u8 prio; /* the prio of this vq */
struct red_parms parms;
struct red_vars vars;
struct red_stats stats;
};
enum {
GRED_WRED_MODE = 1,
GRED_RIO_MODE,
};
struct gred_sched {
struct gred_sched_data *tab[MAX_DPs];
unsigned long flags;
u32 red_flags;
u32 DPs;
u32 def;
struct red_vars wred_set;
};
static inline int gred_wred_mode(struct gred_sched *table)
{
return test_bit(GRED_WRED_MODE, &table->flags);
}
static inline void gred_enable_wred_mode(struct gred_sched *table)
{
__set_bit(GRED_WRED_MODE, &table->flags);
}
static inline void gred_disable_wred_mode(struct gred_sched *table)
{
__clear_bit(GRED_WRED_MODE, &table->flags);
}
static inline int gred_rio_mode(struct gred_sched *table)
{
return test_bit(GRED_RIO_MODE, &table->flags);
}
static inline void gred_enable_rio_mode(struct gred_sched *table)
{
__set_bit(GRED_RIO_MODE, &table->flags);
}
static inline void gred_disable_rio_mode(struct gred_sched *table)
{
__clear_bit(GRED_RIO_MODE, &table->flags);
}
static inline int gred_wred_mode_check(struct Qdisc *sch)
{
struct gred_sched *table = qdisc_priv(sch);
int i;
/* Really ugly O(n^2), but it shouldn't be needed too frequently. */
for (i = 0; i < table->DPs; i++) {
struct gred_sched_data *q = table->tab[i];
int n;
if (q == NULL)
continue;
for (n = i + 1; n < table->DPs; n++)
if (table->tab[n] && table->tab[n]->prio == q->prio)
return 1;
}
return 0;
}
static inline unsigned int gred_backlog(struct gred_sched *table,
struct gred_sched_data *q,
struct Qdisc *sch)
{
if (gred_wred_mode(table))
return sch->qstats.backlog;
else
return q->backlog;
}
static inline u16 tc_index_to_dp(struct sk_buff *skb)
{
return skb->tc_index & GRED_VQ_MASK;
}
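/*
 * Example (MAX_DPs is 16 in the UAPI header, so GRED_VQ_MASK is 0xF):
 * skb->tc_index == 0x23 selects virtual queue / DP 3.
 */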
static inline void gred_load_wred_set(const struct gred_sched *table,
struct gred_sched_data *q)
{
q->vars.qavg = table->wred_set.qavg;
q->vars.qidlestart = table->wred_set.qidlestart;
}
static inline void gred_store_wred_set(struct gred_sched *table,
struct gred_sched_data *q)
{
table->wred_set.qavg = q->vars.qavg;
table->wred_set.qidlestart = q->vars.qidlestart;
}
static inline int gred_use_ecn(struct gred_sched *t)
{
return t->red_flags & TC_RED_ECN;
}
static inline int gred_use_harddrop(struct gred_sched *t)
{
return t->red_flags & TC_RED_HARDDROP;
}
static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct gred_sched_data *q = NULL;
struct gred_sched *t = qdisc_priv(sch);
unsigned long qavg = 0;
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
dp = t->def;
q = t->tab[dp];
if (!q) {
/* Pass through packets not assigned to a DP
* if no default DP has been configured. This
* allows for DP flows to be left untouched.
*/
if (skb_queue_len(&sch->q) < qdisc_dev(sch)->tx_queue_len)
return qdisc_enqueue_tail(skb, sch);
else
goto drop;
}
/* fix tc_index? --could be controversial but needed for
requeueing */
skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp;
}
/* sum up all the qaves of prios < ours to get the new qave */
if (!gred_wred_mode(t) && gred_rio_mode(t)) {
int i;
for (i = 0; i < t->DPs; i++) {
if (t->tab[i] && t->tab[i]->prio < q->prio &&
!red_is_idling(&t->tab[i]->vars))
qavg += t->tab[i]->vars.qavg;
}
}
q->packetsin++;
q->bytesin += qdisc_pkt_len(skb);
if (gred_wred_mode(t))
gred_load_wred_set(t, q);
q->vars.qavg = red_calc_qavg(&q->parms,
&q->vars,
gred_backlog(t, q, sch));
if (red_is_idling(&q->vars))
red_end_of_idle_period(&q->vars);
if (gred_wred_mode(t))
gred_store_wred_set(t, q);
switch (red_action(&q->parms, &q->vars, q->vars.qavg + qavg)) {
case RED_DONT_MARK:
break;
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
q->stats.prob_mark++;
break;
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
}
q->stats.forced_mark++;
break;
}
if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
q->backlog += qdisc_pkt_len(skb);
return qdisc_enqueue_tail(skb, sch);
}
q->stats.pdrop++;
drop:
return qdisc_drop(skb, sch);
congestion_drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
}
static struct sk_buff *gred_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
skb = qdisc_dequeue_head(sch);
if (skb) {
struct gred_sched_data *q;
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x after dequeue, screwing up backlog\n",
tc_index_to_dp(skb));
} else {
q->backlog -= qdisc_pkt_len(skb);
if (gred_wred_mode(t)) {
if (!sch->qstats.backlog)
red_start_of_idle_period(&t->wred_set);
} else {
if (!q->backlog)
red_start_of_idle_period(&q->vars);
}
}
return skb;
}
return NULL;
}
static unsigned int gred_drop(struct Qdisc *sch)
{
struct sk_buff *skb;
struct gred_sched *t = qdisc_priv(sch);
skb = qdisc_dequeue_tail(sch);
if (skb) {
unsigned int len = qdisc_pkt_len(skb);
struct gred_sched_data *q;
u16 dp = tc_index_to_dp(skb);
if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
net_warn_ratelimited("GRED: Unable to relocate VQ 0x%x while dropping, screwing up backlog\n",
tc_index_to_dp(skb));
} else {
q->backlog -= len;
q->stats.other++;
if (gred_wred_mode(t)) {
if (!sch->qstats.backlog)
red_start_of_idle_period(&t->wred_set);
} else {
if (!q->backlog)
red_start_of_idle_period(&q->vars);
}
}
qdisc_drop(skb, sch);
return len;
}
return 0;
}
static void gred_reset(struct Qdisc *sch)
{
int i;
struct gred_sched *t = qdisc_priv(sch);
qdisc_reset_queue(sch);
for (i = 0; i < t->DPs; i++) {
struct gred_sched_data *q = t->tab[i];
if (!q)
continue;
red_restart(&q->vars);
q->backlog = 0;
}
}
static inline void gred_destroy_vq(struct gred_sched_data *q)
{
kfree(q);
}
static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_sopt *sopt;
int i;
if (dps == NULL)
return -EINVAL;
sopt = nla_data(dps);
if (sopt->DPs > MAX_DPs || sopt->DPs == 0 || sopt->def_DP >= sopt->DPs)
return -EINVAL;
sch_tree_lock(sch);
table->DPs = sopt->DPs;
table->def = sopt->def_DP;
table->red_flags = sopt->flags;
/*
* Every entry point to GRED is synchronized with the above code
* and the DP is checked against DPs, i.e. shadowed VQs can no
* longer be found so we can unlock right here.
*/
sch_tree_unlock(sch);
if (sopt->grio) {
gred_enable_rio_mode(table);
gred_disable_wred_mode(table);
if (gred_wred_mode_check(sch))
gred_enable_wred_mode(table);
} else {
gred_disable_rio_mode(table);
gred_disable_wred_mode(table);
}
for (i = table->DPs; i < MAX_DPs; i++) {
if (table->tab[i]) {
pr_warn("GRED: Warning: Destroying shadowed VQ 0x%x\n",
i);
gred_destroy_vq(table->tab[i]);
table->tab[i] = NULL;
}
}
return 0;
}
static inline int gred_change_vq(struct Qdisc *sch, int dp,
struct tc_gred_qopt *ctl, int prio,
u8 *stab, u32 max_P,
struct gred_sched_data **prealloc)
{
struct gred_sched *table = qdisc_priv(sch);
struct gred_sched_data *q = table->tab[dp];
if (!q) {
table->tab[dp] = q = *prealloc;
*prealloc = NULL;
if (!q)
return -ENOMEM;
}
q->DP = dp;
q->prio = prio;
q->limit = ctl->limit;
if (q->backlog == 0)
red_end_of_idle_period(&q->vars);
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Plog,
ctl->Scell_log, stab, max_P);
red_set_vars(&q->vars);
return 0;
}
static const struct nla_policy gred_policy[TCA_GRED_MAX + 1] = {
[TCA_GRED_PARMS] = { .len = sizeof(struct tc_gred_qopt) },
[TCA_GRED_STAB] = { .len = 256 },
[TCA_GRED_DPS] = { .len = sizeof(struct tc_gred_sopt) },
[TCA_GRED_MAX_P] = { .type = NLA_U32 },
};
static int gred_change(struct Qdisc *sch, struct nlattr *opt)
{
struct gred_sched *table = qdisc_priv(sch);
struct tc_gred_qopt *ctl;
struct nlattr *tb[TCA_GRED_MAX + 1];
int err, prio = GRED_DEF_PRIO;
u8 *stab;
u32 max_P;
struct gred_sched_data *prealloc;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
if (err < 0)
return err;
if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL)
return gred_change_table_def(sch, opt);
if (tb[TCA_GRED_PARMS] == NULL ||
tb[TCA_GRED_STAB] == NULL)
return -EINVAL;
max_P = tb[TCA_GRED_MAX_P] ? nla_get_u32(tb[TCA_GRED_MAX_P]) : 0;
err = -EINVAL;
ctl = nla_data(tb[TCA_GRED_PARMS]);
stab = nla_data(tb[TCA_GRED_STAB]);
if (ctl->DP >= table->DPs)
goto errout;
if (gred_rio_mode(table)) {
if (ctl->prio == 0) {
int def_prio = GRED_DEF_PRIO;
if (table->tab[table->def])
def_prio = table->tab[table->def]->prio;
printk(KERN_DEBUG "GRED: DP %u does not have a prio "
"setting default to %d\n", ctl->DP, def_prio);
prio = def_prio;
} else
prio = ctl->prio;
}
prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
sch_tree_lock(sch);
err = gred_change_vq(sch, ctl->DP, ctl, prio, stab, max_P, &prealloc);
if (err < 0)
goto errout_locked;
if (gred_rio_mode(table)) {
gred_disable_wred_mode(table);
if (gred_wred_mode_check(sch))
gred_enable_wred_mode(table);
}
err = 0;
errout_locked:
sch_tree_unlock(sch);
kfree(prealloc);
errout:
return err;
}
static int gred_init(struct Qdisc *sch, struct nlattr *opt)
{
struct nlattr *tb[TCA_GRED_MAX + 1];
int err;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy);
if (err < 0)
return err;
if (tb[TCA_GRED_PARMS] || tb[TCA_GRED_STAB])
return -EINVAL;
return gred_change_table_def(sch, tb[TCA_GRED_DPS]);
}
static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct gred_sched *table = qdisc_priv(sch);
struct nlattr *parms, *opts = NULL;
int i;
u32 max_p[MAX_DPs];
struct tc_gred_sopt sopt = {
.DPs = table->DPs,
.def_DP = table->def,
.grio = gred_rio_mode(table),
.flags = table->red_flags,
};
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_GRED_DPS, sizeof(sopt), &sopt))
goto nla_put_failure;
for (i = 0; i < MAX_DPs; i++) {
struct gred_sched_data *q = table->tab[i];
max_p[i] = q ? q->parms.max_P : 0;
}
if (nla_put(skb, TCA_GRED_MAX_P, sizeof(max_p), max_p))
goto nla_put_failure;
parms = nla_nest_start(skb, TCA_GRED_PARMS);
if (parms == NULL)
goto nla_put_failure;
for (i = 0; i < MAX_DPs; i++) {
struct gred_sched_data *q = table->tab[i];
struct tc_gred_qopt opt;
unsigned long qavg;
memset(&opt, 0, sizeof(opt));
if (!q) {
/* hack -- fix at some point with a proper message.
This is how we indicate to tc that there is no VQ
at this DP. */
opt.DP = MAX_DPs + i;
goto append_opt;
}
opt.limit = q->limit;
opt.DP = q->DP;
opt.backlog = q->backlog;
opt.prio = q->prio;
opt.qth_min = q->parms.qth_min >> q->parms.Wlog;
opt.qth_max = q->parms.qth_max >> q->parms.Wlog;
opt.Wlog = q->parms.Wlog;
opt.Plog = q->parms.Plog;
opt.Scell_log = q->parms.Scell_log;
opt.other = q->stats.other;
opt.early = q->stats.prob_drop;
opt.forced = q->stats.forced_drop;
opt.pdrop = q->stats.pdrop;
opt.packets = q->packetsin;
opt.bytesin = q->bytesin;
if (gred_wred_mode(table))
gred_load_wred_set(table, q);
qavg = red_calc_qavg(&q->parms, &q->vars,
q->vars.qavg >> q->parms.Wlog);
opt.qave = qavg >> q->parms.Wlog;
append_opt:
if (nla_append(skb, sizeof(opt), &opt) < 0)
goto nla_put_failure;
}
nla_nest_end(skb, parms);
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static void gred_destroy(struct Qdisc *sch)
{
struct gred_sched *table = qdisc_priv(sch);
int i;
for (i = 0; i < table->DPs; i++) {
if (table->tab[i])
gred_destroy_vq(table->tab[i]);
}
}
static struct Qdisc_ops gred_qdisc_ops __read_mostly = {
.id = "gred",
.priv_size = sizeof(struct gred_sched),
.enqueue = gred_enqueue,
.dequeue = gred_dequeue,
.peek = qdisc_peek_head,
.drop = gred_drop,
.init = gred_init,
.reset = gred_reset,
.destroy = gred_destroy,
.change = gred_change,
.dump = gred_dump,
.owner = THIS_MODULE,
};
static int __init gred_module_init(void)
{
return register_qdisc(&gred_qdisc_ops);
}
static void __exit gred_module_exit(void)
{
unregister_qdisc(&gred_qdisc_ops);
}
module_init(gred_module_init)
module_exit(gred_module_exit)
MODULE_LICENSE("GPL");

1754
net/sched/sch_hfsc.c Normal file

File diff suppressed because it is too large Load diff

740
net/sched/sch_hhf.c Normal file
View file

@ -0,0 +1,740 @@
/* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF)
*
* Copyright (C) 2013 Terry Lam <vtlam@google.com>
* Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com>
*/
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/flow_keys.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
/* Heavy-Hitter Filter (HHF)
*
* Principles :
* Flows are classified into two buckets: non-heavy-hitter and heavy-hitter
* buckets. Initially, a new flow starts as non-heavy-hitter. Once classified
* as heavy-hitter, it is immediately switched to the heavy-hitter bucket.
* The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler,
* in which the heavy-hitter bucket is served with less weight.
* In other words, non-heavy-hitters (e.g., short bursts of critical traffic)
* are isolated from heavy-hitters (e.g., persistent bulk traffic) and also get
* a higher share of bandwidth.
*
* To capture heavy-hitters, we use the "multi-stage filter" algorithm in the
* following paper:
* [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and
* Accounting", in ACM SIGCOMM, 2002.
*
* Conceptually, a multi-stage filter comprises k independent hash functions
* and k counter arrays. Packets are indexed into k counter arrays by k hash
* functions, respectively. The counters are then increased by the packet sizes.
* Therefore,
* - For a heavy-hitter flow: *all* of its k array counters must be large.
* - For a non-heavy-hitter flow: some of its k array counters can be large
* due to hash collision with other small flows; however, with high
* probability, not *all* k counters are large.
*
* By the design of the multi-stage filter algorithm, the false negative rate
* (heavy-hitters getting away uncaptured) is zero. However, the algorithm is
* susceptible to false positives (non-heavy-hitters mistakenly classified as
* heavy-hitters).
* Therefore, we also implement the following optimizations to reduce false
* positives by avoiding unnecessary increment of the counter values:
* - Optimization O1: once a heavy-hitter is identified, its bytes are not
* accounted in the array counters. This technique is called "shielding"
* in Section 3.3.1 of [EV02].
* - Optimization O2: conservative update of counters
* (Section 3.3.2 of [EV02]),
* New counter value = max {old counter value,
* smallest counter value + packet bytes}
*
* Finally, we refresh the counters periodically since otherwise the counter
* values will keep accumulating.
*
* Once a flow is classified as heavy-hitter, we also save its per-flow state
* in an exact-matching flow table so that its subsequent packets can be
* dispatched to the heavy-hitter bucket accordingly.
*
*
* At a high level, this qdisc works as follows:
* Given a packet p:
* - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching
* heavy-hitter flow table, denoted table T, then send p to the heavy-hitter
* bucket.
* - Otherwise, forward p to the multi-stage filter, denoted filter F
* + If F decides that p belongs to a non-heavy-hitter flow, then send p
* to the non-heavy-hitter bucket.
* + Otherwise, if F decides that p belongs to a new heavy-hitter flow,
* then set up a new flow entry for the flow-id of p in the table T and
* send p to the heavy-hitter bucket.
*
* In this implementation:
* - T is a fixed-size hash-table with 1024 entries. Hash collision is
* resolved by linked-list chaining.
* - F has four counter arrays, each array containing 1024 32-bit counters.
* That means 4 * 1024 * 32 bits = 16KB of memory.
* - Since each array in F contains 1024 counters, 10 bits are sufficient to
* index into each array.
* Hence, instead of having four hash functions, we chop the 32-bit
* skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is
* computed as XOR sum of those three chunks.
* - We need to clear the counter arrays periodically; however, directly
* memsetting 16KB of memory can lead to cache eviction and unwanted delay.
* So by representing each counter by a valid bit, we only need to reset
* 4K 1-bit flags (i.e., 512 bytes) instead of 16KB of memory.
* - The Deficit Round Robin engine is taken from fq_codel implementation
* (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to
* fq_codel_flow in fq_codel implementation.
*
*/
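/*
 * Worked example of the hash chopping described above (performed in
 * hhf_classify() below): for skb-hash 0x12345678 the three low 10-bit
 * chunks are 0x278, 0x115 and 0x123, and the fourth index is their XOR
 * folded with the leftover top bits: 0x278 ^ 0x115 ^ 0x123 ^ 0x0 = 0x24e.
 * The packet is then counted at those four positions, one per array.
 */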
/* Non-configurable parameters */
#define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */
#define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */
#define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */
#define HHF_BIT_MASK_LEN 10 /* masking 10 bits */
#define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */
#define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */
enum wdrr_bucket_idx {
WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */
WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */
};
#define hhf_time_before(a, b) \
(typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0))
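/*
 * Example: right after a jiffies wrap, a = 0xfffffff0 and b = 0x10 give
 * (s32)(a - b) = -32 < 0, so hhf_time_before(a, b) is true even though
 * a > b as unsigned values.
 */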
/* Heavy-hitter per-flow state */
struct hh_flow_state {
u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */
u32 hit_timestamp; /* last time heavy-hitter was seen */
struct list_head flowchain; /* chaining under hash collision */
};
/* Weighted Deficit Round Robin (WDRR) scheduler */
struct wdrr_bucket {
struct sk_buff *head;
struct sk_buff *tail;
struct list_head bucketchain;
int deficit;
};
struct hhf_sched_data {
struct wdrr_bucket buckets[WDRR_BUCKET_CNT];
u32 perturbation; /* hash perturbation */
u32 quantum; /* psched_mtu(qdisc_dev(sch)); */
u32 drop_overlimit; /* number of times max qdisc packet
* limit was hit
*/
struct list_head *hh_flows; /* table T (currently active HHs) */
u32 hh_flows_limit; /* max active HH allocs */
u32 hh_flows_overlimit; /* num of disallowed HH allocs */
u32 hh_flows_total_cnt; /* total admitted HHs */
u32 hh_flows_current_cnt; /* total current HHs */
u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */
u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays
* was reset
*/
unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits
* of hhf_arrays
*/
/* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */
struct list_head new_buckets; /* list of new buckets */
struct list_head old_buckets; /* list of old buckets */
/* Configurable HHF parameters */
u32 hhf_reset_timeout; /* interval to reset counter
* arrays in filter F
* (default 40ms)
*/
u32 hhf_admit_bytes; /* counter thresh to classify as
* HH (default 128KB).
* With these default values,
* 128KB / 40ms = 25 Mbps
* i.e., we expect to capture HHs
* sending > 25 Mbps.
*/
u32 hhf_evict_timeout; /* aging threshold to evict idle
* HHs out of table T. This should
* be large enough to avoid
* reordering during HH eviction.
* (default 1s)
*/
u32 hhf_non_hh_weight; /* WDRR weight for non-HHs
* (default 2,
* i.e., non-HH : HH = 2 : 1)
*/
};
static u32 hhf_time_stamp(void)
{
return jiffies;
}
static unsigned int skb_hash(const struct hhf_sched_data *q,
const struct sk_buff *skb)
{
struct flow_keys keys;
unsigned int hash;
if (skb->sk && skb->sk->sk_hash)
return skb->sk->sk_hash;
skb_flow_dissect(skb, &keys);
hash = jhash_3words((__force u32)keys.dst,
(__force u32)keys.src ^ keys.ip_proto,
(__force u32)keys.ports, q->perturbation);
return hash;
}
/* Looks up a heavy-hitter flow in a chaining list of table T. */
static struct hh_flow_state *seek_list(const u32 hash,
struct list_head *head,
struct hhf_sched_data *q)
{
struct hh_flow_state *flow, *next;
u32 now = hhf_time_stamp();
if (list_empty(head))
return NULL;
list_for_each_entry_safe(flow, next, head, flowchain) {
u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
if (hhf_time_before(prev, now)) {
/* Delete expired heavy-hitters, but preserve one entry
* to avoid a kzalloc() the next time this slot is hit.
*/
if (list_is_last(&flow->flowchain, head))
return NULL;
list_del(&flow->flowchain);
kfree(flow);
q->hh_flows_current_cnt--;
} else if (flow->hash_id == hash) {
return flow;
}
}
return NULL;
}
/* Returns a flow state entry for a new heavy-hitter. Either reuses an expired
* entry or dynamically allocates a new one.
*/
static struct hh_flow_state *alloc_new_hh(struct list_head *head,
struct hhf_sched_data *q)
{
struct hh_flow_state *flow;
u32 now = hhf_time_stamp();
if (!list_empty(head)) {
/* Find an expired heavy-hitter flow entry. */
list_for_each_entry(flow, head, flowchain) {
u32 prev = flow->hit_timestamp + q->hhf_evict_timeout;
if (hhf_time_before(prev, now))
return flow;
}
}
if (q->hh_flows_current_cnt >= q->hh_flows_limit) {
q->hh_flows_overlimit++;
return NULL;
}
/* Create new entry. */
flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC);
if (!flow)
return NULL;
q->hh_flows_current_cnt++;
INIT_LIST_HEAD(&flow->flowchain);
list_add_tail(&flow->flowchain, head);
return flow;
}
/* Assigns packets to WDRR buckets. Implements a multi-stage filter to
* classify heavy-hitters.
*/
static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch)
{
struct hhf_sched_data *q = qdisc_priv(sch);
u32 tmp_hash, hash;
u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos;
struct hh_flow_state *flow;
u32 pkt_len, min_hhf_val;
int i;
u32 prev;
u32 now = hhf_time_stamp();
/* Reset the HHF counter arrays if this is the right time. */
prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout;
if (hhf_time_before(prev, now)) {
for (i = 0; i < HHF_ARRAYS_CNT; i++)
bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN);
q->hhf_arrays_reset_timestamp = now;
}
/* Get hashed flow-id of the skb. */
hash = skb_hash(q, skb);
/* Check if this packet belongs to an already established HH flow. */
flow_pos = hash & HHF_BIT_MASK;
flow = seek_list(hash, &q->hh_flows[flow_pos], q);
if (flow) { /* found its HH flow */
flow->hit_timestamp = now;
return WDRR_BUCKET_FOR_HH;
}
/* Now pass the packet through the multi-stage filter. */
tmp_hash = hash;
xorsum = 0;
for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) {
/* Split the skb_hash into three 10-bit chunks. */
filter_pos[i] = tmp_hash & HHF_BIT_MASK;
xorsum ^= filter_pos[i];
tmp_hash >>= HHF_BIT_MASK_LEN;
}
/* The last chunk is computed as XOR sum of other chunks. */
filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash;
pkt_len = qdisc_pkt_len(skb);
min_hhf_val = ~0U;
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
u32 val;
if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) {
q->hhf_arrays[i][filter_pos[i]] = 0;
__set_bit(filter_pos[i], q->hhf_valid_bits[i]);
}
val = q->hhf_arrays[i][filter_pos[i]] + pkt_len;
if (min_hhf_val > val)
min_hhf_val = val;
}
/* Found a new HH iff all counter values > HH admit threshold. */
if (min_hhf_val > q->hhf_admit_bytes) {
/* Just captured a new heavy-hitter. */
flow = alloc_new_hh(&q->hh_flows[flow_pos], q);
if (!flow) /* memory alloc problem */
return WDRR_BUCKET_FOR_NON_HH;
flow->hash_id = hash;
flow->hit_timestamp = now;
q->hh_flows_total_cnt++;
/* By returning without updating counters in q->hhf_arrays,
* we implicitly implement "shielding" (see Optimization O1).
*/
return WDRR_BUCKET_FOR_HH;
}
/* Conservative update of HHF arrays (see Optimization O2). */
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val)
q->hhf_arrays[i][filter_pos[i]] = min_hhf_val;
}
return WDRR_BUCKET_FOR_NON_HH;
}
/* Removes one skb from head of bucket. */
static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket)
{
struct sk_buff *skb = bucket->head;
bucket->head = skb->next;
skb->next = NULL;
return skb;
}
/* Tail-adds skb to bucket. */
static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb)
{
if (bucket->head == NULL)
bucket->head = skb;
else
bucket->tail->next = skb;
bucket->tail = skb;
skb->next = NULL;
}
static unsigned int hhf_drop(struct Qdisc *sch)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct wdrr_bucket *bucket;
/* Always try to drop from heavy-hitters first. */
bucket = &q->buckets[WDRR_BUCKET_FOR_HH];
if (!bucket->head)
bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH];
if (bucket->head) {
struct sk_buff *skb = dequeue_head(bucket);
sch->q.qlen--;
qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
kfree_skb(skb);
}
/* Return id of the bucket from which the packet was dropped. */
return bucket - q->buckets;
}
static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct hhf_sched_data *q = qdisc_priv(sch);
enum wdrr_bucket_idx idx;
struct wdrr_bucket *bucket;
idx = hhf_classify(skb, sch);
bucket = &q->buckets[idx];
bucket_add(bucket, skb);
qdisc_qstats_backlog_inc(sch, skb);
if (list_empty(&bucket->bucketchain)) {
unsigned int weight;
/* The logic of new_buckets vs. old_buckets is the same as
* new_flows vs. old_flows in the implementation of fq_codel,
* i.e., short bursts of non-HHs should have strict priority.
*/
if (idx == WDRR_BUCKET_FOR_HH) {
/* Always move heavy-hitters to old bucket. */
weight = 1;
list_add_tail(&bucket->bucketchain, &q->old_buckets);
} else {
weight = q->hhf_non_hh_weight;
list_add_tail(&bucket->bucketchain, &q->new_buckets);
}
bucket->deficit = weight * q->quantum;
}
if (++sch->q.qlen <= sch->limit)
return NET_XMIT_SUCCESS;
q->drop_overlimit++;
/* Return Congestion Notification only if we dropped a packet from this
* bucket.
*/
if (hhf_drop(sch) == idx)
return NET_XMIT_CN;
/* Since we dropped a packet, better let the upper stack know. */
qdisc_tree_decrease_qlen(sch, 1);
return NET_XMIT_SUCCESS;
}
static struct sk_buff *hhf_dequeue(struct Qdisc *sch)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb = NULL;
struct wdrr_bucket *bucket;
struct list_head *head;
begin:
head = &q->new_buckets;
if (list_empty(head)) {
head = &q->old_buckets;
if (list_empty(head))
return NULL;
}
bucket = list_first_entry(head, struct wdrr_bucket, bucketchain);
if (bucket->deficit <= 0) {
int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ?
1 : q->hhf_non_hh_weight;
bucket->deficit += weight * q->quantum;
list_move_tail(&bucket->bucketchain, &q->old_buckets);
goto begin;
}
if (bucket->head) {
skb = dequeue_head(bucket);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
}
if (!skb) {
/* Force a pass through old_buckets to prevent starvation. */
if ((head == &q->new_buckets) && !list_empty(&q->old_buckets))
list_move_tail(&bucket->bucketchain, &q->old_buckets);
else
list_del_init(&bucket->bucketchain);
goto begin;
}
qdisc_bstats_update(sch, skb);
bucket->deficit -= qdisc_pkt_len(skb);
return skb;
}
static void hhf_reset(struct Qdisc *sch)
{
struct sk_buff *skb;
while ((skb = hhf_dequeue(sch)) != NULL)
kfree_skb(skb);
}
static void *hhf_zalloc(size_t sz)
{
void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN);
if (!ptr)
ptr = vzalloc(sz);
return ptr;
}
static void hhf_free(void *addr)
{
kvfree(addr);
}
static void hhf_destroy(struct Qdisc *sch)
{
int i;
struct hhf_sched_data *q = qdisc_priv(sch);
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
hhf_free(q->hhf_arrays[i]);
hhf_free(q->hhf_valid_bits[i]);
}
for (i = 0; i < HH_FLOWS_CNT; i++) {
struct hh_flow_state *flow, *next;
struct list_head *head = &q->hh_flows[i];
if (list_empty(head))
continue;
list_for_each_entry_safe(flow, next, head, flowchain) {
list_del(&flow->flowchain);
kfree(flow);
}
}
hhf_free(q->hh_flows);
}
static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = {
[TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 },
[TCA_HHF_QUANTUM] = { .type = NLA_U32 },
[TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 },
[TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 },
[TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 },
[TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 },
[TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 },
};
static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_HHF_MAX + 1];
unsigned int qlen;
int err;
u64 non_hh_quantum;
u32 new_quantum = q->quantum;
u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy);
if (err < 0)
return err;
if (tb[TCA_HHF_QUANTUM])
new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]);
if (tb[TCA_HHF_NON_HH_WEIGHT])
new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]);
non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight;
if (non_hh_quantum > INT_MAX)
return -EINVAL;
sch_tree_lock(sch);
if (tb[TCA_HHF_BACKLOG_LIMIT])
sch->limit = nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT]);
q->quantum = new_quantum;
q->hhf_non_hh_weight = new_hhf_non_hh_weight;
if (tb[TCA_HHF_HH_FLOWS_LIMIT])
q->hh_flows_limit = nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT]);
if (tb[TCA_HHF_RESET_TIMEOUT]) {
u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]);
q->hhf_reset_timeout = usecs_to_jiffies(us);
}
if (tb[TCA_HHF_ADMIT_BYTES])
q->hhf_admit_bytes = nla_get_u32(tb[TCA_HHF_ADMIT_BYTES]);
if (tb[TCA_HHF_EVICT_TIMEOUT]) {
u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]);
q->hhf_evict_timeout = usecs_to_jiffies(us);
}
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = hhf_dequeue(sch);
kfree_skb(skb);
}
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
sch_tree_unlock(sch);
return 0;
}
static int hhf_init(struct Qdisc *sch, struct nlattr *opt)
{
struct hhf_sched_data *q = qdisc_priv(sch);
int i;
sch->limit = 1000;
q->quantum = psched_mtu(qdisc_dev(sch));
q->perturbation = prandom_u32();
INIT_LIST_HEAD(&q->new_buckets);
INIT_LIST_HEAD(&q->old_buckets);
/* Configurable HHF parameters */
q->hhf_reset_timeout = HZ / 25; /* 40 ms */
q->hhf_admit_bytes = 131072; /* 128 KB */
q->hhf_evict_timeout = HZ; /* 1 sec */
q->hhf_non_hh_weight = 2;
if (opt) {
int err = hhf_change(sch, opt);
if (err)
return err;
}
if (!q->hh_flows) {
/* Initialize heavy-hitter flow table. */
q->hh_flows = hhf_zalloc(HH_FLOWS_CNT *
sizeof(struct list_head));
if (!q->hh_flows)
return -ENOMEM;
for (i = 0; i < HH_FLOWS_CNT; i++)
INIT_LIST_HEAD(&q->hh_flows[i]);
/* Cap max active HHs at twice len of hh_flows table. */
q->hh_flows_limit = 2 * HH_FLOWS_CNT;
q->hh_flows_overlimit = 0;
q->hh_flows_total_cnt = 0;
q->hh_flows_current_cnt = 0;
/* Initialize heavy-hitter filter arrays. */
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN *
sizeof(u32));
if (!q->hhf_arrays[i]) {
hhf_destroy(sch);
return -ENOMEM;
}
}
q->hhf_arrays_reset_timestamp = hhf_time_stamp();
/* Initialize valid bits of heavy-hitter filter arrays. */
for (i = 0; i < HHF_ARRAYS_CNT; i++) {
q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN /
BITS_PER_BYTE);
if (!q->hhf_valid_bits[i]) {
hhf_destroy(sch);
return -ENOMEM;
}
}
/* Initialize Weighted DRR buckets. */
for (i = 0; i < WDRR_BUCKET_CNT; i++) {
struct wdrr_bucket *bucket = q->buckets + i;
INIT_LIST_HEAD(&bucket->bucketchain);
}
}
return 0;
}
static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, sch->limit) ||
nla_put_u32(skb, TCA_HHF_QUANTUM, q->quantum) ||
nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, q->hh_flows_limit) ||
nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT,
jiffies_to_usecs(q->hhf_reset_timeout)) ||
nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, q->hhf_admit_bytes) ||
nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT,
jiffies_to_usecs(q->hhf_evict_timeout)) ||
nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, q->hhf_non_hh_weight))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
return -1;
}
static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct tc_hhf_xstats st = {
.drop_overlimit = q->drop_overlimit,
.hh_overlimit = q->hh_flows_overlimit,
.hh_tot_count = q->hh_flows_total_cnt,
.hh_cur_count = q->hh_flows_current_cnt,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
.id = "hhf",
.priv_size = sizeof(struct hhf_sched_data),
.enqueue = hhf_enqueue,
.dequeue = hhf_dequeue,
.peek = qdisc_peek_dequeued,
.drop = hhf_drop,
.init = hhf_init,
.reset = hhf_reset,
.destroy = hhf_destroy,
.change = hhf_change,
.dump = hhf_dump,
.dump_stats = hhf_dump_stats,
.owner = THIS_MODULE,
};
static int __init hhf_module_init(void)
{
return register_qdisc(&hhf_qdisc_ops);
}
static void __exit hhf_module_exit(void)
{
unregister_qdisc(&hhf_qdisc_ops);
}
module_init(hhf_module_init)
module_exit(hhf_module_exit)
MODULE_AUTHOR("Terry Lam");
MODULE_AUTHOR("Nandita Dukkipati");
MODULE_LICENSE("GPL");

1630
net/sched/sch_htb.c Normal file

File diff suppressed because it is too large

144
net/sched/sch_ingress.c Normal file

@@ -0,0 +1,144 @@
/* net/sched/sch_ingress.c - Ingress qdisc
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Jamal Hadi Salim 1999
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
struct ingress_qdisc_data {
struct tcf_proto __rcu *filter_list;
};
/* ------------------------- Class/flow operations ------------------------- */
static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
}
static unsigned long ingress_get(struct Qdisc *sch, u32 classid)
{
return TC_H_MIN(classid) + 1;
}
static unsigned long ingress_bind_filter(struct Qdisc *sch,
unsigned long parent, u32 classid)
{
return ingress_get(sch, classid);
}
static void ingress_put(struct Qdisc *sch, unsigned long cl)
{
}
static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
}
static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct ingress_qdisc_data *p = qdisc_priv(sch);
return &p->filter_list;
}
/* --------------------------- Qdisc operations ---------------------------- */
static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct ingress_qdisc_data *p = qdisc_priv(sch);
struct tcf_result res;
struct tcf_proto *fl = rcu_dereference_bh(p->filter_list);
int result;
result = tc_classify(skb, fl, &res);
qdisc_bstats_update(sch, skb);
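/* Collapse the classifier verdict into the subset of codes the
 * ingress hook caller understands: drop, stolen, or ok.
 */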
switch (result) {
case TC_ACT_SHOT:
result = TC_ACT_SHOT;
qdisc_qstats_drop(sch);
break;
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
result = TC_ACT_STOLEN;
break;
case TC_ACT_RECLASSIFY:
case TC_ACT_OK:
skb->tc_index = TC_H_MIN(res.classid);
default:
result = TC_ACT_OK;
break;
}
return result;
}
/* ------------------------------------------------------------- */
static void ingress_destroy(struct Qdisc *sch)
{
struct ingress_qdisc_data *p = qdisc_priv(sch);
tcf_destroy_chain(&p->filter_list);
}
static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct nlattr *nest;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static const struct Qdisc_class_ops ingress_class_ops = {
.leaf = ingress_leaf,
.get = ingress_get,
.put = ingress_put,
.walk = ingress_walk,
.tcf_chain = ingress_find_tcf,
.bind_tcf = ingress_bind_filter,
.unbind_tcf = ingress_put,
};
static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.cl_ops = &ingress_class_ops,
.id = "ingress",
.priv_size = sizeof(struct ingress_qdisc_data),
.enqueue = ingress_enqueue,
.destroy = ingress_destroy,
.dump = ingress_dump,
.owner = THIS_MODULE,
};
static int __init ingress_module_init(void)
{
return register_qdisc(&ingress_qdisc_ops);
}
static void __exit ingress_module_exit(void)
{
unregister_qdisc(&ingress_qdisc_ops);
}
module_init(ingress_module_init)
module_exit(ingress_module_exit)
MODULE_LICENSE("GPL");

246
net/sched/sch_mq.c Normal file

@@ -0,0 +1,246 @@
/*
* net/sched/sch_mq.c Classful multiqueue dummy scheduler
*
* Copyright (c) 2009 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
struct mq_sched {
struct Qdisc **qdiscs;
};
static void mq_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
qdisc_destroy(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
static int mq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
unsigned int ntx;
if (sch->parent != TC_H_ROOT)
return -EOPNOTSUPP;
if (!netif_is_multiqueue(dev))
return -EOPNOTSUPP;
/* pre-allocate qdiscs, attachment can't fail */
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
GFP_KERNEL);
if (priv->qdiscs == NULL)
return -ENOMEM;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
priv->qdiscs[ntx] = qdisc;
qdisc->flags |= TCQ_F_ONETXQUEUE;
}
sch->flags |= TCQ_F_MQROOT;
return 0;
err:
mq_destroy(sch);
return -ENOMEM;
}
static void mq_attach(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
struct Qdisc *qdisc, *old;
unsigned int ntx;
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
qdisc_destroy(old);
#ifdef CONFIG_NET_SCHED
if (ntx < dev->real_num_tx_queues)
qdisc_list_add(qdisc);
#endif
}
kfree(priv->qdiscs);
priv->qdiscs = NULL;
}
static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
struct Qdisc *qdisc;
unsigned int ntx;
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
sch->bstats.packets += qdisc->bstats.packets;
sch->qstats.backlog += qdisc->qstats.backlog;
sch->qstats.drops += qdisc->qstats.drops;
sch->qstats.requeues += qdisc->qstats.requeues;
sch->qstats.overlimits += qdisc->qstats.overlimits;
spin_unlock_bh(qdisc_lock(qdisc));
}
return 0;
}
static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx = cl - 1;
if (ntx >= dev->num_tx_queues)
return NULL;
return netdev_get_tx_queue(dev, ntx);
}
static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
struct tcmsg *tcm)
{
unsigned int ntx = TC_H_MIN(tcm->tcm_parent);
struct netdev_queue *dev_queue = mq_queue_get(sch, ntx);
if (!dev_queue) {
struct net_device *dev = qdisc_dev(sch);
return netdev_get_tx_queue(dev, 0);
}
return dev_queue;
}
static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
struct net_device *dev = qdisc_dev(sch);
if (dev->flags & IFF_UP)
dev_deactivate(dev);
*old = dev_graft_qdisc(dev_queue, new);
if (new)
new->flags |= TCQ_F_ONETXQUEUE;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
}
static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
return dev_queue->qdisc_sleeping;
}
static unsigned long mq_get(struct Qdisc *sch, u32 classid)
{
unsigned int ntx = TC_H_MIN(classid);
if (!mq_queue_get(sch, ntx))
return 0;
return ntx;
}
static void mq_put(struct Qdisc *sch, unsigned long cl)
{
}
static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
return 0;
}
static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0)
return -1;
return 0;
}
static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct net_device *dev = qdisc_dev(sch);
unsigned int ntx;
if (arg->stop)
return;
arg->count = arg->skip;
for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) {
if (arg->fn(sch, ntx + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static const struct Qdisc_class_ops mq_class_ops = {
.select_queue = mq_select_queue,
.graft = mq_graft,
.leaf = mq_leaf,
.get = mq_get,
.put = mq_put,
.walk = mq_walk,
.dump = mq_dump_class,
.dump_stats = mq_dump_class_stats,
};
struct Qdisc_ops mq_qdisc_ops __read_mostly = {
.cl_ops = &mq_class_ops,
.id = "mq",
.priv_size = sizeof(struct mq_sched),
.init = mq_init,
.destroy = mq_destroy,
.attach = mq_attach,
.dump = mq_dump,
.owner = THIS_MODULE,
};
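/* Illustrative only, not part of this commit: mq exposes one class
 * per TX queue, and mq_get()/mq_queue_get() above treat the class
 * minor number as (queue index + 1). A hypothetical helper that
 * builds the class id for a given queue, matching the handles used
 * in mq_init(), would look like this:
 */
static inline u32 mq_txq_classid(struct Qdisc *sch, unsigned int ntx)
{
return TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1));
}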

428
net/sched/sch_mqprio.c Normal file

@@ -0,0 +1,428 @@
/*
* net/sched/sch_mqprio.c
*
* Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/module.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>
struct mqprio_sched {
struct Qdisc **qdiscs;
int hw_owned;
};
static void mqprio_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
unsigned int ntx;
if (priv->qdiscs) {
for (ntx = 0;
ntx < dev->num_tx_queues && priv->qdiscs[ntx];
ntx++)
qdisc_destroy(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
dev->netdev_ops->ndo_setup_tc(dev, 0);
else
netdev_set_num_tc(dev, 0);
}
static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
{
int i, j;
/* Verify num_tc is not out of max range */
if (qopt->num_tc > TC_MAX_QUEUE)
return -EINVAL;
/* Verify priority mapping uses valid tcs */
for (i = 0; i < TC_BITMASK + 1; i++) {
if (qopt->prio_tc_map[i] >= qopt->num_tc)
return -EINVAL;
}
/* net_device does not support requested operation */
if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
return -EINVAL;
/* If hw owned, qcount and qoffset are taken from the LLD, so
* there is no reason to verify them here.
*/
if (qopt->hw)
return 0;
for (i = 0; i < qopt->num_tc; i++) {
unsigned int last = qopt->offset[i] + qopt->count[i];
/* Verify the queue count is in tx range; being equal to
* real_num_tx_queues indicates the last queue is in use.
*/
if (qopt->offset[i] >= dev->real_num_tx_queues ||
!qopt->count[i] ||
last > dev->real_num_tx_queues)
return -EINVAL;
/* Verify that the offset and counts do not overlap */
for (j = i + 1; j < qopt->num_tc; j++) {
if (last > qopt->offset[j])
return -EINVAL;
}
}
return 0;
}
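/* A hypothetical configuration, shown for illustration only, that
 * satisfies the checks above on a device with four real TX queues:
 * two traffic classes, queues 0-1 serving tc0 and queues 2-3 serving
 * tc1, with priorities 0-3 mapped to tc0 and 4-15 to tc1.
 */
static const struct tc_mqprio_qopt example_qopt __maybe_unused = {
.num_tc = 2,
.prio_tc_map = { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
.hw = 0,
.count = { 2, 2 },
.offset = { 0, 2 },
};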
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
struct netdev_queue *dev_queue;
struct Qdisc *qdisc;
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
if (sch->parent != TC_H_ROOT)
return -EOPNOTSUPP;
if (!netif_is_multiqueue(dev))
return -EOPNOTSUPP;
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
qopt = nla_data(opt);
if (mqprio_parse_opt(dev, qopt))
return -EINVAL;
/* pre-allocate qdisc, attachment can't fail */
priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
GFP_KERNEL);
if (priv->qdiscs == NULL) {
err = -ENOMEM;
goto err;
}
for (i = 0; i < dev->num_tx_queues; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)));
if (qdisc == NULL) {
err = -ENOMEM;
goto err;
}
priv->qdiscs[i] = qdisc;
qdisc->flags |= TCQ_F_ONETXQUEUE;
}
/* If the mqprio options indicate that hardware should own
* the queue mapping, run ndo_setup_tc; otherwise use the
* supplied and verified mapping.
*/
if (qopt->hw) {
priv->hw_owned = 1;
err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
if (err)
goto err;
} else {
netdev_set_num_tc(dev, qopt->num_tc);
for (i = 0; i < qopt->num_tc; i++)
netdev_set_tc_queue(dev, i,
qopt->count[i], qopt->offset[i]);
}
/* Always use supplied priority mappings */
for (i = 0; i < TC_BITMASK + 1; i++)
netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
sch->flags |= TCQ_F_MQROOT;
return 0;
err:
mqprio_destroy(sch);
return err;
}
static void mqprio_attach(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
struct Qdisc *qdisc, *old;
unsigned int ntx;
/* Attach underlying qdisc */
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
qdisc = priv->qdiscs[ntx];
old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
if (old)
qdisc_destroy(old);
if (ntx < dev->real_num_tx_queues)
qdisc_list_add(qdisc);
}
kfree(priv->qdiscs);
priv->qdiscs = NULL;
}
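/* Class ids 1..num_tc denote the traffic classes themselves; ids
 * above num_tc denote individual TX queues, so the queue index is
 * cl - 1 - netdev_get_num_tc(dev).
 */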
static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
unsigned long cl)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
if (ntx >= dev->num_tx_queues)
return NULL;
return netdev_get_tx_queue(dev, ntx);
}
static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old)
{
struct net_device *dev = qdisc_dev(sch);
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
if (!dev_queue)
return -EINVAL;
if (dev->flags & IFF_UP)
dev_deactivate(dev);
*old = dev_graft_qdisc(dev_queue, new);
if (new)
new->flags |= TCQ_F_ONETXQUEUE;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
}
static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
unsigned int i;
sch->q.qlen = 0;
memset(&sch->bstats, 0, sizeof(sch->bstats));
memset(&sch->qstats, 0, sizeof(sch->qstats));
for (i = 0; i < dev->num_tx_queues; i++) {
qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
sch->q.qlen += qdisc->q.qlen;
sch->bstats.bytes += qdisc->bstats.bytes;
sch->bstats.packets += qdisc->bstats.packets;
sch->qstats.backlog += qdisc->qstats.backlog;
sch->qstats.drops += qdisc->qstats.drops;
sch->qstats.requeues += qdisc->qstats.requeues;
sch->qstats.overlimits += qdisc->qstats.overlimits;
spin_unlock_bh(qdisc_lock(qdisc));
}
opt.num_tc = netdev_get_num_tc(dev);
memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
opt.hw = priv->hw_owned;
for (i = 0; i < netdev_get_num_tc(dev); i++) {
opt.count[i] = dev->tc_to_txq[i].count;
opt.offset[i] = dev->tc_to_txq[i].offset;
}
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
if (!dev_queue)
return NULL;
return dev_queue->qdisc_sleeping;
}
static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
{
struct net_device *dev = qdisc_dev(sch);
unsigned int ntx = TC_H_MIN(classid);
if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
return 0;
return ntx;
}
static void mqprio_put(struct Qdisc *sch, unsigned long cl)
{
}
static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct net_device *dev = qdisc_dev(sch);
if (cl <= netdev_get_num_tc(dev)) {
tcm->tcm_parent = TC_H_ROOT;
tcm->tcm_info = 0;
} else {
int i;
struct netdev_queue *dev_queue;
dev_queue = mqprio_queue_get(sch, cl);
tcm->tcm_parent = 0;
for (i = 0; i < netdev_get_num_tc(dev); i++) {
struct netdev_tc_txq tc = dev->tc_to_txq[i];
int q_idx = cl - netdev_get_num_tc(dev);
if (q_idx > tc.offset &&
q_idx <= tc.offset + tc.count) {
tcm->tcm_parent =
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1));
break;
}
}
tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
}
tcm->tcm_handle |= TC_H_MIN(cl);
return 0;
}
static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
__releases(d->lock)
__acquires(d->lock)
{
struct net_device *dev = qdisc_dev(sch);
if (cl <= netdev_get_num_tc(dev)) {
int i;
__u32 qlen = 0;
struct Qdisc *qdisc;
struct gnet_stats_queue qstats = {0};
struct gnet_stats_basic_packed bstats = {0};
struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
/* Drop the lock here; it will be reclaimed before touching
* statistics. This is required because the d->lock we hold
* here is also the lock on dev_queue->qdisc_sleeping, which
* is acquired below.
*/
spin_unlock_bh(d->lock);
for (i = tc.offset; i < tc.offset + tc.count; i++) {
struct netdev_queue *q = netdev_get_tx_queue(dev, i);
qdisc = rtnl_dereference(q->qdisc);
spin_lock_bh(qdisc_lock(qdisc));
qlen += qdisc->q.qlen;
bstats.bytes += qdisc->bstats.bytes;
bstats.packets += qdisc->bstats.packets;
qstats.backlog += qdisc->qstats.backlog;
qstats.drops += qdisc->qstats.drops;
qstats.requeues += qdisc->qstats.requeues;
qstats.overlimits += qdisc->qstats.overlimits;
spin_unlock_bh(qdisc_lock(qdisc));
}
/* Reclaim root sleeping lock before completing stats */
spin_lock_bh(d->lock);
if (gnet_stats_copy_basic(d, NULL, &bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
return -1;
} else {
struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
sch = dev_queue->qdisc_sleeping;
if (gnet_stats_copy_basic(d, NULL, &sch->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL,
&sch->qstats, sch->q.qlen) < 0)
return -1;
}
return 0;
}
static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct net_device *dev = qdisc_dev(sch);
unsigned long ntx;
if (arg->stop)
return;
/* Walk hierarchy with a virtual class per tc */
arg->count = arg->skip;
for (ntx = arg->skip;
ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
ntx++) {
if (arg->fn(sch, ntx + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static const struct Qdisc_class_ops mqprio_class_ops = {
.graft = mqprio_graft,
.leaf = mqprio_leaf,
.get = mqprio_get,
.put = mqprio_put,
.walk = mqprio_walk,
.dump = mqprio_dump_class,
.dump_stats = mqprio_dump_class_stats,
};
static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
.cl_ops = &mqprio_class_ops,
.id = "mqprio",
.priv_size = sizeof(struct mqprio_sched),
.init = mqprio_init,
.destroy = mqprio_destroy,
.attach = mqprio_attach,
.dump = mqprio_dump,
.owner = THIS_MODULE,
};
static int __init mqprio_module_init(void)
{
return register_qdisc(&mqprio_qdisc_ops);
}
static void __exit mqprio_module_exit(void)
{
unregister_qdisc(&mqprio_qdisc_ops);
}
module_init(mqprio_module_init);
module_exit(mqprio_module_exit);
MODULE_LICENSE("GPL");

444
net/sched/sch_multiq.c Normal file

@@ -0,0 +1,444 @@
/*
* Copyright (c) 2008, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses/>.
*
* Author: Alexander Duyck <alexander.h.duyck@intel.com>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
struct multiq_sched_data {
u16 bands;
u16 max_bands;
u16 curband;
struct tcf_proto __rcu *filter_list;
struct Qdisc **queues;
};
static struct Qdisc *
multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
struct multiq_sched_data *q = qdisc_priv(sch);
u32 band;
struct tcf_result res;
struct tcf_proto *fl = rcu_dereference_bh(q->filter_list);
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
err = tc_classify(skb, fl, &res);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
return NULL;
}
#endif
band = skb_get_queue_mapping(skb);
if (band >= q->bands)
return q->queues[0];
return q->queues[band];
}
static int
multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct Qdisc *qdisc;
int ret;
qdisc = multiq_classify(skb, sch, &ret);
#ifdef CONFIG_NET_CLS_ACT
if (qdisc == NULL) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
#endif
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
return ret;
}
static struct sk_buff *multiq_dequeue(struct Qdisc *sch)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct Qdisc *qdisc;
struct sk_buff *skb;
int band;
for (band = 0; band < q->bands; band++) {
/* cycle through bands to ensure fairness */
q->curband++;
if (q->curband >= q->bands)
q->curband = 0;
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
if (!netif_xmit_stopped(
netdev_get_tx_queue(qdisc_dev(sch), q->curband))) {
qdisc = q->queues[q->curband];
skb = qdisc->dequeue(qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
}
}
return NULL;
}
static struct sk_buff *multiq_peek(struct Qdisc *sch)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned int curband = q->curband;
struct Qdisc *qdisc;
struct sk_buff *skb;
int band;
for (band = 0; band < q->bands; band++) {
/* cycle through bands to ensure fairness */
curband++;
if (curband >= q->bands)
curband = 0;
/* Check that target subqueue is available before
* pulling an skb to avoid head-of-line blocking.
*/
if (!netif_xmit_stopped(
netdev_get_tx_queue(qdisc_dev(sch), curband))) {
qdisc = q->queues[curband];
skb = qdisc->ops->peek(qdisc);
if (skb)
return skb;
}
}
return NULL;
}
static unsigned int multiq_drop(struct Qdisc *sch)
{
struct multiq_sched_data *q = qdisc_priv(sch);
int band;
unsigned int len;
struct Qdisc *qdisc;
for (band = q->bands - 1; band >= 0; band--) {
qdisc = q->queues[band];
if (qdisc->ops->drop) {
len = qdisc->ops->drop(qdisc);
if (len != 0) {
sch->q.qlen--;
return len;
}
}
}
return 0;
}
static void
multiq_reset(struct Qdisc *sch)
{
u16 band;
struct multiq_sched_data *q = qdisc_priv(sch);
for (band = 0; band < q->bands; band++)
qdisc_reset(q->queues[band]);
sch->q.qlen = 0;
q->curband = 0;
}
static void
multiq_destroy(struct Qdisc *sch)
{
int band;
struct multiq_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
for (band = 0; band < q->bands; band++)
qdisc_destroy(q->queues[band]);
kfree(q->queues);
}
static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct tc_multiq_qopt *qopt;
int i;
if (!netif_is_multiqueue(qdisc_dev(sch)))
return -EOPNOTSUPP;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
qopt = nla_data(opt);
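/* The band count always tracks the device's real TX queue count;
 * any value supplied by the user is overwritten here.
 */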
qopt->bands = qdisc_dev(sch)->real_num_tx_queues;
sch_tree_lock(sch);
q->bands = qopt->bands;
for (i = q->bands; i < q->max_bands; i++) {
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
qdisc_tree_decrease_qlen(child, child->q.qlen);
qdisc_destroy(child);
}
}
sch_tree_unlock(sch);
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle,
i + 1));
if (child) {
sch_tree_lock(sch);
old = q->queues[i];
q->queues[i] = child;
if (old != &noop_qdisc) {
qdisc_tree_decrease_qlen(old,
old->q.qlen);
qdisc_destroy(old);
}
sch_tree_unlock(sch);
}
}
}
return 0;
}
static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct multiq_sched_data *q = qdisc_priv(sch);
int i, err;
q->queues = NULL;
if (opt == NULL)
return -EINVAL;
q->max_bands = qdisc_dev(sch)->num_tx_queues;
q->queues = kcalloc(q->max_bands, sizeof(struct Qdisc *), GFP_KERNEL);
if (!q->queues)
return -ENOBUFS;
for (i = 0; i < q->max_bands; i++)
q->queues[i] = &noop_qdisc;
err = multiq_tune(sch, opt);
if (err)
kfree(q->queues);
return err;
}
static int multiq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_multiq_qopt opt;
opt.bands = q->bands;
opt.max_bands = q->max_bands;
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned long band = arg - 1;
if (new == NULL)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = q->queues[band];
q->queues[band] = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *
multiq_leaf(struct Qdisc *sch, unsigned long arg)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned long band = arg - 1;
return q->queues[band];
}
static unsigned long multiq_get(struct Qdisc *sch, u32 classid)
{
struct multiq_sched_data *q = qdisc_priv(sch);
unsigned long band = TC_H_MIN(classid);
if (band - 1 >= q->bands)
return 0;
return band;
}
static unsigned long multiq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
return multiq_get(sch, classid);
}
static void multiq_put(struct Qdisc *q, unsigned long cl)
{
}
static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct multiq_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = q->queues[cl - 1]->handle;
return 0;
}
static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct multiq_sched_data *q = qdisc_priv(sch);
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
return -1;
return 0;
}
static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct multiq_sched_data *q = qdisc_priv(sch);
int band;
if (arg->stop)
return;
for (band = 0; band < q->bands; band++) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, band + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static struct tcf_proto __rcu **multiq_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct multiq_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static const struct Qdisc_class_ops multiq_class_ops = {
.graft = multiq_graft,
.leaf = multiq_leaf,
.get = multiq_get,
.put = multiq_put,
.walk = multiq_walk,
.tcf_chain = multiq_find_tcf,
.bind_tcf = multiq_bind,
.unbind_tcf = multiq_put,
.dump = multiq_dump_class,
.dump_stats = multiq_dump_class_stats,
};
static struct Qdisc_ops multiq_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &multiq_class_ops,
.id = "multiq",
.priv_size = sizeof(struct multiq_sched_data),
.enqueue = multiq_enqueue,
.dequeue = multiq_dequeue,
.peek = multiq_peek,
.drop = multiq_drop,
.init = multiq_init,
.reset = multiq_reset,
.destroy = multiq_destroy,
.change = multiq_tune,
.dump = multiq_dump,
.owner = THIS_MODULE,
};
static int __init multiq_module_init(void)
{
return register_qdisc(&multiq_qdisc_ops);
}
static void __exit multiq_module_exit(void)
{
unregister_qdisc(&multiq_qdisc_ops);
}
module_init(multiq_module_init)
module_exit(multiq_module_exit)
MODULE_LICENSE("GPL");

1128
net/sched/sch_netem.c Normal file

File diff suppressed because it is too large

566
net/sched/sch_pie.c Normal file

@@ -0,0 +1,566 @@
/* Copyright (C) 2013 Cisco Systems, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* Author: Vijay Subramanian <vijaynsu@cisco.com>
* Author: Mythili Prabhu <mysuryan@cisco.com>
*
* ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
* University of Oslo, Norway.
*
* References:
* IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00
* IEEE Conference on High Performance Switching and Routing 2013 :
* "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem"
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#define QUEUE_THRESHOLD 10000
#define DQCOUNT_INVALID -1
#define MAX_PROB 0xffffffff
#define PIE_SCALE 8
/* parameters used */
struct pie_params {
psched_time_t target; /* user specified target delay in pschedtime */
u32 tupdate; /* update interval (in jiffies) */
u32 limit; /* number of packets that can be enqueued */
u32 alpha; /* alpha and beta are between 0 and 32 */
u32 beta; /* and are used for shift relative to 1 */
bool ecn; /* true if ecn is enabled */
bool bytemode; /* to scale drop early prob based on pkt size */
};
/* variables used */
struct pie_vars {
u32 prob; /* probability but scaled by u32 limit. */
psched_time_t burst_time;
psched_time_t qdelay;
psched_time_t qdelay_old;
u64 dq_count; /* measured in bytes */
psched_time_t dq_tstamp; /* drain rate */
u32 avg_dq_rate; /* bytes per pschedtime tick, scaled */
u32 qlen_old; /* in bytes */
};
/* statistics gathering */
struct pie_stats {
u32 packets_in; /* total number of packets enqueued */
u32 dropped; /* packets dropped due to pie_action */
u32 overlimit; /* dropped due to lack of space in queue */
u32 maxq; /* maximum queue size */
u32 ecn_mark; /* packets marked with ECN */
};
/* private data for the Qdisc */
struct pie_sched_data {
struct pie_params params;
struct pie_vars vars;
struct pie_stats stats;
struct timer_list adapt_timer;
};
static void pie_params_init(struct pie_params *params)
{
params->alpha = 2;
params->beta = 20;
params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */
params->limit = 1000; /* default of 1000 packets */
params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */
params->ecn = false;
params->bytemode = false;
}
static void pie_vars_init(struct pie_vars *vars)
{
vars->dq_count = DQCOUNT_INVALID;
vars->avg_dq_rate = 0;
/* default of 100 ms in pschedtime */
vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC);
}
static bool drop_early(struct Qdisc *sch, u32 packet_size)
{
struct pie_sched_data *q = qdisc_priv(sch);
u32 rnd;
u32 local_prob = q->vars.prob;
u32 mtu = psched_mtu(qdisc_dev(sch));
/* If there is still burst allowance left skip random early drop */
if (q->vars.burst_time > 0)
return false;
/* If current delay is less than half of target, and
* if drop prob is low already, disable early_drop
*/
if ((q->vars.qdelay < q->params.target / 2)
&& (q->vars.prob < MAX_PROB / 5))
return false;
/* If we have fewer than 2 mtu-sized packets, disable drop_early,
* similar to min_th in RED
*/
if (sch->qstats.backlog < 2 * mtu)
return false;
/* If bytemode is turned on, use packet size to compute new
* probability. Smaller packets will have lower drop prob in this case
*/
if (q->params.bytemode && packet_size <= mtu)
local_prob = (local_prob / mtu) * packet_size;
else
local_prob = q->vars.prob;
rnd = prandom_u32();
if (rnd < local_prob)
return true;
return false;
}
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
bool enqueue = false;
if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
q->stats.overlimit++;
goto out;
}
if (!drop_early(sch, skb->len)) {
enqueue = true;
} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
INET_ECN_set_ce(skb)) {
/* If packet is ecn capable, mark it if drop probability
* is lower than 10%, else drop it.
*/
q->stats.ecn_mark++;
enqueue = true;
}
/* we can enqueue the packet */
if (enqueue) {
q->stats.packets_in++;
if (qdisc_qlen(sch) > q->stats.maxq)
q->stats.maxq = qdisc_qlen(sch);
return qdisc_enqueue_tail(skb, sch);
}
out:
q->stats.dropped++;
return qdisc_drop(skb, sch);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
[TCA_PIE_TARGET] = {.type = NLA_U32},
[TCA_PIE_LIMIT] = {.type = NLA_U32},
[TCA_PIE_TUPDATE] = {.type = NLA_U32},
[TCA_PIE_ALPHA] = {.type = NLA_U32},
[TCA_PIE_BETA] = {.type = NLA_U32},
[TCA_PIE_ECN] = {.type = NLA_U32},
[TCA_PIE_BYTEMODE] = {.type = NLA_U32},
};
static int pie_change(struct Qdisc *sch, struct nlattr *opt)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_PIE_MAX + 1];
unsigned int qlen;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy);
if (err < 0)
return err;
sch_tree_lock(sch);
/* convert from microseconds to pschedtime */
if (tb[TCA_PIE_TARGET]) {
/* target is in us */
u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);
/* convert to pschedtime */
q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
}
/* tupdate is in jiffies */
if (tb[TCA_PIE_TUPDATE])
q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));
if (tb[TCA_PIE_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);
q->params.limit = limit;
sch->limit = limit;
}
if (tb[TCA_PIE_ALPHA])
q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);
if (tb[TCA_PIE_BETA])
q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);
if (tb[TCA_PIE_ECN])
q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);
if (tb[TCA_PIE_BYTEMODE])
q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);
/* Drop excess packets if new limit is lower */
qlen = sch->q.qlen;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = __skb_dequeue(&sch->q);
qdisc_qstats_backlog_dec(sch, skb);
qdisc_drop(skb, sch);
}
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
sch_tree_unlock(sch);
return 0;
}
static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb)
{
struct pie_sched_data *q = qdisc_priv(sch);
int qlen = sch->qstats.backlog; /* current queue size in bytes */
/* If current queue is about 10 packets or more and dq_count is unset
* we have enough packets to calculate the drain rate. Save
* current time as dq_tstamp and start measurement cycle.
*/
if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) {
q->vars.dq_tstamp = psched_get_time();
q->vars.dq_count = 0;
}
/* Calculate the average drain rate from this value. If queue length
* has receded to a small value, viz. <= QUEUE_THRESHOLD bytes, reset
* the dq_count to -1 as we don't have enough packets to calculate the
* drain rate anymore. The following if block is entered only when we
* have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
* and we calculate the drain rate for the threshold here. dq_count is
* in bytes, time difference in psched_time, hence rate is in
* bytes/psched_time.
*/
if (q->vars.dq_count != DQCOUNT_INVALID) {
q->vars.dq_count += skb->len;
if (q->vars.dq_count >= QUEUE_THRESHOLD) {
psched_time_t now = psched_get_time();
u32 dtime = now - q->vars.dq_tstamp;
u32 count = q->vars.dq_count << PIE_SCALE;
if (dtime == 0)
return;
count = count / dtime;
if (q->vars.avg_dq_rate == 0)
q->vars.avg_dq_rate = count;
else
q->vars.avg_dq_rate =
(q->vars.avg_dq_rate -
(q->vars.avg_dq_rate >> 3)) + (count >> 3);
/* If the queue has receded below the threshold, we hold
* on to the last drain rate calculated, else we reset
* dq_count to 0 to re-enter the if block when the next
* packet is dequeued
*/
if (qlen < QUEUE_THRESHOLD)
q->vars.dq_count = DQCOUNT_INVALID;
else {
q->vars.dq_count = 0;
q->vars.dq_tstamp = psched_get_time();
}
if (q->vars.burst_time > 0) {
if (q->vars.burst_time > dtime)
q->vars.burst_time -= dtime;
else
q->vars.burst_time = 0;
}
}
}
}
static void calculate_probability(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
u32 qlen = sch->qstats.backlog; /* queue size in bytes */
psched_time_t qdelay = 0; /* in pschedtime */
psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */
s32 delta = 0; /* determines the change in probability */
u32 oldprob;
u32 alpha, beta;
bool update_prob = true;
q->vars.qdelay_old = q->vars.qdelay;
if (q->vars.avg_dq_rate > 0)
qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate;
else
qdelay = 0;
/* If qdelay is zero and qlen is not, it means qlen is very small, less
* than dequeue_rate, so we do not update probability in this round
*/
if (qdelay == 0 && qlen != 0)
update_prob = false;
/* In the algorithm, alpha and beta are between 0 and 2 with typical
* value for alpha as 0.125. In this implementation, we use values 0-32
* passed from user space to represent this. Also, alpha and beta have
* unit of HZ and need to be scaled before they can be used to update
* probability. alpha/beta are updated locally below by 1) scaling them
* appropriately 2) scaling down by 16 to come to 0-2 range.
* Please see paper for details.
*
* We scale alpha and beta differently depending on whether we are in
* light, medium or high dropping mode.
*/
if (q->vars.prob < MAX_PROB / 100) {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7;
} else if (q->vars.prob < MAX_PROB / 10) {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5;
} else {
alpha =
(q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
beta =
(q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
}
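/* In the last branch, >>4 just removes the 1/16 fixed-point scaling
 * (mapping 0-32 onto 0-2); the extra >>1 and >>3 in the medium and
 * light branches further damp the controller while drop probability
 * is still small.
 */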
/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
delta += alpha * ((qdelay - q->params.target));
delta += beta * ((qdelay - qdelay_old));
oldprob = q->vars.prob;
/* to ensure we increase probability in steps of no more than 2% */
if (delta > (s32) (MAX_PROB / (100 / 2)) &&
q->vars.prob >= MAX_PROB / 10)
delta = (MAX_PROB / 100) * 2;
/* Non-linear drop:
* Tune drop probability to increase quickly for high delays(>= 250ms)
* 250ms is derived through experiments and provides error protection
*/
if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
delta += MAX_PROB / (100 / 2);
q->vars.prob += delta;
if (delta > 0) {
/* prevent overflow */
if (q->vars.prob < oldprob) {
q->vars.prob = MAX_PROB;
/* Prevent normalization error. If probability is at
* maximum value already, we normalize it here, and
* skip the check to do a non-linear drop in the next
* section.
*/
update_prob = false;
}
} else {
/* prevent underflow */
if (q->vars.prob > oldprob)
q->vars.prob = 0;
}
/* Non-linear drop in probability: Reduce drop probability quickly if
* delay is 0 for 2 consecutive Tupdate periods.
*/
if ((qdelay == 0) && (qdelay_old == 0) && update_prob)
q->vars.prob = (q->vars.prob * 98) / 100;
q->vars.qdelay = qdelay;
q->vars.qlen_old = qlen;
/* We restart the measurement cycle if the following conditions are met
* 1. If the delay has been low for 2 consecutive Tupdate periods
* 2. Calculated drop probability is zero
* 3. We have at least one estimate for the avg_dq_rate, i.e., it
* is a non-zero value
*/
if ((q->vars.qdelay < q->params.target / 2) &&
(q->vars.qdelay_old < q->params.target / 2) &&
(q->vars.prob == 0) &&
(q->vars.avg_dq_rate > 0))
pie_vars_init(&q->vars);
}
static void pie_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct pie_sched_data *q = qdisc_priv(sch);
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
calculate_probability(sch);
/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
if (q->params.tupdate)
mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
spin_unlock(root_lock);
}
static int pie_init(struct Qdisc *sch, struct nlattr *opt)
{
struct pie_sched_data *q = qdisc_priv(sch);
pie_params_init(&q->params);
pie_vars_init(&q->vars);
sch->limit = q->params.limit;
setup_timer(&q->adapt_timer, pie_timer, (unsigned long)sch);
if (opt) {
int err = pie_change(sch, opt);
if (err)
return err;
}
mod_timer(&q->adapt_timer, jiffies + HZ / 2);
return 0;
}
static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
/* convert target from pschedtime to us */
if (nla_put_u32(skb, TCA_PIE_TARGET,
((u32) PSCHED_TICKS2NS(q->params.target)) /
NSEC_PER_USEC) ||
nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) ||
nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -1;
}
static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct tc_pie_xstats st = {
.prob = q->vars.prob,
.delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) /
NSEC_PER_USEC,
/* unscale and return dq_rate in bytes per sec */
.avg_dq_rate = q->vars.avg_dq_rate *
(PSCHED_TICKS_PER_SEC) >> PIE_SCALE,
.packets_in = q->stats.packets_in,
.overlimit = q->stats.overlimit,
.maxq = q->stats.maxq,
.dropped = q->stats.dropped,
.ecn_mark = q->stats.ecn_mark,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
skb = __qdisc_dequeue_head(sch, &sch->q);
if (!skb)
return NULL;
pie_process_dequeue(sch, skb);
return skb;
}
static void pie_reset(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
qdisc_reset_queue(sch);
pie_vars_init(&q->vars);
}
static void pie_destroy(struct Qdisc *sch)
{
struct pie_sched_data *q = qdisc_priv(sch);
q->params.tupdate = 0;
del_timer_sync(&q->adapt_timer);
}
static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
.id = "pie",
.priv_size = sizeof(struct pie_sched_data),
.enqueue = pie_qdisc_enqueue,
.dequeue = pie_qdisc_dequeue,
.peek = qdisc_peek_dequeued,
.init = pie_init,
.destroy = pie_destroy,
.reset = pie_reset,
.change = pie_change,
.dump = pie_dump,
.dump_stats = pie_dump_stats,
.owner = THIS_MODULE,
};
static int __init pie_module_init(void)
{
return register_qdisc(&pie_qdisc_ops);
}
static void __exit pie_module_exit(void)
{
unregister_qdisc(&pie_qdisc_ops);
}
module_init(pie_module_init);
module_exit(pie_module_exit);
MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
MODULE_AUTHOR("Vijay Subramanian");
MODULE_AUTHOR("Mythili Prabhu");
MODULE_LICENSE("GPL");

233
net/sched/sch_plug.c Normal file

@@ -0,0 +1,233 @@
/*
* sch_plug.c Queue traffic until an explicit release command
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* There are two ways to use this qdisc:
* 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating
* sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands.
*
* 2. For network output buffering (a.k.a. output commit) functionality.
* Output commit property is commonly used by applications using checkpoint
* based fault-tolerance to ensure that the checkpoint from which a system
* is being restored is consistent w.r.t. the outside world.
*
* Consider, e.g., Remus - a Virtual Machine checkpointing system,
* wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated
* asynchronously to the backup host, while the VM continues executing the
* next epoch speculatively.
*
* The following is a typical sequence of output buffer operations:
* 1. At epoch i, start_buffer(i)
* 2. At end of epoch i (i.e. after 50ms):
* 2.1 Stop VM and take checkpoint(i).
* 2.2 start_buffer(i+1) and Resume VM
* 3. While speculatively executing epoch(i+1), asynchronously replicate
* checkpoint(i) to backup host.
* 4. When checkpoint_ack(i) is received from backup, release_buffer(i)
* Thus, this Qdisc would receive the following sequence of commands:
* TCQ_PLUG_BUFFER (epoch i)
* .. TCQ_PLUG_BUFFER (epoch i+1)
* ....TCQ_PLUG_RELEASE_ONE (epoch i)
* ......TCQ_PLUG_BUFFER (epoch i+2)
* ........
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
/*
* State of the queue, when used for network output buffering:
*
* plug(i+1) plug(i) head
* ------------------+--------------------+---------------->
* | |
* | |
* pkts_current_epoch| pkts_last_epoch |pkts_to_release
* ----------------->|<--------+--------->|+--------------->
* v v
*
*/
struct plug_sched_data {
/* If true, the dequeue function releases all packets
* from head to end of the queue. The queue turns into
* a pass-through queue for newly arriving packets.
*/
bool unplug_indefinite;
/* Queue Limit in bytes */
u32 limit;
/* Number of packets (output) from the current speculatively
* executing epoch.
*/
u32 pkts_current_epoch;
/* Number of packets corresponding to the recently finished
* epoch. These will be released when we receive a
* TCQ_PLUG_RELEASE_ONE command. This command is typically
* issued after committing a checkpoint at the target.
*/
u32 pkts_last_epoch;
/*
* Number of packets from the head of the queue, that can
* be released (committed checkpoint).
*/
u32 pkts_to_release;
};
static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct plug_sched_data *q = qdisc_priv(sch);
if (likely(sch->qstats.backlog + skb->len <= q->limit)) {
if (!q->unplug_indefinite)
q->pkts_current_epoch++;
return qdisc_enqueue_tail(skb, sch);
}
return qdisc_reshape_fail(skb, sch);
}
static struct sk_buff *plug_dequeue(struct Qdisc *sch)
{
struct plug_sched_data *q = qdisc_priv(sch);
if (qdisc_is_throttled(sch))
return NULL;
if (!q->unplug_indefinite) {
if (!q->pkts_to_release) {
/* No more packets to dequeue. Block the queue
* and wait for the next release command.
*/
qdisc_throttled(sch);
return NULL;
}
q->pkts_to_release--;
}
return qdisc_dequeue_head(sch);
}
static int plug_init(struct Qdisc *sch, struct nlattr *opt)
{
struct plug_sched_data *q = qdisc_priv(sch);
q->pkts_current_epoch = 0;
q->pkts_last_epoch = 0;
q->pkts_to_release = 0;
q->unplug_indefinite = false;
if (opt == NULL) {
/* We will set a default limit of 100 pkts (~150kB)
* in case tx_queue_len is not available. The
* default value is completely arbitrary.
*/
u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100;
q->limit = pkt_limit * psched_mtu(qdisc_dev(sch));
} else {
struct tc_plug_qopt *ctl = nla_data(opt);
if (nla_len(opt) < sizeof(*ctl))
return -EINVAL;
q->limit = ctl->limit;
}
qdisc_throttled(sch);
return 0;
}
/* Receives 4 types of messages:
* TCQ_PLUG_BUFFER: Insert a plug into the queue and
* buffer any incoming packets
* TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head
* to beginning of the next plug.
* TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue.
* Stop buffering packets until the next TCQ_PLUG_BUFFER
* command is received (just act as a pass-thru queue).
* TCQ_PLUG_LIMIT: Increase/decrease queue size
*/
static int plug_change(struct Qdisc *sch, struct nlattr *opt)
{
struct plug_sched_data *q = qdisc_priv(sch);
struct tc_plug_qopt *msg;
if (opt == NULL)
return -EINVAL;
msg = nla_data(opt);
if (nla_len(opt) < sizeof(*msg))
return -EINVAL;
switch (msg->action) {
case TCQ_PLUG_BUFFER:
/* Save size of the current buffer */
q->pkts_last_epoch = q->pkts_current_epoch;
q->pkts_current_epoch = 0;
if (q->unplug_indefinite)
qdisc_throttled(sch);
q->unplug_indefinite = false;
break;
case TCQ_PLUG_RELEASE_ONE:
/* Add packets from the last complete buffer to the
* packets to be released set.
*/
q->pkts_to_release += q->pkts_last_epoch;
q->pkts_last_epoch = 0;
qdisc_unthrottled(sch);
netif_schedule_queue(sch->dev_queue);
break;
case TCQ_PLUG_RELEASE_INDEFINITE:
q->unplug_indefinite = true;
q->pkts_to_release = 0;
q->pkts_last_epoch = 0;
q->pkts_current_epoch = 0;
qdisc_unthrottled(sch);
netif_schedule_queue(sch->dev_queue);
break;
case TCQ_PLUG_LIMIT:
/* Limit is supplied in bytes */
q->limit = msg->limit;
break;
default:
return -EINVAL;
}
return 0;
}
static struct Qdisc_ops plug_qdisc_ops __read_mostly = {
.id = "plug",
.priv_size = sizeof(struct plug_sched_data),
.enqueue = plug_enqueue,
.dequeue = plug_dequeue,
.peek = qdisc_peek_head,
.init = plug_init,
.change = plug_change,
.owner = THIS_MODULE,
};
static int __init plug_module_init(void)
{
return register_qdisc(&plug_qdisc_ops);
}
static void __exit plug_module_exit(void)
{
unregister_qdisc(&plug_qdisc_ops);
}
module_init(plug_module_init)
module_exit(plug_module_exit)
MODULE_LICENSE("GPL");
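/* Usage sketch, not part of this commit: a checkpointing tool would
 * drive the four commands above by filling struct tc_plug_qopt (the
 * same structure plug_change() reads from its TCA_OPTIONS attribute)
 * and sending it in a netlink qdisc change request. send_plug_opt()
 * below is a hypothetical wrapper around that netlink request.
 */
extern int send_plug_opt(const struct tc_plug_qopt *opt);
static int plug_start_epoch(void)
{
struct tc_plug_qopt opt = { .action = TCQ_PLUG_BUFFER };
return send_plug_opt(&opt); /* buffer packets of the new epoch */
}
static int plug_commit_epoch(void)
{
struct tc_plug_qopt opt = { .action = TCQ_PLUG_RELEASE_ONE };
return send_plug_opt(&opt); /* checkpoint acked: release oldest epoch */
}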

426
net/sched/sch_prio.c Normal file

@@ -0,0 +1,426 @@
/*
* net/sched/sch_prio.c Simple 3-band priority "scheduler".
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>:
* Init -- EINVAL when opt undefined
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
struct prio_sched_data {
int bands;
struct tcf_proto __rcu *filter_list;
u8 prio2band[TC_PRIO_MAX+1];
struct Qdisc *queues[TCQ_PRIO_BANDS];
u8 enable_flow;
};
static struct Qdisc *
prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
struct prio_sched_data *q = qdisc_priv(sch);
u32 band = skb->priority;
struct tcf_result res;
struct tcf_proto *fl;
int err;
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
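/* skb->priority may directly encode a class id of this qdisc (its
 * major number matches sch->handle); only otherwise are the attached
 * filters consulted.
 */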
if (TC_H_MAJ(skb->priority) != sch->handle) {
fl = rcu_dereference_bh(q->filter_list);
err = tc_classify(skb, fl, &res);
#ifdef CONFIG_NET_CLS_ACT
switch (err) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
case TC_ACT_SHOT:
return NULL;
}
#endif
if (!fl || err < 0) {
if (TC_H_MAJ(band))
band = 0;
return q->queues[q->prio2band[band & TC_PRIO_MAX]];
}
band = res.classid;
}
band = TC_H_MIN(band) - 1;
if (band >= q->bands)
return q->queues[q->prio2band[0]];
return q->queues[band];
}
static int
prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct Qdisc *qdisc;
int ret;
qdisc = prio_classify(skb, sch, &ret);
#ifdef CONFIG_NET_CLS_ACT
if (qdisc == NULL) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
#endif
ret = qdisc_enqueue(skb, qdisc);
if (ret == NET_XMIT_SUCCESS) {
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
return ret;
}
static struct sk_buff *prio_peek(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
if (!q->enable_flow)
return NULL;
for (prio = 0; prio < q->bands; prio++) {
struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc->ops->peek(qdisc);
if (skb)
return skb;
}
return NULL;
}
static struct sk_buff *prio_dequeue(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
if (!q->enable_flow)
return NULL;
for (prio = 0; prio < q->bands; prio++) {
struct Qdisc *qdisc = q->queues[prio];
struct sk_buff *skb = qdisc_dequeue_peeked(qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
return skb;
}
}
return NULL;
}
static unsigned int prio_drop(struct Qdisc *sch)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
unsigned int len;
struct Qdisc *qdisc;
for (prio = q->bands-1; prio >= 0; prio--) {
qdisc = q->queues[prio];
if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) {
sch->q.qlen--;
return len;
}
}
return 0;
}
static void
prio_reset(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
for (prio = 0; prio < q->bands; prio++)
qdisc_reset(q->queues[prio]);
sch->q.qlen = 0;
q->enable_flow = 1;
}
static void
prio_destroy(struct Qdisc *sch)
{
int prio;
struct prio_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
for (prio = 0; prio < q->bands; prio++)
qdisc_destroy(q->queues[prio]);
}
static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
struct tc_prio_qopt *qopt;
int i;
int flow_change = 0;
if (nla_len(opt) < sizeof(*qopt))
return -EINVAL;
qopt = nla_data(opt);
if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
return -EINVAL;
for (i = 0; i <= TC_PRIO_MAX; i++) {
if (qopt->priomap[i] >= qopt->bands)
return -EINVAL;
}
sch_tree_lock(sch);
if (q->enable_flow != qopt->enable_flow) {
q->enable_flow = qopt->enable_flow;
flow_change = 1;
}
q->bands = qopt->bands;
memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
if (child != &noop_qdisc) {
qdisc_tree_decrease_qlen(child, child->q.qlen);
qdisc_destroy(child);
}
}
sch_tree_unlock(sch);
for (i = 0; i < q->bands; i++) {
if (q->queues[i] == &noop_qdisc) {
struct Qdisc *child, *old;
child = qdisc_create_dflt(sch->dev_queue,
&pfifo_qdisc_ops,
TC_H_MAKE(sch->handle, i + 1));
if (child) {
sch_tree_lock(sch);
old = q->queues[i];
q->queues[i] = child;
if (old != &noop_qdisc) {
qdisc_tree_decrease_qlen(old,
old->q.qlen);
qdisc_destroy(old);
}
sch_tree_unlock(sch);
}
}
}
/* Schedule qdisc when flow re-enabled */
if (flow_change && q->enable_flow) {
if (!test_bit(__QDISC_STATE_DEACTIVATED,
&sch->state))
__netif_schedule(qdisc_root(sch));
}
return 0;
}
static int prio_init(struct Qdisc *sch, struct nlattr *opt)
{
struct prio_sched_data *q = qdisc_priv(sch);
int i;
for (i = 0; i < TCQ_PRIO_BANDS; i++)
q->queues[i] = &noop_qdisc;
if (opt == NULL) {
return -EINVAL;
} else {
int err;
if ((err = prio_tune(sch, opt)) != 0)
return err;
}
return 0;
}
static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_prio_qopt opt;
opt.bands = q->bands;
opt.enable_flow = q->enable_flow;
memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned long band = arg - 1;
if (new == NULL)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = q->queues[band];
q->queues[band] = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *
prio_leaf(struct Qdisc *sch, unsigned long arg)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned long band = arg - 1;
return q->queues[band];
}
static unsigned long prio_get(struct Qdisc *sch, u32 classid)
{
struct prio_sched_data *q = qdisc_priv(sch);
unsigned long band = TC_H_MIN(classid);
if (band - 1 >= q->bands)
return 0;
return band;
}
static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid)
{
return prio_get(sch, classid);
}
static void prio_put(struct Qdisc *q, unsigned long cl)
{
}
static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb,
struct tcmsg *tcm)
{
struct prio_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(cl);
tcm->tcm_info = q->queues[cl-1]->handle;
return 0;
}
static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct prio_sched_data *q = qdisc_priv(sch);
struct Qdisc *cl_q;
cl_q = q->queues[cl - 1];
if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl_q->qstats, cl_q->q.qlen) < 0)
return -1;
return 0;
}
static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct prio_sched_data *q = qdisc_priv(sch);
int prio;
if (arg->stop)
return;
for (prio = 0; prio < q->bands; prio++) {
if (arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, prio + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static struct tcf_proto __rcu **prio_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct prio_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static const struct Qdisc_class_ops prio_class_ops = {
.graft = prio_graft,
.leaf = prio_leaf,
.get = prio_get,
.put = prio_put,
.walk = prio_walk,
.tcf_chain = prio_find_tcf,
.bind_tcf = prio_bind,
.unbind_tcf = prio_put,
.dump = prio_dump_class,
.dump_stats = prio_dump_class_stats,
};
static struct Qdisc_ops prio_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &prio_class_ops,
.id = "prio",
.priv_size = sizeof(struct prio_sched_data),
.enqueue = prio_enqueue,
.dequeue = prio_dequeue,
.peek = prio_peek,
.drop = prio_drop,
.init = prio_init,
.reset = prio_reset,
.destroy = prio_destroy,
.change = prio_tune,
.dump = prio_dump,
.owner = THIS_MODULE,
};
static int __init prio_module_init(void)
{
return register_qdisc(&prio_qdisc_ops);
}
static void __exit prio_module_exit(void)
{
unregister_qdisc(&prio_qdisc_ops);
}
module_init(prio_module_init)
module_exit(prio_module_exit)
MODULE_LICENSE("GPL");

1587
net/sched/sch_qfq.c Normal file

File diff suppressed because it is too large

391
net/sched/sch_red.c Normal file
View file

@ -0,0 +1,391 @@
/*
* net/sched/sch_red.c Random Early Detection queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
* Changes:
* J Hadi Salim 980914: computation fixes
* Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
* J Hadi Salim 980816: ECN support
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
/*	Parameters, settable by user:
	-----------------------------
	limit		- bytes (must be > qth_max + burst)
	Hard limit on queue length; it should be chosen > qth_max
	to allow packet bursts. This parameter does not
	affect the algorithm's behaviour and can be chosen
	arbitrarily high (well, less than the RAM size).
	Really, this limit will never be reached
	if RED works correctly.
 */
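/* Illustrative sketch, not part of this qdisc: a hypothetical helper that
 * encodes the constraint stated above. For example, with qth_min = 30000,
 * qth_max = 90000 and a burst allowance of 60000 bytes, limit should be
 * at least 150000. The name red_limit_is_sane() is made up here.
 */
static inline bool red_limit_is_sane(u32 qth_min, u32 qth_max,
				     u32 burst_bytes, u32 limit)
{
	/* limit must leave headroom above qth_max for packet bursts */
	return qth_min < qth_max && limit > qth_max + burst_bytes;
}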
struct red_sched_data {
u32 limit; /* HARD maximal queue length */
unsigned char flags;
struct timer_list adapt_timer;
struct red_parms parms;
struct red_vars vars;
struct red_stats stats;
struct Qdisc *qdisc;
};
static inline int red_use_ecn(struct red_sched_data *q)
{
return q->flags & TC_RED_ECN;
}
static inline int red_use_harddrop(struct red_sched_data *q)
{
return q->flags & TC_RED_HARDDROP;
}
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
int ret;
q->vars.qavg = red_calc_qavg(&q->parms,
&q->vars,
child->qstats.backlog);
if (red_is_idling(&q->vars))
red_end_of_idle_period(&q->vars);
switch (red_action(&q->parms, &q->vars, q->vars.qavg)) {
case RED_DONT_MARK:
break;
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
q->stats.prob_drop++;
goto congestion_drop;
}
q->stats.prob_mark++;
break;
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
if (red_use_harddrop(q) || !red_use_ecn(q) ||
!INET_ECN_set_ce(skb)) {
q->stats.forced_drop++;
goto congestion_drop;
}
q->stats.forced_mark++;
break;
}
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->q.qlen++;
} else if (net_xmit_drop_count(ret)) {
q->stats.pdrop++;
qdisc_qstats_drop(sch);
}
return ret;
congestion_drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
}
static struct sk_buff *red_dequeue(struct Qdisc *sch)
{
struct sk_buff *skb;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
skb = child->dequeue(child);
if (skb) {
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
} else {
if (!red_is_idling(&q->vars))
red_start_of_idle_period(&q->vars);
}
return skb;
}
static struct sk_buff *red_peek(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
return child->ops->peek(child);
}
static unsigned int red_drop(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
unsigned int len;
if (child->ops->drop && (len = child->ops->drop(child)) > 0) {
q->stats.other++;
qdisc_qstats_drop(sch);
sch->q.qlen--;
return len;
}
if (!red_is_idling(&q->vars))
red_start_of_idle_period(&q->vars);
return 0;
}
static void red_reset(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
red_restart(&q->vars);
}
static void red_destroy(struct Qdisc *sch)
{
struct red_sched_data *q = qdisc_priv(sch);
del_timer_sync(&q->adapt_timer);
qdisc_destroy(q->qdisc);
}
static const struct nla_policy red_policy[TCA_RED_MAX + 1] = {
[TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) },
[TCA_RED_STAB] = { .len = RED_STAB_SIZE },
[TCA_RED_MAX_P] = { .type = NLA_U32 },
};
static int red_change(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_RED_MAX + 1];
struct tc_red_qopt *ctl;
struct Qdisc *child = NULL;
int err;
u32 max_P;
if (opt == NULL)
return -EINVAL;
err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy);
if (err < 0)
return err;
if (tb[TCA_RED_PARMS] == NULL ||
tb[TCA_RED_STAB] == NULL)
return -EINVAL;
max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0;
ctl = nla_data(tb[TCA_RED_PARMS]);
if (ctl->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, ctl->limit);
if (IS_ERR(child))
return PTR_ERR(child);
}
sch_tree_lock(sch);
q->flags = ctl->flags;
q->limit = ctl->limit;
if (child) {
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
red_set_parms(&q->parms,
ctl->qth_min, ctl->qth_max, ctl->Wlog,
ctl->Plog, ctl->Scell_log,
nla_data(tb[TCA_RED_STAB]),
max_P);
red_set_vars(&q->vars);
del_timer(&q->adapt_timer);
if (ctl->flags & TC_RED_ADAPTATIVE)
mod_timer(&q->adapt_timer, jiffies + HZ/2);
if (!q->qdisc->q.qlen)
red_start_of_idle_period(&q->vars);
sch_tree_unlock(sch);
return 0;
}
static inline void red_adaptative_timer(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct red_sched_data *q = qdisc_priv(sch);
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
red_adaptative_algo(&q->parms, &q->vars);
mod_timer(&q->adapt_timer, jiffies + HZ/2);
spin_unlock(root_lock);
}
static int red_init(struct Qdisc *sch, struct nlattr *opt)
{
struct red_sched_data *q = qdisc_priv(sch);
q->qdisc = &noop_qdisc;
setup_timer(&q->adapt_timer, red_adaptative_timer, (unsigned long)sch);
return red_change(sch, opt);
}
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct red_sched_data *q = qdisc_priv(sch);
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
.flags = q->flags,
.qth_min = q->parms.qth_min >> q->parms.Wlog,
.qth_max = q->parms.qth_max >> q->parms.Wlog,
.Wlog = q->parms.Wlog,
.Plog = q->parms.Plog,
.Scell_log = q->parms.Scell_log,
};
sch->qstats.backlog = q->qdisc->qstats.backlog;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct red_sched_data *q = qdisc_priv(sch);
struct tc_red_xstats st = {
.early = q->stats.prob_drop + q->stats.forced_drop,
.pdrop = q->stats.pdrop,
.other = q->stats.other,
.marked = q->stats.prob_mark + q->stats.forced_mark,
};
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static int red_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct red_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
return 0;
}
static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct red_sched_data *q = qdisc_priv(sch);
if (new == NULL)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = q->qdisc;
q->qdisc = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *red_leaf(struct Qdisc *sch, unsigned long arg)
{
struct red_sched_data *q = qdisc_priv(sch);
return q->qdisc;
}
static unsigned long red_get(struct Qdisc *sch, u32 classid)
{
return 1;
}
static void red_put(struct Qdisc *sch, unsigned long arg)
{
}
static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
static const struct Qdisc_class_ops red_class_ops = {
.graft = red_graft,
.leaf = red_leaf,
.get = red_get,
.put = red_put,
.walk = red_walk,
.dump = red_dump_class,
};
static struct Qdisc_ops red_qdisc_ops __read_mostly = {
.id = "red",
.priv_size = sizeof(struct red_sched_data),
.cl_ops = &red_class_ops,
.enqueue = red_enqueue,
.dequeue = red_dequeue,
.peek = red_peek,
.drop = red_drop,
.init = red_init,
.reset = red_reset,
.destroy = red_destroy,
.change = red_change,
.dump = red_dump,
.dump_stats = red_dump_stats,
.owner = THIS_MODULE,
};
static int __init red_module_init(void)
{
return register_qdisc(&red_qdisc_ops);
}
static void __exit red_module_exit(void)
{
unregister_qdisc(&red_qdisc_ops);
}
module_init(red_module_init)
module_exit(red_module_exit)
MODULE_LICENSE("GPL");

728
net/sched/sch_sfb.c Normal file
View file

@ -0,0 +1,728 @@
/*
* net/sched/sch_sfb.c Stochastic Fair Blue
*
* Copyright (c) 2008-2011 Juliusz Chroboczek <jch@pps.jussieu.fr>
* Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* W. Feng, D. Kandlur, D. Saha, K. Shin. Blue:
* A New Class of Active Queue Management Algorithms.
* U. Michigan CSE-TR-387-99, April 1999.
*
* http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/flow_keys.h>
/*
* SFB uses two B[l][n] : L x N arrays of bins (L levels, N bins per level)
* This implementation uses L = 8 and N = 16
* This permits us to split one 32bit hash (provided per packet by rxhash or
* external classifier) into 8 subhashes of 4 bits.
*/
#define SFB_BUCKET_SHIFT 4
#define SFB_NUMBUCKETS (1 << SFB_BUCKET_SHIFT) /* N bins per Level */
#define SFB_BUCKET_MASK (SFB_NUMBUCKETS - 1)
#define SFB_LEVELS (32 / SFB_BUCKET_SHIFT) /* L */
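/* Illustrative sketch (hypothetical helper, mirroring the loops in
 * increment_one_qlen()/decrement_one_qlen() below): one 32bit flow hash
 * is consumed SFB_BUCKET_SHIFT bits at a time, yielding one bin index
 * per level.
 */
static inline void sfb_split_hash(u32 sfbhash, u32 bin[SFB_LEVELS])
{
	int i;

	for (i = 0; i < SFB_LEVELS; i++) {
		bin[i] = sfbhash & SFB_BUCKET_MASK; /* low 4 bits -> bin index */
		sfbhash >>= SFB_BUCKET_SHIFT;	    /* advance to next level */
	}
}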
/* SFB algo uses a virtual queue, named "bin" */
struct sfb_bucket {
u16 qlen; /* length of virtual queue */
u16 p_mark; /* marking probability */
};
/* We use double buffering right before a hash change
 * (Section 4.4 of the SFB reference: moving hash functions)
*/
struct sfb_bins {
u32 perturbation; /* jhash perturbation */
struct sfb_bucket bins[SFB_LEVELS][SFB_NUMBUCKETS];
};
struct sfb_sched_data {
struct Qdisc *qdisc;
struct tcf_proto __rcu *filter_list;
unsigned long rehash_interval;
unsigned long warmup_time; /* double buffering warmup time in jiffies */
u32 max;
u32 bin_size; /* maximum queue length per bin */
u32 increment; /* d1 */
u32 decrement; /* d2 */
u32 limit; /* HARD maximal queue length */
u32 penalty_rate;
u32 penalty_burst;
u32 tokens_avail;
unsigned long rehash_time;
unsigned long token_time;
u8 slot; /* current active bins (0 or 1) */
bool double_buffering;
struct sfb_bins bins[2];
struct {
u32 earlydrop;
u32 penaltydrop;
u32 bucketdrop;
u32 queuedrop;
u32 childdrop; /* drops in child qdisc */
u32 marked; /* ECN mark */
} stats;
};
/*
* Each queued skb might be hashed on one or two bins
* We store in skb_cb the two hash values.
* (A zero value means double buffering was not used)
*/
struct sfb_skb_cb {
u32 hashes[2];
};
static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb)
{
qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb));
return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data;
}
/*
* If using 'internal' SFB flow classifier, hash comes from skb rxhash
* If using external classifier, hash comes from the classid.
*/
static u32 sfb_hash(const struct sk_buff *skb, u32 slot)
{
return sfb_skb_cb(skb)->hashes[slot];
}
/* Probabilities are coded as Q0.16 fixed-point values,
* with 0xFFFF representing 65535/65536 (almost 1.0)
* Addition and subtraction are saturating in [0, 65535]
*/
static u32 prob_plus(u32 p1, u32 p2)
{
u32 res = p1 + p2;
return min_t(u32, res, SFB_MAX_PROB);
}
static u32 prob_minus(u32 p1, u32 p2)
{
return p1 > p2 ? p1 - p2 : 0;
}
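/* Worked example (illustrative): in Q0.16, 0x8000 is 0.5 and 0xC000 is
 * 0.75, so prob_plus(0xC000, 0x8000) saturates at SFB_MAX_PROB (0xFFFF,
 * i.e. almost 1.0) instead of wrapping, and prob_minus(0x4000, 0x8000)
 * clamps to 0.
 */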
static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q)
{
int i;
struct sfb_bucket *b = &q->bins[slot].bins[0][0];
for (i = 0; i < SFB_LEVELS; i++) {
u32 hash = sfbhash & SFB_BUCKET_MASK;
sfbhash >>= SFB_BUCKET_SHIFT;
if (b[hash].qlen < 0xFFFF)
b[hash].qlen++;
b += SFB_NUMBUCKETS; /* next level */
}
}
static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
{
u32 sfbhash;
sfbhash = sfb_hash(skb, 0);
if (sfbhash)
increment_one_qlen(sfbhash, 0, q);
sfbhash = sfb_hash(skb, 1);
if (sfbhash)
increment_one_qlen(sfbhash, 1, q);
}
static void decrement_one_qlen(u32 sfbhash, u32 slot,
struct sfb_sched_data *q)
{
int i;
struct sfb_bucket *b = &q->bins[slot].bins[0][0];
for (i = 0; i < SFB_LEVELS; i++) {
u32 hash = sfbhash & SFB_BUCKET_MASK;
sfbhash >>= SFB_BUCKET_SHIFT;
if (b[hash].qlen > 0)
b[hash].qlen--;
b += SFB_NUMBUCKETS; /* next level */
}
}
static void decrement_qlen(const struct sk_buff *skb, struct sfb_sched_data *q)
{
u32 sfbhash;
sfbhash = sfb_hash(skb, 0);
if (sfbhash)
decrement_one_qlen(sfbhash, 0, q);
sfbhash = sfb_hash(skb, 1);
if (sfbhash)
decrement_one_qlen(sfbhash, 1, q);
}
static void decrement_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
b->p_mark = prob_minus(b->p_mark, q->decrement);
}
static void increment_prob(struct sfb_bucket *b, struct sfb_sched_data *q)
{
b->p_mark = prob_plus(b->p_mark, q->increment);
}
static void sfb_zero_all_buckets(struct sfb_sched_data *q)
{
memset(&q->bins, 0, sizeof(q->bins));
}
/*
* compute max qlen, max p_mark, and avg p_mark
*/
static u32 sfb_compute_qlen(u32 *prob_r, u32 *avgpm_r, const struct sfb_sched_data *q)
{
int i;
u32 qlen = 0, prob = 0, totalpm = 0;
const struct sfb_bucket *b = &q->bins[q->slot].bins[0][0];
for (i = 0; i < SFB_LEVELS * SFB_NUMBUCKETS; i++) {
if (qlen < b->qlen)
qlen = b->qlen;
totalpm += b->p_mark;
if (prob < b->p_mark)
prob = b->p_mark;
b++;
}
*prob_r = prob;
*avgpm_r = totalpm / (SFB_LEVELS * SFB_NUMBUCKETS);
return qlen;
}
static void sfb_init_perturbation(u32 slot, struct sfb_sched_data *q)
{
q->bins[slot].perturbation = prandom_u32();
}
static void sfb_swap_slot(struct sfb_sched_data *q)
{
sfb_init_perturbation(q->slot, q);
q->slot ^= 1;
q->double_buffering = false;
}
/* Non-elastic flows are allowed to use part of the bandwidth, expressed
 * in "penalty_rate" packets per second, with a "penalty_burst" burst allowance
*/
static bool sfb_rate_limit(struct sk_buff *skb, struct sfb_sched_data *q)
{
if (q->penalty_rate == 0 || q->penalty_burst == 0)
return true;
if (q->tokens_avail < 1) {
unsigned long age = min(10UL * HZ, jiffies - q->token_time);
q->tokens_avail = (age * q->penalty_rate) / HZ;
if (q->tokens_avail > q->penalty_burst)
q->tokens_avail = q->penalty_burst;
q->token_time = jiffies;
if (q->tokens_avail < 1)
return true;
}
q->tokens_avail--;
return false;
}
static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl,
int *qerr, u32 *salt)
{
struct tcf_result res;
int result;
result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
		*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		/* fall through */
case TC_ACT_SHOT:
return false;
}
#endif
*salt = TC_H_MIN(res.classid);
return true;
}
return false;
}
static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
struct tcf_proto *fl;
int i;
u32 p_min = ~0;
u32 minqlen = ~0;
u32 r, slot, salt, sfbhash;
int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
struct flow_keys keys;
if (unlikely(sch->q.qlen >= q->limit)) {
qdisc_qstats_overlimit(sch);
q->stats.queuedrop++;
goto drop;
}
if (q->rehash_interval > 0) {
unsigned long limit = q->rehash_time + q->rehash_interval;
if (unlikely(time_after(jiffies, limit))) {
sfb_swap_slot(q);
q->rehash_time = jiffies;
} else if (unlikely(!q->double_buffering && q->warmup_time > 0 &&
time_after(jiffies, limit - q->warmup_time))) {
q->double_buffering = true;
}
}
fl = rcu_dereference_bh(q->filter_list);
if (fl) {
/* If using external classifiers, get result and record it. */
if (!sfb_classify(skb, fl, &ret, &salt))
goto other_drop;
keys.src = salt;
keys.dst = 0;
keys.ports = 0;
} else {
skb_flow_dissect(skb, &keys);
}
slot = q->slot;
sfbhash = jhash_3words((__force u32)keys.dst,
(__force u32)keys.src,
(__force u32)keys.ports,
q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
for (i = 0; i < SFB_LEVELS; i++) {
u32 hash = sfbhash & SFB_BUCKET_MASK;
struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
sfbhash >>= SFB_BUCKET_SHIFT;
if (b->qlen == 0)
decrement_prob(b, q);
else if (b->qlen >= q->bin_size)
increment_prob(b, q);
if (minqlen > b->qlen)
minqlen = b->qlen;
if (p_min > b->p_mark)
p_min = b->p_mark;
}
slot ^= 1;
sfb_skb_cb(skb)->hashes[slot] = 0;
if (unlikely(minqlen >= q->max)) {
qdisc_qstats_overlimit(sch);
q->stats.bucketdrop++;
goto drop;
}
if (unlikely(p_min >= SFB_MAX_PROB)) {
/* Inelastic flow */
if (q->double_buffering) {
sfbhash = jhash_3words((__force u32)keys.dst,
(__force u32)keys.src,
(__force u32)keys.ports,
q->bins[slot].perturbation);
if (!sfbhash)
sfbhash = 1;
sfb_skb_cb(skb)->hashes[slot] = sfbhash;
for (i = 0; i < SFB_LEVELS; i++) {
u32 hash = sfbhash & SFB_BUCKET_MASK;
struct sfb_bucket *b = &q->bins[slot].bins[i][hash];
sfbhash >>= SFB_BUCKET_SHIFT;
if (b->qlen == 0)
decrement_prob(b, q);
else if (b->qlen >= q->bin_size)
increment_prob(b, q);
}
}
if (sfb_rate_limit(skb, q)) {
qdisc_qstats_overlimit(sch);
q->stats.penaltydrop++;
goto drop;
}
goto enqueue;
}
r = prandom_u32() & SFB_MAX_PROB;
if (unlikely(r < p_min)) {
if (unlikely(p_min > SFB_MAX_PROB / 2)) {
/* If we're marking that many packets, then either
* this flow is unresponsive, or we're badly congested.
* In either case, we want to start dropping packets.
*/
if (r < (p_min - SFB_MAX_PROB / 2) * 2) {
q->stats.earlydrop++;
goto drop;
}
}
if (INET_ECN_set_ce(skb)) {
q->stats.marked++;
} else {
q->stats.earlydrop++;
goto drop;
}
}
enqueue:
ret = qdisc_enqueue(skb, child);
if (likely(ret == NET_XMIT_SUCCESS)) {
sch->q.qlen++;
increment_qlen(skb, q);
} else if (net_xmit_drop_count(ret)) {
q->stats.childdrop++;
qdisc_qstats_drop(sch);
}
return ret;
drop:
qdisc_drop(skb, sch);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
static struct sk_buff *sfb_dequeue(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
struct sk_buff *skb;
skb = child->dequeue(q->qdisc);
if (skb) {
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
decrement_qlen(skb, q);
}
return skb;
}
static struct sk_buff *sfb_peek(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
return child->ops->peek(child);
}
/* No sfb_drop -- impossible since the child doesn't return the dropped skb. */
static void sfb_reset(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
q->slot = 0;
q->double_buffering = false;
sfb_zero_all_buckets(q);
sfb_init_perturbation(0, q);
}
static void sfb_destroy(struct Qdisc *sch)
{
struct sfb_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
qdisc_destroy(q->qdisc);
}
static const struct nla_policy sfb_policy[TCA_SFB_MAX + 1] = {
[TCA_SFB_PARMS] = { .len = sizeof(struct tc_sfb_qopt) },
};
static const struct tc_sfb_qopt sfb_default_ops = {
.rehash_interval = 600 * MSEC_PER_SEC,
.warmup_time = 60 * MSEC_PER_SEC,
.limit = 0,
.max = 25,
.bin_size = 20,
.increment = (SFB_MAX_PROB + 500) / 1000, /* 0.1 % */
.decrement = (SFB_MAX_PROB + 3000) / 6000,
.penalty_rate = 10,
.penalty_burst = 20,
};
static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct Qdisc *child;
struct nlattr *tb[TCA_SFB_MAX + 1];
const struct tc_sfb_qopt *ctl = &sfb_default_ops;
u32 limit;
int err;
if (opt) {
err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy);
if (err < 0)
return -EINVAL;
if (tb[TCA_SFB_PARMS] == NULL)
return -EINVAL;
ctl = nla_data(tb[TCA_SFB_PARMS]);
}
limit = ctl->limit;
if (limit == 0)
limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1);
child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit);
if (IS_ERR(child))
return PTR_ERR(child);
sch_tree_lock(sch);
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
qdisc_destroy(q->qdisc);
q->qdisc = child;
q->rehash_interval = msecs_to_jiffies(ctl->rehash_interval);
q->warmup_time = msecs_to_jiffies(ctl->warmup_time);
q->rehash_time = jiffies;
q->limit = limit;
q->increment = ctl->increment;
q->decrement = ctl->decrement;
q->max = ctl->max;
q->bin_size = ctl->bin_size;
q->penalty_rate = ctl->penalty_rate;
q->penalty_burst = ctl->penalty_burst;
q->tokens_avail = ctl->penalty_burst;
q->token_time = jiffies;
q->slot = 0;
q->double_buffering = false;
sfb_zero_all_buckets(q);
sfb_init_perturbation(0, q);
sfb_init_perturbation(1, q);
sch_tree_unlock(sch);
return 0;
}
static int sfb_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfb_sched_data *q = qdisc_priv(sch);
q->qdisc = &noop_qdisc;
return sfb_change(sch, opt);
}
static int sfb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
struct tc_sfb_qopt opt = {
.rehash_interval = jiffies_to_msecs(q->rehash_interval),
.warmup_time = jiffies_to_msecs(q->warmup_time),
.limit = q->limit,
.max = q->max,
.bin_size = q->bin_size,
.increment = q->increment,
.decrement = q->decrement,
.penalty_rate = q->penalty_rate,
.penalty_burst = q->penalty_burst,
};
sch->qstats.backlog = q->qdisc->qstats.backlog;
opts = nla_nest_start(skb, TCA_OPTIONS);
if (opts == NULL)
goto nla_put_failure;
if (nla_put(skb, TCA_SFB_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
return nla_nest_end(skb, opts);
nla_put_failure:
nla_nest_cancel(skb, opts);
return -EMSGSIZE;
}
static int sfb_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct sfb_sched_data *q = qdisc_priv(sch);
struct tc_sfb_xstats st = {
.earlydrop = q->stats.earlydrop,
.penaltydrop = q->stats.penaltydrop,
.bucketdrop = q->stats.bucketdrop,
.queuedrop = q->stats.queuedrop,
.childdrop = q->stats.childdrop,
.marked = q->stats.marked,
};
st.maxqlen = sfb_compute_qlen(&st.maxprob, &st.avgprob, q);
return gnet_stats_copy_app(d, &st, sizeof(st));
}
static int sfb_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
return -ENOSYS;
}
static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct sfb_sched_data *q = qdisc_priv(sch);
if (new == NULL)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = q->qdisc;
q->qdisc = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *sfb_leaf(struct Qdisc *sch, unsigned long arg)
{
struct sfb_sched_data *q = qdisc_priv(sch);
return q->qdisc;
}
static unsigned long sfb_get(struct Qdisc *sch, u32 classid)
{
return 1;
}
static void sfb_put(struct Qdisc *sch, unsigned long arg)
{
}
static int sfb_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
struct nlattr **tca, unsigned long *arg)
{
return -ENOSYS;
}
static int sfb_delete(struct Qdisc *sch, unsigned long cl)
{
return -ENOSYS;
}
static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
static struct tcf_proto __rcu **sfb_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct sfb_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static unsigned long sfb_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
return 0;
}
static const struct Qdisc_class_ops sfb_class_ops = {
.graft = sfb_graft,
.leaf = sfb_leaf,
.get = sfb_get,
.put = sfb_put,
.change = sfb_change_class,
.delete = sfb_delete,
.walk = sfb_walk,
.tcf_chain = sfb_find_tcf,
.bind_tcf = sfb_bind,
.unbind_tcf = sfb_put,
.dump = sfb_dump_class,
};
static struct Qdisc_ops sfb_qdisc_ops __read_mostly = {
.id = "sfb",
.priv_size = sizeof(struct sfb_sched_data),
.cl_ops = &sfb_class_ops,
.enqueue = sfb_enqueue,
.dequeue = sfb_dequeue,
.peek = sfb_peek,
.init = sfb_init,
.reset = sfb_reset,
.destroy = sfb_destroy,
.change = sfb_change,
.dump = sfb_dump,
.dump_stats = sfb_dump_stats,
.owner = THIS_MODULE,
};
static int __init sfb_module_init(void)
{
return register_qdisc(&sfb_qdisc_ops);
}
static void __exit sfb_module_exit(void)
{
unregister_qdisc(&sfb_qdisc_ops);
}
module_init(sfb_module_init)
module_exit(sfb_module_exit)
MODULE_DESCRIPTION("Stochastic Fair Blue queue discipline");
MODULE_AUTHOR("Juliusz Chroboczek");
MODULE_AUTHOR("Eric Dumazet");
MODULE_LICENSE("GPL");

939
net/sched/sch_sfq.c Normal file
View file

@ -0,0 +1,939 @@
/*
* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>
/* Stochastic Fairness Queuing algorithm.
=======================================
Source:
Paul E. McKenney "Stochastic Fairness Queuing",
IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
Paul E. McKenney "Stochastic Fairness Queuing",
"Interworking: Research and Experience", v.2, 1991, p.113-131.
See also:
M. Shreedhar and George Varghese "Efficient Fair
Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
This is not the thing that is usually called (W)FQ nowadays.
It does not use any timestamp mechanism, but instead
processes queues in round-robin order.
ADVANTAGE:
- It is very cheap. Both CPU and memory requirements are minimal.
DRAWBACKS:
- "Stochastic" -> It is not 100% fair.
When hash collisions occur, several flows are considered as one.
- "Round-robin" -> It introduces larger delays than virtual clock
based schemes, and should not be used for isolating interactive
	traffic from non-interactive. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.
We still need true WFQ for top level CSZ, but using WFQ
for the best effort traffic is absolutely pointless:
SFQ is superior for this purpose.
IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets,
	- max mtu to 2^18-1,
	- max 65408 flows,
	- number of hash buckets to 65536.
It is easy to increase these values, but not in flight. */
#define SFQ_MAX_DEPTH 127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS 128
#define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT 0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024
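/* Worked check (illustrative): SFQ_MAX_FLOWS = 0x10000 - SFQ_MAX_DEPTH - 1
 * = 65536 - 127 - 1 = 65408, the "max 65408 flows" limit quoted above;
 * the SFQ_MAX_DEPTH + 1 index values above SFQ_MAX_FLOWS - 1 are reserved
 * as 'pointers' into the dep[] array (see the sfq_index comment below).
 */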
/* We use 16 bits to store allot, and want to handle packets up to 64K
* Scale allot by 8 (1<<3) so that no overflow occurs.
*/
#define SFQ_ALLOT_SHIFT 3
#define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
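/* Worked example (illustrative): with a typical quantum of 1514 bytes,
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190, and even a 64KB
 * GSO packet scales to 8192, well within the signed 16bit 'allot' field.
 */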
/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
/*
 * We don't use pointers, in order to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' into the slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' into the dep[] array.
*/
struct sfq_head {
sfq_index next;
sfq_index prev;
};
struct sfq_slot {
struct sk_buff *skblist_next;
struct sk_buff *skblist_prev;
sfq_index qlen; /* number of skbs in skblist */
sfq_index next; /* next slot in sfq RR chain */
struct sfq_head dep; /* anchor in dep[] chains */
unsigned short hash; /* hash value (index in ht[]) */
short allot; /* credit for this slot */
unsigned int backlog;
struct red_vars vars;
};
struct sfq_sched_data {
/* frequently used fields */
int limit; /* limit of total number of packets in this qdisc */
unsigned int divisor; /* number of slots in hash table */
u8 headdrop;
u8 maxdepth; /* limit of packets per flow */
u32 perturbation;
u8 cur_depth; /* depth of longest slot */
u8 flags;
unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
struct tcf_proto __rcu *filter_list;
sfq_index *ht; /* Hash table ('divisor' slots) */
struct sfq_slot *slots; /* Flows table ('maxflows' entries) */
struct red_parms *red_parms;
struct tc_sfqred_stats stats;
struct sfq_slot *tail; /* current slot in round */
struct sfq_head dep[SFQ_MAX_DEPTH + 1];
/* Linked lists of slots, indexed by depth
* dep[0] : list of unused flows
* dep[1] : list of flows with 1 packet
* dep[X] : list of flows with X packets
*/
unsigned int maxflows; /* number of flows in flows array */
int perturb_period;
unsigned int quantum; /* Allotment per round: MUST BE >= MTU */
struct timer_list perturb_timer;
};
/*
* sfq_head are either in a sfq_slot or in dep[] array
*/
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
if (val < SFQ_MAX_FLOWS)
return &q->slots[val].dep;
return &q->dep[val - SFQ_MAX_FLOWS];
}
/*
* In order to be able to quickly rehash our queue when timer changes
* q->perturbation, we store flow_keys in skb->cb[]
*/
struct sfq_skb_cb {
struct flow_keys keys;
};
static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}
static unsigned int sfq_hash(const struct sfq_sched_data *q,
const struct sk_buff *skb)
{
const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
unsigned int hash;
hash = jhash_3words((__force u32)keys->dst,
(__force u32)keys->src ^ keys->ip_proto,
(__force u32)keys->ports, q->perturbation);
return hash & (q->divisor - 1);
}
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
int *qerr)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tcf_result res;
struct tcf_proto *fl;
int result;
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
TC_H_MIN(skb->priority) <= q->divisor)
return TC_H_MIN(skb->priority);
fl = rcu_dereference_bh(q->filter_list);
if (!fl) {
skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
return sfq_hash(q, skb) + 1;
}
*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
result = tc_classify(skb, fl, &res);
if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
switch (result) {
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
case TC_ACT_SHOT:
return 0;
}
#endif
if (TC_H_MIN(res.classid) <= q->divisor)
return TC_H_MIN(res.classid);
}
return 0;
}
/*
* x : slot number [0 .. SFQ_MAX_FLOWS - 1]
*/
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
struct sfq_slot *slot = &q->slots[x];
int qlen = slot->qlen;
p = qlen + SFQ_MAX_FLOWS;
n = q->dep[qlen].next;
slot->dep.next = n;
slot->dep.prev = p;
q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */
sfq_dep_head(q, n)->prev = x;
}
#define sfq_unlink(q, x, n, p) \
do { \
n = q->slots[x].dep.next; \
p = q->slots[x].dep.prev; \
sfq_dep_head(q, p)->next = n; \
sfq_dep_head(q, n)->prev = p; \
} while (0)
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d;
sfq_unlink(q, x, n, p);
d = q->slots[x].qlen--;
if (n == p && q->cur_depth == d)
q->cur_depth--;
sfq_link(q, x);
}
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
sfq_index p, n;
int d;
sfq_unlink(q, x, n, p);
d = ++q->slots[x].qlen;
if (q->cur_depth < d)
q->cur_depth = d;
sfq_link(q, x);
}
/* helper functions : might be changed when/if skb use a standard list_head */
/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
struct sk_buff *skb = slot->skblist_prev;
slot->skblist_prev = skb->prev;
skb->prev->next = (struct sk_buff *)slot;
skb->next = skb->prev = NULL;
return skb;
}
/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
struct sk_buff *skb = slot->skblist_next;
slot->skblist_next = skb->next;
skb->next->prev = (struct sk_buff *)slot;
skb->next = skb->prev = NULL;
return skb;
}
static inline void slot_queue_init(struct sfq_slot *slot)
{
memset(slot, 0, sizeof(*slot));
slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}
/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
skb->prev = slot->skblist_prev;
skb->next = (struct sk_buff *)slot;
slot->skblist_prev->next = skb;
slot->skblist_prev = skb;
}
static unsigned int sfq_drop(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
sfq_index x, d = q->cur_depth;
struct sk_buff *skb;
unsigned int len;
struct sfq_slot *slot;
	/* Queue is full! Find the longest slot and drop a tail packet from it */
if (d > 1) {
x = q->dep[d].next;
slot = &q->slots[x];
drop:
skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
len = qdisc_pkt_len(skb);
slot->backlog -= len;
sfq_dec(q, x);
kfree_skb(skb);
sch->q.qlen--;
qdisc_qstats_drop(sch);
qdisc_qstats_backlog_dec(sch, skb);
return len;
}
if (d == 1) {
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
x = q->tail->next;
slot = &q->slots[x];
q->tail->next = slot->next;
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
goto drop;
}
return 0;
}
/* Is ECN parameter configured */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
return q->flags & TC_RED_ECN;
}
/* Should packets over max threshold just be marked */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}
static int sfq_headdrop(const struct sfq_sched_data *q)
{
return q->headdrop;
}
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int hash;
sfq_index x, qlen;
struct sfq_slot *slot;
int uninitialized_var(ret);
struct sk_buff *head;
int delta;
hash = sfq_classify(skb, sch, &ret);
if (hash == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
kfree_skb(skb);
return ret;
}
hash--;
x = q->ht[hash];
slot = &q->slots[x];
if (x == SFQ_EMPTY_SLOT) {
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS)
return qdisc_drop(skb, sch);
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
slot->backlog = 0; /* should already be 0 anyway... */
red_set_vars(&slot->vars);
goto enqueue;
}
if (q->red_parms) {
slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
&slot->vars,
slot->backlog);
switch (red_action(q->red_parms,
&slot->vars,
slot->vars.qavg)) {
case RED_DONT_MARK:
break;
case RED_PROB_MARK:
qdisc_qstats_overlimit(sch);
if (sfq_prob_mark(q)) {
/* We know we have at least one packet in queue */
if (sfq_headdrop(q) &&
INET_ECN_set_ce(slot->skblist_next)) {
q->stats.prob_mark_head++;
break;
}
if (INET_ECN_set_ce(skb)) {
q->stats.prob_mark++;
break;
}
}
q->stats.prob_drop++;
goto congestion_drop;
case RED_HARD_MARK:
qdisc_qstats_overlimit(sch);
if (sfq_hard_mark(q)) {
/* We know we have at least one packet in queue */
if (sfq_headdrop(q) &&
INET_ECN_set_ce(slot->skblist_next)) {
q->stats.forced_mark_head++;
break;
}
if (INET_ECN_set_ce(skb)) {
q->stats.forced_mark++;
break;
}
}
q->stats.forced_drop++;
goto congestion_drop;
}
}
if (slot->qlen >= q->maxdepth) {
congestion_drop:
if (!sfq_headdrop(q))
return qdisc_drop(skb, sch);
/* We know we have at least one packet in queue */
head = slot_dequeue_head(slot);
delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
sch->qstats.backlog -= delta;
slot->backlog -= delta;
qdisc_drop(head, sch);
slot_queue_add(slot, skb);
return NET_XMIT_CN;
}
enqueue:
qdisc_qstats_backlog_inc(sch, skb);
slot->backlog += qdisc_pkt_len(skb);
slot_queue_add(slot, skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
slot->next = x;
} else {
slot->next = q->tail->next;
q->tail->next = x;
}
/* We put this flow at the end of our flow list.
		 * This might sound unfair, making a new flow wait behind old ones,
		 * but otherwise we could end up servicing only new flows and freezing old ones.
*/
q->tail = slot;
/* We could use a bigger initial quantum for new flows */
slot->allot = q->scaled_quantum;
}
if (++sch->q.qlen <= q->limit)
return NET_XMIT_SUCCESS;
qlen = slot->qlen;
sfq_drop(sch);
/* Return Congestion Notification only if we dropped a packet
* from this flow.
*/
if (qlen != slot->qlen)
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this */
qdisc_tree_decrease_qlen(sch, 1);
return NET_XMIT_SUCCESS;
}
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
sfq_index a, next_a;
struct sfq_slot *slot;
/* No active slots */
if (q->tail == NULL)
return NULL;
next_slot:
a = q->tail->next;
slot = &q->slots[a];
if (slot->allot <= 0) {
q->tail = slot;
slot->allot += q->scaled_quantum;
goto next_slot;
}
skb = slot_dequeue_head(slot);
sfq_dec(q, a);
qdisc_bstats_update(sch, skb);
sch->q.qlen--;
qdisc_qstats_backlog_dec(sch, skb);
slot->backlog -= qdisc_pkt_len(skb);
/* Is the slot empty? */
if (slot->qlen == 0) {
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
next_a = slot->next;
if (a == next_a) {
q->tail = NULL; /* no more active slots */
return skb;
}
q->tail->next = next_a;
} else {
slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
}
return skb;
}
static void
sfq_reset(struct Qdisc *sch)
{
struct sk_buff *skb;
while ((skb = sfq_dequeue(sch)) != NULL)
kfree_skb(skb);
}
/*
* When q->perturbation is changed, we rehash all queued skbs
* to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
*/
static void sfq_rehash(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
int i;
struct sfq_slot *slot;
struct sk_buff_head list;
int dropped = 0;
__skb_queue_head_init(&list);
for (i = 0; i < q->maxflows; i++) {
slot = &q->slots[i];
if (!slot->qlen)
continue;
while (slot->qlen) {
skb = slot_dequeue_head(slot);
sfq_dec(q, i);
__skb_queue_tail(&list, skb);
}
slot->backlog = 0;
red_set_vars(&slot->vars);
q->ht[slot->hash] = SFQ_EMPTY_SLOT;
}
q->tail = NULL;
while ((skb = __skb_dequeue(&list)) != NULL) {
unsigned int hash = sfq_hash(q, skb);
sfq_index x = q->ht[hash];
slot = &q->slots[x];
if (x == SFQ_EMPTY_SLOT) {
x = q->dep[0].next; /* get a free slot */
if (x >= SFQ_MAX_FLOWS) {
drop:
qdisc_qstats_backlog_dec(sch, skb);
kfree_skb(skb);
dropped++;
continue;
}
q->ht[hash] = x;
slot = &q->slots[x];
slot->hash = hash;
}
if (slot->qlen >= q->maxdepth)
goto drop;
slot_queue_add(slot, skb);
if (q->red_parms)
slot->vars.qavg = red_calc_qavg(q->red_parms,
&slot->vars,
slot->backlog);
slot->backlog += qdisc_pkt_len(skb);
sfq_inc(q, x);
if (slot->qlen == 1) { /* The flow is new */
if (q->tail == NULL) { /* It is the first flow */
slot->next = x;
} else {
slot->next = q->tail->next;
q->tail->next = x;
}
q->tail = slot;
slot->allot = q->scaled_quantum;
}
}
sch->q.qlen -= dropped;
qdisc_tree_decrease_qlen(sch, dropped);
}
static void sfq_perturbation(unsigned long arg)
{
struct Qdisc *sch = (struct Qdisc *)arg;
struct sfq_sched_data *q = qdisc_priv(sch);
spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
spin_lock(root_lock);
q->perturbation = prandom_u32();
if (!q->filter_list && q->tail)
sfq_rehash(sch);
spin_unlock(root_lock);
if (q->perturb_period)
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = nla_data(opt);
struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
unsigned int qlen;
struct red_parms *p = NULL;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
ctl_v1 = nla_data(opt);
if (ctl->divisor &&
(!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
return -EINVAL;
if (ctl_v1 && ctl_v1->qth_min) {
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
}
sch_tree_lock(sch);
if (ctl->quantum) {
q->quantum = ctl->quantum;
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
}
q->perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {
q->divisor = ctl->divisor;
q->maxflows = min_t(u32, q->maxflows, q->divisor);
}
if (ctl_v1) {
if (ctl_v1->depth)
q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
if (p) {
swap(q->red_parms, p);
red_set_parms(q->red_parms,
ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog,
ctl_v1->Plog, ctl_v1->Scell_log,
NULL,
ctl_v1->max_P);
}
q->flags = ctl_v1->flags;
q->headdrop = ctl_v1->headdrop;
}
if (ctl->limit) {
q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
q->maxflows = min_t(u32, q->maxflows, q->limit);
}
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
sfq_drop(sch);
qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
del_timer(&q->perturb_timer);
if (q->perturb_period) {
mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
q->perturbation = prandom_u32();
}
sch_tree_unlock(sch);
kfree(p);
return 0;
}
static void *sfq_alloc(size_t sz)
{
void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
if (!ptr)
ptr = vmalloc(sz);
return ptr;
}
static void sfq_free(void *addr)
{
kvfree(addr);
}
static void sfq_destroy(struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
tcf_destroy_chain(&q->filter_list);
q->perturb_period = 0;
del_timer_sync(&q->perturb_timer);
sfq_free(q->ht);
sfq_free(q->slots);
kfree(q->red_parms);
}
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
struct sfq_sched_data *q = qdisc_priv(sch);
int i;
q->perturb_timer.function = sfq_perturbation;
q->perturb_timer.data = (unsigned long)sch;
init_timer_deferrable(&q->perturb_timer);
for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
q->dep[i].next = i + SFQ_MAX_FLOWS;
q->dep[i].prev = i + SFQ_MAX_FLOWS;
}
q->limit = SFQ_MAX_DEPTH;
q->maxdepth = SFQ_MAX_DEPTH;
q->cur_depth = 0;
q->tail = NULL;
q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
q->maxflows = SFQ_DEFAULT_FLOWS;
q->quantum = psched_mtu(qdisc_dev(sch));
q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
q->perturb_period = 0;
q->perturbation = prandom_u32();
if (opt) {
int err = sfq_change(sch, opt);
if (err)
return err;
}
q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
if (!q->ht || !q->slots) {
sfq_destroy(sch);
return -ENOMEM;
}
for (i = 0; i < q->divisor; i++)
q->ht[i] = SFQ_EMPTY_SLOT;
for (i = 0; i < q->maxflows; i++) {
slot_queue_init(&q->slots[i]);
sfq_link(q, i);
}
if (q->limit >= 1)
sch->flags |= TCQ_F_CAN_BYPASS;
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned char *b = skb_tail_pointer(skb);
struct tc_sfq_qopt_v1 opt;
struct red_parms *p = q->red_parms;
memset(&opt, 0, sizeof(opt));
opt.v0.quantum = q->quantum;
opt.v0.perturb_period = q->perturb_period / HZ;
opt.v0.limit = q->limit;
opt.v0.divisor = q->divisor;
opt.v0.flows = q->maxflows;
opt.depth = q->maxdepth;
opt.headdrop = q->headdrop;
if (p) {
opt.qth_min = p->qth_min >> p->Wlog;
opt.qth_max = p->qth_max >> p->Wlog;
opt.Wlog = p->Wlog;
opt.Plog = p->Plog;
opt.Scell_log = p->Scell_log;
opt.max_P = p->max_P;
}
memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
opt.flags = q->flags;
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
return NULL;
}
static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
return 0;
}
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
u32 classid)
{
/* we cannot bypass queue discipline anymore */
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
}
static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}
static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
unsigned long cl)
{
struct sfq_sched_data *q = qdisc_priv(sch);
if (cl)
return NULL;
return &q->filter_list;
}
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
tcm->tcm_handle |= TC_H_MIN(cl);
return 0;
}
static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
struct gnet_dump *d)
{
struct sfq_sched_data *q = qdisc_priv(sch);
sfq_index idx = q->ht[cl - 1];
struct gnet_stats_queue qs = { 0 };
struct tc_sfq_xstats xstats = { 0 };
if (idx != SFQ_EMPTY_SLOT) {
const struct sfq_slot *slot = &q->slots[idx];
xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
qs.qlen = slot->qlen;
qs.backlog = slot->backlog;
}
if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
return -1;
return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct sfq_sched_data *q = qdisc_priv(sch);
unsigned int i;
if (arg->stop)
return;
for (i = 0; i < q->divisor; i++) {
if (q->ht[i] == SFQ_EMPTY_SLOT ||
arg->count < arg->skip) {
arg->count++;
continue;
}
if (arg->fn(sch, i + 1, arg) < 0) {
arg->stop = 1;
break;
}
arg->count++;
}
}
static const struct Qdisc_class_ops sfq_class_ops = {
.leaf = sfq_leaf,
.get = sfq_get,
.put = sfq_put,
.tcf_chain = sfq_find_tcf,
.bind_tcf = sfq_bind,
.unbind_tcf = sfq_put,
.dump = sfq_dump_class,
.dump_stats = sfq_dump_class_stats,
.walk = sfq_walk,
};
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
.cl_ops = &sfq_class_ops,
.id = "sfq",
.priv_size = sizeof(struct sfq_sched_data),
.enqueue = sfq_enqueue,
.dequeue = sfq_dequeue,
.peek = qdisc_peek_dequeued,
.drop = sfq_drop,
.init = sfq_init,
.reset = sfq_reset,
.destroy = sfq_destroy,
.change = NULL,
.dump = sfq_dump,
.owner = THIS_MODULE,
};
static int __init sfq_module_init(void)
{
return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");

579
net/sched/sch_tbf.c Normal file
View file

@ -0,0 +1,579 @@
/*
* net/sched/sch_tbf.c Token Bucket Filter queue.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
* original idea by Martin Devera
*
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
/* Simple Token Bucket Filter.
=======================================
SOURCE.
-------
None.
Description.
------------
A data flow obeys TBF with rate R and depth B, if for any
time interval t_i...t_f the number of transmitted bits
does not exceed B + R*(t_f-t_i).
Packetized version of this definition:
The sequence of packets of sizes s_i served at moments t_i
obeys TBF, if for any i<=k:
s_i+....+s_k <= B + R*(t_k - t_i)
Algorithm.
----------
Let N(t_i) be B/R initially and N(t) grow continuously with time as:
N(t+delta) = min{B/R, N(t) + delta}
If the first packet in queue has length S, it may be
transmitted only at the time t_* when S/R <= N(t_*),
and in this case N(t) jumps:
N(t_* + 0) = N(t_* - 0) - S/R.
Actually, QoS requires two TBF to be applied to a data stream.
One of them controls steady state burst size, another
one with rate P (peak rate) and depth M (equal to link MTU)
limits bursts at a smaller time scale.
It is easy to see that P>R, and B>M. If P is infinity, this double
TBF is equivalent to a single one.
When TBF works in reshaping mode, latency is estimated as:
lat = max ((L-B)/R, (L-M)/P)
NOTES.
------
If TBF throttles, it starts a watchdog timer, which will wake it up
when it is ready to transmit.
Note that the minimal timer resolution is 1/HZ.
If no new packets arrive during this period,
	or if the device is not awakened by an EOI for some previous packet,
TBF can stop its activity for 1/HZ.
This means, that with depth B, the maximal rate is
R_crit = B*HZ
	E.g., for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.
	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)
With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc; if the inner qdisc is
	changed, the limit is no longer effective.
*/
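/* Illustrative sketch (hypothetical helper): the double bucket test from
 * the algorithm above, as implemented in tbf_dequeue() below. Token
 * counts are kept in nanoseconds of transmission time; a packet may be
 * sent only when both buckets stay non-negative after paying for it.
 * When no peak rate is configured, ptokens/len_at_peak_ns drop out of
 * the real code (ptoks is simply 0 there).
 */
static inline bool tbf_would_pass(s64 now, s64 t_c, s64 tokens, s64 ptokens,
				  s64 buffer, s64 mtu,
				  s64 len_at_rate_ns, s64 len_at_peak_ns)
{
	s64 toks = min_t(s64, now - t_c, buffer);	/* credit earned since t_c */
	s64 ptoks = min_t(s64, toks + ptokens, mtu) - len_at_peak_ns;

	toks = min_t(s64, toks + tokens, buffer) - len_at_rate_ns;
	return (toks | ptoks) >= 0;			/* both buckets non-negative */
}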
struct tbf_sched_data {
/* Parameters */
u32 limit; /* Maximal length of backlog: bytes */
u32 max_size;
s64 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */
s64 mtu;
struct psched_ratecfg rate;
struct psched_ratecfg peak;
/* Variables */
s64 tokens; /* Current number of B tokens */
s64 ptokens; /* Current number of P tokens */
s64 t_c; /* Time check-point */
struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */
struct qdisc_watchdog watchdog; /* Watchdog timer */
};
/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in a given time.
*/
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
u64 time_in_ns)
{
/* The formula is :
* len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
*/
u64 len = time_in_ns * r->rate_bytes_ps;
do_div(len, NSEC_PER_SEC);
if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
do_div(len, 53);
len = len * 48;
}
if (len > r->overhead)
len -= r->overhead;
else
len = 0;
return len;
}
/*
* Return length of individual segments of a gso packet,
* including all headers (MAC, IP, TCP/UDP)
*/
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
return hdr_len + skb_gso_transport_seglen(skb);
}
/* GSO packet is too big, segment it so that tbf can transmit
* each segment in time
*/
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
int ret, nb;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs))
return qdisc_reshape_fail(skb, sch);
nb = 0;
while (segs) {
nskb = segs->next;
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
ret = qdisc_enqueue(segs, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
} else {
nb++;
}
segs = nskb;
}
sch->q.qlen += nb;
if (nb > 1)
qdisc_tree_decrease_qlen(sch, 1 - nb);
consume_skb(skb);
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
int ret;
if (qdisc_pkt_len(skb) > q->max_size) {
if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
return tbf_segment(skb, sch);
return qdisc_reshape_fail(skb, sch);
}
ret = qdisc_enqueue(skb, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
qdisc_qstats_drop(sch);
return ret;
}
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
static unsigned int tbf_drop(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
unsigned int len = 0;
if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
sch->q.qlen--;
qdisc_qstats_drop(sch);
}
return len;
}
static bool tbf_peak_present(const struct tbf_sched_data *q)
{
return q->peak.rate_bytes_ps;
}
static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
skb = q->qdisc->ops->peek(q->qdisc);
if (skb) {
s64 now;
s64 toks;
s64 ptoks = 0;
unsigned int len = qdisc_pkt_len(skb);
now = ktime_get_ns();
toks = min_t(s64, now - q->t_c, q->buffer);
if (tbf_peak_present(q)) {
ptoks = toks + q->ptokens;
if (ptoks > q->mtu)
ptoks = q->mtu;
ptoks -= (s64) psched_l2t_ns(&q->peak, len);
}
toks += q->tokens;
if (toks > q->buffer)
toks = q->buffer;
toks -= (s64) psched_l2t_ns(&q->rate, len);
if ((toks|ptoks) >= 0) {
skb = qdisc_dequeue_peeked(q->qdisc);
if (unlikely(!skb))
return NULL;
q->t_c = now;
q->tokens = toks;
q->ptokens = ptoks;
sch->q.qlen--;
qdisc_unthrottled(sch);
qdisc_bstats_update(sch, skb);
return skb;
}
qdisc_watchdog_schedule_ns(&q->watchdog,
now + max_t(long, -toks, -ptoks),
true);
/* Maybe we have a shorter packet in the queue,
		   which could be sent now. It sounds cool,
		   but it is wrong in principle.
We MUST NOT reorder packets under these circumstances.
Really, if we split the flow into independent
subflows, it would be a very good solution.
This is the main idea of all FQ algorithms
(cf. CSZ, HPFQ, HFSC)
*/
qdisc_qstats_overlimit(sch);
}
return NULL;
}
static void tbf_reset(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_reset(q->qdisc);
sch->q.qlen = 0;
q->t_c = ktime_get_ns();
q->tokens = q->buffer;
q->ptokens = q->mtu;
qdisc_watchdog_cancel(&q->watchdog);
}
static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
[TCA_TBF_PARMS] = { .len = sizeof(struct tc_tbf_qopt) },
[TCA_TBF_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
[TCA_TBF_RATE64] = { .type = NLA_U64 },
[TCA_TBF_PRATE64] = { .type = NLA_U64 },
[TCA_TBF_BURST] = { .type = NLA_U32 },
[TCA_TBF_PBURST] = { .type = NLA_U32 },
};
static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
int err;
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_TBF_MAX + 1];
struct tc_tbf_qopt *qopt;
struct Qdisc *child = NULL;
struct psched_ratecfg rate;
struct psched_ratecfg peak;
u64 max_size;
s64 buffer, mtu;
u64 rate64 = 0, prate64 = 0;
err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
if (err < 0)
return err;
err = -EINVAL;
if (tb[TCA_TBF_PARMS] == NULL)
goto done;
qopt = nla_data(tb[TCA_TBF_PARMS]);
if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
tb[TCA_TBF_RTAB]));
if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
tb[TCA_TBF_PTAB]));
buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);
if (tb[TCA_TBF_RATE64])
rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
psched_ratecfg_precompute(&rate, &qopt->rate, rate64);
if (tb[TCA_TBF_BURST]) {
max_size = nla_get_u32(tb[TCA_TBF_BURST]);
buffer = psched_l2t_ns(&rate, max_size);
} else {
max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
}
if (qopt->peakrate.rate) {
if (tb[TCA_TBF_PRATE64])
prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equals to rate %llu !\n",
peak.rate_bytes_ps, rate.rate_bytes_ps);
err = -EINVAL;
goto done;
}
if (tb[TCA_TBF_PBURST]) {
u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
max_size = min_t(u32, max_size, pburst);
mtu = psched_l2t_ns(&peak, pburst);
} else {
max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
}
} else {
memset(&peak, 0, sizeof(peak));
}
if (max_size < psched_mtu(qdisc_dev(sch)))
pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
max_size, qdisc_dev(sch)->name,
psched_mtu(qdisc_dev(sch)));
if (!max_size) {
err = -EINVAL;
goto done;
}
if (q->qdisc != &noop_qdisc) {
err = fifo_set_limit(q->qdisc, qopt->limit);
if (err)
goto done;
} else if (qopt->limit > 0) {
child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
if (IS_ERR(child)) {
err = PTR_ERR(child);
goto done;
}
}
sch_tree_lock(sch);
if (child) {
qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
q->limit = qopt->limit;
if (tb[TCA_TBF_PBURST])
q->mtu = mtu;
else
q->mtu = PSCHED_TICKS2NS(qopt->mtu);
q->max_size = max_size;
if (tb[TCA_TBF_BURST])
q->buffer = buffer;
else
q->buffer = PSCHED_TICKS2NS(qopt->buffer);
q->tokens = q->buffer;
q->ptokens = q->mtu;
memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));
sch_tree_unlock(sch);
err = 0;
done:
return err;
}
static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
struct tbf_sched_data *q = qdisc_priv(sch);
if (opt == NULL)
return -EINVAL;
q->t_c = ktime_get_ns();
qdisc_watchdog_init(&q->watchdog, sch);
q->qdisc = &noop_qdisc;
return tbf_change(sch, opt);
}
static void tbf_destroy(struct Qdisc *sch)
{
struct tbf_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
qdisc_destroy(q->qdisc);
}
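/* Dump the configuration back to netlink. Rates that do not fit the
 * legacy 32-bit fields are additionally emitted as TCA_TBF_RATE64 /
 * TCA_TBF_PRATE64 attributes.
 */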
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct tbf_sched_data *q = qdisc_priv(sch);
struct nlattr *nest;
struct tc_tbf_qopt opt;
sch->qstats.backlog = q->qdisc->qstats.backlog;
nest = nla_nest_start(skb, TCA_OPTIONS);
if (nest == NULL)
goto nla_put_failure;
opt.limit = q->limit;
psched_ratecfg_getrate(&opt.rate, &q->rate);
if (tbf_peak_present(q))
psched_ratecfg_getrate(&opt.peakrate, &q->peak);
else
memset(&opt.peakrate, 0, sizeof(opt.peakrate));
opt.mtu = PSCHED_NS2TICKS(q->mtu);
opt.buffer = PSCHED_NS2TICKS(q->buffer);
if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
goto nla_put_failure;
if (tbf_peak_present(q) &&
q->peak.rate_bytes_ps >= (1ULL << 32) &&
nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
goto nla_put_failure;
return nla_nest_end(skb, nest);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct tbf_sched_data *q = qdisc_priv(sch);
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
return 0;
}
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old)
{
struct tbf_sched_data *q = qdisc_priv(sch);
if (new == NULL)
new = &noop_qdisc;
sch_tree_lock(sch);
*old = q->qdisc;
q->qdisc = new;
qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
qdisc_reset(*old);
sch_tree_unlock(sch);
return 0;
}
static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
struct tbf_sched_data *q = qdisc_priv(sch);
return q->qdisc;
}
static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
return 1;
}
static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}
static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
static const struct Qdisc_class_ops tbf_class_ops = {
.graft = tbf_graft,
.leaf = tbf_leaf,
.get = tbf_get,
.put = tbf_put,
.walk = tbf_walk,
.dump = tbf_dump_class,
};
static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
.next = NULL,
.cl_ops = &tbf_class_ops,
.id = "tbf",
.priv_size = sizeof(struct tbf_sched_data),
.enqueue = tbf_enqueue,
.dequeue = tbf_dequeue,
.peek = qdisc_peek_dequeued,
.drop = tbf_drop,
.init = tbf_init,
.reset = tbf_reset,
.destroy = tbf_destroy,
.change = tbf_change,
.dump = tbf_dump,
.owner = THIS_MODULE,
};
static int __init tbf_module_init(void)
{
return register_qdisc(&tbf_qdisc_ops);
}
static void __exit tbf_module_exit(void)
{
unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");

535
net/sched/sch_teql.c Normal file

@@ -0,0 +1,535 @@
/* net/sched/sch_teql.c "True" (or "trivial") link equalizer.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/moduleparam.h>
#include <net/dst.h>
#include <net/neighbour.h>
#include <net/pkt_sched.h>
/*
How to set it up.
-----------------
After loading this module you will find a new device, teqlN,
and a new qdisc with the same name. To join a slave to the equalizer,
simply attach this qdisc to the device, e.g.:
# tc qdisc add dev eth0 root teql0
# tc qdisc add dev eth1 root teql0
That's all. Full PnP 8)
Applicability.
--------------
1. Slave devices MUST be active devices, i.e., they must raise the tbusy
signal and generate EOI events. If you want to equalize virtual devices
like tunnels, use a normal eql device.
2. This device puts no limitations on physical slave characteristics,
e.g. it will equalize a 9600 baud line and 100 Mb Ethernet perfectly :-)
Certainly, a large difference in link speeds will make the resulting
equalized link unusable, because of massive packet reordering.
I estimate the useful upper bound on the difference to be about 10x.
3. If a slave requires address resolution, only protocols using the
neighbour cache (IPv4/IPv6) will work over the equalized link.
Other protocols are still allowed to use the slave device directly,
which will not break load balancing, though native slave
traffic will have the highest priority. */
struct teql_master {
struct Qdisc_ops qops;
struct net_device *dev;
struct Qdisc *slaves;
struct list_head master_list;
unsigned long tx_bytes;
unsigned long tx_packets;
unsigned long tx_errors;
unsigned long tx_dropped;
};
struct teql_sched_data {
struct Qdisc *next;
struct teql_master *m;
struct sk_buff_head q;
};
#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
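/* The slaves of one master form a circular, singly linked list
 * threaded through each slave qdisc's private data (NEXT_SLAVE());
 * master->slaves is the current position in that ring, or NULL.
 */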
/* "teql*" qdisc routines */
static int
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_sched_data *q = qdisc_priv(sch);
if (q->q.qlen < dev->tx_queue_len) {
__skb_queue_tail(&q->q, skb);
return NET_XMIT_SUCCESS;
}
return qdisc_drop(skb, sch);
}
static struct sk_buff *
teql_dequeue(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
struct netdev_queue *dat_queue;
struct sk_buff *skb;
struct Qdisc *q;
skb = __skb_dequeue(&dat->q);
dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
q = rcu_dereference_bh(dat_queue->qdisc);
if (skb == NULL) {
struct net_device *m = qdisc_dev(q);
if (m) {
dat->m->slaves = sch;
netif_wake_queue(m);
}
} else {
qdisc_bstats_update(sch, skb);
}
sch->q.qlen = dat->q.qlen + q->q.qlen;
return skb;
}
static struct sk_buff *
teql_peek(struct Qdisc *sch)
{
/* teql is meant to be used as root qdisc */
return NULL;
}
static inline void
teql_neigh_release(struct neighbour *n)
{
if (n)
neigh_release(n);
}
static void
teql_reset(struct Qdisc *sch)
{
struct teql_sched_data *dat = qdisc_priv(sch);
skb_queue_purge(&dat->q);
sch->q.qlen = 0;
}
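/* Remove this qdisc from its master's slave ring; when the last
 * slave disappears, reset the master device's own root qdisc too.
 */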
static void
teql_destroy(struct Qdisc *sch)
{
struct Qdisc *q, *prev;
struct teql_sched_data *dat = qdisc_priv(sch);
struct teql_master *master = dat->m;
prev = master->slaves;
if (prev) {
do {
q = NEXT_SLAVE(prev);
if (q == sch) {
NEXT_SLAVE(prev) = NEXT_SLAVE(q);
if (q == master->slaves) {
master->slaves = NEXT_SLAVE(q);
if (q == master->slaves) {
struct netdev_queue *txq;
spinlock_t *root_lock;
txq = netdev_get_tx_queue(master->dev, 0);
master->slaves = NULL;
root_lock = qdisc_root_sleeping_lock(rtnl_dereference(txq->qdisc));
spin_lock_bh(root_lock);
qdisc_reset(rtnl_dereference(txq->qdisc));
spin_unlock_bh(root_lock);
}
}
skb_queue_purge(&dat->q);
break;
}
} while ((prev = q) != master->slaves);
}
}
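/* Admit a device into the equalizer. While the master is UP, a new
 * slave must not weaken it (it needs at least the master's flags and
 * an MTU no smaller); while the master is down, the master instead
 * degrades its own flags and MTU to the least capable slave.
 */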
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
{
struct net_device *dev = qdisc_dev(sch);
struct teql_master *m = (struct teql_master *)sch->ops;
struct teql_sched_data *q = qdisc_priv(sch);
if (dev->hard_header_len > m->dev->hard_header_len)
return -EINVAL;
if (m->dev == dev)
return -ELOOP;
q->m = m;
skb_queue_head_init(&q->q);
if (m->slaves) {
if (m->dev->flags & IFF_UP) {
if ((m->dev->flags & IFF_POINTOPOINT &&
!(dev->flags & IFF_POINTOPOINT)) ||
(m->dev->flags & IFF_BROADCAST &&
!(dev->flags & IFF_BROADCAST)) ||
(m->dev->flags & IFF_MULTICAST &&
!(dev->flags & IFF_MULTICAST)) ||
dev->mtu < m->dev->mtu)
return -EINVAL;
} else {
if (!(dev->flags&IFF_POINTOPOINT))
m->dev->flags &= ~IFF_POINTOPOINT;
if (!(dev->flags&IFF_BROADCAST))
m->dev->flags &= ~IFF_BROADCAST;
if (!(dev->flags&IFF_MULTICAST))
m->dev->flags &= ~IFF_MULTICAST;
if (dev->mtu < m->dev->mtu)
m->dev->mtu = dev->mtu;
}
q->next = NEXT_SLAVE(m->slaves);
NEXT_SLAVE(m->slaves) = sch;
} else {
q->next = sch;
m->slaves = sch;
m->dev->mtu = dev->mtu;
m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
}
return 0;
}
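/* Resolve the L2 destination for skb on one slave via the neighbour
 * cache. Returns 0 with the hard header rebuilt, a negative errno on
 * failure (-EAGAIN asks the caller to retry with an skb_res), or 1
 * once skb_res has been queued on the neighbour awaiting resolution.
 */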
static int
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
struct net_device *dev, struct netdev_queue *txq,
struct dst_entry *dst)
{
struct neighbour *n;
int err = 0;
n = dst_neigh_lookup_skb(dst, skb);
if (!n)
return -ENOENT;
if (dst->dev != dev) {
struct neighbour *mn;
mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
neigh_release(n);
if (IS_ERR(mn))
return PTR_ERR(mn);
n = mn;
}
if (neigh_event_send(n, skb_res) == 0) {
int err;
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, n, dev);
err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
NULL, skb->len);
if (err < 0)
err = -EINVAL;
} else {
err = (skb_res == NULL) ? -EAGAIN : 1;
}
neigh_release(n);
return err;
}
static inline int teql_resolve(struct sk_buff *skb,
struct sk_buff *skb_res,
struct net_device *dev,
struct netdev_queue *txq)
{
struct dst_entry *dst = skb_dst(skb);
int res;
if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
return -ENODEV;
if (!dev->header_ops || !dst)
return 0;
rcu_read_lock();
res = __teql_resolve(skb, skb_res, dev, txq, dst);
rcu_read_unlock();
return res;
}
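/* Master transmit path: walk the slave ring round-robin from
 * master->slaves and hand the skb to the first running slave whose
 * address resolves and whose tx queue can be locked; on success the
 * ring is advanced so the next packet starts at the next slave.
 */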
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct teql_master *master = netdev_priv(dev);
struct Qdisc *start, *q;
int busy;
int nores;
int subq = skb_get_queue_mapping(skb);
struct sk_buff *skb_res = NULL;
start = master->slaves;
restart:
nores = 0;
busy = 0;
q = start;
if (!q)
goto drop;
do {
struct net_device *slave = qdisc_dev(q);
struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
if (slave_txq->qdisc_sleeping != q)
continue;
if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
!netif_running(slave)) {
busy = 1;
continue;
}
switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
case 0:
if (__netif_tx_trylock(slave_txq)) {
unsigned int length = qdisc_pkt_len(skb);
if (!netif_xmit_frozen_or_stopped(slave_txq) &&
netdev_start_xmit(skb, slave, slave_txq, false) ==
NETDEV_TX_OK) {
__netif_tx_unlock(slave_txq);
master->slaves = NEXT_SLAVE(q);
netif_wake_queue(dev);
master->tx_packets++;
master->tx_bytes += length;
return NETDEV_TX_OK;
}
__netif_tx_unlock(slave_txq);
}
if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
busy = 1;
break;
case 1:
master->slaves = NEXT_SLAVE(q);
return NETDEV_TX_OK;
default:
nores = 1;
break;
}
__skb_pull(skb, skb_network_offset(skb));
} while ((q = NEXT_SLAVE(q)) != start);
if (nores && skb_res == NULL) {
skb_res = skb;
goto restart;
}
if (busy) {
netif_stop_queue(dev);
return NETDEV_TX_BUSY;
}
master->tx_errors++;
drop:
master->tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
static int teql_master_open(struct net_device *dev)
{
struct Qdisc *q;
struct teql_master *m = netdev_priv(dev);
int mtu = 0xFFFE;
unsigned int flags = IFF_NOARP | IFF_MULTICAST;
if (m->slaves == NULL)
return -EUNATCH;
flags = FMASK;
q = m->slaves;
do {
struct net_device *slave = qdisc_dev(q);
if (slave == NULL)
return -EUNATCH;
if (slave->mtu < mtu)
mtu = slave->mtu;
if (slave->hard_header_len > LL_MAX_HEADER)
return -EINVAL;
/* If all the slaves are BROADCAST, the master is BROADCAST;
if all the slaves are PtP, the master is PtP;
otherwise, the master is NBMA.
*/
if (!(slave->flags&IFF_POINTOPOINT))
flags &= ~IFF_POINTOPOINT;
if (!(slave->flags&IFF_BROADCAST))
flags &= ~IFF_BROADCAST;
if (!(slave->flags&IFF_MULTICAST))
flags &= ~IFF_MULTICAST;
} while ((q = NEXT_SLAVE(q)) != m->slaves);
m->dev->mtu = mtu;
m->dev->flags = (m->dev->flags&~FMASK) | flags;
netif_start_queue(m->dev);
return 0;
}
static int teql_master_close(struct net_device *dev)
{
netif_stop_queue(dev);
return 0;
}
static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
struct rtnl_link_stats64 *stats)
{
struct teql_master *m = netdev_priv(dev);
stats->tx_packets = m->tx_packets;
stats->tx_bytes = m->tx_bytes;
stats->tx_errors = m->tx_errors;
stats->tx_dropped = m->tx_dropped;
return stats;
}
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
struct teql_master *m = netdev_priv(dev);
struct Qdisc *q;
if (new_mtu < 68)
return -EINVAL;
q = m->slaves;
if (q) {
do {
if (new_mtu > qdisc_dev(q)->mtu)
return -EINVAL;
} while ((q = NEXT_SLAVE(q)) != m->slaves);
}
dev->mtu = new_mtu;
return 0;
}
static const struct net_device_ops teql_netdev_ops = {
.ndo_open = teql_master_open,
.ndo_stop = teql_master_close,
.ndo_start_xmit = teql_master_xmit,
.ndo_get_stats64 = teql_master_stats64,
.ndo_change_mtu = teql_master_mtu,
};
static __init void teql_master_setup(struct net_device *dev)
{
struct teql_master *master = netdev_priv(dev);
struct Qdisc_ops *ops = &master->qops;
master->dev = dev;
ops->priv_size = sizeof(struct teql_sched_data);
ops->enqueue = teql_enqueue;
ops->dequeue = teql_dequeue;
ops->peek = teql_peek;
ops->init = teql_qdisc_init;
ops->reset = teql_reset;
ops->destroy = teql_destroy;
ops->owner = THIS_MODULE;
dev->netdev_ops = &teql_netdev_ops;
dev->type = ARPHRD_VOID;
dev->mtu = 1500;
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
netif_keep_dst(dev);
}
static LIST_HEAD(master_dev_list);
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
static int __init teql_init(void)
{
int i;
int err = -ENODEV;
for (i = 0; i < max_equalizers; i++) {
struct net_device *dev;
struct teql_master *master;
dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
NET_NAME_UNKNOWN, teql_master_setup);
if (!dev) {
err = -ENOMEM;
break;
}
if ((err = register_netdev(dev))) {
free_netdev(dev);
break;
}
master = netdev_priv(dev);
strlcpy(master->qops.id, dev->name, IFNAMSIZ);
err = register_qdisc(&master->qops);
if (err) {
unregister_netdev(dev);
free_netdev(dev);
break;
}
list_add_tail(&master->master_list, &master_dev_list);
}
return i ? 0 : err;
}
static void __exit teql_exit(void)
{
struct teql_master *master, *nxt;
list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
list_del(&master->master_list);
unregister_qdisc(&master->qops);
unregister_netdev(master->dev);
free_netdev(master->dev);
}
}
module_init(teql_init);
module_exit(teql_exit);
MODULE_LICENSE("GPL");