Fixed MTP to work with TWRP

This commit is contained in:
awab228 2018-06-19 23:16:04 +02:00
commit f6dfaef42e
50820 changed files with 20846062 additions and 0 deletions

62
net/dccp/Kconfig Normal file
View file

@ -0,0 +1,62 @@
menuconfig IP_DCCP
tristate "The DCCP Protocol"
depends on INET
---help---
Datagram Congestion Control Protocol (RFC 4340)
From http://www.ietf.org/rfc/rfc4340.txt:
The Datagram Congestion Control Protocol (DCCP) is a transport
protocol that implements bidirectional, unicast connections of
congestion-controlled, unreliable datagrams. It should be suitable
for use by applications such as streaming media, Internet telephony,
and on-line games.
To compile this protocol support as a module, choose M here: the
module will be called dccp.
If in doubt, say N.
if IP_DCCP
config INET_DCCP_DIAG
depends on INET_DIAG
def_tristate y if (IP_DCCP = y && INET_DIAG = y)
def_tristate m
source "net/dccp/ccids/Kconfig"
menu "DCCP Kernel Hacking"
depends on DEBUG_KERNEL=y
config IP_DCCP_DEBUG
bool "DCCP debug messages"
---help---
Only use this if you're hacking DCCP.
When compiling DCCP as a module, this debugging output can be toggled
by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
Just say N.
config NET_DCCPPROBE
tristate "DCCP connection probing"
depends on PROC_FS && KPROBES
---help---
This module allows for capturing the changes to DCCP connection
state in response to incoming packets. It is used for debugging
DCCP congestion avoidance modules. If you don't understand
what was just said, you don't need it: say N.
Documentation on how to use DCCP connection probing can be found
at:
http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
To compile this code as a module, choose M here: the
module will be called dccp_probe.
endmenu
endif # IP_DDCP

28
net/dccp/Makefile Normal file
View file

@ -0,0 +1,28 @@
obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
qpolicy.o
#
# CCID algorithms to be used by dccp.ko
#
# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
dccp-y += ccids/ccid2.o ackvec.o
dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
ccids/lib/tfrc_equation.o \
ccids/lib/packet_history.o \
ccids/lib/loss_interval.o
dccp_ipv4-y := ipv4.o
# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
dccp_ipv6-y := ipv6.o
obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
dccp-$(CONFIG_SYSCTL) += sysctl.o
dccp_diag-y := diag.o
dccp_probe-y := probe.o

409
net/dccp/ackvec.c Normal file
View file

@ -0,0 +1,409 @@
/*
* net/dccp/ackvec.c
*
* An implementation of Ack Vectors for the DCCP protocol
* Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License;
*/
#include "dccp.h"
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/export.h>
static struct kmem_cache *dccp_ackvec_slab;
static struct kmem_cache *dccp_ackvec_record_slab;
struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
{
struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
if (av != NULL) {
av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
INIT_LIST_HEAD(&av->av_records);
}
return av;
}
static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
{
struct dccp_ackvec_record *cur, *next;
list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
kmem_cache_free(dccp_ackvec_record_slab, cur);
INIT_LIST_HEAD(&av->av_records);
}
void dccp_ackvec_free(struct dccp_ackvec *av)
{
if (likely(av != NULL)) {
dccp_ackvec_purge_records(av);
kmem_cache_free(dccp_ackvec_slab, av);
}
}
/**
* dccp_ackvec_update_records - Record information about sent Ack Vectors
* @av: Ack Vector records to update
* @seqno: Sequence number of the packet carrying the Ack Vector just sent
* @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
*/
int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
{
struct dccp_ackvec_record *avr;
avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
if (avr == NULL)
return -ENOBUFS;
avr->avr_ack_seqno = seqno;
avr->avr_ack_ptr = av->av_buf_head;
avr->avr_ack_ackno = av->av_buf_ackno;
avr->avr_ack_nonce = nonce_sum;
avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
/*
* When the buffer overflows, we keep no more than one record. This is
* the simplest way of disambiguating sender-Acks dating from before the
* overflow from sender-Acks which refer to after the overflow; a simple
* solution is preferable here since we are handling an exception.
*/
if (av->av_overflow)
dccp_ackvec_purge_records(av);
/*
* Since GSS is incremented for each packet, the list is automatically
* arranged in descending order of @ack_seqno.
*/
list_add(&avr->avr_node, &av->av_records);
dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
(unsigned long long)avr->avr_ack_seqno,
(unsigned long long)avr->avr_ack_ackno,
avr->avr_ack_runlen);
return 0;
}
static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
const u64 ackno)
{
struct dccp_ackvec_record *avr;
/*
* Exploit that records are inserted in descending order of sequence
* number, start with the oldest record first. If @ackno is `before'
* the earliest ack_ackno, the packet is too old to be considered.
*/
list_for_each_entry_reverse(avr, av_list, avr_node) {
if (avr->avr_ack_seqno == ackno)
return avr;
if (before48(ackno, avr->avr_ack_seqno))
break;
}
return NULL;
}
/*
* Buffer index and length computation using modulo-buffersize arithmetic.
* Note that, as pointers move from right to left, head is `before' tail.
*/
static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
{
return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
}
static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
{
return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
}
u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
{
if (unlikely(av->av_overflow))
return DCCPAV_MAX_ACKVEC_LEN;
return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
}
/**
* dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
* @av: non-empty buffer to update
* @distance: negative or zero distance of @seqno from buf_ackno downward
* @seqno: the (old) sequence number whose record is to be updated
* @state: state in which packet carrying @seqno was received
*/
static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
u64 seqno, enum dccp_ackvec_states state)
{
u16 ptr = av->av_buf_head;
BUG_ON(distance > 0);
if (unlikely(dccp_ackvec_is_empty(av)))
return;
do {
u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
if (distance + runlen >= 0) {
/*
* Only update the state if packet has not been received
* yet. This is OK as per the second table in RFC 4340,
* 11.4.1; i.e. here we are using the following table:
* RECEIVED
* 0 1 3
* S +---+---+---+
* T 0 | 0 | 0 | 0 |
* O +---+---+---+
* R 1 | 1 | 1 | 1 |
* E +---+---+---+
* D 3 | 0 | 1 | 3 |
* +---+---+---+
* The "Not Received" state was set by reserve_seats().
*/
if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
av->av_buf[ptr] = state;
else
dccp_pr_debug("Not changing %llu state to %u\n",
(unsigned long long)seqno, state);
break;
}
distance += runlen + 1;
ptr = __ackvec_idx_add(ptr, 1);
} while (ptr != av->av_buf_tail);
}
/* Mark @num entries after buf_head as "Not yet received". */
static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
{
u16 start = __ackvec_idx_add(av->av_buf_head, 1),
len = DCCPAV_MAX_ACKVEC_LEN - start;
/* check for buffer wrap-around */
if (num > len) {
memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
start = 0;
num -= len;
}
if (num)
memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
}
/**
* dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
* @av: container of buffer to update (can be empty or non-empty)
* @num_packets: number of packets to register (must be >= 1)
* @seqno: sequence number of the first packet in @num_packets
* @state: state in which packet carrying @seqno was received
*/
static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
u64 seqno, enum dccp_ackvec_states state)
{
u32 num_cells = num_packets;
if (num_packets > DCCPAV_BURST_THRESH) {
u32 lost_packets = num_packets - 1;
DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
/*
* We received 1 packet and have a loss of size "num_packets-1"
* which we squeeze into num_cells-1 rather than reserving an
* entire byte for each lost packet.
* The reason is that the vector grows in O(burst_length); when
* it grows too large there will no room left for the payload.
* This is a trade-off: if a few packets out of the burst show
* up later, their state will not be changed; it is simply too
* costly to reshuffle/reallocate/copy the buffer each time.
* Should such problems persist, we will need to switch to a
* different underlying data structure.
*/
for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
lost_packets -= len;
}
}
if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
av->av_overflow = true;
}
av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
if (av->av_overflow)
av->av_buf_tail = av->av_buf_head;
av->av_buf[av->av_buf_head] = state;
av->av_buf_ackno = seqno;
if (num_packets > 1)
dccp_ackvec_reserve_seats(av, num_packets - 1);
}
/**
* dccp_ackvec_input - Register incoming packet in the buffer
*/
void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
enum dccp_ackvec_states state = DCCPAV_RECEIVED;
if (dccp_ackvec_is_empty(av)) {
dccp_ackvec_add_new(av, 1, seqno, state);
av->av_tail_ackno = seqno;
} else {
s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
u8 *current_head = av->av_buf + av->av_buf_head;
if (num_packets == 1 &&
dccp_ackvec_state(current_head) == state &&
dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
*current_head += 1;
av->av_buf_ackno = seqno;
} else if (num_packets > 0) {
dccp_ackvec_add_new(av, num_packets, seqno, state);
} else {
dccp_ackvec_update_old(av, num_packets, seqno, state);
}
}
}
/**
* dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
* This routine is called when the peer acknowledges the receipt of Ack Vectors
* up to and including @ackno. While based on on section A.3 of RFC 4340, here
* are additional precautions to prevent corrupted buffer state. In particular,
* we use tail_ackno to identify outdated records; it always marks the earliest
* packet of group (2) in 11.4.2.
*/
void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
{
struct dccp_ackvec_record *avr, *next;
u8 runlen_now, eff_runlen;
s64 delta;
avr = dccp_ackvec_lookup(&av->av_records, ackno);
if (avr == NULL)
return;
/*
* Deal with outdated acknowledgments: this arises when e.g. there are
* several old records and the acks from the peer come in slowly. In
* that case we may still have records that pre-date tail_ackno.
*/
delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
if (delta < 0)
goto free_records;
/*
* Deal with overlapping Ack Vectors: don't subtract more than the
* number of packets between tail_ackno and ack_ackno.
*/
eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
/*
* The run length of Ack Vector cells does not decrease over time. If
* the run length is the same as at the time the Ack Vector was sent, we
* free the ack_ptr cell. That cell can however not be freed if the run
* length has increased: in this case we need to move the tail pointer
* backwards (towards higher indices), to its next-oldest neighbour.
*/
if (runlen_now > eff_runlen) {
av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
/* This move may not have cleared the overflow flag. */
if (av->av_overflow)
av->av_overflow = (av->av_buf_head == av->av_buf_tail);
} else {
av->av_buf_tail = avr->avr_ack_ptr;
/*
* We have made sure that avr points to a valid cell within the
* buffer. This cell is either older than head, or equals head
* (empty buffer): in both cases we no longer have any overflow.
*/
av->av_overflow = 0;
}
/*
* The peer has acknowledged up to and including ack_ackno. Hence the
* first packet in group (2) of 11.4.2 is the successor of ack_ackno.
*/
av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
free_records:
list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
list_del(&avr->avr_node);
kmem_cache_free(dccp_ackvec_record_slab, avr);
}
}
/*
* Routines to keep track of Ack Vectors received in an skb
*/
int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
{
struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
if (new == NULL)
return -ENOBUFS;
new->vec = vec;
new->len = len;
new->nonce = nonce;
list_add_tail(&new->node, head);
return 0;
}
EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
{
struct dccp_ackvec_parsed *cur, *next;
list_for_each_entry_safe(cur, next, parsed_chunks, node)
kfree(cur);
INIT_LIST_HEAD(parsed_chunks);
}
EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
int __init dccp_ackvec_init(void)
{
dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
sizeof(struct dccp_ackvec), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (dccp_ackvec_slab == NULL)
goto out_err;
dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
sizeof(struct dccp_ackvec_record),
0, SLAB_HWCACHE_ALIGN, NULL);
if (dccp_ackvec_record_slab == NULL)
goto out_destroy_slab;
return 0;
out_destroy_slab:
kmem_cache_destroy(dccp_ackvec_slab);
dccp_ackvec_slab = NULL;
out_err:
DCCP_CRIT("Unable to create Ack Vector slab cache");
return -ENOBUFS;
}
void dccp_ackvec_exit(void)
{
if (dccp_ackvec_slab != NULL) {
kmem_cache_destroy(dccp_ackvec_slab);
dccp_ackvec_slab = NULL;
}
if (dccp_ackvec_record_slab != NULL) {
kmem_cache_destroy(dccp_ackvec_record_slab);
dccp_ackvec_record_slab = NULL;
}
}

138
net/dccp/ackvec.h Normal file
View file

@ -0,0 +1,138 @@
#ifndef _ACKVEC_H
#define _ACKVEC_H
/*
* net/dccp/ackvec.h
*
* An implementation of Ack Vectors for the DCCP protocol
* Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/dccp.h>
#include <linux/compiler.h>
#include <linux/list.h>
#include <linux/types.h>
/*
* Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
* the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
* will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
* more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
* The maximum value is bounded by the u16 types for indices and functions.
*/
#define DCCPAV_NUM_ACKVECS 2
#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
/* Estimated minimum average Ack Vector length - used for updating MPS */
#define DCCPAV_MIN_OPTLEN 16
/* Threshold for coping with large bursts of losses */
#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
enum dccp_ackvec_states {
DCCPAV_RECEIVED = 0x00,
DCCPAV_ECN_MARKED = 0x40,
DCCPAV_RESERVED = 0x80,
DCCPAV_NOT_RECEIVED = 0xC0
};
#define DCCPAV_MAX_RUNLEN 0x3F
static inline u8 dccp_ackvec_runlen(const u8 *cell)
{
return *cell & DCCPAV_MAX_RUNLEN;
}
static inline u8 dccp_ackvec_state(const u8 *cell)
{
return *cell & ~DCCPAV_MAX_RUNLEN;
}
/**
* struct dccp_ackvec - Ack Vector main data structure
*
* This implements a fixed-size circular buffer within an array and is largely
* based on Appendix A of RFC 4340.
*
* @av_buf: circular buffer storage area
* @av_buf_head: head index; begin of live portion in @av_buf
* @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
* @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
* @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
* @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
* %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
* @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
* @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
*/
struct dccp_ackvec {
u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
u16 av_buf_head;
u16 av_buf_tail;
u64 av_buf_ackno:48;
u64 av_tail_ackno:48;
bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
u8 av_overflow:1;
struct list_head av_records;
};
/**
* struct dccp_ackvec_record - Records information about sent Ack Vectors
*
* These list entries define the additional information which the HC-Receiver
* keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
*
* @avr_node: the list node in @av_records
* @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
* @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
* @avr_ack_ptr: pointer into @av_buf where this record starts
* @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
* @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
*
* The list as a whole is sorted in descending order by @avr_ack_seqno.
*/
struct dccp_ackvec_record {
struct list_head avr_node;
u64 avr_ack_seqno:48;
u64 avr_ack_ackno:48;
u16 avr_ack_ptr;
u8 avr_ack_runlen;
u8 avr_ack_nonce:1;
};
int dccp_ackvec_init(void);
void dccp_ackvec_exit(void);
struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
void dccp_ackvec_free(struct dccp_ackvec *av);
void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
/**
* struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
* @vec: start of vector (offset into skb)
* @len: length of @vec
* @nonce: whether @vec had an ECN nonce of 0 or 1
* @node: FIFO - arranged in descending order of ack_ackno
*
* This structure is used by CCIDs to access Ack Vectors in a received skb.
*/
struct dccp_ackvec_parsed {
u8 *vec,
len,
nonce:1;
struct list_head node;
};
int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */

223
net/dccp/ccid.c Normal file
View file

@ -0,0 +1,223 @@
/*
* net/dccp/ccid.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* CCID infrastructure
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/slab.h>
#include "ccid.h"
#include "ccids/lib/tfrc.h"
static struct ccid_operations *ccids[] = {
&ccid2_ops,
#ifdef CONFIG_IP_DCCP_CCID3
&ccid3_ops,
#endif
};
static struct ccid_operations *ccid_by_number(const u8 id)
{
int i;
for (i = 0; i < ARRAY_SIZE(ccids); i++)
if (ccids[i]->ccid_id == id)
return ccids[i];
return NULL;
}
/* check that up to @array_len members in @ccid_array are supported */
bool ccid_support_check(u8 const *ccid_array, u8 array_len)
{
while (array_len > 0)
if (ccid_by_number(ccid_array[--array_len]) == NULL)
return false;
return true;
}
/**
* ccid_get_builtin_ccids - Populate a list of built-in CCIDs
* @ccid_array: pointer to copy into
* @array_len: value to return length into
*
* This function allocates memory - caller must see that it is freed after use.
*/
int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
{
*ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
if (*ccid_array == NULL)
return -ENOBUFS;
for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
(*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
return 0;
}
int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
char __user *optval, int __user *optlen)
{
u8 *ccid_array, array_len;
int err = 0;
if (ccid_get_builtin_ccids(&ccid_array, &array_len))
return -ENOBUFS;
if (put_user(array_len, optlen))
err = -EFAULT;
else if (len > 0 && copy_to_user(optval, ccid_array,
len > array_len ? array_len : len))
err = -EFAULT;
kfree(ccid_array);
return err;
}
static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
{
struct kmem_cache *slab;
va_list args;
va_start(args, fmt);
vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
va_end(args);
slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
return slab;
}
static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
{
if (slab != NULL)
kmem_cache_destroy(slab);
}
static int __init ccid_activate(struct ccid_operations *ccid_ops)
{
int err = -ENOBUFS;
ccid_ops->ccid_hc_rx_slab =
ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
ccid_ops->ccid_hc_rx_slab_name,
"ccid%u_hc_rx_sock",
ccid_ops->ccid_id);
if (ccid_ops->ccid_hc_rx_slab == NULL)
goto out;
ccid_ops->ccid_hc_tx_slab =
ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
ccid_ops->ccid_hc_tx_slab_name,
"ccid%u_hc_tx_sock",
ccid_ops->ccid_id);
if (ccid_ops->ccid_hc_tx_slab == NULL)
goto out_free_rx_slab;
pr_info("DCCP: Activated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
err = 0;
out:
return err;
out_free_rx_slab:
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
ccid_ops->ccid_hc_rx_slab = NULL;
goto out;
}
static void ccid_deactivate(struct ccid_operations *ccid_ops)
{
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
ccid_ops->ccid_hc_tx_slab = NULL;
ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
ccid_ops->ccid_hc_rx_slab = NULL;
pr_info("DCCP: Deactivated CCID %d (%s)\n",
ccid_ops->ccid_id, ccid_ops->ccid_name);
}
struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
{
struct ccid_operations *ccid_ops = ccid_by_number(id);
struct ccid *ccid = NULL;
if (ccid_ops == NULL)
goto out;
ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
ccid_ops->ccid_hc_tx_slab, gfp_any());
if (ccid == NULL)
goto out;
ccid->ccid_ops = ccid_ops;
if (rx) {
memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
goto out_free_ccid;
} else {
memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
goto out_free_ccid;
}
out:
return ccid;
out_free_ccid:
kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
ccid_ops->ccid_hc_tx_slab, ccid);
ccid = NULL;
goto out;
}
void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
{
if (ccid != NULL) {
if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
ccid->ccid_ops->ccid_hc_rx_exit(sk);
kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
}
}
void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
{
if (ccid != NULL) {
if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
ccid->ccid_ops->ccid_hc_tx_exit(sk);
kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
}
}
int __init ccid_initialize_builtins(void)
{
int i, err = tfrc_lib_init();
if (err)
return err;
for (i = 0; i < ARRAY_SIZE(ccids); i++) {
err = ccid_activate(ccids[i]);
if (err)
goto unwind_registrations;
}
return 0;
unwind_registrations:
while(--i >= 0)
ccid_deactivate(ccids[i]);
tfrc_lib_exit();
return err;
}
void ccid_cleanup_builtins(void)
{
int i;
for (i = 0; i < ARRAY_SIZE(ccids); i++)
ccid_deactivate(ccids[i]);
tfrc_lib_exit();
}

265
net/dccp/ccid.h Normal file
View file

@ -0,0 +1,265 @@
#ifndef _CCID_H
#define _CCID_H
/*
* net/dccp/ccid.h
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* CCID infrastructure
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <net/sock.h>
#include <linux/compiler.h>
#include <linux/dccp.h>
#include <linux/list.h>
#include <linux/module.h>
/* maximum value for a CCID (RFC 4340, 19.5) */
#define CCID_MAX 255
#define CCID_SLAB_NAME_LENGTH 32
struct tcp_info;
/**
* struct ccid_operations - Interface to Congestion-Control Infrastructure
*
* @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
* @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
* @ccid_name: alphabetical identifier string for @ccid_id
* @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
* @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
*
* @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
* @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
* @ccid_hc_rx_packet_recv: implements the HC-receiver side
* @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
* @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
* @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
* @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
* @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
* @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
* @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
*/
struct ccid_operations {
unsigned char ccid_id;
__u32 ccid_ccmps;
const char *ccid_name;
struct kmem_cache *ccid_hc_rx_slab,
*ccid_hc_tx_slab;
char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
__u32 ccid_hc_rx_obj_size,
ccid_hc_tx_obj_size;
/* Interface Routines */
int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
void (*ccid_hc_rx_exit)(struct sock *sk);
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
u8 opt, u8 *val, u8 len);
int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_sent)(struct sock *sk,
unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
struct tcp_info *info);
int (*ccid_hc_rx_getsockopt)(struct sock *sk,
const int optname, int len,
u32 __user *optval,
int __user *optlen);
int (*ccid_hc_tx_getsockopt)(struct sock *sk,
const int optname, int len,
u32 __user *optval,
int __user *optlen);
};
extern struct ccid_operations ccid2_ops;
#ifdef CONFIG_IP_DCCP_CCID3
extern struct ccid_operations ccid3_ops;
#endif
int ccid_initialize_builtins(void);
void ccid_cleanup_builtins(void);
struct ccid {
struct ccid_operations *ccid_ops;
char ccid_priv[0];
};
static inline void *ccid_priv(const struct ccid *ccid)
{
return (void *)ccid->ccid_priv;
}
bool ccid_support_check(u8 const *ccid_array, u8 array_len);
int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
char __user *, int __user *);
struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
{
struct ccid *ccid = dp->dccps_hc_rx_ccid;
if (ccid == NULL || ccid->ccid_ops == NULL)
return -1;
return ccid->ccid_ops->ccid_id;
}
static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
{
struct ccid *ccid = dp->dccps_hc_tx_ccid;
if (ccid == NULL || ccid->ccid_ops == NULL)
return -1;
return ccid->ccid_ops->ccid_id;
}
void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
/*
* Congestion control of queued data packets via CCID decision.
*
* The TX CCID performs its congestion-control by indicating whether and when a
* queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
* The following modes are supported via the symbolic constants below:
* - timer-based pacing (CCID returns a delay value in milliseconds);
* - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
*/
enum ccid_dequeueing_decision {
CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
CCID_PACKET_ERR = 0xF0000, /* error condition */
};
static inline int ccid_packet_dequeue_eval(const int return_code)
{
if (return_code < 0)
return CCID_PACKET_ERR;
if (return_code == 0)
return CCID_PACKET_SEND_AT_ONCE;
if (return_code <= CCID_PACKET_DELAY_MAX)
return CCID_PACKET_DELAY;
return return_code;
}
static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
return CCID_PACKET_SEND_AT_ONCE;
}
static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
unsigned int len)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
}
static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
/**
* ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
* @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
* @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
* @val: value of @opt
* @len: length of @val in bytes
*/
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
u8 pkt, u8 opt, u8 *val, u8 len)
{
if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
return 0;
return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
/**
* ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
* Arguments are analogous to ccid_hc_tx_parse_options()
*/
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
u8 pkt, u8 opt, u8 *val, u8 len)
{
if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
return 0;
return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
return 0;
}
static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
struct tcp_info *info)
{
if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
}
static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
struct tcp_info *info)
{
if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
}
static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
const int optname, int len,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
}
static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
const int optname, int len,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
}
#endif /* _CCID_H */

54
net/dccp/ccids/Kconfig Normal file
View file

@ -0,0 +1,54 @@
menu "DCCP CCIDs Configuration"
config IP_DCCP_CCID2_DEBUG
bool "CCID-2 debugging messages"
---help---
Enable CCID-2 specific debugging messages.
The debugging output can additionally be toggled by setting the
ccid2_debug parameter to 0 or 1.
If in doubt, say N.
config IP_DCCP_CCID3
bool "CCID-3 (TCP-Friendly)"
def_bool y if (IP_DCCP = y || IP_DCCP = m)
---help---
CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
rate-controlled congestion control mechanism. TFRC is designed to
be reasonably fair when competing for bandwidth with TCP-like flows,
where a flow is "reasonably fair" if its sending rate is generally
within a factor of two of the sending rate of a TCP flow under the
same conditions. However, TFRC has a much lower variation of
throughput over time compared with TCP, which makes CCID-3 more
suitable than CCID-2 for applications such streaming media where a
relatively smooth sending rate is of importance.
CCID-3 is further described in RFC 4342,
http://www.ietf.org/rfc/rfc4342.txt
The TFRC congestion control algorithms were initially described in
RFC 5348.
This text was extracted from RFC 4340 (sec. 10.2),
http://www.ietf.org/rfc/rfc4340.txt
If in doubt, say N.
config IP_DCCP_CCID3_DEBUG
bool "CCID-3 debugging messages"
depends on IP_DCCP_CCID3
---help---
Enable CCID-3 specific debugging messages.
The debugging output can additionally be toggled by setting the
ccid3_debug parameter to 0 or 1.
If in doubt, say N.
config IP_DCCP_TFRC_LIB
def_bool y if IP_DCCP_CCID3
config IP_DCCP_TFRC_DEBUG
def_bool y if IP_DCCP_CCID3_DEBUG
endmenu

784
net/dccp/ccids/ccid2.c Normal file
View file

@ -0,0 +1,784 @@
/*
* Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* Changes to meet Linux coding standards, and DCCP infrastructure fixes.
*
* Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* This implementation should follow RFC 4341
*/
#include <linux/slab.h>
#include "../feat.h"
#include "ccid2.h"
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
static bool ccid2_debug;
#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
#else
#define ccid2_pr_debug(format, a...)
#endif
static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
{
struct ccid2_seq *seqp;
int i;
/* check if we have space to preserve the pointer to the buffer */
if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
sizeof(struct ccid2_seq *)))
return -ENOMEM;
/* allocate buffer and initialize linked list */
seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any());
if (seqp == NULL)
return -ENOMEM;
for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
seqp[i].ccid2s_next = &seqp[i + 1];
seqp[i + 1].ccid2s_prev = &seqp[i];
}
seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
/* This is the first allocation. Initiate the head and tail. */
if (hc->tx_seqbufc == 0)
hc->tx_seqh = hc->tx_seqt = seqp;
else {
/* link the existing list with the one we just created */
hc->tx_seqh->ccid2s_next = seqp;
seqp->ccid2s_prev = hc->tx_seqh;
hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
}
/* store the original pointer to the buffer so we can free it */
hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
hc->tx_seqbufc++;
return 0;
}
static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
return CCID_PACKET_WILL_DEQUEUE_LATER;
return CCID_PACKET_SEND_AT_ONCE;
}
static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
{
u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
/*
* Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
* RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
* acceptable since this causes starvation/deadlock whenever cwnd < 2.
* The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
*/
if (val == 0 || val > max_ratio) {
DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
val = max_ratio;
}
dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
min_t(u32, val, DCCPF_ACK_RATIO_MAX));
}
static void ccid2_check_l_ack_ratio(struct sock *sk)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
/*
* After a loss, idle period, application limited period, or RTO we
* need to check that the ack ratio is still less than the congestion
* window. Otherwise, we will send an entire congestion window of
* packets and got no response because we haven't sent ack ratio
* packets yet.
* If the ack ratio does need to be reduced, we reduce it to half of
* the congestion window (or 1 if that's zero) instead of to the
* congestion window. This prevents problems if one ack is lost.
*/
if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
}
static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
{
dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
clamp_val(val, DCCPF_SEQ_WMIN,
DCCPF_SEQ_WMAX));
}
static void ccid2_hc_tx_rto_expire(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
goto out;
}
ccid2_pr_debug("RTO_EXPIRE\n");
/* back-off timer */
hc->tx_rto <<= 1;
if (hc->tx_rto > DCCP_RTO_MAX)
hc->tx_rto = DCCP_RTO_MAX;
/* adjust pipe, cwnd etc */
hc->tx_ssthresh = hc->tx_cwnd / 2;
if (hc->tx_ssthresh < 2)
hc->tx_ssthresh = 2;
hc->tx_cwnd = 1;
hc->tx_pipe = 0;
/* clear state about stuff we sent */
hc->tx_seqt = hc->tx_seqh;
hc->tx_packets_acked = 0;
/* clear ack ratio state. */
hc->tx_rpseq = 0;
hc->tx_rpdupack = -1;
ccid2_change_l_ack_ratio(sk, 1);
/* if we were blocked before, we may now send cwnd=1 packet */
if (sender_was_blocked)
tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
/* restart backed-off timer */
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
out:
bh_unlock_sock(sk);
sock_put(sk);
}
/*
* Congestion window validation (RFC 2861).
*/
static bool ccid2_do_cwv = true;
module_param(ccid2_do_cwv, bool, 0644);
MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
/**
* ccid2_update_used_window - Track how much of cwnd is actually used
* This is done in addition to CWV. The sender needs to have an idea of how many
* packets may be in flight, to set the local Sequence Window value accordingly
* (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
* maximum-used window. We use an EWMA low-pass filter to filter out noise.
*/
static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
{
hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
}
/* This borrows the code of tcp_cwnd_application_limited() */
static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
/* don't reduce cwnd below the initial window (IW) */
u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
win_used = max(hc->tx_cwnd_used, init_win);
if (win_used < hc->tx_cwnd) {
hc->tx_ssthresh = max(hc->tx_ssthresh,
(hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
}
hc->tx_cwnd_used = 0;
hc->tx_cwnd_stamp = now;
ccid2_check_l_ack_ratio(sk);
}
/* This borrows the code of tcp_cwnd_restart() */
static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
u32 cwnd = hc->tx_cwnd, restart_cwnd,
iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
/* don't reduce cwnd below the initial window (IW) */
restart_cwnd = min(cwnd, iwnd);
cwnd >>= (now - hc->tx_lsndtime) / hc->tx_rto;
hc->tx_cwnd = max(cwnd, restart_cwnd);
hc->tx_cwnd_stamp = now;
hc->tx_cwnd_used = 0;
ccid2_check_l_ack_ratio(sk);
}
static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
const u32 now = ccid2_time_stamp;
struct ccid2_seq *next;
/* slow-start after idle periods (RFC 2581, RFC 2861) */
if (ccid2_do_cwv && !hc->tx_pipe &&
(s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
ccid2_cwnd_restart(sk, now);
hc->tx_lsndtime = now;
hc->tx_pipe += 1;
/* see whether cwnd was fully used (RFC 2861), update expected window */
if (ccid2_cwnd_network_limited(hc)) {
ccid2_update_used_window(hc, hc->tx_cwnd);
hc->tx_cwnd_used = 0;
hc->tx_cwnd_stamp = now;
} else {
if (hc->tx_pipe > hc->tx_cwnd_used)
hc->tx_cwnd_used = hc->tx_pipe;
ccid2_update_used_window(hc, hc->tx_cwnd_used);
if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
ccid2_cwnd_application_limited(sk, now);
}
hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
hc->tx_seqh->ccid2s_acked = 0;
hc->tx_seqh->ccid2s_sent = now;
next = hc->tx_seqh->ccid2s_next;
/* check if we need to alloc more space */
if (next == hc->tx_seqt) {
if (ccid2_hc_tx_alloc_seq(hc)) {
DCCP_CRIT("packet history - out of memory!");
/* FIXME: find a more graceful way to bail out */
return;
}
next = hc->tx_seqh->ccid2s_next;
BUG_ON(next == hc->tx_seqt);
}
hc->tx_seqh = next;
ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
/*
* FIXME: The code below is broken and the variables have been removed
* from the socket struct. The `ackloss' variable was always set to 0,
* and with arsent there are several problems:
* (i) it doesn't just count the number of Acks, but all sent packets;
* (ii) it is expressed in # of packets, not # of windows, so the
* comparison below uses the wrong formula: Appendix A of RFC 4341
* comes up with the number K = cwnd / (R^2 - R) of consecutive windows
* of data with no lost or marked Ack packets. If arsent were the # of
* consecutive Acks received without loss, then Ack Ratio needs to be
* decreased by 1 when
* arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
* where cwnd / R is the number of Acks received per window of data
* (cf. RFC 4341, App. A). The problems are that
* - arsent counts other packets as well;
* - the comparison uses a formula different from RFC 4341;
* - computing a cubic/quadratic equation each time is too complicated.
* Hence a different algorithm is needed.
*/
#if 0
/* Ack Ratio. Need to maintain a concept of how many windows we sent */
hc->tx_arsent++;
/* We had an ack loss in this window... */
if (hc->tx_ackloss) {
if (hc->tx_arsent >= hc->tx_cwnd) {
hc->tx_arsent = 0;
hc->tx_ackloss = 0;
}
} else {
/* No acks lost up to now... */
/* decrease ack ratio if enough packets were sent */
if (dp->dccps_l_ack_ratio > 1) {
/* XXX don't calculate denominator each time */
int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
dp->dccps_l_ack_ratio;
denom = hc->tx_cwnd * hc->tx_cwnd / denom;
if (hc->tx_arsent >= denom) {
ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
hc->tx_arsent = 0;
}
} else {
/* we can't increase ack ratio further [1] */
hc->tx_arsent = 0; /* or maybe set it to cwnd*/
}
}
#endif
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
do {
struct ccid2_seq *seqp = hc->tx_seqt;
while (seqp != hc->tx_seqh) {
ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
(unsigned long long)seqp->ccid2s_seq,
seqp->ccid2s_acked, seqp->ccid2s_sent);
seqp = seqp->ccid2s_next;
}
} while (0);
ccid2_pr_debug("=========\n");
#endif
}
/**
* ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
* This code is almost identical with TCP's tcp_rtt_estimator(), since
* - it has a higher sampling frequency (recommended by RFC 1323),
* - the RTO does not collapse into RTT due to RTTVAR going towards zero,
* - it is simple (cf. more complex proposals such as Eifel timer or research
* which suggests that the gain should be set according to window size),
* - in tests it was found to work well with CCID2 [gerrit].
*/
static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
long m = mrtt ? : 1;
if (hc->tx_srtt == 0) {
/* First measurement m */
hc->tx_srtt = m << 3;
hc->tx_mdev = m << 1;
hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
hc->tx_rttvar = hc->tx_mdev_max;
hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
} else {
/* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
m -= (hc->tx_srtt >> 3);
hc->tx_srtt += m;
/* Similarly, update scaled mdev with regard to |m| */
if (m < 0) {
m = -m;
m -= (hc->tx_mdev >> 2);
/*
* This neutralises RTO increase when RTT < SRTT - mdev
* (see P. Sarolahti, A. Kuznetsov,"Congestion Control
* in Linux TCP", USENIX 2002, pp. 49-62).
*/
if (m > 0)
m >>= 3;
} else {
m -= (hc->tx_mdev >> 2);
}
hc->tx_mdev += m;
if (hc->tx_mdev > hc->tx_mdev_max) {
hc->tx_mdev_max = hc->tx_mdev;
if (hc->tx_mdev_max > hc->tx_rttvar)
hc->tx_rttvar = hc->tx_mdev_max;
}
/*
* Decay RTTVAR at most once per flight, exploiting that
* 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
* 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
* GAR is a useful bound for FlightSize = pipe.
* AWL is probably too low here, as it over-estimates pipe.
*/
if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
if (hc->tx_mdev_max < hc->tx_rttvar)
hc->tx_rttvar -= (hc->tx_rttvar -
hc->tx_mdev_max) >> 2;
hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
hc->tx_mdev_max = tcp_rto_min(sk);
}
}
/*
* Set RTO from SRTT and RTTVAR
* As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
* This agrees with RFC 4341, 5:
* "Because DCCP does not retransmit data, DCCP does not require
* TCP's recommended minimum timeout of one second".
*/
hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
if (hc->tx_rto > DCCP_RTO_MAX)
hc->tx_rto = DCCP_RTO_MAX;
}
static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
unsigned int *maxincr)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
if (hc->tx_cwnd < dp->dccps_l_seq_win &&
r_seq_used < dp->dccps_r_seq_win) {
if (hc->tx_cwnd < hc->tx_ssthresh) {
if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
hc->tx_cwnd += 1;
*maxincr -= 1;
hc->tx_packets_acked = 0;
}
} else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
hc->tx_cwnd += 1;
hc->tx_packets_acked = 0;
}
}
/*
* Adjust the local sequence window and the ack ratio to allow about
* 5 times the number of packets in the network (RFC 4340 7.5.2)
*/
if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
/*
* FIXME: RTT is sampled several times per acknowledgment (for each
* entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
* This causes the RTT to be over-estimated, since the older entries
* in the Ack Vector have earlier sending times.
* The cleanest solution is to not use the ccid2s_sent field at all
* and instead use DCCP timestamps: requires changes in other places.
*/
ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
}
static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
return;
}
hc->tx_last_cong = ccid2_time_stamp;
hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
ccid2_check_l_ack_ratio(sk);
}
static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
u8 option, u8 *optval, u8 optlen)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
switch (option) {
case DCCPO_ACK_VECTOR_0:
case DCCPO_ACK_VECTOR_1:
return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
option - DCCPO_ACK_VECTOR_0);
}
return 0;
}
static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
struct dccp_ackvec_parsed *avp;
u64 ackno, seqno;
struct ccid2_seq *seqp;
int done = 0;
unsigned int maxincr = 0;
/* check reverse path congestion */
seqno = DCCP_SKB_CB(skb)->dccpd_seq;
/* XXX this whole "algorithm" is broken. Need to fix it to keep track
* of the seqnos of the dupacks so that rpseq and rpdupack are correct
* -sorbo.
*/
/* need to bootstrap */
if (hc->tx_rpdupack == -1) {
hc->tx_rpdupack = 0;
hc->tx_rpseq = seqno;
} else {
/* check if packet is consecutive */
if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
hc->tx_rpseq = seqno;
/* it's a later packet */
else if (after48(seqno, hc->tx_rpseq)) {
hc->tx_rpdupack++;
/* check if we got enough dupacks */
if (hc->tx_rpdupack >= NUMDUPACK) {
hc->tx_rpdupack = -1; /* XXX lame */
hc->tx_rpseq = 0;
#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
/*
* FIXME: Ack Congestion Control is broken; in
* the current state instabilities occurred with
* Ack Ratios greater than 1; causing hang-ups
* and long RTO timeouts. This needs to be fixed
* before opening up dynamic changes. -- gerrit
*/
ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
#endif
}
}
}
/* check forward path congestion */
if (dccp_packet_without_ack(skb))
return;
/* still didn't send out new data packets */
if (hc->tx_seqh == hc->tx_seqt)
goto done;
ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
if (after48(ackno, hc->tx_high_ack))
hc->tx_high_ack = ackno;
seqp = hc->tx_seqt;
while (before48(seqp->ccid2s_seq, ackno)) {
seqp = seqp->ccid2s_next;
if (seqp == hc->tx_seqh) {
seqp = hc->tx_seqh->ccid2s_prev;
break;
}
}
/*
* In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
* packets per acknowledgement. Rounding up avoids that cwnd is not
* advanced when Ack Ratio is 1 and gives a slight edge otherwise.
*/
if (hc->tx_cwnd < hc->tx_ssthresh)
maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
/* go through all ack vectors */
list_for_each_entry(avp, &hc->tx_av_chunks, node) {
/* go through this ack vector */
for (; avp->len--; avp->vec++) {
u64 ackno_end_rl = SUB48(ackno,
dccp_ackvec_runlen(avp->vec));
ccid2_pr_debug("ackvec %llu |%u,%u|\n",
(unsigned long long)ackno,
dccp_ackvec_state(avp->vec) >> 6,
dccp_ackvec_runlen(avp->vec));
/* if the seqno we are analyzing is larger than the
* current ackno, then move towards the tail of our
* seqnos.
*/
while (after48(seqp->ccid2s_seq, ackno)) {
if (seqp == hc->tx_seqt) {
done = 1;
break;
}
seqp = seqp->ccid2s_prev;
}
if (done)
break;
/* check all seqnos in the range of the vector
* run length
*/
while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
const u8 state = dccp_ackvec_state(avp->vec);
/* new packet received or marked */
if (state != DCCPAV_NOT_RECEIVED &&
!seqp->ccid2s_acked) {
if (state == DCCPAV_ECN_MARKED)
ccid2_congestion_event(sk,
seqp);
else
ccid2_new_ack(sk, seqp,
&maxincr);
seqp->ccid2s_acked = 1;
ccid2_pr_debug("Got ack for %llu\n",
(unsigned long long)seqp->ccid2s_seq);
hc->tx_pipe--;
}
if (seqp == hc->tx_seqt) {
done = 1;
break;
}
seqp = seqp->ccid2s_prev;
}
if (done)
break;
ackno = SUB48(ackno_end_rl, 1);
}
if (done)
break;
}
/* The state about what is acked should be correct now
* Check for NUMDUPACK
*/
seqp = hc->tx_seqt;
while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
seqp = seqp->ccid2s_next;
if (seqp == hc->tx_seqh) {
seqp = hc->tx_seqh->ccid2s_prev;
break;
}
}
done = 0;
while (1) {
if (seqp->ccid2s_acked) {
done++;
if (done == NUMDUPACK)
break;
}
if (seqp == hc->tx_seqt)
break;
seqp = seqp->ccid2s_prev;
}
/* If there are at least 3 acknowledgements, anything unacknowledged
* below the last sequence number is considered lost
*/
if (done == NUMDUPACK) {
struct ccid2_seq *last_acked = seqp;
/* check for lost packets */
while (1) {
if (!seqp->ccid2s_acked) {
ccid2_pr_debug("Packet lost: %llu\n",
(unsigned long long)seqp->ccid2s_seq);
/* XXX need to traverse from tail -> head in
* order to detect multiple congestion events in
* one ack vector.
*/
ccid2_congestion_event(sk, seqp);
hc->tx_pipe--;
}
if (seqp == hc->tx_seqt)
break;
seqp = seqp->ccid2s_prev;
}
hc->tx_seqt = last_acked;
}
/* trim acked packets in tail */
while (hc->tx_seqt != hc->tx_seqh) {
if (!hc->tx_seqt->ccid2s_acked)
break;
hc->tx_seqt = hc->tx_seqt->ccid2s_next;
}
/* restart RTO timer if not all outstanding data has been acked */
if (hc->tx_pipe == 0)
sk_stop_timer(sk, &hc->tx_rtotimer);
else
sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
done:
/* check if incoming Acks allow pending packets to be sent */
if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
}
static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
struct dccp_sock *dp = dccp_sk(sk);
u32 max_ratio;
/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
hc->tx_ssthresh = ~0U;
/* Use larger initial windows (RFC 4341, section 5). */
hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
hc->tx_expected_wnd = hc->tx_cwnd;
/* Make sure that Ack Ratio is enabled and within bounds. */
max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
dp->dccps_l_ack_ratio = max_ratio;
/* XXX init ~ to window size... */
if (ccid2_hc_tx_alloc_seq(hc))
return -ENOMEM;
hc->tx_rto = DCCP_TIMEOUT_INIT;
hc->tx_rpdupack = -1;
hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
hc->tx_cwnd_used = 0;
setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
(unsigned long)sk);
INIT_LIST_HEAD(&hc->tx_av_chunks);
return 0;
}
static void ccid2_hc_tx_exit(struct sock *sk)
{
struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
int i;
sk_stop_timer(sk, &hc->tx_rtotimer);
for (i = 0; i < hc->tx_seqbufc; i++)
kfree(hc->tx_seqbuf[i]);
hc->tx_seqbufc = 0;
}
static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
if (!dccp_data_packet(skb))
return;
if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
dccp_send_ack(sk);
hc->rx_num_data_pkts = 0;
}
}
struct ccid_operations ccid2_ops = {
.ccid_id = DCCPC_CCID2,
.ccid_name = "TCP-like",
.ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
.ccid_hc_tx_init = ccid2_hc_tx_init,
.ccid_hc_tx_exit = ccid2_hc_tx_exit,
.ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
.ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
.ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
.ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
.ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
.ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
};
#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
module_param(ccid2_debug, bool, 0644);
MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
#endif

133
net/dccp/ccids/ccid2.h Normal file
View file

@ -0,0 +1,133 @@
/*
* Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DCCP_CCID2_H_
#define _DCCP_CCID2_H_
#include <linux/timer.h>
#include <linux/types.h>
#include "../ccid.h"
#include "../dccp.h"
/*
* CCID-2 timestamping faces the same issues as TCP timestamping.
* Hence we reuse/share as much of the code as possible.
*/
#define ccid2_time_stamp tcp_time_stamp
/* NUMDUPACK parameter from RFC 4341, p. 6 */
#define NUMDUPACK 3
struct ccid2_seq {
u64 ccid2s_seq;
u32 ccid2s_sent;
int ccid2s_acked;
struct ccid2_seq *ccid2s_prev;
struct ccid2_seq *ccid2s_next;
};
#define CCID2_SEQBUF_LEN 1024
#define CCID2_SEQBUF_MAX 128
/*
* Multiple of congestion window to keep the sequence window at
* (RFC 4340 7.5.2)
*/
#define CCID2_WIN_CHANGE_FACTOR 5
/**
* struct ccid2_hc_tx_sock - CCID2 TX half connection
* @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
* @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
* @tx_srtt: smoothed RTT estimate, scaled by 2^3
* @tx_mdev: smoothed RTT variation, scaled by 2^2
* @tx_mdev_max: maximum of @mdev during one flight
* @tx_rttvar: moving average/maximum of @mdev_max
* @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
* @tx_rtt_seq: to decay RTTVAR at most once per flight
* @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
* @tx_expected_wnd: moving average of @tx_cwnd_used
* @tx_cwnd_stamp: to track idle periods in CWV
* @tx_lsndtime: last time (in jiffies) a data packet was sent
* @tx_rpseq: last consecutive seqno
* @tx_rpdupack: dupacks since rpseq
* @tx_av_chunks: list of Ack Vectors received on current skb
*/
struct ccid2_hc_tx_sock {
u32 tx_cwnd;
u32 tx_ssthresh;
u32 tx_pipe;
u32 tx_packets_acked;
struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
int tx_seqbufc;
struct ccid2_seq *tx_seqh;
struct ccid2_seq *tx_seqt;
/* RTT measurement: variables/principles are the same as in TCP */
u32 tx_srtt,
tx_mdev,
tx_mdev_max,
tx_rttvar,
tx_rto;
u64 tx_rtt_seq:48;
struct timer_list tx_rtotimer;
/* Congestion Window validation (optional, RFC 2861) */
u32 tx_cwnd_used,
tx_expected_wnd,
tx_cwnd_stamp,
tx_lsndtime;
u64 tx_rpseq;
int tx_rpdupack;
u32 tx_last_cong;
u64 tx_high_ack;
struct list_head tx_av_chunks;
};
static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
{
return hc->tx_pipe >= hc->tx_cwnd;
}
/*
* Convert RFC 3390 larger initial window into an equivalent number of packets.
* This is based on the numbers specified in RFC 5681, 3.1.
*/
static inline u32 rfc3390_bytes_to_packets(const u32 smss)
{
return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
}
/**
* struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
* @rx_num_data_pkts: number of data packets received since last feedback
*/
struct ccid2_hc_rx_sock {
u32 rx_num_data_pkts;
};
static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
{
return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
}
static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
{
return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
}
#endif /* _DCCP_CCID2_H_ */

870
net/dccp/ccids/ccid3.c Normal file
View file

@ -0,0 +1,870 @@
/*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* Changes to meet Linux coding standards, to make it meet latest ccid3 draft
* and to make it work as a loadable module in the DCCP stack written by
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
*
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "../dccp.h"
#include "ccid3.h"
#include <asm/unaligned.h>
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static bool ccid3_debug;
#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
#endif
/*
* Transmitter Half-Connection Routines
*/
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
static const char *const ccid3_state_names[] = {
[TFRC_SSTATE_NO_SENT] = "NO_SENT",
[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
[TFRC_SSTATE_FBACK] = "FBACK",
};
return ccid3_state_names[state];
}
#endif
static void ccid3_hc_tx_set_state(struct sock *sk,
enum ccid3_hc_tx_states state)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
enum ccid3_hc_tx_states oldstate = hc->tx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
ccid3_tx_state_name(state));
WARN_ON(state == oldstate);
hc->tx_state = state;
}
/*
* Compute the initial sending rate X_init in the manner of RFC 3390:
*
* X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
*
* Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
* (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
* For consistency with other parts of the code, X_init is scaled by 2^6.
*/
static inline u64 rfc3390_initial_rate(struct sock *sk)
{
const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
return scaled_div(w_init << 6, hc->tx_rtt);
}
/**
* ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
* This respects the granularity of X_inst (64 * bytes/second).
*/
static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
{
hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
DCCP_BUG_ON(hc->tx_t_ipi == 0);
ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
hc->tx_s, (unsigned int)(hc->tx_x >> 6));
}
static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
{
u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
return delta / hc->tx_rtt;
}
/**
* ccid3_hc_tx_update_x - Update allowed sending rate X
* @stamp: most recent time if available - can be left NULL.
*
* This function tracks draft rfc3448bis, check there for latest details.
*
* Note: X and X_recv are both stored in units of 64 * bytes/second, to support
* fine-grained resolution of sending rates. This requires scaling by 2^6
* throughout the code. Only X_calc is unscaled (in bytes/second).
*
*/
static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
__u64 min_rate = 2 * hc->tx_x_recv;
const __u64 old_x = hc->tx_x;
ktime_t now = stamp ? *stamp : ktime_get_real();
/*
* Handle IDLE periods: do not reduce below RFC3390 initial sending rate
* when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
* a sender is idle if it has not sent anything over a 2-RTT-period.
* For consistency with X and X_recv, min_rate is also scaled by 2^6.
*/
if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
min_rate = rfc3390_initial_rate(sk);
min_rate = max(min_rate, 2 * hc->tx_x_recv);
}
if (hc->tx_p > 0) {
hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
} else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
hc->tx_x = min(2 * hc->tx_x, min_rate);
hc->tx_x = max(hc->tx_x,
scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
hc->tx_t_ld = now;
}
if (hc->tx_x != old_x) {
ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
"X_recv=%u\n", (unsigned int)(old_x >> 6),
(unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
(unsigned int)(hc->tx_x_recv >> 6));
ccid3_update_send_interval(hc);
}
}
/**
* ccid3_hc_tx_update_s - Track the mean packet size `s'
* @len: DCCP packet payload size in bytes
*
* cf. RFC 4342, 5.3 and RFC 3448, 4.1
*/
static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
{
const u16 old_s = hc->tx_s;
hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
if (hc->tx_s != old_s)
ccid3_update_send_interval(hc);
}
/*
* Update Window Counter using the algorithm from [RFC 4342, 8.1].
* As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
*/
static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
ktime_t now)
{
u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
quarter_rtts = (4 * delta) / hc->tx_rtt;
if (quarter_rtts > 0) {
hc->tx_t_last_win_count = now;
hc->tx_last_win_count += min(quarter_rtts, 5U);
hc->tx_last_win_count &= 0xF; /* mod 16 */
}
}
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
unsigned long t_nfb = USEC_PER_SEC / 5;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
/* XXX: set some sensible MIB */
goto restart_timer;
}
ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
ccid3_tx_state_name(hc->tx_state));
/* Ignore and do not restart after leaving the established state */
if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
goto out;
/* Reset feedback state to "no feedback received" */
if (hc->tx_state == TFRC_SSTATE_FBACK)
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
/*
* Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
* RTO is 0 if and only if no feedback has been received yet.
*/
if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
/* halve send rate directly */
hc->tx_x = max(hc->tx_x / 2,
(((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
ccid3_update_send_interval(hc);
} else {
/*
* Modify the cached value of X_recv
*
* If (X_calc > 2 * X_recv)
* X_recv = max(X_recv / 2, s / (2 * t_mbi));
* Else
* X_recv = X_calc / 4;
*
* Note that X_recv is scaled by 2^6 while X_calc is not
*/
if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
hc->tx_x_recv =
max(hc->tx_x_recv / 2,
(((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
else {
hc->tx_x_recv = hc->tx_x_calc;
hc->tx_x_recv <<= 4;
}
ccid3_hc_tx_update_x(sk, NULL);
}
ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
(unsigned long long)hc->tx_x);
/*
* Set new timeout for the nofeedback timer.
* See comments in packet_recv() regarding the value of t_RTO.
*/
if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
t_nfb = TFRC_INITIAL_TIMEOUT;
else
t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
restart_timer:
sk_reset_timer(sk, &hc->tx_no_feedback_timer,
jiffies + usecs_to_jiffies(t_nfb));
out:
bh_unlock_sock(sk);
sock_put(sk);
}
/**
* ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
* @skb: next packet candidate to send on @sk
*
* This function uses the convention of ccid_packet_dequeue_eval() and
* returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
*/
static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
ktime_t now = ktime_get_real();
s64 delay;
/*
* This function is called only for Data and DataAck packets. Sending
* zero-sized Data(Ack)s is theoretically possible, but for congestion
* control this case is pathological - ignore it.
*/
if (unlikely(skb->len == 0))
return -EBADMSG;
if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
hc->tx_last_win_count = 0;
hc->tx_t_last_win_count = now;
/* Set t_0 for initial packet */
hc->tx_t_nom = now;
hc->tx_s = skb->len;
/*
* Use initial RTT sample when available: recommended by erratum
* to RFC 4342. This implements the initialisation procedure of
* draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
*/
if (dp->dccps_syn_rtt) {
ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
hc->tx_rtt = dp->dccps_syn_rtt;
hc->tx_x = rfc3390_initial_rate(sk);
hc->tx_t_ld = now;
} else {
/*
* Sender does not have RTT sample:
* - set fallback RTT (RFC 4340, 3.4) since a RTT value
* is needed in several parts (e.g. window counter);
* - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
*/
hc->tx_rtt = DCCP_FALLBACK_RTT;
hc->tx_x = hc->tx_s;
hc->tx_x <<= 6;
}
ccid3_update_send_interval(hc);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
} else {
delay = ktime_us_delta(hc->tx_t_nom, now);
ccid3_pr_debug("delay=%ld\n", (long)delay);
/*
* Scheduling of packet transmissions (RFC 5348, 8.3)
*
* if (t_now > t_nom - delta)
* // send the packet now
* else
* // send the packet in (t_nom - t_now) milliseconds.
*/
if (delay >= TFRC_T_DELTA)
return (u32)delay / USEC_PER_MSEC;
ccid3_hc_tx_update_win_count(hc, now);
}
/* prepare to send now (add options etc.) */
dp->dccps_hc_tx_insert_options = 1;
DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
/* set the nominal send time for the next following packet */
hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
return CCID_PACKET_SEND_AT_ONCE;
}
static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
ccid3_hc_tx_update_s(hc, len);
if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
DCCP_CRIT("packet history - out of memory!");
}
static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
struct tfrc_tx_hist_entry *acked;
ktime_t now;
unsigned long t_nfb;
u32 r_sample;
/* we are only interested in ACKs */
if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
return;
/*
* Locate the acknowledged packet in the TX history.
*
* Returning "entry not found" here can for instance happen when
* - the host has not sent out anything (e.g. a passive server),
* - the Ack is outdated (packet with higher Ack number was received),
* - it is a bogus Ack (for a packet not sent on this connection).
*/
acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
if (acked == NULL)
return;
/* For the sake of RTT sampling, ignore/remove all older entries */
tfrc_tx_hist_purge(&acked->next);
/* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
now = ktime_get_real();
r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
/*
* Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
*/
if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
if (hc->tx_t_rto == 0) {
/*
* Initial feedback packet: Larger Initial Windows (4.2)
*/
hc->tx_x = rfc3390_initial_rate(sk);
hc->tx_t_ld = now;
ccid3_update_send_interval(hc);
goto done_computing_x;
} else if (hc->tx_p == 0) {
/*
* First feedback after nofeedback timer expiry (4.3)
*/
goto done_computing_x;
}
}
/* Update sending rate (step 4 of [RFC 3448, 4.3]) */
if (hc->tx_p > 0)
hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
ccid3_hc_tx_update_x(sk, &now);
done_computing_x:
ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
"p=%u, X_calc=%u, X_recv=%u, X=%u\n",
dccp_role(sk), sk, hc->tx_rtt, r_sample,
hc->tx_s, hc->tx_p, hc->tx_x_calc,
(unsigned int)(hc->tx_x_recv >> 6),
(unsigned int)(hc->tx_x >> 6));
/* unschedule no feedback timer */
sk_stop_timer(sk, &hc->tx_no_feedback_timer);
/*
* As we have calculated new ipi, delta, t_nom it is possible
* that we now can send a packet, so wake up dccp_wait_for_ccid
*/
sk->sk_write_space(sk);
/*
* Update timeout interval for the nofeedback timer. In order to control
* rate halving on networks with very low RTTs (<= 1 ms), use per-route
* tunable RTAX_RTO_MIN value as the lower bound.
*/
hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
USEC_PER_SEC/HZ * tcp_rto_min(sk));
/*
* Schedule no feedback timer to expire in
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
*/
t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
"expire in %lu jiffies (%luus)\n",
dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
sk_reset_timer(sk, &hc->tx_no_feedback_timer,
jiffies + usecs_to_jiffies(t_nfb));
}
static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
u8 option, u8 *optval, u8 optlen)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
__be32 opt_val;
switch (option) {
case TFRC_OPT_RECEIVE_RATE:
case TFRC_OPT_LOSS_EVENT_RATE:
/* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
if (packet_type == DCCP_PKT_DATA)
break;
if (unlikely(optlen != 4)) {
DCCP_WARN("%s(%p), invalid len %d for %u\n",
dccp_role(sk), sk, optlen, option);
return -EINVAL;
}
opt_val = ntohl(get_unaligned((__be32 *)optval));
if (option == TFRC_OPT_RECEIVE_RATE) {
/* Receive Rate is kept in units of 64 bytes/second */
hc->tx_x_recv = opt_val;
hc->tx_x_recv <<= 6;
ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
dccp_role(sk), sk, opt_val);
} else {
/* Update the fixpoint Loss Event Rate fraction */
hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
dccp_role(sk), sk, opt_val);
}
}
return 0;
}
static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
hc->tx_state = TFRC_SSTATE_NO_SENT;
hc->tx_hist = NULL;
setup_timer(&hc->tx_no_feedback_timer,
ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
return 0;
}
static void ccid3_hc_tx_exit(struct sock *sk)
{
struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
sk_stop_timer(sk, &hc->tx_no_feedback_timer);
tfrc_tx_hist_purge(&hc->tx_hist);
}
static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
{
info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
}
static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
struct tfrc_tx_info tfrc;
const void *val;
switch (optname) {
case DCCP_SOCKOPT_CCID_TX_INFO:
if (len < sizeof(tfrc))
return -EINVAL;
memset(&tfrc, 0, sizeof(tfrc));
tfrc.tfrctx_x = hc->tx_x;
tfrc.tfrctx_x_recv = hc->tx_x_recv;
tfrc.tfrctx_x_calc = hc->tx_x_calc;
tfrc.tfrctx_rtt = hc->tx_rtt;
tfrc.tfrctx_p = hc->tx_p;
tfrc.tfrctx_rto = hc->tx_t_rto;
tfrc.tfrctx_ipi = hc->tx_t_ipi;
len = sizeof(tfrc);
val = &tfrc;
break;
default:
return -ENOPROTOOPT;
}
if (put_user(len, optlen) || copy_to_user(optval, val, len))
return -EFAULT;
return 0;
}
/*
* Receiver Half-Connection Routines
*/
/* CCID3 feedback types */
enum ccid3_fback_type {
CCID3_FBACK_NONE = 0,
CCID3_FBACK_INITIAL,
CCID3_FBACK_PERIODIC,
CCID3_FBACK_PARAM_CHANGE
};
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
static const char *const ccid3_rx_state_names[] = {
[TFRC_RSTATE_NO_DATA] = "NO_DATA",
[TFRC_RSTATE_DATA] = "DATA",
};
return ccid3_rx_state_names[state];
}
#endif
static void ccid3_hc_rx_set_state(struct sock *sk,
enum ccid3_hc_rx_states state)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
enum ccid3_hc_rx_states oldstate = hc->rx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
ccid3_rx_state_name(state));
WARN_ON(state == oldstate);
hc->rx_state = state;
}
static void ccid3_hc_rx_send_feedback(struct sock *sk,
const struct sk_buff *skb,
enum ccid3_fback_type fbtype)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
ktime_t now = ktime_get_real();
s64 delta = 0;
switch (fbtype) {
case CCID3_FBACK_INITIAL:
hc->rx_x_recv = 0;
hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
break;
case CCID3_FBACK_PARAM_CHANGE:
/*
* When parameters change (new loss or p > p_prev), we do not
* have a reliable estimate for R_m of [RFC 3448, 6.2] and so
* need to reuse the previous value of X_recv. However, when
* X_recv was 0 (due to early loss), this would kill X down to
* s/t_mbi (i.e. one packet in 64 seconds).
* To avoid such drastic reduction, we approximate X_recv as
* the number of bytes since last feedback.
* This is a safe fallback, since X is bounded above by X_calc.
*/
if (hc->rx_x_recv > 0)
break;
/* fall through */
case CCID3_FBACK_PERIODIC:
delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
if (delta <= 0)
DCCP_BUG("delta (%ld) <= 0", (long)delta);
else
hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
break;
default:
return;
}
ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
hc->rx_x_recv, hc->rx_pinv);
hc->rx_tstamp_last_feedback = now;
hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
hc->rx_bytes_recv = 0;
dp->dccps_hc_rx_insert_options = 1;
dccp_send_ack(sk);
}
static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
__be32 x_recv, pinv;
if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
return 0;
if (dccp_packet_without_ack(skb))
return 0;
x_recv = htonl(hc->rx_x_recv);
pinv = htonl(hc->rx_pinv);
if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
&pinv, sizeof(pinv)) ||
dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
&x_recv, sizeof(x_recv)))
return -1;
return 0;
}
/**
* ccid3_first_li - Implements [RFC 5348, 6.3.1]
*
* Determine the length of the first loss interval via inverse lookup.
* Assume that X_recv can be computed by the throughput equation
* s
* X_recv = --------
* R * fval
* Find some p such that f(p) = fval; return 1/p (scaled).
*/
static u32 ccid3_first_li(struct sock *sk)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
u32 x_recv, p, delta;
u64 fval;
if (hc->rx_rtt == 0) {
DCCP_WARN("No RTT estimate available, using fallback RTT\n");
hc->rx_rtt = DCCP_FALLBACK_RTT;
}
delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback));
x_recv = scaled_div32(hc->rx_bytes_recv, delta);
if (x_recv == 0) { /* would also trigger divide-by-zero */
DCCP_WARN("X_recv==0\n");
if (hc->rx_x_recv == 0) {
DCCP_BUG("stored value of X_recv is zero");
return ~0U;
}
x_recv = hc->rx_x_recv;
}
fval = scaled_div(hc->rx_s, hc->rx_rtt);
fval = scaled_div32(fval, x_recv);
p = tfrc_calc_x_reverse_lookup(fval);
ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
"loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
return p == 0 ? ~0U : scaled_div(1, p);
}
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
const bool is_data_packet = dccp_data_packet(skb);
if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
if (is_data_packet) {
const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
do_feedback = CCID3_FBACK_INITIAL;
ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
hc->rx_s = payload;
/*
* Not necessary to update rx_bytes_recv here,
* since X_recv = 0 for the first feedback packet (cf.
* RFC 3448, 6.3) -- gerrit
*/
}
goto update_records;
}
if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
return; /* done receiving */
if (is_data_packet) {
const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
/*
* Update moving-average of s and the sum of received payload bytes
*/
hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
hc->rx_bytes_recv += payload;
}
/*
* Perform loss detection and handle pending losses
*/
if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
skb, ndp, ccid3_first_li, sk)) {
do_feedback = CCID3_FBACK_PARAM_CHANGE;
goto done_receiving;
}
if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
return; /* done receiving */
/*
* Handle data packets: RTT sampling and monitoring p
*/
if (unlikely(!is_data_packet))
goto update_records;
if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
/*
* Empty loss history: no loss so far, hence p stays 0.
* Sample RTT values, since an RTT estimate is required for the
* computation of p when the first loss occurs; RFC 3448, 6.3.1.
*/
if (sample != 0)
hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
} else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
/*
* Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
* has decreased (resp. p has increased), send feedback now.
*/
do_feedback = CCID3_FBACK_PARAM_CHANGE;
}
/*
* Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
*/
if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
do_feedback = CCID3_FBACK_PERIODIC;
update_records:
tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
done_receiving:
if (do_feedback)
ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
}
static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
{
struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
hc->rx_state = TFRC_RSTATE_NO_DATA;
tfrc_lh_init(&hc->rx_li_hist);
return tfrc_rx_hist_alloc(&hc->rx_hist);
}
static void ccid3_hc_rx_exit(struct sock *sk)
{
struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
tfrc_rx_hist_purge(&hc->rx_hist);
tfrc_lh_cleanup(&hc->rx_li_hist);
}
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
}
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct tfrc_rx_info rx_info;
const void *val;
switch (optname) {
case DCCP_SOCKOPT_CCID_RX_INFO:
if (len < sizeof(rx_info))
return -EINVAL;
rx_info.tfrcrx_x_recv = hc->rx_x_recv;
rx_info.tfrcrx_rtt = hc->rx_rtt;
rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
len = sizeof(rx_info);
val = &rx_info;
break;
default:
return -ENOPROTOOPT;
}
if (put_user(len, optlen) || copy_to_user(optval, val, len))
return -EFAULT;
return 0;
}
struct ccid_operations ccid3_ops = {
.ccid_id = DCCPC_CCID3,
.ccid_name = "TCP-Friendly Rate Control",
.ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
.ccid_hc_tx_init = ccid3_hc_tx_init,
.ccid_hc_tx_exit = ccid3_hc_tx_exit,
.ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
.ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
.ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
.ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
.ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
.ccid_hc_rx_init = ccid3_hc_rx_init,
.ccid_hc_rx_exit = ccid3_hc_rx_exit,
.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
.ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
.ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
.ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
.ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
.ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
};
#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
module_param(ccid3_debug, bool, 0644);
MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
#endif

160
net/dccp/ccids/ccid3.h Normal file
View file

@ -0,0 +1,160 @@
/*
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
* or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* Changes to meet Linux coding standards, to make it meet latest ccid3 draft
* and to make it work as a loadable module in the DCCP stack written by
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
*
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DCCP_CCID3_H_
#define _DCCP_CCID3_H_
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/types.h>
#include <linux/tfrc.h>
#include "lib/tfrc.h"
#include "../ccid.h"
/* Two seconds as per RFC 5348, 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
/*
* The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
* rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
* Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
* resolution of HZ < 500 means that the error is below one timer tick (t_gran)
* when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
*/
#if (HZ >= 500)
# define TFRC_T_DELTA USEC_PER_MSEC
#else
# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
#endif
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193,
TFRC_OPT_RECEIVE_RATE = 194,
};
/* TFRC sender states */
enum ccid3_hc_tx_states {
TFRC_SSTATE_NO_SENT = 1,
TFRC_SSTATE_NO_FBACK,
TFRC_SSTATE_FBACK,
};
/**
* struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
* @tx_x: Current sending rate in 64 * bytes per second
* @tx_x_recv: Receive rate in 64 * bytes per second
* @tx_x_calc: Calculated rate in bytes per second
* @tx_rtt: Estimate of current round trip time in usecs
* @tx_p: Current loss event rate (0-1) scaled by 1000000
* @tx_s: Packet size in bytes
* @tx_t_rto: Nofeedback Timer setting in usecs
* @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
* @tx_state: Sender state, one of %ccid3_hc_tx_states
* @tx_last_win_count: Last window counter sent
* @tx_t_last_win_count: Timestamp of earliest packet
* with last_win_count value sent
* @tx_no_feedback_timer: Handle to no feedback timer
* @tx_t_ld: Time last doubled during slow start
* @tx_t_nom: Nominal send time of next packet
* @tx_hist: Packet history
*/
struct ccid3_hc_tx_sock {
u64 tx_x;
u64 tx_x_recv;
u32 tx_x_calc;
u32 tx_rtt;
u32 tx_p;
u32 tx_t_rto;
u32 tx_t_ipi;
u16 tx_s;
enum ccid3_hc_tx_states tx_state:8;
u8 tx_last_win_count;
ktime_t tx_t_last_win_count;
struct timer_list tx_no_feedback_timer;
ktime_t tx_t_ld;
ktime_t tx_t_nom;
struct tfrc_tx_hist_entry *tx_hist;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
{
struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
BUG_ON(hctx == NULL);
return hctx;
}
/* TFRC receiver states */
enum ccid3_hc_rx_states {
TFRC_RSTATE_NO_DATA = 1,
TFRC_RSTATE_DATA,
};
/**
* struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
* @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
* @rx_state: Receiver state, one of %ccid3_hc_rx_states
* @rx_bytes_recv: Total sum of DCCP payload bytes
* @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
* @rx_rtt: Receiver estimate of RTT
* @rx_tstamp_last_feedback: Time at which last feedback was sent
* @rx_hist: Packet history (loss detection + RTT sampling)
* @rx_li_hist: Loss Interval database
* @rx_s: Received packet size in bytes
* @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
*/
struct ccid3_hc_rx_sock {
u8 rx_last_counter:4;
enum ccid3_hc_rx_states rx_state:8;
u32 rx_bytes_recv;
u32 rx_x_recv;
u32 rx_rtt;
ktime_t rx_tstamp_last_feedback;
struct tfrc_rx_hist rx_hist;
struct tfrc_loss_hist rx_li_hist;
u16 rx_s;
#define rx_pinv rx_li_hist.i_mean
};
static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
{
struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
BUG_ON(hcrx == NULL);
return hcrx;
}
#endif /* _DCCP_CCID3_H_ */

View file

@ -0,0 +1,185 @@
/*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <net/sock.h>
#include "tfrc.h"
static struct kmem_cache *tfrc_lh_slab __read_mostly;
/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
/* implements LIFO semantics on the array */
static inline u8 LIH_INDEX(const u8 ctr)
{
return LIH_SIZE - 1 - (ctr % LIH_SIZE);
}
/* the `counter' index always points at the next entry to be populated */
static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
{
return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL;
}
/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
{
BUG_ON(i >= lh->counter);
return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
}
/*
* On-demand allocation and de-allocation of entries
*/
static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
{
if (lh->ring[LIH_INDEX(lh->counter)] == NULL)
lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab,
GFP_ATOMIC);
return lh->ring[LIH_INDEX(lh->counter)];
}
void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
{
if (!tfrc_lh_is_initialised(lh))
return;
for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
kmem_cache_free(tfrc_lh_slab,
lh->ring[LIH_INDEX(lh->counter)]);
lh->ring[LIH_INDEX(lh->counter)] = NULL;
}
}
static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
{
u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
if (k <= 0)
return;
for (i = 0; i <= k; i++) {
i_i = tfrc_lh_get_interval(lh, i);
if (i < k) {
i_tot0 += i_i * tfrc_lh_weights[i];
w_tot += tfrc_lh_weights[i];
}
if (i > 0)
i_tot1 += i_i * tfrc_lh_weights[i-1];
}
lh->i_mean = max(i_tot0, i_tot1) / w_tot;
}
/**
* tfrc_lh_update_i_mean - Update the `open' loss interval I_0
* For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
*/
u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
{
struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
u32 old_i_mean = lh->i_mean;
s64 len;
if (cur == NULL) /* not initialised */
return 0;
len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
return 0;
if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
/*
* Implements RFC 4342, 10.2:
* If a packet S (skb) exists whose seqno comes `after' the one
* starting the current loss interval (cur) and if the modulo-16
* distance from C(cur) to C(S) is greater than 4, consider all
* subsequent packets as belonging to a new loss interval. This
* test is necessary since CCVal may wrap between intervals.
*/
cur->li_is_closed = 1;
if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
return 0;
cur->li_length = len;
tfrc_lh_calc_i_mean(lh);
return lh->i_mean < old_i_mean;
}
/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
struct tfrc_rx_hist_entry *new_loss)
{
return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 &&
(cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
}
/**
* tfrc_lh_interval_add - Insert new record into the Loss Interval database
* @lh: Loss Interval database
* @rh: Receive history containing a fresh loss event
* @calc_first_li: Caller-dependent routine to compute length of first interval
* @sk: Used by @calc_first_li in caller-specific way (subtyping)
*
* Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
*/
int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
return 0;
new = tfrc_lh_demand_next(lh);
if (unlikely(new == NULL)) {
DCCP_CRIT("Cannot allocate/add loss record.");
return 0;
}
new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
new->li_is_closed = 0;
if (++lh->counter == 1)
lh->i_mean = new->li_length = (*calc_first_li)(sk);
else {
cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
new->li_length = dccp_delta_seqno(new->li_seqno,
tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
if (lh->counter > (2*LIH_SIZE))
lh->counter -= LIH_SIZE;
tfrc_lh_calc_i_mean(lh);
}
return 1;
}
int __init tfrc_li_init(void)
{
tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
sizeof(struct tfrc_loss_interval), 0,
SLAB_HWCACHE_ALIGN, NULL);
return tfrc_lh_slab == NULL ? -ENOBUFS : 0;
}
void tfrc_li_exit(void)
{
if (tfrc_lh_slab != NULL) {
kmem_cache_destroy(tfrc_lh_slab);
tfrc_lh_slab = NULL;
}
}

View file

@ -0,0 +1,73 @@
#ifndef _DCCP_LI_HIST_
#define _DCCP_LI_HIST_
/*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/slab.h>
/*
* Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
* NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
*/
#define NINTERVAL 8
#define LIH_SIZE (NINTERVAL + 1)
/**
* tfrc_loss_interval - Loss history record for TFRC-based protocols
* @li_seqno: Highest received seqno before the start of loss
* @li_ccval: The CCVal belonging to @li_seqno
* @li_is_closed: Whether @li_seqno is older than 1 RTT
* @li_length: Loss interval sequence length
*/
struct tfrc_loss_interval {
u64 li_seqno:48,
li_ccval:4,
li_is_closed:1;
u32 li_length;
};
/**
* tfrc_loss_hist - Loss record database
* @ring: Circular queue managed in LIFO manner
* @counter: Current count of entries (can be more than %LIH_SIZE)
* @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
*/
struct tfrc_loss_hist {
struct tfrc_loss_interval *ring[LIH_SIZE];
u8 counter;
u32 i_mean;
};
static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
{
memset(lh, 0, sizeof(struct tfrc_loss_hist));
}
static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
{
return lh->counter > 0;
}
static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
{
return min(lh->counter, (u8)LIH_SIZE);
}
struct tfrc_rx_hist;
int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
u32 (*first_li)(struct sock *), struct sock *);
u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
#endif /* _DCCP_LI_HIST_ */

View file

@ -0,0 +1,449 @@
/*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
* or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* Changes to meet Linux coding standards, to make it meet latest ccid3 draft
* and to make it work as a loadable module in the DCCP stack written by
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
*
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/string.h>
#include <linux/slab.h>
#include "packet_history.h"
#include "../../dccp.h"
/*
* Transmitter History Routines
*/
static struct kmem_cache *tfrc_tx_hist_slab;
int __init tfrc_tx_packet_history_init(void)
{
tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
sizeof(struct tfrc_tx_hist_entry),
0, SLAB_HWCACHE_ALIGN, NULL);
return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
}
void tfrc_tx_packet_history_exit(void)
{
if (tfrc_tx_hist_slab != NULL) {
kmem_cache_destroy(tfrc_tx_hist_slab);
tfrc_tx_hist_slab = NULL;
}
}
int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
{
struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
if (entry == NULL)
return -ENOBUFS;
entry->seqno = seqno;
entry->stamp = ktime_get_real();
entry->next = *headp;
*headp = entry;
return 0;
}
void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
{
struct tfrc_tx_hist_entry *head = *headp;
while (head != NULL) {
struct tfrc_tx_hist_entry *next = head->next;
kmem_cache_free(tfrc_tx_hist_slab, head);
head = next;
}
*headp = NULL;
}
/*
* Receiver History Routines
*/
static struct kmem_cache *tfrc_rx_hist_slab;
int __init tfrc_rx_packet_history_init(void)
{
tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
sizeof(struct tfrc_rx_hist_entry),
0, SLAB_HWCACHE_ALIGN, NULL);
return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
}
void tfrc_rx_packet_history_exit(void)
{
if (tfrc_rx_hist_slab != NULL) {
kmem_cache_destroy(tfrc_rx_hist_slab);
tfrc_rx_hist_slab = NULL;
}
}
static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
const struct sk_buff *skb,
const u64 ndp)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
entry->tfrchrx_ccval = dh->dccph_ccval;
entry->tfrchrx_type = dh->dccph_type;
entry->tfrchrx_ndp = ndp;
entry->tfrchrx_tstamp = ktime_get_real();
}
void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
const struct sk_buff *skb,
const u64 ndp)
{
struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
}
/* has the packet contained in skb been seen before? */
int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
{
const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq;
int i;
if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0)
return 1;
for (i = 1; i <= h->loss_count; i++)
if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq)
return 1;
return 0;
}
static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
{
const u8 idx_a = tfrc_rx_hist_index(h, a),
idx_b = tfrc_rx_hist_index(h, b);
struct tfrc_rx_hist_entry *tmp = h->ring[idx_a];
h->ring[idx_a] = h->ring[idx_b];
h->ring[idx_b] = tmp;
}
/*
* Private helper functions for loss detection.
*
* In the descriptions, `Si' refers to the sequence number of entry number i,
* whose NDP count is `Ni' (lower case is used for variables).
* Note: All __xxx_loss functions expect that a test against duplicates has been
* performed already: the seqno of the skb must not be less than the seqno
* of loss_prev; and it must not equal that of any valid history entry.
*/
static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
{
u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
s1 = DCCP_SKB_CB(skb)->dccpd_seq;
if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
h->loss_count = 1;
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
}
}
static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
{
u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
s2 = DCCP_SKB_CB(skb)->dccpd_seq;
if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
h->loss_count = 2;
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
return;
}
/* S0 < S2 < S1 */
if (dccp_loss_free(s0, s2, n2)) {
u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
if (dccp_loss_free(s2, s1, n1)) {
/* hole is filled: S0, S2, and S1 are consecutive */
h->loss_count = 0;
h->loss_start = tfrc_rx_hist_index(h, 1);
} else
/* gap between S2 and S1: just update loss_prev */
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
} else { /* gap between S0 and S2 */
/*
* Reorder history to insert S2 between S0 and S1
*/
tfrc_rx_hist_swap(h, 0, 3);
h->loss_start = tfrc_rx_hist_index(h, 3);
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
h->loss_count = 2;
}
}
/* return 1 if a new loss event has been identified */
static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
{
u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
s3 = DCCP_SKB_CB(skb)->dccpd_seq;
if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
h->loss_count = 3;
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
return 1;
}
/* S3 < S2 */
if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
/*
* Reorder history to insert S3 between S1 and S2
*/
tfrc_rx_hist_swap(h, 2, 3);
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
h->loss_count = 3;
return 1;
}
/* S0 < S3 < S1 */
if (dccp_loss_free(s0, s3, n3)) {
u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
if (dccp_loss_free(s3, s1, n1)) {
/* hole between S0 and S1 filled by S3 */
u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
if (dccp_loss_free(s1, s2, n2)) {
/* entire hole filled by S0, S3, S1, S2 */
h->loss_start = tfrc_rx_hist_index(h, 2);
h->loss_count = 0;
} else {
/* gap remains between S1 and S2 */
h->loss_start = tfrc_rx_hist_index(h, 1);
h->loss_count = 1;
}
} else /* gap exists between S3 and S1, loss_count stays at 2 */
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
return 0;
}
/*
* The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
* Reorder history to insert S3 between S0 and S1.
*/
tfrc_rx_hist_swap(h, 0, 3);
h->loss_start = tfrc_rx_hist_index(h, 3);
tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
h->loss_count = 3;
return 1;
}
/* recycle RX history records to continue loss detection if necessary */
static void __three_after_loss(struct tfrc_rx_hist *h)
{
/*
* At this stage we know already that there is a gap between S0 and S1
* (since S0 was the highest sequence number received before detecting
* the loss). To recycle the loss record, it is thus only necessary to
* check for other possible gaps between S1/S2 and between S2/S3.
*/
u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
if (dccp_loss_free(s1, s2, n2)) {
if (dccp_loss_free(s2, s3, n3)) {
/* no gap between S2 and S3: entire hole is filled */
h->loss_start = tfrc_rx_hist_index(h, 3);
h->loss_count = 0;
} else {
/* gap between S2 and S3 */
h->loss_start = tfrc_rx_hist_index(h, 2);
h->loss_count = 1;
}
} else { /* gap between S1 and S2 */
h->loss_start = tfrc_rx_hist_index(h, 1);
h->loss_count = 2;
}
}
/**
* tfrc_rx_handle_loss - Loss detection and further processing
* @h: The non-empty RX history object
* @lh: Loss Intervals database to update
* @skb: Currently received packet
* @ndp: The NDP count belonging to @skb
* @calc_first_li: Caller-dependent computation of first loss interval in @lh
* @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
*
* Chooses action according to pending loss, updates LI database when a new
* loss was detected, and does required post-processing. Returns 1 when caller
* should send feedback, 0 otherwise.
* Since it also takes care of reordering during loss detection and updates the
* records accordingly, the caller should not perform any more RX history
* operations when loss_count is greater than 0 after calling this function.
*/
int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
struct tfrc_loss_hist *lh,
struct sk_buff *skb, const u64 ndp,
u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
int is_new_loss = 0;
if (h->loss_count == 0) {
__do_track_loss(h, skb, ndp);
} else if (h->loss_count == 1) {
__one_after_loss(h, skb, ndp);
} else if (h->loss_count != 2) {
DCCP_BUG("invalid loss_count %d", h->loss_count);
} else if (__two_after_loss(h, skb, ndp)) {
/*
* Update Loss Interval database and recycle RX records
*/
is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
__three_after_loss(h);
}
return is_new_loss;
}
int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
{
int i;
for (i = 0; i <= TFRC_NDUPACK; i++) {
h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
if (h->ring[i] == NULL)
goto out_free;
}
h->loss_count = h->loss_start = 0;
return 0;
out_free:
while (i-- != 0) {
kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
h->ring[i] = NULL;
}
return -ENOBUFS;
}
void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
{
int i;
for (i = 0; i <= TFRC_NDUPACK; ++i)
if (h->ring[i] != NULL) {
kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
h->ring[i] = NULL;
}
}
/**
* tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
{
return h->ring[0];
}
/**
* tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
{
return h->ring[h->rtt_sample_prev];
}
/**
* tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
* Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
* to compute a sample with given data - calling function should check this.
*/
u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
{
u32 sample = 0,
delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
if (h->rtt_sample_prev == 2) { /* previous candidate stored */
sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
if (sample)
sample = 4 / sample *
ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
else /*
* FIXME: This condition is in principle not
* possible but occurs when CCID is used for
* two-way data traffic. I have tried to trace
* it, but the cause does not seem to be here.
*/
DCCP_BUG("please report to dccp@vger.kernel.org"
" => prev = %u, last = %u",
tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
} else if (delta_v < 1) {
h->rtt_sample_prev = 1;
goto keep_ref_for_next_time;
}
} else if (delta_v == 4) /* optimal match */
sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
else { /* suboptimal match */
h->rtt_sample_prev = 2;
goto keep_ref_for_next_time;
}
if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
DCCP_WARN("RTT sample %u too large, using max\n", sample);
sample = DCCP_SANE_RTT_MAX;
}
h->rtt_sample_prev = 0; /* use current entry as next reference */
keep_ref_for_next_time:
return sample;
}

View file

@ -0,0 +1,155 @@
/*
* Packet RX/TX history data structures and routines for TFRC-based protocols.
*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
* or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* Changes to meet Linux coding standards, to make it meet latest ccid3 draft
* and to make it work as a loadable module in the DCCP stack written by
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
*
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DCCP_PKT_HIST_
#define _DCCP_PKT_HIST_
#include <linux/list.h>
#include <linux/slab.h>
#include "tfrc.h"
/**
* tfrc_tx_hist_entry - Simple singly-linked TX history list
* @next: next oldest entry (LIFO order)
* @seqno: sequence number of this entry
* @stamp: send time of packet with sequence number @seqno
*/
struct tfrc_tx_hist_entry {
struct tfrc_tx_hist_entry *next;
u64 seqno;
ktime_t stamp;
};
static inline struct tfrc_tx_hist_entry *
tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
{
while (head != NULL && head->seqno != seqno)
head = head->next;
return head;
}
int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
/* Subtraction a-b modulo-16, respects circular wrap-around */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
#define TFRC_NDUPACK 3
/**
* tfrc_rx_hist_entry - Store information about a single received packet
* @tfrchrx_seqno: DCCP packet sequence number
* @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
* @tfrchrx_ndp: the NDP count (if any) of the packet
* @tfrchrx_tstamp: actual receive time of packet
*/
struct tfrc_rx_hist_entry {
u64 tfrchrx_seqno:48,
tfrchrx_ccval:4,
tfrchrx_type:4;
u64 tfrchrx_ndp:48;
ktime_t tfrchrx_tstamp;
};
/**
* tfrc_rx_hist - RX history structure for TFRC-based protocols
* @ring: Packet history for RTT sampling and loss detection
* @loss_count: Number of entries in circular history
* @loss_start: Movable index (for loss detection)
* @rtt_sample_prev: Used during RTT sampling, points to candidate entry
*/
struct tfrc_rx_hist {
struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
u8 loss_count:2,
loss_start:2;
#define rtt_sample_prev loss_start
};
/**
* tfrc_rx_hist_index - index to reach n-th entry after loss_start
*/
static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
{
return (h->loss_start + n) & TFRC_NDUPACK;
}
/**
* tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
{
return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
}
/**
* tfrc_rx_hist_entry - return the n-th history entry after loss_start
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
{
return h->ring[tfrc_rx_hist_index(h, n)];
}
/**
* tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
*/
static inline struct tfrc_rx_hist_entry *
tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
{
return h->ring[h->loss_start];
}
/* indicate whether previously a packet was detected missing */
static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
{
return h->loss_count > 0;
}
void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
const u64 ndp);
int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
struct tfrc_loss_hist;
int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
struct sk_buff *skb, const u64 ndp,
u32 (*first_li)(struct sock *sk), struct sock *sk);
u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
#endif /* _DCCP_PKT_HIST_ */

45
net/dccp/ccids/lib/tfrc.c Normal file
View file

@ -0,0 +1,45 @@
/*
* TFRC library initialisation
*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
*/
#include <linux/moduleparam.h>
#include "tfrc.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
bool tfrc_debug;
module_param(tfrc_debug, bool, 0644);
MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
#endif
int __init tfrc_lib_init(void)
{
int rc = tfrc_li_init();
if (rc)
goto out;
rc = tfrc_tx_packet_history_init();
if (rc)
goto out_free_loss_intervals;
rc = tfrc_rx_packet_history_init();
if (rc)
goto out_free_tx_history;
return 0;
out_free_tx_history:
tfrc_tx_packet_history_exit();
out_free_loss_intervals:
tfrc_li_exit();
out:
return rc;
}
void tfrc_lib_exit(void)
{
tfrc_rx_packet_history_exit();
tfrc_tx_packet_history_exit();
tfrc_li_exit();
}

77
net/dccp/ccids/lib/tfrc.h Normal file
View file

@ -0,0 +1,77 @@
#ifndef _TFRC_H_
#define _TFRC_H_
/*
* Copyright (c) 2007 The University of Aberdeen, Scotland, UK
* Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/types.h>
#include <linux/math64.h>
#include "../../dccp.h"
/* internal includes that this library exports: */
#include "loss_interval.h"
#include "packet_history.h"
#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
extern bool tfrc_debug;
#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
#else
#define tfrc_pr_debug(format, a...)
#endif
/* integer-arithmetic divisions of type (a * 1000000)/b */
static inline u64 scaled_div(u64 a, u64 b)
{
BUG_ON(b == 0);
return div64_u64(a * 1000000, b);
}
static inline u32 scaled_div32(u64 a, u64 b)
{
u64 result = scaled_div(a, b);
if (result > UINT_MAX) {
DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
(unsigned long long)a, (unsigned long long)b);
return UINT_MAX;
}
return result;
}
/**
* tfrc_ewma - Exponentially weighted moving average
* @weight: Weight to be used as damping factor, in units of 1/10
*/
static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
{
return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}
u32 tfrc_calc_x(u16 s, u32 R, u32 p);
u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
int tfrc_tx_packet_history_init(void);
void tfrc_tx_packet_history_exit(void);
int tfrc_rx_packet_history_init(void);
void tfrc_rx_packet_history_exit(void);
int tfrc_li_init(void);
void tfrc_li_exit(void);
#ifdef CONFIG_IP_DCCP_TFRC_LIB
int tfrc_lib_init(void);
void tfrc_lib_exit(void);
#else
#define tfrc_lib_init() (0)
#define tfrc_lib_exit()
#endif
#endif /* _TFRC_H_ */

View file

@ -0,0 +1,705 @@
/*
* Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
* Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/module.h>
#include "../../dccp.h"
#include "tfrc.h"
#define TFRC_CALC_X_ARRSIZE 500
#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
/*
TFRC TCP Reno Throughput Equation Lookup Table for f(p)
The following two-column lookup table implements a part of the TCP throughput
equation from [RFC 3448, sec. 3.1]:
s
X_calc = --------------------------------------------------------------
R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
Where:
X is the transmit rate in bytes/second
s is the packet size in bytes
R is the round trip time in seconds
p is the loss event rate, between 0 and 1.0, of the number of loss
events as a fraction of the number of packets transmitted
t_RTO is the TCP retransmission timeout value in seconds
b is the number of packets acknowledged by a single TCP ACK
We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
s
X_calc = -------------------------------------------------------
R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
which we can break down into:
s
X_calc = ---------
R * f(p)
where f(p) is given for 0 < p <= 1 by:
f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
Since this is kernel code, floating-point arithmetic is avoided in favour of
integer arithmetic. This means that nearly all fractional parameters are
scaled by 1000000:
* the parameters p and R
* the return result f(p)
The lookup table therefore actually tabulates the following function g(q):
g(q) = 1000000 * f(q/1000000)
Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
granularity for the practically more relevant case of small values of p (up to
5%), the second column is used; the first one ranges up to 100%. This split
corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
determines the smallest resolution possible with this lookup table:
TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
The entire table is generated by:
for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
}
With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
lookup[M][0] = g(1000000) = 1000000 * f(100%)
lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
In summary, the two columns represent f(p) for the following ranges:
* The first column is for 0.002 <= p <= 1.0
* The second column is for 0.0001 <= p <= 0.05
Where the columns overlap, the second (finer-grained) is given preference,
i.e. the first column is used only for p >= 0.05.
*/
static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 37172, 8172 },
{ 53499, 11567 },
{ 66664, 14180 },
{ 78298, 16388 },
{ 89021, 18339 },
{ 99147, 20108 },
{ 108858, 21738 },
{ 118273, 23260 },
{ 127474, 24693 },
{ 136520, 26052 },
{ 145456, 27348 },
{ 154316, 28589 },
{ 163130, 29783 },
{ 171919, 30935 },
{ 180704, 32049 },
{ 189502, 33130 },
{ 198328, 34180 },
{ 207194, 35202 },
{ 216114, 36198 },
{ 225097, 37172 },
{ 234153, 38123 },
{ 243294, 39055 },
{ 252527, 39968 },
{ 261861, 40864 },
{ 271305, 41743 },
{ 280866, 42607 },
{ 290553, 43457 },
{ 300372, 44293 },
{ 310333, 45117 },
{ 320441, 45929 },
{ 330705, 46729 },
{ 341131, 47518 },
{ 351728, 48297 },
{ 362501, 49066 },
{ 373460, 49826 },
{ 384609, 50577 },
{ 395958, 51320 },
{ 407513, 52054 },
{ 419281, 52780 },
{ 431270, 53499 },
{ 443487, 54211 },
{ 455940, 54916 },
{ 468635, 55614 },
{ 481581, 56306 },
{ 494785, 56991 },
{ 508254, 57671 },
{ 521996, 58345 },
{ 536019, 59014 },
{ 550331, 59677 },
{ 564939, 60335 },
{ 579851, 60988 },
{ 595075, 61636 },
{ 610619, 62279 },
{ 626491, 62918 },
{ 642700, 63553 },
{ 659253, 64183 },
{ 676158, 64809 },
{ 693424, 65431 },
{ 711060, 66050 },
{ 729073, 66664 },
{ 747472, 67275 },
{ 766266, 67882 },
{ 785464, 68486 },
{ 805073, 69087 },
{ 825103, 69684 },
{ 845562, 70278 },
{ 866460, 70868 },
{ 887805, 71456 },
{ 909606, 72041 },
{ 931873, 72623 },
{ 954614, 73202 },
{ 977839, 73778 },
{ 1001557, 74352 },
{ 1025777, 74923 },
{ 1050508, 75492 },
{ 1075761, 76058 },
{ 1101544, 76621 },
{ 1127867, 77183 },
{ 1154739, 77741 },
{ 1182172, 78298 },
{ 1210173, 78852 },
{ 1238753, 79405 },
{ 1267922, 79955 },
{ 1297689, 80503 },
{ 1328066, 81049 },
{ 1359060, 81593 },
{ 1390684, 82135 },
{ 1422947, 82675 },
{ 1455859, 83213 },
{ 1489430, 83750 },
{ 1523671, 84284 },
{ 1558593, 84817 },
{ 1594205, 85348 },
{ 1630518, 85878 },
{ 1667543, 86406 },
{ 1705290, 86932 },
{ 1743770, 87457 },
{ 1782994, 87980 },
{ 1822973, 88501 },
{ 1863717, 89021 },
{ 1905237, 89540 },
{ 1947545, 90057 },
{ 1990650, 90573 },
{ 2034566, 91087 },
{ 2079301, 91600 },
{ 2124869, 92111 },
{ 2171279, 92622 },
{ 2218543, 93131 },
{ 2266673, 93639 },
{ 2315680, 94145 },
{ 2365575, 94650 },
{ 2416371, 95154 },
{ 2468077, 95657 },
{ 2520707, 96159 },
{ 2574271, 96660 },
{ 2628782, 97159 },
{ 2684250, 97658 },
{ 2740689, 98155 },
{ 2798110, 98651 },
{ 2856524, 99147 },
{ 2915944, 99641 },
{ 2976382, 100134 },
{ 3037850, 100626 },
{ 3100360, 101117 },
{ 3163924, 101608 },
{ 3228554, 102097 },
{ 3294263, 102586 },
{ 3361063, 103073 },
{ 3428966, 103560 },
{ 3497984, 104045 },
{ 3568131, 104530 },
{ 3639419, 105014 },
{ 3711860, 105498 },
{ 3785467, 105980 },
{ 3860253, 106462 },
{ 3936229, 106942 },
{ 4013410, 107422 },
{ 4091808, 107902 },
{ 4171435, 108380 },
{ 4252306, 108858 },
{ 4334431, 109335 },
{ 4417825, 109811 },
{ 4502501, 110287 },
{ 4588472, 110762 },
{ 4675750, 111236 },
{ 4764349, 111709 },
{ 4854283, 112182 },
{ 4945564, 112654 },
{ 5038206, 113126 },
{ 5132223, 113597 },
{ 5227627, 114067 },
{ 5324432, 114537 },
{ 5422652, 115006 },
{ 5522299, 115474 },
{ 5623389, 115942 },
{ 5725934, 116409 },
{ 5829948, 116876 },
{ 5935446, 117342 },
{ 6042439, 117808 },
{ 6150943, 118273 },
{ 6260972, 118738 },
{ 6372538, 119202 },
{ 6485657, 119665 },
{ 6600342, 120128 },
{ 6716607, 120591 },
{ 6834467, 121053 },
{ 6953935, 121514 },
{ 7075025, 121976 },
{ 7197752, 122436 },
{ 7322131, 122896 },
{ 7448175, 123356 },
{ 7575898, 123815 },
{ 7705316, 124274 },
{ 7836442, 124733 },
{ 7969291, 125191 },
{ 8103877, 125648 },
{ 8240216, 126105 },
{ 8378321, 126562 },
{ 8518208, 127018 },
{ 8659890, 127474 },
{ 8803384, 127930 },
{ 8948702, 128385 },
{ 9095861, 128840 },
{ 9244875, 129294 },
{ 9395760, 129748 },
{ 9548529, 130202 },
{ 9703198, 130655 },
{ 9859782, 131108 },
{ 10018296, 131561 },
{ 10178755, 132014 },
{ 10341174, 132466 },
{ 10505569, 132917 },
{ 10671954, 133369 },
{ 10840345, 133820 },
{ 11010757, 134271 },
{ 11183206, 134721 },
{ 11357706, 135171 },
{ 11534274, 135621 },
{ 11712924, 136071 },
{ 11893673, 136520 },
{ 12076536, 136969 },
{ 12261527, 137418 },
{ 12448664, 137867 },
{ 12637961, 138315 },
{ 12829435, 138763 },
{ 13023101, 139211 },
{ 13218974, 139658 },
{ 13417071, 140106 },
{ 13617407, 140553 },
{ 13819999, 140999 },
{ 14024862, 141446 },
{ 14232012, 141892 },
{ 14441465, 142339 },
{ 14653238, 142785 },
{ 14867346, 143230 },
{ 15083805, 143676 },
{ 15302632, 144121 },
{ 15523842, 144566 },
{ 15747453, 145011 },
{ 15973479, 145456 },
{ 16201939, 145900 },
{ 16432847, 146345 },
{ 16666221, 146789 },
{ 16902076, 147233 },
{ 17140429, 147677 },
{ 17381297, 148121 },
{ 17624696, 148564 },
{ 17870643, 149007 },
{ 18119154, 149451 },
{ 18370247, 149894 },
{ 18623936, 150336 },
{ 18880241, 150779 },
{ 19139176, 151222 },
{ 19400759, 151664 },
{ 19665007, 152107 },
{ 19931936, 152549 },
{ 20201564, 152991 },
{ 20473907, 153433 },
{ 20748982, 153875 },
{ 21026807, 154316 },
{ 21307399, 154758 },
{ 21590773, 155199 },
{ 21876949, 155641 },
{ 22165941, 156082 },
{ 22457769, 156523 },
{ 22752449, 156964 },
{ 23049999, 157405 },
{ 23350435, 157846 },
{ 23653774, 158287 },
{ 23960036, 158727 },
{ 24269236, 159168 },
{ 24581392, 159608 },
{ 24896521, 160049 },
{ 25214642, 160489 },
{ 25535772, 160929 },
{ 25859927, 161370 },
{ 26187127, 161810 },
{ 26517388, 162250 },
{ 26850728, 162690 },
{ 27187165, 163130 },
{ 27526716, 163569 },
{ 27869400, 164009 },
{ 28215234, 164449 },
{ 28564236, 164889 },
{ 28916423, 165328 },
{ 29271815, 165768 },
{ 29630428, 166208 },
{ 29992281, 166647 },
{ 30357392, 167087 },
{ 30725779, 167526 },
{ 31097459, 167965 },
{ 31472452, 168405 },
{ 31850774, 168844 },
{ 32232445, 169283 },
{ 32617482, 169723 },
{ 33005904, 170162 },
{ 33397730, 170601 },
{ 33792976, 171041 },
{ 34191663, 171480 },
{ 34593807, 171919 },
{ 34999428, 172358 },
{ 35408544, 172797 },
{ 35821174, 173237 },
{ 36237335, 173676 },
{ 36657047, 174115 },
{ 37080329, 174554 },
{ 37507197, 174993 },
{ 37937673, 175433 },
{ 38371773, 175872 },
{ 38809517, 176311 },
{ 39250924, 176750 },
{ 39696012, 177190 },
{ 40144800, 177629 },
{ 40597308, 178068 },
{ 41053553, 178507 },
{ 41513554, 178947 },
{ 41977332, 179386 },
{ 42444904, 179825 },
{ 42916290, 180265 },
{ 43391509, 180704 },
{ 43870579, 181144 },
{ 44353520, 181583 },
{ 44840352, 182023 },
{ 45331092, 182462 },
{ 45825761, 182902 },
{ 46324378, 183342 },
{ 46826961, 183781 },
{ 47333531, 184221 },
{ 47844106, 184661 },
{ 48358706, 185101 },
{ 48877350, 185541 },
{ 49400058, 185981 },
{ 49926849, 186421 },
{ 50457743, 186861 },
{ 50992759, 187301 },
{ 51531916, 187741 },
{ 52075235, 188181 },
{ 52622735, 188622 },
{ 53174435, 189062 },
{ 53730355, 189502 },
{ 54290515, 189943 },
{ 54854935, 190383 },
{ 55423634, 190824 },
{ 55996633, 191265 },
{ 56573950, 191706 },
{ 57155606, 192146 },
{ 57741621, 192587 },
{ 58332014, 193028 },
{ 58926806, 193470 },
{ 59526017, 193911 },
{ 60129666, 194352 },
{ 60737774, 194793 },
{ 61350361, 195235 },
{ 61967446, 195677 },
{ 62589050, 196118 },
{ 63215194, 196560 },
{ 63845897, 197002 },
{ 64481179, 197444 },
{ 65121061, 197886 },
{ 65765563, 198328 },
{ 66414705, 198770 },
{ 67068508, 199213 },
{ 67726992, 199655 },
{ 68390177, 200098 },
{ 69058085, 200540 },
{ 69730735, 200983 },
{ 70408147, 201426 },
{ 71090343, 201869 },
{ 71777343, 202312 },
{ 72469168, 202755 },
{ 73165837, 203199 },
{ 73867373, 203642 },
{ 74573795, 204086 },
{ 75285124, 204529 },
{ 76001380, 204973 },
{ 76722586, 205417 },
{ 77448761, 205861 },
{ 78179926, 206306 },
{ 78916102, 206750 },
{ 79657310, 207194 },
{ 80403571, 207639 },
{ 81154906, 208084 },
{ 81911335, 208529 },
{ 82672880, 208974 },
{ 83439562, 209419 },
{ 84211402, 209864 },
{ 84988421, 210309 },
{ 85770640, 210755 },
{ 86558080, 211201 },
{ 87350762, 211647 },
{ 88148708, 212093 },
{ 88951938, 212539 },
{ 89760475, 212985 },
{ 90574339, 213432 },
{ 91393551, 213878 },
{ 92218133, 214325 },
{ 93048107, 214772 },
{ 93883493, 215219 },
{ 94724314, 215666 },
{ 95570590, 216114 },
{ 96422343, 216561 },
{ 97279594, 217009 },
{ 98142366, 217457 },
{ 99010679, 217905 },
{ 99884556, 218353 },
{ 100764018, 218801 },
{ 101649086, 219250 },
{ 102539782, 219698 },
{ 103436128, 220147 },
{ 104338146, 220596 },
{ 105245857, 221046 },
{ 106159284, 221495 },
{ 107078448, 221945 },
{ 108003370, 222394 },
{ 108934074, 222844 },
{ 109870580, 223294 },
{ 110812910, 223745 },
{ 111761087, 224195 },
{ 112715133, 224646 },
{ 113675069, 225097 },
{ 114640918, 225548 },
{ 115612702, 225999 },
{ 116590442, 226450 },
{ 117574162, 226902 },
{ 118563882, 227353 },
{ 119559626, 227805 },
{ 120561415, 228258 },
{ 121569272, 228710 },
{ 122583219, 229162 },
{ 123603278, 229615 },
{ 124629471, 230068 },
{ 125661822, 230521 },
{ 126700352, 230974 },
{ 127745083, 231428 },
{ 128796039, 231882 },
{ 129853241, 232336 },
{ 130916713, 232790 },
{ 131986475, 233244 },
{ 133062553, 233699 },
{ 134144966, 234153 },
{ 135233739, 234608 },
{ 136328894, 235064 },
{ 137430453, 235519 },
{ 138538440, 235975 },
{ 139652876, 236430 },
{ 140773786, 236886 },
{ 141901190, 237343 },
{ 143035113, 237799 },
{ 144175576, 238256 },
{ 145322604, 238713 },
{ 146476218, 239170 },
{ 147636442, 239627 },
{ 148803298, 240085 },
{ 149976809, 240542 },
{ 151156999, 241000 },
{ 152343890, 241459 },
{ 153537506, 241917 },
{ 154737869, 242376 },
{ 155945002, 242835 },
{ 157158929, 243294 },
{ 158379673, 243753 },
{ 159607257, 244213 },
{ 160841704, 244673 },
{ 162083037, 245133 },
{ 163331279, 245593 },
{ 164586455, 246054 },
{ 165848586, 246514 },
{ 167117696, 246975 },
{ 168393810, 247437 },
{ 169676949, 247898 },
{ 170967138, 248360 },
{ 172264399, 248822 },
{ 173568757, 249284 },
{ 174880235, 249747 },
{ 176198856, 250209 },
{ 177524643, 250672 },
{ 178857621, 251136 },
{ 180197813, 251599 },
{ 181545242, 252063 },
{ 182899933, 252527 },
{ 184261908, 252991 },
{ 185631191, 253456 },
{ 187007807, 253920 },
{ 188391778, 254385 },
{ 189783129, 254851 },
{ 191181884, 255316 },
{ 192588065, 255782 },
{ 194001698, 256248 },
{ 195422805, 256714 },
{ 196851411, 257181 },
{ 198287540, 257648 },
{ 199731215, 258115 },
{ 201182461, 258582 },
{ 202641302, 259050 },
{ 204107760, 259518 },
{ 205581862, 259986 },
{ 207063630, 260454 },
{ 208553088, 260923 },
{ 210050262, 261392 },
{ 211555174, 261861 },
{ 213067849, 262331 },
{ 214588312, 262800 },
{ 216116586, 263270 },
{ 217652696, 263741 },
{ 219196666, 264211 },
{ 220748520, 264682 },
{ 222308282, 265153 },
{ 223875978, 265625 },
{ 225451630, 266097 },
{ 227035265, 266569 },
{ 228626905, 267041 },
{ 230226576, 267514 },
{ 231834302, 267986 },
{ 233450107, 268460 },
{ 235074016, 268933 },
{ 236706054, 269407 },
{ 238346244, 269881 },
{ 239994613, 270355 },
{ 241651183, 270830 },
{ 243315981, 271305 }
};
/* return largest index i such that fval <= lookup[i][small] */
static inline u32 tfrc_binsearch(u32 fval, u8 small)
{
u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
while (low < high) {
try = (low + high) / 2;
if (fval <= tfrc_calc_x_lookup[try][small])
high = try;
else
low = try + 1;
}
return high;
}
/**
* tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
* @s: packet size in bytes
* @R: RTT scaled by 1000000 (i.e., microseconds)
* @p: loss ratio estimate scaled by 1000000
*
* Returns X_calc in bytes per second (not scaled).
*/
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{
u16 index;
u32 f;
u64 result;
/* check against invalid parameters and divide-by-zero */
BUG_ON(p > 1000000); /* p must not exceed 100% */
BUG_ON(p == 0); /* f(0) = 0, divide by zero */
if (R == 0) { /* possible divide by zero */
DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
return ~0U;
}
if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
DCCP_WARN("Value of p (%d) below resolution. "
"Substituting %d\n", p, TFRC_SMALLEST_P);
index = 0;
} else /* 0.0001 <= p <= 0.05 */
index = p/TFRC_SMALLEST_P - 1;
f = tfrc_calc_x_lookup[index][1];
} else { /* 0.05 < p <= 1.00 */
index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
f = tfrc_calc_x_lookup[index][0];
}
/*
* Compute X = s/(R*f(p)) in bytes per second.
* Since f(p) and R are both scaled by 1000000, we need to multiply by
* 1000000^2. To avoid overflow, the result is computed in two stages.
* This works under almost all reasonable operational conditions, for a
* wide range of parameters. Yet, should some strange combination of
* parameters result in overflow, the use of scaled_div32 will catch
* this and return UINT_MAX - which is a logically adequate consequence.
*/
result = scaled_div(s, R);
return scaled_div32(result, f);
}
/**
* tfrc_calc_x_reverse_lookup - try to find p given f(p)
* @fvalue: function value to match, scaled by 1000000
*
* Returns closest match for p, also scaled by 1000000
*/
u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
{
int index;
if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
return 0;
/* Error cases. */
if (fvalue < tfrc_calc_x_lookup[0][1]) {
DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
return TFRC_SMALLEST_P;
}
if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
return 1000000;
}
if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
index = tfrc_binsearch(fvalue, 1);
return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
}
/* else ... it must be in the coarse-grained column */
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
/**
* tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
* When @loss_event_rate is large, there is a chance that p is truncated to 0.
* To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
*/
u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
{
if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
return 0;
if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
return 1000000;
return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
}

501
net/dccp/dccp.h Normal file
View file

@ -0,0 +1,501 @@
#ifndef _DCCP_H
#define _DCCP_H
/*
* net/dccp/dccp.h
*
* An implementation of the DCCP protocol
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/dccp.h>
#include <linux/ktime.h>
#include <net/snmp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include "ackvec.h"
/*
* DCCP - specific warning and debugging macros.
*/
#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt, \
__func__, ##a)
#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
__FILE__, __LINE__, __func__)
#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
DCCP_BUG("\"%s\" holds (exception!)", \
__stringify(cond)); \
} while (0)
#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
printk(fmt, ##args); \
} while(0)
#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
"%s: " fmt, __func__, ##a)
#ifdef CONFIG_IP_DCCP_DEBUG
extern bool dccp_debug;
#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
#else
#define dccp_pr_debug(format, a...)
#define dccp_pr_debug_cat(format, a...)
#define dccp_debug(format, a...)
#endif
extern struct inet_hashinfo dccp_hashinfo;
extern struct percpu_counter dccp_orphan_count;
void dccp_time_wait(struct sock *sk, int state, int timeo);
/*
* Set safe upper bounds for header and option length. Since Data Offset is 8
* bits (RFC 4340, sec. 5.1), the total header length can never be more than
* 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
* - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
* - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
* Hence a safe upper bound for the maximum option length is 1020-28 = 992
*/
#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
#define DCCP_MAX_PACKET_HDR 28
#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
/* RFC 1122, 4.2.3.1 initial RTO value */
#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
/*
* The maximum back-off value for retransmissions. This is needed for
* - retransmitting client-Requests (sec. 8.1.1),
* - retransmitting Close/CloseReq when closing (sec. 8.3),
* - feature-negotiation retransmission (sec. 6.6.3),
* - Acks in client-PARTOPEN state (sec. 8.1.5).
*/
#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
/*
* RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
*/
#define DCCP_SANE_RTT_MIN 100
#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
/* sysctl variables for DCCP */
extern int sysctl_dccp_request_retries;
extern int sysctl_dccp_retries1;
extern int sysctl_dccp_retries2;
extern int sysctl_dccp_tx_qlen;
extern int sysctl_dccp_sync_ratelimit;
/*
* 48-bit sequence number arithmetic (signed and unsigned)
*/
#define INT48_MIN 0x800000000000LL /* 2^47 */
#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */
#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */
#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x)))
#define ADD48(a, b) (((a) + (b)) & UINT48_MAX)
#define SUB48(a, b) ADD48((a), COMPLEMENT48(b))
static inline void dccp_set_seqno(u64 *seqno, u64 value)
{
*seqno = value & UINT48_MAX;
}
static inline void dccp_inc_seqno(u64 *seqno)
{
*seqno = ADD48(*seqno, 1);
}
/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */
static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
{
u64 delta = SUB48(seqno2, seqno1);
return TO_SIGNED48(delta);
}
/* is seq1 < seq2 ? */
static inline int before48(const u64 seq1, const u64 seq2)
{
return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
}
/* is seq1 > seq2 ? */
#define after48(seq1, seq2) before48(seq2, seq1)
/* is seq2 <= seq1 <= seq3 ? */
static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
{
return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
}
static inline u64 max48(const u64 seq1, const u64 seq2)
{
return after48(seq1, seq2) ? seq1 : seq2;
}
/**
* dccp_loss_count - Approximate the number of lost data packets in a burst loss
* @s1: last known sequence number before the loss ('hole')
* @s2: first sequence number seen after the 'hole'
* @ndp: NDP count on packet with sequence number @s2
*/
static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
{
s64 delta = dccp_delta_seqno(s1, s2);
WARN_ON(delta < 0);
delta -= ndp + 1;
return delta > 0 ? delta : 0;
}
/**
* dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
*/
static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
{
return dccp_loss_count(s1, s2, ndp) == 0;
}
enum {
DCCP_MIB_NUM = 0,
DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
DCCP_MIB_ESTABRESETS, /* EstabResets */
DCCP_MIB_CURRESTAB, /* CurrEstab */
DCCP_MIB_OUTSEGS, /* OutSegs */
DCCP_MIB_OUTRSTS,
DCCP_MIB_ABORTONTIMEOUT,
DCCP_MIB_TIMEOUTS,
DCCP_MIB_ABORTFAILED,
DCCP_MIB_PASSIVEOPENS,
DCCP_MIB_ATTEMPTFAILS,
DCCP_MIB_OUTDATAGRAMS,
DCCP_MIB_INERRS,
DCCP_MIB_OPTMANDATORYERROR,
DCCP_MIB_INVALIDOPT,
__DCCP_MIB_MAX
};
#define DCCP_MIB_MAX __DCCP_MIB_MAX
struct dccp_mib {
unsigned long mibs[DCCP_MIB_MAX];
};
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
/*
* Checksumming routines
*/
static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
{
const struct dccp_hdr* dh = dccp_hdr(skb);
if (dh->dccph_cscov == 0)
return skb->len;
return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
}
static inline void dccp_csum_outgoing(struct sk_buff *skb)
{
unsigned int cov = dccp_csum_coverage(skb);
if (cov >= skb->len)
dccp_hdr(skb)->dccph_cscov = 0;
skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
}
void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
int dccp_retransmit_skb(struct sock *sk);
void dccp_send_ack(struct sock *sk);
void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
struct request_sock *rsk);
void dccp_send_sync(struct sock *sk, const u64 seq,
const enum dccp_pkt_type pkt_type);
/*
* TX Packet Dequeueing Interface
*/
void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
bool dccp_qpolicy_full(struct sock *sk);
void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
struct sk_buff *dccp_qpolicy_top(struct sock *sk);
struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
/*
* TX Packet Output and TX Timers
*/
void dccp_write_xmit(struct sock *sk);
void dccp_write_space(struct sock *sk);
void dccp_flush_write_queue(struct sock *sk, long *time_budget);
void dccp_init_xmit_timers(struct sock *sk);
static inline void dccp_clear_xmit_timers(struct sock *sk)
{
inet_csk_clear_xmit_timers(sk);
}
unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
const char *dccp_packet_name(const int type);
void dccp_set_state(struct sock *sk, const int state);
void dccp_done(struct sock *sk);
int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
struct sk_buff const *skb);
int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *dccp_create_openreq_child(struct sock *sk,
const struct request_sock *req,
const struct sk_buff *skb);
int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst);
struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev);
int dccp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct dccp_hdr *dh, unsigned int len);
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned int len);
int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
void dccp_destroy_sock(struct sock *sk);
void dccp_close(struct sock *sk, long timeout);
struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
struct request_sock *req);
int dccp_connect(struct sock *sk);
int dccp_disconnect(struct sock *sk, int flags);
int dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
int dccp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen);
#ifdef CONFIG_COMPAT
int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen);
#endif
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t size);
int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t len, int nonblock, int flags,
int *addr_len);
void dccp_shutdown(struct sock *sk, int how);
int inet_dccp_listen(struct socket *sock, int backlog);
unsigned int dccp_poll(struct file *file, struct socket *sock,
poll_table *wait);
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
void dccp_send_close(struct sock *sk, const int active);
int dccp_invalid_packet(struct sk_buff *skb);
u32 dccp_sample_rtt(struct sock *sk, long delta);
static inline int dccp_bad_service_code(const struct sock *sk,
const __be32 service)
{
const struct dccp_sock *dp = dccp_sk(sk);
if (dp->dccps_service == service)
return 0;
return !dccp_list_has_service(dp->dccps_service_list, service);
}
/**
* dccp_skb_cb - DCCP per-packet control information
* @dccpd_type: one of %dccp_pkt_type (or unknown)
* @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
* @dccpd_reset_code: one of %dccp_reset_codes
* @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
* @dccpd_opt_len: total length of all options (5.8) in the packet
* @dccpd_seq: sequence number
* @dccpd_ack_seq: acknowledgment number subheader field value
*
* This is used for transmission as well as for reception.
*/
struct dccp_skb_cb {
union {
struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header;
__u8 dccpd_type:4;
__u8 dccpd_ccval:4;
__u8 dccpd_reset_code,
dccpd_reset_data[3];
__u16 dccpd_opt_len;
__u64 dccpd_seq;
__u64 dccpd_ack_seq;
};
#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
/* RFC 4340, sec. 7.7 */
static inline int dccp_non_data_packet(const struct sk_buff *skb)
{
const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
return type == DCCP_PKT_ACK ||
type == DCCP_PKT_CLOSE ||
type == DCCP_PKT_CLOSEREQ ||
type == DCCP_PKT_RESET ||
type == DCCP_PKT_SYNC ||
type == DCCP_PKT_SYNCACK;
}
/* RFC 4340, sec. 7.7 */
static inline int dccp_data_packet(const struct sk_buff *skb)
{
const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
return type == DCCP_PKT_DATA ||
type == DCCP_PKT_DATAACK ||
type == DCCP_PKT_REQUEST ||
type == DCCP_PKT_RESPONSE;
}
static inline int dccp_packet_without_ack(const struct sk_buff *skb)
{
const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
}
#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
{
struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
sizeof(*dh));
dh->dccph_seq2 = 0;
dh->dccph_seq = htons((gss >> 32) & 0xfffff);
dhx->dccph_seq_low = htonl(gss & 0xffffffff);
}
static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
const u64 gsr)
{
dhack->dccph_reserved1 = 0;
dhack->dccph_ack_nr_high = htons(gsr >> 32);
dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
}
static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
if (after48(seq, dp->dccps_gsr))
dp->dccps_gsr = seq;
/* Sequence validity window depends on remote Sequence Window (7.5.1) */
dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
/*
* Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
* 7.5.1 we perform this check beyond the initial handshake: W/W' are
* always > 32, so for the first W/W' packets in the lifetime of a
* connection we always have to adjust SWL.
* A second reason why we are doing this is that the window depends on
* the feature-remote value of Sequence Window: nothing stops the peer
* from updating this value while we are busy adjusting SWL for the
* first W packets (we would have to count from scratch again then).
* Therefore it is safer to always make sure that the Sequence Window
* is not artificially extended by a peer who grows SWL downwards by
* continually updating the feature-remote Sequence-Window.
* If sequence numbers wrap it is bad luck. But that will take a while
* (48 bit), and this measure prevents Sequence-number attacks.
*/
if (before48(dp->dccps_swl, dp->dccps_isr))
dp->dccps_swl = dp->dccps_isr;
dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
static inline void dccp_update_gss(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
dp->dccps_gss = seq;
/* Ack validity window depends on local Sequence Window value (7.5.1) */
dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
/* Adjust AWL so that it is not below ISS - see comment above for SWL */
if (before48(dp->dccps_awl, dp->dccps_iss))
dp->dccps_awl = dp->dccps_iss;
dp->dccps_awh = dp->dccps_gss;
}
static inline int dccp_ackvec_pending(const struct sock *sk)
{
return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
!dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
}
static inline int dccp_ack_pending(const struct sock *sk)
{
return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
int dccp_feat_finalise_settings(struct dccp_sock *dp);
int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
struct sk_buff *skb);
int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
void dccp_feat_list_purge(struct list_head *fn_list);
int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
u32 dccp_timestamp(void);
void dccp_timestamping_init(void);
int dccp_insert_option(struct sk_buff *skb, unsigned char option,
const void *value, unsigned char len);
#ifdef CONFIG_SYSCTL
int dccp_sysctl_init(void);
void dccp_sysctl_exit(void);
#else
static inline int dccp_sysctl_init(void)
{
return 0;
}
static inline void dccp_sysctl_exit(void)
{
}
#endif
#endif /* _DCCP_H */

86
net/dccp/diag.c Normal file
View file

@ -0,0 +1,86 @@
/*
* net/dccp/diag.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@mandriva.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/inet_diag.h>
#include "ccid.h"
#include "dccp.h"
static void dccp_get_info(struct sock *sk, struct tcp_info *info)
{
struct dccp_sock *dp = dccp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
memset(info, 0, sizeof(*info));
info->tcpi_state = sk->sk_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
info->tcpi_backoff = icsk->icsk_backoff;
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
if (dp->dccps_hc_rx_ackvec != NULL)
info->tcpi_options |= TCPI_OPT_SACK;
if (dp->dccps_hc_rx_ccid != NULL)
ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
if (dp->dccps_hc_tx_ccid != NULL)
ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
}
static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
void *_info)
{
r->idiag_rqueue = r->idiag_wqueue = 0;
if (_info != NULL)
dccp_get_info(sk, _info);
}
static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct inet_diag_req_v2 *r, struct nlattr *bc)
{
inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
}
static int dccp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
struct inet_diag_req_v2 *req)
{
return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
}
static const struct inet_diag_handler dccp_diag_handler = {
.dump = dccp_diag_dump,
.dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
.idiag_type = IPPROTO_DCCP,
};
static int __init dccp_diag_init(void)
{
return inet_diag_register(&dccp_diag_handler);
}
static void __exit dccp_diag_fini(void)
{
inet_diag_unregister(&dccp_diag_handler);
}
module_init(dccp_diag_init);
module_exit(dccp_diag_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCP inet_diag handler");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);

1561
net/dccp/feat.c Normal file

File diff suppressed because it is too large Load diff

137
net/dccp/feat.h Normal file
View file

@ -0,0 +1,137 @@
#ifndef _DCCP_FEAT_H
#define _DCCP_FEAT_H
/*
* net/dccp/feat.h
*
* Feature negotiation for the DCCP protocol (RFC 4340, section 6)
* Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
* Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/types.h>
#include "dccp.h"
/*
* Known limit values
*/
/* Ack Ratio takes 2-byte integer values (11.3) */
#define DCCPF_ACK_RATIO_MAX 0xFFFF
/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
#define DCCPF_SEQ_WMIN 32
#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
/* Maximum number of SP values that fit in a single (Confirm) option */
#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
enum dccp_feat_type {
FEAT_AT_RX = 1, /* located at RX side of half-connection */
FEAT_AT_TX = 2, /* located at TX side of half-connection */
FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
};
enum dccp_feat_state {
FEAT_DEFAULT = 0, /* using default values from 6.4 */
FEAT_INITIALISING, /* feature is being initialised */
FEAT_CHANGING, /* Change sent but not confirmed yet */
FEAT_UNSTABLE, /* local modification in state CHANGING */
FEAT_STABLE /* both ends (think they) agree */
};
/**
* dccp_feat_val - Container for SP or NN feature values
* @nn: single NN value
* @sp.vec: single SP value plus optional preference list
* @sp.len: length of @sp.vec in bytes
*/
typedef union {
u64 nn;
struct {
u8 *vec;
u8 len;
} sp;
} dccp_feat_val;
/**
* struct feat_entry - Data structure to perform feature negotiation
* @val: feature's current value (SP features may have preference list)
* @state: feature's current state
* @feat_num: one of %dccp_feature_numbers
* @needs_mandatory: whether Mandatory options should be sent
* @needs_confirm: whether to send a Confirm instead of a Change
* @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
* @is_local: feature location (1) or feature-remote (0)
* @node: list pointers, entries arranged in FIFO order
*/
struct dccp_feat_entry {
dccp_feat_val val;
enum dccp_feat_state state:8;
u8 feat_num;
bool needs_mandatory,
needs_confirm,
empty_confirm,
is_local;
struct list_head node;
};
static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
{
if (entry->needs_confirm)
return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
}
/**
* struct ccid_dependency - Track changes resulting from choosing a CCID
* @dependent_feat: one of %dccp_feature_numbers
* @is_local: local (1) or remote (0) @dependent_feat
* @is_mandatory: whether presence of @dependent_feat is mission-critical or not
* @val: corresponding default value for @dependent_feat (u8 is sufficient here)
*/
struct ccid_dependency {
u8 dependent_feat;
bool is_local:1,
is_mandatory:1;
u8 val;
};
/*
* Sysctls to seed defaults for feature negotiation
*/
extern unsigned long sysctl_dccp_sequence_window;
extern int sysctl_dccp_rx_ccid;
extern int sysctl_dccp_tx_ccid;
int dccp_feat_init(struct sock *sk);
void dccp_feat_initialise_sysctls(void);
int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
u8 const *list, u8 len);
int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
int dccp_feat_clone_list(struct list_head const *, struct list_head *);
/*
* Encoding variable-length options and their maximum length.
*
* This affects NN options (SP options are all u8) and other variable-length
* options (see table 3 in RFC 4340). The limit is currently given the Sequence
* Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
* options consume less than 6 bytes (timestamps are 4 bytes).
* When updating this constant (e.g. due to new internet drafts / RFCs), make
* sure that you also update all code which refers to it.
*/
#define DCCP_OPTVAL_MAXLEN 6
void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
u64 dccp_decode_value_var(const u8 *bf, const u8 len);
u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
int dccp_insert_option_mandatory(struct sk_buff *skb);
int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
bool repeat_first);
#endif /* _DCCP_FEAT_H */

732
net/dccp/input.c Normal file
View file

@ -0,0 +1,732 @@
/*
* net/dccp/input.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/dccp.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/sock.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
{
__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
sk->sk_data_ready(sk);
}
static void dccp_fin(struct sock *sk, struct sk_buff *skb)
{
/*
* On receiving Close/CloseReq, both RD/WR shutdown are performed.
* RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
* receiving the closing segment, but there is no guarantee that such
* data will be processed at all.
*/
sk->sk_shutdown = SHUTDOWN_MASK;
sock_set_flag(sk, SOCK_DONE);
dccp_enqueue_skb(sk, skb);
}
static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
{
int queued = 0;
switch (sk->sk_state) {
/*
* We ignore Close when received in one of the following states:
* - CLOSED (may be a late or duplicate packet)
* - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
* - RESPOND (already handled by dccp_check_req)
*/
case DCCP_CLOSING:
/*
* Simultaneous-close: receiving a Close after sending one. This
* can happen if both client and server perform active-close and
* will result in an endless ping-pong of crossing and retrans-
* mitted Close packets, which only terminates when one of the
* nodes times out (min. 64 seconds). Quicker convergence can be
* achieved when one of the nodes acts as tie-breaker.
* This is ok as both ends are done with data transfer and each
* end is just waiting for the other to acknowledge termination.
*/
if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
break;
/* fall through */
case DCCP_REQUESTING:
case DCCP_ACTIVE_CLOSEREQ:
dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
dccp_done(sk);
break;
case DCCP_OPEN:
case DCCP_PARTOPEN:
/* Give waiting application a chance to read pending data */
queued = 1;
dccp_fin(sk, skb);
dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
/* fall through */
case DCCP_PASSIVE_CLOSE:
/*
* Retransmitted Close: we have already enqueued the first one.
*/
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
}
return queued;
}
static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
{
int queued = 0;
/*
* Step 7: Check for unexpected packet types
* If (S.is_server and P.type == CloseReq)
* Send Sync packet acknowledging P.seqno
* Drop packet and return
*/
if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
return queued;
}
/* Step 13: process relevant Client states < CLOSEREQ */
switch (sk->sk_state) {
case DCCP_REQUESTING:
dccp_send_close(sk, 0);
dccp_set_state(sk, DCCP_CLOSING);
break;
case DCCP_OPEN:
case DCCP_PARTOPEN:
/* Give waiting application a chance to read pending data */
queued = 1;
dccp_fin(sk, skb);
dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
/* fall through */
case DCCP_PASSIVE_CLOSEREQ:
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
}
return queued;
}
static u16 dccp_reset_code_convert(const u8 code)
{
const u16 error_code[] = {
[DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
[DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
[DCCP_RESET_CODE_ABORTED] = ECONNRESET,
[DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED,
[DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
[DCCP_RESET_CODE_TOO_BUSY] = EUSERS,
[DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
[DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG,
[DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR,
[DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC,
[DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ,
[DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP,
};
return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
}
static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
{
u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
sk->sk_err = err;
/* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
dccp_fin(sk, skb);
if (err && !sock_flag(sk, SOCK_DEAD))
sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
}
static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
{
struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
if (av == NULL)
return;
if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
dccp_ackvec_input(av, skb);
}
static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_sock *dp = dccp_sk(sk);
/* Don't deliver to RX CCID when node has shut down read end. */
if (!(sk->sk_shutdown & RCV_SHUTDOWN))
ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
/*
* Until the TX queue has been drained, we can not honour SHUT_WR, since
* we need received feedback as input to adjust congestion control.
*/
if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
}
static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
struct dccp_sock *dp = dccp_sk(sk);
u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
/*
* Step 5: Prepare sequence numbers for Sync
* If P.type == Sync or P.type == SyncAck,
* If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
* / * P is valid, so update sequence number variables
* accordingly. After this update, P will pass the tests
* in Step 6. A SyncAck is generated if necessary in
* Step 15 * /
* Update S.GSR, S.SWL, S.SWH
* Otherwise,
* Drop packet and return
*/
if (dh->dccph_type == DCCP_PKT_SYNC ||
dh->dccph_type == DCCP_PKT_SYNCACK) {
if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
dccp_update_gsr(sk, seqno);
else
return -1;
}
/*
* Step 6: Check sequence numbers
* Let LSWL = S.SWL and LAWL = S.AWL
* If P.type == CloseReq or P.type == Close or P.type == Reset,
* LSWL := S.GSR + 1, LAWL := S.GAR
* If LSWL <= P.seqno <= S.SWH
* and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
* Update S.GSR, S.SWL, S.SWH
* If P.type != Sync,
* Update S.GAR
*/
lswl = dp->dccps_swl;
lawl = dp->dccps_awl;
if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
dh->dccph_type == DCCP_PKT_CLOSE ||
dh->dccph_type == DCCP_PKT_RESET) {
lswl = ADD48(dp->dccps_gsr, 1);
lawl = dp->dccps_gar;
}
if (between48(seqno, lswl, dp->dccps_swh) &&
(ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
between48(ackno, lawl, dp->dccps_awh))) {
dccp_update_gsr(sk, seqno);
if (dh->dccph_type != DCCP_PKT_SYNC &&
ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
after48(ackno, dp->dccps_gar))
dp->dccps_gar = ackno;
} else {
unsigned long now = jiffies;
/*
* Step 6: Check sequence numbers
* Otherwise,
* If P.type == Reset,
* Send Sync packet acknowledging S.GSR
* Otherwise,
* Send Sync packet acknowledging P.seqno
* Drop packet and return
*
* These Syncs are rate-limited as per RFC 4340, 7.5.4:
* at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second.
*/
if (time_before(now, (dp->dccps_rate_last +
sysctl_dccp_sync_ratelimit)))
return -1;
DCCP_WARN("Step 6 failed for %s packet, "
"(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
"(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
"sending SYNC...\n", dccp_packet_name(dh->dccph_type),
(unsigned long long) lswl, (unsigned long long) seqno,
(unsigned long long) dp->dccps_swh,
(ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
: "exists",
(unsigned long long) lawl, (unsigned long long) ackno,
(unsigned long long) dp->dccps_awh);
dp->dccps_rate_last = now;
if (dh->dccph_type == DCCP_PKT_RESET)
seqno = dp->dccps_gsr;
dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
return -1;
}
return 0;
}
static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
switch (dccp_hdr(skb)->dccph_type) {
case DCCP_PKT_DATAACK:
case DCCP_PKT_DATA:
/*
* FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
* - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
* - sk_receive_queue is full, use Code 2, "Receive Buffer"
*/
dccp_enqueue_skb(sk, skb);
return 0;
case DCCP_PKT_ACK:
goto discard;
case DCCP_PKT_RESET:
/*
* Step 9: Process Reset
* If P.type == Reset,
* Tear down connection
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
*/
dccp_rcv_reset(sk, skb);
return 0;
case DCCP_PKT_CLOSEREQ:
if (dccp_rcv_closereq(sk, skb))
return 0;
goto discard;
case DCCP_PKT_CLOSE:
if (dccp_rcv_close(sk, skb))
return 0;
goto discard;
case DCCP_PKT_REQUEST:
/* Step 7
* or (S.is_server and P.type == Response)
* or (S.is_client and P.type == Request)
* or (S.state >= OPEN and P.type == Request
* and P.seqno >= S.OSR)
* or (S.state >= OPEN and P.type == Response
* and P.seqno >= S.OSR)
* or (S.state == RESPOND and P.type == Data),
* Send Sync packet acknowledging P.seqno
* Drop packet and return
*/
if (dp->dccps_role != DCCP_ROLE_LISTEN)
goto send_sync;
goto check_seq;
case DCCP_PKT_RESPONSE:
if (dp->dccps_role != DCCP_ROLE_CLIENT)
goto send_sync;
check_seq:
if (dccp_delta_seqno(dp->dccps_osr,
DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
send_sync:
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_PKT_SYNC);
}
break;
case DCCP_PKT_SYNC:
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_PKT_SYNCACK);
/*
* From RFC 4340, sec. 5.7
*
* As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
* MAY have non-zero-length application data areas, whose
* contents receivers MUST ignore.
*/
goto discard;
}
DCCP_INC_STATS_BH(DCCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
}
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct dccp_hdr *dh, const unsigned int len)
{
if (dccp_check_seqno(sk, skb))
goto discard;
if (dccp_parse_options(sk, NULL, skb))
return 1;
dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
return __dccp_rcv_established(sk, skb, dh, len);
discard:
__kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL_GPL(dccp_rcv_established);
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
const unsigned int len)
{
/*
* Step 4: Prepare sequence numbers in REQUEST
* If S.state == REQUEST,
* If (P.type == Response or P.type == Reset)
* and S.AWL <= P.ackno <= S.AWH,
* / * Set sequence number variables corresponding to the
* other endpoint, so P will pass the tests in Step 6 * /
* Set S.GSR, S.ISR, S.SWL, S.SWH
* / * Response processing continues in Step 10; Reset
* processing continues in Step 9 * /
*/
if (dh->dccph_type == DCCP_PKT_RESPONSE) {
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
long tstamp = dccp_timestamp();
if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
dp->dccps_awl, dp->dccps_awh)) {
dccp_pr_debug("invalid ackno: S.AWL=%llu, "
"P.ackno=%llu, S.AWH=%llu\n",
(unsigned long long)dp->dccps_awl,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
(unsigned long long)dp->dccps_awh);
goto out_invalid_packet;
}
/*
* If option processing (Step 8) failed, return 1 here so that
* dccp_v4_do_rcv() sends a Reset. The Reset code depends on
* the option type and is set in dccp_parse_options().
*/
if (dccp_parse_options(sk, NULL, skb))
return 1;
/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
dp->dccps_options_received.dccpor_timestamp_echo));
/* Stop the REQUEST timer */
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
WARN_ON(sk->sk_send_head == NULL);
kfree_skb(sk->sk_send_head);
sk->sk_send_head = NULL;
/*
* Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
* and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
* is done as part of activating the feature values below, since
* these settings depend on the local/remote Sequence Window
* features, which were undefined or not confirmed until now.
*/
dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
/*
* Step 10: Process REQUEST state (second part)
* If S.state == REQUEST,
* / * If we get here, P is a valid Response from the
* server (see Step 4), and we should move to
* PARTOPEN state. PARTOPEN means send an Ack,
* don't send Data packets, retransmit Acks
* periodically, and always include any Init Cookie
* from the Response * /
* S.state := PARTOPEN
* Set PARTOPEN timer
* Continue with S.state == PARTOPEN
* / * Step 12 will send the Ack completing the
* three-way handshake * /
*/
dccp_set_state(sk, DCCP_PARTOPEN);
/*
* If feature negotiation was successful, activate features now;
* an activation failure means that this host could not activate
* one ore more features (e.g. insufficient memory), which would
* leave at least one feature in an undefined state.
*/
if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
goto unable_to_proceed;
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
icsk->icsk_accept_queue.rskq_defer_accept) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
* It may be deleted, but with this feature tcpdumps
* look so _wonderfully_ clever, that I was not able
* to stand against the temptation 8) --ANK
*/
/*
* OK, in DCCP we can as well do a similar trick, its
* even in the draft, but there is no need for us to
* schedule an ack here, as dccp_sendmsg does this for
* us, also stated in the draft. -acme
*/
__kfree_skb(skb);
return 0;
}
dccp_send_ack(sk);
return -1;
}
out_invalid_packet:
/* dccp_v4_do_rcv will send a reset */
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
return 1;
unable_to_proceed:
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
/*
* We mark this socket as no longer usable, so that the loop in
* dccp_sendmsg() terminates and the application gets notified.
*/
dccp_set_state(sk, DCCP_CLOSED);
sk->sk_err = ECOMM;
return 1;
}
static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
const unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
int queued = 0;
switch (dh->dccph_type) {
case DCCP_PKT_RESET:
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
break;
case DCCP_PKT_DATA:
if (sk->sk_state == DCCP_RESPOND)
break;
case DCCP_PKT_DATAACK:
case DCCP_PKT_ACK:
/*
* FIXME: we should be reseting the PARTOPEN (DELACK) timer
* here but only if we haven't used the DELACK timer for
* something else, like sending a delayed ack for a TIMESTAMP
* echo, etc, for now were not clearing it, sending an extra
* ACK when there is nothing else to do in DELACK is not a big
* deal after all.
*/
/* Stop the PARTOPEN timer */
if (sk->sk_state == DCCP_PARTOPEN)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
/* Obtain usec RTT sample from SYN exchange (used by TFRC). */
if (likely(sample)) {
long delta = dccp_timestamp() - sample;
dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
}
dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_set_state(sk, DCCP_OPEN);
if (dh->dccph_type == DCCP_PKT_DATAACK ||
dh->dccph_type == DCCP_PKT_DATA) {
__dccp_rcv_established(sk, skb, dh, len);
queued = 1; /* packet was queued
(by __dccp_rcv_established) */
}
break;
}
return queued;
}
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
struct dccp_hdr *dh, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
const int old_state = sk->sk_state;
int queued = 0;
/*
* Step 3: Process LISTEN state
*
* If S.state == LISTEN,
* If P.type == Request or P contains a valid Init Cookie option,
* (* Must scan the packet's options to check for Init
* Cookies. Only Init Cookies are processed here,
* however; other options are processed in Step 8. This
* scan need only be performed if the endpoint uses Init
* Cookies *)
* (* Generate a new socket and switch to that socket *)
* Set S := new socket for this port pair
* S.state = RESPOND
* Choose S.ISS (initial seqno) or set from Init Cookies
* Initialize S.GAR := S.ISS
* Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
* Cookies Continue with S.state == RESPOND
* (* A Response packet will be generated in Step 11 *)
* Otherwise,
* Generate Reset(No Connection) unless P.type == Reset
* Drop packet and return
*/
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
skb) < 0)
return 1;
goto discard;
}
if (dh->dccph_type == DCCP_PKT_RESET)
goto discard;
/* Caller (dccp_v4_do_rcv) will send Reset */
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
} else if (sk->sk_state == DCCP_CLOSED) {
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
}
/* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
goto discard;
/*
* Step 7: Check for unexpected packet types
* If (S.is_server and P.type == Response)
* or (S.is_client and P.type == Request)
* or (S.state == RESPOND and P.type == Data),
* Send Sync packet acknowledging P.seqno
* Drop packet and return
*/
if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
dh->dccph_type == DCCP_PKT_RESPONSE) ||
(dp->dccps_role == DCCP_ROLE_CLIENT &&
dh->dccph_type == DCCP_PKT_REQUEST) ||
(sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
goto discard;
}
/* Step 8: Process options */
if (dccp_parse_options(sk, NULL, skb))
return 1;
/*
* Step 9: Process Reset
* If P.type == Reset,
* Tear down connection
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
*/
if (dh->dccph_type == DCCP_PKT_RESET) {
dccp_rcv_reset(sk, skb);
return 0;
} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
if (dccp_rcv_closereq(sk, skb))
return 0;
goto discard;
} else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
if (dccp_rcv_close(sk, skb))
return 0;
goto discard;
}
switch (sk->sk_state) {
case DCCP_REQUESTING:
queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
if (queued >= 0)
return queued;
__kfree_skb(skb);
return 0;
case DCCP_PARTOPEN:
/* Step 8: if using Ack Vectors, mark packet acknowledgeable */
dccp_handle_ackvec_processing(sk, skb);
dccp_deliver_input_to_ccids(sk, skb);
/* fall through */
case DCCP_RESPOND:
queued = dccp_rcv_respond_partopen_state_process(sk, skb,
dh, len);
break;
}
if (dh->dccph_type == DCCP_PKT_ACK ||
dh->dccph_type == DCCP_PKT_DATAACK) {
switch (old_state) {
case DCCP_PARTOPEN:
sk->sk_state_change(sk);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
break;
}
} else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
goto discard;
}
if (!queued) {
discard:
__kfree_skb(skb);
}
return 0;
}
EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
/**
* dccp_sample_rtt - Validate and finalise computation of RTT sample
* @delta: number of microseconds between packet and acknowledgment
*
* The routine is kept generic to work in different contexts. It should be
* called immediately when the ACK used for the RTT sample arrives.
*/
u32 dccp_sample_rtt(struct sock *sk, long delta)
{
/* dccpor_elapsed_time is either zeroed out or set and > 0 */
delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
if (unlikely(delta <= 0)) {
DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
return DCCP_SANE_RTT_MIN;
}
if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
DCCP_WARN("RTT sample %ld too large, using max\n", delta);
return DCCP_SANE_RTT_MAX;
}
return delta;
}

1095
net/dccp/ipv4.c Normal file

File diff suppressed because it is too large Load diff

1188
net/dccp/ipv6.c Normal file

File diff suppressed because it is too large Load diff

34
net/dccp/ipv6.h Normal file
View file

@ -0,0 +1,34 @@
#ifndef _DCCP_IPV6_H
#define _DCCP_IPV6_H
/*
* net/dccp/ipv6.h
*
* An implementation of the DCCP protocol
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/dccp.h>
#include <linux/ipv6.h>
struct dccp6_sock {
struct dccp_sock dccp;
/*
* ipv6_pinfo has to be the last member of dccp6_sock,
* see inet6_sk_generic.
*/
struct ipv6_pinfo inet6;
};
struct dccp6_request_sock {
struct dccp_request_sock dccp;
};
struct dccp6_timewait_sock {
struct inet_timewait_sock inet;
};
#endif /* _DCCP_IPV6_H */

276
net/dccp/minisocks.c Normal file
View file

@ -0,0 +1,276 @@
/*
* net/dccp/minisocks.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/dccp.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/timer.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <net/inet_timewait_sock.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
#include "feat.h"
struct inet_timewait_death_row dccp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
.period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
.hashinfo = &dccp_hashinfo,
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&dccp_death_row),
.twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
inet_twdr_twkill_work),
/* Short-time timewait calendar */
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&dccp_death_row),
};
EXPORT_SYMBOL_GPL(dccp_death_row);
void dccp_time_wait(struct sock *sk, int state, int timeo)
{
struct inet_timewait_sock *tw = NULL;
if (dccp_death_row.tw_count < dccp_death_row.sysctl_max_tw_buckets)
tw = inet_twsk_alloc(sk, state);
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
tw->tw_v6_daddr = sk->sk_v6_daddr;
tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_ipv6only = sk->sk_ipv6only;
}
#endif
/* Linkage updates. */
__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
timeo = rto;
tw->tw_timeout = DCCP_TIMEWAIT_LEN;
if (state == DCCP_TIME_WAIT)
timeo = DCCP_TIMEWAIT_LEN;
inet_twsk_schedule(tw, &dccp_death_row, timeo,
DCCP_TIMEWAIT_LEN);
inet_twsk_put(tw);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
DCCP_WARN("time wait bucket table overflow\n");
}
dccp_done(sk);
}
struct sock *dccp_create_openreq_child(struct sock *sk,
const struct request_sock *req,
const struct sk_buff *skb)
{
/*
* Step 3: Process LISTEN state
*
* (* Generate a new socket and switch to that socket *)
* Set S := new socket for this port pair
*/
struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
struct dccp_request_sock *dreq = dccp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
struct dccp_sock *newdp = dccp_sk(newsk);
newdp->dccps_role = DCCP_ROLE_SERVER;
newdp->dccps_hc_rx_ackvec = NULL;
newdp->dccps_service_list = NULL;
newdp->dccps_service = dreq->dreq_service;
newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
INIT_LIST_HEAD(&newdp->dccps_featneg);
/*
* Step 3: Process LISTEN state
*
* Choose S.ISS (initial seqno) or set from Init Cookies
* Initialize S.GAR := S.ISS
* Set S.ISR, S.GSR from packet (or Init Cookies)
*
* Setting AWL/AWH and SWL/SWH happens as part of the feature
* activation below, as these windows all depend on the local
* and remote Sequence Window feature values (7.5.2).
*/
newdp->dccps_iss = dreq->dreq_iss;
newdp->dccps_gss = dreq->dreq_gss;
newdp->dccps_gar = newdp->dccps_iss;
newdp->dccps_isr = dreq->dreq_isr;
newdp->dccps_gsr = dreq->dreq_gsr;
/*
* Activate features: initialise CCIDs, sequence windows etc.
*/
if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
sk_free(newsk);
return NULL;
}
dccp_init_xmit_timers(newsk);
DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
}
return newsk;
}
EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
/*
* Process an incoming packet for RESPOND sockets represented
* as an request_sock.
*/
struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
struct sock *child = NULL;
struct dccp_request_sock *dreq = dccp_rsk(req);
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
dccp_pr_debug("Retransmitted REQUEST\n");
dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
/*
* Send another RESPONSE packet
* To protect against Request floods, increment retrans
* counter (backoff, monitored by dccp_response_timer).
*/
inet_rtx_syn_ack(sk, req);
}
/* Network Duplicate, discard packet */
return NULL;
}
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
goto drop;
/* Invalid ACK */
if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
dreq->dreq_iss, dreq->dreq_gss)) {
dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
"dreq_iss=%llu, dreq_gss=%llu\n",
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq,
(unsigned long long) dreq->dreq_iss,
(unsigned long long) dreq->dreq_gss);
goto drop;
}
if (dccp_parse_options(sk, dreq, skb))
goto drop;
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
if (child == NULL)
goto listen_overflow;
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
out:
return child;
listen_overflow:
dccp_pr_debug("listen_overflow!\n");
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
drop:
if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
req->rsk_ops->send_reset(sk, skb);
inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
}
EXPORT_SYMBOL_GPL(dccp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket.
*/
int dccp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb)
{
int ret = 0;
const int state = child->sk_state;
if (!sock_owned_by_user(child)) {
ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
skb->len);
/* Wakeup parent, send SIGIO */
if (state == DCCP_RESPOND && child->sk_state != state)
parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
__sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
sock_put(child);
return ret;
}
EXPORT_SYMBOL_GPL(dccp_child_process);
void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
struct request_sock *rsk)
{
DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
}
EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
int dccp_reqsk_init(struct request_sock *req,
struct dccp_sock const *dp, struct sk_buff const *skb)
{
struct dccp_request_sock *dreq = dccp_rsk(req);
inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
inet_rsk(req)->acked = 0;
dreq->dreq_timestamp_echo = 0;
/* inherit feature negotiation options from listening socket */
return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
}
EXPORT_SYMBOL_GPL(dccp_reqsk_init);

609
net/dccp/options.c Normal file
View file

@ -0,0 +1,609 @@
/*
* net/dccp/options.c
*
* An implementation of the DCCP protocol
* Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
* Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
#include <asm/unaligned.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
#include "feat.h"
u64 dccp_decode_value_var(const u8 *bf, const u8 len)
{
u64 value = 0;
if (len >= DCCP_OPTVAL_MAXLEN)
value += ((u64)*bf++) << 40;
if (len > 4)
value += ((u64)*bf++) << 32;
if (len > 3)
value += ((u64)*bf++) << 24;
if (len > 2)
value += ((u64)*bf++) << 16;
if (len > 1)
value += ((u64)*bf++) << 8;
if (len > 0)
value += *bf;
return value;
}
/**
* dccp_parse_options - Parse DCCP options present in @skb
* @sk: client|server|listening dccp socket (when @dreq != NULL)
* @dreq: request socket to use during connection setup, or NULL
*/
int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
const struct dccp_hdr *dh = dccp_hdr(skb);
const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
unsigned char *opt_ptr = options;
const unsigned char *opt_end = (unsigned char *)dh +
(dh->dccph_doff * 4);
struct dccp_options_received *opt_recv = &dp->dccps_options_received;
unsigned char opt, len;
unsigned char *uninitialized_var(value);
u32 elapsed_time;
__be32 opt_val;
int rc;
int mandatory = 0;
memset(opt_recv, 0, sizeof(*opt_recv));
opt = len = 0;
while (opt_ptr != opt_end) {
opt = *opt_ptr++;
len = 0;
value = NULL;
/* Check if this isn't a single byte option */
if (opt > DCCPO_MAX_RESERVED) {
if (opt_ptr == opt_end)
goto out_nonsensical_length;
len = *opt_ptr++;
if (len < 2)
goto out_nonsensical_length;
/*
* Remove the type and len fields, leaving
* just the value size
*/
len -= 2;
value = opt_ptr;
opt_ptr += len;
if (opt_ptr > opt_end)
goto out_nonsensical_length;
}
/*
* CCID-specific options are ignored during connection setup, as
* negotiation may still be in progress (see RFC 4340, 10.3).
* The same applies to Ack Vectors, as these depend on the CCID.
*/
if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
goto ignore_option;
switch (opt) {
case DCCPO_PADDING:
break;
case DCCPO_MANDATORY:
if (mandatory)
goto out_invalid_option;
if (pkt_type != DCCP_PKT_DATA)
mandatory = 1;
break;
case DCCPO_NDP_COUNT:
if (len > 6)
goto out_invalid_option;
opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
(unsigned long long)opt_recv->dccpor_ndp);
break;
case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
break;
if (len == 0)
goto out_invalid_option;
rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
*value, value + 1, len - 1);
if (rc)
goto out_featneg_failed;
break;
case DCCPO_TIMESTAMP:
if (len != 4)
goto out_invalid_option;
/*
* RFC 4340 13.1: "The precise time corresponding to
* Timestamp Value zero is not specified". We use
* zero to indicate absence of a meaningful timestamp.
*/
opt_val = get_unaligned((__be32 *)value);
if (unlikely(opt_val == 0)) {
DCCP_WARN("Timestamp with zero value\n");
break;
}
if (dreq != NULL) {
dreq->dreq_timestamp_echo = ntohl(opt_val);
dreq->dreq_timestamp_time = dccp_timestamp();
} else {
opt_recv->dccpor_timestamp =
dp->dccps_timestamp_echo = ntohl(opt_val);
dp->dccps_timestamp_time = dccp_timestamp();
}
dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
dccp_role(sk), ntohl(opt_val),
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
/* schedule an Ack in case this sender is quiescent */
inet_csk_schedule_ack(sk);
break;
case DCCPO_TIMESTAMP_ECHO:
if (len != 4 && len != 6 && len != 8)
goto out_invalid_option;
opt_val = get_unaligned((__be32 *)value);
opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
"ackno=%llu", dccp_role(sk),
opt_recv->dccpor_timestamp_echo,
len + 2,
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
value += 4;
if (len == 4) { /* no elapsed time included */
dccp_pr_debug_cat("\n");
break;
}
if (len == 6) { /* 2-byte elapsed time */
__be16 opt_val2 = get_unaligned((__be16 *)value);
elapsed_time = ntohs(opt_val2);
} else { /* 4-byte elapsed time */
opt_val = get_unaligned((__be32 *)value);
elapsed_time = ntohl(opt_val);
}
dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
/* Give precedence to the biggest ELAPSED_TIME */
if (elapsed_time > opt_recv->dccpor_elapsed_time)
opt_recv->dccpor_elapsed_time = elapsed_time;
break;
case DCCPO_ELAPSED_TIME:
if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
break;
if (len == 2) {
__be16 opt_val2 = get_unaligned((__be16 *)value);
elapsed_time = ntohs(opt_val2);
} else if (len == 4) {
opt_val = get_unaligned((__be32 *)value);
elapsed_time = ntohl(opt_val);
} else {
goto out_invalid_option;
}
if (elapsed_time > opt_recv->dccpor_elapsed_time)
opt_recv->dccpor_elapsed_time = elapsed_time;
dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
dccp_role(sk), elapsed_time);
break;
case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
pkt_type, opt, value, len))
goto out_invalid_option;
break;
case DCCPO_ACK_VECTOR_0:
case DCCPO_ACK_VECTOR_1:
if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
break;
/*
* Ack vectors are processed by the TX CCID if it is
* interested. The RX CCID need not parse Ack Vectors,
* since it is only interested in clearing old state.
* Fall through.
*/
case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
pkt_type, opt, value, len))
goto out_invalid_option;
break;
default:
DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
"implemented, ignoring", sk, opt, len);
break;
}
ignore_option:
if (opt != DCCPO_MANDATORY)
mandatory = 0;
}
/* mandatory was the last byte in option list -> reset connection */
if (mandatory)
goto out_invalid_option;
out_nonsensical_length:
/* RFC 4340, 5.8: ignore option and all remaining option space */
return 0;
out_invalid_option:
DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
rc = DCCP_RESET_CODE_OPTION_ERROR;
out_featneg_failed:
DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
return -1;
}
EXPORT_SYMBOL_GPL(dccp_parse_options);
void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
{
if (len >= DCCP_OPTVAL_MAXLEN)
*to++ = (value & 0xFF0000000000ull) >> 40;
if (len > 4)
*to++ = (value & 0xFF00000000ull) >> 32;
if (len > 3)
*to++ = (value & 0xFF000000) >> 24;
if (len > 2)
*to++ = (value & 0xFF0000) >> 16;
if (len > 1)
*to++ = (value & 0xFF00) >> 8;
if (len > 0)
*to++ = (value & 0xFF);
}
static inline u8 dccp_ndp_len(const u64 ndp)
{
if (likely(ndp <= 0xFF))
return 1;
return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
}
int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
const void *value, const unsigned char len)
{
unsigned char *to;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
to = skb_push(skb, len + 2);
*to++ = option;
*to++ = len + 2;
memcpy(to, value, len);
return 0;
}
EXPORT_SYMBOL_GPL(dccp_insert_option);
static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
u64 ndp = dp->dccps_ndp_count;
if (dccp_non_data_packet(skb))
++dp->dccps_ndp_count;
else
dp->dccps_ndp_count = 0;
if (ndp > 0) {
unsigned char *ptr;
const int ndp_len = dccp_ndp_len(ndp);
const int len = ndp_len + 2;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len += len;
ptr = skb_push(skb, len);
*ptr++ = DCCPO_NDP_COUNT;
*ptr++ = len;
dccp_encode_value_var(ndp, ptr, ndp_len);
}
return 0;
}
static inline int dccp_elapsed_time_len(const u32 elapsed_time)
{
return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
}
static int dccp_insert_option_timestamp(struct sk_buff *skb)
{
__be32 now = htonl(dccp_timestamp());
/* yes this will overflow but that is the point as we want a
* 10 usec 32 bit timer which mean it wraps every 11.9 hours */
return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
}
static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
struct dccp_request_sock *dreq,
struct sk_buff *skb)
{
__be32 tstamp_echo;
unsigned char *to;
u32 elapsed_time, elapsed_time_len, len;
if (dreq != NULL) {
elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
tstamp_echo = htonl(dreq->dreq_timestamp_echo);
dreq->dreq_timestamp_echo = 0;
} else {
elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
tstamp_echo = htonl(dp->dccps_timestamp_echo);
dp->dccps_timestamp_echo = 0;
}
elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
len = 6 + elapsed_time_len;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len += len;
to = skb_push(skb, len);
*to++ = DCCPO_TIMESTAMP_ECHO;
*to++ = len;
memcpy(to, &tstamp_echo, 4);
to += 4;
if (elapsed_time_len == 2) {
const __be16 var16 = htons((u16)elapsed_time);
memcpy(to, &var16, 2);
} else if (elapsed_time_len == 4) {
const __be32 var32 = htonl(elapsed_time);
memcpy(to, &var32, 4);
}
return 0;
}
static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
const u16 buflen = dccp_ackvec_buflen(av);
/* Figure out how many options do we need to represent the ackvec */
const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
u16 len = buflen + 2 * nr_opts;
u8 i, nonce = 0;
const unsigned char *tail, *from;
unsigned char *to;
if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
dccp_packet_name(dcb->dccpd_type));
return -1;
}
/*
* Since Ack Vectors are variable-length, we can not always predict
* their size. To catch exception cases where the space is running out
* on the skb, a separate Sync is scheduled to carry the Ack Vector.
*/
if (len > DCCPAV_MIN_OPTLEN &&
len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
"MPS=%u ==> reduce payload size?\n", len, skb->len,
dcb->dccpd_opt_len, dp->dccps_mss_cache);
dp->dccps_sync_scheduled = 1;
return 0;
}
dcb->dccpd_opt_len += len;
to = skb_push(skb, len);
len = buflen;
from = av->av_buf + av->av_buf_head;
tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
for (i = 0; i < nr_opts; ++i) {
int copylen = len;
if (len > DCCP_SINGLE_OPT_MAXLEN)
copylen = DCCP_SINGLE_OPT_MAXLEN;
/*
* RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
* its type; ack_nonce is the sum of all individual buf_nonce's.
*/
nonce ^= av->av_buf_nonce[i];
*to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
*to++ = copylen + 2;
/* Check if buf_head wraps */
if (from + copylen > tail) {
const u16 tailsize = tail - from;
memcpy(to, from, tailsize);
to += tailsize;
len -= tailsize;
copylen -= tailsize;
from = av->av_buf;
}
memcpy(to, from, copylen);
from += copylen;
to += copylen;
len -= copylen;
}
/*
* Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
*/
if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
return -ENOBUFS;
return 0;
}
/**
* dccp_insert_option_mandatory - Mandatory option (5.8.2)
* Note that since we are using skb_push, this function needs to be called
* _after_ inserting the option it is supposed to influence (stack order).
*/
int dccp_insert_option_mandatory(struct sk_buff *skb)
{
if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len++;
*skb_push(skb, 1) = DCCPO_MANDATORY;
return 0;
}
/**
* dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
* @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
* @feat: one out of %dccp_feature_numbers
* @val: NN value or SP array (preferred element first) to copy
* @len: true length of @val in bytes (excluding first element repetition)
* @repeat_first: whether to copy the first element of @val twice
*
* The last argument is used to construct Confirm options, where the preferred
* value and the preference list appear separately (RFC 4340, 6.3.1). Preference
* lists are kept such that the preferred entry is always first, so we only need
* to copy twice, and avoid the overhead of cloning into a bigger array.
*/
int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
u8 *val, u8 len, bool repeat_first)
{
u8 tot_len, *to;
/* take the `Feature' field and possible repetition into account */
if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
DCCP_WARN("length %u for feature %u too large\n", len, feat);
return -1;
}
if (unlikely(val == NULL || len == 0))
len = repeat_first = false;
tot_len = 3 + repeat_first + len;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
DCCP_WARN("packet too small for feature %d option!\n", feat);
return -1;
}
DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
to = skb_push(skb, tot_len);
*to++ = type;
*to++ = tot_len;
*to++ = feat;
if (repeat_first)
*to++ = *val;
if (len)
memcpy(to, val, len);
return 0;
}
/* The length of all options needs to be a multiple of 4 (5.8) */
static void dccp_insert_option_padding(struct sk_buff *skb)
{
int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
if (padding != 0) {
padding = 4 - padding;
memset(skb_push(skb, padding), 0, padding);
DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
}
}
int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
return -1;
if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
/* Feature Negotiation */
if (dccp_feat_insert_opts(dp, NULL, skb))
return -1;
if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
/*
* Obtain RTT sample from Request/Response exchange.
* This is currently used for TFRC initialisation.
*/
if (dccp_insert_option_timestamp(skb))
return -1;
} else if (dccp_ackvec_pending(sk) &&
dccp_insert_option_ackvec(sk, skb)) {
return -1;
}
}
if (dp->dccps_hc_rx_insert_options) {
if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
return -1;
dp->dccps_hc_rx_insert_options = 0;
}
if (dp->dccps_timestamp_echo != 0 &&
dccp_insert_option_timestamp_echo(dp, NULL, skb))
return -1;
dccp_insert_option_padding(skb);
return 0;
}
int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
{
DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
if (dccp_feat_insert_opts(NULL, dreq, skb))
return -1;
/* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
if (dccp_insert_option_timestamp(skb))
return -1;
if (dreq->dreq_timestamp_echo != 0 &&
dccp_insert_option_timestamp_echo(NULL, dreq, skb))
return -1;
dccp_insert_option_padding(skb);
return 0;
}

698
net/dccp/output.c Normal file
View file

@ -0,0 +1,698 @@
/*
* net/dccp/output.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/dccp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/inet_sock.h>
#include <net/sock.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
static inline void dccp_event_ack_sent(struct sock *sk)
{
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
skb_set_owner_w(skb, sk);
WARN_ON(sk->sk_send_head);
sk->sk_send_head = skb;
return skb_clone(sk->sk_send_head, gfp_any());
}
/*
* All SKB's seen here are completely headerless. It is our
* job to build the DCCP header, and pass the packet down to
* IP so it can do the same plus pass the packet off to the
* device.
*/
static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
struct dccp_hdr *dh;
/* XXX For now we're using only 48 bits sequence numbers */
const u32 dccp_header_size = sizeof(*dh) +
sizeof(struct dccp_hdr_ext) +
dccp_packet_hdr_len(dcb->dccpd_type);
int err, set_ack = 1;
u64 ackno = dp->dccps_gsr;
/*
* Increment GSS here already in case the option code needs it.
* Update GSS for real only if option processing below succeeds.
*/
dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
switch (dcb->dccpd_type) {
case DCCP_PKT_DATA:
set_ack = 0;
/* fall through */
case DCCP_PKT_DATAACK:
case DCCP_PKT_RESET:
break;
case DCCP_PKT_REQUEST:
set_ack = 0;
/* Use ISS on the first (non-retransmitted) Request. */
if (icsk->icsk_retransmits == 0)
dcb->dccpd_seq = dp->dccps_iss;
/* fall through */
case DCCP_PKT_SYNC:
case DCCP_PKT_SYNCACK:
ackno = dcb->dccpd_ack_seq;
/* fall through */
default:
/*
* Set owner/destructor: some skbs are allocated via
* alloc_skb (e.g. when retransmission may happen).
* Only Data, DataAck, and Reset packets should come
* through here with skb->sk set.
*/
WARN_ON(skb->sk);
skb_set_owner_w(skb, sk);
break;
}
if (dccp_insert_options(sk, skb)) {
kfree_skb(skb);
return -EPROTO;
}
/* Build DCCP header and checksum it. */
dh = dccp_zeroed_hdr(skb, dccp_header_size);
dh->dccph_type = dcb->dccpd_type;
dh->dccph_sport = inet->inet_sport;
dh->dccph_dport = inet->inet_dport;
dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
dh->dccph_ccval = dcb->dccpd_ccval;
dh->dccph_cscov = dp->dccps_pcslen;
/* XXX For now we're using only 48 bits sequence numbers */
dh->dccph_x = 1;
dccp_update_gss(sk, dcb->dccpd_seq);
dccp_hdr_set_seq(dh, dp->dccps_gss);
if (set_ack)
dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
switch (dcb->dccpd_type) {
case DCCP_PKT_REQUEST:
dccp_hdr_request(skb)->dccph_req_service =
dp->dccps_service;
/*
* Limit Ack window to ISS <= P.ackno <= GSS, so that
* only Responses to Requests we sent are considered.
*/
dp->dccps_awl = dp->dccps_iss;
break;
case DCCP_PKT_RESET:
dccp_hdr_reset(skb)->dccph_reset_code =
dcb->dccpd_reset_code;
break;
}
icsk->icsk_af_ops->send_check(sk, skb);
if (set_ack)
dccp_event_ack_sent(sk);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
return net_xmit_eval(err);
}
return -ENOBUFS;
}
/**
* dccp_determine_ccmps - Find out about CCID-specific packet-size limits
* We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
* since the RX CCID is restricted to feedback packets (Acks), which are small
* in comparison with the data traffic. A value of 0 means "no current CCMPS".
*/
static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
{
const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
return 0;
return tx_ccid->ccid_ops->ccid_ccmps;
}
unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
u32 ccmps = dccp_determine_ccmps(dp);
u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
/* Account for header lengths and IPv4/v6 option overhead */
cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
/*
* Leave enough headroom for common DCCP header options.
* This only considers options which may appear on DCCP-Data packets, as
* per table 3 in RFC 4340, 5.8. When running out of space for other
* options (eg. Ack Vector which can take up to 255 bytes), it is better
* to schedule a separate Ack. Thus we leave headroom for the following:
* - 1 byte for Slow Receiver (11.6)
* - 6 bytes for Timestamp (13.1)
* - 10 bytes for Timestamp Echo (13.3)
* - 8 bytes for NDP count (7.7, when activated)
* - 6 bytes for Data Checksum (9.3)
* - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
*/
cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
(dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
dp->dccps_mss_cache = cur_mps;
return cur_mps;
}
EXPORT_SYMBOL_GPL(dccp_sync_mss);
void dccp_write_space(struct sock *sk)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
/**
* dccp_wait_for_ccid - Await CCID send permission
* @sk: socket to wait for
* @delay: timeout in jiffies
*
* This is used by CCIDs which need to delay the send time in process context.
*/
static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
{
DEFINE_WAIT(wait);
long remaining;
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
release_sock(sk);
remaining = schedule_timeout(delay);
lock_sock(sk);
sk->sk_write_pending--;
finish_wait(sk_sleep(sk), &wait);
if (signal_pending(current) || sk->sk_err)
return -1;
return remaining;
}
/**
* dccp_xmit_packet - Send data packet under control of CCID
* Transmits next-queued payload and informs CCID to account for the packet.
*/
static void dccp_xmit_packet(struct sock *sk)
{
int err, len;
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb = dccp_qpolicy_pop(sk);
if (unlikely(skb == NULL))
return;
len = skb->len;
if (sk->sk_state == DCCP_PARTOPEN) {
const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
/*
* See 8.1.5 - Handshake Completion.
*
* For robustness we resend Confirm options until the client has
* entered OPEN. During the initial feature negotiation, the MPS
* is smaller than usual, reduced by the Change/Confirm options.
*/
if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
DCCP_WARN("Payload too large (%d) for featneg.\n", len);
dccp_send_ack(sk);
dccp_feat_list_purge(&dp->dccps_featneg);
}
inet_csk_schedule_ack(sk);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
inet_csk(sk)->icsk_rto,
DCCP_RTO_MAX);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
} else if (dccp_ack_pending(sk)) {
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
} else {
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
}
err = dccp_transmit_skb(sk, skb);
if (err)
dccp_pr_debug("transmit_skb() returned err=%d\n", err);
/*
* Register this one as sent even if an error occurred. To the remote
* end a local packet drop is indistinguishable from network loss, i.e.
* any local drop will eventually be reported via receiver feedback.
*/
ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
/*
* If the CCID needs to transfer additional header options out-of-band
* (e.g. Ack Vectors or feature-negotiation options), it activates this
* flag to schedule a Sync. The Sync will automatically incorporate all
* currently pending header options, thus clearing the backlog.
*/
if (dp->dccps_sync_scheduled)
dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
}
/**
* dccp_flush_write_queue - Drain queue at end of connection
* Since dccp_sendmsg queues packets without waiting for them to be sent, it may
* happen that the TX queue is not empty at the end of a connection. We give the
* HC-sender CCID a grace period of up to @time_budget jiffies. If this function
* returns with a non-empty write queue, it will be purged later.
*/
void dccp_flush_write_queue(struct sock *sk, long *time_budget)
{
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
long delay, rc;
while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
switch (ccid_packet_dequeue_eval(rc)) {
case CCID_PACKET_WILL_DEQUEUE_LATER:
/*
* If the CCID determines when to send, the next sending
* time is unknown or the CCID may not even send again
* (e.g. remote host crashes or lost Ack packets).
*/
DCCP_WARN("CCID did not manage to send all packets\n");
return;
case CCID_PACKET_DELAY:
delay = msecs_to_jiffies(rc);
if (delay > *time_budget)
return;
rc = dccp_wait_for_ccid(sk, delay);
if (rc < 0)
return;
*time_budget -= (delay - rc);
/* check again if we can send now */
break;
case CCID_PACKET_SEND_AT_ONCE:
dccp_xmit_packet(sk);
break;
case CCID_PACKET_ERR:
skb_dequeue(&sk->sk_write_queue);
kfree_skb(skb);
dccp_pr_debug("packet discarded due to err=%ld\n", rc);
}
}
}
void dccp_write_xmit(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
while ((skb = dccp_qpolicy_top(sk))) {
int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
switch (ccid_packet_dequeue_eval(rc)) {
case CCID_PACKET_WILL_DEQUEUE_LATER:
return;
case CCID_PACKET_DELAY:
sk_reset_timer(sk, &dp->dccps_xmit_timer,
jiffies + msecs_to_jiffies(rc));
return;
case CCID_PACKET_SEND_AT_ONCE:
dccp_xmit_packet(sk);
break;
case CCID_PACKET_ERR:
dccp_qpolicy_drop(sk, skb);
dccp_pr_debug("packet discarded due to err=%d\n", rc);
}
}
}
/**
* dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
* There are only four retransmittable packet types in DCCP:
* - Request in client-REQUEST state (sec. 8.1.1),
* - CloseReq in server-CLOSEREQ state (sec. 8.3),
* - Close in node-CLOSING state (sec. 8.3),
* - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
* This function expects sk->sk_send_head to contain the original skb.
*/
int dccp_retransmit_skb(struct sock *sk)
{
WARN_ON(sk->sk_send_head == NULL);
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
return -EHOSTUNREACH; /* Routing failure or similar. */
/* this count is used to distinguish original and retransmitted skb */
inet_csk(sk)->icsk_retransmits++;
return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
}
struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
struct request_sock *req)
{
struct dccp_hdr *dh;
struct dccp_request_sock *dreq;
const u32 dccp_header_size = sizeof(struct dccp_hdr) +
sizeof(struct dccp_hdr_ext) +
sizeof(struct dccp_hdr_response);
struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
GFP_ATOMIC);
if (skb == NULL)
return NULL;
/* Reserve space for headers. */
skb_reserve(skb, sk->sk_prot->max_header);
skb_dst_set(skb, dst_clone(dst));
dreq = dccp_rsk(req);
if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
dccp_inc_seqno(&dreq->dreq_gss);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
/* Resolve feature dependencies resulting from choice of CCID */
if (dccp_feat_server_ccid_dependencies(dreq))
goto response_failed;
if (dccp_insert_options_rsk(dreq, skb))
goto response_failed;
/* Build and checksum header */
dh = dccp_zeroed_hdr(skb, dccp_header_size);
dh->dccph_sport = htons(inet_rsk(req)->ir_num);
dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
dh->dccph_doff = (dccp_header_size +
DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
dh->dccph_type = DCCP_PKT_RESPONSE;
dh->dccph_x = 1;
dccp_hdr_set_seq(dh, dreq->dreq_gss);
dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
dccp_csum_outgoing(skb);
/* We use `acked' to remember that a Response was already sent. */
inet_rsk(req)->acked = 1;
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
return skb;
response_failed:
kfree_skb(skb);
return NULL;
}
EXPORT_SYMBOL_GPL(dccp_make_response);
/* answer offending packet in @rcv_skb with Reset from control socket @ctl */
struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
{
struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
sizeof(struct dccp_hdr_ext) +
sizeof(struct dccp_hdr_reset);
struct dccp_hdr_reset *dhr;
struct sk_buff *skb;
skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
if (skb == NULL)
return NULL;
skb_reserve(skb, sk->sk_prot->max_header);
/* Swap the send and the receive. */
dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
dh->dccph_type = DCCP_PKT_RESET;
dh->dccph_sport = rxdh->dccph_dport;
dh->dccph_dport = rxdh->dccph_sport;
dh->dccph_doff = dccp_hdr_reset_len / 4;
dh->dccph_x = 1;
dhr = dccp_hdr_reset(skb);
dhr->dccph_reset_code = dcb->dccpd_reset_code;
switch (dcb->dccpd_reset_code) {
case DCCP_RESET_CODE_PACKET_ERROR:
dhr->dccph_reset_data[0] = rxdh->dccph_type;
break;
case DCCP_RESET_CODE_OPTION_ERROR: /* fall through */
case DCCP_RESET_CODE_MANDATORY_ERROR:
memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
break;
}
/*
* From RFC 4340, 8.3.1:
* If P.ackno exists, set R.seqno := P.ackno + 1.
* Else set R.seqno := 0.
*/
if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
dccp_csum_outgoing(skb);
return skb;
}
EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
/* send Reset on established socket, to close or abort the connection */
int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
{
struct sk_buff *skb;
/*
* FIXME: what if rebuild_header fails?
* Should we be doing a rebuild_header here?
*/
int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
if (err != 0)
return err;
skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
if (skb == NULL)
return -ENOBUFS;
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
DCCP_SKB_CB(skb)->dccpd_reset_code = code;
return dccp_transmit_skb(sk, skb);
}
/*
* Do all connect socket setups that can be done AF independent.
*/
int dccp_connect(struct sock *sk)
{
struct sk_buff *skb;
struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
dccp_sync_mss(sk, dst_mtu(dst));
/* do not connect if feature negotiation setup fails */
if (dccp_feat_finalise_settings(dccp_sk(sk)))
return -EPROTO;
/* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
dp->dccps_gar = dp->dccps_iss;
skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
if (unlikely(skb == NULL))
return -ENOBUFS;
/* Reserve space for headers. */
skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
/* Timer for repeating the REQUEST until an answer. */
icsk->icsk_retransmits = 0;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, DCCP_RTO_MAX);
return 0;
}
EXPORT_SYMBOL_GPL(dccp_connect);
void dccp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
if (sk->sk_state != DCCP_CLOSED) {
struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
GFP_ATOMIC);
if (skb == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX,
DCCP_RTO_MAX);
return;
}
/* Reserve space for headers */
skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
dccp_transmit_skb(sk, skb);
}
}
EXPORT_SYMBOL_GPL(dccp_send_ack);
#if 0
/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
void dccp_send_delayed_ack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/*
* FIXME: tune this timer. elapsed time fixes the skew, so no problem
* with using 2s, and active senders also piggyback the ACK into a
* DATAACK packet, so this is really for quiescent senders.
*/
unsigned long timeout = jiffies + 2 * HZ;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
/* If delack timer was blocked or is about to expire,
* send ACK now.
*
* FIXME: check the "about to expire" part
*/
if (icsk->icsk_ack.blocked) {
dccp_send_ack(sk);
return;
}
if (!time_before(timeout, icsk->icsk_ack.timeout))
timeout = icsk->icsk_ack.timeout;
}
icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
#endif
void dccp_send_sync(struct sock *sk, const u64 ackno,
const enum dccp_pkt_type pkt_type)
{
/*
* We are not putting this on the write queue, so
* dccp_transmit_skb() will set the ownership to this
* sock.
*/
struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
if (skb == NULL) {
/* FIXME: how to make sure the sync is sent? */
DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
/*
* Clear the flag in case the Sync was scheduled for out-of-band data,
* such as carrying a long Ack Vector.
*/
dccp_sk(sk)->dccps_sync_scheduled = 0;
dccp_transmit_skb(sk, skb);
}
EXPORT_SYMBOL_GPL(dccp_send_sync);
/*
* Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
* cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
* any circumstances.
*/
void dccp_send_close(struct sock *sk, const int active)
{
struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
skb = alloc_skb(sk->sk_prot->max_header, prio);
if (skb == NULL)
return;
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, sk->sk_prot->max_header);
if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
else
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
if (active) {
skb = dccp_skb_entail(sk, skb);
/*
* Retransmission timer for active-close: RFC 4340, 8.3 requires
* to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
* state can be left. The initial timeout is 2 RTTs.
* Since RTT measurement is done by the CCIDs, there is no easy
* way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
* is too low (200ms); we use a high value to avoid unnecessary
* retransmissions when the link RTT is > 0.2 seconds.
* FIXME: Let main module sample RTTs and use that instead.
*/
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
}
dccp_transmit_skb(sk, skb);
}

203
net/dccp/probe.c Normal file
View file

@ -0,0 +1,203 @@
/*
* dccp_probe - Observe the DCCP flow with kprobes.
*
* The idea for this came from Werner Almesberger's umlsim
* Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
*
* Modified for DCCP from Stephen Hemminger's code
* Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/socket.h>
#include <linux/dccp.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/kfifo.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <net/net_namespace.h>
#include "dccp.h"
#include "ccid.h"
#include "ccids/ccid3.h"
static int port;
static int bufsize = 64 * 1024;
static const char procname[] = "dccpprobe";
static struct {
struct kfifo fifo;
spinlock_t lock;
wait_queue_head_t wait;
struct timespec tstart;
} dccpw;
static void printl(const char *fmt, ...)
{
va_list args;
int len;
struct timespec now;
char tbuf[256];
va_start(args, fmt);
getnstimeofday(&now);
now = timespec_sub(now, dccpw.tstart);
len = sprintf(tbuf, "%lu.%06lu ",
(unsigned long) now.tv_sec,
(unsigned long) now.tv_nsec / NSEC_PER_USEC);
len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
va_end(args);
kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
wake_up(&dccpw.wait);
}
static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
struct msghdr *msg, size_t size)
{
const struct inet_sock *inet = inet_sk(sk);
struct ccid3_hc_tx_sock *hc = NULL;
if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
hc = ccid3_hc_tx_sk(sk);
if (port == 0 || ntohs(inet->inet_dport) == port ||
ntohs(inet->inet_sport) == port) {
if (hc)
printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport), size,
hc->tx_s, hc->tx_rtt, hc->tx_p,
hc->tx_x_calc, hc->tx_x_recv >> 6,
hc->tx_x >> 6, hc->tx_t_ipi);
else
printl("%pI4:%u %pI4:%u %d\n",
&inet->inet_saddr, ntohs(inet->inet_sport),
&inet->inet_daddr, ntohs(inet->inet_dport),
size);
}
jprobe_return();
return 0;
}
static struct jprobe dccp_send_probe = {
.kp = {
.symbol_name = "dccp_sendmsg",
},
.entry = jdccp_sendmsg,
};
static int dccpprobe_open(struct inode *inode, struct file *file)
{
kfifo_reset(&dccpw.fifo);
getnstimeofday(&dccpw.tstart);
return 0;
}
static ssize_t dccpprobe_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
int error = 0, cnt = 0;
unsigned char *tbuf;
if (!buf)
return -EINVAL;
if (len == 0)
return 0;
tbuf = vmalloc(len);
if (!tbuf)
return -ENOMEM;
error = wait_event_interruptible(dccpw.wait,
kfifo_len(&dccpw.fifo) != 0);
if (error)
goto out_free;
cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
out_free:
vfree(tbuf);
return error ? error : cnt;
}
static const struct file_operations dccpprobe_fops = {
.owner = THIS_MODULE,
.open = dccpprobe_open,
.read = dccpprobe_read,
.llseek = noop_llseek,
};
static __init int dccpprobe_init(void)
{
int ret = -ENOMEM;
init_waitqueue_head(&dccpw.wait);
spin_lock_init(&dccpw.lock);
if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
return ret;
if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
goto err0;
ret = register_jprobe(&dccp_send_probe);
if (ret) {
ret = request_module("dccp");
if (!ret)
ret = register_jprobe(&dccp_send_probe);
}
if (ret)
goto err1;
pr_info("DCCP watch registered (port=%d)\n", port);
return 0;
err1:
remove_proc_entry(procname, init_net.proc_net);
err0:
kfifo_free(&dccpw.fifo);
return ret;
}
module_init(dccpprobe_init);
static __exit void dccpprobe_exit(void)
{
kfifo_free(&dccpw.fifo);
remove_proc_entry(procname, init_net.proc_net);
unregister_jprobe(&dccp_send_probe);
}
module_exit(dccpprobe_exit);
MODULE_PARM_DESC(port, "Port to match (0=all)");
module_param(port, int, 0);
MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
module_param(bufsize, int, 0);
MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
MODULE_DESCRIPTION("DCCP snooper");
MODULE_LICENSE("GPL");

1255
net/dccp/proto.c Normal file

File diff suppressed because it is too large Load diff

137
net/dccp/qpolicy.c Normal file
View file

@ -0,0 +1,137 @@
/*
* net/dccp/qpolicy.c
*
* Policy-based packet dequeueing interface for DCCP.
*
* Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License v2
* as published by the Free Software Foundation.
*/
#include "dccp.h"
/*
* Simple Dequeueing Policy:
* If tx_qlen is different from 0, enqueue up to tx_qlen elements.
*/
static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
{
skb_queue_tail(&sk->sk_write_queue, skb);
}
static bool qpolicy_simple_full(struct sock *sk)
{
return dccp_sk(sk)->dccps_tx_qlen &&
sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
}
static struct sk_buff *qpolicy_simple_top(struct sock *sk)
{
return skb_peek(&sk->sk_write_queue);
}
/*
* Priority-based Dequeueing Policy:
* If tx_qlen is different from 0 and the queue has reached its upper bound
* of tx_qlen elements, replace older packets lowest-priority-first.
*/
static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
{
struct sk_buff *skb, *best = NULL;
skb_queue_walk(&sk->sk_write_queue, skb)
if (best == NULL || skb->priority > best->priority)
best = skb;
return best;
}
static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
{
struct sk_buff *skb, *worst = NULL;
skb_queue_walk(&sk->sk_write_queue, skb)
if (worst == NULL || skb->priority < worst->priority)
worst = skb;
return worst;
}
static bool qpolicy_prio_full(struct sock *sk)
{
if (qpolicy_simple_full(sk))
dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
return false;
}
/**
* struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
* @push: add a new @skb to the write queue
* @full: indicates that no more packets will be admitted
* @top: peeks at whatever the queueing policy defines as its `top'
*/
static struct dccp_qpolicy_operations {
void (*push) (struct sock *sk, struct sk_buff *skb);
bool (*full) (struct sock *sk);
struct sk_buff* (*top) (struct sock *sk);
__be32 params;
} qpol_table[DCCPQ_POLICY_MAX] = {
[DCCPQ_POLICY_SIMPLE] = {
.push = qpolicy_simple_push,
.full = qpolicy_simple_full,
.top = qpolicy_simple_top,
.params = 0,
},
[DCCPQ_POLICY_PRIO] = {
.push = qpolicy_simple_push,
.full = qpolicy_prio_full,
.top = qpolicy_prio_best_skb,
.params = DCCP_SCM_PRIORITY,
},
};
/*
* Externally visible interface
*/
void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
{
qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
}
bool dccp_qpolicy_full(struct sock *sk)
{
return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
}
void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
{
if (skb != NULL) {
skb_unlink(skb, &sk->sk_write_queue);
kfree_skb(skb);
}
}
struct sk_buff *dccp_qpolicy_top(struct sock *sk)
{
return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
}
struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
{
struct sk_buff *skb = dccp_qpolicy_top(sk);
if (skb != NULL) {
/* Clear any skb fields that we used internally */
skb->priority = 0;
skb_unlink(skb, &sk->sk_write_queue);
}
return skb;
}
bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
{
/* check if exactly one bit is set */
if (!param || (param & (param - 1)))
return false;
return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
}

118
net/dccp/sysctl.c Normal file
View file

@ -0,0 +1,118 @@
/*
* net/dccp/sysctl.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@mandriva.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License v2
* as published by the Free Software Foundation.
*/
#include <linux/mm.h>
#include <linux/sysctl.h>
#include "dccp.h"
#include "feat.h"
#ifndef CONFIG_SYSCTL
#error This file should not be compiled without CONFIG_SYSCTL defined
#endif
/* Boundary values */
static int zero = 0,
one = 1,
u8_max = 0xFF;
static unsigned long seqw_min = DCCPF_SEQ_WMIN,
seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
static struct ctl_table dccp_default_table[] = {
{
.procname = "seq_window",
.data = &sysctl_dccp_sequence_window,
.maxlen = sizeof(sysctl_dccp_sequence_window),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
.extra2 = &seqw_max,
},
{
.procname = "rx_ccid",
.data = &sysctl_dccp_rx_ccid,
.maxlen = sizeof(sysctl_dccp_rx_ccid),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &u8_max, /* RFC 4340, 10. */
},
{
.procname = "tx_ccid",
.data = &sysctl_dccp_tx_ccid,
.maxlen = sizeof(sysctl_dccp_tx_ccid),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &u8_max, /* RFC 4340, 10. */
},
{
.procname = "request_retries",
.data = &sysctl_dccp_request_retries,
.maxlen = sizeof(sysctl_dccp_request_retries),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
.extra2 = &u8_max,
},
{
.procname = "retries1",
.data = &sysctl_dccp_retries1,
.maxlen = sizeof(sysctl_dccp_retries1),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &u8_max,
},
{
.procname = "retries2",
.data = &sysctl_dccp_retries2,
.maxlen = sizeof(sysctl_dccp_retries2),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &u8_max,
},
{
.procname = "tx_qlen",
.data = &sysctl_dccp_tx_qlen,
.maxlen = sizeof(sysctl_dccp_tx_qlen),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
{
.procname = "sync_ratelimit",
.data = &sysctl_dccp_sync_ratelimit,
.maxlen = sizeof(sysctl_dccp_sync_ratelimit),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
{ }
};
static struct ctl_table_header *dccp_table_header;
int __init dccp_sysctl_init(void)
{
dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
dccp_default_table);
return dccp_table_header != NULL ? 0 : -ENOMEM;
}
void dccp_sysctl_exit(void)
{
if (dccp_table_header != NULL) {
unregister_net_sysctl_table(dccp_table_header);
dccp_table_header = NULL;
}
}

293
net/dccp/timer.c Normal file
View file

@ -0,0 +1,293 @@
/*
* net/dccp/timer.c
*
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/dccp.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include "dccp.h"
/* sysctl variables governing numbers of retransmission attempts */
int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
static void dccp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
sk->sk_error_report(sk);
dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_done(sk);
DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
}
/* A write timeout has occurred. Process the after effects. */
static int dccp_write_timeout(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
int retry_until;
if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
if (icsk->icsk_retransmits != 0)
dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ?
: sysctl_dccp_request_retries;
} else {
if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
black hole detection. :-(
It is place to make it. It is not made. I do not want
to make it. It is disguisting. It does not work in any
case. Let me to cite the same draft, which requires for
us to implement this:
"The one security concern raised by this memo is that ICMP black holes
are often caused by over-zealous security administrators who block
all ICMP messages. It is vitally important that those who design and
deploy security systems understand the impact of strict filtering on
upper-layer protocols. The safest web site in the world is worthless
if most TCP implementations cannot transfer data from it. It would
be far nicer to have all of the black holes fixed rather than fixing
all of the TCP implementations."
Golden words :-).
*/
dst_negative_advice(sk);
}
retry_until = sysctl_dccp_retries2;
/*
* FIXME: see tcp_write_timout and tcp_out_of_resources
*/
}
if (icsk->icsk_retransmits >= retry_until) {
/* Has it gone just too far? */
dccp_write_err(sk);
return 1;
}
return 0;
}
/*
* The DCCP retransmit timer.
*/
static void dccp_retransmit_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
/*
* More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
* sent, no need to retransmit, this sock is dead.
*/
if (dccp_write_timeout(sk))
return;
/*
* We want to know the number of packets retransmitted, not the
* total number of retransmissions of clones of original packets.
*/
if (icsk->icsk_retransmits == 0)
DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
if (dccp_retransmit_skb(sk) != 0) {
/*
* Retransmission failed because of local congestion,
* do not backoff.
*/
if (--icsk->icsk_retransmits == 0)
icsk->icsk_retransmits = 1;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
min(icsk->icsk_rto,
TCP_RESOURCE_PROBE_INTERVAL),
DCCP_RTO_MAX);
return;
}
icsk->icsk_backoff++;
icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
DCCP_RTO_MAX);
if (icsk->icsk_retransmits > sysctl_dccp_retries1)
__sk_dst_reset(sk);
}
static void dccp_write_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct inet_connection_sock *icsk = inet_csk(sk);
int event = 0;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later */
sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
jiffies + (HZ / 20));
goto out;
}
if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
goto out;
if (time_after(icsk->icsk_timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
icsk->icsk_timeout);
goto out;
}
event = icsk->icsk_pending;
icsk->icsk_pending = 0;
switch (event) {
case ICSK_TIME_RETRANS:
dccp_retransmit_timer(sk);
break;
}
out:
bh_unlock_sock(sk);
sock_put(sk);
}
/*
* Timer for listening sockets
*/
static void dccp_response_timer(struct sock *sk)
{
inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, DCCP_TIMEOUT_INIT,
DCCP_RTO_MAX);
}
static void dccp_keepalive_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
/* Only process if socket is not in use. */
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
inet_csk_reset_keepalive_timer(sk, HZ / 20);
goto out;
}
if (sk->sk_state == DCCP_LISTEN) {
dccp_response_timer(sk);
goto out;
}
out:
bh_unlock_sock(sk);
sock_put(sk);
}
/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
static void dccp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
struct inet_connection_sock *icsk = inet_csk(sk);
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
icsk->icsk_ack.blocked = 1;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
sk_reset_timer(sk, &icsk->icsk_delack_timer,
jiffies + TCP_DELACK_MIN);
goto out;
}
if (sk->sk_state == DCCP_CLOSED ||
!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
goto out;
if (time_after(icsk->icsk_ack.timeout, jiffies)) {
sk_reset_timer(sk, &icsk->icsk_delack_timer,
icsk->icsk_ack.timeout);
goto out;
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
if (!icsk->icsk_ack.pingpong) {
/* Delayed ACK missed: inflate ATO. */
icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
dccp_send_ack(sk);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
bh_unlock_sock(sk);
sock_put(sk);
}
/**
* dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
* See the comments above %ccid_dequeueing_decision for supported modes.
*/
static void dccp_write_xmitlet(unsigned long data)
{
struct sock *sk = (struct sock *)data;
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
else
dccp_write_xmit(sk);
bh_unlock_sock(sk);
}
static void dccp_write_xmit_timer(unsigned long data)
{
dccp_write_xmitlet(data);
sock_put((struct sock *)data);
}
void dccp_init_xmit_timers(struct sock *sk)
{
struct dccp_sock *dp = dccp_sk(sk);
tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
(unsigned long)sk);
inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
&dccp_keepalive_timer);
}
static ktime_t dccp_timestamp_seed;
/**
* dccp_timestamp - 10s of microseconds time source
* Returns the number of 10s of microseconds since loading DCCP. This is native
* DCCP time difference format (RFC 4340, sec. 13).
* Please note: This will wrap around about circa every 11.9 hours.
*/
u32 dccp_timestamp(void)
{
u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
do_div(delta, 10);
return delta;
}
EXPORT_SYMBOL_GPL(dccp_timestamp);
void __init dccp_timestamping_init(void)
{
dccp_timestamp_seed = ktime_get_real();
}