Commit 2e6599cb, authored by Arnaldo Carvalho de Melo and committed by David S. Miller
Browse files

[NET] Generalise TCP's struct open_request minisock infrastructure



Kept this first changeset minimal, without changing existing names to
ease peer review.

Basically, tcp_openreq_alloc now receives the or_calltable, which in turn
has two new members:

->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
  a specific protocol

The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.

I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.

Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)

Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 1944972d
......@@ -81,6 +81,7 @@
#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/types.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/igmp.h>
#include <net/flow.h>
......@@ -107,6 +108,26 @@ struct ip_options {
#define optlength(opt) (sizeof(struct ip_options) + opt->optlen)
/*
 * struct inet_request_sock - request minisock for connection oriented
 * PF_INET protocols.
 *
 * Extends struct open_request. inet_rsk() reaches this structure by casting
 * an open_request pointer, so @req must remain the first member.
 */
struct inet_request_sock {
struct open_request req; /* generic request state; keep first */
u32 loc_addr; /* local IPv4 address */
u32 rmt_addr; /* remote IPv4 address */
u16 rmt_port; /* remote port, stored as found in the TCP header */
/*
 * Window scale values and per-request option flags, packed into one u16
 * (13 of the 16 bits are used).
 * NOTE(review): the exact meaning of @acked is not visible here - confirm.
 */
u16 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
ecn_ok : 1,
acked : 1;
struct ip_options *opt; /* IP options kept for this request, or NULL */
};
/*
 * inet_rsk - view a generic request as its enclosing inet_request_sock.
 * @sk: request that was allocated as (at least) a struct inet_request_sock
 *
 * This is a plain pointer cast. It is only meaningful because
 * struct inet_request_sock keeps its struct open_request as first member.
 * The const qualifier of @sk is dropped by the cast.
 *
 * Returns the same address, typed as struct inet_request_sock *.
 */
static inline struct inet_request_sock *inet_rsk(const struct open_request *sk)
{
	struct inet_request_sock *ireq = (struct inet_request_sock *)sk;

	return ireq;
}
struct ipv6_pinfo;
struct inet_sock {
......
......@@ -193,6 +193,19 @@ struct inet6_skb_parm {
#define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb))
/*
 * struct tcp6_request_sock - TCP request minisock carrying IPv6 addressing.
 *
 * Extends struct tcp_request_sock. tcp6_rsk() reaches this structure by
 * casting an open_request pointer, so @req must remain the first member.
 */
struct tcp6_request_sock {
struct tcp_request_sock req; /* TCP request state; keep first */
struct in6_addr loc_addr; /* local IPv6 address */
struct in6_addr rmt_addr; /* remote IPv6 address */
struct sk_buff *pktopts; /* NOTE(review): presumably the skb holding IPv6 packet options - confirm */
int iif; /* NOTE(review): presumably the incoming interface index - confirm */
};
/*
 * tcp6_rsk - view a generic request as its enclosing tcp6_request_sock.
 * @sk: request that was allocated as a struct tcp6_request_sock
 *
 * This is a plain pointer cast. It relies on the embedded request structure
 * being the first member of struct tcp6_request_sock. The const qualifier
 * of @sk is dropped by the cast.
 *
 * Returns the same address, typed as struct tcp6_request_sock *.
 */
static inline struct tcp6_request_sock *tcp6_rsk(const struct open_request *sk)
{
	struct tcp6_request_sock *treq6 = (struct tcp6_request_sock *)sk;

	return treq6;
}
/**
* struct ipv6_pinfo - ipv6 private area
*
......
......@@ -230,6 +230,17 @@ struct tcp_options_received {
__u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
};
/*
 * struct tcp_request_sock - TCP specific part of a request minisock.
 *
 * Extends struct inet_request_sock. tcp_rsk() reaches this structure by
 * casting an open_request pointer, so @req must remain the first member.
 */
struct tcp_request_sock {
struct inet_request_sock req; /* PF_INET request state; keep first */
__u32 rcv_isn; /* initial sequence number received from the peer */
__u32 snt_isn; /* initial sequence number sent to the peer */
};
/*
 * tcp_rsk - view a generic request as its enclosing tcp_request_sock.
 * @req: request that was allocated as (at least) a struct tcp_request_sock
 *
 * This is a plain pointer cast. It relies on the embedded request structure
 * being the first member of struct tcp_request_sock. The const qualifier
 * of @req is dropped by the cast.
 *
 * Returns the same address, typed as struct tcp_request_sock *.
 */
static inline struct tcp_request_sock *tcp_rsk(const struct open_request *req)
{
	struct tcp_request_sock *treq = (struct tcp_request_sock *)req;

	return treq;
}
struct tcp_sock {
/* inet_sock has to be the first member of tcp_sock */
struct inet_sock inet;
......
/*
* NET Generic infrastructure for Network protocols.
*
* Definitions for request_sock
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* From code originally in include/net/tcp.h
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _REQUEST_SOCK_H
#define _REQUEST_SOCK_H
#include <linux/slab.h>
#include <linux/types.h>
#include <net/sock.h>
struct open_request;
struct sk_buff;
struct dst_entry;
struct proto;
/*
 * struct or_calltable - per-protocol operations and allocation parameters
 * for request minisocks.
 *
 * A request keeps a pointer to its table in open_request->class.
 */
struct or_calltable {
int family; /* address family these requests belong to */
kmem_cache_t *slab; /* cache that tcp_openreq_alloc()/fastfree() use */
int obj_size; /* size of the protocol's open_request descendant */
/* NOTE(review): presumably retransmits the SYN-ACK for @req - confirm */
int (*rtx_syn_ack)(struct sock *sk,
struct open_request *req,
struct dst_entry *dst);
/* Send an ACK on behalf of @req in reply to @skb. */
void (*send_ack)(struct sk_buff *skb,
struct open_request *req);
/* Send a reset in reply to @skb. */
void (*send_reset)(struct sk_buff *skb);
/* Called by tcp_openreq_free() before the request memory is released. */
void (*destructor)(struct open_request *req);
};
/* struct open_request - mini sock to represent a connection request
 *
 * Protocol independent base of the request hierarchy: protocol specific
 * structures embed it as their first member and are allocated from
 * class->slab by tcp_openreq_alloc().
 */
struct open_request {
struct open_request *dl_next; /* Must be first member! */
u16 mss; /* maximum segment size recorded for this request */
u8 retrans; /* NOTE(review): presumably a retransmission counter - confirm */
u8 __pad; /* explicit padding */
/* The following two fields can be easily recomputed I think -AK */
u32 window_clamp; /* window clamp at creation time */
u32 rcv_wnd; /* rcv_wnd offered first time */
u32 ts_recent; /* NOTE(review): presumably the peer's last timestamp value - confirm */
unsigned long expires; /* expiry time, expressed in jiffies */
struct or_calltable *class; /* operations/slab; set by tcp_openreq_alloc() */
struct sock *sk; /* NOTE(review): presumably the child socket once created - confirm */
};
/*
 * tcp_openreq_alloc - allocate a request minisock for a protocol.
 * @class: call table whose ->slab cache provides the object
 *
 * The object is taken from @class->slab with SLAB_ATOMIC. On success only
 * the ->class back pointer is initialised; every other field is left for
 * the caller to fill in.
 *
 * Returns the new request, or NULL when the allocation fails.
 */
static inline struct open_request *tcp_openreq_alloc(struct or_calltable *class)
{
	struct open_request *req;

	req = kmem_cache_alloc(class->slab, SLAB_ATOMIC);
	if (req == NULL)
		return NULL;

	req->class = class;
	return req;
}
/*
 * tcp_openreq_fastfree - return a request to its slab cache.
 * @req: request obtained from tcp_openreq_alloc()
 *
 * Releases the memory only; the class destructor is NOT invoked. Use
 * tcp_openreq_free() when the destructor has to run as well.
 */
static inline void tcp_openreq_fastfree(struct open_request *req)
{
kmem_cache_free(req->class->slab, req);
}
/*
 * tcp_openreq_free - destroy and release a request.
 * @req: request obtained from tcp_openreq_alloc()
 *
 * Runs the class destructor first, then hands the memory back to the
 * class slab cache through tcp_openreq_fastfree().
 */
static inline void tcp_openreq_free(struct open_request *req)
{
req->class->destructor(req);
tcp_openreq_fastfree(req);
}
#endif /* _REQUEST_SOCK_H */
......@@ -484,6 +484,8 @@ extern void sk_stream_kill_queues(struct sock *sk);
extern int sk_wait_data(struct sock *sk, long *timeo);
struct or_calltable;
/* Networking protocol blocks we attach to sockets.
* socket layer -> transport layer interface
* transport -> network interface is defined by struct inet_proto
......@@ -547,6 +549,8 @@ struct proto {
kmem_cache_t *slab;
unsigned int obj_size;
struct or_calltable *rsk_prot;
struct module *owner;
char name[32];
......
......@@ -31,6 +31,7 @@
#include <linux/cache.h>
#include <linux/percpu.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
......@@ -613,74 +614,6 @@ extern atomic_t tcp_memory_allocated;
extern atomic_t tcp_sockets_allocated;
extern int tcp_memory_pressure;
struct open_request;
/*
 * struct or_calltable - per address family operations on open requests.
 * This variant carries callbacks only; it has no slab or object size.
 */
struct or_calltable {
int family; /* address family these requests belong to */
int (*rtx_syn_ack) (struct sock *sk, struct open_request *req, struct dst_entry*);
void (*send_ack) (struct sk_buff *skb, struct open_request *req);
void (*destructor) (struct open_request *req); /* run by tcp_openreq_free() */
void (*send_reset) (struct sk_buff *skb);
};
/* IPv4 specific part of an open request (af.v4_req in struct open_request). */
struct tcp_v4_open_req {
__u32 loc_addr; /* local IPv4 address */
__u32 rmt_addr; /* remote IPv4 address */
struct ip_options *opt; /* IP options kept for this request, or NULL */
};
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
/*
 * IPv6 specific part of an open request (af.v6_req in struct open_request).
 * Only present when IPv6 is built in or built as a module.
 */
struct tcp_v6_open_req {
struct in6_addr loc_addr; /* local IPv6 address */
struct in6_addr rmt_addr; /* remote IPv6 address */
struct sk_buff *pktopts; /* NOTE(review): presumably the skb holding IPv6 packet options - confirm */
int iif; /* NOTE(review): presumably the incoming interface index - confirm */
};
#endif
/* this structure is too big */
/*
 * Monolithic open request: generic, TCP and per address family state all
 * live in this one structure, with the address family part held in the
 * @af union.
 */
struct open_request {
struct open_request *dl_next; /* Must be first member! */
__u32 rcv_isn; /* initial sequence number received from the peer */
__u32 snt_isn; /* initial sequence number sent to the peer */
__u16 rmt_port; /* remote port, stored as found in the TCP header */
__u16 mss; /* maximum segment size recorded for this request */
__u8 retrans; /* NOTE(review): presumably a retransmission counter - confirm */
__u8 __pad; /* explicit padding */
/* Window scale values and option flags packed into one __u16. */
__u16 snd_wscale : 4,
rcv_wscale : 4,
tstamp_ok : 1,
sack_ok : 1,
wscale_ok : 1,
ecn_ok : 1,
acked : 1;
/* The following two fields can be easily recomputed I think -AK */
__u32 window_clamp; /* window clamp at creation time */
__u32 rcv_wnd; /* rcv_wnd offered first time */
__u32 ts_recent;
unsigned long expires; /* expiry time, expressed in jiffies */
struct or_calltable *class; /* operations table for this request */
struct sock *sk;
/* Address family specific state; the v6 arm exists only with IPv6. */
union {
struct tcp_v4_open_req v4_req;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
struct tcp_v6_open_req v6_req;
#endif
} af;
};
/* SLAB cache for open requests. */
extern kmem_cache_t *tcp_openreq_cachep;
/* Allocate an open request from the shared cache; SLAB_ATOMIC allocation. */
#define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
/* Return @req to the shared cache without running its class destructor. */
#define tcp_openreq_fastfree(req) kmem_cache_free(tcp_openreq_cachep, req)
/*
 * tcp_openreq_free - destroy and release an open request.
 * @req: request to dispose of
 *
 * Runs the class destructor first, then releases the memory through
 * tcp_openreq_fastfree().
 */
static inline void tcp_openreq_free(struct open_request *req)
{
req->class->destructor(req);
tcp_openreq_fastfree(req);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
#else
......@@ -1832,17 +1765,19 @@ static __inline__ void tcp_openreq_init(struct open_request *req,
struct tcp_options_received *rx_opt,
struct sk_buff *skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
req->tstamp_ok = rx_opt->tstamp_ok;
req->sack_ok = rx_opt->sack_ok;
req->snd_wscale = rx_opt->snd_wscale;
req->wscale_ok = rx_opt->wscale_ok;
req->acked = 0;
req->ecn_ok = 0;
req->rmt_port = skb->h.th->source;
ireq->tstamp_ok = rx_opt->tstamp_ok;
ireq->sack_ok = rx_opt->sack_ok;
ireq->snd_wscale = rx_opt->snd_wscale;
ireq->wscale_ok = rx_opt->wscale_ok;
ireq->acked = 0;
ireq->ecn_ok = 0;
ireq->rmt_port = skb->h.th->source;
}
extern void tcp_enter_memory_pressure(void);
......
......@@ -2,6 +2,7 @@
#define _NET_TCP_ECN_H_ 1
#include <net/inet_ecn.h>
#include <net/request_sock.h>
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
......@@ -40,7 +41,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct tcp_sock *tp,
static __inline__ void
TCP_ECN_make_synack(struct open_request *req, struct tcphdr *th)
{
if (req->ecn_ok)
if (inet_rsk(req)->ecn_ok)
th->ece = 1;
}
......@@ -113,14 +114,14 @@ static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
static inline void TCP_ECN_openreq_child(struct tcp_sock *tp,
struct open_request *req)
{
tp->ecn_flags = req->ecn_ok ? TCP_ECN_OK : 0;
tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
}
static __inline__ void
TCP_ECN_create_request(struct open_request *req, struct tcphdr *th)
{
if (sysctl_tcp_ecn && th->ece && th->cwr)
req->ecn_ok = 1;
inet_rsk(req)->ecn_ok = 1;
}
#endif
......@@ -118,6 +118,7 @@
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
......@@ -1363,6 +1364,7 @@ static LIST_HEAD(proto_list);
int proto_register(struct proto *prot, int alloc_slab)
{
char *request_sock_slab_name;
int rc = -ENOBUFS;
if (alloc_slab) {
......@@ -1374,6 +1376,25 @@ int proto_register(struct proto *prot, int alloc_slab)
prot->name);
goto out;
}
if (prot->rsk_prot != NULL) {
static const char mask[] = "request_sock_%s";
request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
if (request_sock_slab_name == NULL)
goto out_free_sock_slab;
sprintf(request_sock_slab_name, mask, prot->name);
prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL, NULL);
if (prot->rsk_prot->slab == NULL) {
printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
prot->name);
goto out_free_request_sock_slab_name;
}
}
}
write_lock(&proto_list_lock);
......@@ -1382,6 +1403,12 @@ int proto_register(struct proto *prot, int alloc_slab)
rc = 0;
out:
return rc;
out_free_request_sock_slab_name:
kfree(request_sock_slab_name);
out_free_sock_slab:
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
goto out;
}
EXPORT_SYMBOL(proto_register);
......@@ -1395,6 +1422,14 @@ void proto_unregister(struct proto *prot)
prot->slab = NULL;
}
if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
const char *name = kmem_cache_name(prot->rsk_prot->slab);
kmem_cache_destroy(prot->rsk_prot->slab);
kfree(name);
prot->rsk_prot->slab = NULL;
}
list_del(&prot->node);
write_unlock(&proto_list_lock);
}
......
......@@ -190,6 +190,8 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
__u32 cookie = ntohl(skb->h.th->ack_seq) - 1;
struct sock *ret = sk;
......@@ -209,19 +211,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
req = tcp_openreq_alloc();
ret = NULL;
req = tcp_openreq_alloc(&or_ipv4); /* for safety */
if (!req)
goto out;
req->rcv_isn = htonl(skb->h.th->seq) - 1;
req->snt_isn = cookie;
ireq = inet_rsk(req);
treq = tcp_rsk(req);
treq->rcv_isn = htonl(skb->h.th->seq) - 1;
treq->snt_isn = cookie;
req->mss = mss;
req->rmt_port = skb->h.th->source;
req->af.v4_req.loc_addr = skb->nh.iph->daddr;
req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
req->class = &or_ipv4; /* for savety */
req->af.v4_req.opt = NULL;
ireq->rmt_port = skb->h.th->source;
ireq->loc_addr = skb->nh.iph->daddr;
ireq->rmt_addr = skb->nh.iph->saddr;
ireq->opt = NULL;
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
......@@ -229,17 +232,15 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
if (opt && opt->optlen) {
int opt_size = sizeof(struct ip_options) + opt->optlen;
req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
if (req->af.v4_req.opt) {
if (ip_options_echo(req->af.v4_req.opt, skb)) {
kfree(req->af.v4_req.opt);
req->af.v4_req.opt = NULL;
}
ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
if (ireq->opt != NULL && ip_options_echo(ireq->opt, skb)) {
kfree(ireq->opt);
ireq->opt = NULL;
}
}
req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
req->wscale_ok = req->sack_ok = 0;
ireq->snd_wscale = ireq->rcv_wscale = ireq->tstamp_ok = 0;
ireq->wscale_ok = ireq->sack_ok = 0;
req->expires = 0UL;
req->retrans = 0;
......@@ -253,8 +254,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct flowi fl = { .nl_u = { .ip4_u =
{ .daddr = ((opt && opt->srr) ?
opt->faddr :
req->af.v4_req.rmt_addr),
.saddr = req->af.v4_req.loc_addr,
ireq->rmt_addr),
.saddr = ireq->loc_addr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = IPPROTO_TCP,
.uli_u = { .ports =
......@@ -272,7 +273,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
&req->rcv_wnd, &req->window_clamp,
0, &rcv_wscale);
/* BTW win scale with syncookies is 0 by definition */
req->rcv_wscale = rcv_wscale;
ireq->rcv_wscale = rcv_wscale;
ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
out: return ret;
......
......@@ -271,7 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;
......@@ -2271,13 +2270,6 @@ void __init tcp_init(void)
__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
sizeof(skb->cb));
tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
sizeof(struct open_request),
0, SLAB_HWCACHE_ALIGN,
NULL, NULL);
if (!tcp_openreq_cachep)
panic("tcp_init: Cannot alloc open_request cache.");
tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
sizeof(struct tcp_bind_bucket),
0, SLAB_HWCACHE_ALIGN,
......@@ -2374,7 +2366,6 @@ EXPORT_SYMBOL(tcp_destroy_sock);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_openreq_cachep);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
......
......@@ -458,6 +458,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
struct open_request *req,
u32 pid, u32 seq)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct inet_sock *inet = inet_sk(sk);
unsigned char *b = skb->tail;
struct tcpdiagmsg *r;
......@@ -482,9 +483,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
tmo = 0;
r->id.tcpdiag_sport = inet->sport;
r->id.tcpdiag_dport = req->rmt_port;
r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
r->id.tcpdiag_dport = ireq->rmt_port;
r->id.tcpdiag_src[0] = ireq->loc_addr;
r->id.tcpdiag_dst[0] = ireq->rmt_addr;
r->tcpdiag_expires = jiffies_to_msecs(tmo),
r->tcpdiag_rqueue = 0;
r->tcpdiag_wqueue = 0;
......@@ -493,9 +494,9 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
#ifdef CONFIG_IP_TCPDIAG_IPV6
if (r->tcpdiag_family == AF_INET6) {
ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
&req->af.v6_req.loc_addr);
&tcp6_rsk(req)->loc_addr);
ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
&req->af.v6_req.rmt_addr);
&tcp6_rsk(req)->rmt_addr);
}
#endif
nlh->nlmsg_len = skb->tail - b;
......@@ -545,9 +546,11 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
reqnum = 0;
for (req = head; req; reqnum++, req = req->dl_next) {
struct inet_request_sock *ireq = inet_rsk(req);
if (reqnum < s_reqnum)
continue;
if (r->id.tcpdiag_dport != req->rmt_port &&
if (r->id.tcpdiag_dport != ireq->rmt_port &&
r->id.tcpdiag_dport)
continue;
......@@ -555,16 +558,16 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
entry.saddr =
#ifdef CONFIG_IP_TCPDIAG_IPV6
(entry.family == AF_INET6) ?
req->af.v6_req.loc_addr.s6_addr32 :
tcp6_rsk(req)->loc_addr.s6_addr32 :
#endif
&req->af.v4_req.loc_addr;
&ireq->loc_addr;
entry.daddr =
#ifdef CONFIG_IP_TCPDIAG_IPV6
(entry.family == AF_INET6) ?
req->af.v6_req.rmt_addr.s6_addr32 :
tcp6_rsk(req)->rmt_addr.s6_addr32 :
#endif
&req->af.v4_req.rmt_addr;
entry.dport = ntohs(req->rmt_port);
&ireq->rmt_addr;
entry.dport = ntohs(ireq->rmt_port);
if (!tcpdiag_bc_run(RTA_DATA(bc),
RTA_PAYLOAD(bc), &entry))
......
......@@ -880,9 +880,11 @@ static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
(req = *prev) != NULL;
prev = &req->dl_next) {
if (req->rmt_port == rport &&
req->af.v4_req.rmt_addr == raddr &&
req->af.v4_req.loc_addr == laddr &&
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq->rmt_port == rport &&
ireq->rmt_addr == raddr &&
ireq->loc_addr == laddr &&
TCP_INET_FAMILY(req->class->family)) {
BUG_TRAP(!req->sk);
*prevp = prev;
......@@ -897,7 +899,7 @@ static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_listen_opt *lopt = tp->listen_opt;
u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
req->expires = jiffies + TCP_TIMEOUT_INIT;
req->retrans = 0;
......@@ -1065,7 +1067,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info)
*/
BUG_TRAP(!req->sk);
if (seq != req->snt_isn) {
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
......@@ -1256,7 +1258,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
{
tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
req->ts_recent);
}
......@@ -1264,18 +1266,19 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk,
struct open_request *req)
{
struct rtable *rt;
struct ip_options *opt = req->af.v4_req.opt;
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options *opt = inet_rsk(req)->opt;
struct flowi fl = { .oif = sk->sk_bound_dev_if,
.nl_u = { .ip4_u =
{ .daddr = ((opt && opt->srr) ?
opt->faddr :
req->af.v4_req.rmt_addr),