Commit 079096f1 authored by Eric Dumazet, committed by David S. Miller

tcp/dccp: install syn_recv requests into ehash table

In this patch, we insert request sockets into TCP/DCCP
regular ehash table (where ESTABLISHED and TIMEWAIT sockets
are) instead of using the per listener hash table.

ACK packets find SYN_RECV pseudo sockets without having
to find and lock the listener.

In nominal conditions, this halves pressure on listener lock.

Note that this will allow for SO_REUSEPORT refinements,
so that we can select a listener using cpu/numa affinities instead
of the prior 'consistent hash', since only SYN packets will
apply this selection logic.

We will shrink listen_sock in the following patch to ease
code review.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ying Cai <ycai@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 2feda341
......@@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
struct request_sock *inet_csk_search_req(struct sock *sk,
const __be16 rport,
const __be32 raddr,
const __be32 laddr);
int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax);
int inet_csk_get_port(struct sock *sk, unsigned short snum);
......
......@@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk);
void inet_hashinfo_init(struct inet_hashinfo *h);
int inet_ehash_insert(struct sock *sk, struct sock *osk);
void __inet_hash_nolisten(struct sock *sk, struct sock *osk);
void __inet_hash(struct sock *sk, struct sock *osk);
void inet_hash(struct sock *sk);
......
......@@ -266,8 +266,4 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log;
}
void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout);
#endif /* _REQUEST_SOCK_H */
......@@ -1618,7 +1618,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
/* /proc */
/* Iterator states, grouped under the "/proc" heading above.
 * NOTE(review): presumably one value per kind of socket the /proc TCP dump
 * walks (listeners, open requests, established sockets) - confirm against
 * the seq_file iterator that consumes this enum.
 */
enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING,
TCP_SEQ_STATE_OPENREQ,
TCP_SEQ_STATE_ESTABLISHED,
};
......@@ -1717,8 +1716,6 @@ struct tcp_request_sock_ops {
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
u16 queue_mapping, struct tcp_fastopen_cookie *foc);
void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
const unsigned long timeout);
};
#ifdef CONFIG_SYN_COOKIES
......
......@@ -99,35 +99,9 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk(
/* Release the listen_sock table owned by @queue.
 *
 * The table is first detached from @queue with reqsk_queue_yank_listen_sk().
 * If the queue still reports pending requests, every syn_table[] bucket is
 * emptied: each request is unlinked from the bucket head, queue->qlen is
 * decremented, its timer is stopped and one or two references are dropped.
 * A non-zero queue length after that is reported through WARN_ON()/pr_err()
 * before the table is freed with kvfree().
 */
void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
if (reqsk_queue_len(queue) != 0) {
unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
spin_lock_bh(&queue->syn_wait_lock);
/* Pop requests off the head of this bucket until it is empty. */
while ((req = lopt->syn_table[i]) != NULL) {
lopt->syn_table[i] = req->dl_next;
/* Because of following del_timer_sync(),
* we must release the spinlock here
* or risk a dead lock.
*/
spin_unlock_bh(&queue->syn_wait_lock);
atomic_dec(&queue->qlen);
/* A non-zero return means the timer was still pending: drop one
 * reference for it (presumably the one held on behalf of the armed
 * timer - confirm), then one more unconditionally.
 */
if (del_timer_sync(&req->rsk_timer))
reqsk_put(req);
reqsk_put(req);
/* Re-take the lock before inspecting the bucket head again. */
spin_lock_bh(&queue->syn_wait_lock);
}
spin_unlock_bh(&queue->syn_wait_lock);
}
}
if (WARN_ON(reqsk_queue_len(queue) != 0))
pr_err("qlen %u\n", reqsk_queue_len(queue));
/* cleaning is done by req timers */
kvfree(lopt);
}
......
......@@ -444,36 +444,6 @@ put_and_exit:
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
/*
 * Work out which socket should process @skb, received on listener @sk.
 *
 * Lookup order:
 *   1. a pending connection request matching the packet's remote port and
 *      addresses: the result of dccp_check_req() is returned as is, and the
 *      request reference is dropped when that result is NULL;
 *   2. an entry in the established hash: returned bh-locked unless it is
 *      in DCCP_TIME_WAIT, in which case its reference is released and NULL
 *      is returned;
 *   3. otherwise the listener @sk itself.
 */
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct dccp_hdr *dh = dccp_hdr(skb);
	const struct iphdr *iph = ip_hdr(skb);
	struct request_sock *req;
	struct sock *child;

	/* Find possible connection requests. */
	req = inet_csk_search_req(sk, dh->dccph_sport,
				  iph->saddr, iph->daddr);
	if (req) {
		child = dccp_check_req(sk, skb, req);
		if (child == NULL)
			reqsk_put(req);
		return child;
	}

	child = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
					iph->saddr, dh->dccph_sport,
					iph->daddr, dh->dccph_dport,
					inet_iif(skb));
	if (child == NULL)
		return sk;

	if (child->sk_state == DCCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(child));
		return NULL;
	}

	bh_lock_sock(child);
	return child;
}
static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
......@@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v4_hnd_req(sk, skb);
if (nsk == NULL)
goto discard;
if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
return 0;
}
}
if (dccp_rcv_state_process(sk, skb, dh, skb->len))
goto reset;
......@@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
reset:
dccp_v4_ctl_send_reset(sk, skb);
discard:
kfree_skb(skb);
return 0;
}
......@@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v4_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
......
......@@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.syn_ack_timeout = dccp_syn_ack_timeout,
};
/*
 * IPv6 counterpart of the listener demultiplexing helper: decide which
 * socket should process @skb, received on listener @sk.
 *
 * Lookup order:
 *   1. a pending connection request found by inet6_csk_search_req(): the
 *      result of dccp_check_req() is returned as is, and the request
 *      reference is dropped when that result is NULL;
 *   2. an entry in the established hash: returned bh-locked unless it is
 *      in DCCP_TIME_WAIT, in which case its reference is released and NULL
 *      is returned;
 *   3. otherwise the listener @sk itself.
 */
static struct sock *dccp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
{
	const struct dccp_hdr *dh = dccp_hdr(skb);
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct request_sock *req;
	struct sock *child;

	req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
				   &iph->daddr, inet6_iif(skb));
	if (req) {
		child = dccp_check_req(sk, skb, req);
		if (child == NULL)
			reqsk_put(req);
		return child;
	}

	child = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
					   &iph->saddr, dh->dccph_sport,
					   &iph->daddr, ntohs(dh->dccph_dport),
					   inet6_iif(skb));
	if (child == NULL)
		return sk;

	if (child->sk_state == DCCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(child));
		return NULL;
	}

	bh_lock_sock(child);
	return child;
}
static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
struct request_sock *req;
......@@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (dccp_v6_send_response(sk, req))
goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0;
drop_and_free:
......@@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
*/
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v6_hnd_req(sk, skb);
if (nsk == NULL)
goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
if (opt_skb != NULL)
__kfree_skb(opt_skb);
return 0;
}
}
if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
goto reset;
......@@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v6_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted
......
......@@ -476,65 +476,12 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
/*
 * Map a remote (address, port) pair to a SYN-queue bucket index.
 *
 * The pair is mixed with @rnd by jhash_2words() and the result is masked
 * with (@synq_hsize - 1).
 * NOTE(review): the masking only yields a uniform index in
 * [0, synq_hsize) if @synq_hsize is a power of two - confirm at callers.
 */
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
				 const u32 rnd, const u32 synq_hsize)
{
	const u32 bucket_mask = synq_hsize - 1;
	const u32 mixed = jhash_2words((__force u32)raddr,
				       (__force u32)rport, rnd);

	return mixed & bucket_mask;
}
/* AF_INET_FAMILY(fam): address-family test used when matching requests.
 * When CONFIG_IPV6 is enabled the macro compares @fam against AF_INET;
 * otherwise it expands to the constant true and @fam is not evaluated.
 */
#if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else
#define AF_INET_FAMILY(fam) true
#endif
/*
 * Look up a pending IPv4 connection request on listener @sk.
 *
 * The bucket is selected with inet_synq_hash() from @raddr/@rport and
 * scanned under syn_wait_lock for a request whose remote port, remote
 * address and local address equal @rport, @raddr and @laddr and whose
 * family passes AF_INET_FAMILY().
 *
 * Returns the matching request with its rsk_refcnt incremented, or NULL
 * when the bucket holds no match.
 *
 * Note: this is temporary: req sock will no longer be in listener hash
 * table.
 */
struct request_sock *inet_csk_search_req(struct sock *sk,
					 const __be16 rport,
					 const __be32 raddr,
					 const __be32 laddr)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	struct request_sock *req;
	u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
				  lopt->nr_table_entries);

	spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);

	req = lopt->syn_table[hash];
	while (req != NULL) {
		const struct inet_request_sock *ireq = inet_rsk(req);

		if (ireq->ir_rmt_port == rport &&
		    ireq->ir_rmt_addr == raddr &&
		    ireq->ir_loc_addr == laddr &&
		    AF_INET_FAMILY(req->rsk_ops->family)) {
			/* Take the caller's reference while still locked. */
			atomic_inc(&req->rsk_refcnt);
			WARN_ON(req->sk);
			break;
		}
		req = req->dl_next;
	}

	spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);

	return req;
}
EXPORT_SYMBOL_GPL(inet_csk_search_req);
/*
 * Queue request @req on listener @sk.
 *
 * The SYN-table bucket is computed with inet_synq_hash() from the
 * request's remote address and port, the request is handed to
 * reqsk_queue_hash_req() together with @timeout, and the listener is then
 * notified through inet_csk_reqsk_queue_added().
 */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	u32 bucket;

	bucket = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
				inet_rsk(req)->ir_rmt_port,
				lopt->hash_rnd, lopt->nr_table_entries);

	reqsk_queue_hash_req(&icsk->icsk_accept_queue, bucket, req, timeout);
	inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries;
......@@ -571,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
}
EXPORT_SYMBOL(inet_rtx_syn_ack);
/* return true if req was found in the syn_table[] */
/* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req)
{
struct listen_sock *lopt = queue->listen_opt;
struct request_sock **prev;
bool found = false;
struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
spinlock_t *lock;
bool found;
spin_lock(&queue->syn_wait_lock);
lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL;
prev = &(*prev)->dl_next) {
if (*prev == req) {
*prev = req->dl_next;
found = true;
break;
}
}
spin_lock(lock);
found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
spin_unlock(lock);
spin_unlock(&queue->syn_wait_lock);
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req);
return found;
......@@ -616,10 +557,8 @@ static void reqsk_timer_handler(unsigned long data)
int max_retries, thresh;
u8 defer_accept;
if (sk_listener->sk_state != TCP_LISTEN || !lopt) {
reqsk_put(req);
return;
}
if (sk_listener->sk_state != TCP_LISTEN || !lopt)
goto drop;
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries;
......@@ -669,36 +608,36 @@ static void reqsk_timer_handler(unsigned long data)
mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return;
}
drop:
inet_csk_reqsk_queue_drop(sk_listener, req);
reqsk_put(req);
}
void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout)
static void reqsk_queue_hash_req(struct request_sock *req,
unsigned long timeout)
{
struct listen_sock *lopt = queue->listen_opt;
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
req->rsk_hash = hash;
inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
atomic_set(&req->rsk_refcnt, 2);
}
spin_lock(&queue->syn_wait_lock);
req->dl_next = lopt->syn_table[hash];
lopt->syn_table[hash] = req;
spin_unlock(&queue->syn_wait_lock);
/*
 * Queue request @req on listener @sk.
 *
 * @req and @timeout are passed to reqsk_queue_hash_req(); afterwards the
 * listener is notified through inet_csk_reqsk_queue_added().
 */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	/* Hash the request first ... */
	reqsk_queue_hash_req(req, timeout);

	/* ... then account for it on the listener. */
	inet_csk_reqsk_queue_added(sk);
}
EXPORT_SYMBOL(reqsk_queue_hash_req);
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
......
......@@ -730,91 +730,21 @@ static void twsk_build_assert(void)
#endif
}
/* Dump the pending connection requests of listener @sk into @skb.
 *
 * Walks every bucket of the listener's syn_table[] under syn_wait_lock and
 * emits one record per request with inet_req_diag_fill(). A request is
 * skipped when @r carries a non-zero idiag_dport that differs from the
 * request's remote port, or when @bc is non-NULL and inet_diag_bc_run()
 * returns zero for it.
 *
 * The walk is resumable through @cb: args[3] holds the bucket index plus
 * one and args[4] the position of the request inside that bucket.
 *
 * Returns 0, or the negative value returned by inet_req_diag_fill() when a
 * record could not be added.
 */
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
int j, s_j, reqnum, s_reqnum;
struct listen_sock *lopt;
int err = 0;
/* Restore the cursor saved by a previous, interrupted pass. */
s_j = cb->args[3];
s_reqnum = cb->args[4];
/* args[3] is stored as bucket + 1 (see the error path below), so step
 * back to the bucket itself.
 */
if (s_j > 0)
s_j--;
entry.family = sk->sk_family;
spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
lopt = icsk->icsk_accept_queue.listen_opt;
/* Nothing to dump without a table or with an empty queue. */
if (!lopt || !reqsk_queue_len(&icsk->icsk_accept_queue))
goto out;
/* These entry fields come from the listener, so they are the same for
 * every request and are filled in once.
 */
if (bc) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
for (j = s_j; j < lopt->nr_table_entries; j++) {
struct request_sock *req, *head = lopt->syn_table[j];
reqnum = 0;
for (req = head; req; reqnum++, req = req->dl_next) {
struct inet_request_sock *ireq = inet_rsk(req);
/* Skip requests that lie before the saved position. */
if (reqnum < s_reqnum)
continue;
/* A non-zero idiag_dport restricts the dump to that remote port. */
if (r->id.idiag_dport != ireq->ir_rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
/* Note: entry.sport and entry.userlocks are already set */
entry_fill_addrs(&entry, req_to_sk(req));
entry.dport = ntohs(ireq->ir_rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
}
err = inet_req_diag_fill(req_to_sk(req), skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, cb->nlh);
if (err < 0) {
/* Remember where to resume: bucket + 1 and the request index. */
cb->args[3] = j + 1;
cb->args[4] = reqnum;
goto out;
}
}
/* The saved request offset only applies to the first bucket visited. */
s_reqnum = 0;
}
out:
spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return err;
}
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1];
s_num = num = cb->args[2];
if (cb->args[0] == 0) {
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
......@@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport)
goto next_listen;
if (!(r->idiag_states & TCPF_LISTEN) ||
r->id.idiag_dport ||
if (r->id.idiag_dport ||
cb->args[3] > 0)
goto syn_recv;
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
syn_recv:
if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen;
if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
......@@ -879,7 +799,7 @@ skip_listen_ht:
s_i = num = s_num = 0;
}
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
if (!(idiag_states & ~TCPF_LISTEN))
goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
......@@ -906,7 +826,7 @@ skip_listen_ht:
goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state;
if (!(r->idiag_states & (1 << state)))
if (!(idiag_states & (1 << state)))
goto next_normal;
if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family)
......
......@@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
/* insert a socket into ehash, and eventually remove another one
* (the other one can be a SYN_RECV or TIMEWAIT socket)
*/
int inet_ehash_insert(struct sock *sk, struct sock *osk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
int ret = 0;
WARN_ON(!sk_unhashed(sk));
WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
......@@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
sk_nulls_del_node_init_rcu(osk);
}
spin_unlock(lock);
return ret;
}
/*
 * Hash a non-listening socket.
 *
 * @sk and @osk are passed straight to inet_ehash_insert(), whose return
 * value is ignored; sock_prot_inuse_add() is then called with a delta of 1
 * for the socket's network namespace and protocol.
 */
void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
{
	(void)inet_ehash_insert(sk, osk);

	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
......
......@@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
}
EXPORT_SYMBOL(cookie_ecn_ok);
/* On input, sk is a listener.
* Output is listener if incoming packet would not create a child,
* or NULL if memory could not be allocated.
*/
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
......
......@@ -6241,7 +6241,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free;
tcp_rsk(req)->tfo_listener = false;
af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
}
tcp_reqsk_record_syn(sk, req, skb);
......
......@@ -1224,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack,
.queue_hash_add = inet_csk_reqsk_queue_hash_add,
};
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
......@@ -1343,34 +1342,11 @@ put_and_exit:
}
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)