/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Yan Burman's avatar
Yan Burman committed
20
#include <linux/ethtool.h>
David Stevens's avatar
David Stevens committed
21 22
#include <net/arp.h>
#include <net/ndisc.h>
23 24 25 26 27 28
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
29
#include <net/vxlan.h>
30

Cong Wang's avatar
Cong Wang committed
31 32
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
33
#include <net/ip6_checksum.h>
Cong Wang's avatar
Cong Wang committed
34
#endif
35 36 37

#define VXLAN_VERSION	"0.1"

38 39
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
40 41 42
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

43 44
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
Stephen Hemminger's avatar
Stephen Hemminger committed
45
 * for compatibility with early adopters.
46
 */
47 48
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
49 50 51 52 53 54
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

Pravin B Shelar's avatar
Pravin B Shelar committed
55
static int vxlan_net_id;
56
static struct rtnl_link_ops vxlan_link_ops;
57

58
static const u8 all_zeros_mac[ETH_ALEN + 2];
59

60
static int vxlan_sock_add(struct vxlan_dev *vxlan);
61

62 63
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

64 65 66 67
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
68
	spinlock_t	  sock_lock;
69 70
};

71 72 73 74 75 76
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
77
	struct list_head  remotes;
78
	u8		  eth_addr[ETH_ALEN];
79
	u16		  state;	/* see ndm_state */
80
	u8		  flags;	/* see ndm_flags */
81 82 83 84 85
};

/* salt for hash table */
static u32 vxlan_salt __read_mostly;

Thomas Graf's avatar
Thomas Graf committed
86 87
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
88 89
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
Thomas Graf's avatar
Thomas Graf committed
90 91
}

Cong Wang's avatar
Cong Wang committed
92 93 94 95
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
Jiri Benc's avatar
Jiri Benc committed
96 97 98 99 100 101
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
Cong Wang's avatar
Cong Wang committed
102 103 104 105
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
Jiri Benc's avatar
Jiri Benc committed
106 107 108 109
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_any(&ipa->sin6.sin6_addr);
	else
		return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
Cong Wang's avatar
Cong Wang committed
110 111 112 113
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
Jiri Benc's avatar
Jiri Benc committed
114 115 116 117
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
	else
		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
Cong Wang's avatar
Cong Wang committed
118 119 120 121
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
Jiri Benc's avatar
Jiri Benc committed
122
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
123
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
Jiri Benc's avatar
Jiri Benc committed
124 125 126
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
127
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
Jiri Benc's avatar
Jiri Benc committed
128 129 130 131 132
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
Cong Wang's avatar
Cong Wang committed
133 134 135
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
Jiri Benc's avatar
Jiri Benc committed
136
			      const union vxlan_addr *ip)
Cong Wang's avatar
Cong Wang committed
137
{
Jiri Benc's avatar
Jiri Benc committed
138
	if (ip->sa.sa_family == AF_INET6)
139
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
Jiri Benc's avatar
Jiri Benc committed
140
	else
141
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
Cong Wang's avatar
Cong Wang committed
142 143 144 145 146 147 148
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
Jiri Benc's avatar
Jiri Benc committed
149
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
Cong Wang's avatar
Cong Wang committed
150 151 152 153
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
Jiri Benc's avatar
Jiri Benc committed
154
	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
Cong Wang's avatar
Cong Wang committed
155 156 157 158
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
Jiri Benc's avatar
Jiri Benc committed
159
	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
Cong Wang's avatar
Cong Wang committed
160 161 162 163
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
Jiri Benc's avatar
Jiri Benc committed
164 165 166
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
167
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
Jiri Benc's avatar
Jiri Benc committed
168 169 170 171 172
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
Cong Wang's avatar
Cong Wang committed
173 174 175
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
Jiri Benc's avatar
Jiri Benc committed
176
			      const union vxlan_addr *ip)
Cong Wang's avatar
Cong Wang committed
177
{
178
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
Cong Wang's avatar
Cong Wang committed
179 180 181
}
#endif

182
/* Virtual Network hash table head */
183
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
184
{
185
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
186 187 188 189
}

/* Hash bucket of the per-namespace socket table for UDP port @port
 * (network byte order).
 */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

196 197 198
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
199
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
200
{
201 202 203 204 205 206
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

/* First remote destination of a forwarding entry, fetched with the plain
 * (non-RCU) list accessor.  The remotes list must not be empty.
 */
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	struct vxlan_rdst *first;

	first = list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
	return first;
}

209 210 211 212 213
/* Look up a VXLAN socket by network namespace, address family, UDP port
 * and the receive-relevant subset of @flags (VXLAN_F_RCV_FLAGS).
 * Returns the matching socket or NULL.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags)
{
	struct hlist_head *bucket = vs_head(net, port);
	struct vxlan_sock *vs;

	/* only the unshareable receive flags take part in the match */
	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, bucket, hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport != port)
			continue;
		if (vxlan_get_sk_family(vs) != family)
			continue;
		if (vs->flags != flags)
			continue;

		return vs;
	}
	return NULL;
}

228
/* Find the device bound to @vs whose default destination VNI equals @vni.
 * Returns NULL when no such device exists.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni)
{
	struct hlist_head *bucket;
	struct vxlan_dev *vxlan;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	bucket = vni_head(vs, vni);
	hlist_for_each_entry_rcu(vxlan, bucket, hlist) {
		if (vxlan->default_dst.remote_vni != vni)
			continue;

		return vxlan;
	}

	return NULL;
}

Pravin B Shelar's avatar
Pravin B Shelar committed
244
/* Look up VNI in a per net namespace table */
245
static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni,
246 247
					sa_family_t family, __be16 port,
					u32 flags)
Pravin B Shelar's avatar
Pravin B Shelar committed
248 249 250
{
	struct vxlan_sock *vs;

251
	vs = vxlan_find_sock(net, family, port, flags);
Pravin B Shelar's avatar
Pravin B Shelar committed
252 253 254
	if (!vs)
		return NULL;

255
	return vxlan_vs_find_vni(vs, vni);
Pravin B Shelar's avatar
Pravin B Shelar committed
256 257
}

258 259
/* Fill in a neighbour netlink message describing forwarding entry @fdb
 * and its remote destination @rdst.
 *
 * For RTM_GETNEIGH the family is AF_INET, and the address / MAC
 * attributes are omitted when they are the wildcard / all-zero values;
 * any other message type is reported as AF_BRIDGE with both attributes.
 *
 * Returns 0 on success, or -EMSGSIZE when @skb has no room (the partially
 * built message is cancelled).
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	const unsigned long now = jiffies;
	bool send_eth = true;
	bool send_ip = true;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	if (type == RTM_GETNEIGH) {
		ndm->ndm_family	= AF_INET;
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else {
		ndm->ndm_family	= AF_BRIDGE;
	}
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = RTN_UNICAST;

	/* the netns id is only reported when the device lives in a
	 * different namespace than vxlan->net
	 */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net)) {
		int nsid = peernet2id(dev_net(vxlan->dev), vxlan->net);

		if (nla_put_s32(skb, NDA_LINK_NETNSID, nsid))
			goto nla_put_failure;
	}

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	/* port, VNI and ifindex are only emitted when they carry
	 * information beyond the device defaults
	 */
	if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port) {
		if (nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;
	}
	if (rdst->remote_vni != vxlan->default_dst.remote_vni) {
		if (nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;
	}
	if (rdst->remote_ifindex) {
		if (nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
Cong Wang's avatar
Cong Wang committed
331
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
332
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
333 334
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
335
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
336 337 338
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

339 340
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type)
341 342 343 344 345 346 347 348 349
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

350
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
351 352 353 354 355 356 357 358 359 360 361 362 363 364
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

Cong Wang's avatar
Cong Wang committed
365
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
David Stevens's avatar
David Stevens committed
366 367
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
368 369 370 371
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
Cong Wang's avatar
Cong Wang committed
372
		.remote_ip = *ipa, /* goes to NDA_DST */
373
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
374
	};
375

376
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
David Stevens's avatar
David Stevens committed
377 378 379 380
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
381 382 383
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
384
	struct vxlan_rdst remote = { };
David Stevens's avatar
David Stevens committed
385 386 387

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

388
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
David Stevens's avatar
David Stevens committed
389 390
}

391 392 393 394 395 396 397 398
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
399 400
#else
	value <<= 16;
401 402 403 404 405 406 407 408 409 410 411 412
#endif
	return hash_64(value, FDB_HASH_BITS);
}

/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac)
{
	return &vxlan->fdb_head[eth_hash(mac)];
}

/* Look up Ethernet address in forwarding table */
413
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
414 415 416 417 418
					const u8 *mac)
{
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
	struct vxlan_fdb *f;

419
	hlist_for_each_entry_rcu(f, head, hlist) {
420
		if (ether_addr_equal(mac, f->eth_addr))
421 422 423 424 425 426
			return f;
	}

	return NULL;
}

427 428 429 430 431 432 433 434 435 436 437 438
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
					const u8 *mac)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac);
	if (f)
		f->used = jiffies;

	return f;
}

439 440
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
Cong Wang's avatar
Cong Wang committed
441
					      union vxlan_addr *ip, __be16 port,
442
					      __be32 vni, __u32 ifindex)
443
{
444
	struct vxlan_rdst *rd;
445

446
	list_for_each_entry(rd, &f->remotes, list) {
Cong Wang's avatar
Cong Wang committed
447
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
448 449 450
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
451
			return rd;
452
	}
453

454 455 456
	return NULL;
}

457 458
/* Replace destination of unicast mac: overwrite the first remote of @f
 * with the given parameters and reset its dst cache.
 *
 * Returns 1 when the remote was changed, 0 when an identical remote
 * already exists or @f has no remotes at all.
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex)
{
	struct vxlan_rdst *first;

	if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
		return 0;

	first = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!first)
		return 0;

	dst_cache_reset(&first->dst_cache);
	first->remote_ip = *ip;
	first->remote_port = port;
	first->remote_vni = vni;
	first->remote_ifindex = ifindex;
	return 1;
}

480 481
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
482
			    union vxlan_addr *ip, __be16 port, __be32 vni,
483
			    __u32 ifindex, struct vxlan_rdst **rdp)
484 485 486 487 488 489 490
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

491 492 493
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
494 495 496 497 498 499

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

Cong Wang's avatar
Cong Wang committed
500
	rd->remote_ip = *ip;
501 502 503
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
504 505 506

	list_add_tail_rcu(&rd->list, &f->remotes);

507
	*rdp = rd;
508 509 510
	return 1;
}

Tom Herbert's avatar
Tom Herbert committed
511 512 513
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
514 515
					  __be32 vni_field,
					  struct gro_remcsum *grc,
516
					  bool nopartial)
Tom Herbert's avatar
Tom Herbert committed
517
{
518
	size_t start, offset;
Tom Herbert's avatar
Tom Herbert committed
519 520

	if (skb->remcsum_offload)
521
		return vh;
Tom Herbert's avatar
Tom Herbert committed
522 523 524 525

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

526 527
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
Tom Herbert's avatar
Tom Herbert committed
528

529 530
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
Tom Herbert's avatar
Tom Herbert committed
531 532 533 534 535 536

	skb->remcsum_offload = 1;

	return vh;
}

537 538 539
static struct sk_buff **vxlan_gro_receive(struct sock *sk,
					  struct sk_buff **head,
					  struct sk_buff *skb)
540 541 542
{
	struct sk_buff *p, **pp = NULL;
	struct vxlanhdr *vh, *vh2;
543
	unsigned int hlen, off_vx;
544
	int flush = 1;
545
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
546
	__be32 flags;
547 548 549
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
550 551 552 553 554 555 556 557 558 559

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

Tom Herbert's avatar
Tom Herbert committed
560 561
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

562
	flags = vh->vx_flags;
Tom Herbert's avatar
Tom Herbert committed
563 564 565

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
566
				       vh->vx_vni, &grc,
567 568
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
Tom Herbert's avatar
Tom Herbert committed
569 570 571 572 573

		if (!vh)
			goto out;
	}

574 575
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

576 577 578 579 580
	for (p = *head; p; p = p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
Thomas Graf's avatar
Thomas Graf committed
581 582
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
583 584 585 586 587
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

588
	pp = call_gro_receive(eth_gro_receive, head, skb);
589
	flush = 0;
590 591

out:
592
	skb_gro_remcsum_cleanup(skb, &grc);
593 594 595 596 597
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}

598
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
599
{
600 601 602
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
603
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
604 605
}

606 607
/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
Cong Wang's avatar
Cong Wang committed
608
			    const u8 *mac, union vxlan_addr *ip,
609
			    __u16 state, __u16 flags,
610
			    __be16 port, __be32 vni, __u32 ifindex,
611
			    __u8 ndm_flags)
612
{
613
	struct vxlan_rdst *rd = NULL;
614 615
	struct vxlan_fdb *f;
	int notify = 0;
616
	int rc;
617

618
	f = __vxlan_find_mac(vxlan, mac);
619 620 621 622 623 624 625 626 627 628 629
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
630 631 632 633 634
		if (f->flags != ndm_flags) {
			f->flags = ndm_flags;
			f->updated = jiffies;
			notify = 1;
		}
635 636 637 638
		if ((flags & NLM_F_REPLACE)) {
			/* Only change unicasts */
			if (!(is_multicast_ether_addr(f->eth_addr) ||
			     is_zero_ether_addr(f->eth_addr))) {
639
				notify |= vxlan_fdb_replace(f, ip, port, vni,
640 641 642 643
							   ifindex);
			} else
				return -EOPNOTSUPP;
		}
644
		if ((flags & NLM_F_APPEND) &&
645 646
		    (is_multicast_ether_addr(f->eth_addr) ||
		     is_zero_ether_addr(f->eth_addr))) {
647
			rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
648 649 650 651 652

			if (rc < 0)
				return rc;
			notify |= rc;
		}
653 654 655 656
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

657 658
		if (vxlan->cfg.addrmax &&
		    vxlan->addrcnt >= vxlan->cfg.addrmax)
659 660
			return -ENOSPC;

661 662 663 664 665
		/* Disallow replace to add a multicast entry */
		if ((flags & NLM_F_REPLACE) &&
		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
			return -EOPNOTSUPP;

Cong Wang's avatar
Cong Wang committed
666
		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
667 668 669 670 671 672
		f = kmalloc(sizeof(*f), GFP_ATOMIC);
		if (!f)
			return -ENOMEM;

		notify = 1;
		f->state = state;
673
		f->flags = ndm_flags;
674
		f->updated = f->used = jiffies;
675
		INIT_LIST_HEAD(&f->remotes);
676 677
		memcpy(f->eth_addr, mac, ETH_ALEN);

678 679 680 681 682
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
		if (rc < 0) {
			kfree(f);
			return rc;
		}
683

684 685 686 687 688
		++vxlan->addrcnt;
		hlist_add_head_rcu(&f->hlist,
				   vxlan_fdb_head(vxlan, mac));
	}

689 690 691 692 693
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
		vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
	}
694 695 696 697

	return 0;
}

Wei Yongjun's avatar
Wei Yongjun committed
698
static void vxlan_fdb_free(struct rcu_head *head)
699 700
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
701
	struct vxlan_rdst *rd, *nd;
702

703 704
	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
705
		kfree(rd);
706
	}
707 708 709
	kfree(f);
}

710 711 712 713 714 715
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	netdev_dbg(vxlan->dev,
		    "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
716
	vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);
717 718

	hlist_del_rcu(&f->hlist);
719
	call_rcu(&f->rcu, vxlan_fdb_free);
720 721
}

722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737
/* RCU callback: release a single remote destination and its dst cache. */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *remote;

	remote = container_of(head, struct vxlan_rdst, rcu);
	dst_cache_destroy(&remote->dst_cache);
	kfree(remote);
}

/* Remove one remote destination @rd from forwarding entry @f.
 *
 * The remote is unlinked from the RCU list, an RTM_DELNEIGH notification
 * is sent for it, and it is freed by vxlan_dst_free() after an RCU grace
 * period.
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd)
{
	list_del_rcu(&rd->list);
	/* rd is still valid here: the free is deferred through call_rcu() */
	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH);
	call_rcu(&rd->rcu, vxlan_dst_free);
}

738
/* Extract the remote endpoint description from neighbour attributes.
 *
 * @tb:      parsed NDA_* attributes
 * @vxlan:   device supplying the defaults
 * @ip:      out: NDA_DST, or the wildcard address of the device's default
 *           remote family when the attribute is absent
 * @port:    out: NDA_PORT, or the device's configured destination port
 * @vni:     out: NDA_VNI (converted to big endian), or the default VNI
 * @ifindex: out: NDA_IFINDEX, or 0
 *
 * Returns 0 on success; -EINVAL for an attribute of the wrong length;
 * -EADDRNOTAVAIL when NDA_IFINDEX names no device in the device's
 * namespace; -EAFNOSUPPORT (or the error from vxlan_nla_get_addr()) for
 * an unusable address family.
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *vni,
			   u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
		} else {
#if IS_ENABLED(CONFIG_IPV6)
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#else
			/* Without IPv6 support there is no wildcard to hand
			 * out; fail instead of returning success with *ip
			 * left uninitialised.
			 */
			return -EAFNOSUPPORT;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->cfg.dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

		/* the index must name an existing device in this netns */
		if (!__dev_get_by_index(net, *ifindex))
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	return 0;
}

/* Add static entry (via netlink).
 *
 * The request must carry NDA_DST and a state containing NUD_PERMANENT or
 * NUD_REACHABLE (-EINVAL otherwise), and the destination must be of the
 * same address family as the device's default remote (-EAFNOSUPPORT).
 * The entry itself is created under vxlan->hash_lock.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr ip;
	u32 ifindex;
	__be16 port;
	__be32 vni;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb[NDA_DST])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
	if (err)
		return err;

	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, vni, ifindex, ndm->ndm_flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}

/* Delete entry (via netlink) */
832 833
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
834
			    const unsigned char *addr, u16 vid)
835 836 837
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
838
	struct vxlan_rdst *rd = NULL;
Cong Wang's avatar
Cong Wang committed
839
	union vxlan_addr ip;
840
	__be16 port;
841 842
	__be32 vni;
	u32 ifindex;
843 844 845 846 847 848 849
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
	if (err)
		return err;

	err = -ENOENT;
850 851 852

	spin_lock_bh(&vxlan->hash_lock);
	f = vxlan_find_mac(vxlan, addr);
853 854 855
	if (!f)
		goto out;

Cong Wang's avatar
Cong Wang committed
856 857
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
858 859 860 861 862 863 864 865 866 867
		if (!rd)
			goto out;
	}

	err = 0;

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
868
		vxlan_fdb_dst_destroy(vxlan, f, rd);
869
		goto out;
870
	}
871 872 873 874

	vxlan_fdb_destroy(vxlan, f);

out:
875 876 877 878 879 880 881
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
882
			  struct net_device *dev,
883
			  struct net_device *filter_dev, int *idx)
884 885 886
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
887
	int err = 0;
888 889 890 891

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

892
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
893 894
			struct vxlan_rdst *rd;

895
			list_for_each_entry_rcu(rd, &f->remotes, list) {
896
				if (*idx < cb->args[2])
897 898
					goto skip;

899 900 901 902 903
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
904
				if (err < 0)
905 906
					goto out;
skip:
907
				*idx += 1;
908
			}
909 910
		}
	}
911
out:
912
	return err;
913 914 915 916
}

/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
917
 * Return true if packet is bogus and should be dropped.
918
 */
919
static bool vxlan_snoop(struct net_device *dev,
Cong Wang's avatar
Cong Wang committed
920
			union vxlan_addr *src_ip, const u8 *src_mac)
921 922 923 924 925 926
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;

	f = vxlan_find_mac(vxlan, src_mac);
	if (likely(f)) {
927
		struct vxlan_rdst *rdst = first_remote_rcu(f);
928

Cong Wang's avatar
Cong Wang committed
929
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
930 931 932
			return false;

		/* Don't migrate static entries, drop packets */
933
		if (f->state & NUD_NOARP)
934
			return true;
935 936 937

		if (net_ratelimit())
			netdev_info(dev,
Cong Wang's avatar
Cong Wang committed
938
				    "%pM migrated from %pIS to %pIS\n",
939
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
940

Cong Wang's avatar
Cong Wang committed
941
		rdst->remote_ip = *src_ip;
942
		f->updated = jiffies;
943
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
944 945 946
	} else {
		/* learned new entry */
		spin_lock(&vxlan->hash_lock);
947 948 949 950 951 952

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
			vxlan_fdb_create(vxlan, src_mac, src_ip,
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
953
					 vxlan->cfg.dst_port,
954 955
					 vxlan->default_dst.remote_vni,
					 0, NTF_SELF);
956 957
		spin_unlock(&vxlan->hash_lock);
	}
958 959

	return false;
960 961 962
}

/* See if multicast group is already in use by other ID */
963
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
964
{
965
	struct vxlan_dev *vxlan;
966
	struct vxlan_sock *sock4;
967 968 969
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
970
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
971

972 973
	sock4 = rtnl_dereference(dev->vn4_sock);

974 975 976
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
977
	if (family == AF_INET && sock4 && atomic_read(&sock4->refcnt) == 1)
978
		return false;
979
#if IS_ENABLED(CONFIG_IPV6)
980 981
	sock6 = rtnl_dereference(dev->vn6_sock);
	if (family == AF_INET6 && sock6 && atomic_read(&sock6->refcnt) == 1)
982 983
		return false;
#endif
984

985
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
986
		if (!netif_running(vxlan->dev) || vxlan == dev)
stephen hemminger's avatar