/*
 * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
 * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
 * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
 * Copyright (c) 2005 Intel Corporation.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/mutex.h>
#include <linux/inetdevice.h>
38
#include <linux/slab.h>
39
#include <linux/workqueue.h>
40
#include <linux/module.h>
41
42
43
#include <net/arp.h>
#include <net/neighbour.h>
#include <net/route.h>
44
#include <net/netevent.h>
45
#include <net/ipv6_stubs.h>
46
#include <net/ip6_route.h>
47
#include <rdma/ib_addr.h>
48
#include <rdma/ib_cache.h>
49
#include <rdma/ib_sa.h>
50
#include <rdma/ib.h>
51
52
53
54
#include <rdma/rdma_netlink.h>
#include <net/netlink.h>

#include "core_priv.h"
55
56
57

struct addr_req {
	struct list_head list;
58
59
	struct sockaddr_storage src_addr;
	struct sockaddr_storage dst_addr;
60
61
62
63
64
	struct rdma_dev_addr *addr;
	void *context;
	void (*callback)(int status, struct sockaddr *src_addr,
			 struct rdma_dev_addr *addr, void *context);
	unsigned long timeout;
65
	struct delayed_work work;
66
	bool resolve_by_gid_attr;	/* Consider gid attr in resolve phase */
67
	int status;
68
	u32 seq;
69
70
};

71
72
/* Incremented once per request in rdma_resolve_ip() to produce req->seq */
static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0);

/* Held (with BHs disabled) around every walk or update of req_list */
static DEFINE_SPINLOCK(lock);
static LIST_HEAD(req_list);
/* Workqueue the per-request delayed work runs on; set up in addr_init() */
static struct workqueue_struct *addr_wq;

77
78
79
80
81
82
83
84
85
86
87
88
89
/*
 * Validation policy for IP-resolve responses: the DGID attribute is a
 * binary blob whose length is bounded by struct rdma_nla_ls_gid.
 */
static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = {
	[LS_NLA_TYPE_DGID] = {
		.type = NLA_BINARY,
		.len = sizeof(struct rdma_nla_ls_gid),
	},
};

static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh)
{
	struct nlattr *tb[LS_NLA_TYPE_MAX] = {};
	int ret;

	if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
		return false;

90
91
	ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
				   nlmsg_len(nlh), ib_nl_addr_policy, NULL);
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
	if (ret)
		return false;

	return true;
}

static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh)
{
	const struct nlattr *head, *curr;
	union ib_gid gid;
	struct addr_req *req;
	int len, rem;
	int found = 0;

	head = (const struct nlattr *)nlmsg_data(nlh);
	len = nlmsg_len(nlh);

	nla_for_each_attr(curr, head, len, rem) {
		if (curr->nla_type == LS_NLA_TYPE_DGID)
			memcpy(&gid, nla_data(curr), nla_len(curr));
	}

114
	spin_lock_bh(&lock);
115
116
117
118
119
120
121
122
123
	list_for_each_entry(req, &req_list, list) {
		if (nlh->nlmsg_seq != req->seq)
			continue;
		/* We set the DGID part, the rest was set earlier */
		rdma_addr_set_dgid(req->addr, &gid);
		req->status = 0;
		found = 1;
		break;
	}
124
	spin_unlock_bh(&lock);
125
126
127
128
129
130
131

	if (!found)
		pr_info("Couldn't find request waiting for DGID: %pI6\n",
			&gid);
}

int ib_nl_handle_ip_res_resp(struct sk_buff *skb,
132
133
			     struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
134
135
{
	if ((nlh->nlmsg_flags & NLM_F_REQUEST) ||
136
	    !(NETLINK_CB(skb).sk))
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
		return -EPERM;

	if (ib_nl_is_good_ip_resp(nlh))
		ib_nl_process_good_ip_rsep(nlh);

	return skb->len;
}

/*
 * Build and multicast an RDMA_NL_LS_OP_IP_RESOLVE request.
 *
 * @dev_addr: supplies the bound interface index placed in the header
 * @daddr:    destination IPv4/IPv6 address to resolve
 * @seq:      netlink sequence number identifying the request
 * @family:   AF_INET selects an IPv4 attribute, anything else IPv6
 *
 * Returns -ENOMEM if the skb cannot be allocated, -EMSGSIZE if the address
 * attribute cannot be appended, and -ENODATA otherwise.  -ENODATA is also
 * returned after a successful send so that the request is retried and the
 * answer from userspace can be picked up later.
 */
static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr,
			     const void *daddr,
			     u32 seq, u16 family)
{
	struct sk_buff *skb = NULL;
	struct nlmsghdr *nlh;
	struct rdma_ls_ip_resolve_header *header;
	void *data;
	size_t size;
	int attrtype;
	int len;

	if (family == AF_INET) {
		size = sizeof(struct in_addr);
		attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4;
	} else {
		size = sizeof(struct in6_addr);
		attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6;
	}

	/* Room for the address attribute itself, not for a size_t */
	len = nla_total_size(size);
	len += NLMSG_ALIGN(sizeof(*header));

	skb = nlmsg_new(len, GFP_KERNEL);
	if (!skb)
		return -ENOMEM;

	data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS,
			    RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST);
	if (!data) {
		nlmsg_free(skb);
		return -ENODATA;
	}

	/* Construct the family header first */
	header = skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
	header->ifindex = dev_addr->bound_dev_if;
	if (nla_put(skb, attrtype, size, daddr)) {
		/* Never send a request that lacks the address to resolve */
		nlmsg_free(skb);
		return -EMSGSIZE;
	}

	/* Repair the nlmsg header length */
	nlmsg_end(skb, nlh);
	rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, GFP_KERNEL);

	/* Make the request retry, so when we get the response from userspace
	 * we will have something.
	 */
	return -ENODATA;
}

194
int rdma_addr_size(const struct sockaddr *addr)
195
196
197
198
199
200
201
202
203
204
205
206
207
208
{
	switch (addr->sa_family) {
	case AF_INET:
		return sizeof(struct sockaddr_in);
	case AF_INET6:
		return sizeof(struct sockaddr_in6);
	case AF_IB:
		return sizeof(struct sockaddr_ib);
	default:
		return 0;
	}
}
EXPORT_SYMBOL(rdma_addr_size);

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
/*
 * Like rdma_addr_size(), but for an address held in a sockaddr_in6 sized
 * buffer: a family whose structure would not fit in that buffer yields 0.
 */
int rdma_addr_size_in6(struct sockaddr_in6 *addr)
{
	int len = rdma_addr_size((struct sockaddr *)addr);

	if (len > sizeof(*addr))
		return 0;

	return len;
}
EXPORT_SYMBOL(rdma_addr_size_in6);

/*
 * Like rdma_addr_size(), but for an address held in a
 * __kernel_sockaddr_storage buffer: a family whose structure would not fit
 * in that buffer yields 0.
 */
int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr)
{
	int len = rdma_addr_size((struct sockaddr *)addr);

	if (len > sizeof(*addr))
		return 0;

	return len;
}
EXPORT_SYMBOL(rdma_addr_size_kss);

225
226
227
228
229
230
231
232
233
234
235
/**
 * rdma_copy_src_l2_addr - Copy netdevice source addresses
 * @dev_addr:	Destination address pointer where to copy the addresses
 * @dev:	Netdevice whose source addresses to copy
 *
 * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice.
 * This includes unicast address, broadcast address, device type and
 * interface index.
 */
void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
			   const struct net_device *dev)
236
{
237
	dev_addr->dev_type = dev->type;
238
239
	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
240
	dev_addr->bound_dev_if = dev->ifindex;
241
}
242
EXPORT_SYMBOL(rdma_copy_src_l2_addr);
243

244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/*
 * Find the netdevice in @net that owns the IP address in @src_in.
 *
 * IPv4 addresses are looked up with __ip_dev_find(); IPv6 addresses (when
 * IPv6 is enabled) by checking every netdevice of the namespace with
 * ipv6_chk_addr().  No reference is taken on the result here; the _rcu
 * suffix and the RCU list walk indicate the caller is expected to hold the
 * RCU read lock.
 *
 * Returns the device, or ERR_PTR(-EADDRNOTAVAIL) if none matches or the
 * address family is not handled.
 */
static struct net_device *
rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in)
{
	struct net_device *dev = NULL;

	switch (src_in->sa_family) {
	case AF_INET: {
		const struct sockaddr_in *sin =
			(const struct sockaddr_in *)src_in;

		dev = __ip_dev_find(net, sin->sin_addr.s_addr, false);
		if (dev)
			return dev;
		break;
	}
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6: {
		const struct sockaddr_in6 *sin6 =
			(const struct sockaddr_in6 *)src_in;

		for_each_netdev_rcu(net, dev) {
			if (ipv6_chk_addr(net, &sin6->sin6_addr, dev, 1))
				return dev;
		}
		break;
	}
#endif
	}

	return ERR_PTR(-EADDRNOTAVAIL);
}

274
int rdma_translate_ip(const struct sockaddr *addr,
275
		      struct rdma_dev_addr *dev_addr)
276
277
278
{
	struct net_device *dev;

279
	if (dev_addr->bound_dev_if) {
280
		dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
281
282
		if (!dev)
			return -ENODEV;
283
		rdma_copy_src_l2_addr(dev_addr, dev);
284
		dev_put(dev);
285
		return 0;
286
287
	}

288
289
290
	rcu_read_lock();
	dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr);
	if (!IS_ERR(dev))
291
		rdma_copy_src_l2_addr(dev_addr, dev);
292
293
	rcu_read_unlock();
	return PTR_ERR_OR_ZERO(dev);
294
295
296
}
EXPORT_SYMBOL(rdma_translate_ip);

297
static void set_timeout(struct addr_req *req, unsigned long time)
298
299
300
301
{
	unsigned long delay;

	delay = time - jiffies;
302
303
	if ((long)delay < 0)
		delay = 0;
304

305
	mod_delayed_work(addr_wq, &req->work, delay);
306
307
308
309
}

/*
 * Append @req to req_list and schedule its work for req->timeout, both
 * under the list lock.
 */
static void queue_req(struct addr_req *req)
{
	spin_lock_bh(&lock);
	list_add_tail(&req->list, &req_list);
	set_timeout(req, req->timeout);
	spin_unlock_bh(&lock);
}

316
/*
 * Ask userspace, over netlink, to resolve @daddr.
 *
 * Returns -EADDRNOTAVAIL when rdma_nl_chk_listeners() reports a non-zero
 * value for RDMA_NL_GROUP_LS; otherwise the result of ib_nl_ip_send_msg().
 */
static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr,
			  const void *daddr, u32 seq, u16 family)
{
	if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS))
		return -EADDRNOTAVAIL;

	return ib_nl_ip_send_msg(dev_addr, daddr, seq, family);
}

325
326
static int dst_fetch_ha(const struct dst_entry *dst,
			struct rdma_dev_addr *dev_addr,
327
			const void *daddr)
328
329
{
	struct neighbour *n;
330
	int ret = 0;
331

332
	n = dst_neigh_lookup(dst, daddr);
333
334
	if (!n)
		return -ENODATA;
335

336
337
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
338
339
		ret = -ENODATA;
	} else {
340
		neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev);
341
342
	}

343
	neigh_release(n);
344

345
346
347
	return ret;
}

348
/*
 * Tell whether the route behind @dst goes through a gateway.  @family
 * selects how @dst is interpreted: as part of a struct rtable for AF_INET,
 * as part of a struct rt6_info otherwise.
 */
static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
{
	struct rtable *rt;
	struct rt6_info *rt6;

	if (family == AF_INET) {
		rt = container_of(dst, struct rtable, dst);
		return rt->rt_uses_gateway;
	}

	rt6 = container_of(dst, struct rt6_info, dst);
	return rt6->rt6i_flags & RTF_GATEWAY;
}

362
/*
 * Resolve the destination hardware address of @dst_in.
 *
 * Routed destinations on an IB network are resolved through the netlink
 * helper; everything else through the kernel neighbour table.
 */
static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
		    const struct sockaddr *dst_in, u32 seq)
{
	const struct sockaddr_in *dst_in4 =
		(const struct sockaddr_in *)dst_in;
	const struct sockaddr_in6 *dst_in6 =
		(const struct sockaddr_in6 *)dst_in;
	/* Point at the raw address bytes for whichever family is in use */
	const void *daddr = (dst_in->sa_family == AF_INET) ?
		(const void *)&dst_in4->sin_addr.s_addr :
		(const void *)&dst_in6->sin6_addr;
	sa_family_t family = dst_in->sa_family;

	/* If we have a gateway in IB mode then it must be an IB network */
	if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB)
		return ib_nl_fetch_ha(dev_addr, daddr, seq, family);
	else
		return dst_fetch_ha(dst, dev_addr, daddr);
}

381
382
static int addr4_resolve(struct sockaddr *src_sock,
			 const struct sockaddr *dst_sock,
383
384
			 struct rdma_dev_addr *addr,
			 struct rtable **prt)
385
{
386
387
388
389
	struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
	const struct sockaddr_in *dst_in =
			(const struct sockaddr_in *)dst_sock;

390
391
	__be32 src_ip = src_in->sin_addr.s_addr;
	__be32 dst_ip = dst_in->sin_addr.s_addr;
392
	struct rtable *rt;
393
	struct flowi4 fl4;
394
395
	int ret;

396
397
398
399
	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst_ip;
	fl4.saddr = src_ip;
	fl4.flowi4_oif = addr->bound_dev_if;
400
	rt = ip_route_output_key(addr->net, &fl4);
401
402
403
404
	ret = PTR_ERR_OR_ZERO(rt);
	if (ret)
		return ret;

405
	src_in->sin_addr.s_addr = fl4.saddr;
406

407
408
	addr->hoplimit = ip4_dst_hoplimit(&rt->dst);

409
410
	*prt = rt;
	return 0;
411
412
}

413
#if IS_ENABLED(CONFIG_IPV6)
414
415
static int addr6_resolve(struct sockaddr *src_sock,
			 const struct sockaddr *dst_sock,
416
417
			 struct rdma_dev_addr *addr,
			 struct dst_entry **pdst)
418
{
419
420
421
	struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock;
	const struct sockaddr_in6 *dst_in =
				(const struct sockaddr_in6 *)dst_sock;
422
	struct flowi6 fl6;
423
	struct dst_entry *dst;
Sean Hefty's avatar
Sean Hefty committed
424
	int ret;
425

426
	memset(&fl6, 0, sizeof fl6);
Alexey Dobriyan's avatar
Alexey Dobriyan committed
427
428
	fl6.daddr = dst_in->sin6_addr;
	fl6.saddr = src_in->sin6_addr;
429
	fl6.flowi6_oif = addr->bound_dev_if;
430

431
432
	ret = ipv6_stub->ipv6_dst_lookup(addr->net, NULL, &dst, &fl6);
	if (ret < 0)
433
		return ret;
Sean Hefty's avatar
Sean Hefty committed
434

435
	if (ipv6_addr_any(&src_in->sin6_addr))
Alexey Dobriyan's avatar
Alexey Dobriyan committed
436
		src_in->sin6_addr = fl6.saddr;
Sean Hefty's avatar
Sean Hefty committed
437

438
439
	addr->hoplimit = ip6_dst_hoplimit(dst);

440
441
	*pdst = dst;
	return 0;
442
}
443
#else
444
445
static int addr6_resolve(struct sockaddr *src_sock,
			 const struct sockaddr *dst_sock,
446
447
			 struct rdma_dev_addr *addr,
			 struct dst_entry **pdst)
448
449
450
451
{
	return -EADDRNOTAVAIL;
}
#endif
452

453
static int addr_resolve_neigh(const struct dst_entry *dst,
454
			      const struct sockaddr *dst_in,
455
			      struct rdma_dev_addr *addr,
456
			      unsigned int ndev_flags,
457
			      u32 seq)
458
{
459
460
461
	int ret = 0;

	if (ndev_flags & IFF_LOOPBACK) {
462
		memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
463
464
465
466
467
	} else {
		if (!(ndev_flags & IFF_NOARP)) {
			/* If the device doesn't do ARP internally */
			ret = fetch_ha(dst, addr, dst_in, seq);
		}
468
	}
469
	return ret;
470
471
}

472
473
474
475
static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
			    const struct sockaddr *dst_in,
			    const struct dst_entry *dst,
			    const struct net_device *ndev)
476
477
478
479
480
481
{
	int ret = 0;

	if (dst->dev->flags & IFF_LOOPBACK)
		ret = rdma_translate_ip(dst_in, dev_addr);
	else
482
		rdma_copy_src_l2_addr(dev_addr, dst->dev);
483
484
485
486
487
488
489

	/*
	 * If there's a gateway and type of device not ARPHRD_INFINIBAND,
	 * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the
	 * network type accordingly.
	 */
	if (has_gateway(dst, dst_in->sa_family) &&
490
	    ndev->type != ARPHRD_INFINIBAND)
491
492
493
494
495
		dev_addr->network = dst_in->sa_family == AF_INET ?
						RDMA_NETWORK_IPV4 :
						RDMA_NETWORK_IPV6;
	else
		dev_addr->network = RDMA_NETWORK_IB;
496
497

	return ret;
498
}
499

500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
				 unsigned int *ndev_flags,
				 const struct sockaddr *dst_in,
				 const struct dst_entry *dst)
{
	struct net_device *ndev = READ_ONCE(dst->dev);

	*ndev_flags = ndev->flags;
	/* A physical device must be the RDMA device to use */
	if (ndev->flags & IFF_LOOPBACK) {
		/*
		 * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or
		 * loopback IP address. So if route is resolved to loopback
		 * interface, translate that to a real ndev based on non
		 * loopback IP address.
		 */
		ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in);
517
		if (IS_ERR(ndev))
518
519
520
			return -ENODEV;
	}

521
	return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
522
523
}

524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
/*
 * Derive the network namespace and bound interface index of @addr from the
 * netdevice of its source GID attribute.
 *
 * Returns 0, or the error reported when the GID attribute's netdevice
 * cannot be read.
 */
static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)
{
	struct net_device *gid_ndev =
		rdma_read_gid_attr_ndev_rcu(addr->sgid_attr);

	if (IS_ERR(gid_ndev))
		return PTR_ERR(gid_ndev);

	/*
	 * No extra reference is needed to read net and ifindex while the
	 * RCU read lock is held: change_net_namespace() in net/core/dev.c
	 * does an rcu sync after marking the device IFF_DOWN and before it
	 * updates the netdev fields {net, ifindex}.
	 */
	addr->net = dev_net(gid_ndev);
	addr->bound_dev_if = gid_ndev->ifindex;

	return 0;
}

/* Reset @addr to the initial namespace with no bound interface. */
static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr)
{
	addr->bound_dev_if = 0;
	addr->net = &init_net;
}

550
static int addr_resolve(struct sockaddr *src_in,
551
552
			const struct sockaddr *dst_in,
			struct rdma_dev_addr *addr,
553
			bool resolve_neigh,
554
			bool resolve_by_gid_attr,
555
			u32 seq)
556
{
557
	struct dst_entry *dst = NULL;
558
	unsigned int ndev_flags = 0;
559
	struct rtable *rt = NULL;
560
561
	int ret;

562
563
564
565
566
	if (!addr->net) {
		pr_warn_ratelimited("%s: missing namespace\n", __func__);
		return -EINVAL;
	}

567
	rcu_read_lock();
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
	if (resolve_by_gid_attr) {
		if (!addr->sgid_attr) {
			rcu_read_unlock();
			pr_warn_ratelimited("%s: missing gid_attr\n", __func__);
			return -EINVAL;
		}
		/*
		 * If the request is for a specific gid attribute of the
		 * rdma_dev_addr, derive net from the netdevice of the
		 * GID attribute.
		 */
		ret = set_addr_netns_by_gid_rcu(addr);
		if (ret) {
			rcu_read_unlock();
			return ret;
		}
	}
585
	if (src_in->sa_family == AF_INET) {
586
		ret = addr4_resolve(src_in, dst_in, addr, &rt);
587
		dst = &rt->dst;
588
	} else {
589
		ret = addr6_resolve(src_in, dst_in, addr, &dst);
590
	}
591
592
	if (ret) {
		rcu_read_unlock();
593
		goto done;
594
595
596
	}
	ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst);
	rcu_read_unlock();
597

598
599
600
601
602
	/*
	 * Resolve neighbor destination address if requested and
	 * only if src addr translation didn't fail.
	 */
	if (!ret && resolve_neigh)
603
		ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq);
604

605
606
607
	if (src_in->sa_family == AF_INET)
		ip_rt_put(rt);
	else
608
		dst_release(dst);
609
610
611
612
613
614
615
done:
	/*
	 * Clear the addr net to go back to its original state, only if it was
	 * derived from GID attribute in this context.
	 */
	if (resolve_by_gid_attr)
		rdma_addr_set_net_defaults(addr);
616
	return ret;
617
618
}

619
620
621
622
623
624
625
626
627
628
629
static void process_one_req(struct work_struct *_work)
{
	struct addr_req *req;
	struct sockaddr *src_in, *dst_in;

	req = container_of(_work, struct addr_req, work.work);

	if (req->status == -ENODATA) {
		src_in = (struct sockaddr *)&req->src_addr;
		dst_in = (struct sockaddr *)&req->dst_addr;
		req->status = addr_resolve(src_in, dst_in, req->addr,
630
631
					   true, req->resolve_by_gid_attr,
					   req->seq);
632
633
634
635
		if (req->status && time_after_eq(jiffies, req->timeout)) {
			req->status = -ETIMEDOUT;
		} else if (req->status == -ENODATA) {
			/* requeue the work for retrying again */
636
			spin_lock_bh(&lock);
637
638
			if (!list_empty(&req->list))
				set_timeout(req, req->timeout);
639
			spin_unlock_bh(&lock);
640
641
642
			return;
		}
	}
643

644
645
	req->callback(req->status, (struct sockaddr *)&req->src_addr,
		req->addr, req->context);
646
647
648
649
650
651
652
653
654
655
656
657
658
659
	req->callback = NULL;

	spin_lock_bh(&lock);
	if (!list_empty(&req->list)) {
		/*
		 * Although the work will normally have been canceled by the
		 * workqueue, it can still be requeued as long as it is on the
		 * req_list.
		 */
		cancel_delayed_work(&req->work);
		list_del_init(&req->list);
		kfree(req);
	}
	spin_unlock_bh(&lock);
660
661
}

662
int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr,
663
		    struct rdma_dev_addr *addr, unsigned long timeout_ms,
664
665
		    void (*callback)(int status, struct sockaddr *src_addr,
				     struct rdma_dev_addr *addr, void *context),
666
		    bool resolve_by_gid_attr, void *context)
667
{
668
	struct sockaddr *src_in, *dst_in;
669
670
671
	struct addr_req *req;
	int ret = 0;

672
	req = kzalloc(sizeof *req, GFP_KERNEL);
673
674
675
	if (!req)
		return -ENOMEM;

676
677
678
679
680
681
682
683
684
	src_in = (struct sockaddr *) &req->src_addr;
	dst_in = (struct sockaddr *) &req->dst_addr;

	if (src_addr) {
		if (src_addr->sa_family != dst_addr->sa_family) {
			ret = -EINVAL;
			goto err;
		}

685
		memcpy(src_in, src_addr, rdma_addr_size(src_addr));
686
687
688
689
	} else {
		src_in->sa_family = dst_addr->sa_family;
	}

690
	memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
691
692
693
	req->addr = addr;
	req->callback = callback;
	req->context = context;
694
	req->resolve_by_gid_attr = resolve_by_gid_attr;
695
	INIT_DELAYED_WORK(&req->work, process_one_req);
696
	req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq);
697

698
699
	req->status = addr_resolve(src_in, dst_in, addr, true,
				   req->resolve_by_gid_attr, req->seq);
700
701
702
703
704
705
706
707
708
709
710
	switch (req->status) {
	case 0:
		req->timeout = jiffies;
		queue_req(req);
		break;
	case -ENODATA:
		req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
		queue_req(req);
		break;
	default:
		ret = req->status;
711
		goto err;
712
713
	}
	return ret;
714
715
716
err:
	kfree(req);
	return ret;
717
718
719
}
EXPORT_SYMBOL(rdma_resolve_ip);

720
721
int roce_resolve_route_from_path(struct sa_path_rec *rec,
				 const struct ib_gid_attr *attr)
722
{
723
724
725
726
727
728
729
	union {
		struct sockaddr     _sockaddr;
		struct sockaddr_in  _sockaddr_in;
		struct sockaddr_in6 _sockaddr_in6;
	} sgid, dgid;
	struct rdma_dev_addr dev_addr = {};
	int ret;
730

731
732
	if (rec->roce.route_resolved)
		return 0;
733

734
735
	rdma_gid2ip((struct sockaddr *)&sgid, &rec->sgid);
	rdma_gid2ip((struct sockaddr *)&dgid, &rec->dgid);
736

737
738
739
740
741
742
743
	if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family)
		return -EINVAL;

	if (!attr || !attr->ndev)
		return -EINVAL;

	dev_addr.net = &init_net;
744
	dev_addr.sgid_attr = attr;
745

746
	ret = addr_resolve((struct sockaddr *)&sgid, (struct sockaddr *)&dgid,
747
			   &dev_addr, false, true, 0);
748
749
750
751
752
753
754
755
756
757
	if (ret)
		return ret;

	if ((dev_addr.network == RDMA_NETWORK_IPV4 ||
	     dev_addr.network == RDMA_NETWORK_IPV6) &&
	    rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2)
		return -EINVAL;

	rec->roce.route_resolved = true;
	return 0;
758
759
}

760
761
762
763
764
765
766
/**
 * rdma_addr_cancel - Cancel resolve ip request
 * @addr:	Pointer to address structure given previously
 *		during rdma_resolve_ip().
 * rdma_addr_cancel() is synchronous function which cancels any pending
 * request if there is any.
 */
767
768
769
void rdma_addr_cancel(struct rdma_dev_addr *addr)
{
	struct addr_req *req, *temp_req;
770
	struct addr_req *found = NULL;
771

772
	spin_lock_bh(&lock);
773
774
	list_for_each_entry_safe(req, temp_req, &req_list, list) {
		if (req->addr == addr) {
775
776
777
778
779
780
			/*
			 * Removing from the list means we take ownership of
			 * the req
			 */
			list_del_init(&req->list);
			found = req;
781
782
783
			break;
		}
	}
784
	spin_unlock_bh(&lock);
785
786
787
788
789
790
791
792
793
794

	if (!found)
		return;

	/*
	 * sync canceling the work after removing it from the req_list
	 * guarentees no work is running and none will be started.
	 */
	cancel_delayed_work_sync(&found->work);
	kfree(found);
795
796
797
}
EXPORT_SYMBOL(rdma_addr_cancel);

798
799
/* Lets a caller wait for a resolve request and read back its status. */
struct resolve_cb_context {
	struct completion comp;		/* completed by resolve_cb() */
	int status;			/* status reported to resolve_cb() */
};

static void resolve_cb(int status, struct sockaddr *src_addr,
	     struct rdma_dev_addr *addr, void *context)
{
806
	((struct resolve_cb_context *)context)->status = status;
807
808
809
	complete(&((struct resolve_cb_context *)context)->comp);
}

810
811
int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
				 const union ib_gid *dgid,
812
				 u8 *dmac, const struct ib_gid_attr *sgid_attr,
813
				 int *hoplimit)
814
815
816
817
818
819
820
{
	struct rdma_dev_addr dev_addr;
	struct resolve_cb_context ctx;
	union {
		struct sockaddr_in  _sockaddr_in;
		struct sockaddr_in6 _sockaddr_in6;
	} sgid_addr, dgid_addr;
821
	int ret;
822

823
824
	rdma_gid2ip((struct sockaddr *)&sgid_addr, sgid);
	rdma_gid2ip((struct sockaddr *)&dgid_addr, dgid);
825
826

	memset(&dev_addr, 0, sizeof(dev_addr));
827
	dev_addr.net = &init_net;
828
	dev_addr.sgid_attr = sgid_attr;
829
830

	init_completion(&ctx.comp);
831
832
833
	ret = rdma_resolve_ip((struct sockaddr *)&sgid_addr,
			      (struct sockaddr *)&dgid_addr, &dev_addr, 1000,
			      resolve_cb, true, &ctx);
834
835
836
837
838
	if (ret)
		return ret;

	wait_for_completion(&ctx.comp);

839
840
841
842
	ret = ctx.status;
	if (ret)
		return ret;

843
	memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
844
845
	*hoplimit = dev_addr.hoplimit;
	return 0;
846
847
}

Roland Dreier's avatar
Roland Dreier committed
848
static int netevent_callback(struct notifier_block *self, unsigned long event,
849
	void *ctx)
850
{
851
852
	struct addr_req *req;

Roland Dreier's avatar
Roland Dreier committed
853
	if (event == NETEVENT_NEIGH_UPDATE) {
854
		struct neighbour *neigh = ctx;
855

856
857
858
859
860
861
		if (neigh->nud_state & NUD_VALID) {
			spin_lock_bh(&lock);
			list_for_each_entry(req, &req_list, list)
				set_timeout(req, jiffies);
			spin_unlock_bh(&lock);
		}
862
	}
863
864
865
	return 0;
}

866
867
/* Registered in addr_init() and unregistered in addr_cleanup() */
static struct notifier_block nb = {
	.notifier_call = netevent_callback
};

870
int addr_init(void)
871
{
872
	addr_wq = alloc_ordered_workqueue("ib_addr", 0);
873
874
875
	if (!addr_wq)
		return -ENOMEM;

876
	register_netevent_notifier(&nb);
877

878
879
880
	return 0;
}

881
void addr_cleanup(void)
882
{
883
	unregister_netevent_notifier(&nb);
884
	destroy_workqueue(addr_wq);
885
	WARN_ON(!list_empty(&req_list));
886
}