/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *    - partial support for non-blocking sockets only
 *    - support for urgent data postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */
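
/*
 * Illustrative userspace usage (a sketch, not part of this file): an
 * application opts in to SMC by creating an AF_SMC socket; if the peer
 * turns out not to be SMC-capable, the connection transparently falls
 * back to TCP. SMCPROTO_SMC selects the AF_INET variant, SMCPROTO_SMC6
 * the AF_INET6 variant; peer_in below stands for a sockaddr_in prepared
 * exactly as for a plain TCP connect:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *
 *	connect(fd, (struct sockaddr *)&peer_in, sizeof(peer_in));
 */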

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>
#include <asm/ioctls.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

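/* insert an smc sock into its protocol-specific hash table and account it
 * in the protocol's inuse counter; called via sk->sk_prot->hash()
 */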
int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

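/* remove an smc sock from its hash table again and drop the inuse count;
 * called via sk->sk_prot->unhash()
 */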
void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

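/* close(2) path: runs the active close protocol unless the socket already
 * fell back to plain TCP, then detaches and releases the sock
 */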
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

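/* sk_destruct callback: the connection itself is torn down in
 * smc_release(), so only the refcnt debug accounting remains here
 */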
static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

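/* allocate and initialize an smc sock for the given protocol variant
 * (SMCPROTO_SMC or SMCPROTO_SMC6); the internal clcsock is attached by
 * the callers
 */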
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

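/* bind(2) handler: after replicating the sanity checks of inet_bind(),
 * the actual binding is delegated to the internal CLC/TCP socket
 */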
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we have no control over via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
		rmb_desc->regerr = 1;
		return -EFAULT;
	}
	return 0;
}

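/* client part of the LLC handshake on the first link of a new link group:
 * wait for the server's CONFIRM LINK, answer it, and reject the subsequent
 * ADD LINK request, since only a single link is supported so far
 */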
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

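/* save connection-level parameters (alert token, rmbe size) advertised by
 * the peer in its CLC accept/confirm message
 */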
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

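/* save link-level parameters (QP number, GID, MAC, PSN, path MTU)
 * advertised by the peer in its CLC accept/confirm message
 */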
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		goto out_connected;

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		if (!smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

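/* connect(2) handler: connect the internal CLC/TCP socket first, then try
 * to upgrade via smc_connect_rdma(), which falls back to plain TCP when
 * SMC is not possible
 */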
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

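/* accept one connection on the internal CLC/TCP listen socket and attach
 * it to a freshly allocated smc sock; returns with the listen sock locked
 */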
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink() */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

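/* server part of the LLC handshake on the first link of a new link group:
 * register the rmb, send CONFIRM LINK and ADD LINK requests, and wait for
 * the client's responses
 */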
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct net *net = sock_net(smc->clcsock->sk);
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);

	return 0;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback)
		goto out_connected;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);

	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

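/* worker looping on kernel_accept() of the internal CLC/TCP listen socket;
 * each accepted connection is processed in its own smc_listen_work
 */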
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

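/* listen(2) handler: moves the sock to SMC_LISTEN state and starts the
 * smc_tcp_listen_work worker (see the sock_hold pairing noted there)
 */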
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;