/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *    - partial support for non-blocking sockets only
 *    - support for urgent data postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */
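/*
 * Illustrative usage sketch (not part of the kernel API surface defined
 * here): an application opts in to SMC by creating its stream socket in
 * the AF_SMC family and otherwise using the usual socket calls, e.g.
 *
 *	sd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);	 (IPv4 peers)
 *	sd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC6); (IPv6 peers)
 *
 * If the SMC handshake cannot be completed, the code below transparently
 * falls back to plain TCP on the internal CLC socket.
 */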

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

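/* make an smc socket visible in the SMC socket hash table; this is what
 * sk->sk_prot->hash() resolves to for AF_SMC (see smc_sock_alloc())
 */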
int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

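/* client side of the first-link initialization: wait for the server's
 * CONFIRM LINK request, bring the QP to RTS, register the connection's RMB,
 * answer with a CONFIRM LINK response, and reject the subsequent ADD LINK
 * request since only a single link is supported so far
 */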
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	link->state = SMC_LNK_ACTIVE;

	return 0;
}

static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

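/* Rough sketch of the CLC handshake driven by the functions below, carried
 * over the internal TCP (CLC) socket:
 *
 *	client					server
 *	  SMC PROPOSAL	-->
 *			<--	SMC ACCEPT
 *	  SMC CONFIRM	-->
 *
 * On a first contact this is followed by CONFIRM LINK / ADD LINK exchanges
 * over the RoCE fabric; any failure declines the connection back to TCP.
 */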
/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (smc->use_fallback)
		goto out_connected;

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink () */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

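/* server side of the first-link initialization: register the connection's
 * RMB, send the CONFIRM LINK and ADD LINK requests over the RoCE fabric
 * and wait for the client's responses; the mirror image of
 * smc_clnt_conf_first_link() above
 */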
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	link->state = SMC_LNK_ACTIVE;

	return 0;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	if (new_smc->use_fallback)
		goto out_connected;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);

	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = lsmc->use_fallback;
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;