/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
 *  applies to SOCK_STREAM sockets only
 *  offers an alternative communication option for TCP-protocol sockets
 *  applicable with RoCE-cards only
 *
 *  Initial restrictions:
 *    - support for alternate links postponed
 *    - partial support for non-blocking sockets only
 *    - support for urgent data postponed
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 *              based on prototype from Frank Blaschka
 */
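
/* Illustrative usage sketch (an assumption for documentation, not part of
 * this file's logic): from user space an SMC socket is created like a TCP
 * socket, only with the AF_SMC address family:
 *
 *	sd = socket(AF_SMC, SOCK_STREAM, 0);
 *
 * bind()/listen()/connect() then behave as with TCP; whenever SMC setup
 * fails, the code below transparently falls back to plain TCP.
 */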

#define KMSG_COMPONENT "smc"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/module.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <linux/in.h>
#include <linux/sched/signal.h>

#include <net/sock.h>
#include <net/tcp.h>
#include <net/smc.h>

#include "smc.h"
#include "smc_clc.h"
#include "smc_llc.h"
#include "smc_cdc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_pnet.h"
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"

static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};

static void smc_tcp_listen_work(struct work_struct *);

static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}

static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};

static struct smc_hashinfo smc_v6_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
};

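/* insert the socket into the protocol-specific hash table under the write
 * lock and bump the per-protocol "inuse" counter; installed as the
 * sk->sk_prot->hash() callback
 */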
int smc_hash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
	struct hlist_head *head;

	head = &h->ht;

	write_lock_bh(&h->lock);
	sk_add_node(sk, head);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	write_unlock_bh(&h->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(smc_hash_sk);

void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);

struct proto smc_proto6 = {
	.name		= "SMC6",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v6_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto6);

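/* release the socket: actively close the SMC connection, or only the
 * internal CLC/TCP socket when running in fallback mode, then detach the
 * sock and drop the final reference
 */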
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}

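/* sk_destruct callback: only sockets that are fully closed and already
 * orphaned (SOCK_DEAD) reach final destruction
 */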
static void smc_destruct(struct sock *sk)
{
	if (sk->sk_state != SMC_CLOSED)
		return;
	if (!sock_flag(sk, SOCK_DEAD))
		return;

	sk_refcnt_debug_dec(sk);
}

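/* allocate and initialize a new SMC sock, selecting the IPv4 or IPv6
 * proto variant based on the requested protocol
 */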
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
				   int protocol)
{
	struct smc_sock *smc;
	struct proto *prot;
	struct sock *sk;

	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = protocol;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}

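/* bind delegates to the internal CLC/TCP socket after replicating the
 * usual inet sanity checks
 */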
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	if (addr->sin_family != AF_INET &&
	    addr->sin_family != AF_INET6 &&
	    addr->sin_family != AF_UNSPEC)
		goto out;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if (addr->sin_family == AF_UNSPEC &&
	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}

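/* copy selected sock fields and the flag bits given in @mask from @osk
 * to @nsk
 */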
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control of via setsockopt */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}

#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}

#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}

/* register a new rmb */
static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
{
	/* register memory region for new rmb */
	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK]))
		return -EFAULT;
	return 0;
}

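/* client part of link initialization for a first contact: wait for the
 * server's CONFIRM LINK request, move the QP to RTS, register the RMB and
 * answer the CONFIRM LINK and ADD LINK messages
 */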
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	if (smc_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* send add link reject message, only one link supported for now */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	link->state = SMC_LNK_ACTIVE;

	return 0;
}

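/* save the connection parameters announced in the peer's CLC
 * accept/confirm message
 */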
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}

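/* save the link parameters (QP number, GID, MAC, PSN, QP MTU) announced in
 * the peer's CLC accept/confirm message
 */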
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}

/* setup for RDMA connection of client */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		if (!smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}

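/* connect(): first connect the internal CLC/TCP socket, then try to
 * upgrade the connection to SMC via smc_connect_rdma(); on decline the
 * socket falls back to plain TCP
 */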
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}

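/* accept a connection on the internal CLC/TCP listen socket and wrap it
 * into a newly allocated SMC sock
 */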
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}

/* add a just created sock to the accept queue of the listen sock as
 * candidate for a following socket accept call from user space
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink () */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}

/* remove a socket from the accept queue of its parental listening socket */
static void smc_accept_unlink(struct sock *sk)
{
	struct smc_sock *par = smc_sk(sk)->listen_smc;

	spin_lock(&par->accept_q_lock);
	list_del_init(&smc_sk(sk)->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
	sock_put(sk); /* sock_hold in smc_accept_enqueue */
}

/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}

/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}

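/* server part of link initialization for a first contact: register the
 * RMB, send the CONFIRM LINK request and the ADD LINK probe, and wait for
 * the client's responses
 */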
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	if (smc_reg_rmb(link, smc->conn.rmb_desc))
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	/* send ADD LINK request to client over the RoCE fabric */
	rc = smc_llc_send_add_link(link,
				   link->smcibdev->mac[link->ibport - 1],
				   &link->smcibdev->gid[link->ibport - 1],
				   SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive ADD LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
							 SMC_LLC_WAIT_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	link->state = SMC_LNK_ACTIVE;

	return 0;
}

/* setup for RDMA connection of server */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);

	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM; /* insufficient memory */
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		if (!new_smc->conn.rmb_desc->reused) {
			if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}

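/* worker: accept incoming CLC/TCP connections as long as the SMC socket
 * listens, and schedule smc_listen_work() for each new connection
 */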
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first */
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}

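/* listen(): move the internal CLC/TCP socket to listening state and start
 * the TCP listen worker
 */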
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}