// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);

static int blk_mq_poll_stats_bkt(const struct request *rq)
{
	int ddir, sectors, bucket;

	ddir = rq_data_dir(rq);
	sectors = blk_rq_stats_sectors(rq);

	bucket = ddir + 2 * ilog2(sectors);

	if (bucket < 0)
		return -1;
	else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
		return ddir + BLK_MQ_POLL_STATS_BKTS - 2;

	return bucket;
}
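
/*
 * Worked example of the bucket math above (illustrative only): a 4 KiB read
 * is 8 sectors, ilog2(8) == 3, and rq_data_dir() == 0 for a read, so
 * bucket = 0 + 2 * 3 = 6; the same-sized write lands in bucket 7.  Requests
 * too large for the table are clamped into the last read/write pair of
 * buckets by the BLK_MQ_POLL_STATS_BKTS check.
 */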

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	return !list_empty_careful(&hctx->dispatch) ||
		sbitmap_any_bit_set(&hctx->ctx_map) ||
			blk_mq_sched_has_work(hctx);
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
		sbitmap_set_bit(&hctx->ctx_map, bit);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	const int bit = ctx->index_hw[hctx->type];

	sbitmap_clear_bit(&hctx->ctx_map, bit);
}

struct mq_inflight {
	struct hd_struct *part;
	unsigned int *inflight;
};

static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	/*
	 * index[0] counts the specific partition that was asked for.
	 */
	if (rq->part == mi->part)
		mi->inflight[0]++;

	return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
{
	unsigned inflight[2];
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

	return inflight[0];
}

static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				     struct request *rq, void *priv,
				     bool reserved)
{
	struct mq_inflight *mi = priv;

	if (rq->part == mi->part)
		mi->inflight[rq_data_dir(rq)]++;

	return true;
}

void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
			 unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part, .inflight = inflight, };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
}

void blk_freeze_queue_start(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	if (++q->mq_freeze_depth == 1) {
		percpu_ref_kill(&q->q_usage_counter);
		mutex_unlock(&q->mq_freeze_lock);
		if (queue_is_mq(q))
			blk_mq_run_hw_queues(q, false);
	} else {
		mutex_unlock(&q->mq_freeze_lock);
	}
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

void blk_mq_freeze_queue_wait(struct request_queue *q)
{
	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
				     unsigned long timeout)
{
	return wait_event_timeout(q->mq_freeze_wq,
					percpu_ref_is_zero(&q->q_usage_counter),
					timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
void blk_freeze_queue(struct request_queue *q)
{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
	blk_freeze_queue_start(q);
	blk_mq_freeze_queue_wait(q);
}

void blk_mq_freeze_queue(struct request_queue *q)
{
	/*
	 * ...just an alias to keep freeze and unfreeze actions balanced
	 * in the blk_mq_* namespace
	 */
	blk_freeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);

void blk_mq_unfreeze_queue(struct request_queue *q)
{
	mutex_lock(&q->mq_freeze_lock);
	q->mq_freeze_depth--;
	WARN_ON_ONCE(q->mq_freeze_depth < 0);
	if (!q->mq_freeze_depth) {
		percpu_ref_resurrect(&q->q_usage_counter);
		wake_up_all(&q->mq_freeze_wq);
	}
	mutex_unlock(&q->mq_freeze_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
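
/*
 * Usage sketch (illustrative, not part of this file): a driver that must
 * update per-queue state while no request is in flight brackets the change
 * with a freeze/unfreeze pair.  example_update_queue_state() is a
 * hypothetical helper.
 *
 *	blk_mq_freeze_queue(q);
 *	example_update_queue_state(q);
 *	blk_mq_unfreeze_queue(q);
 */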

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback from being invoked. Once this function returns, we make
 * sure no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	bool rcu = false;

	blk_mq_quiesce_queue_nowait(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(hctx->srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/*
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * This function restores the queue to the state it was in before
 * blk_mq_quiesce_queue() was called.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
	blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);

	/* dispatch requests which are inserted during quiescing */
	blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
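
/*
 * Usage sketch (illustrative, not part of this file): quiesce/unquiesce is
 * the lighter-weight sibling of freeze/unfreeze.  It only stops new
 * dispatches to ->queue_rq(); requests already issued to the driver may
 * still complete.  A driver pausing submission around an error-recovery
 * step might do:
 *
 *	blk_mq_quiesce_queue(q);
 *	(reset or reconfigure the hardware; outstanding requests may
 *	 still complete while quiesced)
 *	blk_mq_unquiesce_queue(q);
 */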

void blk_mq_wake_waiters(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		if (blk_mq_hw_queue_mapped(hctx))
			blk_mq_tag_wakeup_all(hctx->tags, true);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}

static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
		unsigned int tag, unsigned int op, u64 alloc_time_ns)
{
	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
	struct request *rq = tags->static_rqs[tag];
	req_flags_t rq_flags = 0;

	if (data->flags & BLK_MQ_REQ_INTERNAL) {
		rq->tag = -1;
		rq->internal_tag = tag;
	} else {
		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
			rq_flags = RQF_MQ_INFLIGHT;
			atomic_inc(&data->hctx->nr_active);
		}
		rq->tag = tag;
		rq->internal_tag = -1;
		data->hctx->tags->rqs[rq->tag] = rq;
	}

	/* csd/requeue_work/fifo_time is initialized before use */
	rq->q = data->q;
	rq->mq_ctx = data->ctx;
	rq->mq_hctx = data->hctx;
	rq->rq_flags = rq_flags;
	rq->cmd_flags = op;
	if (data->flags & BLK_MQ_REQ_PREEMPT)
		rq->rq_flags |= RQF_PREEMPT;
	if (blk_queue_io_stat(data->q))
		rq->rq_flags |= RQF_IO_STAT;
	INIT_LIST_HEAD(&rq->queuelist);
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->rq_disk = NULL;
	rq->part = NULL;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	rq->alloc_time_ns = alloc_time_ns;
#endif
	if (blk_mq_need_time_stamp(rq))
		rq->start_time_ns = ktime_get_ns();
	else
		rq->start_time_ns = 0;
	rq->io_start_time_ns = 0;
	rq->stats_sectors = 0;
	rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	rq->nr_integrity_segments = 0;
#endif
	/* tag was already set */
	rq->extra_len = 0;
	WRITE_ONCE(rq->deadline, 0);

	rq->timeout = 0;

	rq->end_io = NULL;
	rq->end_io_data = NULL;

	data->ctx->rq_dispatched[op_is_sync(op)]++;
	refcount_set(&rq->ref, 1);
	return rq;
}

static struct request *blk_mq_get_request(struct request_queue *q,
					  struct bio *bio,
					  struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct request *rq;
	unsigned int tag;
	bool clear_ctx_on_error = false;
	u64 alloc_time_ns = 0;

	blk_queue_enter_live(q);

	/* alloc_time includes depth and tag waits */
	if (blk_queue_rq_alloc_time(q))
		alloc_time_ns = ktime_get_ns();

	data->q = q;
	if (likely(!data->ctx)) {
		data->ctx = blk_mq_get_ctx(q);
		clear_ctx_on_error = true;
	}
	if (likely(!data->hctx))
		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
						data->ctx);
	if (data->cmd_flags & REQ_NOWAIT)
		data->flags |= BLK_MQ_REQ_NOWAIT;

	if (e) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list. Don't include reserved tags in the
		 * limiting, as it isn't useful.
		 */
		if (!op_is_flush(data->cmd_flags) &&
		    e->type->ops.limit_depth &&
		    !(data->flags & BLK_MQ_REQ_RESERVED))
			e->type->ops.limit_depth(data->cmd_flags, data);
	} else {
		blk_mq_tag_busy(data->hctx);
	}

	tag = blk_mq_get_tag(data);
	if (tag == BLK_MQ_TAG_FAIL) {
		if (clear_ctx_on_error)
			data->ctx = NULL;
		blk_queue_exit(q);
		return NULL;
	}

	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
	if (!op_is_flush(data->cmd_flags)) {
		rq->elv.icq = NULL;
		if (e && e->type->ops.prepare_request) {
			if (e->type->icq_cache)
				blk_mq_sched_assign_ioc(rq);

			e->type->ops.prepare_request(rq, bio);
			rq->rq_flags |= RQF_ELVPRIV;
		}
	}
	data->hctx->queued++;
	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
		blk_mq_req_flags_t flags)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
	struct request *rq;
	int ret;

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	rq = blk_mq_get_request(q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);
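
/*
 * Usage sketch (illustrative, not part of this file): a driver or ioctl path
 * can allocate a passthrough request, fill it in, execute it synchronously
 * and free it.  The command setup step is driver specific and only hinted
 * at here.
 *
 *	struct request *rq;
 *
 *	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	(fill in the driver-specific command payload for rq)
 *	blk_execute_rq(q, NULL, rq, 0);
 *	blk_mq_free_request(rq);
 */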

struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
	struct request *rq;
	unsigned int cpu;
	int ret;

	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
	if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
		return ERR_PTR(-EINVAL);

	if (hctx_idx >= q->nr_hw_queues)
		return ERR_PTR(-EIO);

	ret = blk_queue_enter(q, flags);
	if (ret)
		return ERR_PTR(ret);

	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
		blk_queue_exit(q);
		return ERR_PTR(-EXDEV);
	}
	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);

	rq = blk_mq_get_request(q, NULL, &alloc_data);
	blk_queue_exit(q);

	if (!rq)
		return ERR_PTR(-EWOULDBLOCK);

	return rq;
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
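
/*
 * Note (illustrative, not part of this file): blk_mq_alloc_request_hctx() is
 * for the rare case where a command must be issued on one specific hardware
 * queue, so callers must pass BLK_MQ_REQ_NOWAIT.  A hypothetical caller:
 *
 *	rq = blk_mq_alloc_request_hctx(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_NOWAIT,
 *				       hctx_idx);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 */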

static void __blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	const int sched_tag = rq->internal_tag;

	blk_pm_mark_last_busy(rq);
	rq->mq_hctx = NULL;
	if (rq->tag != -1)
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	if (sched_tag != -1)
		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	if (rq->rq_flags & RQF_ELVPRIV) {
		if (e && e->type->ops.finish_request)
			e->type->ops.finish_request(rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	ctx->rq_completed[rq_is_sync(rq)]++;
	if (rq->rq_flags & RQF_MQ_INFLIGHT)
		atomic_dec(&hctx->nr_active);

	if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
		laptop_io_completion(q->backing_dev_info);

	rq_qos_done(q, rq);

	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
	if (refcount_dec_and_test(&rq->ref))
		__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
	u64 now = 0;

	if (blk_mq_need_time_stamp(rq))
		now = ktime_get_ns();

	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq, now);
	}

	if (rq->internal_tag != -1)
		blk_mq_sched_completed_request(rq, now);

	blk_account_io_done(rq, now);

	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
		rq->end_io(rq, error);
	} else {
		blk_mq_free_request(rq);
	}
}
EXPORT_SYMBOL(__blk_mq_end_request);

void blk_mq_end_request(struct request *rq, blk_status_t error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);
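
/*
 * Illustrative note (not part of this file): blk_mq_end_request() is the
 * variant most drivers want, since it also accounts the transferred bytes
 * via blk_update_request().  A driver failing a request outright would do,
 * for example:
 *
 *	blk_mq_end_request(rq, BLK_STS_IOERR);
 */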

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;
	struct request_queue *q = rq->q;

	q->mq_ops->complete(rq);
}

static void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct request_queue *q = rq->q;
	bool shared = false;
	int cpu;

	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
	/*
	 * For most single queue controllers, there is only one irq vector
	 * for handling IO completion, and that irq's affinity is set to
	 * all possible CPUs. On most architectures, this affinity means the
	 * irq is handled on one specific CPU.
	 *
	 * So complete the IO request in softirq context in the single queue
	 * case, to avoid degrading IO performance through irqsoff latency.
	 */
	if (q->nr_hw_queues == 1) {
		__blk_complete_request(rq);
		return;
	}

	/*
	 * For a polled request, always complete locally, it's pointless
	 * to redirect the completion.
	 */
	if ((rq->cmd_flags & REQ_HIPRI) ||
	    !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
		q->mq_ops->complete(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		q->mq_ops->complete(rq);
	}
	put_cpu();
}

static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
	__releases(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
		rcu_read_unlock();
	else
		srcu_read_unlock(hctx->srcu, srcu_idx);
}

static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
	__acquires(hctx->srcu)
{
	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		/* shut up gcc false positive */
		*srcu_idx = 0;
		rcu_read_lock();
	} else
		*srcu_idx = srcu_read_lock(hctx->srcu);
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
bool blk_mq_complete_request(struct request *rq)
{
	if (unlikely(blk_should_fake_timeout(rq->q)))
		return false;
	__blk_mq_complete_request(rq);
	return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);

int blk_mq_request_started(struct request *rq)
{
	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);

int blk_mq_request_completed(struct request *rq)
{
	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_completed);

void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
		rq->io_start_time_ns = ktime_get_ns();
		rq->stats_sectors = blk_rq_sectors(rq);
		rq->rq_flags |= RQF_STATS;
		rq_qos_issue(q, rq);
	}

	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

	blk_add_timer(rq);
	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears.  We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}

#ifdef CONFIG_BLK_DEV_INTEGRITY
	if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
		q->integrity.profile->prepare_fn(rq);
#endif
}
EXPORT_SYMBOL(blk_mq_start_request);
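
/*
 * Completion-flow sketch (illustrative, not part of this file): a typical
 * blk-mq driver calls blk_mq_start_request() from its ->queue_rq() hook,
 * signals completion from its interrupt handler with
 * blk_mq_complete_request(), and finishes the request from its ->complete()
 * callback.  example_queue_rq() and example_complete() are hypothetical
 * driver callbacks.
 *
 *	static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
 *					     const struct blk_mq_queue_data *bd)
 *	{
 *		blk_mq_start_request(bd->rq);
 *		(hand bd->rq to the hardware here)
 *		return BLK_STS_OK;
 *	}
 *
 *	static void example_complete(struct request *rq)
 *	{
 *		blk_mq_end_request(rq, BLK_STS_OK);
 *	}
 */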

static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	blk_mq_put_driver_tag(rq);

	trace_block_rq_requeue(q, rq);
	rq_qos_requeue(q, rq);

	if (blk_mq_request_started(rq)) {
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		rq->rq_flags &= ~RQF_TIMED_OUT;
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	/* this request will be re-inserted to io scheduler queue */
	blk_mq_sched_requeue_request(rq);

	BUG_ON(!list_empty(&rq->queuelist));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
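
/*
 * Usage sketch (illustrative, not part of this file): a driver that started
 * a request but then hit a transient resource shortage can hand it back to
 * the block layer and let the requeue work re-dispatch it.  Passing true
 * kicks the requeue list immediately; a driver that prefers to retry after
 * a delay can pass false and call blk_mq_delay_kick_requeue_list() later.
 *
 *	blk_mq_requeue_request(rq, true);
 */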

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;

	spin_lock_irq(&q->requeue_lock);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irq(&q->requeue_lock);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		/*
		 * If RQF_DONTPREP is set, rq still contains driver specific
		 * data, so insert it to the hctx dispatch list to avoid any
		 * merge.
		 */
		if (rq->rq_flags & RQF_DONTPREP)
			blk_mq_request_bypass_insert(rq, false);
		else
			blk_mq_sched_insert_request(rq, true, false, false);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
	BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);

	spin_lock_irqsave(&q->requeue_lock, flags);
	if (at_head) {
		rq->rq_flags |= RQF_SOFTBARRIER;
		list_add(&rq->queuelist, &q->requeue_list);
	} else {
		list_add_tail(&rq->queuelist, &q->requeue_list);
	}
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	if (kick_requeue_list)
		blk_mq_kick_requeue_list(q);
}

void blk_mq_kick_requeue_list(struct request_queue *q)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

void blk_mq_delay_kick_requeue_list(struct request_queue *q,
				    unsigned long msecs)
{
	kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
				    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}

	return NULL;
}
EXPORT_SYMBOL(blk_mq_tag_to_rq);
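
/*
 * Usage sketch (illustrative, not part of this file): drivers whose hardware
 * reports completions by tag typically map the tag back to the request and
 * then complete it.  The completion-entry layout and tag set naming below
 * are hypothetical.
 *
 *	static void example_handle_cqe(struct example_dev *dev, u16 tag)
 *	{
 *		struct request *rq;
 *
 *		rq = blk_mq_tag_to_rq(dev->tagset.tags[0], tag);
 *		if (rq)
 *			blk_mq_complete_request(rq);
 *	}
 */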

static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
			       void *priv, bool reserved)
{
	/*
	 * If we find a request that is inflight and the queue matches,
	 * we know the queue is busy. Return false to stop the iteration.
	 */
	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
		bool *busy = priv;

		*busy = true;
		return false;
	}

	return true;
}

bool blk_mq_queue_inflight(struct request_queue *q)
{
	bool busy = false;

	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
	return busy;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
	req->rq_flags |= RQF_TIMED_OUT;
	if (req->q->mq_ops->timeout) {
		enum blk_eh_timer_return ret;

		ret = req->q->mq_ops->timeout(req, reserved);
		if (ret == BLK_EH_DONE)
			return;
		WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
	}

	blk_add_timer(req);
}

static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
{
	unsigned long deadline;

	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
		return false;
	if (rq->rq_flags & RQF_TIMED_OUT)
		return false;

	deadline = READ_ONCE(rq->deadline);
	if (time_after_eq(jiffies, deadline))
		return true;

	if (*next == 0)
		*next = deadline;
	else if (time_after(*next, deadline))
		*next = deadline;
	return false;
}

static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
		struct request *rq, void *priv, bool reserved)
{
	unsigned long *next = priv;

	/*
	 * Just do a quick check if it is expired before locking the request,
	 * so we're not unnecessarily synchronizing across CPUs.
	 */
	if (!blk_mq_req_expired(rq, next))
		return true;

	/*
	 * We have reason to believe the request may be expired. Take a
	 * reference on the request to lock this request lifetime into its
	 * currently allocated context to prevent it from being reallocated in
	 * the event the completion bypasses this timeout handler.
	 *
	 * If the reference was already released, then the driver beat the
	 * timeout handler to posting a natural completion.
	 */
	if (!refcount_inc_not_zero(&rq->ref))
		return true;

	/*
	 * The request is now locked and cannot be reallocated underneath the
	 * timeout handler's processing. Re-verify this exact request is truly
	 * expired; if it is not expired, then the request was completed and
	 * reallocated as a new request.
	 */
	if (blk_mq_req_expired(rq, next))
		blk_mq_rq_timed_out(rq, reserved);

	if (is_flush_rq(rq, hctx))
		rq->end_io(rq