/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
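
/*
 * Worked example, assuming the default SCHED_TUNABLESCALING_LOG scaling
 * described further down: on a single CPU the targeted latency stays at 6ms,
 * while on an 8-CPU machine it becomes 6ms * (1 + ilog2(8)) = 24ms.
 */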

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity.
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
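
/*
 * Worked example with the 5 msec default above: a cfs_rq that runs out of
 * local runtime asks the global pool for another 5 ms slice; if the period
 * has only 2 ms of quota left, just those 2 ms are handed out.
 */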
#endif

static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}
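
/*
 * Worked example, assuming 4 online CPUs and the default
 * SCHED_TUNABLESCALING_LOG policy: the factor is 1 + ilog2(4) = 3, so the
 * effective tunables become sched_min_granularity = 2.25ms,
 * sched_latency = 18ms and sched_wakeup_granularity = 3ms.
 */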

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}
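
/*
 * Worked example of the approximation above: with delta_exec = 3ms,
 * weight = NICE_0_LOAD and a runqueue weight lw.weight equal to three
 * nice-0 tasks, delta_exec * weight / lw.weight is roughly 1ms; the
 * inv_weight and shift machinery merely computes that quotient without a
 * 64-bit division.
 */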


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued.  The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * A preemption test can be made between sibling entities that are in the
	 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
	 * both tasks until we find ancestors that are siblings with a common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}
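
/*
 * Note that max_vruntime(), min_vruntime() and entity_before() all compare
 * vruntimes through a signed difference, so the ordering stays correct even
 * if the unsigned u64 vruntime values eventually wrap around.
 */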

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}
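
/*
 * Worked example: calc_delta_fair() scales wall-clock runtime by
 * NICE_0_LOAD / se->load.weight, so an entity carrying twice the nice-0
 * weight sees its vruntime advance at only half the wall-clock rate, while
 * a nice-0 entity takes the fast path and is charged delta unchanged.
 */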

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
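
/*
 * Worked example with the defaults above (6ms latency, 0.75ms minimum
 * granularity, sched_nr_latency = 8): with 4 runnable tasks the period is
 * the full 6ms, while with 12 runnable tasks it is stretched to
 * 12 * 0.75ms = 9ms so that no slice drops below the minimum granularity.
 */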

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}
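
/*
 * Worked example: with a 6ms period and two queued tasks whose weights are
 * in a 2:1 ratio, sched_slice() gives the heavier task roughly 4ms of wall
 * time and the lighter one roughly 2ms, i.e. the period is split in
 * proportion to each entity's share of the runqueue weight.
 */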

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);

static inline void __update_task_entity_contrib(struct sched_entity *se);
static inline void __update_task_entity_utilization(struct sched_entity *se);

/* Initialize a new task's runnable averages so it initially appears fully loaded */
void init_task_runnable_average(struct task_struct *p)
{
	u32 slice;

	p->se.avg.decay_count = 0;
	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
	p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
	p->se.avg.avg_period = slice;
	__update_task_entity_contrib(&p->se);
	__update_task_entity_utilization(&p->se);
}
#else
void init_task_runnable_average(struct task_struct *p)
{
}
#endif

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations based on RSS as non-present and empty pages are skipped
	 * by the PTE scanner and NUMA hinting faults should be trapped based
	 * on resident pages
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned int smin = task_scan_min(p);
	unsigned int smax;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
	return max(smin, smax);
}
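
/*
 * Worked example with the defaults above (scan_size = 256MB,
 * scan_period_min = 1000ms, scan_period_max = 60000ms): a task with 2.5GB
 * of RSS covers 2560MB / 256MB = 10 scan windows.  The floor is
 * 1000ms / (MAX_SCAN_WINDOW / 256) = 100ms, task_scan_min() returns
 * max(100ms, 1000ms / 10) = 100ms, and task_scan_max() returns
 * 60000ms / 10 = 6000ms.
 */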

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

struct numa_group {
	atomic_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;

	struct rcu_head rcu;
	nodemask_t active_nodes;
	unsigned long total_faults;
	/*
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long *faults_cpu;
	unsigned long faults[0];
};

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	return p->numa_group ? p->numa_group->gid : 0;
}

/*
 * The averaged statistics, shared & private, memory & cpu,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
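
/*
 * task_faults() above and group_faults() below sum both
 * NR_NUMA_HINT_FAULT_TYPES entries (shared and private) of the averaged
 * NUMA_MEM statistics for a node, i.e. the total memory faults attributed
 * to that node.
 */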

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	if (!p->numa_group)
		return 0;

	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}

/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
					int maxdist, bool task)
{
	unsigned long score = 0;
	int node;

	/*
	 * All nodes are directly connected, and the same distance
	 * from each other. No need for fancy placement algorithms.
	 */
	if (sched_numa_topology_type == NUMA_DIRECT)
		return 0;

	/*
	 * This code is called for each node, introducing N^2 complexity,
	 * which should be ok given the number of nodes rarely exceeds 8.
	 */
	for_each_online_node(node) {
		unsigned long faults;
		int dist = node_distance(nid, node);

		/*
		 * The furthest away nodes in the system are not interesting
		 * for placement; nid was already counted.
		 */
		if (dist == sched_max_numa_distance || node == nid)
			continue;

		/*
		 * On systems with a backplane NUMA topology, compare groups
		 * of nodes, and move tasks towards the group with the most
		 * memory accesses. When comparing two nodes at distance
		 * "hoplimit", only nodes closer by than "hoplimit" are part
		 * of each group. Skip other nodes.
		 */
		if (sched_numa_topology_type == NUMA_BACKPLANE &&
					dist > maxdist)
			continue;

		/* Add up the faults from nearby nodes. */
		if (task)
			faults = task_faults(p, node);
		else
			faults = group_faults(p, node);

		/*
		 * On systems with a glueless mesh NUMA topology, there are
		 * no fixed "groups of nodes". Instead, nodes that are not
		 * directly connected bounce traffic through intermediate
		 * nodes; a numa_group can occupy any set of nodes.
		 * The further away a node is, the less the faults count.
		 * This seems to result in good task placement.
		 */
		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
			faults *= (sched_max_numa_distance - dist);
			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
		}

		score += faults;
	}