Commit 17bf423a authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Introduce "Energy Aware Scheduling" - by Quentin Perret.

     This is a coherent topology description of CPUs in cooperation with
     the PM subsystem, with the goal to schedule more energy-efficiently
     on asymmetric SMP platforms - such as waking up tasks to the more
     energy-efficient CPUs first, as long as the system isn't
     oversubscribed.

     For details of the design, see:

        https://lore.kernel.org/lkml/20180724122521.22109-1-quentin.perret@arm.com/

   - Misc cleanups and smaller enhancements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  sched/fair: Select an energy-efficient CPU on task wake-up
  sched/fair: Introduce an energy estimation helper function
  sched/fair: Add over-utilization/tipping point indicator
  sched/fair: Clean-up update_sg_lb_stats parameters
  sched/toplogy: Introduce the 'sched_energy_present' static key
  sched/topology: Make Energy Aware Scheduling depend on schedutil
  sched/topology: Disable EAS on inappropriate platforms
  sched/topology: Add lowest CPU asymmetry sched_domain level pointer
  sched/topology: Reference the Energy Model of CPUs when available
  PM: Introduce an Energy Model management framework
  sched/cpufreq: Prepare schedutil for Energy Aware Scheduling
  sched/topology: Relocate arch_scale_cpu_capacity() to the internal header
  sched/core: Remove unnecessary unlikely() in push_*_task()
  sched/topology: Remove the ::smt_gain field from 'struct sched_domain'
  sched: Fix various typos in comments
  sched/core: Clean up the #ifdef block in add_nr_running()
  sched/fair: Make some variables static
  sched/core: Create task_has_idle_policy() helper
  sched/fair: Add lsub_positive() and use it consistently
  sched/fair: Mask UTIL_AVG_UNCHANGED usages
  ...
parents 116b081c 732cd75b
......@@ -2277,6 +2277,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
ret = cpufreq_start_governor(policy);
if (!ret) {
pr_debug("cpufreq: governor change\n");
sched_cpufreq_governor_change(policy, old_gov);
return 0;
}
cpufreq_exit_governor(policy);
......
......@@ -950,6 +950,14 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
}
#endif
/*
 * Notification hook invoked by cpufreq_set_policy() once the new governor
 * has been started successfully. The real implementation is only declared
 * when both the Energy Model and the schedutil governor are configured;
 * otherwise the call compiles down to an empty inline stub.
 */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov);
#else
/* No-op fallback: nothing to update on a governor change. */
static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov) { }
#endif
extern void arch_freq_prepare_all(void);
extern unsigned int arch_freq_get_on_cpu(int cpu);
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ENERGY_MODEL_H
#define _LINUX_ENERGY_MODEL_H
#include <linux/cpumask.h>
#include <linux/jump_label.h>
#include <linux/kobject.h>
#include <linux/rcupdate.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/topology.h>
#include <linux/types.h>
#ifdef CONFIG_ENERGY_MODEL
/**
 * struct em_cap_state - Capacity state of a performance domain
 * @frequency:	The CPU frequency in kHz, for consistency with CPUFreq
 * @power:	The power consumed by one CPU at this level, in milli-watts
 * @cost:	The cost coefficient associated with this level, used during
 *		energy calculation. Equal to: power * max_frequency / frequency
 *
 * One entry of the table referenced by struct em_perf_domain. The @cost
 * field is pre-computed once at registration time so that the energy
 * estimation only needs a multiplication and a division.
 */
struct em_cap_state {
unsigned long frequency;
unsigned long power;
unsigned long cost;
};
/**
 * struct em_perf_domain - Performance domain
 * @table:		List of capacity states, in ascending order
 * @nr_cap_states:	Number of capacity states
 * @cpus:		Cpumask covering the CPUs of the domain
 *
 * A "performance domain" represents a group of CPUs whose performance is
 * scaled together. All CPUs of a performance domain must have the same
 * micro-architecture. Performance domains often have a 1-to-1 mapping with
 * CPUFreq policies.
 *
 * @cpus is a trailing flexible array member: the storage for the cpumask
 * is allocated together with the structure, right after its fixed part.
 */
struct em_perf_domain {
	struct em_cap_state *table;
	int nr_cap_states;
	unsigned long cpus[];
};
/*
 * Largest per-CPU power value (in milli-watts) accepted from a driver;
 * em_create_pd() rejects any capacity state reporting more than this.
 */
#define EM_CPU_MAX_POWER 0xFFFF
/* Set of driver-provided callbacks used to populate an Energy Model. */
struct em_data_callback {
/**
* active_power() - Provide power at the next capacity state of a CPU
* @power : Active power at the capacity state in mW (modified)
* @freq : Frequency at the capacity state in kHz (modified)
* @cpu : CPU for which we do this operation
*
* active_power() must find the lowest capacity state of 'cpu' above
* 'freq' and update 'power' and 'freq' to the matching active power
* and frequency.
*
* The power is the one of a single CPU in the domain, expressed in
* milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER]
* range.
*
* Return 0 on success.
*/
int (*active_power)(unsigned long *power, unsigned long *freq, int cpu);
};
/* Static initializer for a struct em_data_callback wrapping one callback. */
#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb }
struct em_perf_domain *em_cpu_get(int cpu);
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
struct em_data_callback *cb);
/**
 * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain
 * @pd		: performance domain for which energy has to be estimated
 * @max_util	: highest utilization among CPUs of the domain
 * @sum_util	: sum of the utilization of all CPUs in the domain
 *
 * Return: the sum of the energy consumed by the CPUs of the domain assuming
 * a capacity state satisfying the max utilization of the domain.
 */
static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
				unsigned long max_util, unsigned long sum_util)
{
	struct em_cap_state *states = pd->table;
	int nr_states = pd->nr_cap_states;
	unsigned long cpu_cap, wanted_freq;
	struct em_cap_state *state;
	int first_cpu, idx = 0;

	/*
	 * Predict the capacity state the same way schedutil would: turn the
	 * utilization of the busiest CPU of the domain into a frequency
	 * request, relative to the highest frequency of the table.
	 */
	first_cpu = cpumask_first(to_cpumask(pd->cpus));
	cpu_cap = arch_scale_cpu_capacity(NULL, first_cpu);
	state = &states[nr_states - 1];
	wanted_freq = map_util_freq(max_util, state->frequency, cpu_cap);

	/*
	 * Walk the (ascending) table and stop at the first capacity state
	 * able to serve the request. When none qualifies, 'state' ends up
	 * pointing at the last, i.e. highest, entry.
	 */
	while (idx < nr_states) {
		state = &states[idx];
		if (state->frequency >= wanted_freq)
			break;
		idx++;
	}

	/*
	 * Capacity of a CPU of the domain at the selected state:
	 *
	 *             state->freq * cpu_cap
	 *   cap   = -------------------------                          (1)
	 *                 cpu_max_freq
	 *
	 * Idle state costs are not part of the EM and are ignored, so the
	 * energy consumed by one CPU at that state is estimated as:
	 *
	 *             state->power * cpu_util
	 *   nrg   = ---------------------------                        (2)
	 *                      cap
	 *
	 * where 'cpu_util / cap' is the fraction of time the CPU is busy.
	 *
	 * NOTE: strictly speaking this is a power, but since it is assumed
	 *       constant over a scheduling period it can be handled as an
	 *       energy value for that interval.
	 *
	 * Substituting (1) into (2) splits the estimate in two factors:
	 *
	 *             state->power * cpu_max_freq     cpu_util
	 *   nrg   = ------------------------------ * ----------        (3)
	 *                    state->freq               cpu_cap
	 *
	 * The left factor never changes and is cached as 'state->cost'.
	 *
	 * All CPUs of the domain share the micro-architecture, hence the
	 * same 'cost' and the same capacity, so the energy of the whole
	 * domain (the sum over its CPUs) factorizes as:
	 *
	 *             state->cost * \Sum cpu_util
	 *   pd_nrg = -----------------------------                     (4)
	 *                      cpu_cap
	 */
	return state->cost * sum_util / cpu_cap;
}
/**
 * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain
 * @pd		: performance domain for which this must be done
 *
 * Plain accessor: @pd is dereferenced without any NULL check.
 *
 * Return: the number of capacity states in the performance domain table
 */
static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
{
return pd->nr_cap_states;
}
#else
/*
 * CONFIG_ENERGY_MODEL is not set: provide empty placeholder types so that
 * users of the API still compile.
 */
struct em_perf_domain {};
struct em_data_callback {};
/* Expands to an empty initializer; the callback argument is not used. */
#define EM_DATA_CB(_active_power_cb) { }
/*
 * Stub for !CONFIG_ENERGY_MODEL: registration is never possible, so every
 * call fails with -EINVAL regardless of its arguments.
 */
static inline int em_register_perf_domain(cpumask_t *span,
unsigned int nr_states, struct em_data_callback *cb)
{
return -EINVAL;
}
/* Stub for !CONFIG_ENERGY_MODEL: no CPU ever has a performance domain. */
static inline struct em_perf_domain *em_cpu_get(int cpu)
{
return NULL;
}
/*
 * Stub for !CONFIG_ENERGY_MODEL: without an Energy Model the estimated
 * energy is always reported as 0; the arguments are ignored.
 */
static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
unsigned long max_util, unsigned long sum_util)
{
return 0;
}
/* Stub for !CONFIG_ENERGY_MODEL: a domain never has any capacity state. */
static inline int em_pd_nr_cap_states(struct em_perf_domain *pd)
{
return 0;
}
#endif
#endif
......@@ -176,7 +176,7 @@ struct task_group;
* TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
*
* However, with slightly different timing the wakeup TASK_RUNNING store can
* also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
* also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
* a problem either because that will result in one extra go around the loop
* and our @cond test will save the day.
*
......@@ -515,7 +515,7 @@ struct sched_dl_entity {
/*
* Actual scheduling parameters. Initialized with the values above,
* they are continously updated during task execution. Note that
* they are continuously updated during task execution. Note that
* the remaining runtime could be < 0 in case we are in overrun.
*/
s64 runtime; /* Remaining runtime for this instance */
......
......@@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
void (*func)(struct update_util_data *data, u64 time,
unsigned int flags));
void cpufreq_remove_update_util_hook(int cpu);
/*
 * Translate a utilization value into a frequency request.
 *
 * The reference frequency 'freq' is first raised by a quarter of itself
 * (freq + freq / 4) and then scaled by the 'util' / 'cap' ratio.
 * All arithmetic is done in unsigned long; 'cap' must be non-zero.
 */
static inline unsigned long map_util_freq(unsigned long util,
					  unsigned long freq, unsigned long cap)
{
	unsigned long boosted = freq;

	boosted += freq >> 2;

	return boosted * util / cap;
}
#endif /* CONFIG_CPU_FREQ */
#endif /* _LINUX_SCHED_CPUFREQ_H */
......@@ -16,7 +16,7 @@ enum hk_flags {
};
#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
......@@ -43,7 +43,7 @@ static inline void housekeeping_init(void) { }
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
#ifdef CONFIG_CPU_ISOLATION
if (static_branch_unlikely(&housekeeping_overriden))
if (static_branch_unlikely(&housekeeping_overridden))
return housekeeping_test_cpu(cpu, flags);
#endif
return true;
......
......@@ -153,7 +153,7 @@ static inline gfp_t current_gfp_context(gfp_t flags)
{
/*
* NOIO implies both NOIO and NOFS and it is a weaker context
* so always make sure it makes precendence
* so always make sure it makes precedence
*/
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
flags &= ~(__GFP_IO | __GFP_FS);
......
......@@ -8,7 +8,7 @@
* Various counters maintained by the scheduler and fork(),
* exposed via /proc, sys.c or used by drivers via these APIs.
*
* ( Note that all these values are aquired without locking,
* ( Note that all these values are acquired without locking,
* so they can only be relied on in narrow circumstances. )
*/
......
......@@ -89,7 +89,6 @@ struct sched_domain {
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int nohz_idle; /* NOHZ IDLE status */
int flags; /* See SD_* */
......@@ -202,6 +201,14 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
# define SD_INIT_NAME(type)
#endif
/*
 * Default CPU capacity, used when no arch_scale_cpu_capacity macro has
 * been defined: every CPU reports SCHED_CAPACITY_SCALE and both arguments
 * are ignored.
 */
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#else /* CONFIG_SMP */
struct sched_domain_attr;
......@@ -217,6 +224,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
return true;
}
/*
 * !CONFIG_SMP flavour of the default CPU capacity: 'sd' is an untyped,
 * unused pointer here and the result is always SCHED_CAPACITY_SCALE.
 */
#ifndef arch_scale_cpu_capacity
static __always_inline
unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#endif /* !CONFIG_SMP */
static inline int task_node(const struct task_struct *p)
......
......@@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF
config CPU_PM
bool
config ENERGY_MODEL
bool "Energy Model for CPUs"
depends on SMP
depends on CPU_FREQ
default n
help
Several subsystems (thermal and/or the task scheduler for example)
can leverage information about the energy consumed by CPUs to make
smarter decisions. This config option enables the framework from
which subsystems can access the energy models.
The exact usage of the energy model is subsystem-dependent.
If in doubt, say N.
......@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
obj-$(CONFIG_ENERGY_MODEL) += energy_model.o
// SPDX-License-Identifier: GPL-2.0
/*
* Energy Model of CPUs
*
* Copyright (c) 2018, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
/* Mapping of each CPU to the performance domain to which it belongs. */
static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
/*
 * Allocate and fill the performance domain spanning 'span'.
 *
 * The driver callback is queried 'nr_states' times, on the first CPU of
 * the span, to collect the capacity states in ascending frequency order.
 * Each state is validated, and the per-state cost is derived once the
 * whole table is known.
 *
 * Returns the new domain, or NULL if the callback is missing, an
 * allocation fails or a capacity state is rejected.
 */
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
						struct em_data_callback *cb)
{
	unsigned long ratio, last_ratio = ULONG_MAX;
	unsigned long mw, khz, last_khz = 0;
	int idx, err, cpu = cpumask_first(span);
	struct em_cap_state *states;
	struct em_perf_domain *pd;
	u64 top_khz;

	if (!cb->active_power)
		return NULL;

	/* The cpumask storage lives right behind the fixed part of the pd. */
	pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
	if (!pd)
		return NULL;

	states = kcalloc(nr_states, sizeof(*states), GFP_KERNEL);
	if (!states)
		goto err_free_pd;

	/* Collect the capacity states of this performance domain. */
	idx = 0;
	khz = 0;
	while (idx < nr_states) {
		/*
		 * The driver's active_power() rounds 'khz' up to the lowest
		 * capacity state of 'cpu' above it, and reports that state's
		 * frequency and power back through its arguments.
		 */
		err = cb->active_power(&mw, &khz, cpu);
		if (err) {
			pr_err("pd%d: invalid cap. state: %d\n", cpu, err);
			goto err_free_states;
		}

		/* Frequencies have to strictly grow from state to state. */
		if (khz <= last_khz) {
			pr_err("pd%d: non-increasing freq: %lu\n", cpu, khz);
			goto err_free_states;
		}

		/*
		 * The power reported by active_power() must be non-zero,
		 * in milli-watts, and fit into 16 bits.
		 */
		if (!mw || mw > EM_CPU_MAX_POWER) {
			pr_err("pd%d: invalid power: %lu\n", cpu, mw);
			goto err_free_states;
		}

		states[idx].power = mw;
		states[idx].frequency = khz;
		last_khz = khz;

		/*
		 * On sane platforms the hertz/watts efficiency drops as the
		 * frequency rises. Reality does not always agree, so only
		 * warn when a higher OPP is not less efficient than the
		 * previous one.
		 */
		ratio = khz / mw;
		if (ratio >= last_ratio)
			pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
					cpu, idx, idx - 1);
		last_ratio = ratio;

		/* Next query starts just above the state found this round. */
		idx++;
		khz++;
	}

	/* Derive the cost of every state from the highest frequency. */
	top_khz = (u64) states[nr_states - 1].frequency;
	for (idx = 0; idx < nr_states; idx++) {
		struct em_cap_state *cs = &states[idx];

		cs->cost = div64_u64(top_khz * cs->power, cs->frequency);
	}

	pd->table = states;
	pd->nr_cap_states = nr_states;
	cpumask_copy(to_cpumask(pd->cpus), span);

	return pd;

err_free_states:
	kfree(states);
err_free_pd:
	kfree(pd);

	return NULL;
}
/**
 * em_cpu_get() - Return the performance domain for a CPU
 * @cpu : CPU to find the performance domain for
 *
 * Performs a single READ_ONCE() of the per-CPU domain pointer.
 *
 * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
 * exist.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
	struct em_perf_domain *pd = READ_ONCE(per_cpu(em_data, cpu));

	return pd;
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
 * em_register_perf_domain() - Register the Energy Model of a performance domain
 * @span	: Mask of CPUs in the performance domain
 * @nr_states	: Number of capacity states to register
 * @cb		: Callback functions providing the data of the Energy Model
 *
 * Create Energy Model tables for a performance domain using the callbacks
 * defined in cb.
 *
 * If any CPU of @span already belongs to a registered performance domain,
 * nothing is changed and the existing registration is kept.
 *
 * Return: 0 on success, -EEXIST if a CPU of @span is already registered,
 * -EINVAL on invalid arguments, mismatching CPU capacities or if the domain
 * could not be created.
 */
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
						struct em_data_callback *cb)
{
	unsigned long cpu_cap, ref_cap = 0;
	struct em_perf_domain *domain;
	int err = 0;
	int cpu;

	if (!span || !nr_states || !cb)
		return -EINVAL;

	/*
	 * The mutex serializes registrations and, unlike a spinlock, lets
	 * the driver-provided callbacks sleep.
	 */
	mutex_lock(&em_pd_mutex);

	for_each_cpu(cpu, span) {
		/* Refuse to register a domain a second time. */
		if (READ_ONCE(per_cpu(em_data, cpu))) {
			err = -EEXIST;
			goto out_unlock;
		}

		/*
		 * One table is shared by the whole domain, so all of its
		 * CPUs must have the same micro-architecture.
		 */
		cpu_cap = arch_scale_cpu_capacity(NULL, cpu);
		if (ref_cap && cpu_cap != ref_cap) {
			pr_err("CPUs of %*pbl must have the same capacity\n",
							cpumask_pr_args(span));
			err = -EINVAL;
			goto out_unlock;
		}
		ref_cap = cpu_cap;
	}

	/* Build the performance domain, then publish it to the Energy Model. */
	domain = em_create_pd(span, nr_states, cb);
	if (!domain) {
		err = -EINVAL;
		goto out_unlock;
	}

	/*
	 * em_cpu_get() may read the per-cpu pointers concurrently. The
	 * release barrier provides the ordering needed so that readers only
	 * ever see fully initialized em_perf_domain structs.
	 */
	for_each_cpu(cpu, span)
		smp_store_release(per_cpu_ptr(&em_data, cpu), domain);

	pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));

out_unlock:
	mutex_unlock(&em_pd_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(em_register_perf_domain);
......@@ -697,7 +697,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (idle_policy(p->policy)) {
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
p->se.runnable_weight = load->weight;
......@@ -2857,7 +2857,7 @@ unsigned long nr_running(void)
* preemption, thus the result might have a time-of-check-to-time-of-use
* race. The caller is responsible to use it correctly, for example:
*
* - from a non-preemptable section (of course)
* - from a non-preemptible section (of course)
*
* - from a thread that is bound to a single CPU
*
......@@ -4191,7 +4191,7 @@ recheck:
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (idle_policy(p->policy) && !idle_policy(policy)) {
if (task_has_idle_policy(p) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
......
......@@ -10,6 +10,7 @@
#include "sched.h"
#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>
struct sugov_tunables {
......@@ -164,7 +165,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
freq = (freq + (freq >> 2)) * util / max;
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
......@@ -194,15 +195,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util, irq, max;
sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
if (rt_rq_is_runnable(&rq->rt))
if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
return max;
/*
......@@ -220,21 +219,30 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*/
util = cpu_util_cfs(rq);
util = util_cfs;
util += cpu_util_rt(rq);
<