From b61876ed122f816660fe49e0de1b7ee4891deaa2 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 18 Dec 2018 10:31:30 +0000 Subject: [PATCH 01/15] ANDROID: sched/fair: EAS: Add uclamp support to find_energy_efficient_cpu() Utilization clamping can be used to boost the utilization of small tasks or cap that of big tasks. Thus, one of its possible usages is to bias task placement to "promote" small tasks on higher capacity (less energy efficient) CPUs or "constrain" big tasks on small capacity (more energy efficient) CPUs. When the Energy Aware Scheduler (EAS) looks for the most energy efficient CPU to run a task on, it currently considers only the effective utilization estimated for a task. Fix this by adding an additional check to skip CPUs whose capacity is smaller than the task's clamped utilization. Change-Id: I43fa6fa27e27c1eb5272c6a45d1a6a5b0faae1aa Signed-off-by: Patrick Bellasi Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83ab35e2374f..c80cf449f9fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6026,6 +6026,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } +static unsigned int uclamp_task_util(struct task_struct *p) +{ +#ifdef CONFIG_UCLAMP_TASK + unsigned int min_util = uclamp_eff_value(p, UCLAMP_MIN); + unsigned int max_util = uclamp_eff_value(p, UCLAMP_MAX); + unsigned int est_util = task_util_est(p); + + return clamp(est_util, min_util, max_util); +#else + return task_util_est(p); +#endif +} + /** * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks * @cpu: the CPU to get the utilization of @@ -6370,6 +6383,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; + /* Skip CPUs which do not fit task requirements */ + if (cpu_cap < uclamp_task_util(p)) + continue; + /* Always use prev_cpu as a candidate. */ if (cpu == prev_cpu) { prev_delta = compute_energy(p, prev_cpu, pd); From 79e3a4a27ee502ffed05bb8011d9540867a66719 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 27 Mar 2019 17:15:17 +0000 Subject: [PATCH 02/15] ANDROID: sched: Unconditionally honor sync flag for energy-aware wakeups Since we don't do energy-aware wakeups when we are overutilized, always honoring sync wakeups in this state does not prevent wake-wide mechanics overruling the flag as normal. This patch is based upon previous work to build EAS for android products. sync-hint code taken from commit 4a5e890ec60d "sched/fair: add tunable to force selection at cpu granularity" written by Juri Lelli Change-Id: I4b3d79141fc8e53dc51cd63ac11096c2e3cb10f5 Signed-off-by: Chris Redpath (cherry-picked from commit f1ec666a62dec1083ed52fe1ddef093b84373aaf) [ Moved the feature to find_energy_efficient_cpu() and removed the sysctl knob ] Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c80cf449f9fb..baefb5ebc85e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6336,7 +6336,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path.
*/ -static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -6350,6 +6350,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!pd || READ_ONCE(rd->overutilized)) goto fail; + cpu = smp_processor_id(); + if (sync && cpumask_test_cpu(cpu, p->cpus_ptr)) { + rcu_read_unlock(); + return cpu; + } + /* * Energy-aware wake-up happens on the lowest sched_domain starting * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. @@ -6461,7 +6467,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f record_wakee(p); if (sched_energy_enabled()) { - new_cpu = find_energy_efficient_cpu(p, prev_cpu); + new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync); if (new_cpu >= 0) return new_cpu; new_cpu = prev_cpu; From 27ad2abc032bff8761bd0dd03f6e90d0bbd9e3e3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 15:48:06 +0100 Subject: [PATCH 03/15] ANDROID: sched/fair: add arch scaling function for max frequency capping To be able to scale the cpu capacity by this factor introduce a call to the new arch scaling function arch_scale_max_freq_capacity() in update_cpu_capacity() and provide a default implementation which returns SCHED_CAPACITY_SCALE. Another subsystem (e.g. cpufreq) or architectural or platform specific code can overwrite this default implementation, exactly as for frequency and cpu invariance. It has to be enabled by the arch by defining arch_scale_max_freq_capacity to the actual implementation. Change-Id: I770a8b1f4f7340e9e314f71c64a765bf880f4b4d Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann ( Fixed conflict with scaling against the PELT-based scale_rt_capacity ) Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 12 ++++++++---- kernel/sched/sched.h | 9 +++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index baefb5ebc85e..fee99f8cb193 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7725,10 +7725,9 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu, unsigned long max) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -7750,10 +7749,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = arch_scale_cpu_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); + cpu_rq(cpu)->cpu_capacity_orig = capacity; + + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + + capacity = scale_rt_capacity(cpu, capacity); if (!capacity) capacity = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0db2c1b3361e..f0b6e3ab2c96 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1961,6 +1961,15 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifndef arch_scale_max_freq_capacity +struct sched_domain; +static __always_inline +unsigned long arch_scale_max_freq_capacity(struct sched_domain 
*sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #ifdef CONFIG_SMP #ifdef CONFIG_PREEMPTION From 0cfe39fe403ea6dbe2fdfb38edd36022b35d4d66 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:52:33 +0100 Subject: [PATCH 04/15] ANDROID: cpufreq: arch_topology: implement max frequency capping Implements the Max Frequency Capping Engine (MFCE) getter function topology_get_max_freq_scale() to provide the scheduler with a maximum frequency scaling correction factor for more accurate cpu capacity handling by being able to deal with max frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency (policy) lower than the maximum possible frequency (cpuinfo). The factor is: policy_max_freq(cpu) << SCHED_CAPACITY_SHIFT / cpuinfo_max_freq(cpu) It also implements the MFCE setter function arch_set_max_freq_scale() which is called from cpufreq_set_policy(). Change-Id: I59e52861ee260755ab0518fe1f7183a2e4e3d0fc Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann [Trivial cherry-pick issue in cpufreq.c] Signed-off-by: Quentin Perret --- drivers/base/arch_topology.c | 25 ++++++++++++++++++++++++- drivers/cpufreq/cpufreq.c | 8 ++++++++ include/linux/arch_topology.h | 8 ++++++++ include/linux/cpufreq.h | 2 ++ 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1eb81f113786..c0cf9ef5c2f5 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -22,6 +22,8 @@ #include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +DEFINE_PER_CPU(unsigned long, max_cpu_freq); +DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) @@ -31,8 +33,29 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; - for_each_cpu(i, cpus) + for_each_cpu(i, cpus) { per_cpu(freq_scale, i) = scale; + per_cpu(max_cpu_freq, i) = max_freq; + } +} + +void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq) +{ + unsigned long scale, max_freq; + int cpu = cpumask_first(cpus); + + if (cpu > nr_cpu_ids) + return; + + max_freq = per_cpu(max_cpu_freq, cpu); + if (!max_freq) + return; + + scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq; + + for_each_cpu(cpu, cpus) + per_cpu(max_freq_scale, cpu) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c52d6fa32aac..cf118aa0a83e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -152,6 +152,12 @@ __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, } EXPORT_SYMBOL_GPL(arch_set_freq_scale); +__weak void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq) +{ +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_scale); + /* * This is a generic cpufreq init() routine which can be used by cpufreq * drivers of SMP systems. 
It will do following: @@ -2398,6 +2404,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy, policy->max = new_policy->max; trace_cpu_frequency_limits(policy); + arch_set_max_freq_scale(policy->cpus, policy->max); + policy->cached_target_freq = UINT_MAX; pr_debug("new min and max freqs are %u - %u kHz\n", diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 42f2b5126094..5402bc0e2b1e 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -33,6 +33,14 @@ unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); } +DECLARE_PER_CPU(unsigned long, max_freq_scale); + +static inline +unsigned long topology_get_max_freq_scale(struct sched_domain *sd, int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + struct cpu_topology { int thread_id; int core_id; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c57e88e85c41..fca45a2749d4 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -984,6 +984,8 @@ extern unsigned int arch_freq_get_on_cpu(int cpu); extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); +extern void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq); /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; From a273512dafbf6322b134716873d50d1af803972f Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:54:16 +0100 Subject: [PATCH 05/15] ANDROID: arm64: enable max frequency capping Defines arch_scale_max_freq_capacity() to use the topology driver scale function. Change-Id: If7565747ec862e42ac55196240522ef8d22ca67d Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann Signed-off-by: Quentin Perret --- arch/arm64/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a4d945db95a2..70697177d6ec 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -19,6 +19,9 @@ int pcibus_to_node(struct pci_bus *bus); /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_capacity topology_get_freq_scale +/* Replace task scheduler's default max-frequency-invariant accounting */ +#define arch_scale_max_freq_capacity topology_get_max_freq_scale + /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale From 150b7ce48a29566d996bda293e8857ef45298bdc Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:58:04 +0100 Subject: [PATCH 06/15] ANDROID: arm: enable max frequency capping Defines arch_scale_max_freq_capacity() to use the topology driver scale function. 
Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann Signed-off-by: Quentin Perret Change-Id: I79f444399ea3b2948364fde80ccee52a9ece5b9a --- arch/arm/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 8a0fae94d45e..a2edacb56459 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -10,6 +10,9 @@ /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_capacity topology_get_freq_scale +/* Replace task scheduler's default max-frequency-invariant accounting */ +#define arch_scale_max_freq_capacity topology_get_max_freq_scale + /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale From 24c44377976ea7812e321072d6bb366a648f7356 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 07/15] ANDROID: sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. 
Change-Id: I5063befab088fbf49e5d5e484ce0c6ee6165283a Signed-off-by: Ionela Voinescu * Signed-off-by: Dietmar Eggemann (- Fixed cherry-pick issues, and conflict with a0fe2cf086ae "sched/fair: Tune down misfit NOHZ kicks" which makes use of max_cpu_capacity - Squashed "sched/fair: remove printk while schedule is in progress" fix from Caesar Wang ) Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 10 +++++++++- kernel/sched/topology.c | 15 +++------------ 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fee99f8cb193..b4314e918c29 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6195,7 +6195,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return 0; min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; /* Minimum capacity is close to max, no need to abort wake_affine */ if (max_cap - min_cap < max_cap >> 3) @@ -7747,16 +7747,46 @@ static unsigned long scale_rt_capacity(int cpu, unsigned long max) return scale_irq_capacity(free, irq, max); } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) { + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= arch_scale_max_freq_capacity(sd, cpu); capacity >>= SCHED_CAPACITY_SHIFT; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n", + cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity = scale_rt_capacity(cpu, capacity); if (!capacity) @@ -7861,7 +7891,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) { return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity.val || check_cpu_capacity(rq, sd)); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f0b6e3ab2c96..9015c687de19 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -717,6 +717,12 @@ struct perf_domain { struct rcu_head rcu; }; +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* Scheduling group status flags */ #define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ #define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ @@ -775,7 +781,8 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; + /* Maximum cpu capacity in the system. 
*/ + struct max_cpu_capacity max_cpu_capacity; /* * NULL-terminated list of performance domains intersecting with the @@ -785,6 +792,7 @@ struct root_domain { }; extern void init_defrootdomain(void); +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b5667a273bf6..4ad1e5194555 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -510,6 +510,9 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_cpudl; + + init_max_cpu_capacity(&rd->max_cpu_capacity); + return 0; free_cpudl: @@ -1951,7 +1954,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; bool has_asym = false; @@ -2014,13 +2016,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); - cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); @@ -2028,11 +2024,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_enable_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); From 65b8ddef4ef06f78b29207ef5ec508fb58ad807f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 08/15] ANDROID: sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions resulting the nr_balance_failed counter to be incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. 
Change-Id: I28f69c72febe0211decbe77b7bc3e48839d3d7b3 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b4314e918c29..df279e422001 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7112,6 +7112,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -8294,6 +8295,8 @@ next_group: if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { struct root_domain *rd = env->dst_rq->rd; @@ -8988,7 +8991,8 @@ more_balance: * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; From f351885fc72a3c13e5a713fcc37097f7c2eceee8 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 1 Jun 2018 20:34:10 +0100 Subject: [PATCH 09/15] ANDROID: sched/fair: Attempt to improve throughput for asym cap systems In some systems the capacity and group weights line up to defeat all the small imbalance correction conditions in fix_small_imbalance, which can cause bad task placement. Add a new condition if the existing code can't see anything to fix: If we have asymmetric capacity, and there are more tasks than CPUs in the busiest group *and* there are less tasks than CPUs in the local group then we try to pull something. There could be transient small tasks which prevent this from working, but on the whole it is beneficial for those systems with inconvenient capacity/cluster size relationships. Change-Id: Icf81cde215c082a61f816534b7990ccb70aee409 Signed-off-by: Chris Redpath Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df279e422001..1962e68fcf60 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8425,7 +8425,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (capa_move > capa_now) + if (capa_move > capa_now) { + env->imbalance = busiest->load_per_task; + return; + } + + /* We can't see throughput improvement with the load-based + * method, but it is possible depending upon group size and + * capacity range that there might still be an underutilized + * cpu available in an asymmetric capacity system. Do one last + * check just in case. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + busiest->group_type == group_overloaded && + busiest->sum_nr_running > busiest->group_weight && + local->sum_nr_running < local->group_weight && + local->group_capacity < busiest->group_capacity) env->imbalance = busiest->load_per_task; } From a7455f8123479ddcad3ff7b366ce143cb78a9ae3 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 5 Jun 2018 11:47:57 +0100 Subject: [PATCH 10/15] ANDROID: sched/fair: Don't balance misfits if it would overload local group When load balancing in a system with misfit tasks present, if we always pull a misfit task to the local group this can lead to pulling a running task from a smaller capacity CPUs to a bigger CPU which is busy. 
In this situation, the pulled task is likely not to get a chance to run before an idle balance on another small CPU pulls it back. This penalises the pulled task as it is stopped for a short amount of time and then likely relocated to a different CPU (since the original CPU just did a NEWLY_IDLE balance and reset the periodic interval). If we only do this unconditionally for NEWLY_IDLE balance, we can be sure that any tasks and load which are present on the local group are related to short-running tasks which we are happy to displace for a longer running task in a system with misfit tasks present. However, other balance types should only pull a task if we think that the local group is underutilized - checking the number of tasks gives us a conservative estimate here since if they were short tasks we would have been doing NEWLY_IDLE balances instead. Change-Id: I710add1ab1139482620b6addc8370ad194791beb Signed-off-by: Chris Redpath Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1962e68fcf60..af23c63eca32 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8509,8 +8509,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { + /* Boost imbalance to allow misfit task to be balanced. + * Always do this if we are doing a NEWLY_IDLE balance + * on the assumption that any tasks we have must not be + * long-running (and hence we cannot rely upon load). + * However if we are not idle, we should assume the tasks + * we have are longer running and not override load-based + * calculations above unless we are sure that the local + * group is underutilized. + */ + if (busiest->group_type == group_misfit_task && + (env->idle == CPU_NEWLY_IDLE || + local->sum_nr_running < local->group_weight)) { env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task_load); } From 44ab1611a8fa4ee1dc59a80b656f8b2e6aa4e57d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 5 Jun 2018 12:21:33 +0100 Subject: [PATCH 11/15] ANDROID: sched/fair: Also do misfit in overloaded groups If we can classify the group as overloaded, that overrides any classification as misfit but we may still have misfit tasks present. Check the rq we're looking at to see if this is the case. Change-Id: Ida8eb66aa625e34de3fe2ee1b0dd8a78926273d8 Signed-off-by: Chris Redpath [Removed stray reference to rq_has_misfit] Signed-off-by: Valentin Schneider Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af23c63eca32..4012266f4f64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8798,6 +8798,9 @@ static int need_active_balance(struct lb_env *env) if (voluntary_active_balance(env)) return 1; + if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 588085feb44fa553aec7eb98545a570e189eabad Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 3 Jul 2019 10:48:14 +0100 Subject: [PATCH 12/15] ANDROID: arm64: defconfig: Enable EAS by default Use schedutil as default cpufreq governor so EAS can start. 
Also, enable util-clamp to enable frequency selection biasing. Change-Id: Iec9098f27c0353dabc23bd98efbca6479de41796 Signed-off-by: Quentin Perret --- arch/arm64/configs/defconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 8e05c39eab08..2c029af5d7e0 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -13,10 +13,12 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_UCLAMP_TASK=y CONFIG_NUMA_BALANCING=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y +CONFIG_UCLAMP_TASK_GROUP=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y @@ -71,10 +73,12 @@ CONFIG_COMPAT=y CONFIG_RANDOMIZE_BASE=y CONFIG_HIBERNATION=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_ENERGY_MODEL=y CONFIG_ARM_CPUIDLE=y CONFIG_ARM_PSCI_CPUIDLE=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y CONFIG_CPU_FREQ_GOV_POWERSAVE=m CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y From c28f9d3945f10f655983cd4fb0e71d972efa747b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 30 Jul 2019 13:54:00 +0100 Subject: [PATCH 13/15] ANDROID: sched/core: Add a latency-sensitive flag to uclamp Add a 'latency_sensitive' flag to uclamp in order to express the need for some tasks to find a CPU where they can wake-up quickly. This is not expected to be used without cgroup support, so add solely a cgroup interface for it. As this flag represents a boolean attribute and not an amount of resources to be shared, it is not clear what the delegation logic should be. As such, it is kept simple: every new cgroup starts with latency_sensitive set to false, regardless of the parent. In essence, this is similar to SchedTune's prefer-idle flag which was used in android-4.19 and prior. 
Change-Id: I722d8ecabb428bb7b95a5b54bc70a87f182dde2a Signed-off-by: Quentin Perret --- kernel/sched/core.c | 33 +++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7880f4f64d0e..7571fc49af89 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7356,6 +7356,27 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) cpu_uclamp_print(sf, UCLAMP_MAX); return 0; } + +static int cpu_uclamp_ls_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 ls) +{ + struct task_group *tg; + + if (ls > 1) + return -EINVAL; + tg = css_tg(css); + tg->latency_sensitive = (unsigned int) ls; + + return 0; +} + +static u64 cpu_uclamp_ls_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return (u64) tg->latency_sensitive; +} #endif /* CONFIG_UCLAMP_TASK_GROUP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7716,6 +7737,12 @@ static struct cftype cpu_legacy_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, + { + .name = "uclamp.latency_sensitive", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_uclamp_ls_read_u64, + .write_u64 = cpu_uclamp_ls_write_u64, + }, #endif { } /* Terminate */ }; @@ -7897,6 +7924,12 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, + { + .name = "uclamp.latency_sensitive", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_uclamp_ls_read_u64, + .write_u64 = cpu_uclamp_ls_write_u64, + }, #endif { } /* terminate */ }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9015c687de19..7de9f16cb838 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -399,6 +399,8 @@ struct task_group { struct uclamp_se uclamp_req[UCLAMP_CNT]; /* Effective clamp values used for a task group */ struct uclamp_se uclamp[UCLAMP_CNT]; + /* Latency-sensitive flag used for a task group */ + unsigned int latency_sensitive; #endif }; From 0e00b6f9dd78f16d214959bb4b1af7ec4e62f9e4 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 30 Jul 2019 13:58:29 +0100 Subject: [PATCH 14/15] ANDROID: sched: Introduce uclamp latency and boost wrapper Introduce a simple helper to read the latency_sensitive flag from a task. It is called uclamp_latency_sensitive() to match the API proposed by Patrick. While at it, introduce uclamp_boosted() which returns true only when a task has a non-null min-clamp. 
Change-Id: I5fc747da8b58625257a6604a3c88487b657fbe7a Suggested-by: Patrick Bellasi Signed-off-by: Quentin Perret --- kernel/sched/sched.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7de9f16cb838..4f0d1668c283 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2333,6 +2333,11 @@ static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) { return uclamp_util_with(rq, util, NULL); } + +static inline bool uclamp_boosted(struct task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} #else /* CONFIG_UCLAMP_TASK */ static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, struct task_struct *p) @@ -2343,8 +2348,31 @@ static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) { return util; } +static inline bool uclamp_boosted(struct task_struct *p) +{ + return false; +} #endif /* CONFIG_UCLAMP_TASK */ +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool uclamp_latency_sensitive(struct task_struct *p) +{ + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return false; + tg = container_of(css, struct task_group, css); + + return tg->latency_sensitive; +} +#else +static inline bool uclamp_latency_sensitive(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true From 760b82c9b88d2c8125abfc5f732cc3cd460b2a54 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 27 Feb 2019 11:21:24 +0000 Subject: [PATCH 15/15] ANDROID: sched/fair: Bias EAS placement for latency Add to find_energy_efficient_cpu() a latency-sensitive case which mimics what was done for prefer-idle in android-4.19 and before (see [1] for reference). This isn't strictly equivalent to the legacy algorithm but comes real close, and isn't very invasive. Overall, the idea is to select the biggest idle CPU we can find for latency-sensitive boosted tasks, and the smallest CPU where they can fit for latency-sensitive non-boosted tasks. The main differences with the legacy behaviour are the following: 1. the policy for 'prefer idle' when there isn't a single idle CPU in the system is simpler now. We just pick the CPU with the highest spare capacity; 2. the cstate awareness is implemented by minimizing the exit latency rather than the idle state index. This is how it is done in the slow path (find_idlest_group_cpu()), it doesn't require us to keep hooks into CPUIdle, and should actually be better because what we want is a CPU that can wake up quickly; 3. non-latency-sensitive tasks just use the standard mainline energy-aware wake-up path, which decides the placement using the Energy Model; 4. the 'boosted' and 'latency_sensitive' attributes of a task come from util_clamp (which now replaces schedtune).
[1] https://android.googlesource.com/kernel/common.git/+/c27c56105dcaaae54ecc39ef33fbfac87a1486fc Change-Id: Ia58516906e9cb5abe08385a8cd088097043d8703 Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4012266f4f64..9b43520dc248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6340,8 +6340,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1; + unsigned long max_spare_cap_ls = 0, target_cap; unsigned long cpu_cap, util, base_energy = 0; + bool boosted, latency_sensitive = false; + unsigned int min_exit_lat = UINT_MAX; int cpu, best_energy_cpu = prev_cpu; + struct cpuidle_state *idle; struct sched_domain *sd; struct perf_domain *pd; @@ -6370,6 +6375,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy if (!task_util_est(p)) goto unlock; + latency_sensitive = uclamp_latency_sensitive(p); + boosted = uclamp_boosted(p); + target_cap = boosted ? 0 : ULONG_MAX; + for (; pd; pd = pd->next) { unsigned long cur_delta, spare_cap, max_spare_cap = 0; unsigned long base_energy_pd; @@ -6394,7 +6403,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy continue; /* Always use prev_cpu as a candidate. */ - if (cpu == prev_cpu) { + if (!latency_sensitive && cpu == prev_cpu) { prev_delta = compute_energy(p, prev_cpu, pd); prev_delta -= base_energy_pd; best_delta = min(best_delta, prev_delta); @@ -6409,10 +6418,34 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy max_spare_cap = spare_cap; max_spare_cap_cpu = cpu; } + + if (!latency_sensitive) + continue; + + if (idle_cpu(cpu)) { + cpu_cap = capacity_orig_of(cpu); + if (boosted && cpu_cap < target_cap) + continue; + if (!boosted && cpu_cap > target_cap) + continue; + idle = idle_get_state(cpu_rq(cpu)); + if (idle && idle->exit_latency > min_exit_lat && + cpu_cap == target_cap) + continue; + + if (idle) + min_exit_lat = idle->exit_latency; + target_cap = cpu_cap; + best_idle_cpu = cpu; + } else if (spare_cap > max_spare_cap_ls) { + max_spare_cap_ls = spare_cap; + max_spare_cap_cpu_ls = cpu; + } } /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { + if (!latency_sensitive && max_spare_cap_cpu >= 0 && + max_spare_cap_cpu != prev_cpu) { cur_delta = compute_energy(p, max_spare_cap_cpu, pd); cur_delta -= base_energy_pd; if (cur_delta < best_delta) { @@ -6424,6 +6457,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy unlock: rcu_read_unlock(); + if (latency_sensitive) + return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls; + /* * Pick the best CPU if prev_cpu cannot be used, or if it saves at * least 6% of the energy used by prev_cpu.