// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2016-2021, The Linux Foundation. All rights reserved.
 * Copyright (c) 2022-2024, Qualcomm Innovation Center, Inc. All rights reserved.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "walt.h"
#include "trace.h"
#include

bool enable_logging;
bool trail_active;
bool sustain_active;

const char *task_event_names[] = {
	"PUT_PREV_TASK",
	"PICK_NEXT_TASK",
	"TASK_WAKE",
	"TASK_MIGRATE",
	"TASK_UPDATE",
	"IRQ_UPDATE"
};

const char *migrate_type_names[] = {
	"GROUP_TO_RQ",
	"RQ_TO_GROUP",
	"RQ_TO_RQ",
	"GROUP_TO_GROUP"
};

#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
#define SCHED_ACCOUNT_WAIT_TIME 1

#define EARLY_DETECTION_DURATION 9500000
#define MAX_NUM_CGROUP_COLOC_ID 20
#define NEW_TASK_ACTIVE_TIME 100000000

cpumask_t walt_cpus_taken_mask = { CPU_BITS_NONE };
DEFINE_SPINLOCK(cpus_taken_lock);
DEFINE_PER_CPU(int, cpus_taken_refcount);

DEFINE_PER_CPU(struct walt_rq, walt_rq);

unsigned int sysctl_sched_user_hint;

static u64 sched_clock_last;
static bool walt_clock_suspended;

DECLARE_COMPLETION(walt_get_cycle_counts_cb_completion);
bool use_cycle_counter;
u64 (*walt_get_cycle_counts_cb)(int cpu, u64 wc);

static u64 walt_load_reported_window;

struct irq_work walt_cpufreq_irq_work;
struct irq_work walt_migration_irq_work;

unsigned int walt_rotation_enabled;

unsigned int __read_mostly sched_ravg_window = 20000000;
int min_possible_cluster_id;
int max_possible_cluster_id;

/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sched_init_task_load_windows;

/*
 * Task load is categorized into buckets for the purpose of top task tracking.
 * The entire range of load from 0 to sched_ravg_window needs to be covered
 * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket
 * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value
 * of sched_ravg_window is DEFAULT_SCHED_RAVG_WINDOW, use that to compute
 * sched_load_granule.
 */
unsigned int __read_mostly sched_load_granule;

/* frequent yielder tracking */
static unsigned int total_yield_cnt;
static unsigned int total_sleep_cnt;
static u64 yield_counting_window_ts;

bool walt_is_idle_task(struct task_struct *p)
{
	return walt_flag_test(p, WALT_IDLE_TASK_BIT);
}

u64 walt_sched_clock(void)
{
	if (unlikely(walt_clock_suspended))
		return sched_clock_last;
	return sched_clock();
}

static void walt_resume(void)
{
	walt_clock_suspended = false;
}

static int walt_suspend(void)
{
	sched_clock_last = sched_clock();
	walt_clock_suspended = true;
	return 0;
}

static struct syscore_ops walt_syscore_ops = {
	.resume = walt_resume,
	.suspend = walt_suspend
};

/*
 * @boost: boost level; must be 0 (clears the boost), 1 or 2.
 * @period: boost duration, in milliseconds.
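 *
 * For illustration (derived from the function body below, not a separately
 * documented contract): set_task_boost(1, 100) marks the calling task as
 * boosted for the next 100ms - the period is converted to nanoseconds and
 * added to walt_sched_clock() to form boost_expires - while
 * set_task_boost(0, 0) clears any existing boost.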
*/ int set_task_boost(int boost, u64 period) { struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1; if (unlikely(walt_disabled)) return -EAGAIN; if (boost < TASK_BOOST_NONE || boost >= TASK_BOOST_END) return -EINVAL; if (boost) { wts->boost = boost; wts->boost_period = (u64)period * 1000 * 1000; wts->boost_expires = walt_sched_clock() + wts->boost_period; } else { wts->boost = 0; wts->boost_expires = 0; wts->boost_period = 0; } return 0; } EXPORT_SYMBOL_GPL(set_task_boost); static inline void acquire_rq_locks_irqsave(const cpumask_t *cpus, unsigned long *flags) { int cpu; int level; local_irq_save(*flags); level = 0; for_each_cpu(cpu, cpus) { if (level == 0) raw_spin_lock(&cpu_rq(cpu)->__lock); else raw_spin_lock_nested(&cpu_rq(cpu)->__lock, level); level++; } } static inline void release_rq_locks_irqrestore(const cpumask_t *cpus, unsigned long *flags) { int cpu; for_each_cpu(cpu, cpus) raw_spin_unlock(&cpu_rq(cpu)->__lock); local_irq_restore(*flags); } static inline u64 walt_rq_clock(struct rq *rq) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); if (unlikely(walt_clock_suspended)) return sched_clock_last; walt_lockdep_assert_rq(rq, NULL); if (!(rq->clock_update_flags & RQCF_UPDATED)) update_rq_clock(rq); return max(rq_clock(rq), wrq->latest_clock); } static unsigned int walt_cpu_high_irqload; static __read_mostly unsigned int sched_io_is_busy = 1; /* Window size (in ns) */ static __read_mostly unsigned int new_sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW; static DEFINE_SPINLOCK(sched_ravg_window_lock); static u64 sched_ravg_window_change_time; static unsigned int __read_mostly sched_init_task_load_windows_scaled; /* Size of bitmaps maintained to track top tasks */ static const unsigned int top_tasks_bitmap_size = BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); __read_mostly unsigned int walt_scale_demand_divisor; #define SCHED_PRINT(arg) printk_deferred("%s=%llu", #arg, (unsigned long long)arg) #define STRG(arg) #arg void walt_task_dump(struct task_struct *p) { char buff[WALT_NR_CPUS * 16]; int i, j = 0; int buffsz = WALT_NR_CPUS * 16; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; bool is_32bit_thread = is_compat_thread(task_thread_info(p)); printk_deferred("Task: %.16s-%d\n", p->comm, p->pid); SCHED_PRINT(READ_ONCE(p->__state)); SCHED_PRINT(task_thread_info(p)->cpu); SCHED_PRINT(p->policy); SCHED_PRINT(p->prio); SCHED_PRINT(wts->mark_start); SCHED_PRINT(wts->demand); SCHED_PRINT(wts->coloc_demand); SCHED_PRINT(wts->enqueue_after_migration); SCHED_PRINT(wts->prev_cpu); SCHED_PRINT(wts->new_cpu); SCHED_PRINT(wts->misfit); SCHED_PRINT(wts->prev_on_rq); SCHED_PRINT(wts->prev_on_rq_cpu); SCHED_PRINT(wts->mvp_prio); SCHED_PRINT(wts->iowaited); SCHED_PRINT(sched_ravg_window); SCHED_PRINT(new_sched_ravg_window); for (i = 0 ; i < nr_cpu_ids; i++) j += scnprintf(buff + j, buffsz - j, "%u ", wts->curr_window_cpu[i]); printk_deferred("%s=%u (%s)\n", STRG(wts->curr_window), wts->curr_window, buff); for (i = 0, j = 0 ; i < nr_cpu_ids; i++) j += scnprintf(buff + j, buffsz - j, "%u ", wts->prev_window_cpu[i]); printk_deferred("%s=%u (%s)\n", STRG(wts->prev_window), wts->prev_window, buff); SCHED_PRINT(wts->last_sleep_ts); SCHED_PRINT(wts->last_wake_ts); SCHED_PRINT(wts->last_enqueued_ts); SCHED_PRINT(wts->mark_start_birth_ts); SCHED_PRINT(wts->misfit); SCHED_PRINT(wts->unfilter); SCHED_PRINT(is_32bit_thread); SCHED_PRINT(wts->grp); SCHED_PRINT(p->on_cpu); SCHED_PRINT(p->on_rq); } void walt_rq_dump(int cpu) { 
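	/*
	 * Debug helper: dump this CPU's WALT window counters, load
	 * subtraction slots and the WALT state of the currently running
	 * task. Invoked from walt_dump() on the panic/debug path, hence
	 * the printk_deferred() output.
	 */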
struct rq *rq = cpu_rq(cpu); struct task_struct *tsk = cpu_curr(cpu); int i; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); /* * Increment the task reference so that it can't be * freed on a remote CPU. Since we are going to * enter panic, there is no need to decrement the * task reference. Decrementing the task reference * can't be done in atomic context, especially with * rq locks held. */ get_task_struct(tsk); printk_deferred("CPU:%d nr_running:%u current: %d (%s)\n", cpu, rq->nr_running, tsk->pid, tsk->comm); printk_deferred("=========================================="); SCHED_PRINT(wrq->latest_clock); SCHED_PRINT(wrq->window_start); SCHED_PRINT(wrq->prev_window_size); SCHED_PRINT(wrq->curr_runnable_sum); SCHED_PRINT(wrq->prev_runnable_sum); SCHED_PRINT(wrq->nt_curr_runnable_sum); SCHED_PRINT(wrq->nt_prev_runnable_sum); SCHED_PRINT(wrq->task_exec_scale); SCHED_PRINT(wrq->grp_time.curr_runnable_sum); SCHED_PRINT(wrq->grp_time.prev_runnable_sum); SCHED_PRINT(wrq->grp_time.nt_curr_runnable_sum); SCHED_PRINT(wrq->grp_time.nt_prev_runnable_sum); for (i = 0 ; i < NUM_TRACKED_WINDOWS; i++) { printk_deferred("wrq->load_subs[%d].window_start=%llu)\n", i, wrq->load_subs[i].window_start); printk_deferred("wrq->load_subs[%d].subs=%llu)\n", i, wrq->load_subs[i].subs); printk_deferred("wrq->load_subs[%d].new_subs=%llu)\n", i, wrq->load_subs[i].new_subs); } walt_task_dump(tsk); SCHED_PRINT(sched_capacity_margin_up[cpu]); SCHED_PRINT(sched_capacity_margin_down[cpu]); } void walt_dump(void) { int cpu; printk_deferred("============ WALT RQ DUMP START ==============\n"); printk_deferred("Sched clock: %llu\n", walt_sched_clock()); printk_deferred("Time last window changed=%llu\n", sched_ravg_window_change_time); printk_deferred("global_ws=%llu\n", atomic64_read(&walt_irq_work_lastq_ws)); for_each_online_cpu(cpu) walt_rq_dump(cpu); SCHED_PRINT(max_possible_cluster_id); printk_deferred("============ WALT RQ DUMP END ==============\n"); } int in_sched_bug; static inline void fixup_cumulative_runnable_avg(struct rq *rq, struct task_struct *p, struct walt_sched_stats *stats, s64 demand_scaled_delta, s64 pred_demand_scaled_delta) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; s64 cumulative_runnable_avg_scaled = stats->cumulative_runnable_avg_scaled + demand_scaled_delta; s64 pred_demands_sum_scaled = stats->pred_demands_sum_scaled + pred_demand_scaled_delta; walt_lockdep_assert_rq(rq, p); if (task_rq(p) != rq) WALT_BUG(WALT_BUG_UPSTREAM, p, "on CPU %d task %s(%d) not on rq %d", raw_smp_processor_id(), p->comm, p->pid, rq->cpu); if (cumulative_runnable_avg_scaled < 0) { WALT_BUG(WALT_BUG_WALT, p, "on CPU %d task ds=%hu is higher than cra=%llu\n", raw_smp_processor_id(), wts->demand_scaled, stats->cumulative_runnable_avg_scaled); cumulative_runnable_avg_scaled = 0; } stats->cumulative_runnable_avg_scaled = (u64)cumulative_runnable_avg_scaled; if (pred_demands_sum_scaled < 0) { WALT_BUG(WALT_BUG_WALT, p, "on CPU %d task pds=%hu is higher than pds_sum=%llu\n", raw_smp_processor_id(), wts->pred_demand_scaled, stats->pred_demands_sum_scaled); pred_demands_sum_scaled = 0; } stats->pred_demands_sum_scaled = (u64)pred_demands_sum_scaled; } static void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, u16 updated_demand_scaled, u16 updated_pred_demand_scaled) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; s64 task_load_delta = (s64)updated_demand_scaled - wts->demand_scaled; s64 pred_demand_delta = 
(s64)updated_pred_demand_scaled - wts->pred_demand_scaled; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); fixup_cumulative_runnable_avg(rq, p, &wrq->walt_stats, task_load_delta, pred_demand_delta); } static void rollover_cpu_window(struct rq *rq, bool full_window); static void rollover_top_tasks(struct rq *rq, bool full_window); /* * Demand aggregation for frequency purpose: * * CPU demand of tasks from various related groups is aggregated per-cluster and * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined * by just wrq->prev_runnable_sum. * * Some examples follow, which assume: * Cluster0 = CPU0-3, Cluster1 = CPU4-7 * One related thread group A that has tasks A0, A1, A2 * * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of * tasks belonging to group A are accumulated when they run on cpu X. * * CX->curr/prev_sum = counters in which cpu execution stats of all tasks * not belonging to group A are accumulated when they run on cpu X * * Lets say the stats for window M was as below: * * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms * Task A0 ran 5ms on CPU0 * Task B0 ran 1ms on CPU0 * * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms * Task A1 ran 4ms on CPU1 * Task A2 ran 2ms on CPU1 * Task B1 ran 5ms on CPU1 * * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 * CPU2 idle * * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 * CPU3 idle * * In this case, CPU1 was most busy going by just its prev_sum counter. Demand * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy * time reported to governor will be: * * * C0 busy time = 1ms * C1 busy time = 5 + 5 + 6 = 16ms * */ __read_mostly bool sched_freq_aggr_en; static u64 update_window_start(struct rq *rq, u64 wallclock, int event) { s64 delta; int nr_windows; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_sched_cluster *cluster = cpu_cluster(task_cpu(current)); struct smart_freq_cluster_info *smart_freq_info = cluster->smart_freq_info; u64 old_window_start = wrq->window_start; bool full_window; if (wallclock < wrq->latest_clock) { WALT_BUG(WALT_BUG_WALT, NULL, "on CPU%d; wallclock=%llu(0x%llx) is lesser than latest_clock=%llu(0x%llx)", rq->cpu, wallclock, wallclock, wrq->latest_clock, wrq->latest_clock); wallclock = wrq->latest_clock; } delta = wallclock - wrq->window_start; if (delta < 0) { WALT_BUG(WALT_BUG_WALT, NULL, " on CPU%d; wallclock=%llu(0x%llx) is lesser than window_start=%llu(0x%llx)", rq->cpu, wallclock, wallclock, wrq->window_start, wrq->window_start); delta = 0; wallclock = max(wallclock, wrq->window_start); } wrq->latest_clock = wallclock; if (delta < sched_ravg_window) return old_window_start; nr_windows = div64_u64(delta, sched_ravg_window); wrq->window_start += (u64)nr_windows * (u64)sched_ravg_window; wrq->prev_window_size = sched_ravg_window; full_window = nr_windows > 1; rollover_cpu_window(rq, full_window); rollover_top_tasks(rq, full_window); /* Update yielder statistics */ if (cpu_of(rq) == 0) { u64 delta = wallclock - yield_counting_window_ts; /* window boundary crossed */ if (delta > YIELD_WINDOW_SIZE_NSEC) { unsigned int target_threshold_wake = MAX_YIELD_CNT_GLOBAL_THR; unsigned int target_threshold_sleep = MAX_YIELD_SLEEP_CNT_GLOBAL_THR; /* * if update_window_start comes more than * YIELD_GRACE_PERIOD_NSEC after the YIELD_WINDOW_SIZE_NSEC then * extrapolate the threasholds based on delta time. 
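 *
 * Rough example with illustrative numbers (the real values come from the
 * YIELD_* macros): with a 100ms counting window, a wake threshold of 100
 * yields and an update arriving 150ms after yield_counting_window_ts
 * (i.e. past the grace period), the threshold is scaled to
 * 150 * 100 / 100 = 150 yields, so a late update does not make the window
 * easier to classify as frequently yielding.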
*/ if (unlikely(delta > YIELD_WINDOW_SIZE_NSEC + YIELD_GRACE_PERIOD_NSEC)) { target_threshold_wake = div64_u64(delta * MAX_YIELD_CNT_GLOBAL_THR, YIELD_WINDOW_SIZE_NSEC); target_threshold_sleep = div64_u64(delta * MAX_YIELD_SLEEP_CNT_GLOBAL_THR, YIELD_WINDOW_SIZE_NSEC); } if ((total_yield_cnt >= target_threshold_wake) || (total_sleep_cnt >= target_threshold_sleep / 2)) { if (contiguous_yielding_windows < MIN_CONTIGUOUS_YIELDING_WINDOW) contiguous_yielding_windows++; } else { contiguous_yielding_windows = 0; } trace_sched_yielder(wallclock, yield_counting_window_ts, contiguous_yielding_windows, total_yield_cnt, target_threshold_wake, total_sleep_cnt, target_threshold_sleep, smart_freq_info->cluster_active_reason); yield_counting_window_ts = wallclock; total_yield_cnt = 0; total_sleep_cnt = 0; } } return old_window_start; } /* * Assumes rq_lock is held and wallclock was recorded in the same critical * section as this function's invocation. */ static inline u64 read_cycle_counter(int cpu, u64 wallclock) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu); if (wrq->last_cc_update != wallclock) { wrq->cycles = walt_get_cycle_counts_cb(cpu, wallclock); wrq->last_cc_update = wallclock; } return wrq->cycles; } static void update_task_cpu_cycles(struct task_struct *p, int cpu, u64 wallclock) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; if (use_cycle_counter) wts->cpu_cycles = read_cycle_counter(cpu, wallclock); } static inline bool is_ed_enabled(void) { return (boost_policy != SCHED_BOOST_NONE); } static inline bool is_ed_task(struct task_struct *p, u64 wallclock) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; return (wallclock - wts->last_wake_ts >= EARLY_DETECTION_DURATION); } static bool is_ed_task_present(struct rq *rq, u64 wallclock, struct task_struct *deq_task) { struct task_struct *p; int loop_max = 10; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); wrq->ed_task = NULL; if (!is_ed_enabled() || !rq->cfs.h_nr_running) return false; list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { if (!loop_max) break; if (p == deq_task) continue; if (is_ed_task(p, wallclock)) { wrq->ed_task = p; return true; } loop_max--; } return false; } static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime); /* * Return total number of tasks "eligible" to run on higher capacity cpus */ unsigned int walt_big_tasks(int cpu) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu); return wrq->walt_stats.nr_big_tasks; } int walt_trailblazer_tasks(int cpu) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu); return wrq->walt_stats.nr_trailblazer_tasks; } bool trailblazer_on_prime(void) { int cpu; for_each_cpu(cpu, &cpu_array[0][num_sched_clusters - 1]) { if (walt_trailblazer_tasks(cpu)) return true; } return false; } static void clear_walt_request(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); clear_reserved(cpu); if (wrq->push_task) { struct task_struct *push_task = NULL; raw_spin_lock_irqsave(&rq->__lock, flags); if (wrq->push_task) { clear_reserved(rq->push_cpu); push_task = wrq->push_task; wrq->push_task = NULL; } rq->active_balance = 0; raw_spin_unlock_irqrestore(&rq->__lock, flags); if (push_task) put_task_struct(push_task); } } /* * Special case the last index and provide a fast path for index = 0. 
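 *
 * The mapping implemented below: index 0 reports either no load or one
 * sched_load_granule depending on the MSB of the prev top-tasks bitmap,
 * the last index reports a full sched_ravg_window, and any other index i
 * reports (i + 1) * sched_load_granule.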
* Note that sched_load_granule can change underneath us if we are not * holding any runqueue locks while calling the two functions below. */ static u32 top_task_load(struct rq *rq) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); int index = wrq->prev_top; u8 prev = 1 - wrq->curr_table; if (!index) { int msb = NUM_LOAD_INDICES - 1; if (!test_bit(msb, wrq->top_tasks_bitmap[prev])) return 0; else return sched_load_granule; } else if (index == NUM_LOAD_INDICES - 1) { return sched_ravg_window; } else { return (index + 1) * sched_load_granule; } } unsigned long sched_user_hint_reset_time; static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster); static inline bool should_apply_suh_freq_boost(struct walt_sched_cluster *cluster) { if (sched_freq_aggr_en || !sysctl_sched_user_hint || !cluster->aggr_grp_load) return false; return is_cluster_hosting_top_app(cluster); } static inline u64 freq_policy_load(struct rq *rq, unsigned int *reason, bool trace) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_sched_cluster *cluster = wrq->cluster; u64 aggr_grp_load = cluster->aggr_grp_load; u64 load, tt_load = 0, kload = 0; struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu_of(rq)); if (sched_freq_aggr_en) { load = wrq->prev_runnable_sum + aggr_grp_load; *reason = CPUFREQ_REASON_FREQ_AGR_BIT; } else { load = wrq->prev_runnable_sum + wrq->grp_time.prev_runnable_sum; } if (cpu_ksoftirqd && READ_ONCE(cpu_ksoftirqd->__state) == TASK_RUNNING) { kload = task_load(cpu_ksoftirqd); if (kload > load) { load = kload; *reason = CPUFREQ_REASON_KSOFTIRQD_BIT; } } tt_load = top_task_load(rq); if (tt_load > load) { load = tt_load; *reason = CPUFREQ_REASON_TT_LOAD_BIT; } if (should_apply_suh_freq_boost(cluster)) { if (is_suh_max()) load = sched_ravg_window; else load = div64_u64(load * sysctl_sched_user_hint, (u64)100); *reason = CPUFREQ_REASON_SUH_BIT; } if (wrq->ed_task) { load = mult_frac(load, 100 + sysctl_ed_boost_pct, 100); *reason = CPUFREQ_REASON_EARLY_DET_BIT; } if (wrq->lrb_pipeline_start_time) { load = mult_frac(load, 100 + sysctl_pipeline_busy_boost_pct, 100); *reason = CPUFREQ_REASON_PIPELINE_BUSY_BIT; } if (walt_rotation_enabled) { load = sched_ravg_window; *reason = CPUFREQ_REASON_BTR_BIT; } if (walt_trailblazer_tasks(cpu_of(rq)) && walt_feat(WALT_FEAT_TRAILBLAZER_BIT)) { load = sched_ravg_window; *reason = CPUFREQ_REASON_TRAILBLAZER_CPU_BIT; } if (trace) trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, sched_freq_aggr_en, load, 0, walt_rotation_enabled, sysctl_sched_user_hint, wrq, *reason); return load; } static bool rtgb_active; static inline unsigned long __cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load, unsigned int *reason, bool trace) { u64 util; struct rq *rq = cpu_rq(cpu); unsigned long capacity = capacity_orig_of(cpu); struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); util = scale_time_to_util(freq_policy_load(rq, reason, trace)); /* * util is on a scale of 0 to 1024. 
this is the utilization * of the cpu in the last window */ wrq->util = util; if (walt_load) { u64 nl = wrq->nt_prev_runnable_sum + wrq->grp_time.nt_prev_runnable_sum; u64 pl = wrq->walt_stats.pred_demands_sum_scaled; wrq->old_busy_time = util; wrq->old_estimated_time = pl; nl = scale_time_to_util(nl); walt_load->nl = nl; walt_load->pl = pl; walt_load->ws = walt_load_reported_window; walt_load->rtgb_active = rtgb_active; if (wrq->ed_task) walt_load->ed_active = true; else walt_load->ed_active = false; walt_load->trailblazer_state = trailblazer_state; } return (util >= capacity) ? capacity : util; } #define PIPELINE_SYNC_VAL(first, second, x) \ (max(first, mult_frac(second, x, 100))) int other_sync_pct(unsigned long util_other) { int pct; if (sched_ravg_window >= SCHED_RAVG_16MS_WINDOW) { if (util_other <= load_sync_util_thres_60fps[num_sched_clusters - 2][num_sched_clusters - 1]) pct = load_sync_low_pct_60fps[num_sched_clusters - 2][num_sched_clusters - 1]; else pct = load_sync_high_pct_60fps[num_sched_clusters - 2][num_sched_clusters - 1]; return pct; } if (util_other <= load_sync_util_thres[num_sched_clusters - 2][num_sched_clusters - 1]) pct = load_sync_low_pct[num_sched_clusters - 2][num_sched_clusters - 1]; else pct = load_sync_high_pct[num_sched_clusters - 2][num_sched_clusters - 1]; return pct; } int prime_sync_pct(unsigned long util_prime) { int pct; if (sched_ravg_window >= SCHED_RAVG_16MS_WINDOW) { if (util_prime <= load_sync_util_thres_60fps[num_sched_clusters - 1][num_sched_clusters - 2]) pct = load_sync_low_pct_60fps[num_sched_clusters - 1][num_sched_clusters - 2]; else pct = load_sync_high_pct_60fps[num_sched_clusters - 1][num_sched_clusters - 2]; return pct; } if (util_prime <= load_sync_util_thres[num_sched_clusters - 1][num_sched_clusters - 2]) pct = load_sync_low_pct[num_sched_clusters - 1][num_sched_clusters - 2]; else pct = load_sync_high_pct[num_sched_clusters - 1][num_sched_clusters - 2]; return pct; } unsigned long cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load, unsigned int *reason) { struct walt_cpu_load wl_other = {0}; struct walt_cpu_load wl_prime = {0}; unsigned long util = 0, util_other = 0, util_prime = 0; unsigned long capacity = capacity_orig_of(cpu); int i, mpct_other, mpct_prime; unsigned long max_nl_other = 0, max_pl_other = 0; unsigned long max_nl_prime = 0, max_pl_prime = 0; util = __cpu_util_freq_walt(cpu, walt_load, reason, true); if (enable_load_sync(cpu)) { for_each_cpu(i, &pipeline_sync_cpus) { if (cpumask_test_cpu(i, &cpu_array[0][num_sched_clusters-1])) { util_prime = max(util_prime, __cpu_util_freq_walt(i, &wl_prime, reason, false)); max_nl_prime = max(max_nl_prime, wl_prime.nl); max_pl_prime = max(max_pl_prime, wl_prime.pl); } else { util_other = max(util_other, __cpu_util_freq_walt(i, &wl_other, reason, false)); max_nl_other = max(max_nl_other, wl_other.nl); max_pl_other = max(max_pl_other, wl_other.pl); } } mpct_other = other_sync_pct(util_other); mpct_prime = prime_sync_pct(util_prime); if (cpumask_test_cpu(cpu, &cpu_array[0][num_sched_clusters-1])) { util = PIPELINE_SYNC_VAL(util_prime, util_other, mpct_other); walt_load->nl = PIPELINE_SYNC_VAL(max_nl_prime, max_nl_other, mpct_other); walt_load->pl = PIPELINE_SYNC_VAL(max_pl_prime, max_pl_other, mpct_other); trace_sched_load_sync_settings(cpu, util_other, util_prime, mpct_other); } else { util = PIPELINE_SYNC_VAL(util_other, util_prime, mpct_prime); walt_load->nl = PIPELINE_SYNC_VAL(max_nl_other, max_nl_prime, mpct_prime); walt_load->pl = PIPELINE_SYNC_VAL(max_pl_other, 
max_pl_prime, mpct_prime); trace_sched_load_sync_settings(cpu, util_other, util_prime, mpct_prime); } } if (!cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) goto finish; if (is_state1()) goto finish; for_each_cpu(i, &asym_cap_sibling_cpus) { if (i != cpu) { util_other = max(util_other, __cpu_util_freq_walt(i, &wl_other, reason, false)); max_nl_other = max(max_nl_other, wl_other.nl); max_pl_other = max(max_pl_other, wl_other.pl); } } util = max(util, util_other); walt_load->nl = max(walt_load->nl, max_nl_other); walt_load->pl = max(walt_load->pl, max_pl_other); finish: return (util >= capacity) ? capacity : util; } /* * In this function we match the accumulated subtractions with the current * and previous windows we are operating with. Ignore any entries where * the window start in the load_subtraction struct does not match either * the curent or the previous window. This could happen whenever CPUs * become idle or busy with interrupts disabled for an extended period. */ static inline void account_load_subtractions(struct rq *rq) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); u64 ws = wrq->window_start; u64 prev_ws = ws - wrq->prev_window_size; struct load_subtractions *ls = wrq->load_subs; int i; for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { if (ls[i].window_start == ws) { wrq->curr_runnable_sum -= ls[i].subs; wrq->nt_curr_runnable_sum -= ls[i].new_subs; } else if (ls[i].window_start == prev_ws) { wrq->prev_runnable_sum -= ls[i].subs; wrq->nt_prev_runnable_sum -= ls[i].new_subs; } ls[i].subs = 0; ls[i].new_subs = 0; } if ((s64)wrq->prev_runnable_sum < 0) { WALT_BUG(WALT_BUG_WALT, NULL, "wrq->prev_runnable_sum=%llu < 0", (s64)wrq->prev_runnable_sum); wrq->prev_runnable_sum = 0; } if ((s64)wrq->curr_runnable_sum < 0) { WALT_BUG(WALT_BUG_WALT, NULL, "wrq->curr_runnable_sum=%llu < 0", (s64)wrq->curr_runnable_sum); wrq->curr_runnable_sum = 0; } if ((s64)wrq->nt_prev_runnable_sum < 0) { WALT_BUG(WALT_BUG_WALT, NULL, "wrq->nt_prev_runnable_sum=%llu < 0", (s64)wrq->nt_prev_runnable_sum); wrq->nt_prev_runnable_sum = 0; } if ((s64)wrq->nt_curr_runnable_sum < 0) { WALT_BUG(WALT_BUG_WALT, NULL, "wrq->nt_curr_runnable_sum=%llu < 0", (s64)wrq->nt_curr_runnable_sum); wrq->nt_curr_runnable_sum = 0; } } static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); wrq->load_subs[index].window_start = ws; wrq->load_subs[index].subs = 0; wrq->load_subs[index].new_subs = 0; } static int get_top_index(unsigned long *bitmap, unsigned long old_top) { int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); if (index == NUM_LOAD_INDICES) return 0; return NUM_LOAD_INDICES - 1 - index; } static bool get_subtraction_index(struct rq *rq, u64 ws) { int i; u64 oldest = ULLONG_MAX; int oldest_index = 0; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { u64 entry_ws = wrq->load_subs[i].window_start; if (ws == entry_ws) return i; if (entry_ws < oldest) { oldest = entry_ws; oldest_index = i; } } create_subtraction_entry(rq, ws, oldest_index); return oldest_index; } static void update_rq_load_subtractions(int index, struct rq *rq, u32 sub_load, bool new_task) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); wrq->load_subs[index].subs += sub_load; if (new_task) wrq->load_subs[index].new_subs += sub_load; } static void update_cluster_load_subtractions(struct task_struct *p, int cpu, u64 ws, bool new_task) { struct walt_sched_cluster *cluster = cpu_cluster(cpu); struct cpumask cluster_cpus = 
cluster->cpus; struct walt_rq *wrq = &per_cpu(walt_rq, cpu); u64 prev_ws = ws - wrq->prev_window_size; int i; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; cpumask_clear_cpu(cpu, &cluster_cpus); raw_spin_lock(&cluster->load_lock); for_each_cpu(i, &cluster_cpus) { struct rq *rq = cpu_rq(i); int index; if (wts->curr_window_cpu[i]) { index = get_subtraction_index(rq, ws); update_rq_load_subtractions(index, rq, wts->curr_window_cpu[i], new_task); wts->curr_window_cpu[i] = 0; } if (wts->prev_window_cpu[i]) { index = get_subtraction_index(rq, prev_ws); update_rq_load_subtractions(index, rq, wts->prev_window_cpu[i], new_task); wts->prev_window_cpu[i] = 0; } } raw_spin_unlock(&cluster->load_lock); } static inline void migrate_inter_cluster_subtraction(struct task_struct *p, int task_cpu, bool new_task) { struct rq *src_rq = cpu_rq(task_cpu); struct walt_rq *src_wrq = &per_cpu(walt_rq, task_cpu); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; if (src_wrq->curr_runnable_sum < wts->curr_window_cpu[task_cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU%d src_crs=%llu is lesser than task_contrib=%u", p->pid, src_rq->cpu, src_wrq->curr_runnable_sum, wts->curr_window_cpu[task_cpu]); src_wrq->curr_runnable_sum = wts->curr_window_cpu[task_cpu]; } src_wrq->curr_runnable_sum -= wts->curr_window_cpu[task_cpu]; if (src_wrq->prev_runnable_sum < wts->prev_window_cpu[task_cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU%d src_prs=%llu is lesser than task_contrib=%u", p->pid, src_rq->cpu, src_wrq->prev_runnable_sum, wts->prev_window_cpu[task_cpu]); src_wrq->prev_runnable_sum = wts->prev_window_cpu[task_cpu]; } src_wrq->prev_runnable_sum -= wts->prev_window_cpu[task_cpu]; if (new_task) { if (src_wrq->nt_curr_runnable_sum < wts->curr_window_cpu[task_cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU%d src_nt_crs=%llu is lesser than task_contrib=%u", p->pid, src_rq->cpu, src_wrq->nt_curr_runnable_sum, wts->curr_window_cpu[task_cpu]); src_wrq->nt_curr_runnable_sum = wts->curr_window_cpu[task_cpu]; } src_wrq->nt_curr_runnable_sum -= wts->curr_window_cpu[task_cpu]; if (src_wrq->nt_prev_runnable_sum < wts->prev_window_cpu[task_cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU%d src_nt_prs=%llu is lesser than task_contrib=%u", p->pid, src_rq->cpu, src_wrq->nt_prev_runnable_sum, wts->prev_window_cpu[task_cpu]); src_wrq->nt_prev_runnable_sum = wts->prev_window_cpu[task_cpu]; } src_wrq->nt_prev_runnable_sum -= wts->prev_window_cpu[task_cpu]; } wts->curr_window_cpu[task_cpu] = 0; wts->prev_window_cpu[task_cpu] = 0; update_cluster_load_subtractions(p, task_cpu, src_wrq->window_start, new_task); } static inline void migrate_inter_cluster_addition(struct task_struct *p, int new_cpu, bool new_task) { struct walt_rq *dest_wrq = &per_cpu(walt_rq, new_cpu); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; wts->curr_window_cpu[new_cpu] = wts->curr_window; wts->prev_window_cpu[new_cpu] = wts->prev_window; dest_wrq->curr_runnable_sum += wts->curr_window; dest_wrq->prev_runnable_sum += wts->prev_window; if (new_task) { dest_wrq->nt_curr_runnable_sum += wts->curr_window; dest_wrq->nt_prev_runnable_sum += wts->prev_window; } } static u32 load_to_index(u32 load) { u32 index = load / sched_load_granule; return min(index, (u32)(NUM_LOAD_INDICES - 1)); } static void migrate_top_tasks_subtraction(struct task_struct *p, struct rq *src_rq) { int index; int top_index; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; 
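	/*
	 * Remove this task's curr/prev window contribution from the source
	 * CPU's top-task tables; if the task was defining curr_top or
	 * prev_top, rescan the bitmap for the next highest occupied index.
	 */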
u32 curr_window = wts->curr_window; u32 prev_window = wts->prev_window; struct walt_rq *src_wrq = &per_cpu(walt_rq, cpu_of(src_rq)); u8 src = src_wrq->curr_table; u8 *src_table; if (curr_window) { src_table = src_wrq->top_tasks[src]; index = load_to_index(curr_window); src_table[index] -= 1; if (!src_table[index]) __clear_bit(NUM_LOAD_INDICES - index - 1, src_wrq->top_tasks_bitmap[src]); top_index = src_wrq->curr_top; if (index == top_index && !src_table[index]) src_wrq->curr_top = get_top_index( src_wrq->top_tasks_bitmap[src], top_index); } if (prev_window) { src = 1 - src; src_table = src_wrq->top_tasks[src]; index = load_to_index(prev_window); src_table[index] -= 1; if (!src_table[index]) __clear_bit(NUM_LOAD_INDICES - index - 1, src_wrq->top_tasks_bitmap[src]); top_index = src_wrq->prev_top; if (index == top_index && !src_table[index]) src_wrq->prev_top = get_top_index( src_wrq->top_tasks_bitmap[src], top_index); } } static void migrate_top_tasks_addition(struct task_struct *p, struct rq *rq) { int index; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u32 curr_window = wts->curr_window; u32 prev_window = wts->prev_window; struct walt_rq *dst_wrq = &per_cpu(walt_rq, cpu_of(rq)); u8 dst = dst_wrq->curr_table; u8 *dst_table; if (curr_window) { dst_table = dst_wrq->top_tasks[dst]; index = load_to_index(curr_window); dst_table[index] += 1; if (dst_table[index] == 1) __set_bit(NUM_LOAD_INDICES - index - 1, dst_wrq->top_tasks_bitmap[dst]); if (index > dst_wrq->curr_top) dst_wrq->curr_top = index; } if (prev_window) { dst = 1 - dst; dst_table = dst_wrq->top_tasks[dst]; index = load_to_index(prev_window); dst_table[index] += 1; if (dst_table[index] == 1) __set_bit(NUM_LOAD_INDICES - index - 1, dst_wrq->top_tasks_bitmap[dst]); if (index > dst_wrq->prev_top) dst_wrq->prev_top = index; } } static inline bool is_new_task(struct task_struct *p) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; return wts->active_time < NEW_TASK_ACTIVE_TIME; } static inline int run_walt_irq_work_rollover(u64 old_window_start, struct rq *rq); static void migrate_busy_time_subtraction(struct task_struct *p, int new_cpu) { struct rq *src_rq = task_rq(p); u64 wallclock; u64 *src_curr_runnable_sum, *src_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *src_nt_prev_runnable_sum; bool new_task; struct walt_related_thread_group *grp; long pstate; struct walt_rq *src_wrq = &per_cpu(walt_rq, cpu_of(src_rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; if (!p->on_rq && READ_ONCE(p->__state) != TASK_WAKING) return; pstate = READ_ONCE(p->__state); if (pstate == TASK_WAKING) raw_spin_rq_lock(src_rq); walt_lockdep_assert_rq(src_rq, p); if (task_rq(p) != src_rq) WALT_BUG(WALT_BUG_UPSTREAM, p, "on CPU %d task %s(%d) not on src_rq %d", raw_smp_processor_id(), p->comm, p->pid, src_rq->cpu); wts->new_cpu = new_cpu; if (!same_freq_domain(task_cpu(p), new_cpu)) wts->enqueue_after_migration = 2; /* 2 is intercluster */ else wts->enqueue_after_migration = 1; /* 1 is within cluster */ wallclock = walt_sched_clock(); walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); if (wts->window_start != src_wrq->window_start) WALT_BUG(WALT_BUG_WALT, p, "CPU%d: %s task %s(%d)'s ws=%llu not equal to src_rq %d's ws=%llu", raw_smp_processor_id(), __func__, p->comm, p->pid, wts->window_start, src_rq->cpu, src_wrq->window_start); /* safe to update the task cyc cntr for new_cpu without the new_cpu rq_lock */ update_task_cpu_cycles(p, 
new_cpu, wallclock); new_task = is_new_task(p); /* Protected by rq_lock */ grp = wts->grp; /* * For frequency aggregation, we continue to do migration fixups * even for intra cluster migrations. This is because, the aggregated * load has to reported on a single CPU regardless. */ if (grp) { struct group_cpu_time *cpu_time = &src_wrq->grp_time; src_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; if (wts->curr_window) { *src_curr_runnable_sum -= wts->curr_window; if (new_task) *src_nt_curr_runnable_sum -= wts->curr_window; } if (wts->prev_window) { *src_prev_runnable_sum -= wts->prev_window; if (new_task) *src_nt_prev_runnable_sum -= wts->prev_window; } } else { if (wts->enqueue_after_migration == 2) migrate_inter_cluster_subtraction(p, task_cpu(p), new_task); } migrate_top_tasks_subtraction(p, src_rq); if (is_ed_enabled() && (p == src_wrq->ed_task)) src_wrq->ed_task = NULL; wts->prev_cpu = task_cpu(p); if (pstate == TASK_WAKING) raw_spin_rq_unlock(src_rq); } static void migrate_busy_time_addition(struct task_struct *p, int new_cpu, u64 wallclock) { struct rq *dest_rq = cpu_rq(new_cpu); u64 *dst_curr_runnable_sum, *dst_prev_runnable_sum; u64 *dst_nt_curr_runnable_sum, *dst_nt_prev_runnable_sum; bool new_task; struct walt_related_thread_group *grp; struct walt_rq *dest_wrq = &per_cpu(walt_rq, new_cpu); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; int src_cpu = wts->prev_cpu; struct walt_rq *src_wrq = &per_cpu(walt_rq, src_cpu); walt_lockdep_assert_rq(dest_rq, p); walt_update_task_ravg(p, dest_rq, TASK_UPDATE, wallclock, 0); if (wts->window_start != dest_wrq->window_start) WALT_BUG(WALT_BUG_WALT, p, "CPU%d: %s task %s(%d)'s ws=%llu not equal to dest_rq %d's ws=%llu", raw_smp_processor_id(), __func__, p->comm, p->pid, wts->window_start, dest_rq->cpu, dest_wrq->window_start); new_task = is_new_task(p); /* Protected by rq_lock */ grp = wts->grp; /* * For frequency aggregation, we continue to do migration fixups * even for intra cluster migrations. This is because, the aggregated * load has to reported on a single CPU regardless. 
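 *
 * Non-grouped tasks only need these fixups when the move crosses a
 * frequency domain (wts->enqueue_after_migration == 2); for an
 * intra-cluster migration the busy time keeps being reported from the
 * same frequency domain, so the per-CPU window contributions are left
 * untouched.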
*/ if (grp) { struct group_cpu_time *cpu_time = &dest_wrq->grp_time; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; if (wts->curr_window) { *dst_curr_runnable_sum += wts->curr_window; if (new_task) *dst_nt_curr_runnable_sum += wts->curr_window; } if (wts->prev_window) { *dst_prev_runnable_sum += wts->prev_window; if (new_task) *dst_nt_prev_runnable_sum += wts->prev_window; } } else { if (wts->enqueue_after_migration == 2) migrate_inter_cluster_addition(p, new_cpu, new_task); } migrate_top_tasks_addition(p, dest_rq); if (wts->enqueue_after_migration == 2) { src_wrq->notif_pending = true; dest_wrq->notif_pending = true; walt_irq_work_queue(&walt_migration_irq_work); } if (is_ed_enabled() && is_ed_task(p, wallclock)) dest_wrq->ed_task = p; wts->new_cpu = -1; } #define INC_STEP 8 #define DEC_STEP 2 #define CONSISTENT_THRES 16 #define INC_STEP_BIG 16 /* * bucket_increase - update the count of all buckets * * @buckets: array of buckets tracking busy time of a task * @idx: the index of bucket to be incremented * * Each time a complete window finishes, count of bucket that runtime * falls in (@idx) is incremented. Counts of all other buckets are * decayed. The rate of increase and decay could be different based * on current count in the bucket. */ static inline void bucket_increase(u8 *buckets, u16 *bucket_bitmask, int idx) { int i, step; for (i = 0; i < NUM_BUSY_BUCKETS; i++) { if (idx != i) { if (buckets[i] > DEC_STEP) buckets[i] -= DEC_STEP; else { buckets[i] = 0; *bucket_bitmask &= ~BIT_MASK(i); } } else { step = buckets[i] >= CONSISTENT_THRES ? INC_STEP_BIG : INC_STEP; if (buckets[i] > U8_MAX - step) buckets[i] = U8_MAX; else buckets[i] += step; *bucket_bitmask |= BIT_MASK(i); } } } static inline int busy_to_bucket(u16 normalized_rt) { int bidx; bidx = normalized_rt >> (SCHED_CAPACITY_SHIFT - NUM_BUSY_BUCKETS_SHIFT); bidx = min(bidx, NUM_BUSY_BUCKETS - 1); /* * Combine lowest two buckets. The lowest frequency falls into * 2nd bucket and thus keep predicting lowest bucket is not * useful. */ if (!bidx) bidx++; return bidx; } /* * get_pred_busy - calculate predicted demand for a task on runqueue * * @p: task whose prediction is being updated * @start: starting bucket. returned prediction should not be lower than * this bucket. * @runtime: runtime of the task. returned prediction should not be lower * than this runtime. * Note: @start can be derived from @runtime. It's passed in only to * avoid duplicated calculation in some cases. * * A new predicted busy time is returned for task @p based on @runtime * passed in. The function searches through buckets that represent busy * time equal to or bigger than @runtime and attempts to find the bucket * to use for prediction. Once found, it searches through historical busy * time and returns the latest that falls into the bucket. If no such busy * time exists, it returns the medium of that bucket. 
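 *
 * Worked example, assuming NUM_BUSY_BUCKETS_SHIFT == 4 (16 buckets over
 * the 0..1024 utilization range; the actual value may differ): a
 * @runtime of 300 lands in bucket 300 >> 6 = 4. If bucket 6 is the first
 * occupied bucket at or above 4, the prediction range is
 * [6 << 6, 7 << 6) = [384, 448); the most recent history sample in that
 * range is returned, or the bucket midpoint (416) if there is none, and
 * the result is never allowed to drop below @runtime.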
*/ static u32 get_pred_busy(struct task_struct *p, int start, u16 runtime_scaled, u16 bucket_bitmask) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u16 dmin, dmax; int first = NUM_BUSY_BUCKETS, final = NUM_BUSY_BUCKETS; u16 ret = runtime_scaled; u16 next_mask = bucket_bitmask >> start; u16 *hist_util = wts->sum_history_util; int i; /* skip prediction for new tasks due to lack of history */ if (unlikely(is_new_task(p))) goto out; /* find minimal bucket index to pick */ if (next_mask) first = ffs(next_mask) - 1 + start; /* if no higher buckets are filled, predict runtime */ if (first >= NUM_BUSY_BUCKETS) goto out; /* compute the bucket for prediction */ final = first; /* determine demand range for the predicted bucket */ if (final < 2) { /* lowest two buckets are combined */ dmin = 0; final = 1; } else { dmin = final << (SCHED_CAPACITY_SHIFT - NUM_BUSY_BUCKETS_SHIFT); } dmax = (final + 1) << (SCHED_CAPACITY_SHIFT - NUM_BUSY_BUCKETS_SHIFT); /* * search through runtime history and return first runtime that falls * into the range of predicted bucket. */ for (i = 0; i < RAVG_HIST_SIZE; i++) { if (hist_util[i] >= dmin && hist_util[i] < dmax) { ret = hist_util[i]; break; } } /* no historical runtime within bucket found, use average of the bin */ if (ret < dmin) ret = (u16) (((u32)dmin + dmax) / 2); /* * when updating in middle of a window, runtime could be higher * than all recorded history. Always predict at least runtime. */ ret = max(runtime_scaled, ret); out: trace_sched_update_pred_demand(p, runtime_scaled, ret, start, first, final, wts); return ret; } /* * predictive demand of a task was calculated at the last window roll-over. * if the task current window busy time exceeds the predicted * demand, update it here to reflect the task needs. 
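 *
 * In other words this is a mid-window correction: once the scaled
 * curr_window overtakes the prediction made at the last rollover, the
 * prediction is recomputed via get_pred_busy() and, for a queued task,
 * the runqueue's pred_demands_sum is fixed up so that governor hints do
 * not lag behind the task.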
*/ static void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) { u16 new_pred_demand_scaled; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u16 curr_window_scaled; if (walt_is_idle_task(p)) return; if (event != PUT_PREV_TASK && event != TASK_UPDATE && (!SCHED_FREQ_ACCOUNT_WAIT_TIME || (event != TASK_MIGRATE && event != PICK_NEXT_TASK))) return; /* * TASK_UPDATE can be called on sleeping task, when its moved between * related groups */ if (event == TASK_UPDATE) { if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) return; } curr_window_scaled = scale_time_to_util(wts->curr_window); if (wts->pred_demand_scaled >= curr_window_scaled) return; new_pred_demand_scaled = get_pred_busy(p, busy_to_bucket(curr_window_scaled), curr_window_scaled, wts->bucket_bitmask); if (task_on_rq_queued(p)) fixup_walt_sched_stats_common(rq, p, wts->demand_scaled, new_pred_demand_scaled); wts->pred_demand_scaled = new_pred_demand_scaled; } static void clear_top_tasks_bitmap(unsigned long *bitmap) { memset(bitmap, 0, top_tasks_bitmap_size); __set_bit(NUM_LOAD_INDICES, bitmap); } static inline void clear_top_tasks_table(u8 *table) { memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); } static void update_top_tasks(struct task_struct *p, struct rq *rq, u32 old_curr_window, int new_window, bool full_window) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u8 curr = wrq->curr_table; u8 prev = 1 - curr; u8 *curr_table = wrq->top_tasks[curr]; u8 *prev_table = wrq->top_tasks[prev]; int old_index, new_index, update_index; u32 curr_window = wts->curr_window; u32 prev_window = wts->prev_window; bool zero_index_update; if (old_curr_window == curr_window && !new_window) return; old_index = load_to_index(old_curr_window); new_index = load_to_index(curr_window); if (!new_window) { zero_index_update = !old_curr_window && curr_window; if (old_index != new_index || zero_index_update) { if (old_curr_window) curr_table[old_index] -= 1; if (curr_window) curr_table[new_index] += 1; if (new_index > wrq->curr_top) wrq->curr_top = new_index; } if (!curr_table[old_index]) __clear_bit(NUM_LOAD_INDICES - old_index - 1, wrq->top_tasks_bitmap[curr]); if (curr_table[new_index] == 1) __set_bit(NUM_LOAD_INDICES - new_index - 1, wrq->top_tasks_bitmap[curr]); return; } /* * The window has rolled over for this task. By the time we get * here, curr/prev swaps would has already occurred. So we need * to use prev_window for the new index. */ update_index = load_to_index(prev_window); if (full_window) { /* * Two cases here. Either 'p' ran for the entire window or * it didn't run at all. In either case there is no entry * in the prev table. If 'p' ran the entire window, we just * need to create a new entry in the prev table. In this case * update_index will be correspond to sched_ravg_window * so we can unconditionally update the top index. 
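 *
 * (In the full_window case the task's prev_window is either zero or a
 * whole window's worth of scaled time, so update_index is either unused
 * or corresponds to the highest bucket, which is why prev_top can be
 * assigned unconditionally here.)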
*/ if (prev_window) { prev_table[update_index] += 1; wrq->prev_top = update_index; } if (prev_table[update_index] == 1) __set_bit(NUM_LOAD_INDICES - update_index - 1, wrq->top_tasks_bitmap[prev]); } else { zero_index_update = !old_curr_window && prev_window; if (old_index != update_index || zero_index_update) { if (old_curr_window) prev_table[old_index] -= 1; prev_table[update_index] += 1; if (update_index > wrq->prev_top) wrq->prev_top = update_index; if (!prev_table[old_index]) __clear_bit(NUM_LOAD_INDICES - old_index - 1, wrq->top_tasks_bitmap[prev]); if (prev_table[update_index] == 1) __set_bit(NUM_LOAD_INDICES - update_index - 1, wrq->top_tasks_bitmap[prev]); } } if (curr_window) { curr_table[new_index] += 1; if (new_index > wrq->curr_top) wrq->curr_top = new_index; if (curr_table[new_index] == 1) __set_bit(NUM_LOAD_INDICES - new_index - 1, wrq->top_tasks_bitmap[curr]); } } static void rollover_top_tasks(struct rq *rq, bool full_window) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); u8 curr_table = wrq->curr_table; u8 prev_table = 1 - curr_table; int curr_top = wrq->curr_top; clear_top_tasks_table(wrq->top_tasks[prev_table]); clear_top_tasks_bitmap(wrq->top_tasks_bitmap[prev_table]); if (full_window) { curr_top = 0; clear_top_tasks_table(wrq->top_tasks[curr_table]); clear_top_tasks_bitmap(wrq->top_tasks_bitmap[curr_table]); } wrq->curr_table = prev_table; wrq->prev_top = curr_top; wrq->curr_top = 0; } static u32 empty_windows[WALT_NR_CPUS]; static void rollover_task_window(struct task_struct *p, bool full_window) { u32 *curr_cpu_windows = empty_windows; u32 curr_window; int i; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(task_rq(p))); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; /* Rollover the sum */ curr_window = 0; if (!full_window) { curr_window = wts->curr_window; curr_cpu_windows = wts->curr_window_cpu; } wts->prev_window = curr_window; wts->curr_window = 0; /* Roll over individual CPU contributions */ for (i = 0; i < nr_cpu_ids; i++) { wts->prev_window_cpu[i] = curr_cpu_windows[i]; wts->curr_window_cpu[i] = 0; } if (is_new_task(p)) wts->active_time += wrq->prev_window_size; } static inline int cpu_is_waiting_on_io(struct rq *rq) { if (!sched_io_is_busy) return 0; return atomic_read(&rq->nr_iowait); } static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, u64 irqtime, int event) { if (walt_is_idle_task(p)) { /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ if (event == PICK_NEXT_TASK) return 0; /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ return irqtime || cpu_is_waiting_on_io(rq); } if (event == TASK_WAKE) return 0; if (event == PUT_PREV_TASK || event == IRQ_UPDATE) return 1; /* * TASK_UPDATE can be called on sleeping task, when its moved between * related groups */ if (event == TASK_UPDATE) { if (rq->curr == p) return 1; return p->on_rq ? 
			SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
	}

	/* TASK_MIGRATE, PICK_NEXT_TASK left */
	return SCHED_FREQ_ACCOUNT_WAIT_TIME;
}

#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)

static inline u64 scale_exec_time(u64 delta, struct rq *rq,
				  struct walt_task_struct *wts)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));

	delta = (delta * wrq->task_exec_scale) >> SCHED_CAPACITY_SHIFT;

	if (wts->load_boost && wts->grp)
		delta = (delta * (1024 + wts->boosted_task_load) >> 10);

	return delta;
}

/*
 * Convert busy time to frequency equivalent
 * Assumes load is scaled to 1024
 */
static inline unsigned int load_to_freq(struct rq *rq, unsigned int load)
{
	return mult_frac(cpu_max_possible_freq(cpu_of(rq)), load,
			 (unsigned int)arch_scale_cpu_capacity(cpu_of(rq)));
}

static bool do_pl_notif(struct rq *rq)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	u64 prev = wrq->old_busy_time;
	u64 pl = wrq->walt_stats.pred_demands_sum_scaled;
	int cpu = cpu_of(rq);

	/* If already at max freq, bail out */
	if (capacity_orig_of(cpu) == capacity_curr_of(cpu))
		return false;

	prev = max(prev, wrq->old_estimated_time);

	/* 400 MHz filter. */
	return (pl > prev) && (load_to_freq(rq, pl - prev) > 400000);
}

static void rollover_cpu_window(struct rq *rq, bool full_window)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	u64 curr_sum = wrq->curr_runnable_sum;
	u64 nt_curr_sum = wrq->nt_curr_runnable_sum;
	u64 grp_curr_sum = wrq->grp_time.curr_runnable_sum;
	u64 grp_nt_curr_sum = wrq->grp_time.nt_curr_runnable_sum;

	if (unlikely(full_window)) {
		curr_sum = 0;
		nt_curr_sum = 0;
		grp_curr_sum = 0;
		grp_nt_curr_sum = 0;
	}

	wrq->prev_runnable_sum = curr_sum;
	wrq->nt_prev_runnable_sum = nt_curr_sum;
	wrq->grp_time.prev_runnable_sum = grp_curr_sum;
	wrq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum;

	wrq->curr_runnable_sum = 0;
	wrq->nt_curr_runnable_sum = 0;
	wrq->grp_time.curr_runnable_sum = 0;
	wrq->grp_time.nt_curr_runnable_sum = 0;
}

/*
 * Account cpu activity in its busy time counters
 * (wrq->curr/prev_runnable_sum).
 *
 * While the comments at the top of update_task_demand() apply, irqtime
 * handling needs some explanation.
 *
 * Note that update_task_ravg() with irqtime is only called when idle, i.e. p
 * is always the idle task.
 *
 * ms_i  = mark_start of idle task
 * ws    = wrq->window_start
 * irq_s = start time of irq
 * irq_e = end time of irq = wallclock
 *
 * note irqtime = irq_e - irq_s
 *
 * Similar to the explanation at update_task_demand(), there are a few
 * situations for irqtime:
 *
 *          ws   ms_i           is    ie
 *          |     |             |     |
 *          V     V             V     V
 *  --------|--------------------|
 *     prev            curr
 *
 * In the above case, new_window is false and irqtime is accounted in
 * curr_runnable_sum; this is done in the if (!new_window) block.
 *
 *   ms_i        ws             is    ie
 *    |          |              |     |
 *    V          V              V     V
 *  -------------|---------------------
 *        prev             curr
 *
 * In this case, new_window is true, however the irqtime falls within the
 * current window, so the entire irqtime is accounted in curr_runnable_sum.
 * This is handled in the if (irqtime) block and within that the
 * if (mark_start > window_start) branch.
 *
 *   ms_i       is             ws      ie
 *    |         |              |       |
 *    V         V              V       V
 *  --------------------|---------------
 *         prev               curr
 *
 * In this case, new_window is true, a portion of the irqtime needs to be
 * accounted in prev_runnable_sum while the rest is in curr_runnable_sum.
This is handled in the * if (irqtime) block */ static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { int new_window, full_window = 0; int p_is_curr_task = (p == rq->curr); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u64 mark_start = wts->mark_start; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); u64 window_start = wrq->window_start; u32 window_size = wrq->prev_window_size; u64 delta; u64 *curr_runnable_sum = &wrq->curr_runnable_sum; u64 *prev_runnable_sum = &wrq->prev_runnable_sum; u64 *nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; u64 *nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; bool new_task; struct walt_related_thread_group *grp; int cpu = rq->cpu; u32 old_curr_window = wts->curr_window; walt_lockdep_assert_rq(rq, p); new_window = mark_start < window_start; if (new_window) full_window = (window_start - mark_start) >= window_size; /* * Handle per-task window rollover. We don't care about the * idle task. */ if (new_window) { if (!walt_is_idle_task(p)) rollover_task_window(p, full_window); wts->window_start = window_start; } new_task = is_new_task(p); if (!account_busy_for_cpu_time(rq, p, irqtime, event)) goto done; grp = wts->grp; if (grp) { struct group_cpu_time *cpu_time = &wrq->grp_time; curr_runnable_sum = &cpu_time->curr_runnable_sum; prev_runnable_sum = &cpu_time->prev_runnable_sum; nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; } if (wts->window_start != wrq->window_start) WALT_BUG(WALT_BUG_WALT, p, "CPU%d: %s task %s(%d)'s ws=%llu not equal to rq %d's ws=%llu", raw_smp_processor_id(), __func__, p->comm, p->pid, wts->window_start, rq->cpu, wrq->window_start); if (!new_window) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. No rollover * since we didn't start a new window. An example of this is * when a task starts execution and then sleeps within the * same window. */ if (!irqtime || !walt_is_idle_task(p) || cpu_is_waiting_on_io(rq)) delta = wallclock - mark_start; else delta = irqtime; delta = scale_exec_time(delta, rq, wts); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; if (!walt_is_idle_task(p)) { wts->curr_window += delta; wts->curr_window_cpu[cpu] += delta; } goto done; } /* * situations below this need window rollover, * Rollover of cpu counters (curr/prev_runnable_sum) should have already be done * in update_window_start() * * For task counters curr/prev_window[_cpu] are rolled over in the early part of * this function. If full_window(s) have expired and time since last update needs * to be accounted as busy time, set the prev to a complete window size time, else * add the prev window portion. * * For task curr counters a new window has begun, always assign */ if (!p_is_curr_task) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * must have been started in udpate_window_start() * - just split up and account as necessary into curr and prev. * * Irqtime can't be accounted by a task that isn't the * currently running task. */ if (!full_window) { /* * A full window hasn't elapsed, account partial * contribution to previous completed window. 
*/ delta = scale_exec_time(window_start - mark_start, rq, wts); wts->prev_window += delta; wts->prev_window_cpu[cpu] += delta; } else { /* * Since at least one full window has elapsed, * the contribution to the previous window is the * full window (window_size). */ delta = scale_exec_time(window_size, rq, wts); wts->prev_window = delta; wts->prev_window_cpu[cpu] = delta; } *prev_runnable_sum += delta; if (new_task) *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq, wts); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; wts->curr_window = delta; wts->curr_window_cpu[cpu] = delta; goto done; } if (!irqtime || !walt_is_idle_task(p) || cpu_is_waiting_on_io(rq)) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * must have been started in udpate_window_start() * If any of these three above conditions are true * then this busy time can't be accounted as irqtime. * * Busy time for the idle task need not be accounted. * * An example of this would be a task that starts execution * and then sleeps once a new window has begun. */ if (!full_window) { /* * A full window hasn't elapsed, account partial * contribution to previous completed window. */ delta = scale_exec_time(window_start - mark_start, rq, wts); if (!walt_is_idle_task(p)) { wts->prev_window += delta; wts->prev_window_cpu[cpu] += delta; } } else { /* * Since at least one full window has elapsed, * the contribution to the previous window is the * full window (window_size). */ delta = scale_exec_time(window_size, rq, wts); if (!walt_is_idle_task(p)) { wts->prev_window = delta; wts->prev_window_cpu[cpu] = delta; } } *prev_runnable_sum += delta; if (new_task) *nt_prev_runnable_sum += delta; /* Account piece of busy time in the current window. */ delta = scale_exec_time(wallclock - window_start, rq, wts); *curr_runnable_sum += delta; if (new_task) *nt_curr_runnable_sum += delta; if (!walt_is_idle_task(p)) { wts->curr_window = delta; wts->curr_window_cpu[cpu] = delta; } goto done; } if (irqtime) { /* * account_busy_for_cpu_time() = 1 so busy time needs * to be accounted to the current window. A new window * must have been started in udpate_window_start() * The current task must be the idle task because * irqtime is not accounted for any other task. * * Irqtime will be accounted each time we process IRQ activity * after a period of idleness, so we know the IRQ busy time * started at wallclock - irqtime. */ WALT_PANIC(!walt_is_idle_task(p)); /* mark_start here becomes the starting time of interrupt */ mark_start = wallclock - irqtime; /* * If IRQ busy time was just in the current * window then that is all that need be accounted. */ if (mark_start > window_start) { *curr_runnable_sum += scale_exec_time(irqtime, rq, wts); return; } /* * The IRQ busy time spanned multiple windows. Process the * busy time preceding the current window start first. */ delta = window_start - mark_start; if (delta > window_size) delta = window_size; delta = scale_exec_time(delta, rq, wts); *prev_runnable_sum += delta; /* Process the remaining IRQ busy time in the current window. 
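 * (The prev-window portion just above was clamped to window_size because
 * wrq->prev_runnable_sum only ever represents a single previous window of
 * busy time.)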
*/ delta = wallclock - window_start; wrq->curr_runnable_sum += scale_exec_time(delta, rq, wts); return; } done: if (!walt_is_idle_task(p)) update_top_tasks(p, rq, old_curr_window, new_window, full_window); } static inline u16 predict_and_update_buckets( struct task_struct *p, u16 runtime_scaled) { int bidx; u32 pred_demand_scaled; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; bidx = busy_to_bucket(runtime_scaled); pred_demand_scaled = get_pred_busy(p, bidx, runtime_scaled, wts->bucket_bitmask); bucket_increase(wts->busy_buckets, &wts->bucket_bitmask, bidx); return pred_demand_scaled; } static int account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) { /* * No need to bother updating task demand for the idle task. */ if (walt_is_idle_task(p)) return 0; /* * When a task is waking up it is completing a segment of non-busy * time. Likewise, if wait time is not treated as busy time, then * when a task begins to run or is migrated, it is not running and * is completing a segment of non-busy time. */ if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) return 0; /* * The idle exit time is not accounted for the first task _picked_ up to * run on the idle CPU. */ if (event == PICK_NEXT_TASK && rq->curr == rq->idle) return 0; /* * TASK_UPDATE can be called on sleeping task, when its moved between * related groups */ if (event == TASK_UPDATE) { if (rq->curr == p) return 1; return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; } return 1; } #define TRAILBLAZER_THRES 230 #define TRAILBLAZER_BYPASS 243 #define FINAL_BUCKET_STEP_UP 8 #define FINAL_BUCKET_STEP_DOWN 1 static inline u32 scale_util_to_time(u16 util) { return util * walt_scale_demand_divisor; } static void update_trailblazer_accounting(struct task_struct *p, struct rq *rq, u32 runtime, u16 runtime_scaled, u32 *demand, u16 *trailblazer_demand) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); bool is_prev_trailblazer = walt_flag_test(p, WALT_TRAILBLAZER_BIT); u64 trailblazer_capacity; if (walt_feat(WALT_FEAT_TRAILBLAZER_BIT) && (((runtime >= *demand) && (wts->high_util_history >= TRAILBLAZER_THRES)) || wts->high_util_history >= TRAILBLAZER_BYPASS)) { *trailblazer_demand = 1 << SCHED_CAPACITY_SHIFT; *demand = scale_util_to_time(*trailblazer_demand); walt_flag_set(p, WALT_TRAILBLAZER_BIT, 1); } else if (is_prev_trailblazer) { walt_flag_set(p, WALT_TRAILBLAZER_BIT, 0); } /* * In the event that a trailblazer task is detected (or an existing trailblazer task * no longer matches the criteria) and is already enqueued on the cpu, ensure to * close the prod-sum accounts for this task before the next update takes place. */ if (task_on_rq_queued(p)) { if (is_prev_trailblazer != walt_flag_test(p, WALT_TRAILBLAZER_BIT)) sched_update_nr_prod(rq->cpu, 0); if (is_prev_trailblazer && !walt_flag_test(p, WALT_TRAILBLAZER_BIT)) wrq->walt_stats.nr_trailblazer_tasks--; else if (!is_prev_trailblazer && walt_flag_test(p, WALT_TRAILBLAZER_BIT)) wrq->walt_stats.nr_trailblazer_tasks++; } /* * The CPU might be running with capped capacities. In order for a runtime to be considered * as trailblazer worthy, it must be 87.5% or more of the prime CPU capacity. 
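 *
 * (87.5% comes from capacity - capacity/8 below: for a prime CPU with a
 * capacity of 1024 the cutoff works out to 1024 - 128 = 896.)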
*/ trailblazer_capacity = capacity_orig_of(cpumask_first(&cpu_array[0][num_sched_clusters - 1])); trailblazer_capacity = trailblazer_capacity - (trailblazer_capacity >> 3); if (runtime_scaled >= (u16)trailblazer_capacity) { if (wts->high_util_history > U8_MAX - FINAL_BUCKET_STEP_UP) wts->high_util_history = U8_MAX; else wts->high_util_history += FINAL_BUCKET_STEP_UP; } else if (wts->high_util_history) { wts->high_util_history -= FINAL_BUCKET_STEP_DOWN; } } /* * Called when new window is starting for a task, to record cpu usage over * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 * when, say, a real-time task runs without preemption for several windows at a * stretch. */ static void update_history(struct rq *rq, struct task_struct *p, u32 runtime, int samples, int event) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u32 *hist = &wts->sum_history[0]; u16 *hist_util = &wts->sum_history_util[0]; int i; u32 max = 0, avg, demand; u64 sum = 0; u16 demand_scaled, pred_demand_scaled, runtime_scaled; u16 trailblazer_demand = 0; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); /* clear yield status of task if there is a change in window */ if ((wts->yield_state & YIELD_CNT_MASK) < MAX_YIELD_CNT_PER_TASK_THR) wts->yield_state = 0; /* Ignore windows where task had no activity */ if (!runtime || walt_is_idle_task(p) || !samples) goto done; runtime_scaled = scale_time_to_util(runtime); /* Push new 'runtime' value onto stack */ for (; samples > 0; samples--) { hist[wts->cidx] = runtime; hist_util[wts->cidx] = runtime_scaled; wts->cidx++; wts->cidx = wts->cidx % RAVG_HIST_SIZE; } for (i = 0; i < RAVG_HIST_SIZE; i++) { sum += hist[i]; if (hist[i] > max) max = hist[i]; } wts->sum = 0; avg = div64_u64(sum, RAVG_HIST_SIZE); switch (sysctl_sched_window_stats_policy) { case WINDOW_STATS_RECENT: demand = runtime; break; case WINDOW_STATS_MAX: demand = max; break; case WINDOW_STATS_AVG: demand = avg; break; default: demand = max(avg, runtime); } if (walt_fair_task(p)) update_trailblazer_accounting(p, rq, runtime, runtime_scaled, &demand, &trailblazer_demand); pred_demand_scaled = predict_and_update_buckets(p, runtime_scaled); demand_scaled = scale_time_to_util(demand); /* * Avoid double accounting of task demand as demand will be updated * to CRA as part of enqueue/dequeue. * * When window is rolled over, the cumulative window demand * is reset to the cumulative runnable average (contribution from * the tasks on the runqueue). If the current task is dequeued * already, it's demand is not included in the cumulative runnable * average. So add the task demand separately to cumulative window * demand. 
*/ if (task_on_rq_queued(p)) fixup_walt_sched_stats_common(rq, p, demand_scaled, pred_demand_scaled); wts->demand = demand; wts->demand_scaled = demand_scaled; wts->coloc_demand = avg; wts->pred_demand_scaled = pred_demand_scaled; if (demand_scaled > sysctl_sched_min_task_util_for_colocation) wts->unfilter = sysctl_sched_task_unfilter_period; else if (wts->unfilter) wts->unfilter = max_t(int, 0, wts->unfilter - wrq->prev_window_size); done: trace_sched_update_history(rq, p, runtime, samples, event, wrq, wts, trailblazer_demand); } static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; delta = scale_exec_time(delta, rq, wts); wts->sum += delta; if (unlikely(wts->sum > sched_ravg_window)) wts->sum = sched_ravg_window; return delta; } /* * Account cpu demand of task and/or update task's cpu demand history * * ms = wts->mark_start; * wc = wallclock * ws = wrq->window_start * * Three possibilities: * * a) Task event is contained within one window. * window_start < mark_start < wallclock * * ws ms wc * | | | * V V V * |---------------| * * In this case, wts->sum is updated *iff* event is appropriate * (ex: event == PUT_PREV_TASK) * * b) Task event spans two windows. * mark_start < window_start < wallclock * * ms ws wc * | | | * V V V * -----|------------------- * * In this case, wts->sum is updated with (ws - ms) *iff* event * is appropriate, then a new window sample is recorded followed * by wts->sum being set to (wc - ws) *iff* event is appropriate. * * c) Task event spans more than two windows. * * ms ws_tmp ws wc * | | | | * V V V V * ---|-------|-------|-------|-------|------ * | | * |<------ nr_full_windows ------>| * * In this case, wts->sum is updated with (ws_tmp - ms) first *iff* * event is appropriate, window sample of wts->sum is recorded, * 'nr_full_window' samples of window_size is also recorded *iff* * event is appropriate and finally wts->sum is set to (wc - ws) * *iff* event is appropriate. * * IMPORTANT : Leave wts->mark_start unchanged, as update_cpu_busy_time() * depends on it! */ static u64 update_task_demand(struct task_struct *p, struct rq *rq, int event, u64 wallclock) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u64 mark_start = wts->mark_start; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); u64 delta, window_start = wrq->window_start; int new_window, nr_full_windows; u32 window_size = sched_ravg_window; u64 runtime; new_window = mark_start < window_start; if (!account_busy_for_task_demand(rq, p, event)) { if (new_window) /* * If the time accounted isn't being accounted as * busy time, and a new window started, only the * previous window need be closed out with the * pre-existing demand. Multiple windows may have * elapsed, but since empty windows are dropped, * it is not necessary to account those. */ update_history(rq, p, wts->sum, 1, event); return 0; } if (!new_window) { /* * The simple case - busy time contained within the existing * window. */ return add_to_task_demand(rq, p, wallclock - mark_start); } /* * Busy time spans at least two windows. Temporarily rewind * window_start to first window boundary after mark_start. 
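 * Example (assuming the default 20 ms window): if mark_start was 47 ms before
 * wallclock and window_start is 5 ms before wallclock, delta is 42 ms and
 * nr_full_windows is 2; the first 2 ms close out the oldest window, two full
 * 20 ms samples are pushed into the demand history, and the final 5 ms are
 * accounted in the current window.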
*/ delta = window_start - mark_start; nr_full_windows = div64_u64(delta, window_size); window_start -= (u64)nr_full_windows * (u64)window_size; /* Process (window_start - mark_start) first */ runtime = add_to_task_demand(rq, p, window_start - mark_start); /* Push new sample(s) into task's demand history */ update_history(rq, p, wts->sum, 1, event); if (nr_full_windows) { u64 scaled_window = scale_exec_time(window_size, rq, wts); update_history(rq, p, scaled_window, nr_full_windows, event); runtime += nr_full_windows * scaled_window; } /* * Roll window_start back to current to process any remainder * in current window. */ window_start += (u64)nr_full_windows * (u64)window_size; /* Process (wallclock - window_start) next */ mark_start = window_start; runtime += add_to_task_demand(rq, p, wallclock - mark_start); return runtime; } static inline unsigned int cpu_cur_freq(int cpu) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu); return wrq->cluster->cur_freq; } static void update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { u64 cur_cycles; u64 cycles_delta; u64 time_delta; int cpu = cpu_of(rq); struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; walt_lockdep_assert_rq(rq, p); if (!use_cycle_counter) { wrq->task_exec_scale = DIV64_U64_ROUNDUP(cpu_cur_freq(cpu) * arch_scale_cpu_capacity(cpu), wrq->cluster->max_possible_freq); return; } cur_cycles = read_cycle_counter(cpu, wallclock); /* * If current task is idle task and irqtime == 0 CPU was * indeed idle and probably its cycle counter was not * increasing. We still need estimatied CPU frequency * for IO wait time accounting. Use the previously * calculated frequency in such a case. */ if (!walt_is_idle_task(rq->curr) || irqtime) { if (unlikely(cur_cycles < wts->cpu_cycles)) cycles_delta = cur_cycles + (U64_MAX - wts->cpu_cycles); else cycles_delta = cur_cycles - wts->cpu_cycles; cycles_delta = cycles_delta * NSEC_PER_MSEC; if (event == IRQ_UPDATE && walt_is_idle_task(p)) /* * Time between mark_start of idle task and IRQ handler * entry time is CPU cycle counter stall period. * Upon IRQ handler entry walt_sched_account_irqstart() * replenishes idle task's cpu cycle counter so * cycles_delta now represents increased cycles during * IRQ handler rather than time between idle entry and * IRQ exit. Thus use irqtime as time delta. */ time_delta = irqtime; else time_delta = wallclock - wts->mark_start; if ((s64)time_delta < 0) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU%d wallclock=%llu(0x%llx) < mark_start=%llu(0x%llx) event=%d irqtime=%llu", p->pid, rq->cpu, wallclock, wallclock, wts->mark_start, wts->mark_start, event, irqtime); time_delta = 1; } wrq->task_exec_scale = DIV64_U64_ROUNDUP(cycles_delta * arch_scale_cpu_capacity(cpu), time_delta * wrq->cluster->max_possible_freq); trace_sched_get_task_cpu_cycles(cpu, event, cycles_delta, time_delta, p); } wts->cpu_cycles = cur_cycles; } /* * Returns * 0: if window rollover not required or is not the winning CPU. * 1: if this CPU is tasked with window rollover duties. 
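 * Only the CPU that wins the atomic64_cmpxchg() on walt_irq_work_lastq_ws
 * queues walt_cpufreq_irq_work, so the rollover work runs at most once per
 * window.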
*/ static inline int run_walt_irq_work_rollover(u64 old_window_start, struct rq *rq) { u64 result; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); if (old_window_start == wrq->window_start) return 0; result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, wrq->window_start); if (result == old_window_start) { walt_irq_work_queue(&walt_cpufreq_irq_work); trace_walt_window_rollover(wrq->window_start); return 1; } return 0; } static inline void set_bits(struct walt_task_struct *wts, int nr_bits, bool set_bit) { int mask = 0; if (nr_bits > 16) nr_bits = 16; wts->busy_bitmap = wts->busy_bitmap << nr_bits; if (set_bit) mask = (1 << nr_bits) - 1; wts->busy_bitmap |= mask; } /* * Easy Case * * | ms wc | * | | | | * +----------+------------+-------------+-------- * | contrib->+----------->| | * | | | | * boundary next_ms_boundary * * * ms is in the old ms boundary while wc is in the new ms boundary, in which case the code accounts for the bit * up to the next_ms_boundary, and the time from next_ms_boundary to wc gets accounted in the current period * * | ms | wc * | | | | * +----------+-------------------------+----------+- * | | | contrib->| * | | | * boundary next_ms_boundary * * * multiple boundaries lie between ms and wc, in which case the code accounts for the bit * until the next_ms_boundary, fills in the interim periods, and the leftover from * the closest boundary is accounted in the current period * * | ms | | wc * | | | | | * +----------+-------------------------+------periods----------+---------+-------- * | | | |contrib-> * | | | | * boundary next_ms_boundary */ static void update_busy_bitmap(struct task_struct *p, struct rq *rq, int event, u64 wallclock) { struct walt_task_struct *wts = (struct walt_task_struct *)p->android_vendor_data1; struct walt_rq *wrq = &per_cpu(walt_rq, task_cpu(p)); u64 next_ms_boundary, delta; int periods; bool running; int no_boost_reason = 0; /* * If it has been active for more than 4 ms, turn it off; the task that caused this activation * should have slept, and if it is still running it must have updated its load via * prs. No need to continue boosting. */ if (wallclock > wrq->lrb_pipeline_start_time + 4000000) wrq->lrb_pipeline_start_time = 0; if (!pipeline_in_progress()) return; /* * Figure out whether pipeline_cpu and cpu_of(rq) are the same, or whether it * even matters.
*/ if (wts->pipeline_cpu == -1) return; if (wallclock < wts->mark_start) { WALT_BUG(WALT_BUG_WALT, p, "on CPU%d: %s task %s(%d) mark_start %llu is higher than wallclock %llu\n", raw_smp_processor_id(), __func__, p->comm, p->pid, wts->mark_start, wallclock); wallclock = wts->mark_start; } running = account_busy_for_cpu_time(rq, p, 0, event); /* task woke up or utra happened while its asleep, clear old boosts */ if (p->on_rq == 0) walt_flag_set(p, WALT_LRB_PIPELINE_BIT, 0); next_ms_boundary = ((wts->mark_start + (NSEC_PER_MSEC - 1)) / NSEC_PER_MSEC) * NSEC_PER_MSEC; if (wallclock < next_ms_boundary) { if (running) wts->period_contrib_run += wallclock - wts->mark_start; goto out; } /* Exceeding a ms boundary */ /* Close the bit corresponding to the mark_start */ if (running) wts->period_contrib_run += next_ms_boundary - wts->mark_start; /* Set the bit representing the ms if runtime within that ms is more than 500us*/ if (wts->period_contrib_run > 500000) set_bits(wts, 1, true); else set_bits(wts, 1, false); wts->period_contrib_run = 0; /* Account the action starting from next_ms_boundary to the closest ms boundary */ delta = wallclock - next_ms_boundary; periods = delta / NSEC_PER_MSEC; if (periods) { if (running) set_bits(wts, periods, true); else set_bits(wts, periods, false); } /* Start contributions for latest ms */ if (running) wts->period_contrib_run = wallclock % NSEC_PER_MSEC; /* task had already set a boost since wakeup, boost just once since wakeup */ if (walt_flag_test(p, WALT_LRB_PIPELINE_BIT)) { no_boost_reason = 1; goto out; } /* * task is not on_rq - if it is in the process of waking up, boost will be applied on the * right cpu at PICK event */ if (p->on_rq == 0) { no_boost_reason = 2; goto out; } if (sched_ravg_window <= SCHED_RAVG_8MS_WINDOW && ((hweight16(wts->busy_bitmap & 0x00FF) < sysctl_sched_lrpb_active_ms[0]) || !sysctl_sched_lrpb_active_ms[0])) { no_boost_reason = 3; goto out; } if (sched_ravg_window == SCHED_RAVG_12MS_WINDOW && ((hweight16(wts->busy_bitmap & 0x0FFF) < sysctl_sched_lrpb_active_ms[1]) || !sysctl_sched_lrpb_active_ms[1])) { no_boost_reason = 4; goto out; } if (sched_ravg_window >= SCHED_RAVG_16MS_WINDOW && ((hweight16(wts->busy_bitmap) < sysctl_sched_lrpb_active_ms[2]) || !sysctl_sched_lrpb_active_ms[2])) { no_boost_reason = 5; goto out; } /* cpu already boosted, so dont extend */ if (wrq->lrb_pipeline_start_time != 0) { no_boost_reason = 6; goto out; } walt_flag_set(p, WALT_LRB_PIPELINE_BIT, 1); wrq->lrb_pipeline_start_time = wallclock; out: trace_sched_update_busy_bitmap(p, rq, wts, wrq, event, wallclock, next_ms_boundary, no_boost_reason); } /* Reflect task activity on its demand and cpu's busy time statistics */ static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, u64 wallclock, u64 irqtime) { u64 old_window_start; int this_cpu_runs_window_rollover; bool old_lrb_pipeline_task_state; bool old_lrb_pipeline_cpu_state; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; if (!wrq->window_start || wts->mark_start == wallclock) return; if (unlikely(!raw_spin_is_locked(&rq->__lock))) { WALT_BUG(WALT_BUG_WALT, p, "on CPU%d: %s task %s(%d) unlocked access for cpu=%d suspended=%d last_clk=%llu stack[%pS <== %pS <== %pS]\n", raw_smp_processor_id(), __func__, p->comm, p->pid, rq->cpu, walt_clock_suspended, sched_clock_last, (void *)CALLER_ADDR0, (void *)CALLER_ADDR1, (void *)CALLER_ADDR2); } walt_lockdep_assert_rq(rq, p); old_window_start = 
update_window_start(rq, wallclock, event); old_lrb_pipeline_task_state = walt_flag_test(p, WALT_LRB_PIPELINE_BIT); old_lrb_pipeline_cpu_state = wrq->lrb_pipeline_start_time; if (!wts->window_start) wts->window_start = wrq->window_start; if (!wts->mark_start) { update_task_cpu_cycles(p, cpu_of(rq), wallclock); goto done; } update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); update_task_demand(p, rq, event, wallclock); update_cpu_busy_time(p, rq, event, wallclock, irqtime); update_task_pred_demand(rq, p, event); update_busy_bitmap(p, rq, event, wallclock); if (event == PUT_PREV_TASK && READ_ONCE(p->__state)) wts->iowaited = p->in_iowait; trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, &wrq->grp_time, wrq, wts, atomic64_read(&walt_irq_work_lastq_ws)); trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime, &wrq->grp_time, wrq, wts, atomic64_read(&walt_irq_work_lastq_ws)); done: wts->mark_start = wallclock; if (wts->mark_start > (wts->window_start + sched_ravg_window)) WALT_BUG(WALT_BUG_WALT, p, "CPU%d: %s task %s(%d)'s ms=%llu is ahead of ws=%llu by more than 1 window on rq=%d event=%d", raw_smp_processor_id(), __func__, p->comm, p->pid, wts->mark_start, wts->window_start, rq->cpu, event); this_cpu_runs_window_rollover = run_walt_irq_work_rollover(old_window_start, rq); if (likely(!this_cpu_runs_window_rollover)) { if ((unlikely(wts->pipeline_cpu != -1) && task_cpu(p) == cpu_of(rq) && !old_lrb_pipeline_task_state && walt_flag_test(p, WALT_LRB_PIPELINE_BIT)) || (old_lrb_pipeline_cpu_state && !wrq->lrb_pipeline_start_time)) waltgov_run_callback(rq, WALT_CPUFREQ_PIPELINE_BUSY_BIT); } } static inline void __sched_fork_init(struct task_struct *p) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; wts->last_sleep_ts = 0; wts->wake_up_idle = false; wts->boost = 0; wts->boost_expires = 0; wts->boost_period = false; wts->low_latency = false; wts->iowaited = false; wts->load_boost = 0; wts->boosted_task_load = 0; wts->reduce_mask = CPU_MASK_ALL; } static void init_new_task_load(struct task_struct *p) { int i; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; struct walt_task_struct *cur_wts = (struct walt_task_struct *) current->android_vendor_data1; u32 init_load_windows = sched_init_task_load_windows; u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; u32 init_load_pct = cur_wts->init_load_pct; struct cpufreq_policy *policy; unsigned long cpuinfo_max = 0; unsigned long scaling_max = 0; wts->init_load_pct = 0; rcu_assign_pointer(wts->grp, NULL); INIT_LIST_HEAD(&wts->grp_list); wts->prev_cpu = raw_smp_processor_id(); wts->new_cpu = -1; wts->enqueue_after_migration = 0; wts->mark_start = 0; wts->window_start = 0; wts->sum = 0; wts->curr_window = 0; wts->prev_window = 0; wts->active_time = 0; wts->prev_on_rq = 0; wts->prev_on_rq_cpu = -1; wts->pipeline_cpu = -1; wts->yield_state = 0; wts->busy_bitmap = 0; wts->period_contrib_run = 0; for (i = 0; i < NUM_BUSY_BUCKETS; ++i) wts->busy_buckets[i] = 0; wts->bucket_bitmask = 0; wts->cpu_cycles = 0; memset(wts->curr_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); memset(wts->prev_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); if (trail_active && sustain_active && task_in_related_thread_group(p->group_leader) && (p->prio <= 120)) { policy = cpufreq_cpu_get_raw(WALT_NR_CPUS-1); if (policy) { cpuinfo_max = policy->cpuinfo.max_freq; scaling_max = policy->max; } if ((scaling_max > 0) && (cpuinfo_max == scaling_max)) init_load_pct = 90; } if 
(init_load_pct) { init_load_windows = div64_u64((u64)init_load_pct * (u64)sched_ravg_window, 100); init_load_windows_scaled = scale_time_to_util(init_load_windows); } wts->demand = init_load_windows; wts->demand_scaled = init_load_windows_scaled; wts->coloc_demand = init_load_windows; wts->pred_demand_scaled = 0; for (i = 0; i < RAVG_HIST_SIZE; ++i) wts->sum_history[i] = init_load_windows; wts->misfit = false; wts->rtg_high_prio = false; wts->unfilter = sysctl_sched_task_unfilter_period; INIT_LIST_HEAD(&wts->mvp_list); wts->sum_exec_snapshot_for_slice = 0; wts->sum_exec_snapshot_for_total = 0; wts->total_exec = 0; wts->mvp_prio = WALT_NOT_MVP; wts->cidx = 0; wts->mark_start_birth_ts = 0; wts->high_util_history = 0; __sched_fork_init(p); /* New task inherits the MPAM part_id */ wts->mpam_part_id = cur_wts->mpam_part_id; walt_flag_set(p, WALT_INIT_BIT, 1); walt_flag_set(p, WALT_TRAILBLAZER_BIT, 0); } int remove_heavy(struct walt_task_struct *wts); static void walt_task_dead(struct task_struct *p) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; sched_set_group_id(p, 0); if (wts->low_latency & WALT_LOW_LATENCY_PIPELINE_BIT) remove_pipeline(wts); if (wts->low_latency & WALT_LOW_LATENCY_HEAVY_BIT) remove_heavy(wts); if (p == pipeline_special_task) remove_special_task(); } static void mark_task_starting(struct task_struct *p) { struct rq *rq = task_rq(p); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; u64 wallclock = walt_rq_clock(rq); wts->last_wake_ts = wallclock; wts->last_enqueued_ts = wallclock; wts->mark_start_birth_ts = wallclock; if (wts->mark_start) return; walt_update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); } /* * Task groups whose aggregate demand on a cpu is more than * sched_group_upmigrate need to be up-migrated if possible. */ static unsigned int __read_mostly sched_group_upmigrate = 20000000; /* * Task groups, once up-migrated, will need to drop their aggregate * demand to less than sched_group_downmigrate before they are "down" * migrated. 
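 * With the default 20 ms window, these boot-time values amount to one full
 * window for upmigrate and 95% of a window for downmigrate;
 * walt_update_group_thresholds() rescales both to the smallest cluster's
 * capacity using the sysctl percentages.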
*/ static unsigned int __read_mostly sched_group_downmigrate = 19000000; void walt_update_group_thresholds(void) { unsigned int min_scale = arch_scale_cpu_capacity( cluster_first_cpu(sched_cluster[0])); u64 min_ms = min_scale * (sched_ravg_window >> SCHED_CAPACITY_SHIFT); sched_group_upmigrate = div64_ul(min_ms * sysctl_sched_group_upmigrate_pct, 100); sched_group_downmigrate = div64_ul(min_ms * sysctl_sched_group_downmigrate_pct, 100); } struct walt_sched_cluster *sched_cluster[WALT_NR_CPUS]; __read_mostly int num_sched_clusters; struct list_head cluster_head; static struct walt_sched_cluster init_cluster = { .list = LIST_HEAD_INIT(init_cluster.list), .id = 0, .cur_freq = 1, .max_possible_freq = 1, .aggr_grp_load = 0, .found_ts = 0, }; static void init_clusters(void) { init_cluster.cpus = *cpu_possible_mask; raw_spin_lock_init(&init_cluster.load_lock); INIT_LIST_HEAD(&cluster_head); list_add(&init_cluster.list, &cluster_head); } static void insert_cluster(struct walt_sched_cluster *cluster, struct list_head *head) { struct walt_sched_cluster *tmp; struct list_head *iter = head; list_for_each_entry(tmp, head, list) { if (arch_scale_cpu_capacity(cluster_first_cpu(cluster)) < arch_scale_cpu_capacity(cluster_first_cpu(tmp))) break; iter = &tmp->list; } list_add(&cluster->list, iter); } static struct walt_sched_cluster *alloc_new_cluster(const struct cpumask *cpus) { struct walt_sched_cluster *cluster = NULL; cluster = kzalloc(sizeof(struct walt_sched_cluster), GFP_ATOMIC); BUG_ON(!cluster); INIT_LIST_HEAD(&cluster->list); cluster->cur_freq = 1; cluster->max_freq = 1; cluster->max_possible_freq = 1; raw_spin_lock_init(&cluster->load_lock); cluster->cpus = *cpus; cluster->found_ts = 0; return cluster; } static void add_cluster(const struct cpumask *cpus, struct list_head *head) { struct walt_sched_cluster *cluster = alloc_new_cluster(cpus); int i; struct walt_rq *wrq; BUG_ON(num_sched_clusters >= MAX_CLUSTERS); for_each_cpu(i, cpus) { wrq = &per_cpu(walt_rq, i); wrq->cluster = cluster; } insert_cluster(cluster, head); num_sched_clusters++; } static void cleanup_clusters(struct list_head *head) { struct walt_sched_cluster *cluster, *tmp; int i; struct walt_rq *wrq; list_for_each_entry_safe(cluster, tmp, head, list) { for_each_cpu(i, &cluster->cpus) { wrq = &per_cpu(walt_rq, i); wrq->cluster = &init_cluster; } list_del(&cluster->list); num_sched_clusters--; kfree(cluster); } } static inline void align_clusters(struct list_head *head) { struct walt_sched_cluster *tmp; struct list_head *cluster1 = head, *cluster2 = head; unsigned long capacity1 = 0, capacity2 = 0; int i = 0; if (num_sched_clusters != 4) return; list_for_each_entry(tmp, head, list) { if (i == 1) { cluster1 = &tmp->list; capacity1 = arch_scale_cpu_capacity(cluster_first_cpu(tmp)); } if (i == 2) { cluster2 = &tmp->list; capacity2 = arch_scale_cpu_capacity(cluster_first_cpu(tmp)); } i++; } if (capacity1 < capacity2) list_swap(cluster1, cluster2); } static inline void assign_cluster_ids(struct list_head *head) { struct walt_sched_cluster *cluster; int pos = 0; list_for_each_entry(cluster, head, list) { cluster->id = pos; sched_cluster[pos++] = cluster; } WARN_ON(pos > MAX_CLUSTERS); } static inline void move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) { struct list_head *first, *last; first = src->next; last = src->prev; if (sync_rcu) { INIT_LIST_HEAD_RCU(src); synchronize_rcu(); } first->prev = dst; dst->prev = last; last->next = dst; /* Ensure list sanity before making the head visible to all CPUs. 
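 * The smp_mb() below orders the prev/next pointer updates above against the
 * final dst->next store, so a concurrent reader never observes a half-spliced
 * list.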
*/ smp_mb(); dst->next = first; } static void update_all_clusters_stats(void) { struct walt_sched_cluster *cluster; u64 highest_mpc = 0, lowest_mpc = U64_MAX; for_each_sched_cluster(cluster) { u64 mpc = arch_scale_cpu_capacity( cluster_first_cpu(cluster)); int cluster_id = cluster->id; if (mpc > highest_mpc) { highest_mpc = mpc; max_possible_cluster_id = cluster_id; } if (mpc < lowest_mpc) { lowest_mpc = mpc; min_possible_cluster_id = cluster_id; } } walt_update_group_thresholds(); } static bool walt_clusters_parsed; cpumask_t __read_mostly **cpu_array; u8 cpu_arrays_init_x11[1][1] = { {0}, /* S */ }; u8 cpu_arrays_init_x22[2][2] = { {0, 1}, /* S G */ {1, 0}, /* G S */ }; u8 cpu_arrays_init_x33[3][3] = { {0, 1, 2}, /* S G P */ {1, 2, 0}, /* G P S */ {2, 1, 0}, /* P G S */ }; u8 cpu_arrays_init_x44[4][4] = { {0, 2, 1, 3}, /* S T G P */ {1, 2, 3, 0}, /* G T P S */ {2, 3, 1, 0}, /* T P G S */ {3, 1, 2, 0}, /* P G T S */ }; static void init_cpu_array(void) { int i; int rows = num_sched_clusters; cpu_array = kcalloc(rows, sizeof(cpumask_t *), GFP_ATOMIC | __GFP_NOFAIL); if (!cpu_array) WALT_PANIC(1); for (i = 0; i < rows; i++) { cpu_array[i] = kcalloc(num_sched_clusters, sizeof(cpumask_t), GFP_ATOMIC | __GFP_NOFAIL); if (!cpu_array[i]) WALT_PANIC(1); } } static void build_cpu_array(void) { u8 *select_init_list; u8 id; int i, j; if (!cpu_array) WALT_PANIC(1); switch (num_sched_clusters) { case 1: select_init_list = (u8 *)cpu_arrays_init_x11; break; case 2: select_init_list = (u8 *)cpu_arrays_init_x22; break; case 3: select_init_list = (u8 *)cpu_arrays_init_x33; break; case 4: select_init_list = (u8 *)cpu_arrays_init_x44; break; default: pr_err("unsupported num clusters=%d\n", num_sched_clusters); WALT_PANIC(1); } for (i = 0; i < num_sched_clusters; i++) { for (j = 0; j < num_sched_clusters; j++) { id = select_init_list[i * num_sched_clusters + j]; cpumask_copy(&cpu_array[i][j], &sched_cluster[id]->cpus); } } } static void walt_get_possible_siblings(int cpuid, struct cpumask *cluster_cpus) { int cpu; struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; if (cpuid_topo->cluster_id == -1) return; for_each_possible_cpu(cpu) { cpu_topo = &cpu_topology[cpu]; if (cpuid_topo->cluster_id != cpu_topo->cluster_id) continue; cpumask_set_cpu(cpu, cluster_cpus); } } int cpu_l2_sibling[WALT_NR_CPUS] = {[0 ... 
WALT_NR_CPUS-1] = -1}; static void find_cache_siblings(void) { int cpu, cpu2; struct device_node *cpu_dev, *cpu_dev2, *cpu_l2_cache_node, *cpu_l2_cache_node2; for_each_possible_cpu(cpu) { cpu_dev = of_get_cpu_node(cpu, NULL); if (!cpu_dev) continue; cpu_l2_cache_node = of_parse_phandle(cpu_dev, "next-level-cache", 0); if (!cpu_l2_cache_node) continue; for_each_possible_cpu(cpu2) { if (cpu == cpu2) continue; cpu_dev2 = of_get_cpu_node(cpu2, NULL); if (!cpu_dev2) continue; cpu_l2_cache_node2 = of_parse_phandle(cpu_dev2, "next-level-cache", 0); if (!cpu_l2_cache_node2) continue; if (cpu_l2_cache_node == cpu_l2_cache_node2) { cpu_l2_sibling[cpu] = cpu2; break; } } } } static void walt_update_cluster_topology(void) { struct cpumask cpus = *cpu_possible_mask; struct cpumask cluster_cpus; struct walt_sched_cluster *cluster; struct list_head new_head; int i; struct walt_rq *wrq; INIT_LIST_HEAD(&new_head); for_each_cpu(i, &cpus) { cpumask_clear(&cluster_cpus); walt_get_possible_siblings(i, &cluster_cpus); if (cpumask_empty(&cluster_cpus)) { WARN(1, "WALT: Invalid cpu topology!!"); cleanup_clusters(&new_head); return; } cpumask_andnot(&cpus, &cpus, &cluster_cpus); add_cluster(&cluster_cpus, &new_head); } align_clusters(&new_head); assign_cluster_ids(&new_head); list_for_each_entry(cluster, &new_head, list) { struct cpufreq_policy *policy; policy = cpufreq_cpu_get_raw(cluster_first_cpu(cluster)); /* * walt_update_cluster_topology() must be called AFTER policies * for all cpus are initialized. If not, simply BUG(). */ WALT_PANIC(!policy); if (policy) { cluster->max_possible_freq = policy->cpuinfo.max_freq; cluster->max_freq = policy->max; for_each_cpu(i, &cluster->cpus) { wrq = &per_cpu(walt_rq, i); cpumask_copy(&wrq->freq_domain_cpumask, policy->related_cpus); } cpuinfo_max_freq_cached = (cpuinfo_max_freq_cached > policy->cpuinfo.max_freq) ? cpuinfo_max_freq_cached : policy->cpuinfo.max_freq; } } /* * Ensure cluster ids are visible to all CPUs before making * cluster_head visible. */ move_list(&cluster_head, &new_head, false); update_all_clusters_stats(); init_cpu_array(); build_cpu_array(); find_cache_siblings(); create_util_to_cost(); walt_clusters_parsed = true; } static void walt_init_cycle_counter(void) { char *walt_cycle_cntr_path = "/soc/walt"; struct device_node *np = NULL; if (soc_feat(SOC_ENABLE_SW_CYCLE_COUNTER_BIT)) { walt_cycle_counter_init(); } else { np = of_find_node_by_path(walt_cycle_cntr_path); of_platform_populate(np, NULL, NULL, NULL); } wait_for_completion_interruptible(&walt_get_cycle_counts_cb_completion); } static void transfer_busy_time(struct rq *rq, struct walt_related_thread_group *grp, struct task_struct *p, int event); /* * Enable colocation and frequency aggregation for all threads in a process. * The children inherits the group id from the parent. 
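 * sched_set_group_id() places a task into a group by id; a group_id of 0
 * removes the task from its current group, and DEFAULT_CGROUP_COLOC_ID is
 * reserved for the cgroup attach path.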
*/ static struct walt_related_thread_group *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; static LIST_HEAD(active_related_thread_groups); static DEFINE_RWLOCK(related_thread_group_lock); static inline void update_best_cluster(struct walt_related_thread_group *grp, u64 combined_demand, bool boost) { if (boost) { /* * since we are in boost, we can keep grp on min, the boosts * will ensure tasks get to bigs */ grp->skip_min = false; return; } if (is_suh_max()) combined_demand = sched_group_upmigrate; if (!grp->skip_min) { if (combined_demand >= sched_group_upmigrate) grp->skip_min = true; return; } if (combined_demand < sched_group_downmigrate) { if (!sysctl_sched_coloc_downmigrate_ns || (grp->last_update - grp->start_ktime_ts) < sysctl_sched_hyst_min_coloc_ns) { grp->downmigrate_ts = 0; grp->skip_min = false; return; } if (!grp->downmigrate_ts) { grp->downmigrate_ts = grp->last_update; return; } if (grp->last_update - grp->downmigrate_ts > sysctl_sched_coloc_downmigrate_ns) { grp->downmigrate_ts = 0; grp->skip_min = false; } } else if (grp->downmigrate_ts) grp->downmigrate_ts = 0; } static void _set_preferred_cluster(struct walt_related_thread_group *grp) { struct task_struct *p; u64 combined_demand = 0; bool group_boost = false; u64 wallclock; bool prev_skip_min = grp->skip_min; struct walt_task_struct *wts; if (sched_group_upmigrate == 0) { grp->skip_min = false; goto out; } if (list_empty(&grp->tasks)) { grp->skip_min = false; goto out; } if (!hmp_capable()) { grp->skip_min = false; goto out; } wallclock = walt_sched_clock(); /* * wakeup of two or more related tasks could race with each other and * could result in multiple calls to _set_preferred_cluster being issued * at same time. Avoid overhead in such cases of rechecking preferred * cluster */ if (wallclock - grp->last_update < sched_ravg_window / 10) return; list_for_each_entry(wts, &grp->tasks, grp_list) { p = wts_to_ts(wts); if (task_boost_policy(p) == SCHED_BOOST_ON_BIG) { group_boost = true; break; } if (wts->mark_start < wallclock - (sched_ravg_window * RAVG_HIST_SIZE)) continue; combined_demand += wts->coloc_demand; if (!trace_sched_set_preferred_cluster_enabled()) { if (combined_demand > sched_group_upmigrate) break; } } grp->last_update = wallclock; update_best_cluster(grp, combined_demand, group_boost); out: trace_sched_set_preferred_cluster(grp, combined_demand, prev_skip_min, sched_group_upmigrate, sched_group_downmigrate); if (grp->id == DEFAULT_CGROUP_COLOC_ID && grp->skip_min != prev_skip_min) { if (grp->skip_min) grp->start_ktime_ts = wallclock; else grp->start_ktime_ts = 0; sched_update_hyst_times(); } } static void set_preferred_cluster(struct walt_related_thread_group *grp) { raw_spin_lock(&grp->lock); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); } static int update_preferred_cluster(struct walt_related_thread_group *grp, struct task_struct *p, u32 old_load, bool from_tick) { u32 new_load = task_load(p); if (!grp) return 0; if (unlikely(from_tick && is_suh_max())) return 1; /* * Update if task's load has changed significantly or a complete window * has passed since we last updated preference */ if (abs(new_load - old_load) > sched_ravg_window / 4) return 1; if (walt_sched_clock() - grp->last_update > sched_ravg_window) return 1; return 0; } #define ADD_TASK 0 #define REM_TASK 1 struct walt_related_thread_group* lookup_related_thread_group(unsigned int group_id) { return related_thread_groups[group_id]; } static int alloc_related_thread_groups(void) { int i; struct walt_related_thread_group *grp; /* 
group_id = 0 is invalid as it's a special id used to remove a task from its group. */ for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { grp = kzalloc(sizeof(*grp), GFP_ATOMIC | GFP_NOWAIT); BUG_ON(!grp); grp->id = i; INIT_LIST_HEAD(&grp->tasks); INIT_LIST_HEAD(&grp->list); raw_spin_lock_init(&grp->lock); related_thread_groups[i] = grp; } return 0; } static void remove_task_from_group(struct task_struct *p) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; struct walt_related_thread_group *grp = wts->grp; struct rq *rq; int empty_group = 1; struct rq_flags rf; raw_spin_lock(&grp->lock); rq = __task_rq_lock(p, &rf); transfer_busy_time(rq, wts->grp, p, REM_TASK); list_del_init(&wts->grp_list); rcu_assign_pointer(wts->grp, NULL); __task_rq_unlock(rq, &rf); if (!list_empty(&grp->tasks)) { empty_group = 0; _set_preferred_cluster(grp); } raw_spin_unlock(&grp->lock); /* Reserved groups cannot be destroyed */ if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) /* * We test whether grp->list is attached with list_empty(), * hence re-init the list after deletion. */ list_del_init(&grp->list); } static int add_task_to_group(struct task_struct *p, struct walt_related_thread_group *grp) { struct rq *rq; struct rq_flags rf; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; raw_spin_lock(&grp->lock); /* * Change wts->grp under rq->lock. Will prevent races with read-side * reference of wts->grp in various hot-paths */ rq = __task_rq_lock(p, &rf); transfer_busy_time(rq, grp, p, ADD_TASK); list_add(&wts->grp_list, &grp->tasks); rcu_assign_pointer(wts->grp, grp); __task_rq_unlock(rq, &rf); _set_preferred_cluster(grp); raw_spin_unlock(&grp->lock); return 0; } #ifdef CONFIG_UCLAMP_TASK_GROUP static inline bool uclamp_task_colocated(struct task_struct *p) { struct cgroup_subsys_state *css; struct task_group *tg; bool colocate; struct walt_task_group *wtg; rcu_read_lock(); css = task_css(p, cpu_cgrp_id); if (!css) { rcu_read_unlock(); return false; } tg = container_of(css, struct task_group, css); wtg = (struct walt_task_group *) tg->android_vendor_data1; colocate = wtg->colocate; rcu_read_unlock(); return colocate; } #else static inline bool uclamp_task_colocated(struct task_struct *p) { return false; } #endif /* CONFIG_UCLAMP_TASK_GROUP */ static void add_new_task_to_grp(struct task_struct *new) { unsigned long flags; struct walt_related_thread_group *grp; struct walt_task_struct *wts = (struct walt_task_struct *) new->android_vendor_data1; /* * If the task does not belong to the colocated schedtune * cgroup, nothing to do. We are checking this without * lock. Even if there is a race, it will be added * to the co-located cgroup via cgroup attach. */ if (!uclamp_task_colocated(new)) return; grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); write_lock_irqsave(&related_thread_group_lock, flags); /* * It's possible that someone already added the new task to the * group, or it might have been taken out of the colocated schedtune * cgroup. Check these conditions under the lock.
*/ if (!uclamp_task_colocated(new) || wts->grp) { write_unlock_irqrestore(&related_thread_group_lock, flags); return; } raw_spin_lock(&grp->lock); rcu_assign_pointer(wts->grp, grp); list_add(&wts->grp_list, &grp->tasks); raw_spin_unlock(&grp->lock); write_unlock_irqrestore(&related_thread_group_lock, flags); } static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) { int rc = 0; unsigned long flags; struct walt_related_thread_group *grp = NULL; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; if (group_id >= MAX_NUM_CGROUP_COLOC_ID) return -EINVAL; if (unlikely(!walt_flag_test(p, WALT_INIT_BIT))) return -EINVAL; raw_spin_lock_irqsave(&p->pi_lock, flags); write_lock(&related_thread_group_lock); /* Switching from one group to another directly is not permitted */ if ((!wts->grp && !group_id) || (wts->grp && group_id)) goto done; if (!group_id) { remove_task_from_group(p); goto done; } grp = lookup_related_thread_group(group_id); if (list_empty(&grp->list)) list_add(&grp->list, &active_related_thread_groups); rc = add_task_to_group(p, grp); done: write_unlock(&related_thread_group_lock); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return rc; } int sched_set_group_id(struct task_struct *p, unsigned int group_id) { /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ if (group_id == DEFAULT_CGROUP_COLOC_ID) return -EINVAL; return __sched_set_group_id(p, group_id); } unsigned int sched_get_group_id(struct task_struct *p) { unsigned int group_id; struct walt_related_thread_group *grp; rcu_read_lock(); grp = task_related_thread_group(p); group_id = grp ? grp->id : 0; rcu_read_unlock(); return group_id; } /* * We create a default colocation group at boot. There is no need to * synchronize tasks between cgroups at creation time because the * correct cgroup hierarchy is not available at boot. Therefore cgroup * colocation is turned off by default even though the colocation group * itself has been allocated. Furthermore, this colocation group cannot * be destroyed once it has been created. All of this has been done as part * of runtime optimizations. * * The job of synchronizing tasks to the colocation group is done when * the colocation flag in the cgroup is turned on. */ static int create_default_coloc_group(void) { struct walt_related_thread_group *grp = NULL; unsigned long flags; grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); write_lock_irqsave(&related_thread_group_lock, flags); list_add(&grp->list, &active_related_thread_groups); write_unlock_irqrestore(&related_thread_group_lock, flags); return 0; } static void walt_update_tg_pointer(struct cgroup_subsys_state *css) { if (!strcmp(css->cgroup->kn->name, "top-app")) walt_init_topapp_tg(css_tg(css)); else if (!strcmp(css->cgroup->kn->name, "foreground")) walt_init_foreground_tg(css_tg(css)); else if (!strcmp(css->cgroup->kn->name, "foreground-boost")) walt_init_foregroundboost_tg(css_tg(css)); else walt_init_tg(css_tg(css)); } void walt_kick_cpu(int cpu) { unsigned int flags = NOHZ_KICK_MASK; if (cpu == -1) return; /* * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets * the first flag owns it; cleared by nohz_csd_func(). */ flags = atomic_fetch_or(flags, nohz_flags(cpu)); if (flags & NOHZ_KICK_MASK) return; /* * This way we generate an IPI on the target CPU which * is idle. And the softirq performing nohz idle load balance * will be run before returning from the IPI.
*/ smp_call_function_single_async(cpu, &cpu_rq(cpu)->nohz_csd); } static void android_rvh_cpu_cgroup_online(void *unused, struct cgroup_subsys_state *css) { if (unlikely(walt_disabled)) return; walt_update_tg_pointer(css); } static void android_rvh_cpu_cgroup_attach(void *unused, struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; struct task_group *tg; struct walt_task_group *wtg; unsigned int grp_id; int ret; if (unlikely(walt_disabled)) return; cgroup_taskset_first(tset, &css); if (!css) return; tg = container_of(css, struct task_group, css); wtg = (struct walt_task_group *) tg->android_vendor_data1; cgroup_taskset_for_each(task, css, tset) { grp_id = wtg->colocate ? DEFAULT_CGROUP_COLOC_ID : 0; ret = __sched_set_group_id(task, grp_id); trace_sched_cgroup_attach(task, grp_id, ret); } } static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster) { struct walt_related_thread_group *grp; bool grp_on_min; grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); if (!grp) return false; grp_on_min = !grp->skip_min && (boost_policy != SCHED_BOOST_ON_BIG); return (is_min_capacity_cluster(cluster) == grp_on_min); } static void note_task_waking(struct task_struct *p, u64 wallclock) { struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; wts->last_wake_ts = wallclock; } /* * Task's cpu usage is accounted in: * wrq->curr/prev_runnable_sum, when its ->grp is NULL * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL * * Transfer task's cpu usage between those counters when transitioning between * groups */ static void transfer_busy_time(struct rq *rq, struct walt_related_thread_group *grp, struct task_struct *p, int event) { u64 wallclock; struct group_cpu_time *cpu_time; u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; int migrate_type; int cpu = cpu_of(rq); bool new_task; int i; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; wallclock = walt_sched_clock(); walt_update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); if (wts->window_start != wrq->window_start) WALT_BUG(WALT_BUG_WALT, p, "CPU%d: %s event=%d task %s(%d)'s ws=%llu not equal to rq %d's ws=%llu", raw_smp_processor_id(), __func__, event, p->comm, p->pid, wts->window_start, rq->cpu, wrq->window_start); new_task = is_new_task(p); if (wts->enqueue_after_migration != 0) { wallclock = walt_sched_clock(); migrate_busy_time_addition(p, cpu_of(rq), wallclock); wts->enqueue_after_migration = 0; } cpu_time = &wrq->grp_time; if (event == ADD_TASK) { migrate_type = RQ_TO_GROUP; src_curr_runnable_sum = &wrq->curr_runnable_sum; dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; src_prev_runnable_sum = &wrq->prev_runnable_sum; dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; src_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; if (*src_curr_runnable_sum < wts->curr_window_cpu[cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_curr_runnable_sum, wts->curr_window_cpu[cpu]); *src_curr_runnable_sum = wts->curr_window_cpu[cpu]; } 
*src_curr_runnable_sum -= wts->curr_window_cpu[cpu]; if (*src_prev_runnable_sum < wts->prev_window_cpu[cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_prev_runnable_sum, wts->prev_window_cpu[cpu]); *src_prev_runnable_sum = wts->prev_window_cpu[cpu]; } *src_prev_runnable_sum -= wts->prev_window_cpu[cpu]; if (new_task) { if (*src_nt_curr_runnable_sum < wts->curr_window_cpu[cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_nt_curr_runnable_sum, wts->curr_window_cpu[cpu]); *src_nt_curr_runnable_sum = wts->curr_window_cpu[cpu]; } *src_nt_curr_runnable_sum -= wts->curr_window_cpu[cpu]; if (*src_nt_prev_runnable_sum < wts->prev_window_cpu[cpu]) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_nt_prev_runnable_sum, wts->prev_window_cpu[cpu]); *src_nt_prev_runnable_sum = wts->prev_window_cpu[cpu]; } *src_nt_prev_runnable_sum -= wts->prev_window_cpu[cpu]; } update_cluster_load_subtractions(p, cpu, wrq->window_start, new_task); } else { migrate_type = GROUP_TO_RQ; src_curr_runnable_sum = &cpu_time->curr_runnable_sum; dst_curr_runnable_sum = &wrq->curr_runnable_sum; src_prev_runnable_sum = &cpu_time->prev_runnable_sum; dst_prev_runnable_sum = &wrq->prev_runnable_sum; src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; dst_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; dst_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; if (*src_curr_runnable_sum < wts->curr_window) { WALT_BUG(WALT_BUG_WALT, p, "WALT-UG pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_curr_runnable_sum, wts->curr_window); *src_curr_runnable_sum = wts->curr_window; } *src_curr_runnable_sum -= wts->curr_window; if (*src_prev_runnable_sum < wts->prev_window) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_prev_runnable_sum, wts->prev_window); *src_prev_runnable_sum = wts->prev_window; } *src_prev_runnable_sum -= wts->prev_window; if (new_task) { if (*src_nt_curr_runnable_sum < wts->curr_window) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_nt_curr_runnable_sum, wts->curr_window); *src_nt_curr_runnable_sum = wts->curr_window; } *src_nt_curr_runnable_sum -= wts->curr_window; if (*src_nt_prev_runnable_sum < wts->prev_window) { WALT_BUG(WALT_BUG_WALT, p, "pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%u", p->pid, cpu, event, *src_nt_prev_runnable_sum, wts->prev_window); *src_nt_prev_runnable_sum = wts->prev_window; } *src_nt_prev_runnable_sum -= wts->prev_window; } /* * Need to reset curr/prev windows for all CPUs, not just the * ones in the same cluster. Since inter cluster migrations * did not result in the appropriate book keeping, the values * per CPU would be inaccurate. */ for_each_possible_cpu(i) { wts->curr_window_cpu[i] = 0; wts->prev_window_cpu[i] = 0; } } *dst_curr_runnable_sum += wts->curr_window; *dst_prev_runnable_sum += wts->prev_window; if (new_task) { *dst_nt_curr_runnable_sum += wts->curr_window; *dst_nt_prev_runnable_sum += wts->prev_window; } /* * When a task enter or exits a group, it's curr and prev windows are * moved to a single CPU. 
This behavior might be sub-optimal in the * exit case, however, it saves us the overhead of handling inter * cluster migration fixups while the task is part of a related group. */ wts->curr_window_cpu[cpu] = wts->curr_window; wts->prev_window_cpu[cpu] = wts->prev_window; trace_sched_migration_update_sum(p, migrate_type, rq); } bool is_rtgb_active(void) { struct walt_related_thread_group *grp; grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); return grp && grp->skip_min; } u64 get_rtgb_active_time(void) { struct walt_related_thread_group *grp; u64 now = walt_sched_clock(); grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); if (grp && grp->skip_min && grp->start_ktime_ts) return now - grp->start_ktime_ts; return 0; } static void walt_init_window_dep(void); static void walt_tunables_fixup(void) { if (likely(num_sched_clusters > 0)) walt_update_group_thresholds(); walt_init_window_dep(); } static void walt_update_irqload(struct rq *rq) { u64 irq_delta = 0; unsigned int nr_windows = 0; u64 cur_irq_time; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); u64 last_irq_window = READ_ONCE(wrq->last_irq_window); if (wrq->window_start > last_irq_window) nr_windows = div64_u64(wrq->window_start - last_irq_window, sched_ravg_window); /* Decay CPU's irqload by 3/4 for each window. */ if (nr_windows < 10) wrq->avg_irqload = mult_frac(wrq->avg_irqload, 3, 4); else wrq->avg_irqload = 0; cur_irq_time = irq_time_read(cpu_of(rq)); if (cur_irq_time > wrq->prev_irq_time) irq_delta = cur_irq_time - wrq->prev_irq_time; wrq->avg_irqload += irq_delta; wrq->prev_irq_time = cur_irq_time; if (nr_windows < SCHED_HIGH_IRQ_TIMEOUT) wrq->high_irqload = (wrq->avg_irqload >= walt_cpu_high_irqload); else wrq->high_irqload = false; } /** * __walt_irq_work_locked() - common function to process work * @is_migration: if true, performing migration work, else rollover * @is_asym_migration: if true, performing migration involving an asym cap sibling * @lock_cpus: mask of the cpus involved in the operation. * * In rq locked context, update the cluster group load and find * the load of the min cluster, while tracking the total aggregate * work load. Update the cpufreq through the walt governor, * based upon the new load calculated. * * For the window rollover case lock_cpus will be all possible cpus, * and for migrations it will include the cpus from the two clusters * involved in the migration. 
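 * On a window rollover every locked CPU's current task receives a TASK_UPDATE
 * at the same wallclock and each cluster's aggr_grp_load is recomputed; the
 * last online CPU in a cluster invokes the governor callback without
 * WALT_CPUFREQ_CONTINUE_BIT so the frequency decision is made once per
 * cluster.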
*/ static inline void __walt_irq_work_locked(bool is_migration, bool is_asym_migration, bool is_pipeline_sync_migration, struct cpumask *lock_cpus) { struct walt_sched_cluster *cluster; struct rq *rq; int cpu; u64 wc; u64 total_grp_load = 0; unsigned long flags; struct walt_rq *wrq; wc = walt_sched_clock(); if (!is_migration) walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); for_each_sched_cluster(cluster) { u64 aggr_grp_load = 0; raw_spin_lock(&cluster->load_lock); for_each_cpu(cpu, &cluster->cpus) { rq = cpu_rq(cpu); wrq = &per_cpu(walt_rq, cpu_of(rq)); if (rq->curr) { /* only update ravg for locked cpus */ if (cpumask_intersects(lock_cpus, &cluster->cpus)) { if (unlikely(!raw_spin_is_locked(&rq->__lock))) { WALT_BUG(WALT_BUG_WALT, NULL, "%s unlocked cpu=%d is_migration=%d is_asym_migration=%d is_pipeline_sync_migration=%d lock_cpus=%*pbl suspended=%d last_clk=%llu stack[%pS <= %pS <= %pS]\n", __func__, rq->cpu, is_migration, is_asym_migration, is_pipeline_sync_migration, cpumask_pr_args(lock_cpus), walt_clock_suspended, sched_clock_last, (void *)CALLER_ADDR0, (void *)CALLER_ADDR1, (void *)CALLER_ADDR2); } walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wc, 0); account_load_subtractions(rq); } /* update aggr_grp_load for all clusters, all cpus */ aggr_grp_load += wrq->grp_time.prev_runnable_sum; } } raw_spin_unlock(&cluster->load_lock); cluster->aggr_grp_load = aggr_grp_load; total_grp_load += aggr_grp_load; } if (total_grp_load) rtgb_active = is_rtgb_active(); else rtgb_active = false; if (!is_migration && sysctl_sched_user_hint && time_after(jiffies, sched_user_hint_reset_time)) sysctl_sched_user_hint = 0; for_each_sched_cluster(cluster) { cpumask_t cluster_online_cpus; unsigned int num_cpus, i = 1; /* for migration, skip unnotified clusters */ if (is_migration && !cpumask_intersects(lock_cpus, &cluster->cpus)) continue; cpumask_and(&cluster_online_cpus, &cluster->cpus, cpu_online_mask); num_cpus = cpumask_weight(&cluster_online_cpus); for_each_cpu(cpu, &cluster_online_cpus) { int wflag = 0; rq = cpu_rq(cpu); wrq = &per_cpu(walt_rq, cpu_of(rq)); if (is_migration) { if (wrq->notif_pending) { wrq->notif_pending = false; wflag |= WALT_CPUFREQ_IC_MIGRATION_BIT; } if (is_asym_migration) wflag |= WALT_CPUFREQ_ASYM_FIXUP_BIT; if (is_pipeline_sync_migration) wflag |= WALT_CPUFREQ_SHARED_RAIL_BIT; } else { wflag |= WALT_CPUFREQ_ROLLOVER_BIT; } if (i == num_cpus) waltgov_run_callback(cpu_rq(cpu), wflag); else waltgov_run_callback(cpu_rq(cpu), wflag | WALT_CPUFREQ_CONTINUE_BIT); i++; if (!is_migration) walt_update_irqload(rq); } } /* * If the window change request is in pending, good place to * change sched_ravg_window since all rq locks are acquired. * * If the current window roll over is delayed such that the * mark_start (current wallclock with which roll over is done) * of the current task went past the window start with the * updated new window size, delay the update to the next * window roll over. Otherwise the CPU counters (prs and crs) are * not rolled over properly as mark_start > window_start. 
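 * For example, a pending switch from a 20 ms to an 8 ms window is applied
 * here only if the current wallclock has not yet crossed
 * window_start + new_sched_ravg_window; otherwise it is retried on the next
 * rollover.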
*/ if (!is_migration) { spin_lock_irqsave(&sched_ravg_window_lock, flags); wrq = &per_cpu(walt_rq, cpu_of(this_rq())); if ((sched_ravg_window != new_sched_ravg_window) && (wc < wrq->window_start + new_sched_ravg_window)) { sched_ravg_window_change_time = walt_sched_clock(); trace_sched_ravg_window_change(sched_ravg_window, new_sched_ravg_window, sched_ravg_window_change_time); sched_ravg_window = new_sched_ravg_window; walt_tunables_fixup(); } spin_unlock_irqrestore(&sched_ravg_window_lock, flags); } } /** * irq_work_restrict_to_mig_clusters() - only allow notified clusters * @lock_cpus: mask of the cpus for which the runque should be locked. * * Remove cpus in clusters that are not part of the migration, using * the notif_pending flag to track. * * This is only valid for the migration irq work. */ static inline void irq_work_restrict_to_mig_clusters(cpumask_t *lock_cpus) { struct walt_sched_cluster *cluster; struct rq *rq; struct walt_rq *wrq; int cpu; for_each_sched_cluster(cluster) { bool keep_locked = false; for_each_cpu(cpu, &cluster->cpus) { rq = cpu_rq(cpu); wrq = &per_cpu(walt_rq, cpu_of(rq)); /* remove this cluster if it's not being notified */ if (wrq->notif_pending) { keep_locked = true; break; } } if (!keep_locked) cpumask_andnot(lock_cpus, lock_cpus, &cluster->cpus); } } void update_cpu_capacity_helper(int cpu) { unsigned long fmax_capacity = arch_scale_cpu_capacity(cpu); unsigned long thermal_pressure = arch_scale_thermal_pressure(cpu); unsigned long thermal_cap, old; struct walt_sched_cluster *cluster; struct rq *rq = cpu_rq(cpu); if (unlikely(walt_disabled)) return; /* * thermal_pressure = cpu_scale - curr_cap_as_per_thermal. * so, * curr_cap_as_per_thermal = cpu_scale - thermal_pressure. */ thermal_cap = fmax_capacity - thermal_pressure; cluster = cpu_cluster(cpu); /* reduce the fmax_capacity under cpufreq constraints */ if (cluster->walt_internal_freq_limit != cluster->max_possible_freq) fmax_capacity = mult_frac(fmax_capacity, min(cluster->walt_internal_freq_limit, cluster->max_freq), cluster->max_possible_freq); old = rq->cpu_capacity_orig; rq->cpu_capacity_orig = min(fmax_capacity, thermal_cap); if (old != rq->cpu_capacity_orig) trace_update_cpu_capacity(cpu, fmax_capacity, rq->cpu_capacity_orig); } /* * The intention of this hook is to update cpu_capacity_orig as well as * (*capacity), otherwise we will end up capacity_of() > capacity_orig_of(). */ static void android_rvh_update_cpu_capacity(void *unused, int cpu, unsigned long *capacity) { unsigned long rt_pressure = arch_scale_cpu_capacity(cpu) - *capacity; update_cpu_capacity_helper(cpu); *capacity = max((int)(cpu_rq(cpu)->cpu_capacity_orig - rt_pressure), 0); } /* * big_task_pid is used by the One Big Enqueue Task feature to track the * number of big tasks enqueued on largest cluster. * big_task_pid = {0, pid, -1} * 0 -> indicates there are 0 big tasks enqueued on the CPU * pid -> indicates that there is one big task enqueued whose PID * is pid * -1 -> indicates that there is more than one big task enqueued * on the CPU. * The big_task_pid value is checked every window rollover * and updated according to the rules above. */ DEFINE_PER_CPU(pid_t, big_task_pid); bool is_obet; /* * check_obet() needs to be called with all the rq locks held. * It resets per cpu big_task_pid and does cpu checks on a * single big task. 
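 * The rescan below inspects at most 10 cfs tasks per CPU and stops once a
 * second big task is found; a runqueue where 10 tasks were scanned is
 * conservatively marked -1 (more than one big task).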
*/ static void check_obet(void) { struct task_struct *p; int is_obet_temp = 0; int mid_cluster_cpu, cpu; if (num_sched_clusters < 2) return; mid_cluster_cpu = cpumask_first(&cpu_array[0][num_sched_clusters - 2]); for_each_cpu(cpu, &cpu_array[0][num_sched_clusters - 1]) { if (per_cpu(big_task_pid, cpu) == -1) { is_obet_temp = -1; } else if (per_cpu(big_task_pid, cpu) != 0) { if (is_obet_temp == 0) { is_obet_temp = per_cpu(big_task_pid, cpu); } else { if (is_obet_temp != per_cpu(big_task_pid, cpu)) is_obet_temp = -1; } } } if (is_obet_temp == -1 || is_obet_temp == 0) is_obet = false; else is_obet = true; //reset per CPU big_task_pid for the upcoming window for_each_cpu(cpu, &cpu_array[0][num_sched_clusters - 1]) { pid_t pid = per_cpu(big_task_pid, cpu); if (pid) { int task_count = 0; int big_task_count = 0; list_for_each_entry(p, &(cpu_rq(cpu)->cfs_tasks), se.group_node) { task_count++; if (!task_fits_max(p, mid_cluster_cpu)) { big_task_count++; pid = p->pid; if (big_task_count == 2) break; } if (task_count == 10) break; } if (task_count == 10) per_cpu(big_task_pid, cpu) = -1; else if (big_task_count == 0) per_cpu(big_task_pid, cpu) = 0; else if (big_task_count == 1) per_cpu(big_task_pid, cpu) = pid; else per_cpu(big_task_pid, cpu) = -1; } } } static void check_obet_set_boost(void) { static bool prev_is_obet; bool now_is_obet; //Determine if core_ctl boost is needed now_is_obet = is_obet; if (prev_is_obet != now_is_obet) core_ctl_set_cluster_boost(num_sched_clusters - 1, is_obet); prev_is_obet = now_is_obet; } #define CORE_UTIL_PERIOD 1000000000 static void walt_core_utilization(int cpu) { static u64 sum[WALT_NR_CPUS]; static u64 timestamp; static int nr_windows[WALT_NR_CPUS]; struct walt_rq *wrq = &per_cpu(walt_rq, cpu); u64 max_capacity = arch_scale_cpu_capacity(cpu); if (wrq->window_start > timestamp + CORE_UTIL_PERIOD) { sysctl_sched_walt_core_util[cpu] = sum[cpu] / nr_windows[cpu]; sum[cpu] = 0; nr_windows[cpu] = 0; if (cpu == cpumask_last(cpu_online_mask)) timestamp = wrq->window_start; } nr_windows[cpu]++; if (max_capacity < wrq->walt_stats.cumulative_runnable_avg_scaled) sum[cpu] += max_capacity; else sum[cpu] += wrq->walt_stats.cumulative_runnable_avg_scaled; } DEFINE_PER_CPU(u32, wakeup_ctr); /** * walt_irq_work() - perform walt irq work for rollover and migration * * Process a workqueue call scheduled, while running in a hard irq * protected context. Handle migration and window rollover work * with common funtionality, and on window rollover ask core control * to decide if it needs to adjust the active cpus. */ static void walt_irq_work(struct irq_work *irq_work) { cpumask_t lock_cpus; struct walt_rq *wrq; int level; int cpu; bool is_migration = false, is_asym_migration = false, is_pipeline_sync_migration = false; u32 wakeup_ctr_sum = 0; struct walt_sched_cluster *cluster; bool need_assign_heavy = false; if (irq_work == &walt_migration_irq_work) is_migration = true; cpumask_copy(&lock_cpus, cpu_possible_mask); if (is_migration) { irq_work_restrict_to_mig_clusters(&lock_cpus); /* * if the notif_pending was handled by a previous * walt_irq_work invocation, there is no migration * work. 
*/ if (cpumask_empty(&lock_cpus)) return; if (pipeline_in_progress() && cpumask_intersects(&lock_cpus, &pipeline_sync_cpus)) { cpumask_or(&lock_cpus, &lock_cpus, &pipeline_sync_cpus); is_pipeline_sync_migration = true; } if (!is_state1() && cpumask_intersects(&lock_cpus, &asym_cap_sibling_cpus)) { cpumask_or(&lock_cpus, &lock_cpus, &asym_cap_sibling_cpus); is_asym_migration = true; } } level = 0; for_each_cpu(cpu, &lock_cpus) { if (level == 0) raw_spin_lock(&cpu_rq(cpu)->__lock); else raw_spin_lock_nested(&cpu_rq(cpu)->__lock, level); level++; } __walt_irq_work_locked(is_migration, is_asym_migration, is_pipeline_sync_migration, &lock_cpus); if (!is_migration) { for_each_cpu(cpu, cpu_online_mask) { wakeup_ctr_sum += per_cpu(wakeup_ctr, cpu); per_cpu(wakeup_ctr, cpu) = 0; walt_core_utilization(cpu); set_cpu_flag(cpu, CPU_FIRST_ENQ_IN_WINDOW, 0); } check_obet(); } for_each_cpu(cpu, &lock_cpus) raw_spin_unlock(&cpu_rq(cpu)->__lock); if (!is_migration) { wrq = &per_cpu(walt_rq, cpu_of(this_rq())); need_assign_heavy = pipeline_check(wrq); core_ctl_check(wrq->window_start, wakeup_ctr_sum); pipeline_rearrange(wrq, need_assign_heavy); for_each_sched_cluster(cluster) { update_smart_freq_legacy_reason_hyst_time(cluster); } check_obet_set_boost(); } } void walt_rotation_checkpoint(int nr_big) { int i; bool prev = walt_rotation_enabled; if (!hmp_capable()) return; if (!sysctl_sched_walt_rotate_big_tasks || sched_boost_type != NO_BOOST) { walt_rotation_enabled = 0; return; } walt_rotation_enabled = nr_big >= num_possible_cpus(); for (i = 0; i < num_sched_clusters; i++) { if (walt_rotation_enabled && !prev) freq_cap[HIGH_PERF_CAP][i] = high_perf_cluster_freq_cap[i]; else if (!walt_rotation_enabled && prev) freq_cap[HIGH_PERF_CAP][i] = FREQ_QOS_MAX_DEFAULT_VALUE; } update_smart_freq_capacities(); } void walt_fill_ta_data(struct core_ctl_notif_data *data) { struct walt_related_thread_group *grp; unsigned long flags; u64 total_demand = 0, wallclock; int min_cap_cpu, scale = 1024; struct walt_sched_cluster *cluster; int i = 0; struct walt_task_struct *wts; grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); raw_spin_lock_irqsave(&grp->lock, flags); if (list_empty(&grp->tasks)) { raw_spin_unlock_irqrestore(&grp->lock, flags); goto fill_util; } wallclock = walt_sched_clock(); list_for_each_entry(wts, &grp->tasks, grp_list) { if (wts->mark_start < wallclock - (sched_ravg_window * RAVG_HIST_SIZE)) continue; total_demand += wts->coloc_demand; } raw_spin_unlock_irqrestore(&grp->lock, flags); /* * Scale the total demand to the lowest capacity CPU and * convert into percentage. 
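 * Illustrative example (assumed numbers): with a 20 ms window, a
 * min-capacity scale of 260 and a total_demand of 2 ms, the formula
 * below gives (2ms * 1024 * 100) / (20ms * 260), i.e. roughly 39%.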
* * P = total_demand/sched_ravg_window * 1024/scale * 100 */ min_cap_cpu = cpumask_first(&cpu_array[0][0]); if (min_cap_cpu != -1) scale = arch_scale_cpu_capacity(min_cap_cpu); data->coloc_load_pct = div64_u64(total_demand * 1024 * 100, (u64)sched_ravg_window * scale); fill_util: for_each_sched_cluster(cluster) { int fcpu = cluster_first_cpu(cluster); if (i == MAX_CLUSTERS) break; scale = arch_scale_cpu_capacity(fcpu); data->ta_util_pct[i] = div64_u64(cluster->aggr_grp_load * 1024 * 100, (u64)sched_ravg_window * scale); scale = arch_scale_freq_capacity(fcpu); data->cur_cap_pct[i] = (scale * 100)/1024; i++; } } #define INIT_TASK_LOAD_PCT 15 static void walt_init_window_dep(void) { walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; /* default task to 15 pct */ sched_init_task_load_windows = div64_u64((u64)INIT_TASK_LOAD_PCT * (u64)sched_ravg_window, 100); sched_init_task_load_windows_scaled = scale_time_to_util(sched_init_task_load_windows); walt_cpu_high_irqload = div64_u64((u64)sched_ravg_window * 95, (u64) 100); } static void walt_init_once(void) { init_irq_work(&walt_migration_irq_work, walt_irq_work); init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); walt_init_window_dep(); } static void walt_sched_init_rq(struct rq *rq) { int j; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); cpumask_set_cpu(cpu_of(rq), &wrq->freq_domain_cpumask); wrq->walt_stats.cumulative_runnable_avg_scaled = 0; wrq->prev_window_size = sched_ravg_window; wrq->window_start = 0; wrq->walt_stats.nr_big_tasks = 0; wrq->walt_stats.nr_trailblazer_tasks = 0; wrq->walt_flags = 0; wrq->avg_irqload = 0; wrq->prev_irq_time = 0; wrq->last_irq_window = 0; wrq->high_irqload = false; wrq->task_exec_scale = 1024; wrq->push_task = NULL; wrq->lrb_pipeline_start_time = 0; wrq->curr_runnable_sum = wrq->prev_runnable_sum = 0; wrq->nt_curr_runnable_sum = wrq->nt_prev_runnable_sum = 0; memset(&wrq->grp_time, 0, sizeof(struct group_cpu_time)); wrq->old_busy_time = 0; wrq->old_estimated_time = 0; wrq->walt_stats.pred_demands_sum_scaled = 0; wrq->walt_stats.nr_rtg_high_prio_tasks = 0; wrq->ed_task = NULL; wrq->curr_table = 0; wrq->prev_top = 0; wrq->curr_top = 0; wrq->last_cc_update = 0; wrq->cycles = 0; for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { memset(&wrq->load_subs[j], 0, sizeof(struct load_subtractions)); wrq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES, sizeof(u8), GFP_ATOMIC | GFP_NOWAIT); /* No other choice */ BUG_ON(!wrq->top_tasks[j]); clear_top_tasks_bitmap(wrq->top_tasks_bitmap[j]); } wrq->notif_pending = false; wrq->num_mvp_tasks = 0; INIT_LIST_HEAD(&wrq->mvp_tasks); wrq->mvp_arrival_time = 0; wrq->mvp_throttle_time = 0; wrq->skip_mvp = false; wrq->uclamp_limit[UCLAMP_MIN] = 0; wrq->uclamp_limit[UCLAMP_MAX] = SCHED_CAPACITY_SCALE; } void sched_window_nr_ticks_change(void) { unsigned long flags; spin_lock_irqsave(&sched_ravg_window_lock, flags); new_sched_ravg_window = mult_frac(sysctl_sched_ravg_window_nr_ticks, NSEC_PER_SEC, HZ); spin_unlock_irqrestore(&sched_ravg_window_lock, flags); } static void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; fixup_cumulative_runnable_avg(rq, p, &wrq->walt_stats, wts->demand_scaled, wts->pred_demand_scaled); } static void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); struct walt_task_struct *wts = (struct walt_task_struct *) 
p->android_vendor_data1; fixup_cumulative_runnable_avg(rq, p, &wrq->walt_stats, -(s64)wts->demand_scaled, -(s64)wts->pred_demand_scaled); } static void android_rvh_wake_up_new_task(void *unused, struct task_struct *new) { if (unlikely(walt_disabled)) return; init_new_task_load(new); add_new_task_to_grp(new); } static void walt_cpu_frequency_limits(void *unused, struct cpufreq_policy *policy) { int cpu; if (unlikely(walt_disabled)) return; cpu_cluster(policy->cpu)->max_freq = policy->max; for_each_cpu(cpu, policy->related_cpus) update_cpu_capacity_helper(cpu); } static void android_rvh_sched_cpu_starting(void *unused, int cpu) { if (unlikely(walt_disabled)) return; clear_walt_request(cpu); } static void android_rvh_sched_cpu_dying(void *unused, int cpu) { if (unlikely(walt_disabled)) return; clear_walt_request(cpu); } static void android_rvh_set_task_cpu(void *unused, struct task_struct *p, unsigned int new_cpu) { if (unlikely(walt_disabled)) return; migrate_busy_time_subtraction(p, (int) new_cpu); if (!cpumask_test_cpu(new_cpu, p->cpus_ptr)) WALT_BUG(WALT_BUG_WALT, p, "selecting unaffined cpu=%d comm=%s(%d) affinity=0x%lx", new_cpu, p->comm, p->pid, (*(cpumask_bits(p->cpus_ptr)))); if (!p->in_execve && is_compat_thread(task_thread_info(p)) && !cpumask_test_cpu(new_cpu, system_32bit_el0_cpumask())) WALT_BUG(WALT_BUG_WALT, p, "selecting non 32 bit cpu=%d comm=%s(%d) 32bit_cpus=0x%lx", new_cpu, p->comm, p->pid, (*(cpumask_bits(system_32bit_el0_cpumask())))); } static void android_rvh_new_task_stats(void *unused, struct task_struct *p) { if (unlikely(walt_disabled)) return; mark_task_starting(p); } static void android_rvh_account_irq(void *unused, struct task_struct *curr, int cpu, s64 delta, bool start) { struct rq *rq; unsigned long flags; struct walt_rq *wrq; if (unlikely(walt_disabled)) return; if (!walt_is_idle_task(curr)) return; rq = cpu_rq(cpu); wrq = &per_cpu(walt_rq, cpu_of(rq)); if (start) { if (!wrq->window_start) return; /* We're here without rq->lock held, IRQ disabled */ raw_spin_lock(&rq->__lock); update_task_cpu_cycles(curr, cpu, walt_sched_clock()); raw_spin_unlock(&rq->__lock); } else { raw_spin_lock_irqsave(&rq->__lock, flags); walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_sched_clock(), delta); raw_spin_unlock_irqrestore(&rq->__lock, flags); wrq->last_irq_window = wrq->window_start; } } static void android_rvh_flush_task(void *unused, struct task_struct *p) { if (unlikely(walt_disabled)) return; walt_task_dead(p); } static void android_rvh_enqueue_task(void *unused, struct rq *rq, struct task_struct *p, int flags) { u64 wallclock; struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq)); bool double_enqueue = false; int mid_cluster_cpu; if (unlikely(walt_disabled)) return; walt_lockdep_assert_rq(rq, p); if (flags & ENQUEUE_WAKEUP) per_cpu(wakeup_ctr, cpu_of(rq)) += 1; if (!is_per_cpu_kthread(p)) wrq->enqueue_counter++; if (task_thread_info(p)->cpu != cpu_of(rq)) WALT_BUG(WALT_BUG_UPSTREAM, p, "enqueuing on rq %d when task->cpu is %d\n", cpu_of(rq), task_thread_info(p)->cpu); /* catch double enqueue */ if (wts->prev_on_rq == 1) { WALT_BUG(WALT_BUG_UPSTREAM, p, "double enqueue detected: task_cpu=%d new_cpu=%d\n", task_cpu(p), cpu_of(rq)); double_enqueue = true; } wallclock = walt_rq_clock(rq); if (wts->enqueue_after_migration != 0) { wallclock = walt_sched_clock(); migrate_busy_time_addition(p, cpu_of(rq), wallclock); wts->enqueue_after_migration = 0; } wts->prev_on_rq = 1; wts->prev_on_rq_cpu 
		= cpu_of(rq);
	wts->last_enqueued_ts = wallclock;
	sched_update_nr_prod(rq->cpu, 1);

	if (walt_fair_task(p)) {
		wts->misfit = !task_fits_max(p, rq->cpu);
		if (!double_enqueue)
			inc_rq_walt_stats(rq, p);
		walt_cfs_enqueue_task(rq, p);
	}

	if (!double_enqueue)
		walt_inc_cumulative_runnable_avg(rq, p);

	if ((flags & ENQUEUE_WAKEUP) && walt_flag_test(p, WALT_TRAILBLAZER_BIT)) {
		waltgov_run_callback(rq, WALT_CPUFREQ_TRAILBLAZER_BIT);
	} else if (((flags & ENQUEUE_WAKEUP) ||
		    !is_cpu_flag_set(cpu_of(rq), CPU_FIRST_ENQ_IN_WINDOW)) && do_pl_notif(rq)) {
		waltgov_run_callback(rq, WALT_CPUFREQ_PL_BIT);
	} else if (walt_feat(WALT_FEAT_UCLAMP_FREQ_BIT)) {
		unsigned long min, max;

		min = uclamp_rq_get(rq, UCLAMP_MIN);
		max = uclamp_rq_get(rq, UCLAMP_MAX);

		if ((wrq->uclamp_limit[UCLAMP_MIN] != min) ||
		    (wrq->uclamp_limit[UCLAMP_MAX] != max)) {
			wrq->uclamp_limit[UCLAMP_MIN] = min;
			wrq->uclamp_limit[UCLAMP_MAX] = max;
			waltgov_run_callback(rq, WALT_CPUFREQ_UCLAMP_BIT);
		}
	}

	set_cpu_flag(cpu_of(rq), CPU_FIRST_ENQ_IN_WINDOW, 1);

	if (num_sched_clusters >= 2) {
		mid_cluster_cpu = cpumask_first(&cpu_array[0][num_sched_clusters - 2]);
		if (is_max_possible_cluster_cpu(rq->cpu) &&
		    !task_fits_max(p, mid_cluster_cpu)) {
			if (!per_cpu(big_task_pid, rq->cpu))
				per_cpu(big_task_pid, rq->cpu) = p->pid;
			else if (p->pid != per_cpu(big_task_pid, rq->cpu))
				per_cpu(big_task_pid, rq->cpu) = -1;
		}
	}

	trace_sched_enq_deq_task(p, 1, cpumask_bits(p->cpus_ptr)[0], is_mvp(wts));
}

static void android_rvh_dequeue_task(void *unused, struct rq *rq, struct task_struct *p, int flags)
{
	struct walt_rq *wrq = &per_cpu(walt_rq, cpu_of(rq));
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	bool double_dequeue = false;

	if (unlikely(walt_disabled))
		return;

	walt_lockdep_assert_rq(rq, p);

	/*
	 * a task can be enqueued before walt is started, and dequeued after.
	 * therefore the check on prev_on_rq_cpu is needed to prevent an
	 * invalid failure.
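	 * Only tasks that carry WALT_INIT_BIT and still have a valid
	 * (>= 0) prev_on_rq_cpu are checked below for an enqueue/dequeue
	 * CPU mismatch.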
	 */
	if (wts->prev_on_rq_cpu >= 0 && wts->prev_on_rq_cpu != cpu_of(rq) &&
	    walt_flag_test(p, WALT_INIT_BIT))
		WALT_BUG(WALT_BUG_UPSTREAM, p, "dequeue cpu %d not same as enqueue %d\n",
			 cpu_of(rq), wts->prev_on_rq_cpu);

	/* no longer on a cpu */
	wts->prev_on_rq_cpu = -1;

	/* catch double deq */
	if (wts->prev_on_rq == 2) {
		WALT_BUG(WALT_BUG_UPSTREAM, p, "double dequeue detected: task_cpu=%d new_cpu=%d\n",
			 task_cpu(p), cpu_of(rq));
		double_dequeue = true;
	}

	wts->prev_on_rq = 2;

	if (p == wrq->ed_task)
		is_ed_task_present(rq, walt_rq_clock(rq), p);

	sched_update_nr_prod(rq->cpu, -1);

	if (walt_fair_task(p)) {
		if (!double_dequeue)
			dec_rq_walt_stats(rq, p);
		walt_cfs_dequeue_task(rq, p);
	}

	if (!double_dequeue)
		walt_dec_cumulative_runnable_avg(rq, p);

	if (walt_feat(WALT_FEAT_UCLAMP_FREQ_BIT)) {
		unsigned long min, max;

		min = uclamp_rq_get(rq, UCLAMP_MIN);
		max = uclamp_rq_get(rq, UCLAMP_MAX);

		if ((wrq->uclamp_limit[UCLAMP_MIN] != min) ||
		    (wrq->uclamp_limit[UCLAMP_MAX] != max)) {
			wrq->uclamp_limit[UCLAMP_MIN] = min;
			wrq->uclamp_limit[UCLAMP_MAX] = max;
			waltgov_run_callback(rq, WALT_CPUFREQ_UCLAMP_BIT);
		}
	}

	trace_sched_enq_deq_task(p, 0, cpumask_bits(p->cpus_ptr)[0], is_mvp(wts));
}

static void android_rvh_update_misfit_status(void *unused, struct task_struct *p,
					     struct rq *rq, bool *need_update)
{
	struct walt_task_struct *wts;
	struct walt_rq *wrq;
	bool old_misfit, misfit;
	int change;

	if (unlikely(walt_disabled))
		return;

	*need_update = false;

	if (!p) {
		rq->misfit_task_load = 0;
		return;
	}

	wrq = &per_cpu(walt_rq, cpu_of(rq));
	wts = (struct walt_task_struct *) p->android_vendor_data1;
	old_misfit = wts->misfit;

	if (task_fits_max(p, rq->cpu))
		rq->misfit_task_load = 0;
	else
		rq->misfit_task_load = task_util(p);

	misfit = rq->misfit_task_load;
	change = misfit - old_misfit;
	if (change) {
		sched_update_nr_prod(rq->cpu, 0);
		wts->misfit = misfit;
		wrq->walt_stats.nr_big_tasks += change;
		BUG_ON(wrq->walt_stats.nr_big_tasks < 0);
	}
}

/* utility function to update walt signals at wakeup */
static void android_rvh_try_to_wake_up(void *unused, struct task_struct *p)
{
	struct rq *rq = cpu_rq(task_cpu(p));
	struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
	struct rq_flags rf;
	u64 wallclock;
	unsigned int old_load;
	struct walt_related_thread_group *grp = NULL;

	if (unlikely(walt_disabled))
		return;

	rq_lock_irqsave(rq, &rf);
	old_load = task_load(p);
	wallclock = walt_sched_clock();

	/*
	 * Once the task does a real sleep (not a yield-induced sleep),
	 * reset the flag to ensure the task no longer qualifies as a
	 * frequent yielder, i.e. the task needs to qualify again as a
	 * frequent yielder.
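	 * yield_state packs the per-task yield count (YIELD_CNT_MASK bits)
	 * together with the YIELD_INDUCED_SLEEP flag: a genuine sleep clears
	 * the whole state below, while a yield-induced sleep only drops the
	 * flag and preserves the count.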
*/ if (!(wts->yield_state & YIELD_INDUCED_SLEEP)) wts->yield_state = 0; else wts->yield_state &= YIELD_CNT_MASK; if (walt_is_idle_task(rq->curr) && p->in_iowait) walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); note_task_waking(p, wallclock); rq_unlock_irqrestore(rq, &rf); rcu_read_lock(); grp = task_related_thread_group(p); if (update_preferred_cluster(grp, p, old_load, false)) set_preferred_cluster(grp); rcu_read_unlock(); } static u64 tick_sched_clock; static DECLARE_COMPLETION(tick_sched_clock_completion); DEFINE_PER_CPU(unsigned long, intr_cnt); DEFINE_PER_CPU(unsigned long, cycle_cnt); DEFINE_PER_CPU(unsigned int, ipc_level); DEFINE_PER_CPU(unsigned long, ipc_cnt); DEFINE_PER_CPU(u64, last_ipc_update); DEFINE_PER_CPU(u64, ipc_deactivate_ns); DEFINE_PER_CPU(bool, tickless_mode); static unsigned long calculate_ipc(int cpu) { unsigned long amu_cnt, delta_cycl = 0, delta_intr = 0; unsigned long prev_cycl_cnt = per_cpu(cycle_cnt, cpu); unsigned long prev_intr_cnt = per_cpu(intr_cnt, cpu); unsigned long ipc = 0; struct walt_sched_cluster *cluster = cpu_cluster(cpu); amu_cnt = read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0); delta_cycl = amu_cnt - prev_cycl_cnt; per_cpu(cycle_cnt, cpu) = amu_cnt; amu_cnt = read_sysreg_s(SYS_AMEVCNTR0_INST_RET_EL0); per_cpu(intr_cnt, cpu) = amu_cnt; delta_intr = amu_cnt - prev_intr_cnt; if (prev_cycl_cnt && delta_cycl > cluster->smart_freq_info->min_cycles) ipc = (delta_intr * 100) / delta_cycl; per_cpu(ipc_cnt, cpu) = ipc; per_cpu(last_ipc_update, cpu) = cpu_rq(cpu)->clock; trace_ipc_update(cpu, per_cpu(cycle_cnt, cpu), per_cpu(intr_cnt, cpu), per_cpu(ipc_cnt, cpu), per_cpu(last_ipc_update, cpu), per_cpu(ipc_deactivate_ns, cpu), cpu_rq(cpu)->clock); return ipc; } static void android_rvh_tick_entry(void *unused, struct rq *rq) { u64 wallclock; if (unlikely(walt_disabled)) return; walt_lockdep_assert_rq(rq, NULL); wallclock = walt_rq_clock(rq); walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); if (is_ed_task_present(rq, wallclock, NULL)) waltgov_run_callback(rq, WALT_CPUFREQ_EARLY_DET_BIT); } bool is_sbt_or_oscillate(void) { return now_is_sbt || (oscillate_cpu != -1); } bool should_boost_bus_dcvs(void) { trace_sched_boost_bus_dcvs(oscillate_cpu); return (oscillate_cpu != -1) || is_storage_boost(); } EXPORT_SYMBOL_GPL(should_boost_bus_dcvs); /* * oscillate_cpu = {-1, cpu} tells if system is currently rotating a big * task between Prime CPUs and on which CPU the big task is currently * executing. * If it is -1, no big task oscillation is occurring. 
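 * should_oscillate() below only reports true while OBET is active, the
 * largest cluster has more than one CPU, and exactly one of those CPUs
 * is busy; no_oscillate_reason records which of these checks failed.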
*/ int oscillate_cpu = -1; bool should_oscillate(unsigned int busy_cpu, int *no_oscillate_reason) { int cpu; int busy_cpu_count = 0; if (busy_cpu >= nr_cpu_ids) { *no_oscillate_reason = 1; return false; } if (!is_obet) { *no_oscillate_reason = 2; return false; } if (!is_max_possible_cluster_cpu(busy_cpu)) { *no_oscillate_reason = 3; return false; } if (cpumask_weight(&cpu_array[0][num_sched_clusters - 1]) == 1) { *no_oscillate_reason = 4; return false; } for_each_cpu(cpu, &cpu_array[0][num_sched_clusters - 1]) { busy_cpu_count += !available_idle_cpu(cpu); } if (busy_cpu_count != 1) { *no_oscillate_reason = 5; return false; } return true; } static void android_vh_scheduler_tick(void *unused, struct rq *rq) { struct walt_related_thread_group *grp; unsigned int old_load, last_ipc_level, curr_ipc_level; unsigned long ipc; int i, cpu = cpu_of(rq); struct walt_sched_cluster *cluster; struct smart_freq_cluster_info *smart_freq_info; u64 last_deactivate_ns; bool inform_governor = false; char ipc_debug[15] = {0}; if (!tick_sched_clock) { /* * Let the window begin 20us prior to the tick, * that way we are guaranteed a rollover when the tick occurs. * Use rq->clock directly instead of rq_clock() since * we do not have the rq lock and * rq->clock was updated in the tick callpath. */ tick_sched_clock = rq->clock - 20000; complete(&tick_sched_clock_completion); } if (unlikely(walt_disabled)) return; old_load = task_load(rq->curr); rcu_read_lock(); grp = task_related_thread_group(rq->curr); if (update_preferred_cluster(grp, rq->curr, old_load, true)) set_preferred_cluster(grp); rcu_read_unlock(); walt_lb_tick(rq); /* IPC based smart FMAX */ cluster = cpu_cluster(cpu); smart_freq_info = cluster->smart_freq_info; if (smart_freq_init_done && smart_freq_info->smart_freq_ipc_participation_mask & IPC_PARTICIPATION) { last_ipc_level = per_cpu(ipc_level, cpu); last_deactivate_ns = per_cpu(ipc_deactivate_ns, cpu); ipc = calculate_ipc(cpu); if (enable_logging) { snprintf(ipc_debug, sizeof(ipc_debug), "cpu_%d_ipc", cpu); trace_clock_set_rate(ipc_debug, ipc, raw_smp_processor_id()); } for (i = 0; i < SMART_FMAX_IPC_MAX; i++) if (ipc < smart_freq_info->ipc_reason_config[i].ipc) break; if (i >= SMART_FMAX_IPC_MAX) i = SMART_FMAX_IPC_MAX - 1; curr_ipc_level = i; if ((curr_ipc_level != last_ipc_level) || per_cpu(tickless_mode, cpu)) inform_governor = true; if ((curr_ipc_level < last_ipc_level) && (smart_freq_info->ipc_reason_config[last_ipc_level].hyst_ns > 0)) { if (!last_deactivate_ns) { per_cpu(ipc_deactivate_ns, cpu) = rq->clock; inform_governor = false; } else { u64 delta = rq->clock - last_deactivate_ns; if (smart_freq_info->ipc_reason_config[last_ipc_level].hyst_ns > delta) inform_governor = false; } } if (inform_governor) { per_cpu(ipc_level, cpu) = curr_ipc_level; per_cpu(ipc_deactivate_ns, cpu) = 0; waltgov_run_callback(rq, WALT_CPUFREQ_SMART_FREQ_BIT); } } } static void android_rvh_schedule(void *unused, struct task_struct *prev, struct task_struct *next, struct rq *rq) { u64 wallclock; struct walt_task_struct *wts = (struct walt_task_struct *) prev->android_vendor_data1; if (unlikely(walt_disabled)) return; wallclock = walt_rq_clock(rq); if (likely(prev != next)) { if (!prev->on_rq) wts->last_sleep_ts = wallclock; walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); } else { walt_update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); } } static void android_rvh_sched_fork_init(void *unused, struct task_struct *p) { if 
(unlikely(walt_disabled))
		return;

	__sched_fork_init(p);
}

static void android_rvh_ttwu_cond(void *unused, int cpu, bool *cond)
{
	if (unlikely(walt_disabled))
		return;

	*cond = (sysctl_sched_many_wakeup_threshold < WALT_MANY_WAKEUP_DEFAULT) &&
		(cpu != smp_processor_id());
}

static void android_rvh_sched_exec(void *unused, bool *cond)
{
	if (unlikely(walt_disabled))
		return;

	*cond = true;
}

static void android_rvh_build_perf_domains(void *unused, bool *eas_check)
{
	if (unlikely(walt_disabled))
		return;

	*eas_check = true;
}

static void android_rvh_update_thermal_stats(void *unused, int cpu)
{
	if (unlikely(walt_disabled))
		return;

	update_cpu_capacity_helper(cpu);
}

static DECLARE_COMPLETION(rebuild_domains_completion);
static void rebuild_sd_workfn(struct work_struct *work);
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);

/**
 * rebuild_sd_workfn() - rebuild the sched domains (and therefore the perf
 * domains). It is absolutely necessary that the em_pds are created for each
 * cpu device before proceeding, and this must complete for walt to function
 * properly.
 */
static void rebuild_sd_workfn(struct work_struct *work)
{
	int cpu;
	struct device *cpu_dev;

	for_each_possible_cpu(cpu) {
		cpu_dev = get_cpu_device(cpu);
		if (cpu_dev->em_pd)
			continue;
		WARN_ONCE(true, "must wait for perf domains to be created");
		schedule_work(&rebuild_sd_work);
		/* do not rebuild domains yet, and do not complete this action */
		return;
	}

	rebuild_sched_domains();
	complete(&rebuild_domains_completion);
}

u8 contiguous_yielding_windows;

static void walt_do_sched_yield_before(void *unused, long *skip)
{
	struct walt_task_struct *wts =
		(struct walt_task_struct *)current->android_vendor_data1;
	struct walt_sched_cluster *cluster;
	struct smart_freq_cluster_info *smart_freq_info;
	bool in_legacy_uncap;

	if (unlikely(walt_disabled))
		return;

	if (!walt_fair_task(current))
		return;

	cluster = cpu_cluster(task_cpu(current));
	smart_freq_info = cluster->smart_freq_info;

	if ((wts->yield_state & YIELD_CNT_MASK) >= MAX_YIELD_CNT_PER_TASK_THR) {
		total_yield_cnt++;
		if (contiguous_yielding_windows >= MIN_CONTIGUOUS_YIELDING_WINDOW) {
			/*
			 * If we are under any legacy frequency uncap (i.e. some
			 * load condition), do not inject sleep for the yielding
			 * task.
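			 * Otherwise the task is flagged YIELD_INDUCED_SLEEP,
			 * the yield itself is skipped, and the task sleeps for
			 * YIELD_SLEEP_TIME_USEC in TASK_INTERRUPTIBLE state.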
			 */
			in_legacy_uncap = !!(smart_freq_info->cluster_active_reason &
					     ~BIT(NO_REASON_SMART_FREQ));
			if (!in_legacy_uncap) {
				wts->yield_state |= YIELD_INDUCED_SLEEP;
				total_sleep_cnt++;
				*skip = true;
				usleep_range_state(YIELD_SLEEP_TIME_USEC, YIELD_SLEEP_TIME_USEC,
						   TASK_INTERRUPTIBLE);
			}
		}
	} else {
		wts->yield_state++;
	}
}

static void walt_do_sched_yield(void *unused, struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	struct walt_task_struct *wts = (struct walt_task_struct *) curr->android_vendor_data1;

	if (unlikely(walt_disabled))
		return;

	walt_lockdep_assert_rq(rq, NULL);

	if (!list_empty(&wts->mvp_list) && wts->mvp_list.next)
		walt_cfs_deactivate_mvp_task(rq, curr);

	if (per_cpu(rt_task_arrival_time, cpu_of(rq)))
		per_cpu(rt_task_arrival_time, cpu_of(rq)) = 0;
}

int walt_set_cpus_taken(struct cpumask *set)
{
	unsigned long flags;
	int cpu;

	if (unlikely(walt_disabled))
		return -EAGAIN;

	spin_lock_irqsave(&cpus_taken_lock, flags);
	for_each_cpu(cpu, set) {
		per_cpu(cpus_taken_refcount, cpu)++;
	}
	cpumask_or(&walt_cpus_taken_mask, &walt_cpus_taken_mask, set);
	spin_unlock_irqrestore(&cpus_taken_lock, flags);

	return 0;
}
EXPORT_SYMBOL_GPL(walt_set_cpus_taken);

int walt_unset_cpus_taken(struct cpumask *unset)
{
	unsigned long flags;
	int cpu;

	if (unlikely(walt_disabled))
		return -EAGAIN;

	spin_lock_irqsave(&cpus_taken_lock, flags);
	for_each_cpu(cpu, unset) {
		if (per_cpu(cpus_taken_refcount, cpu) >= 1)
			per_cpu(cpus_taken_refcount, cpu)--;
		if (!per_cpu(cpus_taken_refcount, cpu))
			cpumask_clear_cpu(cpu, &walt_cpus_taken_mask);
	}
	spin_unlock_irqrestore(&cpus_taken_lock, flags);

	return 0;
}
EXPORT_SYMBOL_GPL(walt_unset_cpus_taken);

cpumask_t walt_get_cpus_taken(void)
{
	return walt_cpus_taken_mask;
}
EXPORT_SYMBOL_GPL(walt_get_cpus_taken);

int walt_get_cpus_in_state1(struct cpumask *cpus)
{
	if (unlikely(walt_disabled))
		return -EAGAIN;

	cpumask_or(cpus, cpu_partial_halt_mask, &sched_cluster[0]->cpus);
	cpumask_andnot(cpus, cpus, cpu_halt_mask);

	return 0;
}
EXPORT_SYMBOL_GPL(walt_get_cpus_in_state1);

cpumask_t walt_get_halted_cpus(void)
{
	return *(cpu_halt_mask);
}
EXPORT_SYMBOL_GPL(walt_get_halted_cpus);

static void walt_cgroup_force_kthread_migration(void *unused, struct task_struct *tsk,
						struct cgroup *dst_cgrp, bool *force_migration)
{
	/* no dependency on the walt_disabled flag here */

	/*
	 * RT kthreads may be born in a cgroup with no rt_runtime allocated.
	 * Just say no.
	 */
#ifdef CONFIG_RT_GROUP_SCHED
	if (tsk->no_cgroup_migration &&
	    (dst_cgrp->root->subsys_mask & (1U << cpu_cgrp_id)))
		return;
#endif

	/*
	 * kthreads may acquire PF_NO_SETAFFINITY during initialization.
	 * If userland migrates such a kthread to a non-root cgroup, it can
	 * become trapped in a cpuset. Just say no.
*/ #ifdef CONFIG_CPUSETS if ((tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) && (dst_cgrp->root->subsys_mask & (1U << cpuset_cgrp_id))) return; #endif *force_migration = true; } static void register_walt_hooks(void) { register_trace_android_rvh_wake_up_new_task(android_rvh_wake_up_new_task, NULL); register_trace_android_rvh_update_cpu_capacity(android_rvh_update_cpu_capacity, NULL); register_trace_android_rvh_sched_cpu_starting(android_rvh_sched_cpu_starting, NULL); register_trace_android_rvh_sched_cpu_dying(android_rvh_sched_cpu_dying, NULL); register_trace_android_rvh_set_task_cpu(android_rvh_set_task_cpu, NULL); register_trace_android_rvh_new_task_stats(android_rvh_new_task_stats, NULL); register_trace_android_rvh_account_irq(android_rvh_account_irq, NULL); register_trace_android_rvh_flush_task(android_rvh_flush_task, NULL); register_trace_android_rvh_update_misfit_status(android_rvh_update_misfit_status, NULL); register_trace_android_rvh_enqueue_task(android_rvh_enqueue_task, NULL); register_trace_android_rvh_dequeue_task(android_rvh_dequeue_task, NULL); register_trace_android_rvh_try_to_wake_up(android_rvh_try_to_wake_up, NULL); register_trace_android_rvh_tick_entry(android_rvh_tick_entry, NULL); register_trace_android_vh_scheduler_tick(android_vh_scheduler_tick, NULL); register_trace_android_rvh_schedule(android_rvh_schedule, NULL); register_trace_android_rvh_cpu_cgroup_attach(android_rvh_cpu_cgroup_attach, NULL); register_trace_android_rvh_cpu_cgroup_online(android_rvh_cpu_cgroup_online, NULL); register_trace_android_rvh_sched_fork_init(android_rvh_sched_fork_init, NULL); register_trace_android_rvh_ttwu_cond(android_rvh_ttwu_cond, NULL); register_trace_android_rvh_sched_exec(android_rvh_sched_exec, NULL); register_trace_android_rvh_build_perf_domains(android_rvh_build_perf_domains, NULL); register_trace_cpu_frequency_limits(walt_cpu_frequency_limits, NULL); register_trace_android_rvh_do_sched_yield(walt_do_sched_yield, NULL); register_trace_android_rvh_before_do_sched_yield(walt_do_sched_yield_before, NULL); register_trace_android_rvh_update_thermal_stats(android_rvh_update_thermal_stats, NULL); register_trace_android_rvh_cgroup_force_kthread_migration( walt_cgroup_force_kthread_migration, NULL); } atomic64_t walt_irq_work_lastq_ws; bool walt_disabled = true; static int walt_init_stop_handler(void *data) { int cpu; struct task_struct *g, *p; struct walt_rq *wrq; int level; read_lock(&tasklist_lock); level = 0; for_each_possible_cpu(cpu) { if (level == 0) raw_spin_lock(&cpu_rq(cpu)->__lock); else raw_spin_lock_nested(&cpu_rq(cpu)->__lock, level); level++; } /* existing tasks get a demand of 0, including idle task */ for_each_process_thread(g, p) { init_new_task_load(p); } for_each_possible_cpu(cpu) { /* Create task members for idle thread */ init_new_task_load(cpu_rq(cpu)->idle); walt_flag_set(cpu_rq(cpu)->idle, WALT_IDLE_TASK_BIT, 1); } /* post walt_init_once() a new task will get a non zero demand */ walt_init_once(); for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); walt_sched_init_rq(rq); wrq = &per_cpu(walt_rq, cpu_of(rq)); wrq->window_start = tick_sched_clock; } atomic64_set(&walt_irq_work_lastq_ws, tick_sched_clock); create_default_coloc_group(); walt_disabled = false; for_each_possible_cpu(cpu) { raw_spin_unlock(&cpu_rq(cpu)->__lock); } read_unlock(&tasklist_lock); return 0; } static void walt_init_tg_pointers(void) { struct cgroup_subsys_state *css = &root_task_group.css; struct cgroup_subsys_state *top_css = css; rcu_read_lock(); 
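	/* walk the root task group's children and update WALT's task-group pointers */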
css_for_each_child(css, top_css) walt_update_tg_pointer(css); rcu_read_unlock(); } static void walt_remove_cpufreq_efficiencies_available(void) { struct cpufreq_policy *policy; struct walt_sched_cluster *cluster; for_each_sched_cluster(cluster) { policy = cpufreq_cpu_get(cluster_first_cpu(cluster)); if (policy) { policy->efficiencies_available = false; cpufreq_cpu_put(policy); } } } static void walt_init(struct work_struct *work) { static atomic_t already_inited = ATOMIC_INIT(0); struct root_domain *rd = cpu_rq(cpumask_first(cpu_active_mask))->rd; int i; might_sleep(); if (atomic_cmpxchg(&already_inited, 0, 1)) return; register_syscore_ops(&walt_syscore_ops); BUG_ON(alloc_related_thread_groups()); init_clusters(); walt_init_tg_pointers(); register_walt_hooks(); walt_fixup_init(); walt_lb_init(); walt_rt_init(); walt_cfs_init(); walt_halt_init(); walt_mvp_lock_ordering_init(); wait_for_completion_interruptible(&tick_sched_clock_completion); if (!rcu_access_pointer(rd->pd)) { /* * perf domains not properly configured. this is a must as * create_util_to_cost depends on rd->pd being properly * initialized. */ schedule_work(&rebuild_sd_work); wait_for_completion_interruptible(&rebuild_domains_completion); } walt_update_cluster_topology(); walt_remove_cpufreq_efficiencies_available(); walt_config(); walt_init_cycle_counter(); stop_machine(walt_init_stop_handler, NULL, NULL); /* * validate root-domain perf-domain is configured properly * to work with an asymmetrical soc. This is necessary * for load balance and task placement to work properly. * see walt_find_energy_efficient_cpu(), and * create_util_to_cost(). */ if (!rcu_access_pointer(rd->pd) && num_sched_clusters > 1) WALT_BUG(WALT_BUG_WALT, NULL, "root domain's perf-domain values not initialized rd->pd=%p.", rd->pd); walt_register_sysctl(); walt_register_debugfs(); input_boost_init(); core_ctl_init(); walt_boost_init(); waltgov_register(); i = match_string(sched_feat_names, __SCHED_FEAT_NR, "TTWU_QUEUE"); if (i >= 0) { static_key_disable(&sched_feat_keys[i]); sysctl_sched_features &= ~(1UL << i); } topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_ARCH, cpu_online_mask); enable_logging = !!sec_debug_is_enabled(); } static DECLARE_WORK(walt_init_work, walt_init); static void android_vh_update_topology_flags_workfn(void *unused, void *unused2) { schedule_work(&walt_init_work); } static void walt_devicetree_init(void) { struct device_node *np; int ret; np = of_find_node_by_name(NULL, "sched_walt"); if (!np) { pr_err("Failed to find node of sched_walt\n"); return; } ret = of_property_read_u32(np, "panic_on_walt_bug", &sysctl_panic_on_walt_bug); if (ret < 0) { pr_err("Failed to read panic_on_walt_bug property\n"); return; } } #define WALT_VENDOR_DATA_SIZE_TEST(wstruct, kstruct) \ BUILD_BUG_ON(sizeof(wstruct) > (sizeof(u64) * \ ARRAY_SIZE(((kstruct *)0)->android_vendor_data1))) static int walt_module_init(void) { /* compile time checks for vendor data size */ WALT_VENDOR_DATA_SIZE_TEST(struct walt_task_struct, struct task_struct); WALT_VENDOR_DATA_SIZE_TEST(struct walt_task_group, struct task_group); walt_devicetree_init(); register_trace_android_vh_update_topology_flags_workfn( android_vh_update_topology_flags_workfn, NULL); if (topology_update_done) schedule_work(&walt_init_work); walt_cpufreq_cycle_cntr_driver_register(); walt_gclk_cycle_counter_driver_register(); return 0; } module_init(walt_module_init); MODULE_LICENSE("GPL v2"); MODULE_SOFTDEP("pre: socinfo"); #if IS_ENABLED(CONFIG_SCHED_WALT_DEBUG) MODULE_SOFTDEP("pre: sched-walt-debug"); 
#endif #if IS_ENABLED(CONFIG_SEC_QC_SUMMARY) #include void sec_qc_summary_set_sched_walt_info(struct sec_qc_summary_data_apss *apss) { apss->aplpm.num_clusters = num_sched_clusters; apss->aplpm.p_cluster = virt_to_phys(sched_cluster); } EXPORT_SYMBOL(sec_qc_summary_set_sched_walt_info); #endif