scx_lavd 1.0.13

A Latency-criticality Aware Virtual Deadline (LAVD) scheduler based on sched_ext, which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. https://github.com/sched-ext/scx/tree/main
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2023, 2024 Valve Corporation.
 * Author: Changwoo Min <changwoo@igalia.com>
 */

/*
 * To be included to the main.bpf.c
 */

/*
 * CPU topology
 */
static u64		LAVD_AP_LOW_UTIL;
static bool		have_turbo_core;
static bool		have_little_core;

const volatile u16	cpu_order_performance[LAVD_CPU_ID_MAX]; /* CPU preference order for performance and balanced mode */
const volatile u16	cpu_order_powersave[LAVD_CPU_ID_MAX]; /* CPU preference order for powersave mode */
const volatile u16	cpu_capacity[LAVD_CPU_ID_MAX]; /* CPU capacity based on 1024 */
const volatile u8	cpu_big[LAVD_CPU_ID_MAX]; /* Is a CPU a big core? */
const volatile u8	cpu_turbo[LAVD_CPU_ID_MAX]; /* Is a CPU a turbo core? */

static int		nr_cpdoms; /* number of compute domains */
struct cpdom_ctx	cpdom_ctxs[LAVD_CPDOM_MAX_NR]; /* contexts for compute domains */
private(LAVD) struct bpf_cpumask cpdom_cpumask[LAVD_CPDOM_MAX_NR]; /* online CPU mask for each compute domain */


/*
 * Big core's compute ratio among currently active cores scaled by 1024.
 */
static u32		cur_big_core_scale;

/*
 * Big core's compute ratio when all cores are active scaled by 1024.
 */
static u32		default_big_core_scale;

/*
 * Statistics
 */
volatile int		power_mode;
volatile u64		last_power_mode_clk;
volatile u64		performance_mode_ns;
volatile u64		balanced_mode_ns;
volatile u64		powersave_mode_ns;

static bool is_perf_cri(struct task_ctx *taskc)
{
	if (!have_little_core)
		return true;

	if (READ_ONCE(taskc->on_big) && READ_ONCE(taskc->on_little))
		return taskc->perf_cri >= sys_stat.thr_perf_cri;
	return READ_ONCE(taskc->on_big);
}

static bool clear_cpu_periodically(u32 cpu, struct bpf_cpumask *cpumask)
{
	u32 clear;

	/*
	 * If the CPU is on, we clear the bit once every four times
	 * (LAVD_CC_CPU_PIN_INTERVAL_DIV). Hence, the bit will be
	 * probabilistically cleared once every 100 msec (4 * 25 msec).
	 */
	clear = !(bpf_get_prandom_u32() % LAVD_CC_CPU_PIN_INTERVAL_DIV);
	if (clear)
		bpf_cpumask_clear_cpu(cpu, cpumask);

	return clear;
}

static const volatile u16 *get_cpu_order(void)
{
	/*
	 * Decide a cpu order to use according to its power mode.
	 */
	if (is_powersave_mode)
		return cpu_order_powersave;
	else
		return cpu_order_performance;
}

static int calc_nr_active_cpus(void)
{
	const volatile u16 *cpu_order;
	u64 req_cap, cap_cpu, cap_sum = 0;
	u16 cpu_id, i;

	/*
	 * Calculate the required compute capacity:
	 *
	 * Scaled utilization assumes all the CPUs are the fastest ones
	 * running at the highest frequency. So the required compute capacity
	 * given the scaled utilization is defined as follows:
	 *
	 * req_cpacity = total_compute_capaticy * scaled_utilization
	 *             = (nr_cpus_onln * 1024) * (avg_sc_util / 1024)
	 *             = nr_cpus_only * scaled_utilization
	 */
	req_cap = nr_cpus_onln * sys_stat.avg_sc_util;

	/*
	 * Fill the required compute capacity in the CPU preference order,
	 * utilizing each CPU in a certain % (LAVD_CC_PER_CORE_UTIL or
	 * LAVD_CC_PER_CORE_SHIFT).
	 */
	cpu_order = get_cpu_order();
	bpf_for(i, 0, nr_cpu_ids) {
		if (i >= LAVD_CPU_ID_MAX)
			return nr_cpu_ids;

		cpu_id = cpu_order[i];
		if (cpu_id >= LAVD_CPU_ID_MAX)
			return nr_cpu_ids;

		cap_cpu = cpu_capacity[cpu_id];
		cap_sum += cap_cpu >> LAVD_CC_PER_CORE_SHIFT;
		if (cap_sum >= req_cap)
			return i+1;
	}

	/* Should not be here. */
	return nr_cpu_ids;
}

static void do_core_compaction(void)
{
	const volatile u16 *cpu_order = get_cpu_order();
	struct cpu_ctx *cpuc;
	struct bpf_cpumask *active, *ovrflw;
	struct cpdom_ctx *cpdomc;
	int nr_active, nr_active_old, cpu, i;
	u32 sum_capacity = 0, big_capacity = 0, nr_active_cpdoms = 0;
	bool clear;
	u64 cpdom_id;

	bpf_rcu_read_lock();

	/*
	 * Prepare cpumasks.
	 */
	active = active_cpumask;
	ovrflw = ovrflw_cpumask;
	if (!active || !ovrflw) {
		scx_bpf_error("Failed to prepare cpumasks.");
		goto unlock_out;
	}

	/*
	 * Assign active and overflow cores
	 */
	nr_active_old = sys_stat.nr_active;
	nr_active = calc_nr_active_cpus();
	bpf_for(i, 0, nr_cpu_ids) {
		if (i >= LAVD_CPU_ID_MAX)
			break;

		/*
		 * Skip offline cpu
		 */
		cpu = cpu_order[i];
		cpuc = get_cpu_ctx_id(cpu);
		if (!cpuc || !cpuc->is_online) {
			bpf_cpumask_clear_cpu(cpu, active);
			bpf_cpumask_clear_cpu(cpu, ovrflw);
			continue;
		}

		/*
		 * Assign an online cpu to active and overflow cpumasks
		 */
		if (i < nr_active) {
			bpf_cpumask_set_cpu(cpu, active);
			bpf_cpumask_clear_cpu(cpu, ovrflw);
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);

			/*
			 * Accumulate the capacity of active CPUs and
			 * increase the number of active CPUs.
			 */
			cpdomc = MEMBER_VPTR(cpdom_ctxs, [cpuc->cpdom_id]);
			if (cpdomc) {
				cpdomc->cap_sum_temp += cpuc->capacity;
				cpdomc->nr_acpus_temp++;
			}

			/*
			 * Calculate big capacity ratio among active cores.
			 */
			sum_capacity += cpuc->capacity;
			if (cpuc->big_core)
				big_capacity += cpuc->capacity;
		} else if (i < nr_active_old) {
			bpf_cpumask_clear_cpu(cpu, active);
			bpf_cpumask_clear_cpu(cpu, ovrflw);
		} else {
			/*
			 * This is the case when a CPU belongs to the
			 * overflow set even though that CPU was not an
			 * overflow set initially. This can happen only
			 * when a pinned userspace task ran on this
			 * CPU. In this case, we keep the CPU in an
			 * overflow set since the CPU will be used
			 * anyway for the task. This will promote equal
			 * use of all used CPUs, lowering the energy
			 * consumption by avoiding a few CPUs being
			 * turbo-boosted. Hence, we do not clear the
			 * overflow cpumask here for a while,
			 * approximately for LAVD_CC_CPU_PIN_INTERVAL.
			 */
			bpf_cpumask_clear_cpu(cpu, active);
			clear = clear_cpu_periodically(cpu, ovrflw);
			if (!clear)
				scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
		}
	}

	cur_big_core_scale = (big_capacity << LAVD_SHIFT) / sum_capacity;
	sys_stat.nr_active = nr_active;

	/*
	 * Update nr_active_cpus and cap_sum_active_cpus.
	 */
	bpf_for(cpdom_id, 0, nr_cpdoms) {
		if (cpdom_id >= LAVD_CPDOM_MAX_NR)
			break;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [cpdom_id]);
		if (!cpdomc)
			continue;
		WRITE_ONCE(cpdomc->nr_active_cpus, cpdomc->nr_acpus_temp);
		WRITE_ONCE(cpdomc->nr_acpus_temp, 0);
		WRITE_ONCE(cpdomc->cap_sum_active_cpus, cpdomc->cap_sum_temp);
		WRITE_ONCE(cpdomc->cap_sum_temp, 0);

		if (cpdomc->nr_active_cpus)
			nr_active_cpdoms++;
	}
	sys_stat.nr_active_cpdoms = nr_active_cpdoms;

unlock_out:
	bpf_rcu_read_unlock();
}

static void update_power_mode_time(void)
{
	u64 now = scx_bpf_now();
	u64 delta;

	if (last_power_mode_clk == 0)
		last_power_mode_clk = now;

	delta = time_delta(now, last_power_mode_clk);
	last_power_mode_clk = now;

	switch (power_mode) {
	case LAVD_PM_PERFORMANCE:
		__sync_fetch_and_add(&performance_mode_ns, delta);
		break;
	case LAVD_PM_BALANCED:
		__sync_fetch_and_add(&balanced_mode_ns, delta);
		break;
	case LAVD_PM_POWERSAVE:
		__sync_fetch_and_add(&powersave_mode_ns, delta);
		break;
	}
}


static int do_set_power_profile(s32 pm, int util)
{
	/*
	 * Skip setting the mode if already in the same mode.
	 */
	if (power_mode == pm)
		return 0;

	/*
	 * Update power mode time
	 */
	update_power_mode_time();
	power_mode = pm;

	/*
	 * Change the power mode.
	 */
	switch (pm) {
	case LAVD_PM_PERFORMANCE:
		no_core_compaction = true;
		is_powersave_mode = false;

		/*
		 * Since the core compaction becomes off, we need to
		 * reinitialize the active and overflow cpumask for performance
		 * mode.
		 *
		 * Note that a verifier in an old kernel does not allow calling
		 * bpf_cpumask_set_cpu(), so we defer the actual update to our
		 * timer handler, update_sys_stat().
		 */
		reinit_cpumask_for_performance = true;
		debugln("Set the scheduler's power profile to performance mode: %d", util);
		break;
	case LAVD_PM_BALANCED:
		no_core_compaction = false;
		is_powersave_mode = false;
		reinit_cpumask_for_performance = false;
		debugln("Set the scheduler's power profile to balanced mode: %d", util);
		break;
	case LAVD_PM_POWERSAVE:
		no_core_compaction = false;
		is_powersave_mode = true;
		reinit_cpumask_for_performance = false;
		debugln("Set the scheduler's power profile to power-save mode: %d", util);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int do_autopilot(void)
{
	/*
	 * If the CPU utiulization is very low (say <= 5%), it means high
	 * performance is not required. We run the scheduler in powersave mode
	 * to save energy consumption.
	 */
	if (sys_stat.avg_util <= LAVD_AP_LOW_UTIL)
		return do_set_power_profile(LAVD_PM_POWERSAVE, sys_stat.avg_util);

	/*
	 * If the CPU utiulization is moderate (say > 5%, <= 30%), we run the
	 * scheduler in balanced mode. Actually, balanced mode can save energy
	 * consumption only under moderate CPU load.
	 */
	if (sys_stat.avg_util <= LAVD_AP_HIGH_UTIL)
		return do_set_power_profile(LAVD_PM_BALANCED, sys_stat.avg_util);

	/*
	 * If the CPU utilization is high enough (say > 30%), we run the
	 * scheduler in performance mode. The system indeed needs perrformance
	 * also there is little energy benefit even under balanced mode anyway.
	 */
	return do_set_power_profile(LAVD_PM_PERFORMANCE, sys_stat.avg_util);
}

static void update_thr_perf_cri(void)
{
	u32 little_core_scale, delta, diff, thr;

	if (no_core_compaction || !have_little_core)
		cur_big_core_scale = default_big_core_scale;

	/*
	 * If all active cores are big, all tasks should run on the big cores.
	 */
	if (cur_big_core_scale == LAVD_SCALE) {
		sys_stat.thr_perf_cri = 0;
		return;
	}

	/*
	 * We approximate the distribution of performance criticality of tasks
	 * using min, avg, and max performance criticality of a given period.
	 *
	 *   min_perf_cri
	 *   |         avg_perf_cri
	 *   |         |                       max_perf_cri
	 *   |         |                       |
	 *   <--------><----------------------->
	 *
	 * The half of compute capacity should be assigned to the below average
	 * tasks (< avg_perf_cri), and the other half should assigned to the
	 * above average tasks (>= avg_perf_cri).
	 *
	 *   <------------><------------------->
	 *   |            |                    |
	 *   |            |                    1024
	 *   |            1024 - big_core_scale (i.e., little_core_scale)
	 *   0
	 */
	little_core_scale = LAVD_SCALE - cur_big_core_scale;
	if (little_core_scale < p2s(50)) {
		/*
		 *   min_perf_cri
		 *   |         avg_perf_cri
		 *   |         |                       max_perf_cri
		 *   |         |                       |
		 *   <--------><----------------------->
		 *
		 *   <-///-><-------------------------->
		 *   |     |                           |
		 *   |     |                           1024
		 *   |     little_core_scale
		 *   0
		 */
		delta = sys_stat.avg_perf_cri - sys_stat.min_perf_cri;
		diff = (delta * little_core_scale) >> LAVD_SHIFT;
		thr = diff + sys_stat.min_perf_cri;
	}
	else {
		/*
		 *   min_perf_cri
		 *   |         avg_perf_cri
		 *   |         |                       max_perf_cri
		 *   |         |                       |
		 *   <--------><----------------------->
		 *
		 *   <---------------------><-////////->
		 *   |                     |           |
		 *   |                     |           1024
		 *   |                     little_core_scale
		 *   0
		 */
		delta = sys_stat.max_perf_cri - sys_stat.avg_perf_cri;
		diff = (delta * cur_big_core_scale) >> LAVD_SHIFT;
		thr = sys_stat.max_perf_cri - diff;
	}

	sys_stat.thr_perf_cri = thr;
}

static int reinit_active_cpumask_for_performance(void)
{
	struct cpu_ctx *cpuc;
	struct bpf_cpumask *active, *ovrflw;
	const struct cpumask *online_cpumask;
	struct cpdom_ctx *cpdomc;
	u64 dsq_id;
	u32 nr_active_cpdoms = 0;
	int cpu, err = 0;

	barrier();
	bpf_rcu_read_lock();

	/*
	 * Prepare cpumasks.
	 */
	active  = active_cpumask;
	ovrflw  = ovrflw_cpumask;
	if (!active || !ovrflw) {
		scx_bpf_error("Failed to prepare cpumasks.");
		err = -ENOMEM;
		goto unlock_out;
	}


	/*
	 * Once core compaction becomes off in performance mode, reinitialize
	 * active/overflow cpumasks to reflect the mode change.
	 * In an asymmetric system, big cores belong to the active set but
	 * little cores the overflow set to prefer big cores for performance.
	 * In a symmetric system, all online CPUs belong to the active set.
	 */
	if (have_little_core) {
		bpf_for(cpu, 0, nr_cpu_ids) {
			cpuc = get_cpu_ctx_id(cpu);
			if (!cpuc)
				continue;
			if (!cpuc->is_online) {
				bpf_cpumask_clear_cpu(cpu, active);
				bpf_cpumask_clear_cpu(cpu, ovrflw);
				continue;
			}

			if (cpuc->big_core) {
				bpf_cpumask_set_cpu(cpu, active);
				bpf_cpumask_clear_cpu(cpu, ovrflw);
			} else {
				bpf_cpumask_set_cpu(cpu, ovrflw);
				bpf_cpumask_clear_cpu(cpu, active);
			}
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);

			cpdomc = MEMBER_VPTR(cpdom_ctxs, [cpuc->cpdom_id]);
			if (cpdomc) {
				cpdomc->nr_acpus_temp++;
				cpdomc->cap_sum_temp += cpuc->capacity;
			}
		}
	} else {
		online_cpumask = scx_bpf_get_online_cpumask();
		nr_cpus_onln = bpf_cpumask_weight(online_cpumask);
		bpf_cpumask_copy(active, online_cpumask);
		scx_bpf_put_cpumask(online_cpumask);

		bpf_cpumask_clear(ovrflw);

		bpf_for(cpu, 0, nr_cpu_ids) {
			cpuc = get_cpu_ctx_id(cpu);
			if (!cpuc || !cpuc->is_online)
				continue;

			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);

			cpdomc = MEMBER_VPTR(cpdom_ctxs, [cpuc->cpdom_id]);
			if (cpdomc) {
				cpdomc->nr_acpus_temp++;
				cpdomc->cap_sum_temp += cpuc->capacity;
			}
		}

	}

	/*
	 * Update nr_active_cpus and cap_sum_active_cpus.
	 */
	bpf_for(dsq_id, 0, nr_cpdoms) {
		if (dsq_id >= LAVD_CPDOM_MAX_NR)
			break;

		cpdomc = MEMBER_VPTR(cpdom_ctxs, [dsq_id]);
		WRITE_ONCE(cpdomc->nr_active_cpus, cpdomc->nr_acpus_temp);
		WRITE_ONCE(cpdomc->nr_acpus_temp, 0);
		WRITE_ONCE(cpdomc->cap_sum_active_cpus, cpdomc->cap_sum_temp);
		WRITE_ONCE(cpdomc->cap_sum_temp, 0);

		if (cpdomc->nr_active_cpus)
			nr_active_cpdoms++;
	}
	sys_stat.nr_active = nr_cpus_onln;
	sys_stat.nr_active_cpdoms = nr_active_cpdoms;

unlock_out:
	bpf_rcu_read_unlock();
	return err;
}

static void update_cpuperf_target(struct cpu_ctx *cpuc)
{
	u32 util, max_util, cpuperf_target;

	/*
	 * The CPU utilization decides the frequency. The bigger one between
	 * the running average and the recent utilization is used to respond
	 * quickly upon load spikes. When the utilization is greater than
	 * LAVD_CPU_UTIL_MAX_FOR_CPUPERF (85%), ceil to 100%.
	 */
	if (!no_freq_scaling) {
		max_util = max(cpuc->avg_util, cpuc->cur_util);
		util = (max_util < LAVD_CPU_UTIL_MAX_FOR_CPUPERF) ? max_util
								  : LAVD_SCALE;
		cpuperf_target = (util * SCX_CPUPERF_ONE) >> LAVD_SHIFT;
	} else
		cpuperf_target = SCX_CPUPERF_ONE;

	/*
	 * Update the performance target once it changes.
	 */
	if (cpuc->cpuperf_cur != cpuperf_target) {
		scx_bpf_cpuperf_set(cpuc->cpu_id, cpuperf_target);
		cpuc->cpuperf_cur = cpuperf_target;
	}
}

static void reset_cpuperf_target(struct cpu_ctx *cpuc)
{
	if (!no_freq_scaling) {
		cpuc->cpuperf_cur = 0;
	}
}

static u16 get_cpuperf_cap(s32 cpu)
{
	const volatile u16 *cap;

	cap = MEMBER_VPTR(cpu_capacity, [cpu]);
	if (cap)
		return *cap;

	debugln("Infeasible CPU id: %d", cpu);
	return 0;
}

static u64 scale_cap_freq(u64 dur, s32 cpu)
{
	u64 cap, freq, scaled_dur;

	/*
	 * Scale the duration by CPU capacity and frequency, so calculate
	 * capacity-invariant and frequency-invariant time duration.
	 */
	cap = get_cpuperf_cap(cpu);
	freq = scx_bpf_cpuperf_cur(cpu);
	scaled_dur = (dur * cap * freq) >> (LAVD_SHIFT * 2);

	return scaled_dur;
}

static void init_autopilot_low_util(void)
{
	if (nr_cpus_big < nr_cpus_onln) {
		/*
		 * When there are little cores, we move up to the balanced mode
		 * if one little core is fully utilized.
		 */
		LAVD_AP_LOW_UTIL = LAVD_SCALE / nr_cpus_onln;
	}
	else {
		/*
		 * When there are only big cores, we move up to the balanced
		 * mode if two big cores are fully utilized.
		 */
		LAVD_AP_LOW_UTIL = (2 * LAVD_SCALE) / nr_cpus_onln;
	}
}

SEC("syscall")
int set_power_profile(struct power_arg *input)
{
	return do_set_power_profile(input->power_mode, 0);
}