scx_layered 1.1.0

A highly configurable, multi-layer, hybrid BPF/user-space scheduler built on sched_ext, a Linux kernel feature that allows kernel thread schedulers to be implemented in BPF and loaded dynamically. https://github.com/sched-ext/scx/tree/main
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.
#ifndef __INTF_H
#define __INTF_H

#include <stdbool.h>
/*
 * Fallback for the __kptr annotation used on pointer members below
 * (e.g. "struct bpf_cpumask __kptr *cpumask").
 *
 * When __KERNEL__ is defined, __kptr must already have been provided by an
 * earlier include; its absence is a hard build error. In every other build
 * it is defined to nothing so the annotated declarations still parse.
 */
#ifndef __kptr
#ifdef __KERNEL__
#error "__kptr not defined in the kernel"
#endif
#define __kptr
#endif

/*
 * Short fixed-width integer aliases for builds where __KERNEL__ is not
 * defined. When __KERNEL__ is defined these names are expected to come from
 * elsewhere and are not redefined here.
 *
 * The aliases are spelled with plain C types, so the widths implied by the
 * names hold only on ABIs where char/short/int/long long are 8/16/32/64 bits.
 */
#ifndef __KERNEL__
typedef unsigned char u8;
typedef unsigned short u16;
typedef int s32;
typedef unsigned u32;
typedef long long s64;
typedef unsigned long long u64;
#endif

/*
 * Compile-time constants shared by the declarations in this header: array
 * capacities, tuning parameters, the bit layout of DSQ IDs, time-unit
 * conversion factors and the scx_cmd encoding.
 *
 * Several values (e.g. DSQ_ID_SPECIAL_MASK, LO_FB_DSQ_BASE) do not fit in a
 * signed int, so this enum depends on the compiler accepting enumerators
 * wider than int.
 */
enum consts {
	CACHELINE_SIZE		= 64,
	MAX_CPUS_SHIFT		= 9,
	MAX_CPUS		= 1 << MAX_CPUS_SHIFT,	/* 512 */
	MAX_CPUS_U8		= MAX_CPUS / 8,		/* bytes needed for one bit per CPU */
	MAX_TASKS		= 131072,
	MAX_PATH		= 4096,
	MAX_NUMA_NODES		= 32,
	MAX_LLCS		= 64,
	MAX_COMM		= 16,
	MAX_LAYER_MATCH_ORS	= 32,
	/* 64 chars for user-provided name, 64 for possible template suffix. */
	MAX_LAYER_NAME		= 128,
	MAX_LAYERS		= 16,
	MAX_CGROUP_REGEXES	= 16,
	MAX_LAYER_WEIGHT	= 10000,
	MIN_LAYER_WEIGHT	= 1,
	DEFAULT_LAYER_WEIGHT	= 100,
	USAGE_HALF_LIFE		= 100000000,	/* 100ms */
	RUNTIME_DECAY_FACTOR	= 4,
	DUTY_CYCLE_SHIFT	= 20,		/* duty_cycle 1.0 = 1 << 20 */
	LAYER_LAT_DECAY_FACTOR	= 32,
	CLEAR_PREEMPTING_AFTER	= 10000000,	/* 10ms */

	/*
	 * DSQ ID bit layout, as implied by the masks below:
	 *   bits 31-30: special bits; HI_FB_DSQ_BASE and LO_FB_DSQ_BASE each
	 *               set exactly one of them
	 *   bits 29-16: layer field
	 *   bits 15-0:  LLC field
	 * NOTE(review): "FB" presumably stands for fallback - confirm.
	 */
	DSQ_ID_SPECIAL_MASK	= 0xc0000000,
	HI_FB_DSQ_BASE		= 0x40000000,
	LO_FB_DSQ_BASE		= 0x80000000,

	DSQ_ID_LAYER_SHIFT	= 16,
	DSQ_ID_LLC_MASK		= (1LLU << DSQ_ID_LAYER_SHIFT) - 1,		/* 0x0000ffff */
	DSQ_ID_LAYER_MASK	= ~DSQ_ID_LLC_MASK & ~DSQ_ID_SPECIAL_MASK,	/* 0x3fff0000 */

	/* XXX remove */
	MAX_CGRP_PREFIXES	= 32,

	/* Time-unit conversion factors. */
	NSEC_PER_USEC		= 1000ULL,
	NSEC_PER_MSEC		= (1000ULL * NSEC_PER_USEC),
	MSEC_PER_SEC		= 1000ULL,
	NSEC_PER_SEC		= NSEC_PER_MSEC * MSEC_PER_SEC,

	/*
	 * NOTE(review): the SCXCMD_OP_* values look like the opcodes carried in
	 * struct scx_cmd.opcode, and SCXCMD_PREFIX like the expected value of
	 * struct scx_cmd.prefix - confirm against the users of struct scx_cmd.
	 */
	SCXCMD_OP_NONE 		= 0,
	SCXCMD_OP_JOIN 		= 1,
	SCXCMD_OP_LEAVE 	= 2,

	SCXCMD_PREFIX		= 0x5C10,
	SCXCMD_COMLEN		= 13,		/* length of struct scx_cmd.cmd[] */
	MAX_GPU_PIDS 		= 100000,

};

/*
 * Sentinel values taken from the very top of the u64 range
 * (0xffff...ffff down to 0xffff...fffc).
 *
 * NOTE(review): presumably these share a u64 field with real membership
 * expiry timestamps, which is why they sit at the top of the range - confirm
 * against the code that stores them.
 *
 * The values exceed the range of int, so this enum depends on the compiler
 * accepting enumerators wider than int.
 */
enum layer_membership {
	MEMBER_NOEXPIRE		= (u64)-1,
	MEMBER_EXPIRED		= (u64)-2,
	MEMBER_CANTMATCH	= (u64)-3,
	MEMBER_INVALID		= (u64)-4,
};

/*
 * Compile-time consistency checks for enum consts.
 *
 * The body consists solely of _Static_assert() declarations, so the function
 * has no runtime effect; it exists to give the assertions a home. Each
 * assertion carries a distinct message so a build failure identifies exactly
 * which constraint was violated.
 */
static inline void ___consts_sanity_check___(void) {
	/* struct layer_node_ctx.llcs_to_drain uses u64 as LLC bitmap */
	_Static_assert(MAX_LLCS <= 64, "MAX_LLCS too high for u64 bitmap");
	/* the LLC field of a DSQ ID is the low DSQ_ID_LAYER_SHIFT bits */
	_Static_assert(MAX_LLCS <= (1 << DSQ_ID_LAYER_SHIFT),
		       "MAX_LLCS too high for DSQ ID LLC field");
	/* the layer field of a DSQ ID is the bits covered by DSQ_ID_LAYER_MASK */
	_Static_assert(MAX_LAYERS <= (DSQ_ID_LAYER_MASK >> DSQ_ID_LAYER_SHIFT) + 1,
		       "MAX_LAYERS too high for DSQ ID layer field");
	/* cgroup regex matching uses u64 as match bitmap */
	_Static_assert(MAX_CGROUP_REGEXES <= 64, "MAX_CGROUP_REGEXES too high for u64 bitmap");
}

/*
 * Kind of a layer. Values are implicitly numbered from 0 in declaration
 * order.
 * NOTE(review): presumably the value stored in struct layer.kind (declared
 * as int) - confirm.
 */
enum layer_kind {
	LAYER_KIND_OPEN,
	LAYER_KIND_GROUPED,
	LAYER_KIND_CONFINED,
};

/*
 * Usage categories. NR_LAYER_USAGES is the number of categories and is used
 * as the second dimension of cpu_ctx.layer_usages[][] and
 * cpu_ctx.layer_membw_agg[][].
 */
enum layer_usage {
	LAYER_USAGE_OWNED,
	LAYER_USAGE_OPEN,
	/*
	 * Alias of LAYER_USAGE_OPEN, not a category of its own.
	 * NOTE(review): presumably the last category included when summing
	 * total usage - confirm.
	 */
	LAYER_USAGE_SUM_UPTO = LAYER_USAGE_OPEN,

	LAYER_USAGE_PROTECTED,
	LAYER_USAGE_PROTECTED_PREEMPT,

	/* Count of categories above; must stay last. */
	NR_LAYER_USAGES,
};

/* Statistics */
/*
 * Indices of the global statistics counters. NR_GSTATS is the number of
 * counters and sizes cpu_ctx.gstats[]; it must stay the last enumerator.
 */
enum global_stat_id {
	GSTAT_EXCL_IDLE,
	GSTAT_EXCL_WAKEUP,
	GSTAT_HI_FB_EVENTS,
	GSTAT_HI_FB_USAGE,
	GSTAT_LO_FB_EVENTS,
	GSTAT_LO_FB_USAGE,
	GSTAT_FB_CPU_USAGE,
	GSTAT_ANTISTALL,
	GSTAT_SKIP_PREEMPT,
	GSTAT_FIXUP_VTIME,
	GSTAT_PREEMPTING_MISMATCH,
	NR_GSTATS,
};

/*
 * Indices of the per-layer statistics counters. NR_LSTATS is the number of
 * counters and sizes the second dimension of cpu_ctx.lstats[][]; it must
 * stay the last enumerator.
 */
enum layer_stat_id {
	LSTAT_SEL_LOCAL,
	LSTAT_ENQ_LOCAL,
	LSTAT_ENQ_WAKEUP,
	LSTAT_ENQ_EXPIRE,
	LSTAT_ENQ_REENQ,
	LSTAT_ENQ_DSQ,
	LSTAT_KEEP,
	LSTAT_MIN_EXEC,
	LSTAT_MIN_EXEC_NS,
	LSTAT_OPEN_IDLE,
	LSTAT_AFFN_VIOL,
	LSTAT_KEEP_FAIL_MAX_EXEC,
	LSTAT_KEEP_FAIL_BUSY,
	LSTAT_PREEMPT,
	LSTAT_PREEMPT_FIRST,
	LSTAT_PREEMPT_XLLC,
	LSTAT_PREEMPT_XNUMA,
	LSTAT_PREEMPT_IDLE,
	LSTAT_PREEMPT_FAIL,
	LSTAT_EXCL_COLLISION,
	LSTAT_EXCL_PREEMPT,
	LSTAT_YIELD,
	LSTAT_YIELD_IGNORE,
	LSTAT_MIGRATION,
	LSTAT_XNUMA_MIGRATION,
	LSTAT_XLLC_MIGRATION,
	LSTAT_XLLC_MIGRATION_SKIP,
	LSTAT_XLAYER_WAKE,
	LSTAT_XLAYER_REWAKE,
	LSTAT_LLC_DRAIN_TRY,
	LSTAT_LLC_DRAIN,
	LSTAT_SKIP_REMOTE_NODE,
	NR_LSTATS,
};

/*
 * Indices of the per-LLC, per-layer statistics. NR_LLC_LSTATS is the number
 * of entries and sizes the second dimension of llc_ctx.lstats[][]; it must
 * stay the last enumerator.
 */
enum llc_layer_stat_id {
	LLC_LSTAT_LAT,
	LLC_LSTAT_CNT,
	NR_LLC_LSTATS,
};

/*
 * CPU proximity map from closest to farthest, starts with self.
 *
 * NOTE(review): the *_end fields presumably are end offsets into cpus[]
 * marking where the same-core, same-LLC, same-node and whole-system ranges
 * stop - confirm whether they are inclusive or exclusive at the code that
 * fills them in.
 */
struct cpu_prox_map {
	u16			cpus[MAX_CPUS];
	u32			core_end;
	u32			llc_end;
	u32			node_end;
	u32			sys_end;
};

/*
 * Per-CPU context.
 *
 * Field order and types define the in-memory layout of this struct; do not
 * reorder or retype members without updating every user of the layout.
 */
struct cpu_ctx {
	s32			cpu;
	bool			current_preempt;
	bool			current_excl;
	bool			prev_excl;
	bool			next_excl;
	bool			yielding;
	bool			try_preempt_first;
	bool			is_big;
	struct task_struct	*preempting_task;
	u64			preempting_at;

	bool			protect_owned;
	bool			protect_owned_preempt;
	bool			running_owned;
	bool			running_open;
	bool			running_fallback;
	u64			used_at;
	bool			is_protected;

	/* Accumulators indexed by layer ID, then by enum layer_usage where 2-D. */
	u64			layer_usages[MAX_LAYERS][NR_LAYER_USAGES];
	u64			node_pinned_usage[MAX_LAYERS];
	u64			layer_membw_agg[MAX_LAYERS][NR_LAYER_USAGES];
	/* Indexed by enum global_stat_id. */
	u64			gstats[NR_GSTATS];
	/* Indexed by layer ID, then by enum layer_stat_id. */
	u64			lstats[MAX_LAYERS][NR_LSTATS];
	u64			layer_duty_sum[MAX_LAYERS];
	u64			ran_current_for;

	u64			usage;
	u64			usage_at_idle;

	/* NOTE(review): presumably built from HI_FB_DSQ_BASE / LO_FB_DSQ_BASE - confirm. */
	u64			hi_fb_dsq_id;
	u64			lo_fb_dsq_id;
	bool			in_open_layers;
	u32			layer_id;
	u32			llc_id;
	u32			node_id;
	u32			perf;

	u64			lo_fb_seq;
	u64			lo_fb_seq_at;
	u64			lo_fb_usage_base;

	/* Layer orderings, one array per combination of layer kind and preemption. */
	u32			ogp_layer_order[MAX_LAYERS];	/* open/grouped preempt */
	u32			ogn_layer_order[MAX_LAYERS];	/* open/grouped non-preempt */

	u32			op_layer_order[MAX_LAYERS];	/* open preempt */
	u32			on_layer_order[MAX_LAYERS];	/* open non-preempt */
	u32			gp_layer_order[MAX_LAYERS];	/* grouped preempt */
	u32			gn_layer_order[MAX_LAYERS];	/* grouped non-preempt */

	/* Other CPUs ordered by proximity to this one; see struct cpu_prox_map. */
	struct cpu_prox_map	prox_map;
};

/*
 * LLC proximity map; the LLC counterpart of struct cpu_prox_map.
 *
 * NOTE(review): presumably llcs[] is ordered closest to farthest and the
 * *_end fields are end offsets of the same-node and whole-system ranges -
 * confirm at the code that fills them in.
 */
struct llc_prox_map {
	u16			llcs[MAX_LLCS];
	u32			node_end;
	u32			sys_end;
};

/*
 * Per-LLC context.
 *
 * cpumask is a __kptr-annotated pointer to a struct bpf_cpumask; in builds
 * where this header supplies the empty __kptr fallback it is a plain pointer.
 */
struct llc_ctx {
	u32			id;
	struct bpf_cpumask __kptr *cpumask;
	u32			nr_cpus;
	/* Per-layer values, indexed by layer ID. */
	u64			vtime_now[MAX_LAYERS];
	u64			queued_runtime[MAX_LAYERS];
	u64			lo_fb_seq;
	/* Indexed by layer ID, then by enum llc_layer_stat_id. */
	u64			lstats[MAX_LAYERS][NR_LLC_LSTATS];
	struct llc_prox_map	prox_map;
};

/*
 * NUMA node proximity map; the node counterpart of struct cpu_prox_map.
 *
 * NOTE(review): presumably nodes[] is ordered closest to farthest and
 * sys_end is the end offset of the populated range - confirm at the code
 * that fills it in.
 */
struct node_prox_map {
	u16			nodes[MAX_NUMA_NODES];
	u32			sys_end;
};

/*
 * Per-NUMA-node context.
 *
 * Both cpumask members are __kptr-annotated pointers to struct bpf_cpumask.
 * The llcs[] / nr_llcs and empty_layer_ids[] / nr_empty_layer_ids pairs
 * mirror the same-named members of struct refresh_node_ctx_arg.
 */
struct node_ctx {
	u32			id;
	struct bpf_cpumask __kptr *cpumask;
	struct bpf_cpumask __kptr *unprotected_cpumask;
	u32			nr_llcs;
	u32			nr_cpus;
	/* NOTE(review): presumably only the first nr_llcs entries are valid - confirm. */
	u32			llcs[MAX_LLCS];
	/* NOTE(review): presumably only the first nr_empty_layer_ids entries are valid - confirm. */
	u32			empty_layer_ids[MAX_LAYERS];
	u32			nr_empty_layer_ids;
	struct node_prox_map	prox_map;
};

/*
 * Argument block whose array/count pairs mirror the same-named members of
 * struct node_ctx.
 *
 * NOTE(review): judging by the name, this is passed to an operation that
 * refreshes the node_ctx identified by node_id, with init flagging first-time
 * initialisation - confirm at the call site.
 */
struct refresh_node_ctx_arg {
	u32			node_id;
	u32			init;
	u32			empty_layer_ids[MAX_LAYERS];
	u32			nr_empty_layer_ids;
	u32			llcs[MAX_LLCS];
	u32			nr_llcs;
};

/*
 * Kinds of layer match rules. Values are implicitly numbered from 0 in
 * declaration order. NR_LAYER_MATCH_KINDS is the number of kinds and sizes
 * layer_match_ands.matches[]; it must stay the last enumerator.
 *
 * NOTE(review): presumably the value stored in struct layer_match.kind
 * (declared as int) - confirm.
 */
enum layer_match_kind {
	MATCH_CGROUP_PREFIX,
	MATCH_COMM_PREFIX,
	MATCH_PCOMM_PREFIX,
	MATCH_NICE_ABOVE,
	MATCH_NICE_BELOW,
	MATCH_NICE_EQUALS,
	MATCH_USER_ID_EQUALS,
	MATCH_GROUP_ID_EQUALS,
	MATCH_PID_EQUALS,
	MATCH_PPID_EQUALS,
	MATCH_TGID_EQUALS,
	MATCH_NSPID_EQUALS,
	MATCH_NS_EQUALS,
	MATCH_SCXCMD_JOIN,
	MATCH_IS_GROUP_LEADER,
	MATCH_IS_KTHREAD,
	MATCH_USED_GPU_TID,
	MATCH_USED_GPU_PID,
	MATCH_AVG_RUNTIME,
	MATCH_CGROUP_SUFFIX,
	MATCH_CGROUP_CONTAINS,
	MATCH_CGROUP_REGEX,
	MATCH_HINT_EQUALS,
	MATCH_SYSTEM_CPU_UTIL_BELOW,
	MATCH_DSQ_INSERT_BELOW,
	MATCH_NUMA_NODE,

	NR_LAYER_MATCH_KINDS,
};

/*
 * A single match rule.
 *
 * The struct carries an operand member for many different rule kinds side by
 * side (not in a union), so each instance is large: the three MAX_PATH-sized
 * cgroup buffers alone take 12 KiB.
 *
 * NOTE(review): presumably kind holds an enum layer_match_kind value that
 * selects which operand member is meaningful - confirm.
 */
struct layer_match {
	int		kind;
	char		cgroup_prefix[MAX_PATH];
	char		cgroup_suffix[MAX_PATH];
	char		cgroup_substr[MAX_PATH];
	u32		cgroup_regex_id;
	char		comm_prefix[MAX_COMM];
	char		pcomm_prefix[MAX_COMM];
	int		nice;
	u32		user_id;
	u32		group_id;
	u32		pid;
	u32		ppid;
	u32		tgid;
	u64		nsid;
	bool		is_group_leader;
	bool		is_kthread;
	bool		used_gpu_tid;
	bool		used_gpu_pid;
	bool		exclude;
	u64		min_avg_runtime_us;
	u64		max_avg_runtime_us;
	u64		hint;
	u64		system_cpu_util_below;	/* ratio * 10000 */
	u64		dsq_insert_below;	/* ratio * 10000 */
	u32		numa_node_id;
};

/*
 * A group of match rules with capacity NR_LAYER_MATCH_KINDS.
 *
 * NOTE(review): per the name, the rules are presumably ANDed together and
 * nr_match_ands is the number of populated entries in matches[] - confirm.
 */
struct layer_match_ands {
	struct layer_match	matches[NR_LAYER_MATCH_KINDS];
	int			nr_match_ands;
};

/*
 * Layer growth algorithms. Values are implicitly numbered from 0 in
 * declaration order, so new entries should be appended to keep existing
 * values stable.
 *
 * NOTE(review): presumably the value stored in struct layer.growth_algo
 * (declared as int) - confirm.
 */
enum layer_growth_algo {
	GROWTH_ALGO_STICKY,
	GROWTH_ALGO_LINEAR,
	GROWTH_ALGO_REVERSE,
	GROWTH_ALGO_RANDOM,
	GROWTH_ALGO_TOPO,
	GROWTH_ALGO_ROUND_ROBIN,
	GROWTH_ALGO_BIG_LITTLE,
	GROWTH_ALGO_LITTLE_BIG,
	GROWTH_ALGO_NODE_SPREAD,
	GROWTH_ALGO_NODE_SPREAD_REVERSE,
	GROWTH_ALGO_NODE_SPREAD_RANDOM,
	GROWTH_ALGO_CPUSET_SPREAD,
	GROWTH_ALGO_CPUSET_SPREAD_REVERSE,
	GROWTH_ALGO_CPUSET_SPREAD_RANDOM,
	GROWTH_ALGO_RANDOM_TOPO,
	GROWTH_ALGO_STICKY_DYNAMIC,
};

/*
 * Task placement policy of a layer; this is the type of
 * struct layer.task_place. Values are implicitly numbered from 0 in
 * declaration order.
 */
enum layer_task_place {
	PLACEMENT_STD,
	PLACEMENT_STICK,
	PLACEMENT_FLOAT,
};

/*
 * One bucket of cross-NUMA state; struct layer_node_ctx holds an array of
 * MAX_NUMA_NODES of these.
 *
 * NOTE(review): the members look like a token bucket (signed balance, time of
 * last refill, refill rate) - confirm against the code that updates them.
 */
struct xnuma_bucket {
	s64			tokens;
	u64			last_refill_ts;
	u64			rate;		/* duty_cycle units per second, set by userspace */
};

/*
 * Per-layer, per-NUMA-node state; struct layer holds an array of
 * MAX_NUMA_NODES of these.
 */
struct layer_node_ctx {
	u32			nr_cpus;
	u64			nr_pinned_tasks;
	/* u64 used as an LLC bitmap, which is why MAX_LLCS must not exceed 64 */
	u64			llcs_to_drain;
	u32			llc_drain_cnt;
	bool			xnuma_is_mig_src;
	/* NOTE(review): presumably indexed by the other NUMA node's ID - confirm. */
	struct xnuma_bucket xnuma[MAX_NUMA_NODES];
};

/*
 * Definition and state of one layer.
 *
 * Field order and types define the in-memory layout of this struct; do not
 * reorder or retype members without updating every user of the layout.
 * The embedded matches[] array makes each instance very large (each
 * struct layer_match contains three MAX_PATH-sized buffers).
 */
struct layer {
	/*
	 * NOTE(review): per the names, the entries of matches[] are presumably
	 * ORed together and nr_match_ors is the number of populated entries -
	 * confirm.
	 */
	struct layer_match_ands	matches[MAX_LAYER_MATCH_ORS];
	unsigned int		nr_match_ors;
	unsigned int		id;
	u64			min_exec_ns;
	u64			max_exec_ns;
	u64			yield_step_ns;
	u64			slice_ns;
	bool			fifo;
	u32			weight;
	u64			disallow_open_after_ns;
	u64			disallow_preempt_after_ns;
	u64			xllc_mig_min_ns;

	/* NOTE(review): presumably an enum layer_kind value - confirm. */
	int			kind;
	bool			preempt;
	bool			preempt_first;
	bool			excl;
	bool			has_cpuset;
	bool			skip_remote_node;
	bool			prev_over_idle_core;
	/* NOTE(review): presumably an enum layer_growth_algo value - confirm. */
	int			growth_algo;

	u64			nr_tasks;

	u64			cpus_seq;
	bool			check_no_idle;
	u32			perf;
	u64			refresh_cpus;
	/* MAX_CPUS_U8 bytes = MAX_CPUS bits; presumably one bit per CPU - confirm. */
	u8			cpus[MAX_CPUS_U8];

	u32			nr_cpus;
	u32			nr_llc_cpus[MAX_LLCS];

	/* One entry per NUMA node. */
	struct layer_node_ctx	node[MAX_NUMA_NODES];

	enum layer_task_place   task_place;

	char			name[MAX_LAYER_NAME];
	bool			is_protected;
	bool			periodically_refresh;
	/* Same size as cpus[]: MAX_CPUS bits. */
	u8			cpuset[MAX_CPUS_U8];
	u64			member_expire_ms;
};

/*
 * Packed command record: 2-byte prefix, 1-byte opcode and SCXCMD_COMLEN (13)
 * payload bytes, 16 bytes in total with no padding.
 *
 * NOTE(review): 16 equals MAX_COMM, so this is presumably overlaid on a task
 * comm buffer, with prefix expected to equal SCXCMD_PREFIX and opcode one of
 * SCXCMD_OP_* - confirm against the code that parses it.
 */
struct scx_cmd {
	u16			prefix;
	u8 			opcode;
	u8			cmd[SCXCMD_COMLEN];
} __attribute__((packed));

/*
 * Layer ID plus two thresholds whose names match the
 * system_cpu_util_below / dsq_insert_below members of struct layer_match.
 *
 * NOTE(review): per the name, presumably the value a hint is mapped to -
 * confirm where it is stored and looked up.
 */
struct hint_layer_info {
	u32			layer_id;
	u64			system_cpu_util_below;	/* ratio * 10000, u64::MAX = disabled */
	u64			dsq_insert_below;	/* ratio * 10000, u64::MAX = disabled */
};

#endif /* __INTF_H */