scxtop 1.1.0

sched_ext scheduler tool for observability
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
// Copyright (c) Meta Platforms, Inc. and affiliates.
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2.

use super::protocol::{McpPrompt, McpPromptArgument};
use anyhow::{anyhow, Result};
use serde_json::{json, Value};

/// Registry of MCP prompt templates for scheduler observability workflows.
///
/// Stateless: all prompt content is generated on demand by the methods on
/// this type, so construction is free and the value can be shared freely.
pub struct McpPrompts {}

impl Default for McpPrompts {
    fn default() -> Self {
        Self::new()
    }
}

impl McpPrompts {
    pub fn new() -> Self {
        Self {}
    }

    /// Enumerate every prompt this server offers, as the payload of an MCP
    /// `prompts/list` response: `{ "prompts": [...] }`.
    pub fn list(&self) -> Value {
        // Every prompt here takes at most one optional argument, so a small
        // helper keeps the declarations compact.
        let optional_arg = |name: &str, description: &str| {
            Some(vec![McpPromptArgument {
                name: name.to_string(),
                description: Some(description.to_string()),
                required: false,
            }])
        };

        let prompts = vec![
            McpPrompt {
                name: "analyze_scheduler_performance".to_string(),
                description: Some(
                    "Comprehensive scheduler performance analysis workflow".to_string(),
                ),
                arguments: optional_arg(
                    "focus_area",
                    "Area to focus on: latency, throughput, or balance",
                ),
            },
            McpPrompt {
                name: "debug_high_latency".to_string(),
                description: Some("Debug high scheduling latency issues".to_string()),
                arguments: optional_arg("pid", "Process ID to investigate (optional)"),
            },
            McpPrompt {
                name: "analyze_cpu_imbalance".to_string(),
                description: Some("Analyze CPU load imbalance and migration patterns".to_string()),
                arguments: None,
            },
            McpPrompt {
                name: "investigate_scheduler_behavior".to_string(),
                description: Some("Deep dive into scheduler behavior and policies".to_string()),
                arguments: optional_arg(
                    "scheduler_name",
                    "Specific scheduler to analyze (optional)",
                ),
            },
            McpPrompt {
                name: "summarize_system".to_string(),
                description: Some("Comprehensive system and scheduler summary".to_string()),
                arguments: None,
            },
        ];

        json!({ "prompts": prompts })
    }

    /// Resolve a prompt by name and render it with the supplied parameters.
    ///
    /// `params` may carry an `"arguments"` JSON object; it is forwarded to the
    /// prompt builders that accept arguments. Unknown names yield an error.
    pub fn get(&self, name: &str, params: &Value) -> Result<Value> {
        // Pull out the optional `"arguments"` object, if the request has one.
        let arguments = match params.get("arguments") {
            Some(value) => value.as_object(),
            None => None,
        };

        match name {
            "analyze_scheduler_performance" => self.prompt_analyze_scheduler_performance(arguments),
            "debug_high_latency" => self.prompt_debug_high_latency(arguments),
            "analyze_cpu_imbalance" => self.prompt_analyze_cpu_imbalance(),
            "investigate_scheduler_behavior" => {
                self.prompt_investigate_scheduler_behavior(arguments)
            }
            "summarize_system" => self.prompt_summarize_system(),
            other => Err(anyhow!("Unknown prompt: {}", other)),
        }
    }

    /// Build the "analyze_scheduler_performance" prompt.
    ///
    /// Selects one of four canned workflow texts based on the optional
    /// `focus_area` argument ("latency", "throughput", "balance", or anything
    /// else for the general workflow) and wraps it in an MCP `prompts/get`
    /// response with a single user message.
    fn prompt_analyze_scheduler_performance(
        &self,
        arguments: Option<&serde_json::Map<String, Value>>,
    ) -> Result<Value> {
        // A missing or non-string `focus_area` falls back to "general".
        let focus_area = arguments
            .and_then(|a| a.get("focus_area"))
            .and_then(|v| v.as_str())
            .unwrap_or("general");

        let workflow = match focus_area {
            // Latency-focused workflow: DSQ latencies, per-CPU delays,
            // wakeup-to-switch tracking.
            "latency" => {
                r#"# Scheduler Latency Analysis Workflow

## 1. Check Current Scheduler
First, identify which scheduler is running:
- Read resource: `scheduler://current`
- Check if it's a sched_ext scheduler or the default CFS

## 2. Examine Dispatch Queue Latencies
For sched_ext schedulers, check DSQ latencies:
- Read resource: `stats://aggregated/dsq`
- Look for high `dsq_lat_us` values
- Identify queues with long wait times

## 3. Analyze Per-CPU Scheduling Delays
- Read resource: `stats://aggregated/cpu`
- Compare scheduling latency across CPUs
- Identify CPUs with unusually high latency

## 4. Check Process-Level Metrics
- Read resource: `stats://aggregated/process`
- Sort by scheduling latency or wait time
- Identify processes experiencing high latency

## 5. Monitor Real-Time Events (Daemon Mode)
If running in daemon mode:
- Subscribe to: `events://stream`
- Watch for `sched_wakeup` → `sched_switch` delays
- Track `dsq_lat_us` in sched_switch events

## 6. Check Hardware Topology
- Use tool: `get_topology` with `detail_level: "full"`
- Verify NUMA node distances
- Check LLC sharing patterns

## Key Metrics to Monitor:
- DSQ latency: < 100µs is good, > 1ms needs investigation
- Wakeup-to-run delay: Should be minimal for RT tasks
- Per-CPU scheduling rate: Balanced across cores
"#
            }
            // Throughput-focused workflow: context switches, utilization,
            // LLC efficiency, migration rate.
            "throughput" => {
                r#"# Scheduler Throughput Analysis Workflow

## 1. Measure Context Switch Rate
- Read resource: `stats://system/cpu`
- Check context switch rate (ctxt/sec)
- Compare against baseline expectations

## 2. Analyze CPU Utilization
- Read resource: `stats://aggregated/cpu`
- Check per-CPU utilization
- Identify idle or overloaded CPUs

## 3. Review Scheduler Statistics
- Read resource: `stats://scheduler/scx`
- Check dispatch counts
- Monitor stall indicators

## 4. Check Process Distribution
- Read resource: `stats://aggregated/process`
- Count active processes per CPU
- Look for concentration on few CPUs

## 5. Examine LLC Domain Efficiency
- Read resource: `stats://aggregated/llc`
- Compare throughput across cache domains
- Identify LLC contention

## 6. Monitor Migration Patterns
In daemon mode:
- Subscribe to: `events://stream`
- Track `sched_migrate_task` events
- High migration rate may hurt throughput

## Performance Indicators:
- High context switch rate: Could indicate overhead
- Idle CPUs with runnable tasks: Load balancing issue
- Uneven LLC utilization: Potential optimization opportunity
"#
            }
            // Balance-focused workflow: per-CPU/LLC/NUMA load spread and
            // migration patterns.
            "balance" => {
                r#"# Load Balance Analysis Workflow

## 1. Check Per-CPU Load
- Read resource: `stats://aggregated/cpu`
- Compare utilization across all CPUs
- Calculate standard deviation

## 2. Analyze NUMA Balance
- Read resource: `stats://aggregated/node`
- Check distribution across NUMA nodes
- Verify memory locality

## 3. Review LLC Domain Balance
- Read resource: `stats://aggregated/llc`
- Compare load across cache domains
- Identify hot LLCs

## 4. Monitor Migration Activity
In daemon mode:
- Subscribe to: `events://stream`
- Count `sched_migrate_task` events
- Track migration sources and destinations

## 5. Check Topology
- Use tool: `get_topology`
- Understand core/LLC/NUMA layout
- Verify SMT configuration

## 6. Analyze Task Affinity
- Read resource: `stats://aggregated/process`
- Check if tasks are pinned
- Review per-process CPU usage

## Balance Metrics:
- CPU utilization variance: < 10% is well-balanced
- Migration rate: Should be stable, not oscillating
- NUMA remote accesses: Minimize for memory-bound workloads
"#
            }
            // Default: general analysis workflow for any other focus value.
            _ => {
                r#"# General Scheduler Performance Analysis

## 1. System Overview
- Read resource: `scheduler://current` - Check active scheduler
- Use tool: `get_topology` - Understand hardware layout
- Read resource: `stats://system/cpu` - System-wide metrics

## 2. Scheduler-Specific Analysis
For sched_ext schedulers:
- Read resource: `stats://scheduler/raw` - Raw scheduler stats
- Read resource: `stats://scheduler/scx` - Kernel-level stats
- Read resource: `stats://aggregated/dsq` - Dispatch queue metrics

## 3. Resource Distribution
- Read resource: `stats://aggregated/cpu` - Per-CPU breakdown
- Read resource: `stats://aggregated/llc` - Cache domain view
- Read resource: `stats://aggregated/node` - NUMA perspective

## 4. Process-Level Insights
- Read resource: `stats://aggregated/process` - Per-process stats
- Identify top CPU consumers
- Check for scheduling outliers

## 5. Real-Time Monitoring (Daemon Mode)
- Subscribe to: `events://stream`
- Monitor scheduling decisions in real-time
- Track latency-critical events

## Quick Start:
Use `query_stats` tool with different stat_types to discover
available metrics, then read the appropriate resources.
"#
            }
        };

        // Shape matches the MCP prompts/get result: description + messages.
        Ok(json!({
            "description": format!("Scheduler performance analysis focused on: {}", focus_area),
            "messages": [{
                "role": "user",
                "content": {
                    "type": "text",
                    "text": workflow
                }
            }]
        }))
    }

    /// Build the "debug_high_latency" prompt.
    ///
    /// The optional integer `pid` argument is interpolated into the workflow
    /// text; when absent (or not an integer) the text targets "any process".
    fn prompt_debug_high_latency(
        &self,
        arguments: Option<&serde_json::Map<String, Value>>,
    ) -> Result<Value> {
        // Only an integer `pid` is accepted; anything else degrades to the
        // generic "any process" label rather than erroring out.
        let pid = arguments
            .and_then(|a| a.get("pid"))
            .and_then(|v| v.as_i64())
            .map(|p| p.to_string())
            .unwrap_or_else(|| "any process".to_string());

        // The same pid string is substituted at three points in the template
        // (the Target line, the PID filter, and the event-filter example).
        let workflow = format!(
            r#"# Debug High Scheduling Latency

Target: {}

## Investigation Steps

### 1. Identify the Problem Scope
- Read resource: `scheduler://current`
- Check which scheduler is active
- Note: sched_ext provides more detailed metrics

### 2. Measure Current Latency
For specific process (if PID provided):
- Read resource: `stats://aggregated/process`
- Filter for PID: {}
- Check scheduling latency and vtime metrics

For system-wide view:
- Read resource: `stats://aggregated/dsq`
- Look for high `dsq_lat_us` values
- Identify problematic dispatch queues

### 3. Find the Bottleneck CPU
- Read resource: `stats://aggregated/cpu`
- Sort by scheduling latency
- Identify CPUs with high latency
- Check runqueue depths

### 4. Check Hardware Factors
- Use tool: `get_topology` with `detail_level: "full"`
- Verify CPU frequencies (min_freq, max_freq)
- Check for CPU capacity issues
- Review NUMA topology

### 5. Monitor Real-Time Behavior (Daemon Mode)
Subscribe to event stream and watch for:
- Long `dsq_lat_us` in sched_switch events
- Delays between sched_waking and sched_switch
- Frequent migrations (sched_migrate_task)

Filter events for target process:
```
events where pid == {}
```

### 6. Analyze Wakeup Patterns
Look for:
- Frequent wakeups from same waker
- Cross-LLC or cross-NUMA wakeups
- Interrupt-driven wakeups (check SoftIRQ events)

### 7. Check for Resource Contention
- Read resource: `stats://aggregated/llc`
- Check cache domain contention
- Read resource: `stats://aggregated/node`
- Verify memory locality

## Common Causes:
1. **High CPU load**: Check overall system utilization
2. **Poor cache locality**: Look at LLC statistics
3. **NUMA imbalance**: Verify node distribution
4. **Scheduler configuration**: Review sched_ext params
5. **Hardware throttling**: Check CPU frequencies

## Remediation Ideas:
- Adjust task placement (CPU affinity)
- Tune scheduler parameters (if sched_ext)
- Address system-wide contention
- Consider CPU frequency scaling
"#,
            pid, pid, pid
        );

        // Shape matches the MCP prompts/get result: description + messages.
        Ok(json!({
            "description": format!("Debug high scheduling latency for: {}", pid),
            "messages": [{
                "role": "user",
                "content": {
                    "type": "text",
                    "text": workflow
                }
            }]
        }))
    }

    /// Build the "analyze_cpu_imbalance" prompt.
    ///
    /// Takes no arguments; always returns the same static workflow text
    /// wrapped in an MCP `prompts/get` response with a single user message.
    fn prompt_analyze_cpu_imbalance(&self) -> Result<Value> {
        // Static checklist covering measurement, topology, migration
        // patterns, root-cause analysis, and remediation options.
        let workflow = r#"# Analyze CPU Load Imbalance

## 1. Measure Imbalance Severity
- Read resource: `stats://aggregated/cpu`
- Calculate utilization variance across CPUs
- Identify overloaded and idle CPUs
- Check runqueue depths

## 2. Understand Topology
- Use tool: `get_topology`
- Note core/LLC/NUMA organization
- Check SMT configuration
- Verify online CPU count

## 3. Review Load Distribution
By cache domain:
- Read resource: `stats://aggregated/llc`
- Compare utilization across LLCs
- Check for hot LLCs

By NUMA node:
- Read resource: `stats://aggregated/node`
- Verify NUMA balance
- Check memory locality

## 4. Identify Migration Patterns
In daemon mode:
- Subscribe to: `events://stream`
- Count `sched_migrate_task` events per minute
- Track common migration paths (source CPU → dest CPU)
- Look for oscillation (tasks bouncing between CPUs)

## 5. Analyze Task Characteristics
- Read resource: `stats://aggregated/process`
- Group by CPU assignment
- Check for pinned tasks (affinity masks)
- Identify CPU-intensive vs I/O-bound tasks

## 6. Check Scheduler Behavior
For sched_ext:
- Read resource: `stats://scheduler/raw`
- Review load balancing settings
- Check layer or partition configuration

For CFS:
- Read resource: `stats://scheduler/scx` (limited)
- Note: Use standard Linux tools for CFS analysis

## 7. Root Cause Analysis

### Common Imbalance Causes:
1. **Task Affinity**: Processes pinned to specific CPUs
2. **NUMA Locality**: Scheduler keeping tasks near memory
3. **Cache Affinity**: Avoiding LLC misses
4. **Interrupt Affinity**: IRQs pinned to specific CPUs
5. **Scheduler Policy**: Intentional imbalance for latency

### Investigation Checklist:
- [ ] Are there pinned tasks? (Check /proc/PID/status)
- [ ] Is this intentional for performance?
- [ ] Are IRQs balanced? (Check /proc/interrupts)
- [ ] Is NUMA topology well-understood by scheduler?
- [ ] Are some CPUs in power-saving states?

## 8. Remediation Options

### For sched_ext schedulers:
- Adjust layer weights
- Tune load balancing aggressiveness
- Modify migration thresholds

### System-level:
- Use `taskset` to redistribute pinned tasks
- Balance IRQ affinity with `irqbalance`
- Adjust CPU frequency governor
- Consider CPU isolation for real-time tasks

## Expected Balance:
- Utilization variance: < 10% for CPU-bound workloads
- Migration rate: Stable, not oscillating
- Per-LLC balance: More important than per-CPU for throughput
"#;

        // Shape matches the MCP prompts/get result: description + messages.
        Ok(json!({
            "description": "Analyze and debug CPU load imbalance issues",
            "messages": [{
                "role": "user",
                "content": {
                    "type": "text",
                    "text": workflow
                }
            }]
        }))
    }

    /// Build the "investigate_scheduler_behavior" prompt.
    ///
    /// If a non-empty `scheduler_name` argument is supplied, the workflow is
    /// specialized to that scheduler by string interpolation; otherwise a
    /// generic investigation workflow is returned.
    fn prompt_investigate_scheduler_behavior(
        &self,
        arguments: Option<&serde_json::Map<String, Value>>,
    ) -> Result<Value> {
        // Missing or non-string `scheduler_name` yields the empty string,
        // which selects the generic branch below.
        let scheduler_filter = arguments
            .and_then(|a| a.get("scheduler_name"))
            .and_then(|v| v.as_str())
            .unwrap_or("");

        // The scheduler name is substituted at four points in the
        // scheduler-specific template.
        let workflow = if !scheduler_filter.is_empty() {
            format!(
                r#"# Investigate Scheduler Behavior: {}

## 1. Verify Scheduler
- Read resource: `scheduler://current`
- Confirm {} is active
- Check scheduler class and state

## 2. Scheduler-Specific Metrics
- Read resource: `stats://scheduler/raw`
- Review {}-specific statistics
- Check configuration parameters

## 3. Kernel-Level Stats
- Read resource: `stats://scheduler/scx`
- Monitor kernel scheduler counters
- Check for error conditions

## 4. Dispatch Queue Analysis
- Read resource: `stats://aggregated/dsq`
- Review all dispatch queues
- Check latencies and depths
- Identify queue characteristics

## 5. Decision Pattern Analysis
In daemon mode, monitor scheduling decisions:
- Subscribe to: `events://stream`
- Watch `sched_switch` events for:
  - DSQ ID assignments
  - Slice allocations (slice_ns)
  - Vtime progression
  - Layer assignments (if layered)

## 6. Task Placement Patterns
- Read resource: `stats://aggregated/process`
- Group by layer_id (if applicable)
- Analyze vtime distribution
- Check task_util values

## 7. Performance Characteristics
- Read resource: `stats://aggregated/cpu`
- Compare performance across CPUs
- Check for scheduling hot spots
- Verify load distribution aligns with policy

## {}-Specific Considerations:
- Review scheduler documentation
- Check for known tuning parameters
- Look for scheduler-specific event types
- Monitor custom metrics in raw stats

## Questions to Answer:
1. Is the scheduler making expected decisions?
2. Are tasks placed according to policy?
3. Is preemption behavior correct?
4. Are latency targets being met?
5. Is load balancing working as designed?
"#,
                scheduler_filter, scheduler_filter, scheduler_filter, scheduler_filter
            )
        } else {
            // Generic workflow used when no specific scheduler was named.
            r#"# Investigate General Scheduler Behavior

## 1. Identify Active Scheduler
- Read resource: `scheduler://current`
- Note scheduler name and class
- Check if sched_ext or traditional (CFS/RT)

## 2. Gather Scheduler Statistics
For sched_ext schedulers:
- Read resource: `stats://scheduler/raw`
- Read resource: `stats://scheduler/scx`
- Read resource: `stats://aggregated/dsq`

For traditional schedulers:
- Use standard Linux tools (schedtool, chrt)
- Check /proc/sched_debug

## 3. Understand Scheduling Decisions
Monitor in real-time (daemon mode):
- Subscribe to: `events://stream`
- Observe sched_switch patterns
- Track task migrations
- Note preemption behavior

## 4. Analyze Resource Distribution
- Read resource: `stats://aggregated/cpu`
- Read resource: `stats://aggregated/llc`
- Read resource: `stats://aggregated/node`
- Read resource: `stats://aggregated/process`

## 5. Check Hardware Topology
- Use tool: `get_topology` with `detail_level: "full"`
- Understand CPU/LLC/NUMA layout
- Verify scheduler awareness of topology

## 6. Evaluate Performance
Key metrics to check:
- Scheduling latency
- Context switch rate
- Migration frequency
- Load balance quality
- Throughput

## 7. Compare Against Expectations
Questions to answer:
- Does behavior match scheduler documentation?
- Are latency/throughput goals met?
- Is load balancing effective?
- Are there unexpected patterns?

## Common Investigation Patterns:

### For Latency-Sensitive Workloads:
- Check DSQ latencies
- Monitor wakeup-to-run delays
- Verify preemption behavior

### For Throughput Workloads:
- Check CPU utilization
- Monitor cache efficiency
- Verify load distribution

### For Mixed Workloads:
- Check layer separation (if layered scheduler)
- Verify priority handling
- Monitor resource isolation
"#
            .to_string()
        };

        // Shape matches the MCP prompts/get result: description + messages.
        Ok(json!({
            "description": "Deep investigation of scheduler behavior and policies",
            "messages": [{
                "role": "user",
                "content": {
                    "type": "text",
                    "text": workflow
                }
            }]
        }))
    }

    /// Build the "summarize_system" prompt.
    ///
    /// Takes no arguments; always returns the same static summary workflow
    /// wrapped in an MCP `prompts/get` response with a single user message.
    fn prompt_summarize_system(&self) -> Result<Value> {
        // Static overview workflow; its final section cross-references the
        // other prompt names exposed by `list`.
        let workflow = r#"# Comprehensive System and Scheduler Summary

## 1. Hardware Overview
- Use tool: `get_topology`
- Get summary of CPUs, cores, LLCs, NUMA nodes
- Note SMT configuration
- Check CPU frequency ranges

## 2. Active Scheduler
- Read resource: `scheduler://current`
- Identify scheduler name and class
- Check if sched_ext is active

## 3. System-Wide Statistics
- Read resource: `stats://system/cpu`
- Read resource: `stats://system/memory`
- Read resource: `stats://system/network`
- Get overall resource utilization

## 4. Scheduler Performance
For sched_ext:
- Read resource: `stats://scheduler/raw`
- Read resource: `stats://scheduler/scx`
- Get key performance indicators

## 5. Resource Distribution
- Read resource: `stats://aggregated/cpu`
- Read resource: `stats://aggregated/llc`
- Read resource: `stats://aggregated/node`
- Understand load distribution

## 6. Process Overview
- Read resource: `stats://aggregated/process`
- Identify top CPU consumers
- Check process count and distribution

## 7. Available Events (for Tracing)
- Use tool: `list_events` with required `subsystem` parameter (e.g., "sched", "irq", "power")
- See available kprobes and perf events for the specified subsystem
- Note: Useful for detailed profiling

## 8. Available Statistics Resources
- Use tool: `query_stats`
- Get complete list of queryable resources
- Understand what data is available

## Summary Checklist:
- [ ] Hardware topology understood
- [ ] Scheduler identified and characterized
- [ ] System resource utilization checked
- [ ] Load distribution analyzed
- [ ] Top processes identified
- [ ] Monitoring capabilities noted

## Next Steps:
Based on the summary, you can:
1. Dive deeper with specific analysis prompts
2. Monitor real-time events in daemon mode
3. Investigate specific performance issues
4. Tune scheduler parameters if needed

Use the other prompts for focused analysis:
- `analyze_scheduler_performance` - Detailed performance analysis
- `debug_high_latency` - Latency issue investigation
- `analyze_cpu_imbalance` - Load balancing analysis
- `investigate_scheduler_behavior` - Scheduler policy deep-dive
"#;

        // Shape matches the MCP prompts/get result: description + messages.
        Ok(json!({
            "description": "Get a comprehensive overview of the system and scheduler state",
            "messages": [{
                "role": "user",
                "content": {
                    "type": "text",
                    "text": workflow
                }
            }]
        }))
    }
}