Skip to main content

cbtop/tracing_escalation/
types.rs

//! Types for the tracing escalation framework.
2
3use std::collections::HashMap;
4use std::time::{Duration, Instant};
5
/// Why a trace escalation was triggered.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EscalationReason {
    /// The coefficient of variation (CV) exceeded its threshold.
    CvExceeded,
    /// Efficiency dropped below its threshold.
    EfficiencyLow,
    /// The CV and efficiency triggers both fired.
    Both,
    /// A memory cliff (sudden performance drop) was detected.
    MemoryCliff,
    /// GPU transfer overhead exceeded its threshold.
    GpuTransferOverhead,
    /// Escalation was requested manually.
    Manual,
}

impl EscalationReason {
    /// Returns a static, human-readable explanation of this reason.
    pub fn description(&self) -> &'static str {
        match *self {
            Self::CvExceeded => "CV exceeded threshold (unstable performance)",
            Self::EfficiencyLow => "Efficiency below threshold",
            Self::Both => "Both CV exceeded and efficiency low",
            Self::MemoryCliff => "Memory cliff detected (sudden drop)",
            Self::GpuTransferOverhead => "GPU transfer overhead exceeded threshold",
            Self::Manual => "Manual escalation requested",
        }
    }

    /// Returns the snake_case string used as the OTLP span attribute value.
    pub fn otlp_value(&self) -> &'static str {
        match *self {
            Self::CvExceeded => "cv_exceeded",
            Self::EfficiencyLow => "efficiency_low",
            Self::Both => "both",
            Self::MemoryCliff => "memory_cliff",
            Self::GpuTransferOverhead => "gpu_transfer_overhead",
            Self::Manual => "manual",
        }
    }
}
48
/// Escalation thresholds configuration.
///
/// Start from [`EscalationThresholds::new`] (identical to `Default`) and chain
/// the `with_*` builder methods to override individual values. The percentage
/// thresholds are stored as percent values (for example `15.0` for 15%), as the
/// defaults show.
#[derive(Debug, Clone)]
pub struct EscalationThresholds {
    /// CV threshold (default: 15%)
    pub cv_threshold: f64,
    /// Efficiency threshold (default: 25%)
    pub efficiency_threshold: f64,
    /// GPU transfer overhead threshold (default: 50%)
    pub gpu_transfer_threshold: f64,
    /// Memory cliff threshold (percentage drop, default: 30%)
    pub memory_cliff_threshold: f64,
    /// Rate limit: max traces per interval (default: 100)
    pub rate_limit: u32,
    /// Rate limit interval (default: 60 seconds)
    pub rate_interval: Duration,
}

impl Default for EscalationThresholds {
    /// Returns the default thresholds: CV 15%, efficiency 25%, GPU transfer
    /// 50%, memory cliff 30%, and at most 100 traces per 60 seconds.
    fn default() -> Self {
        Self {
            cv_threshold: 15.0,
            efficiency_threshold: 25.0,
            gpu_transfer_threshold: 50.0,
            memory_cliff_threshold: 30.0,
            rate_limit: 100,
            rate_interval: Duration::from_secs(60),
        }
    }
}

impl EscalationThresholds {
    /// Create new thresholds populated with the default values.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Set CV threshold
    #[must_use]
    pub fn with_cv(mut self, threshold: f64) -> Self {
        self.cv_threshold = threshold;
        self
    }

    /// Set efficiency threshold
    #[must_use]
    pub fn with_efficiency(mut self, threshold: f64) -> Self {
        self.efficiency_threshold = threshold;
        self
    }

    /// Set GPU transfer threshold
    #[must_use]
    pub fn with_gpu_transfer(mut self, threshold: f64) -> Self {
        self.gpu_transfer_threshold = threshold;
        self
    }

    /// Set memory cliff threshold (percentage drop)
    ///
    /// Mirrors the other threshold setters so every configurable field can be
    /// set through the builder chain.
    #[must_use]
    pub fn with_memory_cliff(mut self, threshold: f64) -> Self {
        self.memory_cliff_threshold = threshold;
        self
    }

    /// Set rate limit
    #[must_use]
    pub fn with_rate_limit(mut self, limit: u32) -> Self {
        self.rate_limit = limit;
        self
    }

    /// Set rate interval
    #[must_use]
    pub fn with_rate_interval(mut self, interval: Duration) -> Self {
        self.rate_interval = interval;
        self
    }
}
115
/// Syscall breakdown categories per section 35.2
///
/// All durations are in microseconds. `total_us` is the overall duration;
/// whatever is not attributed to a syscall category is reported as compute
/// time by [`SyscallBreakdown::compute_us`].
#[derive(Debug, Clone, Default)]
pub struct SyscallBreakdown {
    /// mmap, munmap, mprotect, brk - Memory allocation overhead
    pub mmap_us: u64,
    /// futex - Thread contention
    pub futex_us: u64,
    /// ioctl - CUDA driver overhead
    pub ioctl_us: u64,
    /// read, pread64, readv - I/O read bottleneck
    pub read_us: u64,
    /// write, pwrite64, writev - I/O write bottleneck
    pub write_us: u64,
    /// Other syscalls not categorized
    pub other_us: u64,
    /// Total duration
    pub total_us: u64,
}

impl SyscallBreakdown {
    /// Create a new empty breakdown (every counter is zero).
    pub fn new() -> Self {
        Self::default()
    }

    /// Sum of all syscall categories, saturating at `u64::MAX`.
    ///
    /// `add_syscall` saturates each individual counter, so the counters can
    /// legitimately sit at or near `u64::MAX`; a plain `+` here would panic in
    /// debug builds and wrap in release builds.
    fn syscall_total_us(&self) -> u64 {
        [
            self.mmap_us,
            self.futex_us,
            self.ioctl_us,
            self.read_us,
            self.write_us,
            self.other_us,
        ]
        .iter()
        .fold(0u64, |acc, &part| acc.saturating_add(part))
    }

    /// Calculate compute time (total - all syscall overhead)
    ///
    /// Returns 0 when the recorded syscall time is at least `total_us`.
    pub fn compute_us(&self) -> u64 {
        self.total_us.saturating_sub(self.syscall_total_us())
    }

    /// Calculate syscall overhead percentage
    ///
    /// Returns `0.0` when `total_us` is zero. The result can exceed `100.0`
    /// if the recorded syscall time is larger than `total_us`.
    pub fn syscall_overhead_percent(&self) -> f64 {
        if self.total_us == 0 {
            return 0.0;
        }
        (self.syscall_total_us() as f64 / self.total_us as f64) * 100.0
    }

    /// Get the dominant syscall category
    ///
    /// Returns one of `"mmap"`, `"futex"`, `"ioctl"`, `"read"`, `"write"`,
    /// `"other"`, or `"none"` when every category is zero. On a tie the
    /// category listed later in that order wins, because
    /// `Iterator::max_by_key` returns the last maximum.
    pub fn dominant_syscall(&self) -> &'static str {
        let categories = [
            (self.mmap_us, "mmap"),
            (self.futex_us, "futex"),
            (self.ioctl_us, "ioctl"),
            (self.read_us, "read"),
            (self.write_us, "write"),
            (self.other_us, "other"),
        ];

        categories
            .iter()
            .max_by_key(|&&(time, _)| time)
            // A maximum of zero means nothing was recorded at all.
            .filter(|entry| entry.0 > 0)
            .map_or("none", |entry| entry.1)
    }

    /// Add syscall time to appropriate category
    ///
    /// Unrecognized syscall names are counted under `other_us`.
    /// Uses saturating arithmetic to prevent integer overflow on extreme values.
    pub fn add_syscall(&mut self, syscall: &str, duration_us: u64) {
        let slot = match syscall {
            "mmap" | "munmap" | "mprotect" | "brk" => &mut self.mmap_us,
            "futex" => &mut self.futex_us,
            "ioctl" => &mut self.ioctl_us,
            "read" | "pread64" | "readv" => &mut self.read_us,
            "write" | "pwrite64" | "writev" => &mut self.write_us,
            _ => &mut self.other_us,
        };
        *slot = slot.saturating_add(duration_us);
    }

    /// Get breakdown as a map for OTLP attributes
    ///
    /// The map holds one `syscall.<category>_us` entry per category plus
    /// `syscall.compute_us` and `syscall.total_us`.
    pub fn as_otlp_attributes(&self) -> HashMap<String, u64> {
        [
            ("syscall.mmap_us", self.mmap_us),
            ("syscall.futex_us", self.futex_us),
            ("syscall.ioctl_us", self.ioctl_us),
            ("syscall.read_us", self.read_us),
            ("syscall.write_us", self.write_us),
            ("syscall.other_us", self.other_us),
            ("syscall.compute_us", self.compute_us()),
            ("syscall.total_us", self.total_us),
        ]
        .iter()
        .map(|&(key, value)| (key.to_string(), value))
        .collect()
    }
}
221
222/// Result of a trace operation
223#[derive(Debug, Clone)]
224pub struct TraceResult {
225    /// Brick name that was traced
226    pub brick_name: String,
227    /// Budget in microseconds
228    pub budget_us: u64,
229    /// Actual duration in microseconds
230    pub actual_us: u64,
231    /// Reason for escalation
232    pub reason: EscalationReason,
233    /// Syscall breakdown
234    pub syscall_breakdown: SyscallBreakdown,
235    /// Timestamp of trace
236    pub timestamp: Instant,
237}
238
239impl TraceResult {
240    /// Check if over budget
241    pub fn over_budget(&self) -> bool {
242        self.actual_us > self.budget_us
243    }
244
245    /// Calculate efficiency (budget / actual * 100)
246    pub fn efficiency(&self) -> f64 {
247        if self.actual_us == 0 {
248            return 100.0;
249        }
250        (self.budget_us as f64 / self.actual_us as f64) * 100.0
251    }
252
253    /// Get OTLP span attributes
254    pub fn as_otlp_attributes(&self) -> HashMap<String, String> {
255        let mut attrs = HashMap::new();
256        attrs.insert("brick.name".to_string(), self.brick_name.clone());
257        attrs.insert("brick.budget_us".to_string(), self.budget_us.to_string());
258        attrs.insert("brick.actual_us".to_string(), self.actual_us.to_string());
259        attrs.insert(
260            "brick.efficiency".to_string(),
261            format!("{:.1}", self.efficiency()),
262        );
263        attrs.insert(
264            "brick.over_budget".to_string(),
265            self.over_budget().to_string(),
266        );
267        attrs.insert(
268            "escalation.reason".to_string(),
269            self.reason.otlp_value().to_string(),
270        );
271        attrs.insert(
272            "syscall.overhead_percent".to_string(),
273            format!("{:.1}", self.syscall_breakdown.syscall_overhead_percent()),
274        );
275        attrs.insert(
276            "syscall.dominant".to_string(),
277            self.syscall_breakdown.dominant_syscall().to_string(),
278        );
279        attrs
280    }
281}