Skip to main content

ralph_workflow/checkpoint/
size_monitor.rs

1//! Checkpoint size monitoring and alerting.
2//!
3//! This module provides size monitoring for checkpoint files to detect
4//! and alert on checkpoints that approach or exceed size thresholds.
5//!
6//! # Thresholds
7//!
8//! - **Warning threshold**: 1.5 MiB (log warning, continue operation)
9//! - **Error threshold**: 2 MiB (hard limit enforced by tests)
10//!
11//! These thresholds are based on observed checkpoint sizes with bounded
12//! execution history (default 1000 entries ≈ 363 KB serialized).
13
/// Result of comparing a checkpoint's size against the configured thresholds.
///
/// The `Warning` and `Error` variants carry a human-readable message that
/// describes the condition.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SizeAlert {
    /// The size is below the warning threshold; nothing to report.
    Ok,
    /// The size has reached the warning threshold but not the hard limit.
    Warning(String),
    /// The size has reached or passed the hard limit.
    Error(String),
}
24
/// Byte limits that decide when a checkpoint's size gets reported.
#[derive(Clone, Debug)]
pub struct SizeThresholds {
    /// Size in bytes at which a warning is raised (default: 1.5 MiB).
    pub warn_threshold: usize,
    /// Size in bytes treated as the hard limit (default: 2 MiB).
    pub error_threshold: usize,
}

impl SizeThresholds {
    /// Baseline-derived limits used when no custom values are supplied.
    ///
    /// # Rationale
    ///
    /// - The default execution history limit is 1000 entries.
    /// - A checkpoint holding 1000 entries was measured at roughly 363 KB.
    /// - Warning at 1.5 MiB: about 4x the baseline, leaving headroom for growth.
    /// - Error at 2 MiB: the hard limit enforced by CI.
    pub const DEFAULT: Self = Self::new(
        3 * 1024 * 1024 / 2, // 1.5 MiB = 1_572_864 bytes
        2 * 1024 * 1024,     // 2 MiB = 2_097_152 bytes
    );

    /// Builds thresholds from explicit byte counts.
    ///
    /// The values are stored exactly as given; no validation is performed.
    #[must_use]
    pub const fn new(warn_threshold: usize, error_threshold: usize) -> Self {
        Self {
            warn_threshold,
            error_threshold,
        }
    }
}

impl Default for SizeThresholds {
    /// Returns [`SizeThresholds::DEFAULT`].
    fn default() -> Self {
        Self::DEFAULT
    }
}
63
64/// Checkpoint size monitor.
65#[derive(Debug)]
66pub struct CheckpointSizeMonitor {
67    thresholds: SizeThresholds,
68}
69
70impl CheckpointSizeMonitor {
71    /// Create a new monitor with default thresholds.
72    #[must_use]
73    pub const fn new() -> Self {
74        Self {
75            thresholds: SizeThresholds::DEFAULT,
76        }
77    }
78
79    /// Create a new monitor with custom thresholds.
80    #[must_use]
81    pub const fn with_thresholds(thresholds: SizeThresholds) -> Self {
82        Self { thresholds }
83    }
84
85    /// Check checkpoint size and return appropriate alert.
86    #[must_use]
87    pub fn check_size(&self, size_bytes: usize) -> SizeAlert {
88        if size_bytes >= self.thresholds.error_threshold {
89            SizeAlert::Error(format!(
90                "Checkpoint size {} bytes exceeds hard limit {} bytes. \
91                 Consider reducing execution_history_limit in config.",
92                size_bytes, self.thresholds.error_threshold
93            ))
94        } else if size_bytes >= self.thresholds.warn_threshold {
95            let pct_of_error_threshold: u128 = if self.thresholds.error_threshold == 0 {
96                100
97            } else {
98                (size_bytes as u128).saturating_mul(100) / (self.thresholds.error_threshold as u128)
99            };
100            SizeAlert::Warning(format!(
101                "Checkpoint size {} bytes exceeds warning threshold {} bytes; \
102                 current size is {}% of hard limit {} bytes.",
103                size_bytes,
104                self.thresholds.warn_threshold,
105                pct_of_error_threshold,
106                self.thresholds.error_threshold
107            ))
108        } else {
109            SizeAlert::Ok
110        }
111    }
112
113    /// Check serialized JSON size and return an alert.
114    #[must_use]
115    pub fn check_json(&self, json: &str) -> SizeAlert {
116        self.check_size(json.len())
117    }
118
119    /// Backwards-compatible wrapper.
120    ///
121    /// Library code must not print directly; callers decide how/where to log.
122    #[deprecated(since = "0.7.3", note = "Use check_json(json) and log at the callsite")]
123    #[must_use]
124    pub fn check_json_and_log(&self, json: &str) -> SizeAlert {
125        self.check_json(json)
126    }
127
128    /// Get current thresholds.
129    #[must_use]
130    pub const fn thresholds(&self) -> &SizeThresholds {
131        &self.thresholds
132    }
133}
134
135impl Default for CheckpointSizeMonitor {
136    fn default() -> Self {
137        Self::new()
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;

    /// A typical checkpoint (~363 KB) is well below the default warning threshold.
    #[test]
    fn test_size_alert_ok_for_small_checkpoints() {
        let monitor = CheckpointSizeMonitor::new();
        let alert = monitor.check_size(363_000); // 363 KB (typical size)

        assert_eq!(alert, SizeAlert::Ok);
    }

    /// A size between the two default thresholds yields a descriptive warning.
    #[test]
    fn test_size_alert_warning_approaching_limit() {
        let monitor = CheckpointSizeMonitor::new();
        let alert = monitor.check_size(1_600_000); // 1.6 MB (over warning threshold)

        match alert {
            SizeAlert::Warning(msg) => {
                assert!(msg.contains("1600000"));
                assert!(msg.contains("warning threshold"));
                assert!(msg.contains("hard limit"));
            }
            _ => panic!("Expected Warning, got {alert:?}"),
        }
    }

    /// A size above the default hard limit yields an error.
    #[test]
    fn test_size_alert_error_exceeds_limit() {
        let monitor = CheckpointSizeMonitor::new();
        let alert = monitor.check_size(2_100_000); // 2.1 MB (over error threshold)

        match alert {
            SizeAlert::Error(msg) => {
                assert!(msg.contains("2100000"));
                assert!(msg.contains("exceeds hard limit"));
            }
            _ => panic!("Expected Error, got {alert:?}"),
        }
    }

    /// Custom thresholds replace the defaults for all three outcomes.
    #[test]
    fn test_custom_thresholds() {
        let thresholds = SizeThresholds::new(1_000_000, 1_500_000);
        let monitor = CheckpointSizeMonitor::with_thresholds(thresholds);

        // Below warning
        assert_eq!(monitor.check_size(900_000), SizeAlert::Ok);

        // Above warning, below error
        let alert = monitor.check_size(1_100_000);
        assert!(matches!(alert, SizeAlert::Warning(_)));

        // Above error
        let alert = monitor.check_size(1_600_000);
        assert!(matches!(alert, SizeAlert::Error(_)));
    }

    /// Both comparisons are inclusive: a size exactly equal to a threshold
    /// already triggers that threshold's alert.
    #[test]
    fn test_thresholds_are_inclusive() {
        let monitor = CheckpointSizeMonitor::new();
        let warn = SizeThresholds::DEFAULT.warn_threshold;
        let error = SizeThresholds::DEFAULT.error_threshold;

        assert_eq!(monitor.check_size(warn - 1), SizeAlert::Ok);
        assert!(matches!(monitor.check_size(warn), SizeAlert::Warning(_)));
        assert!(matches!(
            monitor.check_size(error - 1),
            SizeAlert::Warning(_)
        ));
        assert!(matches!(monitor.check_size(error), SizeAlert::Error(_)));
    }

    /// The warning message reports the size as a percentage of the hard limit.
    #[test]
    fn test_warning_reports_percentage_of_hard_limit() {
        let monitor = CheckpointSizeMonitor::with_thresholds(SizeThresholds::new(50, 200));
        let alert = monitor.check_size(100);

        match alert {
            SizeAlert::Warning(msg) => {
                assert!(msg.contains("50% of hard limit 200 bytes"));
            }
            _ => panic!("Expected Warning, got {alert:?}"),
        }
    }

    /// With a hard limit of zero every size, including zero, is an error.
    #[test]
    fn test_zero_error_threshold_always_errors() {
        let monitor = CheckpointSizeMonitor::with_thresholds(SizeThresholds::new(0, 0));

        assert!(matches!(monitor.check_size(0), SizeAlert::Error(_)));
        assert!(matches!(monitor.check_size(1), SizeAlert::Error(_)));
    }

    /// `check_json` classifies by the byte length of the string.
    #[test]
    fn test_check_json() {
        let monitor = CheckpointSizeMonitor::new();

        // Small JSON - should return Ok
        let small_json = "x".repeat(100_000); // 100 KB
        let alert = monitor.check_json(&small_json);
        assert_eq!(alert, SizeAlert::Ok);

        // Large JSON - should return Warning
        let large_json = "x".repeat(1_600_000); // 1.6 MB
        let alert = monitor.check_json(&large_json);
        assert!(matches!(alert, SizeAlert::Warning(_)));
    }

    /// The deprecated wrapper must keep returning exactly what `check_json` does.
    #[test]
    #[allow(deprecated)] // exercising the deprecated wrapper is the point of this test
    fn test_deprecated_wrapper_matches_check_json() {
        let monitor = CheckpointSizeMonitor::with_thresholds(SizeThresholds::new(50, 200));

        for len in [0_usize, 49, 50, 199, 200, 500] {
            let json = "x".repeat(len);
            assert_eq!(monitor.check_json_and_log(&json), monitor.check_json(&json));
        }
    }

    #[test]
    fn test_warning_percentage_calculation_does_not_overflow() {
        // Regression test for overflow in `(size_bytes * 100) / error_threshold`.
        // With large sizes and thresholds, `usize` multiplication can overflow in debug builds.
        let thresholds = SizeThresholds::new(1, usize::MAX);
        let monitor = CheckpointSizeMonitor::with_thresholds(thresholds);

        let result = std::panic::catch_unwind(|| monitor.check_size(usize::MAX - 1));
        assert!(result.is_ok(), "check_size must not panic on large inputs");

        let alert = result.unwrap();
        assert!(matches!(alert, SizeAlert::Warning(_)));
    }

    /// `Default` yields 1.5 MiB / 2 MiB.
    #[test]
    fn test_thresholds_default() {
        let thresholds = SizeThresholds::default();
        assert_eq!(thresholds.warn_threshold, 1_572_864);
        assert_eq!(thresholds.error_threshold, 2_097_152);
    }

    /// A default monitor carries the default thresholds.
    #[test]
    fn test_monitor_default() {
        let monitor = CheckpointSizeMonitor::default();
        assert_eq!(
            monitor.thresholds().warn_threshold,
            SizeThresholds::DEFAULT.warn_threshold
        );
        assert_eq!(
            monitor.thresholds().error_threshold,
            SizeThresholds::DEFAULT.error_threshold
        );
    }
}
239            monitor.thresholds().warn_threshold,
240            SizeThresholds::DEFAULT.warn_threshold
241        );
242    }
243}