// ralph_workflow/benchmarks/io_baselines.rs
//! Performance baselines for regression detection.
//!
//! This module defines expected performance characteristics based on
//! measurements from the benchmark suite. Tests can compare against
//! these baselines to detect regressions.
//!
//! # Baseline Measurements
//!
//! Current measurements (as of 2026-02-13):
//! - **Execution history growth**: ~53 bytes per iteration (bounded at 1000 entries)
//! - **Checkpoint size**: ~363 KB for 1000 entries (well under 2048 KB hard limit)
//! - **Memory usage**: Bounded growth verified by integration tests
//!
//! # Regression Detection Strategy
//!
//! 1. **CI runs `cargo xtask verify` on every commit**
//!    - Fails if execution history exceeds 1000 entries (hard limit)
//!    - Fails if checkpoint size exceeds 2048 KB (hard limit)
//!    - Fails if thread cleanup doesn't complete (timeout detection)
//!
//! 2. **Benchmark tests capture current values for trending**
//!    - Run with: `cargo test -p ralph-workflow --lib benchmarks -- --nocapture`
//!    - Values are informational; baselines have generous tolerance
//!
//! 3. **Integration tests enforce behavioral invariants**
//!    - Bounded growth: `tests/integration_tests/memory_safety/bounded_growth.rs`
//!    - Thread cleanup: `tests/integration_tests/memory_safety/thread_lifecycle.rs`
//!    - Arc patterns: `tests/integration_tests/memory_safety/arc_patterns.rs`
//!
//! 4. **Tolerance rationale**
//!    - Memory baselines: 20% tolerance (accounts for platform variance)
//!    - Time baselines: 2x tolerance (serialization performance varies widely)
//!    - Hard limits: 0% tolerance (prevent unbounded growth)
use crate::checkpoint::execution_history::ExecutionStep;
37/// Estimate heap bytes for a single execution step using the same methodology as
38/// the benchmark suite and baselines.
39///
40/// Intentionally excludes `StepOutcome` payloads (e.g., output strings and file
41/// lists) because those are highly workload-dependent and can introduce
42/// cross-platform flakes when enforced as hard ceilings.
43///
44/// # Memory Optimization
45///
46/// After optimization, this accounts for:
47/// - `phase`: Arc<str> - counted as the length of the string (shared allocation)
48/// - `step_type`: Box<str> - counted as the length of the string
49/// - `timestamp`: String - counted as length (deterministic; capacity can vary)
50/// - `agent`: Option<Arc<str>> - counted as the length of the string (shared allocation)
51///
52/// Arc<str> fields are counted by length rather than capacity because the
53/// allocation is shared across multiple `ExecutionStep` instances via string interning.
54#[must_use]
55pub fn estimate_execution_step_heap_bytes_core_fields(step: &ExecutionStep) -> usize {
56    // For Arc<str>, count the string length (shared allocation)
57    step.phase.len()
58        // For Box<str>, count the string length
59        + step.step_type.len()
60        // For String, count length (capacity is allocator-dependent and can be flaky)
61        + step.timestamp.len()
62        // For Option<Arc<str>>, count the string length if present (shared allocation)
63        + step.agent.as_ref().map_or(0, |s| s.len())
64}
65
66/// Estimate heap bytes for an execution history slice.
67pub fn estimate_execution_history_heap_bytes_core_fields(steps: &[ExecutionStep]) -> usize {
68    steps
69        .iter()
70        .map(estimate_execution_step_heap_bytes_core_fields)
71        .sum()
72}
73
/// Performance baseline for execution history growth.
///
/// Instances are compared against measured values to detect regressions.
#[derive(Debug, Clone)]
pub struct ExecutionHistoryBaseline {
    /// Number of entries used when the benchmark was measured.
    pub entry_count: usize,
    /// Expected heap size in bytes.
    pub heap_size_bytes: usize,
    /// Expected serialized size in bytes.
    pub serialized_size_bytes: usize,
    /// Tolerance factor (e.g. 1.2 allows 20% deviation).
    pub tolerance: f64,
}
87impl ExecutionHistoryBaseline {
88    /// Baseline for 1000 entries (default limit).
89    ///
90    /// # Measurement Methodology
91    ///
92    /// These values are derived from benchmark tests that:
93    /// 1. Create 1000 execution history entries with realistic content
94    /// 2. Measure heap size using `std::mem::size_of_val` and content sizes
95    /// 3. Serialize to JSON and measure compressed size
96    /// 4. Run multiple iterations to verify consistency
97    ///
98    /// # Updating Baselines
99    ///
100    /// If legitimate performance improvements reduce memory usage, update these
101    /// values based on new benchmark measurements. Always maintain 20% tolerance
102    /// to account for platform variance.
103    ///
104    /// **DO NOT** increase these values to accommodate regressions. Investigate
105    /// and fix the root cause instead.
106    pub const ENTRIES_1000: Self = Self {
107        entry_count: 1000,
108        heap_size_bytes: 60_000, // 60 KB (measured: ~53_000 bytes for 1000 entries)
109        serialized_size_bytes: 400_000, // 400 KB (measured: ~363 KB actual)
110        tolerance: 1.2,          // 20% headroom for platform variance
111    };
112
113    /// Check if measured value exceeds baseline.
114    ///
115    /// # Errors
116    ///
117    /// Returns error if the operation fails.
118    pub fn check_heap_size(&self, measured: usize) -> Result<(), String> {
119        let max_allowed = tolerance_ceiling(self.heap_size_bytes, self.tolerance);
120        if measured > max_allowed {
121            Err(format!(
122                "Heap size {} bytes exceeds baseline {} bytes (tolerance: {}x)",
123                measured, max_allowed, self.tolerance
124            ))
125        } else {
126            Ok(())
127        }
128    }
129
130    /// Check if serialized size exceeds baseline.
131    ///
132    /// # Errors
133    ///
134    /// Returns error if the operation fails.
135    pub fn check_serialized_size(&self, measured: usize) -> Result<(), String> {
136        let max_allowed = tolerance_ceiling(self.serialized_size_bytes, self.tolerance);
137        if measured > max_allowed {
138            Err(format!(
139                "Serialized size {} bytes exceeds baseline {} bytes (tolerance: {}x)",
140                measured, max_allowed, self.tolerance
141            ))
142        } else {
143            Ok(())
144        }
145    }
146}
147
/// Scale `baseline` by `tolerance` and round up, clamping to the `usize` range.
///
/// A non-finite `tolerance` (NaN or ±infinity), or a product that overflows
/// `f64`, yields `usize::MAX` — effectively disabling the ceiling rather than
/// rejecting every measurement. Results at or below zero clamp to 0.
fn tolerance_ceiling(baseline: usize, tolerance: f64) -> usize {
    if !tolerance.is_finite() {
        return usize::MAX;
    }

    // Direct casts replace the previous to_string()/parse() round-trips, which
    // were just slow, allocation-heavy ways of doing nearest-value conversion.
    let scaled = (baseline as f64) * tolerance;
    if !scaled.is_finite() {
        return usize::MAX;
    }

    let ceil = scaled.ceil();
    if ceil <= 0.0 {
        return 0;
    }

    // `usize::MAX as f64` rounds up to 2^64 (2^32 on 32-bit targets), so any
    // representable value below it fits after the cast.
    if ceil >= usize::MAX as f64 {
        return usize::MAX;
    }

    // Float-to-int `as` casts saturate (guaranteed since Rust 1.45), and `ceil`
    // is an integral value within range here, so this conversion is exact.
    ceil as usize
}
/// Checkpoint serialization performance baseline.
///
/// Captures expected (de)serialization timings for trend comparison.
#[derive(Debug, Clone)]
pub struct CheckpointSerializationBaseline {
    /// Number of history entries in the benchmarked checkpoint.
    pub entry_count: usize,
    /// Expected serialization time in microseconds.
    pub serialize_us: u64,
    /// Expected deserialization time in microseconds.
    pub deserialize_us: u64,
    /// Tolerance factor applied to both timings.
    pub tolerance: f64,
}
186impl CheckpointSerializationBaseline {
187    /// Baseline for 1000 entries.
188    ///
189    /// # Measurement Methodology
190    ///
191    /// These values are derived from benchmark tests that:
192    /// 1. Create checkpoint state with 1000 execution history entries
193    /// 2. Measure `serde_json::to_string()` serialization time
194    /// 3. Measure `serde_json::from_str()` deserialization time
195    /// 4. Run multiple iterations to get representative average
196    ///
197    /// # Tolerance Rationale
198    ///
199    /// Serialization performance varies significantly based on:
200    /// - CPU architecture and speed
201    /// - Memory bus speed
202    /// - System load (other processes)
203    /// - Compiler optimizations (debug vs release)
204    ///
205    /// We use 2x tolerance (100% headroom) to avoid false positives while
206    /// still catching catastrophic regressions (e.g., O(n²) algorithms).
207    pub const ENTRIES_1000: Self = Self {
208        entry_count: 1000,
209        serialize_us: 5_000,   // 5ms (typical range: 2-10ms)
210        deserialize_us: 5_000, // 5ms (typical range: 2-10ms)
211        tolerance: 2.0,        // 2x headroom for hardware/load variance
212    };
213}
214
#[cfg(test)]
mod tests {
    use super::*;
    use crate::checkpoint::execution_history::{ExecutionStep, StepOutcome};

    #[test]
    fn test_execution_history_baseline_magnitude_is_reasonable() {
        let baseline = ExecutionHistoryBaseline::ENTRIES_1000;

        // The baseline must stay in the same order of magnitude as the
        // measured benchmark output (~53_000 bytes for 1000 entries).
        assert!(baseline.heap_size_bytes > 40_000);
        assert!(baseline.heap_size_bytes < 100_000);
    }

    #[test]
    fn test_baseline_check_within_tolerance() {
        let baseline = ExecutionHistoryBaseline::ENTRIES_1000;

        // 70 KB fits inside the 60 KB baseline with 20% tolerance applied.
        assert!(baseline.check_heap_size(70_000).is_ok());

        // 80 KB is over the ceiling (60 KB * 1.2 = 72 KB).
        assert!(baseline.check_heap_size(80_000).is_err());
    }

    #[test]
    fn test_execution_history_heap_estimator_counts_only_core_fields() {
        let mut step = ExecutionStep::new(
            "Development",
            1,
            "agent_invoked",
            StepOutcome::success(Some("output".to_string()), vec!["file.rs".to_string()]),
        )
        .with_agent("test-agent")
        .with_duration(5);

        // After optimization: Arc<str>, Box<str>, and String are counted by length.
        //
        // Force the timestamp's capacity to diverge from its length so the test
        // proves the estimator uses `len()` (deterministic across allocators).
        step.timestamp.reserve_exact(64);
        assert!(
            step.timestamp.capacity() > step.timestamp.len(),
            "test invariant: timestamp capacity should exceed length"
        );

        let agent_len = step.agent.as_ref().map_or(0, |s| s.len());
        let expected =
            step.phase.len() + step.step_type.len() + step.timestamp.len() + agent_len;

        assert_eq!(
            estimate_execution_step_heap_bytes_core_fields(&step),
            expected
        );
    }

    /// Regression test: Verify memory optimization reduces per-entry footprint
    ///
    /// With Arc<str> and Box<str> optimizations in place, core fields should
    /// use ~40-45 bytes per entry (down from ~53 bytes with String fields).
    #[test]
    fn test_memory_optimization_regression() {
        use crate::checkpoint::StringPool;

        let pool = StringPool::new();

        // Build a representative execution step through the string pool.
        let (step, pool) = ExecutionStep::new_with_pool(
            "Development",
            1,
            "agent_invoked",
            StepOutcome::success(Some("output".to_string()), vec!["file.rs".to_string()]),
            pool,
        );
        let (step, _pool) = step.with_agent_pooled("test-agent", pool);
        let step = step.with_duration(5);

        let heap_size = estimate_execution_step_heap_bytes_core_fields(&step);

        // Core fields should land around ~40-45 bytes
        // (11 bytes phase + 14 bytes step_type + ~25 bytes timestamp + 10 bytes agent).
        assert!(
            heap_size <= 60,
            "Memory regression: {heap_size} bytes per entry exceeds 60 byte target (expected ~40-45 bytes)"
        );
    }

    /// Regression test: Verify string pool deduplicates repeated strings
    #[test]
    fn test_string_pool_deduplication_regression() {
        use crate::checkpoint::StringPool;
        use std::sync::Arc;

        let pool = StringPool::new();

        // Two steps sharing the same phase and agent strings.
        let (step1, pool) = ExecutionStep::new_with_pool(
            "Development",
            1,
            "dev_run",
            StepOutcome::success(None, vec![]),
            pool,
        );
        let (step1, pool) = step1.with_agent_pooled("claude", pool);

        let (step2, pool) = ExecutionStep::new_with_pool(
            "Development",
            2,
            "dev_run",
            StepOutcome::success(None, vec![]),
            pool,
        );
        let (step2, pool) = step2.with_agent_pooled("claude", pool);

        // Both steps must point at the same interned allocations.
        assert!(
            Arc::ptr_eq(&step1.phase, &step2.phase),
            "String pool regression: phase strings not shared"
        );
        assert!(
            Arc::ptr_eq(step1.agent.as_ref().unwrap(), step2.agent.as_ref().unwrap()),
            "String pool regression: agent strings not shared"
        );

        // Only two unique strings (phase and agent) should have been interned.
        assert_eq!(
            pool.len(),
            2,
            "String pool regression: expected 2 unique strings, got {}",
            pool.len()
        );
    }
}
349}