ralph_workflow/benchmarks/io_baselines.rs
1//! Performance baselines for regression detection.
2//!
3//! This module defines expected performance characteristics based on
4//! measurements from the benchmark suite. Tests can compare against
5//! these baselines to detect regressions.
6//!
7//! # Baseline Measurements
8//!
9//! Current measurements (as of 2026-02-13):
10//! - **Execution history growth**: ~53 bytes per iteration (bounded at 1000 entries)
11//! - **Checkpoint size**: ~363 KB for 1000 entries (well under 2048 KB hard limit)
12//! - **Memory usage**: Bounded growth verified by integration tests
13//!
14//! # Regression Detection Strategy
15//!
16//! 1. **CI runs `cargo xtask verify` on every commit**
17//! - Fails if execution history exceeds 1000 entries (hard limit)
18//! - Fails if checkpoint size exceeds 2048 KB (hard limit)
19//! - Fails if thread cleanup doesn't complete (timeout detection)
20//!
21//! 2. **Benchmark tests capture current values for trending**
22//! - Run with: `cargo test -p ralph-workflow --lib benchmarks -- --nocapture`
23//! - Values are informational; baselines have generous tolerance
24//!
25//! 3. **Integration tests enforce behavioral invariants**
26//! - Bounded growth: `tests/integration_tests/memory_safety/bounded_growth.rs`
27//! - Thread cleanup: `tests/integration_tests/memory_safety/thread_lifecycle.rs`
28//! - Arc patterns: `tests/integration_tests/memory_safety/arc_patterns.rs`
29//!
30//! 4. **Tolerance rationale**
31//! - Memory baselines: 20% tolerance (accounts for platform variance)
32//! - Time baselines: 2x tolerance (serialization performance varies widely)
33//! - Hard limits: 0% tolerance (prevent unbounded growth)
34
35use crate::checkpoint::execution_history::ExecutionStep;
36
37/// Estimate heap bytes for a single execution step using the same methodology as
38/// the benchmark suite and baselines.
39///
40/// Intentionally excludes `StepOutcome` payloads (e.g., output strings and file
41/// lists) because those are highly workload-dependent and can introduce
42/// cross-platform flakes when enforced as hard ceilings.
43///
44/// # Memory Optimization
45///
46/// After optimization, this accounts for:
47/// - `phase`: Arc<str> - counted as the length of the string (shared allocation)
48/// - `step_type`: Box<str> - counted as the length of the string
49/// - `timestamp`: String - counted as length (deterministic; capacity can vary)
50/// - `agent`: Option<Arc<str>> - counted as the length of the string (shared allocation)
51///
52/// Arc<str> fields are counted by length rather than capacity because the
53/// allocation is shared across multiple `ExecutionStep` instances via string interning.
54#[must_use]
55pub fn estimate_execution_step_heap_bytes_core_fields(step: &ExecutionStep) -> usize {
56 // For Arc<str>, count the string length (shared allocation)
57 step.phase.len()
58 // For Box<str>, count the string length
59 + step.step_type.len()
60 // For String, count length (capacity is allocator-dependent and can be flaky)
61 + step.timestamp.len()
62 // For Option<Arc<str>>, count the string length if present (shared allocation)
63 + step.agent.as_ref().map_or(0, |s| s.len())
64}
65
66/// Estimate heap bytes for an execution history slice.
67pub fn estimate_execution_history_heap_bytes_core_fields(steps: &[ExecutionStep]) -> usize {
68 steps
69 .iter()
70 .map(estimate_execution_step_heap_bytes_core_fields)
71 .sum()
72}
73
/// Performance baseline for execution history growth.
///
/// Holds the expected memory/serialization footprint for a fixed number of
/// history entries; compare measured values with `check_heap_size` /
/// `check_serialized_size`. See `ENTRIES_1000` for the reference values.
#[derive(Debug, Clone)]
pub struct ExecutionHistoryBaseline {
    /// Number of entries the benchmark measured to produce this baseline
    pub entry_count: usize,
    /// Expected heap size in bytes (core fields only, as counted by
    /// `estimate_execution_history_heap_bytes_core_fields`)
    pub heap_size_bytes: usize,
    /// Expected serialized (JSON) size in bytes
    pub serialized_size_bytes: usize,
    /// Tolerance factor (1.2 = allow 20% deviation)
    pub tolerance: f64,
}
86
87impl ExecutionHistoryBaseline {
88 /// Baseline for 1000 entries (default limit).
89 ///
90 /// # Measurement Methodology
91 ///
92 /// These values are derived from benchmark tests that:
93 /// 1. Create 1000 execution history entries with realistic content
94 /// 2. Measure heap size using `std::mem::size_of_val` and content sizes
95 /// 3. Serialize to JSON and measure compressed size
96 /// 4. Run multiple iterations to verify consistency
97 ///
98 /// # Updating Baselines
99 ///
100 /// If legitimate performance improvements reduce memory usage, update these
101 /// values based on new benchmark measurements. Always maintain 20% tolerance
102 /// to account for platform variance.
103 ///
104 /// **DO NOT** increase these values to accommodate regressions. Investigate
105 /// and fix the root cause instead.
106 pub const ENTRIES_1000: Self = Self {
107 entry_count: 1000,
108 heap_size_bytes: 60_000, // 60 KB (measured: ~53_000 bytes for 1000 entries)
109 serialized_size_bytes: 400_000, // 400 KB (measured: ~363 KB actual)
110 tolerance: 1.2, // 20% headroom for platform variance
111 };
112
113 /// Check if measured value exceeds baseline.
114 ///
115 /// # Errors
116 ///
117 /// Returns error if the operation fails.
118 pub fn check_heap_size(&self, measured: usize) -> Result<(), String> {
119 let max_allowed = tolerance_ceiling(self.heap_size_bytes, self.tolerance);
120 if measured > max_allowed {
121 Err(format!(
122 "Heap size {} bytes exceeds baseline {} bytes (tolerance: {}x)",
123 measured, max_allowed, self.tolerance
124 ))
125 } else {
126 Ok(())
127 }
128 }
129
130 /// Check if serialized size exceeds baseline.
131 ///
132 /// # Errors
133 ///
134 /// Returns error if the operation fails.
135 pub fn check_serialized_size(&self, measured: usize) -> Result<(), String> {
136 let max_allowed = tolerance_ceiling(self.serialized_size_bytes, self.tolerance);
137 if measured > max_allowed {
138 Err(format!(
139 "Serialized size {} bytes exceeds baseline {} bytes (tolerance: {}x)",
140 measured, max_allowed, self.tolerance
141 ))
142 } else {
143 Ok(())
144 }
145 }
146}
147
/// Compute the largest measured value allowed by `baseline * tolerance`,
/// rounded up and saturated to the `usize` range.
///
/// Non-finite tolerances (NaN or infinity) or an overflowing product disable
/// the check by returning `usize::MAX`; a non-positive product clamps to 0.
// The casts are intentional and safe here: usize -> f64 may round only above
// 2^53 (far beyond realistic byte counts), and f64 -> usize `as` saturates
// (negative -> 0, out-of-range/NaN handled before the cast).
#[allow(
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss
)]
fn tolerance_ceiling(baseline: usize, tolerance: f64) -> usize {
    if !tolerance.is_finite() {
        return usize::MAX;
    }

    let scaled = (baseline as f64) * tolerance;
    if !scaled.is_finite() {
        return usize::MAX;
    }

    // `as` performs a saturating float-to-int conversion (stable since Rust
    // 1.45): values <= 0 become 0 and values beyond usize::MAX become
    // usize::MAX, matching the explicit clamping the old string-round-trip
    // implementation performed.
    scaled.ceil() as usize
}
172
/// Checkpoint serialization performance baseline.
///
/// Holds expected (de)serialization timings for a fixed number of history
/// entries; see `ENTRIES_1000` for the reference values and their tolerance
/// rationale.
#[derive(Debug, Clone)]
pub struct CheckpointSerializationBaseline {
    /// Number of history entries the benchmark measured
    pub entry_count: usize,
    /// Expected serialization time in microseconds
    pub serialize_us: u64,
    /// Expected deserialization time in microseconds
    pub deserialize_us: u64,
    /// Tolerance factor (2.0 = allow up to 2x the baseline)
    pub tolerance: f64,
}
185
186impl CheckpointSerializationBaseline {
187 /// Baseline for 1000 entries.
188 ///
189 /// # Measurement Methodology
190 ///
191 /// These values are derived from benchmark tests that:
192 /// 1. Create checkpoint state with 1000 execution history entries
193 /// 2. Measure `serde_json::to_string()` serialization time
194 /// 3. Measure `serde_json::from_str()` deserialization time
195 /// 4. Run multiple iterations to get representative average
196 ///
197 /// # Tolerance Rationale
198 ///
199 /// Serialization performance varies significantly based on:
200 /// - CPU architecture and speed
201 /// - Memory bus speed
202 /// - System load (other processes)
203 /// - Compiler optimizations (debug vs release)
204 ///
205 /// We use 2x tolerance (100% headroom) to avoid false positives while
206 /// still catching catastrophic regressions (e.g., O(n²) algorithms).
207 pub const ENTRIES_1000: Self = Self {
208 entry_count: 1000,
209 serialize_us: 5_000, // 5ms (typical range: 2-10ms)
210 deserialize_us: 5_000, // 5ms (typical range: 2-10ms)
211 tolerance: 2.0, // 2x headroom for hardware/load variance
212 };
213}
214
#[cfg(test)]
mod tests {
    use super::*;
    use crate::checkpoint::execution_history::{ExecutionStep, StepOutcome};

    #[test]
    fn test_execution_history_baseline_magnitude_is_reasonable() {
        let b = ExecutionHistoryBaseline::ENTRIES_1000;

        // The constant must stay in the same order of magnitude as the
        // measured benchmark output (~53_000 bytes for 1000 entries).
        assert!(b.heap_size_bytes > 40_000);
        assert!(b.heap_size_bytes < 100_000);
    }

    #[test]
    fn test_baseline_check_within_tolerance() {
        let b = ExecutionHistoryBaseline::ENTRIES_1000;

        // Ceiling is 60 KB * 1.2 = 72 KB: 70 KB passes, 80 KB fails.
        assert!(b.check_heap_size(70_000).is_ok());
        assert!(b.check_heap_size(80_000).is_err());
    }

    #[test]
    fn test_execution_history_heap_estimator_counts_only_core_fields() {
        let outcome =
            StepOutcome::success(Some("output".to_string()), vec!["file.rs".to_string()]);
        let mut step = ExecutionStep::new("Development", 1, "agent_invoked", outcome)
            .with_agent("test-agent")
            .with_duration(5);

        // The estimator intentionally uses `len()` (not `capacity()`) for
        // determinism across allocators; force capacity != length so that
        // distinction is actually exercised.
        step.timestamp.reserve_exact(64);
        assert!(
            step.timestamp.capacity() > step.timestamp.len(),
            "test invariant: timestamp capacity should exceed length"
        );

        let by_hand = step.phase.len()
            + step.step_type.len()
            + step.timestamp.len()
            + step.agent.as_deref().map_or(0, str::len);

        assert_eq!(
            estimate_execution_step_heap_bytes_core_fields(&step),
            by_hand
        );
    }

    /// Regression test: Verify memory optimization reduces per-entry footprint
    ///
    /// With Arc<str>/Box<str> fields the core fields should land around
    /// 40-45 bytes per entry, comfortably under the 60-byte ceiling.
    #[test]
    fn test_memory_optimization_regression() {
        use crate::checkpoint::StringPool;

        // Build a typical pooled execution step.
        let outcome =
            StepOutcome::success(Some("output".to_string()), vec!["file.rs".to_string()]);
        let (step, pool) = ExecutionStep::new_with_pool(
            "Development",
            1,
            "agent_invoked",
            outcome,
            StringPool::new(),
        );
        let (step, _pool) = step.with_agent_pooled("test-agent", pool);
        let step = step.with_duration(5);

        let heap_size = estimate_execution_step_heap_bytes_core_fields(&step);

        // Roughly 11 (phase) + 13 (step_type) + ~25 (timestamp) + 10 (agent).
        assert!(
            heap_size <= 60,
            "Memory regression: {heap_size} bytes per entry exceeds 60 byte target (expected ~40-45 bytes)"
        );
    }

    /// Regression test: Verify string pool deduplicates repeated strings
    #[test]
    fn test_string_pool_deduplication_regression() {
        use crate::checkpoint::StringPool;
        use std::sync::Arc;

        // Two steps built through the same pool, repeating phase and agent.
        let (first, pool) = ExecutionStep::new_with_pool(
            "Development",
            1,
            "dev_run",
            StepOutcome::success(None, vec![]),
            StringPool::new(),
        );
        let (first, pool) = first.with_agent_pooled("claude", pool);

        let (second, pool) = ExecutionStep::new_with_pool(
            "Development",
            2,
            "dev_run",
            StepOutcome::success(None, vec![]),
            pool,
        );
        let (second, pool) = second.with_agent_pooled("claude", pool);

        // Interned strings must be pointer-equal, not merely content-equal.
        assert!(
            Arc::ptr_eq(&first.phase, &second.phase),
            "String pool regression: phase strings not shared"
        );
        assert!(
            Arc::ptr_eq(first.agent.as_ref().unwrap(), second.agent.as_ref().unwrap()),
            "String pool regression: agent strings not shared"
        );

        // Only "Development" and "claude" should have been interned.
        assert_eq!(
            pool.len(),
            2,
            "String pool regression: expected 2 unique strings, got {}",
            pool.len()
        );
    }
}
349}