Skip to main content

bones_sim/
campaign.rs

//! Campaign runner for deterministic simulation campaigns.
//!
//! Executes many seeds across configurable parameters, collecting pass/fail
//! results and identifying the first failing seed for replay.
5
6use std::ops::Range;
7
8use anyhow::{Result, bail};
9use serde::{Deserialize, Serialize};
10
11use crate::oracle::{ConvergenceOracle, InvariantViolation, OracleResult};
12use crate::rng::DeterministicRng;
13use crate::{SimulationConfig, SimulationResult, Simulator};
14
15/// Campaign-level configuration controlling how many seeds to run and
16/// what simulation parameters to use for each seed.
17#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
18pub struct CampaignConfig {
19    /// Range of seeds to execute, e.g., `0..100`.
20    pub seed_range: Range<u64>,
21    /// Number of simulated agents per seed.
22    pub agent_count: usize,
23    /// Number of simulation rounds per seed.
24    pub rounds: u64,
25    /// Number of peers each emitter sends each event to.
26    pub fanout: usize,
27    /// Network fault probability for random message drops (percent, 0–100).
28    pub fault_drop_percent: u8,
29    /// Network fault probability for message duplication (percent, 0–100).
30    pub fault_duplicate_percent: u8,
31    /// Network fault probability for message reordering (percent, 0–100).
32    pub fault_reorder_percent: u8,
33    /// Network fault probability for partition toggling (percent, 0–100).
34    pub fault_partition_percent: u8,
35    /// Maximum delivery delay in rounds.
36    pub fault_max_delay: u8,
37    /// Clock freeze probability (percent, 0–100).
38    pub fault_freeze_percent: u8,
39    /// Clock freeze duration in rounds.
40    pub fault_freeze_duration: u8,
41}
42
43impl Default for CampaignConfig {
44    fn default() -> Self {
45        Self {
46            seed_range: 0..100,
47            agent_count: 5,
48            rounds: 24,
49            fanout: 2,
50            fault_drop_percent: 10,
51            fault_duplicate_percent: 5,
52            fault_reorder_percent: 10,
53            fault_partition_percent: 5,
54            fault_max_delay: 3,
55            fault_freeze_percent: 5,
56            fault_freeze_duration: 2,
57        }
58    }
59}
60
61impl CampaignConfig {
62    /// Build a [`SimulationConfig`] for a specific seed.
63    #[must_use]
64    pub fn sim_config_for_seed(&self, seed: u64) -> SimulationConfig {
65        use crate::network::FaultConfig;
66        SimulationConfig {
67            seed,
68            agent_count: self.agent_count,
69            rounds: self.rounds,
70            fanout: self.fanout,
71            fault: FaultConfig {
72                max_delay_rounds: self.fault_max_delay,
73                drop_rate_percent: self.fault_drop_percent,
74                duplicate_rate_percent: self.fault_duplicate_percent,
75                reorder_rate_percent: self.fault_reorder_percent,
76                partition_rate_percent: self.fault_partition_percent,
77                freeze_rate_percent: self.fault_freeze_percent,
78                freeze_duration_rounds: self.fault_freeze_duration,
79            },
80            clock: crate::clock::ClockConfig::default(),
81        }
82    }
83
84    /// Validate configuration before running.
85    ///
86    /// # Errors
87    ///
88    /// Returns an error if any parameter is out of valid range.
89    pub fn validate(&self) -> Result<()> {
90        if self.seed_range.is_empty() {
91            bail!("seed_range must not be empty");
92        }
93        if self.agent_count == 0 {
94            bail!("agent_count must be > 0");
95        }
96        if self.rounds == 0 {
97            bail!("rounds must be > 0");
98        }
99        Ok(())
100    }
101}
102
103/// Failure details for a single seed.
104#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
105pub struct SeedFailure {
106    /// The seed that failed.
107    pub seed: u64,
108    /// Invariant violations found.
109    pub violations: Vec<String>,
110}
111
112/// Aggregate report produced by a campaign run.
113#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
114pub struct CampaignReport {
115    /// Total seeds executed.
116    pub seeds_run: usize,
117    /// Seeds that passed all invariants.
118    pub seeds_passed: usize,
119    /// First seed that failed (for prioritized replay).
120    pub first_failure: Option<u64>,
121    /// All seed failures with violation details.
122    pub failures: Vec<SeedFailure>,
123    /// Whether at least one seed reached an interesting fault state.
124    pub interesting_states_reached: usize,
125}
126
127impl CampaignReport {
128    /// True if every seed passed.
129    #[must_use]
130    pub const fn all_passed(&self) -> bool {
131        self.failures.is_empty()
132    }
133}
134
135/// Detailed trace produced by replaying a single seed.
136#[derive(Debug, Clone)]
137pub struct DetailedTrace {
138    /// The simulation result including full trace and convergence info.
139    pub result: SimulationResult,
140    /// Oracle check result with violation details.
141    pub oracle: OracleResult,
142    /// All event IDs produced during the simulation.
143    pub all_events: Vec<u64>,
144}
145
146/// Run a full campaign across all seeds in the config.
147///
148/// # Errors
149///
150/// Returns an error if config validation fails or a simulation encounters
151/// an internal error.
152pub fn run_campaign(config: &CampaignConfig) -> Result<CampaignReport> {
153    config.validate()?;
154
155    let mut seeds_run = 0_usize;
156    let mut seeds_passed = 0_usize;
157    let mut first_failure: Option<u64> = None;
158    let mut failures = Vec::new();
159    let mut interesting_states_reached = 0_usize;
160
161    for seed in config.seed_range.clone() {
162        seeds_run += 1;
163
164        match run_single_seed(seed, config)? {
165            Ok(()) => {
166                seeds_passed += 1;
167            }
168            Err(violations) => {
169                if first_failure.is_none() {
170                    first_failure = Some(seed);
171                }
172                failures.push(SeedFailure {
173                    seed,
174                    violations: violations.iter().map(format_violation).collect(),
175                });
176            }
177        }
178
179        // Track interesting states separately by replaying
180        let sim_config = config.sim_config_for_seed(seed);
181        let mut sim = Simulator::new(sim_config)?;
182        let result = sim.run()?;
183        if result.interesting_state_reached {
184            interesting_states_reached += 1;
185        }
186    }
187
188    Ok(CampaignReport {
189        seeds_run,
190        seeds_passed,
191        first_failure,
192        failures,
193        interesting_states_reached,
194    })
195}
196
197/// Run a single seed and return Ok(()) on pass, Err(violations) on failure.
198///
199/// # Errors
200///
201/// Returns an `anyhow::Error` if the simulation itself encounters an internal
202/// error (invalid config, etc). The inner `Result` distinguishes pass from
203/// invariant violations.
204pub fn run_single_seed(
205    seed: u64,
206    config: &CampaignConfig,
207) -> Result<std::result::Result<(), Vec<InvariantViolation>>> {
208    let sim_config = config.sim_config_for_seed(seed);
209    let mut simulator = Simulator::new(sim_config)?;
210    let result = simulator.run()?;
211
212    // Collect all event IDs from the trace for oracle checks.
213    let all_events = collect_emitted_events(&result);
214
215    // Run the full oracle suite.
216    let mut oracle_rng = DeterministicRng::new(seed.wrapping_add(0xDEAD));
217    let oracle_result = ConvergenceOracle::check_all(&result.states, &all_events, &mut oracle_rng);
218
219    if oracle_result.passed {
220        Ok(Ok(()))
221    } else {
222        Ok(Err(oracle_result.violations))
223    }
224}
225
226/// Replay a single seed with full trace details for debugging.
227///
228/// # Errors
229///
230/// Returns an error when config validation or simulation fails.
231pub fn replay_seed(seed: u64, config: &CampaignConfig) -> Result<DetailedTrace> {
232    config.validate()?;
233
234    let sim_config = config.sim_config_for_seed(seed);
235    let mut simulator = Simulator::new(sim_config)?;
236    let result = simulator.run()?;
237
238    let all_events = collect_emitted_events(&result);
239
240    let mut oracle_rng = DeterministicRng::new(seed.wrapping_add(0xDEAD));
241    let oracle = ConvergenceOracle::check_all(&result.states, &all_events, &mut oracle_rng);
242
243    Ok(DetailedTrace {
244        result,
245        oracle,
246        all_events,
247    })
248}
249
250/// Extract all emitted event IDs from a simulation result's trace.
251fn collect_emitted_events(result: &SimulationResult) -> Vec<u64> {
252    result
253        .trace
254        .iter()
255        .filter_map(|te| match te.kind {
256            crate::TraceEventKind::Emit { event_id, .. } => Some(event_id),
257            _ => None,
258        })
259        .collect()
260}
261
262/// Format an invariant violation into a human-readable string.
263fn format_violation(v: &InvariantViolation) -> String {
264    match v {
265        InvariantViolation::Convergence {
266            agent_a,
267            agent_b,
268            only_in_a,
269            only_in_b,
270        } => {
271            format!(
272                "Convergence: agents {agent_a} and {agent_b} diverge \
273                 (only_in_a={only_in_a:?}, only_in_b={only_in_b:?})"
274            )
275        }
276        InvariantViolation::Commutativity {
277            permutation_index,
278            missing_events,
279            extra_events,
280        } => {
281            format!(
282                "Commutativity: permutation {permutation_index} diverges \
283                 (missing={missing_events:?}, extra={extra_events:?})"
284            )
285        }
286        InvariantViolation::Idempotence {
287            event_id,
288            events_before,
289            events_after_dup,
290        } => {
291            format!(
292                "Idempotence: re-applying event {event_id} mutated state \
293                 (before={} events, after={} events)",
294                events_before.len(),
295                events_after_dup.len()
296            )
297        }
298        InvariantViolation::CausalConsistency {
299            observer_agent,
300            source_agent,
301            missing_seq,
302            present_higher_seq,
303        } => {
304            format!(
305                "CausalConsistency: agent {observer_agent} has seq={present_higher_seq} \
306                 from source {source_agent} but is missing seq={missing_seq}"
307            )
308        }
309        InvariantViolation::TriageStability {
310            agent_a,
311            agent_b,
312            score_a,
313            score_b,
314            diff,
315            epsilon,
316        } => {
317            format!(
318                "TriageStability: agents {agent_a} and {agent_b} scores \
319                 diverge ({score_a:.6} vs {score_b:.6}, diff={diff:.6} > epsilon={epsilon:.6})"
320            )
321        }
322    }
323}
324
#[cfg(test)]
mod tests {
    use super::*;

    /// Three-agent, fanout-2 config using only non-destructive faults
    /// (duplicate, reorder, delay, freeze). Drops and partitions are
    /// disabled because they permanently lose events, which prevents
    /// convergence without a sync protocol.
    fn lossless_three_agent_config(seed_range: Range<u64>, rounds: u64) -> CampaignConfig {
        CampaignConfig {
            seed_range,
            agent_count: 3,
            rounds,
            fanout: 2,
            fault_drop_percent: 0,
            fault_duplicate_percent: 3,
            fault_reorder_percent: 5,
            fault_partition_percent: 0,
            fault_max_delay: 2,
            fault_freeze_percent: 2,
            fault_freeze_duration: 2,
        }
    }

    #[test]
    fn campaign_config_default_is_valid() {
        assert!(CampaignConfig::default().validate().is_ok());
    }

    #[test]
    fn campaign_config_empty_seed_range_rejected() {
        let cfg = CampaignConfig {
            seed_range: 5..5,
            ..CampaignConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn campaign_config_zero_agents_rejected() {
        let cfg = CampaignConfig {
            agent_count: 0,
            ..CampaignConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn campaign_config_zero_rounds_rejected() {
        let cfg = CampaignConfig {
            rounds: 0,
            ..CampaignConfig::default()
        };
        assert!(cfg.validate().is_err());
    }

    #[test]
    fn sim_config_for_seed_uses_correct_seed() {
        let cfg = CampaignConfig::default();
        let sim_cfg = cfg.sim_config_for_seed(42);
        assert_eq!(sim_cfg.seed, 42);
        assert_eq!(sim_cfg.agent_count, cfg.agent_count);
        assert_eq!(sim_cfg.rounds, cfg.rounds);
    }

    #[test]
    fn run_single_seed_passes_with_correct_crdt() {
        // With no drops or partitions, all events reach all agents via the
        // final drain, so convergence is guaranteed.
        let cfg = lossless_three_agent_config(0..1, 16);
        let outcome = run_single_seed(0, &cfg).expect("sim should not error");
        assert!(outcome.is_ok(), "seed 0 should pass: {outcome:?}");
    }

    #[test]
    fn run_campaign_all_seeds_pass() {
        // Delay, reorder and duplicate are fine (events still arrive).
        let cfg = lossless_three_agent_config(0..10, 12);
        let report = run_campaign(&cfg).expect("campaign should not error");
        assert_eq!(report.seeds_run, 10);
        assert_eq!(report.seeds_passed, 10);
        assert!(report.all_passed());
        assert!(report.first_failure.is_none());
        assert!(report.failures.is_empty());
    }

    #[test]
    fn run_campaign_100_seeds_pass() {
        // The acceptance criterion: 100+ seeds without failure.
        // Non-destructive faults only — the final drain delivers all pending
        // events so all agents converge. Fanout = agent_count - 1 (broadcast)
        // to ensure every event reaches every agent.
        let cfg = CampaignConfig {
            seed_range: 0..100,
            agent_count: 4,
            rounds: 16,
            fanout: 3, // broadcast to all peers (agent_count - 1)
            fault_drop_percent: 0,
            fault_duplicate_percent: 5,
            fault_reorder_percent: 10,
            fault_partition_percent: 0,
            fault_max_delay: 3,
            fault_freeze_percent: 5,
            fault_freeze_duration: 2,
        };
        let report = run_campaign(&cfg).expect("campaign should not error");
        assert_eq!(report.seeds_run, 100);
        assert!(
            report.all_passed(),
            "campaign failed: {} failures, first at seed {:?}",
            report.failures.len(),
            report.first_failure,
        );
    }

    #[test]
    fn replay_seed_produces_detailed_trace() {
        // Non-destructive faults only so oracle passes.
        let cfg = CampaignConfig {
            seed_range: 0..1,
            agent_count: 3,
            rounds: 12,
            fault_drop_percent: 0,
            fault_partition_percent: 0,
            ..CampaignConfig::default()
        };
        let detailed = replay_seed(42, &cfg).expect("replay should not error");
        assert!(!detailed.result.trace.is_empty());
        assert!(!detailed.all_events.is_empty());
        // With correct CRDT and no destructive faults, oracle should pass
        assert!(
            detailed.oracle.passed,
            "oracle should pass: {:?}",
            detailed.oracle.violations
        );
    }

    #[test]
    fn replay_is_deterministic() {
        let cfg = CampaignConfig {
            seed_range: 0..1,
            agent_count: 4,
            rounds: 16,
            ..CampaignConfig::default()
        };

        let first = replay_seed(7, &cfg).expect("replay 1");
        let second = replay_seed(7, &cfg).expect("replay 2");

        assert_eq!(first.result.trace, second.result.trace);
        assert_eq!(first.result.states, second.result.states);
        assert_eq!(first.all_events, second.all_events);
    }

    #[test]
    fn campaign_report_serializes_to_json() {
        let failure = SeedFailure {
            seed: 7,
            violations: vec!["Convergence: agents 0 and 1 diverge".into()],
        };
        let report = CampaignReport {
            seeds_run: 10,
            seeds_passed: 9,
            first_failure: Some(7),
            failures: vec![failure],
            interesting_states_reached: 5,
        };
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("\"seeds_run\":10"));
        assert!(json.contains("\"first_failure\":7"));
    }

    #[test]
    fn campaign_reaches_interesting_states() {
        // Heavy fault rates; `fanout` comes from the default config.
        let cfg = CampaignConfig {
            seed_range: 0..20,
            agent_count: 4,
            rounds: 16,
            fault_drop_percent: 20,
            fault_duplicate_percent: 15,
            fault_reorder_percent: 20,
            fault_partition_percent: 15,
            fault_max_delay: 3,
            fault_freeze_percent: 15,
            fault_freeze_duration: 2,
            ..CampaignConfig::default()
        };
        let report = run_campaign(&cfg).expect("campaign should not error");
        assert!(
            report.interesting_states_reached > 0,
            "expected some seeds to reach interesting fault states"
        );
    }

    #[test]
    fn format_violation_produces_readable_strings() {
        let violation = InvariantViolation::Convergence {
            agent_a: 0,
            agent_b: 1,
            only_in_a: vec![42],
            only_in_b: vec![],
        };
        let rendered = format_violation(&violation);
        assert!(rendered.contains("Convergence"));
        assert!(rendered.contains("agents 0 and 1"));
    }
}
537}