// bones_sim/campaign.rs

1//! Campaign runner for deterministic simulation campaigns.
2//!
3//! Executes many seeds across configurable parameters, collecting pass/fail
4//! results and identifying the first failing seed for replay.
5
6use std::ops::Range;
7
8use anyhow::{Result, bail};
9use serde::{Deserialize, Serialize};
10
11use crate::oracle::{ConvergenceOracle, InvariantViolation, OracleResult};
12use crate::rng::DeterministicRng;
13use crate::{SimulationConfig, SimulationResult, Simulator};
14
15/// Campaign-level configuration controlling how many seeds to run and
16/// what simulation parameters to use for each seed.
17#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
18pub struct CampaignConfig {
19    /// Range of seeds to execute, e.g., `0..100`.
20    pub seed_range: Range<u64>,
21    /// Number of simulated agents per seed.
22    pub agent_count: usize,
23    /// Number of simulation rounds per seed.
24    pub rounds: u64,
25    /// Number of peers each emitter sends each event to.
26    pub fanout: usize,
27    /// Network fault probability for random message drops (percent, 0–100).
28    pub fault_drop_percent: u8,
29    /// Network fault probability for message duplication (percent, 0–100).
30    pub fault_duplicate_percent: u8,
31    /// Network fault probability for message reordering (percent, 0–100).
32    pub fault_reorder_percent: u8,
33    /// Network fault probability for partition toggling (percent, 0–100).
34    pub fault_partition_percent: u8,
35    /// Maximum delivery delay in rounds.
36    pub fault_max_delay: u8,
37    /// Clock freeze probability (percent, 0–100).
38    pub fault_freeze_percent: u8,
39    /// Clock freeze duration in rounds.
40    pub fault_freeze_duration: u8,
41    /// Number of pairwise gossip reconciliation rounds after drain.
42    #[serde(default = "default_reconciliation_rounds")]
43    pub reconciliation_rounds: u8,
44}
45
/// Fallback value for `CampaignConfig::reconciliation_rounds`.
///
/// Referenced by name from the field's `#[serde(default = ...)]` attribute and
/// called by the `Default` impl, so both construction paths agree.
const fn default_reconciliation_rounds() -> u8 {
    const ROUNDS: u8 = 3;
    ROUNDS
}
49
50impl Default for CampaignConfig {
51    fn default() -> Self {
52        Self {
53            seed_range: 0..100,
54            agent_count: 5,
55            rounds: 24,
56            fanout: 2,
57            fault_drop_percent: 10,
58            fault_duplicate_percent: 5,
59            fault_reorder_percent: 10,
60            fault_partition_percent: 5,
61            fault_max_delay: 3,
62            fault_freeze_percent: 5,
63            fault_freeze_duration: 2,
64            reconciliation_rounds: default_reconciliation_rounds(),
65        }
66    }
67}
68
69impl CampaignConfig {
70    /// Build a [`SimulationConfig`] for a specific seed.
71    #[must_use]
72    pub fn sim_config_for_seed(&self, seed: u64) -> SimulationConfig {
73        use crate::network::FaultConfig;
74        SimulationConfig {
75            seed,
76            agent_count: self.agent_count,
77            rounds: self.rounds,
78            fanout: self.fanout,
79            fault: FaultConfig {
80                max_delay_rounds: self.fault_max_delay,
81                drop_rate_percent: self.fault_drop_percent,
82                duplicate_rate_percent: self.fault_duplicate_percent,
83                reorder_rate_percent: self.fault_reorder_percent,
84                partition_rate_percent: self.fault_partition_percent,
85                freeze_rate_percent: self.fault_freeze_percent,
86                freeze_duration_rounds: self.fault_freeze_duration,
87            },
88            clock: crate::clock::ClockConfig::default(),
89            reconciliation_rounds: self.reconciliation_rounds,
90        }
91    }
92
93    /// Validate configuration before running.
94    ///
95    /// # Errors
96    ///
97    /// Returns an error if any parameter is out of valid range.
98    pub fn validate(&self) -> Result<()> {
99        if self.seed_range.is_empty() {
100            bail!("seed_range must not be empty");
101        }
102        if self.agent_count == 0 {
103            bail!("agent_count must be > 0");
104        }
105        if self.rounds == 0 {
106            bail!("rounds must be > 0");
107        }
108        Ok(())
109    }
110}
111
112/// Failure details for a single seed.
113#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
114pub struct SeedFailure {
115    /// The seed that failed.
116    pub seed: u64,
117    /// Invariant violations found.
118    pub violations: Vec<String>,
119}
120
121/// Aggregate report produced by a campaign run.
122#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
123pub struct CampaignReport {
124    /// Total seeds executed.
125    pub seeds_run: usize,
126    /// Seeds that passed all invariants.
127    pub seeds_passed: usize,
128    /// First seed that failed (for prioritized replay).
129    pub first_failure: Option<u64>,
130    /// All seed failures with violation details.
131    pub failures: Vec<SeedFailure>,
132    /// Whether at least one seed reached an interesting fault state.
133    pub interesting_states_reached: usize,
134}
135
136impl CampaignReport {
137    /// True if every seed passed.
138    #[must_use]
139    pub const fn all_passed(&self) -> bool {
140        self.failures.is_empty()
141    }
142}
143
144/// Detailed trace produced by replaying a single seed.
145#[derive(Debug, Clone)]
146pub struct DetailedTrace {
147    /// The simulation result including full trace and convergence info.
148    pub result: SimulationResult,
149    /// Oracle check result with violation details.
150    pub oracle: OracleResult,
151    /// All event IDs produced during the simulation.
152    pub all_events: Vec<u64>,
153}
154
155/// Run a full campaign across all seeds in the config.
156///
157/// # Errors
158///
159/// Returns an error if config validation fails or a simulation encounters
160/// an internal error.
161pub fn run_campaign(config: &CampaignConfig) -> Result<CampaignReport> {
162    config.validate()?;
163
164    let mut seeds_run = 0_usize;
165    let mut seeds_passed = 0_usize;
166    let mut first_failure: Option<u64> = None;
167    let mut failures = Vec::new();
168    let mut interesting_states_reached = 0_usize;
169
170    for seed in config.seed_range.clone() {
171        seeds_run += 1;
172
173        match run_single_seed(seed, config)? {
174            Ok(()) => {
175                seeds_passed += 1;
176            }
177            Err(violations) => {
178                if first_failure.is_none() {
179                    first_failure = Some(seed);
180                }
181                failures.push(SeedFailure {
182                    seed,
183                    violations: violations.iter().map(format_violation).collect(),
184                });
185            }
186        }
187
188        // Track interesting states separately by replaying
189        let sim_config = config.sim_config_for_seed(seed);
190        let mut sim = Simulator::new(sim_config)?;
191        let result = sim.run()?;
192        if result.interesting_state_reached {
193            interesting_states_reached += 1;
194        }
195    }
196
197    Ok(CampaignReport {
198        seeds_run,
199        seeds_passed,
200        first_failure,
201        failures,
202        interesting_states_reached,
203    })
204}
205
206/// Run a single seed and return Ok(()) on pass, Err(violations) on failure.
207///
208/// # Errors
209///
210/// Returns an `anyhow::Error` if the simulation itself encounters an internal
211/// error (invalid config, etc). The inner `Result` distinguishes pass from
212/// invariant violations.
213pub fn run_single_seed(
214    seed: u64,
215    config: &CampaignConfig,
216) -> Result<std::result::Result<(), Vec<InvariantViolation>>> {
217    let sim_config = config.sim_config_for_seed(seed);
218    let mut simulator = Simulator::new(sim_config)?;
219    let result = simulator.run()?;
220
221    // Collect all event IDs from the trace for oracle checks.
222    let all_events = collect_emitted_events(&result);
223
224    // Run the full oracle suite.
225    let mut oracle_rng = DeterministicRng::new(seed.wrapping_add(0xDEAD));
226    let oracle_result = ConvergenceOracle::check_all(&result.states, &all_events, &mut oracle_rng);
227
228    if oracle_result.passed {
229        Ok(Ok(()))
230    } else {
231        Ok(Err(oracle_result.violations))
232    }
233}
234
235/// Replay a single seed with full trace details for debugging.
236///
237/// # Errors
238///
239/// Returns an error when config validation or simulation fails.
240pub fn replay_seed(seed: u64, config: &CampaignConfig) -> Result<DetailedTrace> {
241    config.validate()?;
242
243    let sim_config = config.sim_config_for_seed(seed);
244    let mut simulator = Simulator::new(sim_config)?;
245    let result = simulator.run()?;
246
247    let all_events = collect_emitted_events(&result);
248
249    let mut oracle_rng = DeterministicRng::new(seed.wrapping_add(0xDEAD));
250    let oracle = ConvergenceOracle::check_all(&result.states, &all_events, &mut oracle_rng);
251
252    Ok(DetailedTrace {
253        result,
254        oracle,
255        all_events,
256    })
257}
258
259/// Extract all emitted event IDs from a simulation result's trace.
260fn collect_emitted_events(result: &SimulationResult) -> Vec<u64> {
261    result
262        .trace
263        .iter()
264        .filter_map(|te| match te.kind {
265            crate::TraceEventKind::Emit { event_id, .. } => Some(event_id),
266            _ => None,
267        })
268        .collect()
269}
270
271/// Format an invariant violation into a human-readable string.
272fn format_violation(v: &InvariantViolation) -> String {
273    match v {
274        InvariantViolation::Convergence {
275            agent_a,
276            agent_b,
277            only_in_a,
278            only_in_b,
279        } => {
280            format!(
281                "Convergence: agents {agent_a} and {agent_b} diverge \
282                 (only_in_a={only_in_a:?}, only_in_b={only_in_b:?})"
283            )
284        }
285        InvariantViolation::Commutativity {
286            permutation_index,
287            missing_events,
288            extra_events,
289        } => {
290            format!(
291                "Commutativity: permutation {permutation_index} diverges \
292                 (missing={missing_events:?}, extra={extra_events:?})"
293            )
294        }
295        InvariantViolation::Idempotence {
296            event_id,
297            events_before,
298            events_after_dup,
299        } => {
300            format!(
301                "Idempotence: re-applying event {event_id} mutated state \
302                 (before={} events, after={} events)",
303                events_before.len(),
304                events_after_dup.len()
305            )
306        }
307        InvariantViolation::CausalConsistency {
308            observer_agent,
309            source_agent,
310            missing_seq,
311            present_higher_seq,
312        } => {
313            format!(
314                "CausalConsistency: agent {observer_agent} has seq={present_higher_seq} \
315                 from source {source_agent} but is missing seq={missing_seq}"
316            )
317        }
318        InvariantViolation::TriageStability {
319            agent_a,
320            agent_b,
321            score_a,
322            score_b,
323            diff,
324            epsilon,
325        } => {
326            format!(
327                "TriageStability: agents {agent_a} and {agent_b} scores \
328                 diverge ({score_a:.6} vs {score_b:.6}, diff={diff:.6} > epsilon={epsilon:.6})"
329            )
330        }
331    }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    /// Lightly faulted profile with no drops and no partitions, shared by the
    /// small pass/fail tests. Messages may be delayed, reordered or
    /// duplicated; the tests using it expect every seed to pass.
    fn light_lossless(
        seed_range: Range<u64>,
        agent_count: usize,
        rounds: u64,
    ) -> CampaignConfig {
        CampaignConfig {
            seed_range,
            agent_count,
            rounds,
            fanout: 2,
            fault_drop_percent: 0,
            fault_duplicate_percent: 3,
            fault_reorder_percent: 5,
            fault_partition_percent: 0,
            fault_max_delay: 2,
            fault_freeze_percent: 2,
            fault_freeze_duration: 2,
            ..CampaignConfig::default()
        }
    }

    /// Asserts that `validate` refuses `config`.
    fn assert_rejected(config: &CampaignConfig) {
        assert!(config.validate().is_err());
    }

    /// Asserts that a 100-seed campaign ran every seed and recorded no failures.
    fn assert_hundred_seeds_clean(report: &CampaignReport) {
        assert_eq!(report.seeds_run, 100);
        assert!(
            report.all_passed(),
            "campaign failed: {} failures, first at seed {:?}",
            report.failures.len(),
            report.first_failure,
        );
    }

    #[test]
    fn campaign_config_default_is_valid() {
        assert!(CampaignConfig::default().validate().is_ok());
    }

    #[test]
    fn campaign_config_empty_seed_range_rejected() {
        assert_rejected(&CampaignConfig {
            seed_range: 5..5,
            ..CampaignConfig::default()
        });
    }

    #[test]
    fn campaign_config_zero_agents_rejected() {
        assert_rejected(&CampaignConfig {
            agent_count: 0,
            ..CampaignConfig::default()
        });
    }

    #[test]
    fn campaign_config_zero_rounds_rejected() {
        assert_rejected(&CampaignConfig {
            rounds: 0,
            ..CampaignConfig::default()
        });
    }

    #[test]
    fn sim_config_for_seed_uses_correct_seed() {
        let campaign = CampaignConfig::default();
        let per_seed = campaign.sim_config_for_seed(42);
        assert_eq!(per_seed.seed, 42);
        assert_eq!(per_seed.agent_count, campaign.agent_count);
        assert_eq!(per_seed.rounds, campaign.rounds);
    }

    #[test]
    fn run_single_seed_passes_with_correct_crdt() {
        // No drops or partitions: every event is expected to reach every
        // agent through the final drain, so the seed should pass.
        let config = light_lossless(0..1, 3, 16);
        let result = run_single_seed(0, &config).expect("sim should not error");
        assert!(result.is_ok(), "seed 0 should pass: {result:?}");
    }

    #[test]
    fn run_campaign_all_seeds_pass() {
        // Delay, reorder and duplicate still deliver every event; drop and
        // partition are switched off because they can lose events.
        let config = light_lossless(0..10, 3, 12);
        let report = run_campaign(&config).expect("campaign should not error");
        assert_eq!(report.seeds_run, 10);
        assert_eq!(report.seeds_passed, 10);
        assert!(report.all_passed());
        assert!(report.first_failure.is_none());
        assert!(report.failures.is_empty());
    }

    #[test]
    fn run_campaign_100_seeds_pass() {
        // Acceptance criterion: 100+ seeds without failure. Only lossless
        // faults are enabled, and fanout equals agent_count - 1 so each
        // emitter broadcasts to all of its peers.
        let config = CampaignConfig {
            seed_range: 0..100,
            agent_count: 4,
            rounds: 16,
            fanout: 3, // agent_count - 1
            fault_drop_percent: 0,
            fault_duplicate_percent: 5,
            fault_reorder_percent: 10,
            fault_partition_percent: 0,
            fault_max_delay: 3,
            fault_freeze_percent: 5,
            fault_freeze_duration: 2,
            ..CampaignConfig::default()
        };
        let report = run_campaign(&config).expect("campaign should not error");
        assert_hundred_seeds_clean(&report);
    }

    #[test]
    fn run_campaign_100_seeds_pass_with_faults_and_reconciliation() {
        // Acceptance test: 100 seeds with drops and partitions enabled,
        // relying on reconciliation rounds to heal any divergence.
        let config = CampaignConfig {
            seed_range: 0..100,
            agent_count: 5,
            rounds: 24,
            fanout: 2,
            fault_drop_percent: 5,
            fault_duplicate_percent: 2,
            fault_reorder_percent: 5,
            fault_partition_percent: 2,
            fault_max_delay: 3,
            fault_freeze_percent: 2,
            fault_freeze_duration: 2,
            reconciliation_rounds: 3,
        };
        let report = run_campaign(&config).expect("campaign should not error");
        assert_hundred_seeds_clean(&report);
    }

    #[test]
    fn replay_seed_produces_detailed_trace() {
        // Drops and partitions are disabled so the oracle is expected to pass.
        let config = CampaignConfig {
            seed_range: 0..1,
            agent_count: 3,
            rounds: 12,
            fault_drop_percent: 0,
            fault_partition_percent: 0,
            ..CampaignConfig::default()
        };
        let trace = replay_seed(42, &config).expect("replay should not error");
        assert!(!trace.result.trace.is_empty());
        assert!(!trace.all_events.is_empty());
        assert!(
            trace.oracle.passed,
            "oracle should pass: {:?}",
            trace.oracle.violations
        );
    }

    #[test]
    fn replay_is_deterministic() {
        let config = CampaignConfig {
            seed_range: 0..1,
            agent_count: 4,
            rounds: 16,
            ..CampaignConfig::default()
        };

        let first = replay_seed(7, &config).expect("replay 1");
        let second = replay_seed(7, &config).expect("replay 2");

        assert_eq!(first.result.trace, second.result.trace);
        assert_eq!(first.result.states, second.result.states);
        assert_eq!(first.all_events, second.all_events);
    }

    #[test]
    fn campaign_report_serializes_to_json() {
        let failure = SeedFailure {
            seed: 7,
            violations: vec!["Convergence: agents 0 and 1 diverge".into()],
        };
        let report = CampaignReport {
            seeds_run: 10,
            seeds_passed: 9,
            first_failure: Some(7),
            failures: vec![failure],
            interesting_states_reached: 5,
        };
        let json = serde_json::to_string(&report).expect("serialize");
        assert!(json.contains("\"seeds_run\":10"));
        assert!(json.contains("\"first_failure\":7"));
    }

    #[test]
    fn campaign_reaches_interesting_states() {
        // Heavy fault rates across the board; at least one of the twenty
        // seeds is expected to reach an interesting state.
        let config = CampaignConfig {
            seed_range: 0..20,
            agent_count: 4,
            rounds: 16,
            fault_drop_percent: 20,
            fault_duplicate_percent: 15,
            fault_reorder_percent: 20,
            fault_partition_percent: 15,
            fault_max_delay: 3,
            fault_freeze_percent: 15,
            fault_freeze_duration: 2,
            ..CampaignConfig::default()
        };
        let report = run_campaign(&config).expect("campaign should not error");
        assert!(
            report.interesting_states_reached > 0,
            "expected some seeds to reach interesting fault states"
        );
    }

    #[test]
    fn format_violation_produces_readable_strings() {
        let violation = InvariantViolation::Convergence {
            agent_a: 0,
            agent_b: 1,
            only_in_a: vec![42],
            only_in_b: vec![],
        };
        let rendered = format_violation(&violation);
        assert!(rendered.contains("Convergence"));
        assert!(rendered.contains("agents 0 and 1"));
    }
}