// datasynth_audit_optimizer/benchmark_comparison.rs

1//! Cross-firm methodology benchmark comparison.
2//!
3//! Runs all available built-in blueprints under the same conditions (seed,
4//! overlay, engagement context) and produces a comparative report that enables
5//! cross-firm audit methodology benchmarking.
6
7use rand::SeedableRng;
8use rand_chacha::ChaCha8Rng;
9use serde::{Deserialize, Serialize};
10
11use datasynth_audit_fsm::{
12    context::EngagementContext,
13    dispatch::infer_judgment_level,
14    engine::AuditFsmEngine,
15    error::AuditFsmError,
16    loader::{default_overlay, BlueprintWithPreconditions},
17};
18
/// Function pointer type for blueprint loader functions.
///
/// Every built-in loader (e.g. `BlueprintWithPreconditions::load_builtin_kpmg`)
/// matches this signature, which lets `run_comparison` iterate over all of
/// them from a single table.
type BlueprintLoader = fn() -> Result<BlueprintWithPreconditions, AuditFsmError>;
21
22// ---------------------------------------------------------------------------
23// Report types
24// ---------------------------------------------------------------------------
25
/// Per-firm benchmark metrics produced from a single engagement simulation.
///
/// Structural fields (`phases`, `procedures`, `steps`, `standards_count`) are
/// counted directly from the blueprint; the remaining fields come from the
/// FSM engagement simulation run under a fixed seed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FirmBenchmark {
    /// Display name of the firm / methodology (e.g. "KPMG Clara").
    pub firm: String,
    /// Short identifier of the blueprint used (e.g. "kpmg").
    pub blueprint: String,
    /// Number of phases in the blueprint.
    pub phases: usize,
    /// Total number of procedures across all phases.
    pub procedures: usize,
    /// Total number of steps across all procedures.
    pub steps: usize,
    /// Number of events emitted during the engagement simulation.
    pub events: usize,
    /// Total typed artifacts produced by step dispatchers.
    pub artifacts: usize,
    /// Simulated engagement duration in hours.
    pub duration_hours: f64,
    /// Number of anomaly records injected during the engagement.
    pub anomalies: usize,
    /// Fraction (0.0–1.0) of procedures reaching "completed" or "closed" state.
    pub completion_rate: f64,
    /// Breakdown of steps by judgment level.
    pub judgment_distribution: JudgmentDistribution,
    /// Number of accounting/audit standards referenced in the blueprint.
    pub standards_count: usize,
}
54
/// Step-level judgment classification breakdown.
///
/// The three counts partition a blueprint's steps, so the three `_pct` fields
/// sum to (approximately) 100 for any blueprint with at least one step.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JudgmentDistribution {
    /// Steps fully automatable via data processing.
    pub data_only: usize,
    /// Steps where AI can assist but a human reviews.
    pub ai_assistable: usize,
    /// Steps requiring professional skepticism / human judgment.
    pub human_required: usize,
    /// `data_only` as a percentage of total steps.
    pub data_only_pct: f64,
    /// `ai_assistable` as a percentage of total steps.
    pub ai_assistable_pct: f64,
    /// `human_required` as a percentage of total steps.
    pub human_required_pct: f64,
}
71
/// The full cross-firm comparison report.
///
/// Entries appear in the fixed loader order used by `run_comparison`;
/// blueprints that failed to load are simply absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComparisonReport {
    /// One entry per firm / blueprint that loaded successfully.
    pub benchmarks: Vec<FirmBenchmark>,
    /// RNG seed used for all engagement simulations.
    pub seed: u64,
    /// Name of the overlay applied to all simulations.
    pub overlay: String,
}
82
83// ---------------------------------------------------------------------------
84// Public API
85// ---------------------------------------------------------------------------
86
87/// Run all available built-in blueprints under identical conditions and return
88/// a [`ComparisonReport`].
89///
90/// The same `seed` and the default overlay are used for every engagement so
91/// that differences in the report reflect methodology (blueprint) rather than
92/// randomness or configuration.
93pub fn run_comparison(seed: u64, context: Option<&EngagementContext>) -> ComparisonReport {
94    let overlay = default_overlay();
95    let default_ctx = EngagementContext::demo();
96    let ctx = context.unwrap_or(&default_ctx);
97    let mut benchmarks = Vec::new();
98
99    // (display name, short key, loader)
100    let loaders: &[(&str, &str, BlueprintLoader)] = &[
101        (
102            "Generic ISA",
103            "fsa",
104            BlueprintWithPreconditions::load_builtin_fsa,
105        ),
106        (
107            "KPMG Clara",
108            "kpmg",
109            BlueprintWithPreconditions::load_builtin_kpmg,
110        ),
111        (
112            "PwC Aura",
113            "pwc",
114            BlueprintWithPreconditions::load_builtin_pwc,
115        ),
116        (
117            "Deloitte Omnia",
118            "deloitte",
119            BlueprintWithPreconditions::load_builtin_deloitte,
120        ),
121        (
122            "EY GAM Lite",
123            "ey_gam_lite",
124            BlueprintWithPreconditions::load_builtin_ey_gam_lite,
125        ),
126        (
127            "IIA-GIAS",
128            "ia",
129            BlueprintWithPreconditions::load_builtin_ia,
130        ),
131    ];
132
133    for (firm_name, bp_name, loader) in loaders {
134        let bwp = match loader() {
135            Ok(b) => b,
136            Err(_) => continue,
137        };
138
139        // ------------------------------------------------------------------
140        // Structural counts
141        // ------------------------------------------------------------------
142        let phases = bwp.blueprint.phases.len();
143        let procedures: usize = bwp
144            .blueprint
145            .phases
146            .iter()
147            .map(|p| p.procedures.len())
148            .sum();
149        let steps: usize = bwp
150            .blueprint
151            .phases
152            .iter()
153            .flat_map(|p| p.procedures.iter())
154            .map(|proc| proc.steps.len())
155            .sum();
156
157        // ------------------------------------------------------------------
158        // Judgment-level classification
159        // ------------------------------------------------------------------
160        let mut data_only = 0usize;
161        let mut ai_assistable = 0usize;
162        let mut human_required = 0usize;
163
164        for phase in &bwp.blueprint.phases {
165            for proc in &phase.procedures {
166                for step in &proc.steps {
167                    let level = step.judgment_level.as_deref().unwrap_or_else(|| {
168                        infer_judgment_level(step.command.as_deref().unwrap_or(""))
169                    });
170                    match level {
171                        "data_only" => data_only += 1,
172                        "human_required" => human_required += 1,
173                        _ => ai_assistable += 1,
174                    }
175                }
176            }
177        }
178
179        let total_steps_f = (data_only + ai_assistable + human_required).max(1) as f64;
180
181        // ------------------------------------------------------------------
182        // Run engagement simulation
183        // ------------------------------------------------------------------
184        let mut engine = AuditFsmEngine::new(
185            bwp.clone(),
186            overlay.clone(),
187            ChaCha8Rng::seed_from_u64(seed),
188        );
189        let result = engine.run_engagement(ctx).unwrap();
190
191        let completed = result
192            .procedure_states
193            .values()
194            .filter(|s| s.as_str() == "completed" || s.as_str() == "closed")
195            .count();
196
197        let standards_count = bwp.blueprint.standards.len();
198
199        benchmarks.push(FirmBenchmark {
200            firm: firm_name.to_string(),
201            blueprint: bp_name.to_string(),
202            phases,
203            procedures,
204            steps,
205            events: result.event_log.len(),
206            artifacts: result.artifacts.total_artifacts(),
207            duration_hours: result.total_duration_hours,
208            anomalies: result.anomalies.len(),
209            completion_rate: completed as f64 / result.procedure_states.len().max(1) as f64,
210            judgment_distribution: JudgmentDistribution {
211                data_only,
212                ai_assistable,
213                human_required,
214                data_only_pct: data_only as f64 / total_steps_f * 100.0,
215                ai_assistable_pct: ai_assistable as f64 / total_steps_f * 100.0,
216                human_required_pct: human_required as f64 / total_steps_f * 100.0,
217            },
218            standards_count,
219        });
220    }
221
222    ComparisonReport {
223        benchmarks,
224        seed,
225        overlay: "default".to_string(),
226    }
227}
228
229/// Format a [`ComparisonReport`] as a human-readable table.
230pub fn format_comparison_report(report: &ComparisonReport) -> String {
231    let mut out = String::new();
232    out.push_str("Cross-Firm Methodology Benchmark\n");
233    out.push_str(&format!(
234        "Seed: {}, Overlay: {}\n\n",
235        report.seed, report.overlay
236    ));
237
238    // Header row
239    out.push_str(&format!(
240        "{:20} {:>6} {:>6} {:>6} {:>7} {:>9} {:>8} {:>6} {:>7} {:>6} {:>6} {:>6}\n",
241        "Firm",
242        "Phases",
243        "Procs",
244        "Steps",
245        "Events",
246        "Artifacts",
247        "Hours",
248        "Anom",
249        "Compl%",
250        "Data%",
251        "AI%",
252        "Human%"
253    ));
254    out.push_str(&"-".repeat(110));
255    out.push('\n');
256
257    for b in &report.benchmarks {
258        out.push_str(&format!(
259            "{:20} {:>6} {:>6} {:>6} {:>7} {:>9} {:>8.0} {:>6} {:>6.0}% {:>5.0}% {:>5.0}% {:>5.0}%\n",
260            b.firm,
261            b.phases,
262            b.procedures,
263            b.steps,
264            b.events,
265            b.artifacts,
266            b.duration_hours,
267            b.anomalies,
268            b.completion_rate * 100.0,
269            b.judgment_distribution.data_only_pct,
270            b.judgment_distribution.ai_assistable_pct,
271            b.judgment_distribution.human_required_pct,
272        ));
273    }
274    out
275}
276
277// ---------------------------------------------------------------------------
278// Unit tests
279// ---------------------------------------------------------------------------
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Every built-in blueprint should load and contribute a benchmark entry.
    #[test]
    fn test_comparison_runs_all_firms() {
        let report = run_comparison(42, None);
        // All 6 blueprints should load; require at least 5 to be tolerant of
        // potential future blueprint removal.
        assert!(
            report.benchmarks.len() >= 5,
            "Expected >= 5 benchmarks, got {}",
            report.benchmarks.len()
        );
    }

    /// Methodologies should differ structurally; an all-identical report
    /// would convey no cross-firm information.
    #[test]
    fn test_comparison_shows_differences() {
        let report = run_comparison(42, None);
        // The blueprints should not all be structurally identical — at least
        // some pair should differ in phase or procedure count.
        let phases: Vec<usize> = report.benchmarks.iter().map(|b| b.phases).collect();
        let procedures: Vec<usize> = report.benchmarks.iter().map(|b| b.procedures).collect();
        let all_phases_same = phases.windows(2).all(|w| w[0] == w[1]);
        let all_procs_same = procedures.windows(2).all(|w| w[0] == w[1]);
        assert!(
            !all_phases_same || !all_procs_same,
            "All blueprints have identical phases AND procedures — expected some structural differences"
        );
    }

    /// The report must round-trip through serde JSON without losing data.
    #[test]
    fn test_comparison_report_serializes() {
        let report = run_comparison(42, None);
        let json = serde_json::to_string(&report).expect("serialization failed");
        let decoded: ComparisonReport =
            serde_json::from_str(&json).expect("deserialization failed");
        assert_eq!(report.benchmarks.len(), decoded.benchmarks.len());
        for (orig, dec) in report.benchmarks.iter().zip(decoded.benchmarks.iter()) {
            assert_eq!(orig.firm, dec.firm);
            assert_eq!(orig.events, dec.events);
            assert_eq!(orig.artifacts, dec.artifacts);
        }
    }

    /// Running twice with the same seed must be exactly reproducible, since
    /// each engagement uses a `ChaCha8Rng` seeded from that value.
    #[test]
    fn test_comparison_deterministic() {
        let r1 = run_comparison(99, None);
        let r2 = run_comparison(99, None);
        assert_eq!(r1.benchmarks.len(), r2.benchmarks.len());
        for (b1, b2) in r1.benchmarks.iter().zip(r2.benchmarks.iter()) {
            assert_eq!(b1.firm, b2.firm);
            assert_eq!(b1.events, b2.events);
            assert_eq!(b1.artifacts, b2.artifacts);
            // Compare bit patterns so float equality is exact, not approximate.
            assert_eq!(b1.duration_hours.to_bits(), b2.duration_hours.to_bits());
            assert_eq!(b1.anomalies, b2.anomalies);
        }
    }
}