datasynth_audit_optimizer/
discovery.rs

1//! Blueprint discovery from event logs.
2//!
3//! Given a `Vec<AuditEvent>`, infers the underlying procedure state machines
4//! (states, transitions, initial/terminal states) and compares the result
5//! against a reference [`AuditBlueprint`].
6
7use std::collections::{HashMap, HashSet};
8
9use datasynth_audit_fsm::event::AuditEvent;
10use datasynth_audit_fsm::schema::AuditBlueprint;
11use serde::Serialize;
12
13// ---------------------------------------------------------------------------
14// Types
15// ---------------------------------------------------------------------------
16
17/// A blueprint inferred from an observed event log.
18#[derive(Debug, Clone, Serialize)]
19pub struct DiscoveredBlueprint {
20    /// One entry per unique `procedure_id` found in the event log.
21    pub procedures: Vec<DiscoveredProcedure>,
22    /// Unique phase identifiers observed across all events.
23    pub phases: Vec<String>,
24    /// Total number of events that were analysed.
25    pub total_events_analyzed: usize,
26}
27
28/// The state machine inferred for a single procedure from the event log.
29#[derive(Debug, Clone, Serialize)]
30pub struct DiscoveredProcedure {
31    /// Procedure identifier, taken directly from `AuditEvent::procedure_id`.
32    pub id: String,
33    /// Phase inferred from the first event belonging to this procedure.
34    pub phase: String,
35    /// All states encountered (union of `from_state` and `to_state` values).
36    pub states: Vec<String>,
37    /// Directed transitions as `(from_state, to_state)` pairs (deduplicated).
38    pub transitions: Vec<(String, String)>,
39    /// The `from_state` of the chronologically-first transition event.
40    pub initial_state: String,
41    /// States that appear as a `to_state` but never as a `from_state` in any
42    /// subsequent transition — these are the natural terminal states.
43    pub terminal_states: Vec<String>,
44    /// Number of events (of any kind) associated with this procedure.
45    pub event_count: usize,
46}
47
48/// Diff between a discovered blueprint and a reference blueprint.
49#[derive(Debug, Clone, Serialize)]
50pub struct BlueprintDiff {
51    /// Procedure IDs that exist in both blueprints.
52    pub matching_procedures: Vec<String>,
53    /// Procedure IDs present in the reference but absent from the discovered
54    /// blueprint (i.e. not observed in the event log).
55    pub missing_procedures: Vec<String>,
56    /// Procedure IDs present in the discovered blueprint but absent from the
57    /// reference (i.e. unexpected procedures in the event log).
58    pub extra_procedures: Vec<String>,
59    /// Transition-level differences for procedures that appear in both.
60    pub transition_diffs: Vec<TransitionDiff>,
61    /// Overall conformance score in `[0, 1]`:
62    /// `matching / (matching + missing + extra)` where *matching*,
63    /// *missing*, and *extra* are **procedure-level** counts.
64    pub conformance_score: f64,
65}
66
67/// A single transition-level difference between discovered and reference.
68#[derive(Debug, Clone, Serialize)]
69pub struct TransitionDiff {
70    /// The procedure this difference belongs to.
71    pub procedure_id: String,
72    /// `"missing"` — the transition is in the reference but not discovered;
73    /// `"extra"` — the transition is discovered but not in the reference.
74    pub diff_type: String,
75    /// Source state of the differing transition.
76    pub from_state: String,
77    /// Destination state of the differing transition.
78    pub to_state: String,
79}
80
81// ---------------------------------------------------------------------------
82// Discovery
83// ---------------------------------------------------------------------------
84
85/// Infer a [`DiscoveredBlueprint`] from a slice of [`AuditEvent`] records.
86///
87/// Only events that carry both a `from_state` **and** a `to_state` are used
88/// for state-machine reconstruction (i.e. transition events).  Pure step
89/// events — where `step_id.is_some()` and the state fields are `None` — are
90/// counted towards `event_count` but ignored for FSM inference.
91///
92/// Events are expected to be in the order they were emitted by the engine; the
93/// function preserves that ordering when determining the initial state.
94pub fn discover_blueprint(events: &[AuditEvent]) -> DiscoveredBlueprint {
95    // -----------------------------------------------------------------------
96    // 1. Group events by procedure_id, preserving arrival order.
97    // -----------------------------------------------------------------------
98    let mut proc_events: HashMap<String, Vec<&AuditEvent>> = HashMap::new();
99    for event in events {
100        proc_events
101            .entry(event.procedure_id.clone())
102            .or_default()
103            .push(event);
104    }
105
106    // -----------------------------------------------------------------------
107    // 2. For stable output order, sort procedures by the timestamp of their
108    //    first event.
109    // -----------------------------------------------------------------------
110    let mut proc_ids: Vec<String> = proc_events.keys().cloned().collect();
111    proc_ids.sort_by_key(|id| {
112        proc_events[id]
113            .first()
114            .map(|e| e.timestamp)
115            .unwrap_or_default()
116    });
117
118    // -----------------------------------------------------------------------
119    // 3. Reconstruct a state machine for each procedure.
120    // -----------------------------------------------------------------------
121    let mut procedures: Vec<DiscoveredProcedure> = Vec::new();
122
123    for id in &proc_ids {
124        let evts = &proc_events[id];
125
126        // Phase: from the first event in the group.
127        let phase = evts
128            .first()
129            .map(|e| e.phase_id.as_str())
130            .unwrap_or("")
131            .to_string();
132        let event_count = evts.len();
133
134        // Only consider transition events (both from_state and to_state present).
135        let transition_evts: Vec<&&AuditEvent> = evts
136            .iter()
137            .filter(|e| e.from_state.is_some() && e.to_state.is_some())
138            .collect();
139
140        // Collect states and transitions (preserving first-seen order while
141        // deduplicating).
142        let mut states_ordered: Vec<String> = Vec::new();
143        let mut states_seen: HashSet<String> = HashSet::new();
144        let mut transitions_ordered: Vec<(String, String)> = Vec::new();
145        let mut transitions_seen: HashSet<(String, String)> = HashSet::new();
146
147        // Track which states appear as from_state of any transition.
148        let mut from_states_set: HashSet<String> = HashSet::new();
149
150        for evt in &transition_evts {
151            let from = match evt.from_state.as_ref() {
152                Some(s) => s.clone(),
153                None => continue,
154            };
155            let to = match evt.to_state.as_ref() {
156                Some(s) => s.clone(),
157                None => continue,
158            };
159
160            if states_seen.insert(from.clone()) {
161                states_ordered.push(from.clone());
162            }
163            if states_seen.insert(to.clone()) {
164                states_ordered.push(to.clone());
165            }
166
167            if transitions_seen.insert((from.clone(), to.clone())) {
168                transitions_ordered.push((from.clone(), to.clone()));
169            }
170
171            from_states_set.insert(from);
172        }
173
174        // Initial state: from_state of the chronologically-first transition event.
175        let initial_state = transition_evts
176            .first()
177            .and_then(|e| e.from_state.as_ref())
178            .cloned()
179            .unwrap_or_default();
180
181        // Terminal states: appear as to_state but never as from_state of a
182        // subsequent transition.
183        let to_states_set: HashSet<String> = transition_evts
184            .iter()
185            .filter_map(|e| e.to_state.as_ref())
186            .cloned()
187            .collect();
188
189        let mut terminal_states: Vec<String> = to_states_set
190            .iter()
191            .filter(|s| !from_states_set.contains(*s))
192            .cloned()
193            .collect();
194        terminal_states.sort();
195
196        procedures.push(DiscoveredProcedure {
197            id: id.clone(),
198            phase,
199            states: states_ordered,
200            transitions: transitions_ordered,
201            initial_state,
202            terminal_states,
203            event_count,
204        });
205    }
206
207    // -----------------------------------------------------------------------
208    // 4. Collect unique phases (in the order they are first seen).
209    // -----------------------------------------------------------------------
210    let mut phases_ordered: Vec<String> = Vec::new();
211    let mut phases_seen: HashSet<String> = HashSet::new();
212    for proc in &procedures {
213        if phases_seen.insert(proc.phase.clone()) {
214            phases_ordered.push(proc.phase.clone());
215        }
216    }
217
218    DiscoveredBlueprint {
219        procedures,
220        phases: phases_ordered,
221        total_events_analyzed: events.len(),
222    }
223}
224
225// ---------------------------------------------------------------------------
226// Comparison
227// ---------------------------------------------------------------------------
228
229/// Compare a [`DiscoveredBlueprint`] against a reference [`AuditBlueprint`].
230///
231/// Returns a [`BlueprintDiff`] that describes:
232/// - which procedures match, are missing, or are extra;
233/// - which transitions within matching procedures are missing or extra;
234/// - an overall conformance score.
235///
236/// The conformance score is computed at the **procedure level**:
237/// ```text
238/// score = |matching| / (|matching| + |missing| + |extra|)
239/// ```
240/// where *matching* = procedures in both, *missing* = in reference only,
241/// *extra* = in discovered only.  A score of 1.0 means perfect structural
242/// alignment.
243pub fn compare_blueprints(
244    discovered: &DiscoveredBlueprint,
245    reference: &AuditBlueprint,
246) -> BlueprintDiff {
247    // -----------------------------------------------------------------------
248    // 1. Build sets of procedure IDs from each blueprint.
249    // -----------------------------------------------------------------------
250    let discovered_ids: HashSet<String> =
251        discovered.procedures.iter().map(|p| p.id.clone()).collect();
252
253    let reference_ids: HashSet<String> = reference
254        .phases
255        .iter()
256        .flat_map(|phase| phase.procedures.iter())
257        .map(|p| p.id.clone())
258        .collect();
259
260    // -----------------------------------------------------------------------
261    // 2. Compute set differences.
262    // -----------------------------------------------------------------------
263    let mut matching_procedures: Vec<String> = discovered_ids
264        .intersection(&reference_ids)
265        .cloned()
266        .collect();
267    matching_procedures.sort();
268
269    let mut missing_procedures: Vec<String> =
270        reference_ids.difference(&discovered_ids).cloned().collect();
271    missing_procedures.sort();
272
273    let mut extra_procedures: Vec<String> =
274        discovered_ids.difference(&reference_ids).cloned().collect();
275    extra_procedures.sort();
276
277    // -----------------------------------------------------------------------
278    // 3. Compare transitions for matching procedures.
279    // -----------------------------------------------------------------------
280
281    // Build a lookup from the discovered blueprint.
282    let discovered_map: HashMap<&str, &DiscoveredProcedure> = discovered
283        .procedures
284        .iter()
285        .map(|p| (p.id.as_str(), p))
286        .collect();
287
288    // Build a lookup from the reference blueprint.
289    let reference_transitions: HashMap<&str, HashSet<(String, String)>> = reference
290        .phases
291        .iter()
292        .flat_map(|phase| phase.procedures.iter())
293        .map(|p| {
294            let set: HashSet<(String, String)> = p
295                .aggregate
296                .transitions
297                .iter()
298                .map(|t| (t.from_state.clone(), t.to_state.clone()))
299                .collect();
300            (p.id.as_str(), set)
301        })
302        .collect();
303
304    let mut transition_diffs: Vec<TransitionDiff> = Vec::new();
305
306    for proc_id in &matching_procedures {
307        let disc_proc = match discovered_map.get(proc_id.as_str()) {
308            Some(p) => p,
309            None => continue,
310        };
311        let ref_transitions = match reference_transitions.get(proc_id.as_str()) {
312            Some(t) => t,
313            None => continue,
314        };
315
316        let disc_set: HashSet<(String, String)> = disc_proc.transitions.iter().cloned().collect();
317
318        // Transitions in reference but not discovered → "missing"
319        let mut missing_trans: Vec<&(String, String)> =
320            ref_transitions.difference(&disc_set).collect();
321        missing_trans.sort();
322        for (from, to) in missing_trans {
323            transition_diffs.push(TransitionDiff {
324                procedure_id: proc_id.clone(),
325                diff_type: "missing".to_string(),
326                from_state: from.clone(),
327                to_state: to.clone(),
328            });
329        }
330
331        // Transitions in discovered but not reference → "extra"
332        let mut extra_trans: Vec<&(String, String)> =
333            disc_set.difference(ref_transitions).collect();
334        extra_trans.sort();
335        for (from, to) in extra_trans {
336            transition_diffs.push(TransitionDiff {
337                procedure_id: proc_id.clone(),
338                diff_type: "extra".to_string(),
339                from_state: from.clone(),
340                to_state: to.clone(),
341            });
342        }
343    }
344
345    // -----------------------------------------------------------------------
346    // 4. Conformance score.
347    // -----------------------------------------------------------------------
348    let m = matching_procedures.len() as f64;
349    let mi = missing_procedures.len() as f64;
350    let ex = extra_procedures.len() as f64;
351    let denominator = m + mi + ex;
352    let conformance_score = if denominator > 0.0 {
353        m / denominator
354    } else {
355        1.0
356    };
357
358    BlueprintDiff {
359        matching_procedures,
360        missing_procedures,
361        extra_procedures,
362        transition_diffs,
363        conformance_score,
364    }
365}
366
367// ---------------------------------------------------------------------------
368// Tests
369// ---------------------------------------------------------------------------
370
371#[cfg(test)]
372mod tests {
373    use super::*;
374    use datasynth_audit_fsm::benchmark::{
375        generate_benchmark, BenchmarkComplexity, BenchmarkConfig,
376    };
377    use datasynth_audit_fsm::loader::BlueprintWithPreconditions;
378
379    // -----------------------------------------------------------------------
380    // Helper: generate FSA events
381    // -----------------------------------------------------------------------
382    fn fsa_events() -> Vec<AuditEvent> {
383        generate_benchmark(&BenchmarkConfig {
384            complexity: BenchmarkComplexity::Simple,
385            anomaly_rate: None,
386            seed: 42,
387        })
388        .unwrap()
389        .events
390    }
391
392    // -----------------------------------------------------------------------
393    // Helper: generate IA events
394    // -----------------------------------------------------------------------
395    fn ia_events() -> Vec<AuditEvent> {
396        generate_benchmark(&BenchmarkConfig {
397            complexity: BenchmarkComplexity::Complex,
398            anomaly_rate: None,
399            seed: 42,
400        })
401        .unwrap()
402        .events
403    }
404
405    // -----------------------------------------------------------------------
406    // Test 1: FSA events → 9 discovered procedures with states & transitions
407    // -----------------------------------------------------------------------
408    #[test]
409    fn test_discover_from_fsa_events() {
410        let events = fsa_events();
411        let discovered = discover_blueprint(&events);
412
413        assert_eq!(
414            discovered.procedures.len(),
415            9,
416            "FSA blueprint has 9 procedures, got {}",
417            discovered.procedures.len()
418        );
419        assert_eq!(discovered.total_events_analyzed, events.len());
420
421        for proc in &discovered.procedures {
422            assert!(
423                !proc.states.is_empty(),
424                "Procedure {} should have states",
425                proc.id
426            );
427            assert!(
428                !proc.transitions.is_empty(),
429                "Procedure {} should have transitions",
430                proc.id
431            );
432        }
433    }
434
435    // -----------------------------------------------------------------------
436    // Test 2: IA events → >= 30 discovered procedures
437    // -----------------------------------------------------------------------
438    #[test]
439    fn test_discover_from_ia_events() {
440        let events = ia_events();
441        let discovered = discover_blueprint(&events);
442
443        assert!(
444            discovered.procedures.len() >= 30,
445            "IA blueprint should yield >= 30 discovered procedures, got {}",
446            discovered.procedures.len()
447        );
448    }
449
450    // -----------------------------------------------------------------------
451    // Test 3: States for accept_engagement match expected set
452    // -----------------------------------------------------------------------
453    #[test]
454    fn test_discovered_states_match_aggregate() {
455        let events = fsa_events();
456        let discovered = discover_blueprint(&events);
457
458        let proc = discovered
459            .procedures
460            .iter()
461            .find(|p| p.id == "accept_engagement")
462            .expect("accept_engagement should be discovered");
463
464        let expected: HashSet<&str> = ["not_started", "in_progress", "under_review", "completed"]
465            .iter()
466            .copied()
467            .collect();
468
469        let found: HashSet<&str> = proc.states.iter().map(|s| s.as_str()).collect();
470
471        assert_eq!(
472            found, expected,
473            "accept_engagement states should be {:?}, got {:?}",
474            expected, found
475        );
476    }
477
478    // -----------------------------------------------------------------------
479    // Test 4: Conformance score > 0.7 when compared against FSA reference
480    // -----------------------------------------------------------------------
481    #[test]
482    fn test_compare_discovered_vs_reference() {
483        let events = fsa_events();
484        let discovered = discover_blueprint(&events);
485        let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
486
487        let diff = compare_blueprints(&discovered, &bwp.blueprint);
488
489        assert!(
490            diff.conformance_score > 0.7,
491            "Conformance score should be > 0.7, got {}",
492            diff.conformance_score
493        );
494        assert!(
495            !diff.matching_procedures.is_empty(),
496            "Should have matching procedures"
497        );
498    }
499
500    // -----------------------------------------------------------------------
501    // Test 5: Partial event log → missing_procedures is non-empty
502    // -----------------------------------------------------------------------
503    #[test]
504    fn test_compare_reports_missing_procedures() {
505        let all_events = fsa_events();
506
507        // Keep only events from the first 3 unique procedure_ids.
508        let mut seen: Vec<String> = Vec::new();
509        let mut partial: Vec<AuditEvent> = Vec::new();
510        for evt in &all_events {
511            if !seen.contains(&evt.procedure_id) {
512                if seen.len() >= 3 {
513                    break;
514                }
515                seen.push(evt.procedure_id.clone());
516            }
517            if seen.contains(&evt.procedure_id) {
518                partial.push(evt.clone());
519            }
520        }
521
522        // Ensure we got exactly 3 procedures.
523        let discovered = discover_blueprint(&partial);
524        assert_eq!(discovered.procedures.len(), 3);
525
526        let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
527        let diff = compare_blueprints(&discovered, &bwp.blueprint);
528
529        assert!(
530            !diff.missing_procedures.is_empty(),
531            "Should report missing procedures when only 3 / 9 procedures are in the log"
532        );
533        assert_eq!(
534            diff.missing_procedures.len(),
535            6,
536            "Expected 6 missing procedures (9 - 3), got {}",
537            diff.missing_procedures.len()
538        );
539    }
540
541    // -----------------------------------------------------------------------
542    // Test 6: BlueprintDiff serialises to JSON without error
543    // -----------------------------------------------------------------------
544    #[test]
545    fn test_blueprint_diff_serializes() {
546        let events = fsa_events();
547        let discovered = discover_blueprint(&events);
548        let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
549
550        let diff = compare_blueprints(&discovered, &bwp.blueprint);
551        let json = serde_json::to_string(&diff).expect("BlueprintDiff should serialise to JSON");
552
553        assert!(json.contains("conformance_score"));
554        assert!(json.contains("matching_procedures"));
555    }
556}
datasynth_audit_optimizer/discovery.rs

datasynth_audit_optimizer/
discovery.rs