datasynth_audit_optimizer/discovery.rs
1//! Blueprint discovery from event logs.
2//!
3//! Given a `Vec<AuditEvent>`, infers the underlying procedure state machines
4//! (states, transitions, initial/terminal states) and compares the result
5//! against a reference [`AuditBlueprint`].
6
7use std::collections::{HashMap, HashSet};
8
9use datasynth_audit_fsm::event::AuditEvent;
10use datasynth_audit_fsm::schema::AuditBlueprint;
11use serde::Serialize;
12
13// ---------------------------------------------------------------------------
14// Types
15// ---------------------------------------------------------------------------
16
17/// A blueprint inferred from an observed event log.
18#[derive(Debug, Clone, Serialize)]
19pub struct DiscoveredBlueprint {
20 /// One entry per unique `procedure_id` found in the event log.
21 pub procedures: Vec<DiscoveredProcedure>,
22 /// Unique phase identifiers observed across all events.
23 pub phases: Vec<String>,
24 /// Total number of events that were analysed.
25 pub total_events_analyzed: usize,
26}
27
28/// The state machine inferred for a single procedure from the event log.
29#[derive(Debug, Clone, Serialize)]
30pub struct DiscoveredProcedure {
31 /// Procedure identifier, taken directly from `AuditEvent::procedure_id`.
32 pub id: String,
33 /// Phase inferred from the first event belonging to this procedure.
34 pub phase: String,
35 /// All states encountered (union of `from_state` and `to_state` values).
36 pub states: Vec<String>,
37 /// Directed transitions as `(from_state, to_state)` pairs (deduplicated).
38 pub transitions: Vec<(String, String)>,
39 /// The `from_state` of the chronologically-first transition event.
40 pub initial_state: String,
41 /// States that appear as a `to_state` but never as a `from_state` in any
42 /// subsequent transition — these are the natural terminal states.
43 pub terminal_states: Vec<String>,
44 /// Number of events (of any kind) associated with this procedure.
45 pub event_count: usize,
46}
47
48/// Diff between a discovered blueprint and a reference blueprint.
49#[derive(Debug, Clone, Serialize)]
50pub struct BlueprintDiff {
51 /// Procedure IDs that exist in both blueprints.
52 pub matching_procedures: Vec<String>,
53 /// Procedure IDs present in the reference but absent from the discovered
54 /// blueprint (i.e. not observed in the event log).
55 pub missing_procedures: Vec<String>,
56 /// Procedure IDs present in the discovered blueprint but absent from the
57 /// reference (i.e. unexpected procedures in the event log).
58 pub extra_procedures: Vec<String>,
59 /// Transition-level differences for procedures that appear in both.
60 pub transition_diffs: Vec<TransitionDiff>,
61 /// Overall conformance score in `[0, 1]`:
62 /// `matching / (matching + missing + extra)` where *matching*,
63 /// *missing*, and *extra* are **procedure-level** counts.
64 pub conformance_score: f64,
65}
66
67/// A single transition-level difference between discovered and reference.
68#[derive(Debug, Clone, Serialize)]
69pub struct TransitionDiff {
70 /// The procedure this difference belongs to.
71 pub procedure_id: String,
72 /// `"missing"` — the transition is in the reference but not discovered;
73 /// `"extra"` — the transition is discovered but not in the reference.
74 pub diff_type: String,
75 /// Source state of the differing transition.
76 pub from_state: String,
77 /// Destination state of the differing transition.
78 pub to_state: String,
79}
80
81// ---------------------------------------------------------------------------
82// Discovery
83// ---------------------------------------------------------------------------
84
85/// Infer a [`DiscoveredBlueprint`] from a slice of [`AuditEvent`] records.
86///
87/// Only events that carry both a `from_state` **and** a `to_state` are used
88/// for state-machine reconstruction (i.e. transition events). Pure step
89/// events — where `step_id.is_some()` and the state fields are `None` — are
90/// counted towards `event_count` but ignored for FSM inference.
91///
92/// Events are expected to be in the order they were emitted by the engine; the
93/// function preserves that ordering when determining the initial state.
94pub fn discover_blueprint(events: &[AuditEvent]) -> DiscoveredBlueprint {
95 // -----------------------------------------------------------------------
96 // 1. Group events by procedure_id, preserving arrival order.
97 // -----------------------------------------------------------------------
98 let mut proc_events: HashMap<String, Vec<&AuditEvent>> = HashMap::new();
99 for event in events {
100 proc_events
101 .entry(event.procedure_id.clone())
102 .or_default()
103 .push(event);
104 }
105
106 // -----------------------------------------------------------------------
107 // 2. For stable output order, sort procedures by the timestamp of their
108 // first event.
109 // -----------------------------------------------------------------------
110 let mut proc_ids: Vec<String> = proc_events.keys().cloned().collect();
111 proc_ids.sort_by_key(|id| {
112 proc_events[id]
113 .first()
114 .map(|e| e.timestamp)
115 .unwrap_or_default()
116 });
117
118 // -----------------------------------------------------------------------
119 // 3. Reconstruct a state machine for each procedure.
120 // -----------------------------------------------------------------------
121 let mut procedures: Vec<DiscoveredProcedure> = Vec::new();
122
123 for id in &proc_ids {
124 let evts = &proc_events[id];
125
126 // Phase: from the first event in the group.
127 let phase = evts
128 .first()
129 .map(|e| e.phase_id.as_str())
130 .unwrap_or("")
131 .to_string();
132 let event_count = evts.len();
133
134 // Only consider transition events (both from_state and to_state present).
135 let transition_evts: Vec<&&AuditEvent> = evts
136 .iter()
137 .filter(|e| e.from_state.is_some() && e.to_state.is_some())
138 .collect();
139
140 // Collect states and transitions (preserving first-seen order while
141 // deduplicating).
142 let mut states_ordered: Vec<String> = Vec::new();
143 let mut states_seen: HashSet<String> = HashSet::new();
144 let mut transitions_ordered: Vec<(String, String)> = Vec::new();
145 let mut transitions_seen: HashSet<(String, String)> = HashSet::new();
146
147 // Track which states appear as from_state of any transition.
148 let mut from_states_set: HashSet<String> = HashSet::new();
149
150 for evt in &transition_evts {
151 let from = match evt.from_state.as_ref() {
152 Some(s) => s.clone(),
153 None => continue,
154 };
155 let to = match evt.to_state.as_ref() {
156 Some(s) => s.clone(),
157 None => continue,
158 };
159
160 if states_seen.insert(from.clone()) {
161 states_ordered.push(from.clone());
162 }
163 if states_seen.insert(to.clone()) {
164 states_ordered.push(to.clone());
165 }
166
167 if transitions_seen.insert((from.clone(), to.clone())) {
168 transitions_ordered.push((from.clone(), to.clone()));
169 }
170
171 from_states_set.insert(from);
172 }
173
174 // Initial state: from_state of the chronologically-first transition event.
175 let initial_state = transition_evts
176 .first()
177 .and_then(|e| e.from_state.as_ref())
178 .cloned()
179 .unwrap_or_default();
180
181 // Terminal states: appear as to_state but never as from_state of a
182 // subsequent transition.
183 let to_states_set: HashSet<String> = transition_evts
184 .iter()
185 .filter_map(|e| e.to_state.as_ref())
186 .cloned()
187 .collect();
188
189 let mut terminal_states: Vec<String> = to_states_set
190 .iter()
191 .filter(|s| !from_states_set.contains(*s))
192 .cloned()
193 .collect();
194 terminal_states.sort();
195
196 procedures.push(DiscoveredProcedure {
197 id: id.clone(),
198 phase,
199 states: states_ordered,
200 transitions: transitions_ordered,
201 initial_state,
202 terminal_states,
203 event_count,
204 });
205 }
206
207 // -----------------------------------------------------------------------
208 // 4. Collect unique phases (in the order they are first seen).
209 // -----------------------------------------------------------------------
210 let mut phases_ordered: Vec<String> = Vec::new();
211 let mut phases_seen: HashSet<String> = HashSet::new();
212 for proc in &procedures {
213 if phases_seen.insert(proc.phase.clone()) {
214 phases_ordered.push(proc.phase.clone());
215 }
216 }
217
218 DiscoveredBlueprint {
219 procedures,
220 phases: phases_ordered,
221 total_events_analyzed: events.len(),
222 }
223}
224
225// ---------------------------------------------------------------------------
226// Comparison
227// ---------------------------------------------------------------------------
228
229/// Compare a [`DiscoveredBlueprint`] against a reference [`AuditBlueprint`].
230///
231/// Returns a [`BlueprintDiff`] that describes:
232/// - which procedures match, are missing, or are extra;
233/// - which transitions within matching procedures are missing or extra;
234/// - an overall conformance score.
235///
236/// The conformance score is computed at the **procedure level**:
237/// ```text
238/// score = |matching| / (|matching| + |missing| + |extra|)
239/// ```
240/// where *matching* = procedures in both, *missing* = in reference only,
241/// *extra* = in discovered only. A score of 1.0 means perfect structural
242/// alignment.
243pub fn compare_blueprints(
244 discovered: &DiscoveredBlueprint,
245 reference: &AuditBlueprint,
246) -> BlueprintDiff {
247 // -----------------------------------------------------------------------
248 // 1. Build sets of procedure IDs from each blueprint.
249 // -----------------------------------------------------------------------
250 let discovered_ids: HashSet<String> =
251 discovered.procedures.iter().map(|p| p.id.clone()).collect();
252
253 let reference_ids: HashSet<String> = reference
254 .phases
255 .iter()
256 .flat_map(|phase| phase.procedures.iter())
257 .map(|p| p.id.clone())
258 .collect();
259
260 // -----------------------------------------------------------------------
261 // 2. Compute set differences.
262 // -----------------------------------------------------------------------
263 let mut matching_procedures: Vec<String> = discovered_ids
264 .intersection(&reference_ids)
265 .cloned()
266 .collect();
267 matching_procedures.sort();
268
269 let mut missing_procedures: Vec<String> =
270 reference_ids.difference(&discovered_ids).cloned().collect();
271 missing_procedures.sort();
272
273 let mut extra_procedures: Vec<String> =
274 discovered_ids.difference(&reference_ids).cloned().collect();
275 extra_procedures.sort();
276
277 // -----------------------------------------------------------------------
278 // 3. Compare transitions for matching procedures.
279 // -----------------------------------------------------------------------
280
281 // Build a lookup from the discovered blueprint.
282 let discovered_map: HashMap<&str, &DiscoveredProcedure> = discovered
283 .procedures
284 .iter()
285 .map(|p| (p.id.as_str(), p))
286 .collect();
287
288 // Build a lookup from the reference blueprint.
289 let reference_transitions: HashMap<&str, HashSet<(String, String)>> = reference
290 .phases
291 .iter()
292 .flat_map(|phase| phase.procedures.iter())
293 .map(|p| {
294 let set: HashSet<(String, String)> = p
295 .aggregate
296 .transitions
297 .iter()
298 .map(|t| (t.from_state.clone(), t.to_state.clone()))
299 .collect();
300 (p.id.as_str(), set)
301 })
302 .collect();
303
304 let mut transition_diffs: Vec<TransitionDiff> = Vec::new();
305
306 for proc_id in &matching_procedures {
307 let disc_proc = match discovered_map.get(proc_id.as_str()) {
308 Some(p) => p,
309 None => continue,
310 };
311 let ref_transitions = match reference_transitions.get(proc_id.as_str()) {
312 Some(t) => t,
313 None => continue,
314 };
315
316 let disc_set: HashSet<(String, String)> = disc_proc.transitions.iter().cloned().collect();
317
318 // Transitions in reference but not discovered → "missing"
319 let mut missing_trans: Vec<&(String, String)> =
320 ref_transitions.difference(&disc_set).collect();
321 missing_trans.sort();
322 for (from, to) in missing_trans {
323 transition_diffs.push(TransitionDiff {
324 procedure_id: proc_id.clone(),
325 diff_type: "missing".to_string(),
326 from_state: from.clone(),
327 to_state: to.clone(),
328 });
329 }
330
331 // Transitions in discovered but not reference → "extra"
332 let mut extra_trans: Vec<&(String, String)> =
333 disc_set.difference(ref_transitions).collect();
334 extra_trans.sort();
335 for (from, to) in extra_trans {
336 transition_diffs.push(TransitionDiff {
337 procedure_id: proc_id.clone(),
338 diff_type: "extra".to_string(),
339 from_state: from.clone(),
340 to_state: to.clone(),
341 });
342 }
343 }
344
345 // -----------------------------------------------------------------------
346 // 4. Conformance score.
347 // -----------------------------------------------------------------------
348 let m = matching_procedures.len() as f64;
349 let mi = missing_procedures.len() as f64;
350 let ex = extra_procedures.len() as f64;
351 let denominator = m + mi + ex;
352 let conformance_score = if denominator > 0.0 {
353 m / denominator
354 } else {
355 1.0
356 };
357
358 BlueprintDiff {
359 matching_procedures,
360 missing_procedures,
361 extra_procedures,
362 transition_diffs,
363 conformance_score,
364 }
365}
366
367// ---------------------------------------------------------------------------
368// Tests
369// ---------------------------------------------------------------------------
370
371#[cfg(test)]
372mod tests {
373 use super::*;
374 use datasynth_audit_fsm::benchmark::{
375 generate_benchmark, BenchmarkComplexity, BenchmarkConfig,
376 };
377 use datasynth_audit_fsm::loader::BlueprintWithPreconditions;
378
379 // -----------------------------------------------------------------------
380 // Helper: generate FSA events
381 // -----------------------------------------------------------------------
382 fn fsa_events() -> Vec<AuditEvent> {
383 generate_benchmark(&BenchmarkConfig {
384 complexity: BenchmarkComplexity::Simple,
385 anomaly_rate: None,
386 seed: 42,
387 })
388 .unwrap()
389 .events
390 }
391
392 // -----------------------------------------------------------------------
393 // Helper: generate IA events
394 // -----------------------------------------------------------------------
395 fn ia_events() -> Vec<AuditEvent> {
396 generate_benchmark(&BenchmarkConfig {
397 complexity: BenchmarkComplexity::Complex,
398 anomaly_rate: None,
399 seed: 42,
400 })
401 .unwrap()
402 .events
403 }
404
405 // -----------------------------------------------------------------------
406 // Test 1: FSA events → 9 discovered procedures with states & transitions
407 // -----------------------------------------------------------------------
408 #[test]
409 fn test_discover_from_fsa_events() {
410 let events = fsa_events();
411 let discovered = discover_blueprint(&events);
412
413 assert_eq!(
414 discovered.procedures.len(),
415 9,
416 "FSA blueprint has 9 procedures, got {}",
417 discovered.procedures.len()
418 );
419 assert_eq!(discovered.total_events_analyzed, events.len());
420
421 for proc in &discovered.procedures {
422 assert!(
423 !proc.states.is_empty(),
424 "Procedure {} should have states",
425 proc.id
426 );
427 assert!(
428 !proc.transitions.is_empty(),
429 "Procedure {} should have transitions",
430 proc.id
431 );
432 }
433 }
434
435 // -----------------------------------------------------------------------
436 // Test 2: IA events → >= 30 discovered procedures
437 // -----------------------------------------------------------------------
438 #[test]
439 fn test_discover_from_ia_events() {
440 let events = ia_events();
441 let discovered = discover_blueprint(&events);
442
443 assert!(
444 discovered.procedures.len() >= 30,
445 "IA blueprint should yield >= 30 discovered procedures, got {}",
446 discovered.procedures.len()
447 );
448 }
449
450 // -----------------------------------------------------------------------
451 // Test 3: States for accept_engagement match expected set
452 // -----------------------------------------------------------------------
453 #[test]
454 fn test_discovered_states_match_aggregate() {
455 let events = fsa_events();
456 let discovered = discover_blueprint(&events);
457
458 let proc = discovered
459 .procedures
460 .iter()
461 .find(|p| p.id == "accept_engagement")
462 .expect("accept_engagement should be discovered");
463
464 let expected: HashSet<&str> = ["not_started", "in_progress", "under_review", "completed"]
465 .iter()
466 .copied()
467 .collect();
468
469 let found: HashSet<&str> = proc.states.iter().map(|s| s.as_str()).collect();
470
471 assert_eq!(
472 found, expected,
473 "accept_engagement states should be {:?}, got {:?}",
474 expected, found
475 );
476 }
477
478 // -----------------------------------------------------------------------
479 // Test 4: Conformance score > 0.7 when compared against FSA reference
480 // -----------------------------------------------------------------------
481 #[test]
482 fn test_compare_discovered_vs_reference() {
483 let events = fsa_events();
484 let discovered = discover_blueprint(&events);
485 let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
486
487 let diff = compare_blueprints(&discovered, &bwp.blueprint);
488
489 assert!(
490 diff.conformance_score > 0.7,
491 "Conformance score should be > 0.7, got {}",
492 diff.conformance_score
493 );
494 assert!(
495 !diff.matching_procedures.is_empty(),
496 "Should have matching procedures"
497 );
498 }
499
500 // -----------------------------------------------------------------------
501 // Test 5: Partial event log → missing_procedures is non-empty
502 // -----------------------------------------------------------------------
503 #[test]
504 fn test_compare_reports_missing_procedures() {
505 let all_events = fsa_events();
506
507 // Keep only events from the first 3 unique procedure_ids.
508 let mut seen: Vec<String> = Vec::new();
509 let mut partial: Vec<AuditEvent> = Vec::new();
510 for evt in &all_events {
511 if !seen.contains(&evt.procedure_id) {
512 if seen.len() >= 3 {
513 break;
514 }
515 seen.push(evt.procedure_id.clone());
516 }
517 if seen.contains(&evt.procedure_id) {
518 partial.push(evt.clone());
519 }
520 }
521
522 // Ensure we got exactly 3 procedures.
523 let discovered = discover_blueprint(&partial);
524 assert_eq!(discovered.procedures.len(), 3);
525
526 let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
527 let diff = compare_blueprints(&discovered, &bwp.blueprint);
528
529 assert!(
530 !diff.missing_procedures.is_empty(),
531 "Should report missing procedures when only 3 / 9 procedures are in the log"
532 );
533 assert_eq!(
534 diff.missing_procedures.len(),
535 6,
536 "Expected 6 missing procedures (9 - 3), got {}",
537 diff.missing_procedures.len()
538 );
539 }
540
541 // -----------------------------------------------------------------------
542 // Test 6: BlueprintDiff serialises to JSON without error
543 // -----------------------------------------------------------------------
544 #[test]
545 fn test_blueprint_diff_serializes() {
546 let events = fsa_events();
547 let discovered = discover_blueprint(&events);
548 let bwp = BlueprintWithPreconditions::load_builtin_fsa().unwrap();
549
550 let diff = compare_blueprints(&discovered, &bwp.blueprint);
551 let json = serde_json::to_string(&diff).expect("BlueprintDiff should serialise to JSON");
552
553 assert!(json.contains("conformance_score"));
554 assert!(json.contains("matching_procedures"));
555 }
556}