Skip to main content

datasynth_eval/process_mining/
variant_analysis.rs

//! Process variant distribution analysis.
//!
//! Validates that process variants show reasonable diversity and that
//! cases are not concentrated almost entirely on the happy path.
5
6use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8
/// One process variant together with the number of cases observed for it.
#[derive(Clone, Debug)]
pub struct VariantData {
    /// Identifier of the variant (typically its sequence of activities).
    pub variant_id: String,
    /// How many cases follow this variant.
    pub case_count: usize,
    /// `true` when this variant is the happy (normal) path.
    pub is_happy_path: bool,
}
19
20/// Thresholds for variant analysis.
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct VariantThresholds {
23    /// Minimum entropy (Shannon entropy of variant distribution).
24    pub min_entropy: f64,
25    /// Maximum happy path concentration.
26    pub max_happy_path_concentration: f64,
27    /// Minimum number of distinct variants.
28    pub min_variant_count: usize,
29}
30
31impl Default for VariantThresholds {
32    fn default() -> Self {
33        Self {
34            min_entropy: 1.0,
35            max_happy_path_concentration: 0.95,
36            min_variant_count: 2,
37        }
38    }
39}
40
41/// Results of variant analysis.
42#[derive(Debug, Clone, Serialize, Deserialize)]
43pub struct VariantAnalysis {
44    /// Number of distinct variants.
45    pub variant_count: usize,
46    /// Total cases.
47    pub total_cases: usize,
48    /// Shannon entropy of variant distribution.
49    pub variant_entropy: f64,
50    /// Happy path concentration (fraction of cases on happy path).
51    pub happy_path_concentration: f64,
52    /// Top variant frequencies (variant_id, fraction).
53    pub top_variants: Vec<(String, f64)>,
54    /// Overall pass/fail.
55    pub passes: bool,
56    /// Issues found.
57    pub issues: Vec<String>,
58}
59
60/// Analyzer for process variants.
61pub struct VariantAnalyzer {
62    thresholds: VariantThresholds,
63}
64
65impl VariantAnalyzer {
66    /// Create a new analyzer with default thresholds.
67    pub fn new() -> Self {
68        Self {
69            thresholds: VariantThresholds::default(),
70        }
71    }
72
73    /// Create with custom thresholds.
74    pub fn with_thresholds(thresholds: VariantThresholds) -> Self {
75        Self { thresholds }
76    }
77
78    /// Analyze process variants.
79    pub fn analyze(&self, variants: &[VariantData]) -> EvalResult<VariantAnalysis> {
80        let mut issues = Vec::new();
81
82        if variants.is_empty() {
83            return Ok(VariantAnalysis {
84                variant_count: 0,
85                total_cases: 0,
86                variant_entropy: 0.0,
87                happy_path_concentration: 0.0,
88                top_variants: Vec::new(),
89                passes: true,
90                issues: Vec::new(),
91            });
92        }
93
94        let total_cases: usize = variants.iter().map(|v| v.case_count).sum();
95        let variant_count = variants.len();
96
97        // Shannon entropy
98        let variant_entropy = if total_cases > 0 {
99            let mut entropy = 0.0_f64;
100            for v in variants {
101                if v.case_count > 0 {
102                    let p = v.case_count as f64 / total_cases as f64;
103                    entropy -= p * p.ln();
104                }
105            }
106            entropy
107        } else {
108            0.0
109        };
110
111        // Happy path concentration
112        let happy_cases: usize = variants
113            .iter()
114            .filter(|v| v.is_happy_path)
115            .map(|v| v.case_count)
116            .sum();
117        let happy_path_concentration = if total_cases > 0 {
118            happy_cases as f64 / total_cases as f64
119        } else {
120            0.0
121        };
122
123        // Top variants
124        let mut sorted: Vec<&VariantData> = variants.iter().collect();
125        sorted.sort_by(|a, b| b.case_count.cmp(&a.case_count));
126        let top_variants: Vec<(String, f64)> = sorted
127            .iter()
128            .take(5)
129            .map(|v| {
130                (
131                    v.variant_id.clone(),
132                    if total_cases > 0 {
133                        v.case_count as f64 / total_cases as f64
134                    } else {
135                        0.0
136                    },
137                )
138            })
139            .collect();
140
141        // Check thresholds
142        if variant_count < self.thresholds.min_variant_count {
143            issues.push(format!(
144                "Only {} variants (minimum {})",
145                variant_count, self.thresholds.min_variant_count
146            ));
147        }
148        if variant_entropy < self.thresholds.min_entropy && variant_count > 1 {
149            issues.push(format!(
150                "Variant entropy {:.3} < {:.3}",
151                variant_entropy, self.thresholds.min_entropy
152            ));
153        }
154        if happy_path_concentration > self.thresholds.max_happy_path_concentration {
155            issues.push(format!(
156                "Happy path concentration {:.3} > {:.3}",
157                happy_path_concentration, self.thresholds.max_happy_path_concentration
158            ));
159        }
160
161        let passes = issues.is_empty();
162
163        Ok(VariantAnalysis {
164            variant_count,
165            total_cases,
166            variant_entropy,
167            happy_path_concentration,
168            top_variants,
169            passes,
170            issues,
171        })
172    }
173}
174
175impl Default for VariantAnalyzer {
176    fn default() -> Self {
177        Self::new()
178    }
179}
180
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Shorthand for building a `VariantData` fixture.
    fn variant(id: &str, cases: usize, happy: bool) -> VariantData {
        VariantData {
            variant_id: id.to_string(),
            case_count: cases,
            is_happy_path: happy,
        }
    }

    /// A spread of three variants satisfies the default thresholds.
    #[test]
    fn test_diverse_variants() {
        let variants = [
            variant("A->B->C", 50, true),
            variant("A->B->D->C", 30, false),
            variant("A->E->C", 20, false),
        ];

        let result = VariantAnalyzer::new().analyze(&variants).unwrap();

        assert!(result.passes);
        assert_eq!(result.variant_count, 3);
        assert!(result.variant_entropy > 0.0);
    }

    /// Nearly every case on the happy path must fail the analysis.
    #[test]
    fn test_all_happy_path() {
        let variants = [
            variant("A->B->C", 100, true),
            variant("A->B->D", 1, false),
        ];

        let result = VariantAnalyzer::new().analyze(&variants).unwrap();

        assert!(!result.passes);
        assert!(result.happy_path_concentration > 0.95);
    }

    /// A single variant is below the default minimum variant count.
    #[test]
    fn test_single_variant() {
        let variants = [variant("A->B", 100, true)];

        let result = VariantAnalyzer::new().analyze(&variants).unwrap();

        assert!(!result.passes); // Too few variants
    }

    /// Empty input is reported as passing.
    #[test]
    fn test_empty() {
        let result = VariantAnalyzer::new().analyze(&[]).unwrap();

        assert!(result.passes);
    }
}