datasynth_eval/statistical/
relational_fidelity.rs1use crate::error::EvalResult;
16use serde::{Deserialize, Serialize};
17use std::collections::BTreeMap;
18
/// Smallest median pair weight for which the tail ratio is computed; at or
/// below it the ratio is reported as `0.0` instead of dividing by (near) zero.
const EPS: f64 = 1e-9;
20
/// One directed, weighted flow between two accounts.
///
/// Several edges may share the same `(src, dst)` pair; the analyzer sums their
/// weights per pair before computing weight statistics.
#[derive(Debug, Clone)]
pub struct FlowEdge {
    /// Identifier of the account the flow originates from (the edge's source node).
    pub src: String,
    /// Identifier of the account the flow arrives at (the edge's destination node).
    pub dst: String,
    /// Magnitude of the flow; accumulated per `(src, dst)` pair during analysis.
    pub weight: f64,
}
31
/// Optional lower bounds for the relational metrics.
///
/// A metric whose threshold is `Some(min)` and whose measured value is strictly
/// below `min` is listed in the report's `too_clean` field. A `None` threshold
/// disables the check; the `Default` value disables all of them.
#[derive(Debug, Clone, Default)]
pub struct RelationalFidelityThresholds {
    /// Minimum acceptable `pair_diversity` (distinct pairs per edge).
    pub min_pair_diversity: Option<f64>,
    /// Minimum acceptable `edge_weight_tail_ratio` (p99 / p50 of per-pair weights).
    pub min_edge_weight_tail_ratio: Option<f64>,
    /// Minimum acceptable `reciprocity` (share of pairs whose reverse pair exists).
    pub min_reciprocity: Option<f64>,
}
42
/// Summary of the structure of a flow graph, as produced by
/// `RelationalFidelityAnalyzer::analyze`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RelationalFidelityReport {
    /// Number of distinct account identifiers appearing as a source or destination.
    pub n_accounts: usize,
    /// Total number of input edges, duplicates included.
    pub n_edges: usize,
    /// Number of distinct ordered `(src, dst)` pairs.
    pub distinct_pairs: usize,
    /// `distinct_pairs / n_edges`; `0.0` when there are no edges.
    pub pair_diversity: f64,
    /// Ratio of the 99th to the 50th percentile of per-pair summed weights;
    /// `0.0` when the median is not above `EPS` (including the empty case).
    pub edge_weight_tail_ratio: f64,
    /// Fraction of distinct ordered pairs `(a, b)` for which `(b, a)` is also
    /// present; `0.0` when there are no pairs.
    pub reciprocity: f64,
    /// Mean number of distinct destinations per source account; `0.0` when
    /// there are no edges.
    pub mean_out_degree: f64,
    /// Largest number of distinct destinations reached from any single source.
    pub max_out_degree: usize,
    /// Names of the metrics that fell below their configured minimum
    /// (`"pair_diversity"`, `"edge_weight_tail_ratio"`, `"reciprocity"`).
    pub too_clean: Vec<String>,
}
66
/// Computes relational-structure metrics over a list of flow edges and flags
/// the ones that fall below the configured minimums.
#[derive(Debug, Clone, Default)]
pub struct RelationalFidelityAnalyzer {
    /// Lower bounds used to populate `too_clean`; all `None` by default.
    thresholds: RelationalFidelityThresholds,
}
72
73impl RelationalFidelityAnalyzer {
74 pub fn new() -> Self {
76 Self::default()
77 }
78
79 pub fn with_thresholds(thresholds: RelationalFidelityThresholds) -> Self {
81 Self { thresholds }
82 }
83
84 pub fn analyze(&self, edges: &[FlowEdge]) -> EvalResult<RelationalFidelityReport> {
86 let n_edges = edges.len();
87 let mut pair_weights: BTreeMap<(&str, &str), f64> = BTreeMap::new();
88 let mut out_dsts: BTreeMap<&str, std::collections::BTreeSet<&str>> = BTreeMap::new();
89 let mut accounts: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();
90 for e in edges {
91 *pair_weights.entry((&e.src, &e.dst)).or_default() += e.weight;
92 out_dsts.entry(&e.src).or_default().insert(&e.dst);
93 accounts.insert(&e.src);
94 accounts.insert(&e.dst);
95 }
96
97 let distinct_pairs = pair_weights.len();
98 let pair_diversity = distinct_pairs as f64 / n_edges.max(1) as f64;
99
100 let mut weights: Vec<f64> = pair_weights.values().copied().collect();
101 weights.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
102 let p50 = percentile(&weights, 50.0);
103 let p99 = percentile(&weights, 99.0);
104 let edge_weight_tail_ratio = if p50 > EPS { p99 / p50 } else { 0.0 };
105
106 let reciprocal = pair_weights
107 .keys()
108 .filter(|(a, b)| pair_weights.contains_key(&(*b, *a)))
109 .count();
110 let reciprocity = reciprocal as f64 / distinct_pairs.max(1) as f64;
111
112 let degrees: Vec<usize> = out_dsts.values().map(|s| s.len()).collect();
113 let max_out_degree = degrees.iter().copied().max().unwrap_or(0);
114 let mean_out_degree = if degrees.is_empty() {
115 0.0
116 } else {
117 degrees.iter().sum::<usize>() as f64 / degrees.len() as f64
118 };
119
120 let mut too_clean = Vec::new();
121 let t = &self.thresholds;
122 if t.min_pair_diversity.is_some_and(|m| pair_diversity < m) {
123 too_clean.push("pair_diversity".to_string());
124 }
125 if t.min_edge_weight_tail_ratio
126 .is_some_and(|m| edge_weight_tail_ratio < m)
127 {
128 too_clean.push("edge_weight_tail_ratio".to_string());
129 }
130 if t.min_reciprocity.is_some_and(|m| reciprocity < m) {
131 too_clean.push("reciprocity".to_string());
132 }
133
134 Ok(RelationalFidelityReport {
135 n_accounts: accounts.len(),
136 n_edges,
137 distinct_pairs,
138 pair_diversity,
139 edge_weight_tail_ratio,
140 reciprocity,
141 mean_out_degree,
142 max_out_degree,
143 too_clean,
144 })
145 }
146}
147
148pub fn flow_edges_from_entries(entries: &[datasynth_core::models::JournalEntry]) -> Vec<FlowEdge> {
152 use rust_decimal::prelude::ToPrimitive;
153 let mut out = Vec::new();
154 for e in entries {
155 let src = e
156 .lines
157 .iter()
158 .find(|l| !l.is_debit())
159 .map(|l| &l.gl_account);
160 let dst = e.lines.iter().find(|l| l.is_debit()).map(|l| &l.gl_account);
161 if let (Some(src), Some(dst)) = (src, dst) {
162 out.push(FlowEdge {
163 src: src.clone(),
164 dst: dst.clone(),
165 weight: e.total_debit().to_f64().unwrap_or(0.0),
166 });
167 }
168 }
169 out
170}
171
/// Nearest-rank percentile of an ascending-sorted slice.
///
/// `p` is expressed on a 0–100 scale. The rank is `p / 100 * (len - 1)`
/// rounded to the nearest index and clamped to the last element, with no
/// interpolation between neighbours. An empty slice yields `0.0`.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted.len() {
        0 => 0.0,
        len => {
            let last = len - 1;
            let rank = ((p / 100.0) * (len as f64 - 1.0)).round() as usize;
            sorted[rank.min(last)]
        }
    }
}
179
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a test edge from string literals.
    fn edge(from: &str, to: &str, amount: f64) -> FlowEdge {
        FlowEdge {
            src: String::from(from),
            dst: String::from(to),
            weight: amount,
        }
    }

    #[test]
    fn measures_diversity_reciprocity_and_tail() {
        // Four distinct ordered pairs over five accounts: A<->B is the only
        // reciprocated relationship, A->C repeats, and D->E is a heavy outlier.
        let flows = [
            edge("A", "B", 100.0),
            edge("B", "A", 120.0),
            edge("A", "C", 50.0),
            edge("A", "C", 60.0),
            edge("D", "E", 9000.0),
        ];
        let report = RelationalFidelityAnalyzer::new()
            .analyze(&flows)
            .unwrap();

        assert_eq!(report.n_edges, 5);
        assert_eq!(report.distinct_pairs, 4);
        assert_eq!(report.n_accounts, 5);
        // (A,B) and (B,A) are reciprocal: 2 of 4 pairs.
        assert!((report.reciprocity - 0.5).abs() < 1e-9);
        // The D->E outlier dominates the upper percentile.
        assert!(report.edge_weight_tail_ratio > 1.0);
        // A reaches B and C.
        assert_eq!(report.max_out_degree, 2);
    }

    #[test]
    fn flags_too_clean_against_reference_band() {
        // The same edge repeated four times: one pair, no reverse edge.
        let flows: Vec<FlowEdge> = (0..4).map(|_| edge("A", "B", 100.0)).collect();
        let analyzer = RelationalFidelityAnalyzer::with_thresholds(RelationalFidelityThresholds {
            min_pair_diversity: Some(0.5),
            min_reciprocity: Some(0.1),
            min_edge_weight_tail_ratio: None,
        });
        let report = analyzer.analyze(&flows).unwrap();

        assert_eq!(report.distinct_pairs, 1);
        assert!(report.too_clean.iter().any(|m| m == "pair_diversity"));
        assert!(report.too_clean.iter().any(|m| m == "reciprocity"));
    }

    #[test]
    fn empty_is_safe() {
        let no_edges: &[FlowEdge] = &[];
        let report = RelationalFidelityAnalyzer::new().analyze(no_edges).unwrap();

        assert_eq!(report.n_edges, 0);
        assert_eq!(report.pair_diversity, 0.0);
        assert_eq!(report.edge_weight_tail_ratio, 0.0);
        assert!(report.too_clean.is_empty());
    }
}