use crate::error::EvalResult;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
/// Median pair weights at or below this value are treated as zero when the
/// tail ratio is formed: the ratio is then reported as `0.0` instead of
/// dividing by a (near-)zero median.
const EPS: f64 = 1e-9;
/// A directed, weighted flow from one account to another.
///
/// Several edges may share the same `(src, dst)` pair; the analyzer sums
/// their weights per pair.
#[derive(Debug, Clone)]
pub struct FlowEdge {
/// Identifier of the account the flow originates from.
pub src: String,
/// Identifier of the account the flow arrives at.
pub dst: String,
/// Size of the flow carried by this edge.
pub weight: f64,
}
/// Optional lower bounds for the relational-fidelity metrics.
///
/// A bound left as `None` is not checked (the `Default` value checks nothing).
/// When a metric is strictly below its bound, the analyzer records the
/// metric's name in the report's `too_clean` list.
#[derive(Debug, Clone, Default)]
pub struct RelationalFidelityThresholds {
/// Minimum acceptable `pair_diversity`.
pub min_pair_diversity: Option<f64>,
/// Minimum acceptable `edge_weight_tail_ratio`.
pub min_edge_weight_tail_ratio: Option<f64>,
/// Minimum acceptable `reciprocity`.
pub min_reciprocity: Option<f64>,
}
/// Metrics produced by `RelationalFidelityAnalyzer::analyze` for one set of
/// flow edges.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RelationalFidelityReport {
/// Number of distinct account identifiers seen as either `src` or `dst`.
pub n_accounts: usize,
/// Number of input edges, counting repeated `(src, dst)` pairs individually.
pub n_edges: usize,
/// Number of distinct directed `(src, dst)` pairs.
pub distinct_pairs: usize,
/// `distinct_pairs / n_edges`; `0.0` when there are no edges.
pub pair_diversity: f64,
/// 99th-percentile over 50th-percentile of the per-pair summed weights;
/// `0.0` when the 50th percentile is not above `1e-9`.
pub edge_weight_tail_ratio: f64,
/// Fraction of distinct pairs `(a, b)` whose reverse `(b, a)` is also
/// present; `0.0` when there are no pairs.
pub reciprocity: f64,
/// Mean number of distinct destinations per account that has at least one
/// outgoing edge; `0.0` when there are no edges.
pub mean_out_degree: f64,
/// Largest number of distinct destinations reached from a single account.
pub max_out_degree: usize,
/// Names of the metrics that fell below their configured minimum, e.g.
/// `"pair_diversity"`, `"edge_weight_tail_ratio"`, `"reciprocity"`.
pub too_clean: Vec<String>,
}
/// Computes relational-fidelity metrics over a set of flow edges and flags
/// metrics that fall below the configured thresholds.
///
/// The `Default` value carries no thresholds, so nothing is ever flagged.
#[derive(Debug, Clone, Default)]
pub struct RelationalFidelityAnalyzer {
// Lower bounds consulted when the `too_clean` list is assembled.
thresholds: RelationalFidelityThresholds,
}
impl RelationalFidelityAnalyzer {
    /// Creates an analyzer with no thresholds configured.
    ///
    /// Reports produced by such an analyzer always have an empty `too_clean`
    /// list, because every bound is `None`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates an analyzer that flags metrics falling below `thresholds`.
    pub fn with_thresholds(thresholds: RelationalFidelityThresholds) -> Self {
        Self { thresholds }
    }

    /// Computes the relational-fidelity metrics for `edges`.
    ///
    /// Metrics:
    /// - `pair_diversity`: distinct `(src, dst)` pairs divided by the number
    ///   of edges (`0.0` for empty input).
    /// - `edge_weight_tail_ratio`: p99 / p50 of the per-pair summed weights,
    ///   or `0.0` when p50 is not above `EPS`.
    /// - `reciprocity`: share of distinct pairs whose reverse pair also
    ///   exists. A self-loop (`src == dst`) is its own reverse and therefore
    ///   counts as reciprocal.
    /// - `mean_out_degree` / `max_out_degree`: distinct destinations per
    ///   account that has at least one outgoing edge.
    ///
    /// Each configured minimum in the thresholds is compared with a strict
    /// `<`; the names of the metrics that fall short are returned in
    /// `too_clean`.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`; the `EvalResult`
    /// return type is kept for interface stability.
    pub fn analyze(&self, edges: &[FlowEdge]) -> EvalResult<RelationalFidelityReport> {
        let n_edges = edges.len();

        // Summed weight per directed (src, dst) pair; parallel edges collapse
        // into a single entry.
        let mut pair_weights: BTreeMap<(&str, &str), f64> = BTreeMap::new();
        // Distinct destinations reached from each source account.
        let mut out_dsts: BTreeMap<&str, std::collections::BTreeSet<&str>> = BTreeMap::new();
        // Every account identifier seen on either end of an edge.
        let mut accounts: std::collections::BTreeSet<&str> = std::collections::BTreeSet::new();

        for e in edges {
            let (src, dst) = (e.src.as_str(), e.dst.as_str());
            *pair_weights.entry((src, dst)).or_default() += e.weight;
            out_dsts.entry(src).or_default().insert(dst);
            accounts.insert(src);
            accounts.insert(dst);
        }

        let distinct_pairs = pair_weights.len();
        // `max(1)` keeps the empty case at 0.0 instead of 0/0 = NaN.
        let pair_diversity = distinct_pairs as f64 / n_edges.max(1) as f64;

        let mut weights: Vec<f64> = pair_weights.values().copied().collect();
        // `total_cmp` is a genuine total order even when a weight is NaN.
        // A `partial_cmp(..).unwrap_or(Equal)` comparator is not, and the
        // standard sort may panic or leave an unspecified order in that case.
        weights.sort_unstable_by(f64::total_cmp);
        let p50 = percentile(&weights, 50.0);
        let p99 = percentile(&weights, 99.0);
        let edge_weight_tail_ratio = if p50 > EPS { p99 / p50 } else { 0.0 };

        let reciprocal = pair_weights
            .keys()
            .filter(|(a, b)| pair_weights.contains_key(&(*b, *a)))
            .count();
        let reciprocity = reciprocal as f64 / distinct_pairs.max(1) as f64;

        // Only accounts with at least one outgoing edge appear in `out_dsts`,
        // so the mean is taken over source accounts, not all accounts.
        let degrees: Vec<usize> = out_dsts.values().map(|s| s.len()).collect();
        let max_out_degree = degrees.iter().copied().max().unwrap_or(0);
        let mean_out_degree = if degrees.is_empty() {
            0.0
        } else {
            degrees.iter().sum::<usize>() as f64 / degrees.len() as f64
        };

        let mut too_clean = Vec::new();
        let t = &self.thresholds;
        if t.min_pair_diversity.is_some_and(|m| pair_diversity < m) {
            too_clean.push("pair_diversity".to_string());
        }
        if t.min_edge_weight_tail_ratio
            .is_some_and(|m| edge_weight_tail_ratio < m)
        {
            too_clean.push("edge_weight_tail_ratio".to_string());
        }
        if t.min_reciprocity.is_some_and(|m| reciprocity < m) {
            too_clean.push("reciprocity".to_string());
        }

        Ok(RelationalFidelityReport {
            n_accounts: accounts.len(),
            n_edges,
            distinct_pairs,
            pair_diversity,
            edge_weight_tail_ratio,
            reciprocity,
            mean_out_degree,
            max_out_degree,
            too_clean,
        })
    }
}
/// Derives at most one [`FlowEdge`] per journal entry.
///
/// For each entry the edge runs from the `gl_account` of the first line for
/// which `is_debit()` is false to the `gl_account` of the first line for which
/// `is_debit()` is true. Entries missing either kind of line are skipped.
/// The edge weight is the entry's `total_debit()` converted to `f64`, falling
/// back to `0.0` when the conversion yields `None`.
pub fn flow_edges_from_entries(entries: &[datasynth_core::models::JournalEntry]) -> Vec<FlowEdge> {
    use rust_decimal::prelude::ToPrimitive;

    entries
        .iter()
        .filter_map(|entry| {
            let from_line = entry.lines.iter().find(|line| !line.is_debit())?;
            let to_line = entry.lines.iter().find(|line| line.is_debit())?;
            Some(FlowEdge {
                src: from_line.gl_account.clone(),
                dst: to_line.gl_account.clone(),
                weight: entry.total_debit().to_f64().unwrap_or(0.0),
            })
        })
        .collect()
}
/// Returns the `p`-th percentile (with `p` on a 0–100 scale) of an
/// ascending-sorted slice by picking the element at the rounded rank
/// `p / 100 * (len - 1)`.
///
/// An empty slice yields `0.0`. Ranks outside the slice are clamped to the
/// first or last element.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted.len() {
        0 => 0.0,
        len => {
            let span = len as f64 - 1.0;
            // The float-to-usize cast saturates, so a negative rank becomes 0.
            let rank = ((p / 100.0) * span).round() as usize;
            sorted[rank.min(len - 1)]
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `FlowEdge` from string slices.
    fn edge(from: &str, to: &str, amount: f64) -> FlowEdge {
        FlowEdge {
            src: from.to_string(),
            dst: to.to_string(),
            weight: amount,
        }
    }

    #[test]
    fn measures_diversity_reciprocity_and_tail() {
        // A<->B is reciprocal, A->C appears twice (one distinct pair), and
        // D->E carries a much larger weight than the rest.
        let edges = [
            edge("A", "B", 100.0),
            edge("B", "A", 120.0),
            edge("A", "C", 50.0),
            edge("A", "C", 60.0),
            edge("D", "E", 9000.0),
        ];

        let report = RelationalFidelityAnalyzer::new().analyze(&edges).unwrap();

        assert_eq!(report.n_edges, 5);
        assert_eq!(report.distinct_pairs, 4);
        assert_eq!(report.n_accounts, 5);
        assert!((report.reciprocity - 0.5).abs() < 1e-9);
        assert!(report.edge_weight_tail_ratio > 1.0);
        assert_eq!(report.max_out_degree, 2);
    }

    #[test]
    fn flags_too_clean_against_reference_band() {
        // Four copies of the same one-directional edge: one distinct pair and
        // no reverse pair.
        let edges = vec![edge("A", "B", 100.0); 4];
        let analyzer = RelationalFidelityAnalyzer::with_thresholds(RelationalFidelityThresholds {
            min_pair_diversity: Some(0.5),
            min_reciprocity: Some(0.1),
            min_edge_weight_tail_ratio: None,
        });

        let report = analyzer.analyze(&edges).unwrap();

        assert_eq!(report.distinct_pairs, 1);
        assert!(report.too_clean.iter().any(|name| name == "pair_diversity"));
        assert!(report.too_clean.iter().any(|name| name == "reciprocity"));
    }

    #[test]
    fn empty_is_safe() {
        let report = RelationalFidelityAnalyzer::new().analyze(&[]).unwrap();

        assert_eq!(report.n_edges, 0);
        assert_eq!(report.pair_diversity, 0.0);
        assert_eq!(report.edge_weight_tail_ratio, 0.0);
        assert!(report.too_clean.is_empty());
    }
}