datasynth_eval/behavioral_fidelity/
ietd.rs1use std::collections::HashMap;
4
5use chrono::NaiveDate;
6
7use super::math::{pearson_lag1_correlation, wasserstein_1};
8use super::types::Record;
9
/// Result bundle produced by [`compute_p1`] when comparing a real and a
/// synthetic record set on their inter-event-time behaviour.
#[derive(Debug, Clone, PartialEq)]
pub struct P1Outcome {
    /// Value returned by `wasserstein_1` for the pooled inter-event gaps
    /// (whole days) of the real versus the synthetic records.
    pub ietd_w1_days: f64,
    /// Pooled lag-1 autocorrelation of inter-event gaps in the real records.
    pub autocorr_real: f64,
    /// Pooled lag-1 autocorrelation of inter-event gaps in the synthetic records.
    pub autocorr_syn: f64,
    /// Absolute difference between `autocorr_real` and `autocorr_syn`.
    pub autocorr_gap: f64,
}
18
19pub fn compute_p1<F, G>(real: &[Record], syn: &[Record], entity_of: F, date_of: G) -> P1Outcome
25where
26 F: Fn(&Record) -> Option<String> + Copy,
27 G: Fn(&Record) -> NaiveDate + Copy,
28{
29 let iets_real = pooled_iets(real, entity_of, date_of);
30 let iets_syn = pooled_iets(syn, entity_of, date_of);
31 let w1 = wasserstein_1(&iets_real, &iets_syn);
32
33 let auto_real = pooled_autocorr(real, entity_of, date_of);
34 let auto_syn = pooled_autocorr(syn, entity_of, date_of);
35 P1Outcome {
36 ietd_w1_days: w1,
37 autocorr_real: auto_real,
38 autocorr_syn: auto_syn,
39 autocorr_gap: (auto_real - auto_syn).abs(),
40 }
41}
42
43fn group_by_entity<F>(records: &[Record], entity_of: F) -> HashMap<String, Vec<&Record>>
44where
45 F: Fn(&Record) -> Option<String> + Copy,
46{
47 let mut by: HashMap<String, Vec<&Record>> = HashMap::new();
48 for r in records {
49 if let Some(e) = entity_of(r) {
50 by.entry(e).or_default().push(r);
51 }
52 }
53 by
54}
55
56fn pooled_iets<F, G>(records: &[Record], entity_of: F, date_of: G) -> Vec<f64>
57where
58 F: Fn(&Record) -> Option<String> + Copy,
59 G: Fn(&Record) -> NaiveDate + Copy,
60{
61 let mut out = Vec::new();
62 for (_e, mut rows) in group_by_entity(records, entity_of) {
63 if rows.len() < 2 {
64 continue;
65 }
66 rows.sort_by_key(|r| date_of(r));
67 for w in rows.windows(2) {
68 let d = (date_of(w[1]) - date_of(w[0])).num_days() as f64;
69 if d >= 0.0 {
70 out.push(d);
71 }
72 }
73 }
74 out
75}
76
77fn pooled_autocorr<F, G>(records: &[Record], entity_of: F, date_of: G) -> f64
78where
79 F: Fn(&Record) -> Option<String> + Copy,
80 G: Fn(&Record) -> NaiveDate + Copy,
81{
82 let mut acc = 0.0;
83 let mut n = 0;
84 for (_e, mut rows) in group_by_entity(records, entity_of) {
85 if rows.len() < 3 {
86 continue;
87 }
88 rows.sort_by_key(|r| date_of(r));
89 let iets: Vec<f64> = rows
90 .windows(2)
91 .map(|w| (date_of(w[1]) - date_of(w[0])).num_days() as f64)
92 .collect();
93 if let Some(r) = pearson_lag1_correlation(&iets) {
94 acc += r;
95 n += 1;
96 }
97 }
98 if n == 0 {
99 0.0
100 } else {
101 acc / n as f64
102 }
103}
104
105pub fn source_of(r: &Record) -> Option<String> {
107 Some(r.source.clone())
108}
109
110pub fn trading_partner_of(r: &Record) -> Option<String> {
112 r.trading_partner.clone()
113}
114
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;

    /// Builds a minimal record for `src` whose effective and entry dates both
    /// fall on the given calendar day.
    fn rec(src: &str, year: i32, mon: u32, day: u32) -> Record {
        let date = NaiveDate::from_ymd_opt(year, mon, day).unwrap();
        Record {
            source: src.into(),
            gl_account: "1".into(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: format!("JE-{src}-{day}"),
            je_line_number: "001".into(),
            effective_date: date,
            entry_date: date,
            created_at: None,
            functional_amount: 1.0,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    /// One record per listed day of January 2022, all for the same source.
    fn january_2022(src: &str, days: &[u32]) -> Vec<Record> {
        days.iter().map(|&day| rec(src, 2022, 1, day)).collect()
    }

    /// Shared "real" fixture: source A posts daily, source B every four days.
    fn baseline() -> Vec<Record> {
        let mut rows = january_2022("A", &[1, 2, 3, 4]);
        rows.extend(january_2022("B", &[1, 5, 9]));
        rows
    }

    #[test]
    fn p1_identical_data_w1_zero_autocorr_gap_zero() {
        // Comparing a data set with itself must report no difference at all.
        let real = baseline();
        let out = compute_p1(&real, &real, source_of, |r| r.entry_date);
        assert!(out.ietd_w1_days.abs() < 1e-9);
        assert!(out.autocorr_gap.abs() < 1e-9);
    }

    #[test]
    fn p1_compressed_vs_uniform_detects_shift() {
        let real = baseline();
        // Same as the baseline except that source A's gaps are stretched from 1 to 5 days.
        let mut syn = january_2022("A", &[1, 6, 11, 16]);
        syn.extend(january_2022("B", &[1, 5, 9]));

        let out = compute_p1(&real, &syn, source_of, |r| r.entry_date);
        assert!(
            out.ietd_w1_days > 0.5,
            "expected non-trivial W1, got {}",
            out.ietd_w1_days
        );
    }
}