use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use super::types::Record;
/// Splits `records` into two groups keyed on each record's `je_number`.
///
/// A record goes into the first vector when `hash_to_bucket(je_number, seed)`
/// is `true` and into the second otherwise. Because the decision depends only
/// on `(seed, je_number)`, every record sharing a `je_number` lands in the
/// same group, and the same `seed` always reproduces the same split within one
/// build. Relative input order is preserved inside each group.
///
/// The split is per `je_number`, not per record, so the two groups are only as
/// balanced as the hash spreads the keys; nothing here equalises their sizes.
pub fn split_5050(records: &[Record], seed: u64) -> (Vec<Record>, Vec<Record>) {
    records
        .iter()
        .cloned()
        .partition(|record| hash_to_bucket(&record.je_number, seed))
}
/// Maps `(seed, key)` to one of two buckets; `true` means the hash is even.
///
/// The seed is fed to the hasher before the key, so changing the seed changes
/// the assignment. `DefaultHasher::new()` always starts from the same state,
/// which makes the result repeatable within one build, but the standard
/// library does not promise the same hash values across Rust releases.
fn hash_to_bucket(key: &str, seed: u64) -> bool {
    let digest = {
        let mut hasher = DefaultHasher::new();
        Hash::hash(&seed, &mut hasher);
        Hash::hash(key, &mut hasher);
        hasher.finish()
    };
    digest % 2 == 0
}
/// A JE-count limit of 500,000.
///
/// NOTE(review): not referenced by the code visible here; presumably callers
/// pass it as `cap_jes` to `subsample_to_je_cap` — confirm against call sites.
pub const NOISE_FLOOR_JE_CAP: usize = 500_000;
/// Deterministically subsamples `records` to at most `cap_jes` distinct JEs
/// (distinct `je_number` values), keeping every record of each retained JE.
///
/// * `cap_jes == 0` disables the cap: all records are returned.
/// * If the input already has `cap_jes` or fewer distinct JEs, all records
///   are returned.
/// * Otherwise each distinct JE is ranked by a hash of `(seed, je_number)` and
///   exactly the `cap_jes` lowest-ranked JEs are kept.
///
/// Output preserves the relative order of `records`. The same `(records,
/// cap_jes, seed)` always yields the same sample within one build; the hash
/// values come from `DefaultHasher`, which is not guaranteed to be stable
/// across Rust releases.
///
/// The previous implementation kept a JE when `hash % n_jes < cap_jes`. That
/// hits the cap only on average and could return more than `cap_jes` JEs;
/// ranking and cutting at the `cap_jes`-th JE makes the cap a hard bound.
pub fn subsample_to_je_cap(records: &[Record], cap_jes: usize, seed: u64) -> Vec<Record> {
    // Seeded rank of a single JE: seed first, then the key.
    fn je_rank(je_number: &str, seed: u64) -> u64 {
        let mut hasher = DefaultHasher::new();
        seed.hash(&mut hasher);
        je_number.hash(&mut hasher);
        hasher.finish()
    }

    if cap_jes == 0 {
        return records.to_vec();
    }

    let distinct: std::collections::HashSet<&str> =
        records.iter().map(|r| r.je_number.as_str()).collect();
    if distinct.len() <= cap_jes {
        return records.to_vec();
    }

    // Pair every JE with its rank. The JE string is part of the sort key so
    // that two JEs with colliding hashes still have a strict, repeatable order.
    let mut ranked: Vec<(u64, &str)> = distinct
        .into_iter()
        .map(|je| (je_rank(je, seed), je))
        .collect();

    // Here 1 <= cap_jes < ranked.len(), so `cap_jes - 1` is in bounds. After
    // selection the element at that index is the largest key we keep.
    let cutoff = *ranked.select_nth_unstable(cap_jes - 1).1;

    records
        .iter()
        .filter(|r| (je_rank(&r.je_number, seed), r.je_number.as_str()) <= cutoff)
        .cloned()
        .collect()
}
/// Value `degradation_ratio` returns when the baseline is degenerate
/// (absolute value below `DEGENERATE_BASELINE_EPS`) but the real-vs-synthetic
/// value is not.
pub const DEGENERATE_BASELINE_CAP: f64 = 100.0;
/// Absolute-value threshold below which `is_degenerate_baseline` and
/// `degradation_ratio` treat a value as zero.
pub const DEGENERATE_BASELINE_EPS: f64 = 1e-9;
#[inline]
pub fn is_degenerate_baseline(baseline: f64) -> bool {
baseline.abs() < DEGENERATE_BASELINE_EPS
}
/// Metric names that `is_volume_bounded` reports as volume-bounded.
///
/// Matching is by exact, case-sensitive string equality.
/// NOTE(review): what "volume-bounded" implies for how these metrics are
/// scored is not visible in this file — confirm with the callers.
pub const VOLUME_BOUNDED_METRICS: &[&str] = &[
    "P1_IETD_W1_days",
    "P3_Fanout_W1_CostCenter",
    "P3_Fanout_W1_GLAccount",
    "P3_Fanout_W1_ProfitCenter",
    "P3_Fanout_W1_TradingPartner",
    "P2_BurstLen_W1_7d",
];
/// Returns `true` when `metric_name` exactly matches one of the entries in
/// `VOLUME_BOUNDED_METRICS` (case-sensitive, no trimming).
#[inline]
pub fn is_volume_bounded(metric_name: &str) -> bool {
    VOLUME_BOUNDED_METRICS
        .iter()
        .any(|&known| known == metric_name)
}
/// Ratio of `real_vs_syn` to `real_split_baseline`, with a guard for a
/// near-zero baseline.
///
/// * Baseline magnitude at or above `DEGENERATE_BASELINE_EPS` (or NaN): plain
///   division `real_vs_syn / real_split_baseline`, with no upper clamp.
/// * Baseline below the threshold and `real_vs_syn` also below it: `0.0`.
/// * Baseline below the threshold and `real_vs_syn` not below it:
///   `DEGENERATE_BASELINE_CAP`. A NaN `real_vs_syn` falls in this case because
///   its comparison against the threshold is false.
pub fn degradation_ratio(real_vs_syn: f64, real_split_baseline: f64) -> f64 {
    let baseline_is_zero = real_split_baseline.abs() < DEGENERATE_BASELINE_EPS;
    let distance_is_zero = real_vs_syn.abs() < DEGENERATE_BASELINE_EPS;

    match (baseline_is_zero, distance_is_zero) {
        // Usable baseline: the ratio is meaningful as-is.
        (false, _) => real_vs_syn / real_split_baseline,
        // Both sides are effectively zero: report no degradation.
        (true, true) => 0.0,
        // Dividing by ~0 would blow up, so return the fixed sentinel instead.
        (true, false) => DEGENERATE_BASELINE_CAP,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;

    /// Builds a `Record` for JE `je` / line `line`; every other field is a
    /// fixed placeholder so records differ only by those two values.
    fn record(je: &str, line: &str) -> Record {
        let day = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap();
        Record {
            source: "S".into(),
            gl_account: "1".into(),
            cost_center: None,
            profit_center: None,
            trading_partner: None,
            je_number: je.into(),
            je_line_number: line.into(),
            effective_date: day,
            entry_date: day,
            created_at: None,
            functional_amount: 1.0,
            header_text: String::new(),
            line_text: String::new(),
        }
    }

    /// A three-line JE must end up entirely in one half; the split must be
    /// repeatable for one seed and differ between seeds 42 and 99 for this
    /// fixture.
    #[test]
    fn split_keeps_multiline_jes_together() {
        fn j1_lines(half: &[Record]) -> usize {
            half.iter().filter(|rec| rec.je_number == "J1").count()
        }

        let rs: Vec<Record> = [
            ("J1", "001"),
            ("J1", "002"),
            ("J1", "003"),
            ("J2", "001"),
            ("J3", "001"),
            ("J4", "001"),
            ("J5", "001"),
            ("J6", "001"),
        ]
        .iter()
        .map(|&(je, line)| record(je, line))
        .collect();

        let (a, b) = split_5050(&rs, 42);
        assert!(matches!((j1_lines(&a), j1_lines(&b)), (3, 0) | (0, 3)));

        let (a_again, _) = split_5050(&rs, 42);
        assert_eq!(a, a_again);

        let (a_other_seed, _) = split_5050(&rs, 99);
        assert_ne!(a, a_other_seed);
    }

    /// Exercises the degenerate-baseline branches and the plain-division path.
    #[test]
    fn degradation_ratio_caps_at_degenerate_baseline() {
        let tiny = 1e-12;
        let cases = [
            (1.0, 0.0, 100.0),
            (1.0, 0.0, DEGENERATE_BASELINE_CAP),
            (0.0, 0.0, 0.0),
            (tiny, tiny, 0.0),
            (0.5, 0.25, 2.0),
            (100.0, 1.0, 100.0),
        ];
        for (real_vs_syn, baseline, want) in cases {
            assert_eq!(degradation_ratio(real_vs_syn, baseline), want);
        }
    }

    /// 100 two-line JEs capped at 30: the sample must respect the cap, stay
    /// reasonably close to it, keep both lines of every retained JE, be
    /// repeatable, and pass everything through when the cap is loose or zero.
    #[test]
    fn subsample_caps_distinct_jes_and_keeps_jes_whole() {
        let rs: Vec<Record> = (0..100)
            .flat_map(|i| {
                let je = format!("J{i}");
                [record(&je, "001"), record(&je, "002")]
            })
            .collect();

        let out = subsample_to_je_cap(&rs, 30, 42);

        // Which JEs are picked depends on hash output, so only the count and
        // the per-JE line totals are checked.
        let mut lines_per_je: std::collections::HashMap<&str, usize> =
            std::collections::HashMap::new();
        for rec in &out {
            *lines_per_je.entry(rec.je_number.as_str()).or_default() += 1;
        }
        let kept = lines_per_je.len();
        assert!(kept <= 30, "capped to <=30 distinct JEs, got {}", kept);
        assert!(
            kept >= 18,
            "hash-uniform keep should be near 30, got {}",
            kept
        );
        for (je, count) in &lines_per_je {
            assert_eq!(*count, 2, "JE {je} should keep both lines");
        }

        assert_eq!(out.len(), subsample_to_je_cap(&rs, 30, 42).len());
        assert_eq!(subsample_to_je_cap(&rs, 1000, 42).len(), rs.len());
        assert_eq!(subsample_to_je_cap(&rs, 0, 42).len(), rs.len());
    }
}