1use std::collections::BTreeMap;
10
11use rand::RngExt;
12use serde::{Deserialize, Serialize};
13
14use super::text_taxonomy::TextTaxonomyPrior;
15
/// Empirical cumulative distribution function for one named column.
///
/// `values` and `probabilities` are parallel vectors: `probabilities[i]` is the
/// cumulative probability attached to `values[i]`. The lookup methods search
/// both vectors assuming ascending order.
#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
pub struct EmpiricalCdf {
    /// Name of the column this distribution describes.
    pub column: String,
    /// Sample values, expected in ascending order.
    pub values: Vec<f64>,
    /// Cumulative probability for each entry of `values` (same length expected).
    pub probabilities: Vec<f64>,
}
30
31impl EmpiricalCdf {
32 pub fn from_sorted_values(column: impl Into<String>, values: Vec<f64>) -> Self {
34 let n = values.len();
35 let probabilities: Vec<f64> = (1..=n).map(|i| i as f64 / n as f64).collect();
36 Self {
37 column: column.into(),
38 values,
39 probabilities,
40 }
41 }
42
43 pub fn cdf(&self, x: f64) -> f64 {
45 match self.values.binary_search_by(|v| v.total_cmp(&x)) {
46 Ok(i) => self.probabilities[i],
47 Err(i) => {
48 if i == 0 {
49 0.0
50 } else if i >= self.values.len() {
51 1.0
52 } else {
53 let (x0, x1) = (self.values[i - 1], self.values[i]);
54 let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
55 p0 + (p1 - p0) * (x - x0) / (x1 - x0)
56 }
57 }
58 }
59 }
60
61 pub fn quantile(&self, p: f64) -> f64 {
63 if p <= 0.0 {
64 return *self.values.first().unwrap_or(&0.0);
65 }
66 if p >= 1.0 {
67 return *self.values.last().unwrap_or(&0.0);
68 }
69 match self.probabilities.binary_search_by(|v| v.total_cmp(&p)) {
70 Ok(i) => self.values[i],
71 Err(i) => {
72 if i == 0 {
73 self.values[0]
74 } else if i >= self.probabilities.len() {
75 *self.values.last().unwrap_or(&0.0)
76 } else {
77 let (p0, p1) = (self.probabilities[i - 1], self.probabilities[i]);
78 let (x0, x1) = (self.values[i - 1], self.values[i]);
79 x0 + (x1 - x0) * (p - p0) / (p1 - p0)
80 }
81 }
82 }
83 }
84}
85
/// Log-normal amount parameters looked up by source code.
///
/// The outer key of both maps is a source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAmountPrior {
    /// Parameters keyed first by source, then by a second "class" key.
    /// NOTE(review): what the class key identifies is not visible in this
    /// file — confirm with the producer of these priors.
    pub by_source_and_class: BTreeMap<String, BTreeMap<String, LognormalAmount>>,
    /// Parameters keyed by source alone.
    pub by_source: BTreeMap<String, LognormalAmount>,
}
109
/// Parameters of a log-normal amount distribution plus fit metadata.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalAmount {
    /// Passed as `mu` to `rand_distr::LogNormal::new` when sampling.
    pub mu: f64,
    /// Passed as `sigma` to `rand_distr::LogNormal::new` when sampling
    /// (after being clamped to a small positive minimum).
    pub sigma: f64,
    /// Presumably the number of observations behind the fit — TODO confirm.
    pub n: usize,
    /// Presumably the median absolute amount of those observations; not read
    /// by the sampling code in this file — TODO confirm.
    pub median_abs: f64,
}
128
129impl LognormalAmount {
130 pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> f64 {
138 use rand_distr::{Distribution, LogNormal};
139 let sigma = self.sigma.max(1e-6);
141 let dist = LogNormal::new(self.mu, sigma)
142 .unwrap_or_else(|_| LogNormal::new(0.0, 1.0).expect("fallback lognormal"));
143 dist.sample(rng)
144 }
145}
146
/// Top-level, serde-serializable bundle of behavioral priors.
///
/// The first ten fields are required on input. Every `Option` field is left
/// out of the serialized form when it is `None` and deserializes to `None`
/// when the key is missing, so documents without those keys still parse.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct BehavioralPriors {
    /// Schema version of this document (see [`BehavioralPriors::SCHEMA_VERSION`]).
    pub schema_version: u32,
    /// Version string of the generator that produced the priors.
    pub generator_version: String,
    /// Industry label the priors were built for.
    pub industry: String,
    /// Number of client inputs that contributed.
    pub n_client_inputs: usize,
    /// Number of rows aggregated across those inputs.
    pub n_rows_aggregated: usize,
    /// Relative frequency of each source code.
    pub source_mix: SourceMixPrior,
    /// Per-source IET summaries (presumably inter-event times — TODO confirm).
    pub per_source_iet: PerSourceIetPrior,
    /// Lines-per-journal-entry histograms, overall and per source.
    pub lines_per_je: LinesPerJePrior,
    /// Active-lifetime histograms, overall and per source.
    pub active_lifetime: ActiveLifetimePrior,
    /// Fan-out histograms keyed by attribute.
    pub fanout: FanoutPrior,
    /// Optional per-source posting-lag summaries.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub posting_lag: Option<PostingLagPrior>,
    /// Optional per-source active-segment histograms.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub active_segments: Option<ActiveSegmentsPrior>,
    /// Optional entity clusters.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub entity_clusters: Option<EntityClustersPrior>,
    /// Optional categorical distributions conditioned on source and attribute.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub per_source_attribute: Option<PerSourceAttributePrior>,
    /// Optional second set of entity clusters.
    /// NOTE(review): the meaning of the `tp_` prefix is not visible here — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tp_entity_clusters: Option<EntityClustersPrior>,
    /// Optional chart-of-accounts metadata keyed by account.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coa_semantic: Option<CoaSemanticPrior>,
    /// Optional per-source reference templates.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reference_formats: Option<ReferenceFormatPrior>,
    /// Optional text taxonomy prior (defined in the sibling `text_taxonomy` module).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub text_taxonomy: Option<TextTaxonomyPrior>,
    /// Optional per-user behavior profiles.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub user_personas: Option<UserPersonaPrior>,
    /// Optional log-normal amount parameters conditioned on source.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_amount_conditionals: Option<PerSourceAmountPrior>,
    /// Optional categorical distributions conditioned on source and role.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source_role_gl_conditionals: Option<PerSourceRolePrior>,
    /// Optional trial-balance targets per account.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tb_anchor: Option<TbAnchorPrior>,
}
243
impl BehavioralPriors {
    /// Schema version constant; the tests in this file write it into
    /// the `schema_version` field of freshly built values.
    pub const SCHEMA_VERSION: u32 = 1;
}
247
/// Relative frequency of each source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceMixPrior {
    /// Weight per source code. `sample` divides by the sum of all weights,
    /// so the values do not have to add up to 1.
    pub probabilities: BTreeMap<String, f64>,
    /// Not read by the code visible in this file; presumably the mass folded
    /// into an "other" bucket when the prior was built — TODO confirm.
    pub other_fraction: f64,
    /// Not read by the code visible in this file; presumably the minimum share
    /// a code needed to be listed individually — TODO confirm.
    pub min_threshold: f64,
}
254
255impl SourceMixPrior {
256 pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> String {
262 if self.probabilities.is_empty() {
263 return "SA".to_string();
264 }
265 let r: f64 = rng.random_range(0.0..1.0);
266 let total: f64 = self.probabilities.values().sum();
267 if total <= 0.0 {
268 return self
269 .probabilities
270 .keys()
271 .next()
272 .cloned()
273 .unwrap_or_else(|| "SA".to_string());
274 }
275 let mut cum = 0.0;
276 for (code, &weight) in &self.probabilities {
277 cum += weight / total;
278 if r <= cum {
279 return code.clone();
280 }
281 }
282 self.probabilities
284 .keys()
285 .next_back()
286 .cloned()
287 .unwrap_or_else(|| "SA".to_string())
288 }
289
290 pub fn sap_default() -> Self {
299 let head = [
300 ("RV", 0.16),
301 ("KR", 0.12),
302 ("DR", 0.10),
303 ("SA", 0.09),
304 ("DZ", 0.08),
305 ("KZ", 0.07),
306 ("WE", 0.06),
307 ("RE", 0.05),
308 ("DG", 0.04),
309 ("KG", 0.035),
310 ("WA", 0.03),
311 ("WL", 0.025),
315 ("ZP", 0.02),
316 ("SK", 0.018),
317 ("AF", 0.015),
318 ("AA", 0.012),
319 ("ML", 0.010),
320 ("PR", 0.008),
321 ("RN", 0.007),
322 ("WI", 0.006),
323 ("AN", 0.005),
324 ("UE", 0.004),
325 ("ZV", 0.003),
326 ("EU", 0.002),
327 ];
328 let mut probabilities: BTreeMap<String, f64> =
329 head.into_iter().map(|(k, v)| (k.to_string(), v)).collect();
330
331 const TAIL_N: usize = 500;
346 const TAIL_MASS: f64 = 0.15;
347 let zipf: f64 = (1..=TAIL_N).map(|r| 1.0 / (r as f64).powf(1.1)).sum();
348 for r in 1..=TAIL_N {
349 let w = TAIL_MASS * (1.0 / (r as f64).powf(1.1)) / zipf;
350 probabilities.insert(format!("Z{r:03}"), w);
351 }
352
353 Self {
354 probabilities,
355 other_fraction: 0.0,
356 min_threshold: 0.0,
357 }
358 }
359}
360
/// IET summaries keyed by source code.
/// NOTE(review): "IET" presumably stands for inter-event time — confirm.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceIetPrior {
    /// One summary per source code.
    pub by_source: BTreeMap<String, IetSummary>,
}
365
/// Summary of one source's IET distribution.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct IetSummary {
    /// Presumably the number of observations summarized — TODO confirm.
    pub n: usize,
    /// Empirical CDF; the field name indicates the unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    /// Optional log-normal parameters; `None` when no fit is available.
    pub lognormal_fit: Option<LognormalParams>,
    /// Lag-1 autocorrelation (presumably of successive IETs — TODO confirm).
    pub lag1_autocorr: f64,
}
373
/// Bare `(mu, sigma)` parameter pair of a log-normal fit.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LognormalParams {
    /// Location parameter `mu`.
    pub mu: f64,
    /// Scale parameter `sigma`.
    pub sigma: f64,
}
379
/// Lines-per-journal-entry histograms.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LinesPerJePrior {
    /// Histogram over all sources combined.
    pub overall: LineCountHistogram,
    /// Histogram per source code.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    /// Presumably the minimum number of journal entries a source needs before
    /// it gets its own histogram; not enforced in this file — TODO confirm.
    pub min_jes_per_source: usize,
}
386
/// Active-lifetime histograms, stored in the generic [`LineCountHistogram`] shape.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveLifetimePrior {
    /// Histogram per source code.
    pub by_source: BTreeMap<String, LineCountHistogram>,
    /// Histogram over all sources combined.
    pub overall: LineCountHistogram,
}
392
/// Fan-out histograms, stored in the generic [`LineCountHistogram`] shape.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct FanoutPrior {
    /// One histogram per attribute name.
    pub by_attribute: BTreeMap<String, LineCountHistogram>,
}
397
/// Posting-lag summaries keyed by source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PostingLagPrior {
    /// One summary per source code.
    pub by_source: BTreeMap<String, LagSummary>,
}
402
/// Summary statistics of one lag distribution.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LagSummary {
    /// Empirical CDF; the field name indicates the unit is days.
    pub empirical_cdf_days: EmpiricalCdf,
    /// Mean lag (presumably in days, matching the CDF — TODO confirm).
    pub mean: f64,
    /// Standard deviation of the lag (same unit assumption as `mean`).
    pub stddev: f64,
    /// Presumably the number of observations summarized — TODO confirm.
    pub n: usize,
}
410
/// Active-segment histograms keyed by source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ActiveSegmentsPrior {
    /// One segment summary per source code.
    pub by_source: BTreeMap<String, SourceSegmentSummary>,
}
420
/// Three histograms describing one source's activity segments.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct SourceSegmentSummary {
    /// Histogram of the number of segments.
    pub segment_count_histogram: LineCountHistogram,
    /// Histogram of segment lengths.
    pub segment_length_histogram: LineCountHistogram,
    /// Histogram of gap lengths between segments.
    pub gap_length_histogram: LineCountHistogram,
}
427
/// Ascending bucket edges for segment-count histograms
/// (presumably passed to `LineCountHistogram::build` — TODO confirm at call sites).
pub const SEGMENT_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 6, 8, 12, 16, 24];
430
/// Ascending bucket edges for segment-gap histograms
/// (presumably measured in days — TODO confirm at call sites).
pub const SEGMENT_GAP_BUCKETS: &[u32] = &[1, 2, 3, 7, 14, 30, 60, 90];
433
/// A set of entity clusters plus an overall clustering rate.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityClustersPrior {
    /// The clusters themselves.
    pub clusters: Vec<EntityCluster>,
    /// Overall clustering rate.
    /// NOTE(review): the exact definition (numerator/denominator) is not
    /// visible in this file — confirm with the producer.
    pub clustering_rate: f64,
}
445
/// Categorical distributions conditioned on source code and attribute name.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceAttributePrior {
    /// Outer key: source code. Inner key: attribute name
    /// (the tests in this file use `"gl_account"`).
    pub by_source: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
    /// Presumably the minimum number of observations required for a
    /// distribution to be stored; not enforced in this file — TODO confirm.
    pub min_observations: usize,
}
466
467impl PerSourceAttributePrior {
468 pub fn conditional(&self, source: &str, attribute: &str) -> Option<&CategoricalDistribution> {
472 self.by_source.get(source)?.get(attribute)
473 }
474}
475
/// Categorical distributions conditioned on source code and role.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct PerSourceRolePrior {
    /// Outer key: source code. Inner key: role
    /// (the tests in this file use `"DR"` and `"CR"`).
    pub by_source_and_role: BTreeMap<String, BTreeMap<String, CategoricalDistribution>>,
}
498
499impl PerSourceRolePrior {
500 pub fn conditional(&self, source: &str, role: &str) -> Option<&CategoricalDistribution> {
506 self.by_source_and_role.get(source)?.get(role)
507 }
508}
509
/// Trial-balance targets per account plus balance-sheet totals.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbAnchorPrior {
    /// Target balances keyed by account.
    pub per_account: BTreeMap<String, TbTarget>,
    /// Total assets.
    pub total_assets: f64,
    /// Total liabilities.
    pub total_liabilities: f64,
    /// Total equity.
    pub total_equity: f64,
    /// Presumably the number of clients that contributed — TODO confirm.
    pub n_clients: usize,
}
541
542impl TbAnchorPrior {
543 pub fn has_data(&self) -> bool {
546 self.per_account
547 .values()
548 .any(|t| t.closing_balance.abs() > 1e-9 || t.opening_balance.abs() > 1e-9)
549 }
550}
551
/// Trial-balance target for a single account.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct TbTarget {
    /// Opening balance.
    pub opening_balance: f64,
    /// Closing balance.
    pub closing_balance: f64,
    /// Net activity over the period
    /// (presumably closing minus opening — not enforced here, TODO confirm).
    pub period_net_activity: f64,
    /// Standard deviation of the opening balance
    /// (presumably across contributing clients — TODO confirm).
    pub opening_stdev: f64,
    /// Standard deviation of the closing balance (same assumption as above).
    pub closing_stdev: f64,
    /// Presumably the number of clients that reported this account — TODO confirm.
    pub n_clients: usize,
}
572
/// Discrete distribution over string-valued categories.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CategoricalDistribution {
    /// Probability (or relative weight) per category; sampling normalizes by
    /// the sum of all entries.
    pub probabilities: BTreeMap<String, f64>,
    /// Total count the probabilities were derived from (set by `from_counts`).
    pub n: usize,
}
583
584impl CategoricalDistribution {
585 pub fn from_counts(counts: BTreeMap<String, usize>) -> Self {
587 let n: usize = counts.values().sum();
588 if n == 0 {
589 return Self::default();
590 }
591 let probabilities = counts
592 .into_iter()
593 .map(|(k, v)| (k, v as f64 / n as f64))
594 .collect();
595 Self { probabilities, n }
596 }
597
598 pub fn sample<R: rand::Rng>(&self, rng: &mut R) -> Option<String> {
602 if self.probabilities.is_empty() {
603 return None;
604 }
605 let total: f64 = self.probabilities.values().sum();
606 if total <= 0.0 {
607 return None;
608 }
609 let r: f64 = rng.random_range(0.0..1.0);
610 let mut cum = 0.0;
611 for (value, &p) in &self.probabilities {
612 cum += p / total;
613 if r <= cum {
614 return Some(value.clone());
615 }
616 }
617 self.probabilities.keys().next_back().cloned()
618 }
619}
620
/// One cluster of entities.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct EntityCluster {
    /// Identifiers of the entities in this cluster.
    pub members: Vec<String>,
    /// Average Jaccard similarity
    /// (presumably across member pairs — TODO confirm with the producer).
    pub avg_jaccard: f64,
}
626
/// Chart-of-accounts metadata keyed by account.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct CoaSemanticPrior {
    /// Descriptive metadata per account key.
    pub accounts: std::collections::BTreeMap<String, AccountSemantic>,
}
640
/// Descriptive metadata for one account.
///
/// Every `Option` field is omitted from the serialized form when `None` and
/// defaults to `None` when missing on input.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct AccountSemantic {
    /// Account description text.
    pub description: String,
    /// Optional account type.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_type: Option<String>,
    /// Optional account class.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class: Option<String>,
    /// Optional name of the account class.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_class_name: Option<String>,
    /// Optional account sub-class.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class: Option<String>,
    /// Optional name of the account sub-class.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub account_sub_class_name: Option<String>,
    /// Optional parent account.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub parent_account: Option<String>,
}
665
/// Reference templates keyed by source code.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ReferenceFormatPrior {
    /// Templates observed for each source code.
    pub by_source: BTreeMap<String, Vec<ReferenceTemplate>>,
}
684
/// One reference template together with its probability.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct ReferenceTemplate {
    /// Template string.
    /// NOTE(review): the placeholder syntax is not visible in this file — confirm.
    pub template: String,
    /// Probability of this template
    /// (presumably within its source — TODO confirm).
    pub probability: f64,
    /// Example reference matching the template.
    pub example: String,
}
698
/// Per-user behavior profiles.
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct UserPersonaPrior {
    /// Behavior profile per user id.
    pub users: BTreeMap<String, UserBehavior>,
    /// Histogram of user counts; not read by the sampling code in this file.
    pub user_count_distribution: LineCountHistogram,
}
724
725impl UserPersonaPrior {
726 pub fn has_data(&self) -> bool {
729 self.users
730 .values()
731 .any(|u| !u.source_mix.is_empty() && u.volume_share > 0.0)
732 }
733
734 pub fn sample_user_for_source<R: rand::Rng>(
740 &self,
741 source: &str,
742 rng: &mut R,
743 ) -> Option<String> {
744 use rand::RngExt;
745 if self.users.is_empty() {
746 return None;
747 }
748 let weights: Vec<(&String, f64)> = self
749 .users
750 .iter()
751 .filter_map(|(uid, beh)| {
752 let mix = beh.source_mix.get(source).copied().unwrap_or(0.0);
753 let w = mix * beh.volume_share;
754 if w > 0.0 {
755 Some((uid, w))
756 } else {
757 None
758 }
759 })
760 .collect();
761 if weights.is_empty() {
762 return None;
763 }
764 let total: f64 = weights.iter().map(|(_, w)| *w).sum();
765 if total <= 0.0 {
766 return None;
767 }
768 let r: f64 = rng.random_range(0.0..total);
769 let mut cum = 0.0;
770 for (uid, w) in &weights {
771 cum += w;
772 if r <= cum {
773 return Some((*uid).clone());
774 }
775 }
776 weights.last().map(|(uid, _)| (*uid).clone())
777 }
778
779 pub fn sample_timestamp_for_user<R: rand::Rng>(
784 &self,
785 user_id: &str,
786 rng: &mut R,
787 ) -> Option<(u32, u32)> {
788 use rand::RngExt;
789 let beh = self.users.get(user_id)?;
790
791 let hour_total: f64 = beh.hourly_density.iter().sum();
792 if hour_total <= 0.0 {
793 return None;
794 }
795 let r: f64 = rng.random_range(0.0..hour_total);
796 let mut cum = 0.0;
797 let mut hour = 0u32;
798 for (h, &p) in beh.hourly_density.iter().enumerate() {
799 cum += p;
800 if r <= cum {
801 hour = h as u32;
802 break;
803 }
804 }
805
806 let weekday_total: f64 = beh.weekday_density.iter().sum();
807 if weekday_total <= 0.0 {
808 return None;
809 }
810 let r: f64 = rng.random_range(0.0..weekday_total);
811 let mut cum = 0.0;
812 let mut weekday = 0u32;
813 for (d, &p) in beh.weekday_density.iter().enumerate() {
814 cum += p;
815 if r <= cum {
816 weekday = d as u32;
817 break;
818 }
819 }
820
821 Some((hour, weekday))
822 }
823}
824
825#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
827pub struct UserBehavior {
828 pub source_mix: BTreeMap<String, f64>,
831 pub hourly_density: [f64; 24],
834 pub weekday_density: [f64; 7],
836 pub volume_share: f64,
838}
839
840impl Default for UserBehavior {
841 fn default() -> Self {
842 Self {
843 source_mix: BTreeMap::new(),
844 hourly_density: [0.0; 24],
845 weekday_density: [0.0; 7],
846 volume_share: 0.0,
847 }
848 }
849}
850
/// Histogram over `u32` values with explicit bucket edges.
///
/// Despite the name it is reused for several quantities in this file
/// (line counts, lifetimes, fan-out, segment statistics).
#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
pub struct LineCountHistogram {
    /// Bucket lower edges in ascending order; a value belongs to the last
    /// bucket whose edge is less than or equal to it.
    pub buckets: Vec<u32>,
    /// Probability per bucket, parallel to `buckets`.
    pub probabilities: Vec<f64>,
    /// Number of input values; `build` counts dropped values here as well.
    pub n: usize,
}
862
863impl LineCountHistogram {
864 pub fn build(values: &[u32], buckets: &[u32]) -> (Self, usize) {
870 assert!(!buckets.is_empty(), "buckets must not be empty");
871 let n_buckets = buckets.len();
872 let mut counts = vec![0u64; n_buckets];
873 let mut dropped = 0usize;
874 for &v in values {
875 if v < buckets[0] {
876 dropped += 1;
877 continue;
878 }
879 let bucket_idx = bucket_index(buckets, v);
880 counts[bucket_idx] += 1;
881 }
882 let total: u64 = counts.iter().sum();
883 let probabilities = if total == 0 {
884 vec![0.0; n_buckets]
885 } else {
886 counts.iter().map(|&c| c as f64 / total as f64).collect()
887 };
888 (
889 Self {
890 buckets: buckets.to_vec(),
891 probabilities,
892 n: values.len(),
893 },
894 dropped,
895 )
896 }
897
898 pub fn pool(&self, other: &Self) -> Option<Self> {
901 if self.buckets != other.buckets {
902 return None;
903 }
904 let total_n = self.n + other.n;
905 if total_n == 0 {
906 return Some(Self {
907 buckets: self.buckets.clone(),
908 probabilities: vec![0.0; self.buckets.len()],
909 n: 0,
910 });
911 }
912 let probabilities: Vec<f64> = self
913 .probabilities
914 .iter()
915 .zip(other.probabilities.iter())
916 .map(|(&pa, &pb)| (pa * self.n as f64 + pb * other.n as f64) / total_n as f64)
917 .collect();
918 Some(Self {
919 buckets: self.buckets.clone(),
920 probabilities,
921 n: total_n,
922 })
923 }
924
925 pub fn median_bucket(&self) -> u32 {
927 let mut cum = 0.0;
928 for (i, &p) in self.probabilities.iter().enumerate() {
929 cum += p;
930 if cum >= 0.5 {
931 return self.buckets[i];
932 }
933 }
934 *self.buckets.last().unwrap_or(&0)
935 }
936
937 pub fn sample_bucket<R: rand::Rng>(&self, rng: &mut R) -> u32 {
942 if self.buckets.is_empty() {
943 return 0;
944 }
945 let r: f64 = rng.random_range(0.0..1.0);
946 let mut cum = 0.0;
947 let mut chosen_idx = self.buckets.len() - 1;
948 for (i, &p) in self.probabilities.iter().enumerate() {
949 cum += p;
950 if r <= cum {
951 chosen_idx = i;
952 break;
953 }
954 }
955 let lo = self.buckets[chosen_idx];
956 let hi = self.buckets.get(chosen_idx + 1).copied().unwrap_or(lo);
957 if hi <= lo {
958 lo
959 } else {
960 rng.random_range(lo..hi)
961 }
962 }
963}
964
/// Returns the index of the bucket that `v` falls into.
///
/// `buckets` must be sorted ascending. An exact match on an edge yields that
/// edge's index; otherwise the bucket just below the insertion point is used,
/// clamped to index 0 for values below the first edge.
fn bucket_index(buckets: &[u32], v: u32) -> usize {
    buckets
        .binary_search(&v)
        .unwrap_or_else(|insert_at| insert_at.saturating_sub(1))
}
971
/// Ascending bucket edges for line-count histograms; used with
/// `LineCountHistogram::build` in this file's tests.
pub const LINE_COUNT_BUCKETS: &[u32] = &[1, 2, 3, 4, 5, 6, 8, 10, 16, 32, 64, 128, 256, 1024];
974
/// Ascending bucket edges for active-lifetime histograms; the name indicates
/// the unit is days. Starts at 0, so no value is below the first edge.
pub const ACTIVE_LIFETIME_DAY_BUCKETS: &[u32] = &[0, 1, 7, 30, 90, 180, 365, 730, 1825];
977
/// Ascending bucket edges for fan-out histograms.
pub const FANOUT_BUCKETS: &[u32] = &[1, 2, 3, 5, 8, 16, 32, 64, 128, 256, 1024];
980
981#[cfg(test)]
982mod tests {
983 use super::*;
984 use rand::SeedableRng;
985 use rand_chacha::ChaCha8Rng;
986
    /// The built-in default must carry many codes, have Shannon entropy above
    /// 3.0 (natural log, over normalized weights), contain both a head and a
    /// tail code, and only ever sample codes that exist in the map.
    #[test]
    fn sap_default_has_broad_long_tail() {
        let m = SourceMixPrior::sap_default();
        let p = &m.probabilities;
        assert!(
            p.len() >= 300,
            "Lever-2 default should carry a long tail, got {} codes",
            p.len()
        );
        let total: f64 = p.values().sum();
        // Entropy of the normalized weights; zero-weight entries contribute nothing.
        let ent: f64 = -p
            .values()
            .map(|&w| {
                let q = w / total;
                if q > 0.0 {
                    q * q.ln()
                } else {
                    0.0
                }
            })
            .sum::<f64>();
        assert!(
            ent > 3.0,
            "Lever-2 default entropy should exceed 3.0, got {ent:.3}"
        );
        assert!(p.contains_key("RV"), "standard head code present");
        assert!(p.contains_key("Z001"), "synthetic tail code present");
        // Fixed seed keeps the sampled sequence reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(1);
        for _ in 0..50 {
            assert!(p.contains_key(&m.sample(&mut rng)));
        }
    }
1021
1022 #[test]
1023 fn line_count_histogram_build_basic() {
1024 let values = vec![1, 1, 2, 3, 5, 5, 5, 32, 200];
1025 let (hist, dropped) = LineCountHistogram::build(&values, LINE_COUNT_BUCKETS);
1026 assert_eq!(dropped, 0);
1027 assert_eq!(hist.n, 9);
1028 assert!((hist.probabilities.iter().sum::<f64>() - 1.0).abs() < 1e-9);
1029 }
1030
1031 #[test]
1032 fn line_count_histogram_drops_below_min() {
1033 let values = vec![0, 0, 1, 2];
1034 let (hist, dropped) = LineCountHistogram::build(&values, &[1, 2, 4]);
1035 assert_eq!(dropped, 2);
1036 assert_eq!(hist.n, 4);
1037 assert!((hist.probabilities[0] - 0.5).abs() < 1e-9);
1038 }
1039
    /// With all mass on the bucket starting at 4, every draw must land in [4, 8).
    #[test]
    fn sample_bucket_respects_probabilities() {
        let h = LineCountHistogram {
            buckets: vec![1, 2, 4, 8],
            probabilities: vec![0.0, 0.0, 1.0, 0.0],
            n: 100,
        };
        // Fixed seed keeps the sampled sequence reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        for _ in 0..50 {
            let s = h.sample_bucket(&mut rng);
            assert!((4..8).contains(&s), "expected sample in [4,8), got {s}");
        }
    }
1053
1054 #[test]
1055 fn empirical_cdf_from_sorted_values() {
1056 let cdf = EmpiricalCdf::from_sorted_values("test", vec![1.0, 2.0, 3.0]);
1057 assert_eq!(cdf.values.len(), 3);
1058 assert!((cdf.probabilities[2] - 1.0).abs() < 1e-9);
1059 }
1060
1061 #[test]
1062 fn active_segments_prior_default_round_trips() {
1063 let p = ActiveSegmentsPrior::default();
1064 let json = serde_json::to_string(&p).expect("serialize");
1065 let back: ActiveSegmentsPrior = serde_json::from_str(&json).expect("deserialize");
1066 assert!(back.by_source.is_empty());
1067 }
1068
1069 #[test]
1070 fn behavioral_priors_active_segments_optional_round_trip() {
1071 let bp = BehavioralPriors {
1074 schema_version: BehavioralPriors::SCHEMA_VERSION,
1075 generator_version: "test".to_string(),
1076 industry: "test".to_string(),
1077 n_client_inputs: 0,
1078 n_rows_aggregated: 0,
1079 source_mix: SourceMixPrior::default(),
1080 per_source_iet: PerSourceIetPrior::default(),
1081 lines_per_je: LinesPerJePrior::default(),
1082 active_lifetime: ActiveLifetimePrior::default(),
1083 fanout: FanoutPrior::default(),
1084 posting_lag: None,
1085 active_segments: Some(ActiveSegmentsPrior::default()),
1086 entity_clusters: None,
1087 per_source_attribute: None,
1088 tp_entity_clusters: None,
1089 coa_semantic: None,
1090 reference_formats: None,
1091 text_taxonomy: None,
1092 user_personas: None,
1093 source_amount_conditionals: None,
1094 source_role_gl_conditionals: None,
1095 tb_anchor: None,
1096 };
1097 let json = serde_json::to_string(&bp).expect("serialize");
1098 let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1099 assert!(back.active_segments.is_some());
1100 }
1101
    /// A document that lacks the optional keys must still parse, with the
    /// missing optional fields coming back as `None`.
    #[test]
    fn behavioral_priors_legacy_round_trips_without_active_segments() {
        // Only the ten required keys are present.
        let legacy = r#"{
 "schema_version": 1,
 "generator_version": "5.12.0",
 "industry": "health",
 "n_client_inputs": 1,
 "n_rows_aggregated": 100,
 "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
 "per_source_iet": {"by_source": {}},
 "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
 "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
 "fanout": {"by_attribute": {}}
 }"#;
        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
        assert!(bp.active_segments.is_none());
        assert!(bp.posting_lag.is_none());
    }
1121
1122 #[test]
1123 fn entity_clusters_prior_default_round_trips() {
1124 let p = EntityClustersPrior::default();
1125 let json = serde_json::to_string(&p).expect("serialize");
1126 let back: EntityClustersPrior = serde_json::from_str(&json).expect("deserialize");
1127 assert!(back.clusters.is_empty());
1128 assert!((back.clustering_rate).abs() < 1e-9);
1129 }
1130
    /// `from_counts` must normalize counts, and 10,000 seeded draws must
    /// return "A" (weight 0.7) about 7,000 times, within ±200.
    #[test]
    fn categorical_distribution_samples_with_correct_weights() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let mut counts = BTreeMap::new();
        counts.insert("A".to_string(), 700);
        counts.insert("B".to_string(), 200);
        counts.insert("C".to_string(), 100);
        let dist = CategoricalDistribution::from_counts(counts);

        assert_eq!(dist.n, 1000);
        assert!((dist.probabilities["A"] - 0.7).abs() < 1e-9);

        // Fixed seed keeps the tally reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut buckets = BTreeMap::new();
        for _ in 0..10_000 {
            let v = dist.sample(&mut rng).expect("non-empty");
            *buckets.entry(v).or_insert(0) += 1;
        }
        let a_count = buckets.get("A").copied().unwrap_or(0);
        assert!(
            (a_count as i64 - 7000).abs() < 200,
            "got {} A samples",
            a_count
        );
    }
1159
1160 #[test]
1161 fn per_source_attribute_prior_conditional_lookup() {
1162 let mut inner = BTreeMap::new();
1163 let mut prob_map = BTreeMap::new();
1164 prob_map.insert("200001".to_string(), 0.9);
1165 prob_map.insert("200002".to_string(), 0.1);
1166 inner.insert(
1167 "gl_account".to_string(),
1168 CategoricalDistribution {
1169 probabilities: prob_map,
1170 n: 100,
1171 },
1172 );
1173 let mut by_source = BTreeMap::new();
1174 by_source.insert("KR".to_string(), inner);
1175 let prior = PerSourceAttributePrior {
1176 by_source,
1177 min_observations: 10,
1178 };
1179 assert!(prior.conditional("KR", "gl_account").is_some());
1180 assert!(prior.conditional("KR", "cost_center").is_none());
1181 assert!(prior.conditional("RV", "gl_account").is_none());
1182 }
1183
1184 #[test]
1185 fn behavioral_priors_per_source_attribute_optional_round_trip() {
1186 let bp = BehavioralPriors {
1187 schema_version: BehavioralPriors::SCHEMA_VERSION,
1188 generator_version: "test".to_string(),
1189 industry: "test".to_string(),
1190 n_client_inputs: 0,
1191 n_rows_aggregated: 0,
1192 source_mix: SourceMixPrior::default(),
1193 per_source_iet: PerSourceIetPrior::default(),
1194 lines_per_je: LinesPerJePrior::default(),
1195 active_lifetime: ActiveLifetimePrior::default(),
1196 fanout: FanoutPrior::default(),
1197 posting_lag: None,
1198 active_segments: None,
1199 entity_clusters: None,
1200 per_source_attribute: Some(PerSourceAttributePrior::default()),
1201 tp_entity_clusters: None,
1202 coa_semantic: None,
1203 reference_formats: None,
1204 text_taxonomy: None,
1205 user_personas: None,
1206 source_amount_conditionals: None,
1207 source_role_gl_conditionals: None,
1208 tb_anchor: None,
1209 };
1210 let json = serde_json::to_string(&bp).expect("serialize");
1211 let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1212 assert!(back.per_source_attribute.is_some());
1213 }
1214
1215 #[test]
1216 fn entity_clusters_prior_with_members_round_trips() {
1217 let p = EntityClustersPrior {
1218 clusters: vec![EntityCluster {
1219 members: vec!["A".into(), "B".into(), "C".into()],
1220 avg_jaccard: 0.42,
1221 }],
1222 clustering_rate: 0.75,
1223 };
1224 let json = serde_json::to_string(&p).expect("serialize");
1225 let back: EntityClustersPrior = serde_json::from_str(&json).expect("deserialize");
1226 assert_eq!(back.clusters.len(), 1);
1227 assert_eq!(back.clusters[0].members.len(), 3);
1228 assert!((back.clusters[0].avg_jaccard - 0.42).abs() < 1e-9);
1229 assert!((back.clustering_rate - 0.75).abs() < 1e-9);
1230 }
1231
    /// 1,000 seeded draws must all be positive, and the mean of their
    /// logarithms must lie within 0.15 of `mu`.
    #[test]
    fn lognormal_amount_sample_positive_values() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let params = LognormalAmount {
            mu: 4.5,
            sigma: 0.8,
            n: 1000,
            median_abs: 90.0,
        };
        // Fixed seed keeps the sample reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let samples: Vec<f64> = (0..1000).map(|_| params.sample(&mut rng)).collect();

        assert!(samples.iter().all(|&v| v > 0.0), "all samples must be > 0");

        let log_mean: f64 = samples.iter().map(|v| v.ln()).sum::<f64>() / 1000.0;
        assert!(
            (log_mean - 4.5).abs() < 0.15,
            "log-mean {log_mean:.3} should be near mu=4.5"
        );
    }
1260
    /// `sigma == 0.0` must not break sampling: every draw stays positive.
    #[test]
    fn lognormal_amount_sample_degenerate_sigma() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let params = LognormalAmount {
            mu: 3.0,
            sigma: 0.0,
            n: 5,
            median_abs: 20.0,
        };
        // Fixed seed keeps the sample reproducible.
        let mut rng = ChaCha8Rng::seed_from_u64(7);
        for _ in 0..10 {
            let v = params.sample(&mut rng);
            assert!(v > 0.0, "must be positive even with sigma=0");
        }
    }
1279
1280 #[test]
1282 fn per_source_amount_prior_round_trip() {
1283 let mut by_source = BTreeMap::new();
1284 by_source.insert(
1285 "KR".to_string(),
1286 LognormalAmount {
1287 mu: 4.5,
1288 sigma: 2.158,
1289 n: 278939,
1290 median_abs: 100.0,
1291 },
1292 );
1293 let mut by_source_and_class = BTreeMap::new();
1294 let mut inner = BTreeMap::new();
1295 inner.insert(
1296 "0041".to_string(),
1297 LognormalAmount {
1298 mu: 5.394,
1299 sigma: 1.602,
1300 n: 61726,
1301 median_abs: 209.98,
1302 },
1303 );
1304 by_source_and_class.insert("KR".to_string(), inner);
1305
1306 let prior = PerSourceAmountPrior {
1307 by_source_and_class,
1308 by_source,
1309 };
1310 let json = serde_json::to_string(&prior).expect("serialize");
1311 let back: PerSourceAmountPrior = serde_json::from_str(&json).expect("deserialize");
1312 assert_eq!(back.by_source.len(), 1);
1313 assert_eq!(back.by_source_and_class.len(), 1);
1314 assert_eq!(back.by_source["KR"].n, 278939);
1315 assert_eq!(back.by_source_and_class["KR"]["0041"].n, 61726);
1316 }
1317
1318 #[test]
1320 fn behavioral_priors_source_amount_conditionals_optional_round_trip() {
1321 let prior = PerSourceAmountPrior {
1322 by_source_and_class: BTreeMap::new(),
1323 by_source: BTreeMap::new(),
1324 };
1325 let bp = BehavioralPriors {
1326 schema_version: BehavioralPriors::SCHEMA_VERSION,
1327 generator_version: "test".to_string(),
1328 industry: "test".to_string(),
1329 n_client_inputs: 0,
1330 n_rows_aggregated: 0,
1331 source_mix: SourceMixPrior::default(),
1332 per_source_iet: PerSourceIetPrior::default(),
1333 lines_per_je: LinesPerJePrior::default(),
1334 active_lifetime: ActiveLifetimePrior::default(),
1335 fanout: FanoutPrior::default(),
1336 posting_lag: None,
1337 active_segments: None,
1338 entity_clusters: None,
1339 per_source_attribute: None,
1340 tp_entity_clusters: None,
1341 coa_semantic: None,
1342 reference_formats: None,
1343 text_taxonomy: None,
1344 user_personas: None,
1345 source_amount_conditionals: Some(prior),
1346 source_role_gl_conditionals: None,
1347 tb_anchor: None,
1348 };
1349 let json = serde_json::to_string(&bp).expect("serialize");
1350 let back: BehavioralPriors = serde_json::from_str(&json).expect("deserialize");
1351 assert!(back.source_amount_conditionals.is_some());
1352 }
1353
    /// A document without the `source_amount_conditionals` key must parse and
    /// yield `None` for that field.
    #[test]
    fn behavioral_priors_legacy_missing_source_amount_conditionals() {
        // Only the ten required keys are present.
        let legacy = r#"{
 "schema_version": 1,
 "generator_version": "5.21.0",
 "industry": "health",
 "n_client_inputs": 1,
 "n_rows_aggregated": 100,
 "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
 "per_source_iet": {"by_source": {}},
 "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
 "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
 "fanout": {"by_attribute": {}}
 }"#;
        let bp: BehavioralPriors = serde_json::from_str(legacy).expect("legacy parse");
        assert!(
            bp.source_amount_conditionals.is_none(),
            "missing field should deserialise as None"
        );
    }
1376
    /// Draws conditioned on ("KR", "DR") must only return the two accounts
    /// registered for DR, and draws conditioned on ("KR", "CR") must always
    /// return the single CR account.
    #[test]
    fn sp4_6_role_conditional_keeps_dr_in_expense_class() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;

        let mut dr_counts = BTreeMap::new();
        dr_counts.insert("6000".to_string(), 100usize);
        dr_counts.insert("6100".to_string(), 50usize);
        let mut cr_counts = BTreeMap::new();
        cr_counts.insert("2000".to_string(), 150usize);

        let mut role_map = BTreeMap::new();
        role_map.insert(
            "DR".to_string(),
            CategoricalDistribution::from_counts(dr_counts),
        );
        role_map.insert(
            "CR".to_string(),
            CategoricalDistribution::from_counts(cr_counts),
        );

        let mut by_source_and_role = BTreeMap::new();
        by_source_and_role.insert("KR".to_string(), role_map);

        let prior = PerSourceRolePrior { by_source_and_role };

        // One seeded generator is shared by both loops, so their order matters.
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        for _ in 0..100 {
            let v = prior
                .conditional("KR", "DR")
                .unwrap()
                .sample(&mut rng)
                .unwrap();
            assert!(
                v == "6000" || v == "6100",
                "DR draw must be expense account, got {v}"
            );
        }

        for _ in 0..50 {
            let v = prior
                .conditional("KR", "CR")
                .unwrap()
                .sample(&mut rng)
                .unwrap();
            assert_eq!(v, "2000", "CR draw must be AP account");
        }
    }
1434
#[test]
/// A default-constructed prior holds no `(source, role)` entries, so every
/// `conditional` lookup must come back as `None`.
fn sp4_6_role_conditional_falls_back_when_pair_missing() {
    let empty = PerSourceRolePrior::default();

    let dr_lookup = empty.conditional("KR", "DR");
    assert!(dr_lookup.is_none(), "empty prior must return None");

    let cr_lookup = empty.conditional("KR", "CR");
    assert!(
        cr_lookup.is_none(),
        "empty prior must return None for CR too"
    );
}
1448
#[test]
/// Serialising a `PerSourceRolePrior` to JSON and reading it back must keep
/// the populated `("KR", "DR")` entry and must not invent a `("KR", "CR")` one.
fn sp4_6_per_source_role_prior_json_round_trip() {
    let debit_counts = BTreeMap::from([("6000".to_string(), 200usize)]);
    let roles = BTreeMap::from([(
        "DR".to_string(),
        CategoricalDistribution::from_counts(debit_counts),
    )]);
    let original = PerSourceRolePrior {
        by_source_and_role: BTreeMap::from([("KR".to_string(), roles)]),
    };

    let encoded = serde_json::to_string(&original).expect("serialize");
    let decoded = serde_json::from_str::<PerSourceRolePrior>(&encoded).expect("deserialize");

    assert!(decoded.conditional("KR", "DR").is_some());
    assert!(decoded.conditional("KR", "CR").is_none());
}
1468
#[test]
/// A `BehavioralPriors` whose `source_role_gl_conditionals` is `Some(..)` must
/// still report `Some` after a JSON serialise/deserialise cycle.
fn behavioral_priors_source_role_gl_conditionals_optional_round_trip() {
    let empty_role_prior = PerSourceRolePrior::default();

    // Everything except `source_role_gl_conditionals` is left at its default / `None`.
    let priors = BehavioralPriors {
        schema_version: BehavioralPriors::SCHEMA_VERSION,
        generator_version: "test".to_string(),
        industry: "test".to_string(),
        n_client_inputs: 0,
        n_rows_aggregated: 0,
        source_mix: SourceMixPrior::default(),
        per_source_iet: PerSourceIetPrior::default(),
        lines_per_je: LinesPerJePrior::default(),
        active_lifetime: ActiveLifetimePrior::default(),
        fanout: FanoutPrior::default(),
        posting_lag: None,
        active_segments: None,
        entity_clusters: None,
        per_source_attribute: None,
        tp_entity_clusters: None,
        coa_semantic: None,
        reference_formats: None,
        text_taxonomy: None,
        user_personas: None,
        source_amount_conditionals: None,
        source_role_gl_conditionals: Some(empty_role_prior),
        tb_anchor: None,
    };

    let encoded = serde_json::to_string(&priors).expect("serialize");
    let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");
    assert!(decoded.source_role_gl_conditionals.is_some());
}
1501
#[test]
/// JSON that has no `source_role_gl_conditionals` key must still parse, with
/// the field coming back as `None`.
fn behavioral_priors_legacy_missing_source_role_gl_conditionals() {
    // Payload deliberately lists only the ten keys below; every other field is absent.
    let legacy_json = r#"{
        "schema_version": 1,
        "generator_version": "5.21.0",
        "industry": "health",
        "n_client_inputs": 1,
        "n_rows_aggregated": 100,
        "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
        "per_source_iet": {"by_source": {}},
        "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
        "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
        "fanout": {"by_attribute": {}}
    }"#;

    let parsed = serde_json::from_str::<BehavioralPriors>(legacy_json).expect("legacy parse");
    assert!(
        parsed.source_role_gl_conditionals.is_none(),
        "missing field should deserialise as None"
    );
}
1524
#[test]
/// A `TbAnchorPrior` with two per-account targets must survive a JSON round
/// trip: same number of accounts, same closing balance for account `1000`,
/// same `total_assets`, and same `n_clients`.
fn tb_anchor_prior_json_round_trip() {
    // One account with positive balances, one with negative balances.
    let per_account = BTreeMap::from([
        (
            "1000".to_string(),
            TbTarget {
                opening_balance: 100_000.0,
                closing_balance: 120_000.0,
                period_net_activity: 20_000.0,
                opening_stdev: 5_000.0,
                closing_stdev: 6_000.0,
                n_clients: 3,
            },
        ),
        (
            "2000".to_string(),
            TbTarget {
                opening_balance: -50_000.0,
                closing_balance: -60_000.0,
                period_net_activity: -10_000.0,
                opening_stdev: 2_000.0,
                closing_stdev: 3_000.0,
                n_clients: 3,
            },
        ),
    ]);

    let original = TbAnchorPrior {
        per_account,
        total_assets: 300_000.0,
        total_liabilities: 120_000.0,
        total_equity: 180_000.0,
        n_clients: 3,
    };

    let encoded = serde_json::to_string(&original).expect("serialize");
    let back = serde_json::from_str::<TbAnchorPrior>(&encoded).expect("deserialize");

    assert_eq!(back.per_account.len(), 2);
    // Float fields are compared with a small tolerance rather than exact equality.
    assert!((back.per_account["1000"].closing_balance - 120_000.0).abs() < 1e-6);
    assert!((back.total_assets - 300_000.0).abs() < 1e-6);
    assert_eq!(back.n_clients, 3);
}
1567
#[test]
/// `has_data` must be `false` on a default `TbAnchorPrior` and `true` once an
/// account with a closing balance of `1.0` has been inserted.
fn tb_anchor_prior_has_data() {
    let mut anchor = TbAnchorPrior::default();
    assert!(!anchor.has_data(), "empty prior must report no data");

    // Only the closing balance is set; every other target field stays at its default.
    let target = TbTarget {
        closing_balance: 1.0,
        ..Default::default()
    };
    anchor.per_account.insert("1000".to_string(), target);

    assert!(
        anchor.has_data(),
        "non-zero closing balance must report has_data"
    );
}
1586
#[test]
/// A `BehavioralPriors` carrying `tb_anchor: Some(..)` must still hold the
/// anchor, with its single account and that account's closing balance, after
/// a JSON serialise/deserialise cycle.
fn behavioral_priors_tb_anchor_optional_round_trip() {
    let per_account = BTreeMap::from([(
        "1000".to_string(),
        TbTarget {
            opening_balance: 50_000.0,
            closing_balance: 55_000.0,
            period_net_activity: 5_000.0,
            opening_stdev: 1_000.0,
            closing_stdev: 1_200.0,
            n_clients: 2,
        },
    )]);
    let anchor_prior = TbAnchorPrior {
        per_account,
        total_assets: 55_000.0,
        total_liabilities: 0.0,
        total_equity: 55_000.0,
        n_clients: 2,
    };

    // Everything except `tb_anchor` is left at its default / `None`.
    let priors = BehavioralPriors {
        schema_version: BehavioralPriors::SCHEMA_VERSION,
        generator_version: "test".to_string(),
        industry: "test".to_string(),
        n_client_inputs: 0,
        n_rows_aggregated: 0,
        source_mix: SourceMixPrior::default(),
        per_source_iet: PerSourceIetPrior::default(),
        lines_per_je: LinesPerJePrior::default(),
        active_lifetime: ActiveLifetimePrior::default(),
        fanout: FanoutPrior::default(),
        posting_lag: None,
        active_segments: None,
        entity_clusters: None,
        per_source_attribute: None,
        tp_entity_clusters: None,
        coa_semantic: None,
        reference_formats: None,
        text_taxonomy: None,
        user_personas: None,
        source_amount_conditionals: None,
        source_role_gl_conditionals: None,
        tb_anchor: Some(anchor_prior),
    };

    let encoded = serde_json::to_string(&priors).expect("serialize");
    let decoded = serde_json::from_str::<BehavioralPriors>(&encoded).expect("deserialize");

    let anchor = decoded.tb_anchor.expect("tb_anchor must be Some");
    assert_eq!(anchor.per_account.len(), 1);
    assert!((anchor.per_account["1000"].closing_balance - 55_000.0).abs() < 1e-6);
}
1639
#[test]
/// JSON that has no `tb_anchor` key must still parse, with the field coming
/// back as `None`.
fn behavioral_priors_legacy_missing_tb_anchor() {
    // Payload deliberately lists only the ten keys below; every other field is absent.
    let legacy_json = r#"{
        "schema_version": 1,
        "generator_version": "5.22.0",
        "industry": "health",
        "n_client_inputs": 1,
        "n_rows_aggregated": 100,
        "source_mix": {"probabilities": {}, "other_fraction": 0.0, "min_threshold": 0.005},
        "per_source_iet": {"by_source": {}},
        "lines_per_je": {"overall": {"buckets": [], "probabilities": [], "n": 0}, "by_source": {}, "min_jes_per_source": 500},
        "active_lifetime": {"by_source": {}, "overall": {"buckets": [], "probabilities": [], "n": 0}},
        "fanout": {"by_attribute": {}}
    }"#;

    let parsed = serde_json::from_str::<BehavioralPriors>(legacy_json).expect("legacy parse");
    assert!(
        parsed.tb_anchor.is_none(),
        "missing tb_anchor field should deserialise as None"
    );
}
1661}