use std::collections::HashMap;
use serde::{Deserialize, Serialize};
/// Collected column statistics, grouped by column kind.
///
/// Each map is keyed by a `String`; the `add_numeric`, `add_categorical` and
/// `add_temporal` helpers build that key as `"{table}.{column}"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticsFingerprint {
/// Statistics for numeric columns.
pub numeric_columns: HashMap<String, NumericStats>,
/// Statistics for categorical columns.
pub categorical_columns: HashMap<String, CategoricalStats>,
/// Statistics for temporal columns.
pub temporal_columns: HashMap<String, TemporalStats>,
/// Optional Benford analysis; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub benford_analysis: Option<BenfordStats>,
/// Per-account-class statistics. Deserializes to an empty list when the field
/// is missing and is left out of the serialized output when empty.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub account_class_stats: Vec<AccountClassStats>,
}
impl StatisticsFingerprint {
    /// Creates a fingerprint with no statistics recorded: all three column
    /// maps and the account-class list are empty, and no Benford analysis is
    /// attached.
    pub fn new() -> Self {
        let numeric_columns = HashMap::new();
        let categorical_columns = HashMap::new();
        let temporal_columns = HashMap::new();
        let account_class_stats = Vec::new();
        Self {
            numeric_columns,
            categorical_columns,
            temporal_columns,
            benford_analysis: None,
            account_class_stats,
        }
    }

    /// Stores `stats` for a numeric column under the key `"{table}.{column}"`.
    ///
    /// An entry already stored under that key is replaced.
    pub fn add_numeric(&mut self, table: &str, column: &str, stats: NumericStats) {
        self.numeric_columns
            .insert([table, column].join("."), stats);
    }

    /// Stores `stats` for a categorical column under the key
    /// `"{table}.{column}"`.
    ///
    /// An entry already stored under that key is replaced.
    pub fn add_categorical(&mut self, table: &str, column: &str, stats: CategoricalStats) {
        self.categorical_columns
            .insert([table, column].join("."), stats);
    }

    /// Stores `stats` for a temporal column under the key `"{table}.{column}"`.
    ///
    /// An entry already stored under that key is replaced.
    pub fn add_temporal(&mut self, table: &str, column: &str, stats: TemporalStats) {
        self.temporal_columns
            .insert([table, column].join("."), stats);
    }

    /// Appends `stats` to the end of the account-class list.
    pub fn add_account_class_stats(&mut self, stats: AccountClassStats) {
        self.account_class_stats.push(stats);
    }
}
impl Default for StatisticsFingerprint {
fn default() -> Self {
Self::new()
}
}
/// Summary statistics for a set of numeric values.
///
/// `NumericStats::new` takes `count`, `min`, `max`, `mean` and `std_dev` from
/// the caller and leaves every other field at a neutral default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NumericStats {
/// Number of values summarized.
/// NOTE(review): whether nulls are included is not visible here.
pub count: u64,
/// Minimum value.
pub min: f64,
/// Maximum value.
pub max: f64,
/// Mean value.
pub mean: f64,
/// Standard deviation.
/// NOTE(review): sample vs. population convention is not visible here.
pub std_dev: f64,
/// Selected percentiles (p1 through p99).
pub percentiles: Percentiles,
/// Distribution family; `NumericStats::new` sets it to `Unknown`.
pub distribution: DistributionType,
/// Parameters accompanying `distribution`; `NumericStats::new` leaves them
/// all unset.
pub distribution_params: DistributionParams,
/// Rate of zero values — presumably a fraction in `[0, 1]`; TODO confirm.
pub zero_rate: f64,
/// Rate of negative values — presumably a fraction in `[0, 1]`; TODO confirm.
pub negative_rate: f64,
/// Nine first-digit frequencies, when available. `follows_benford` compares
/// index 0 with 0.301, so index 0 presumably holds digit 1 — confirm.
/// Left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub benford_first_digit: Option<[f64; 9]>,
}
impl NumericStats {
    /// Builds statistics from the basic moments and extremes.
    ///
    /// Everything the caller does not supply starts at a neutral value:
    /// default percentiles, an `Unknown` distribution with empty parameters,
    /// zero rates of `0.0`, and no first-digit frequencies.
    pub fn new(count: u64, min: f64, max: f64, mean: f64, std_dev: f64) -> Self {
        let percentiles = Percentiles::default();
        let distribution_params = DistributionParams::empty();
        Self {
            count,
            min,
            max,
            mean,
            std_dev,
            percentiles,
            distribution: DistributionType::Unknown,
            distribution_params,
            zero_rate: 0.0,
            negative_rate: 0.0,
            benford_first_digit: None,
        }
    }

    /// Quick Benford check based on the first stored frequency only.
    ///
    /// Returns `true` when first-digit frequencies are present and the entry at
    /// index 0 lies strictly within 0.05 of 0.301. The remaining eight entries
    /// are not inspected. Returns `false` when no frequencies are stored.
    pub fn follows_benford(&self) -> bool {
        // 0.301 is log10(2) rounded: the Benford expectation for a leading 1.
        const EXPECTED_FIRST: f64 = 0.301;
        const TOLERANCE: f64 = 0.05;

        match self.benford_first_digit {
            Some([first, ..]) => (first - EXPECTED_FIRST).abs() < TOLERANCE,
            None => false,
        }
    }
}
/// Nine fixed percentile values, from the 1st to the 99th.
///
/// `from_array` / `to_array` use the same order as the fields below.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Percentiles {
/// 1st percentile.
pub p1: f64,
/// 5th percentile.
pub p5: f64,
/// 10th percentile.
pub p10: f64,
/// 25th percentile (lower bound used by `iqr`).
pub p25: f64,
/// 50th percentile.
pub p50: f64,
/// 75th percentile (upper bound used by `iqr`).
pub p75: f64,
/// 90th percentile.
pub p90: f64,
/// 95th percentile.
pub p95: f64,
/// 99th percentile.
pub p99: f64,
}
/// Every percentile defaults to `0.0`.
impl Default for Percentiles {
    fn default() -> Self {
        const ZERO: f64 = 0.0;
        Self {
            p1: ZERO,
            p5: ZERO,
            p10: ZERO,
            p25: ZERO,
            p50: ZERO,
            p75: ZERO,
            p90: ZERO,
            p95: ZERO,
            p99: ZERO,
        }
    }
}
impl Percentiles {
    /// Builds percentiles from nine values ordered
    /// `[p1, p5, p10, p25, p50, p75, p90, p95, p99]`.
    pub fn from_array(values: [f64; 9]) -> Self {
        let [p1, p5, p10, p25, p50, p75, p90, p95, p99] = values;
        Self {
            p1,
            p5,
            p10,
            p25,
            p50,
            p75,
            p90,
            p95,
            p99,
        }
    }

    /// Returns the nine percentiles in the order
    /// `[p1, p5, p10, p25, p50, p75, p90, p95, p99]`.
    pub fn to_array(&self) -> [f64; 9] {
        let &Self {
            p1,
            p5,
            p10,
            p25,
            p50,
            p75,
            p90,
            p95,
            p99,
        } = self;
        [p1, p5, p10, p25, p50, p75, p90, p95, p99]
    }

    /// Interquartile range: `p75 - p25`.
    pub fn iqr(&self) -> f64 {
        let (lower, upper) = (self.p25, self.p75);
        upper - lower
    }
}
/// Distribution family tag.
///
/// Serialized with snake_case variant names (for example `log_normal`,
/// `point_mass`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DistributionType {
/// Normal distribution.
Normal,
/// Log-normal distribution.
LogNormal,
/// Gamma distribution.
Gamma,
/// Exponential distribution.
Exponential,
/// Pareto distribution.
Pareto,
/// Uniform distribution.
Uniform,
/// Point mass.
PointMass,
/// Mixture of component distributions.
Mixture,
/// Empirical distribution — presumably described by a histogram; confirm.
Empirical,
/// No distribution assigned; the value `NumericStats::new` starts with.
Unknown,
}
/// Generic parameter bag for a distribution.
///
/// Every field is optional and is left out of the serialized output when
/// `None`. The meaning of `param1` / `param2` depends on which constructor
/// filled them in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DistributionParams {
/// First parameter: the mean for `normal`, mu for `log_normal`, the minimum
/// for `uniform`, the rate for `exponential`.
#[serde(skip_serializing_if = "Option::is_none")]
pub param1: Option<f64>,
/// Second parameter: the standard deviation for `normal`, sigma for
/// `log_normal`, the maximum for `uniform`; unset for `exponential`.
#[serde(skip_serializing_if = "Option::is_none")]
pub param2: Option<f64>,
/// Optional shift. NOTE(review): not set by any constructor in this type;
/// exact semantics to be confirmed with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub shift: Option<f64>,
/// Optional scale. NOTE(review): not set by any constructor in this type;
/// exact semantics to be confirmed with the producer.
#[serde(skip_serializing_if = "Option::is_none")]
pub scale: Option<f64>,
/// Optional histogram (bin edges and weights).
#[serde(skip_serializing_if = "Option::is_none")]
pub histogram: Option<Histogram>,
/// Optional list of mixture components.
#[serde(skip_serializing_if = "Option::is_none")]
pub mixture_components: Option<Vec<MixtureComponent>>,
}
impl DistributionParams {
    /// Returns a parameter set with every field unset.
    pub fn empty() -> Self {
        Self {
            mixture_components: None,
            histogram: None,
            scale: None,
            shift: None,
            param2: None,
            param1: None,
        }
    }

    /// Normal parameters: `param1 = mean`, `param2 = std_dev`; all else unset.
    pub fn normal(mean: f64, std_dev: f64) -> Self {
        let mut params = Self::empty();
        params.param1 = Some(mean);
        params.param2 = Some(std_dev);
        params
    }

    /// Log-normal parameters: `param1 = mu`, `param2 = sigma`; all else unset.
    pub fn log_normal(mu: f64, sigma: f64) -> Self {
        let mut params = Self::empty();
        params.param1 = Some(mu);
        params.param2 = Some(sigma);
        params
    }

    /// Uniform parameters: `param1 = min`, `param2 = max`; all else unset.
    pub fn uniform(min: f64, max: f64) -> Self {
        let mut params = Self::empty();
        params.param1 = Some(min);
        params.param2 = Some(max);
        params
    }

    /// Exponential parameters: `param1 = rate`; all else unset.
    pub fn exponential(rate: f64) -> Self {
        let mut params = Self::empty();
        params.param1 = Some(rate);
        params
    }
}
/// Histogram described by bin edges and per-bin weights.
///
/// NOTE(review): presumably `bin_edges.len() == bin_weights.len() + 1` and the
/// weights sum to 1; neither is enforced by this type — confirm with the
/// producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Histogram {
/// Bin edge values.
pub bin_edges: Vec<f64>,
/// Weight of each bin.
pub bin_weights: Vec<f64>,
}
/// One component of a mixture: a weight plus a distribution and its parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MixtureComponent {
/// Weight of this component.
/// NOTE(review): presumably weights across components sum to 1 — confirm.
pub weight: f64,
/// Distribution family of this component.
pub distribution: DistributionType,
/// Parameters accompanying `distribution`.
pub params: DistributionParams,
}
/// Summary statistics for a set of categorical values.
///
/// `CategoricalStats::new` takes `count` and `cardinality` from the caller and
/// starts with no top values, no suppression and an entropy of `0.0`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoricalStats {
/// Number of values summarized.
/// NOTE(review): whether nulls are included is not visible here.
pub count: u64,
/// Number of distinct values.
pub cardinality: u64,
/// Most frequent values with their frequencies.
/// NOTE(review): ordering and length limit are not visible here.
pub top_values: Vec<CategoryFrequency>,
/// Whether rare values were suppressed.
pub rare_values_suppressed: bool,
/// Count associated with suppression.
/// NOTE(review): unclear from here whether this counts rows or distinct
/// values — confirm.
pub suppressed_count: u64,
/// Entropy of the value distribution.
/// NOTE(review): logarithm base is not visible here.
pub entropy: f64,
}
impl CategoricalStats {
    /// Builds statistics from a value count and a cardinality.
    ///
    /// The remaining fields start empty: no top values, suppression disabled
    /// with a suppressed count of `0`, and an entropy of `0.0`.
    pub fn new(count: u64, cardinality: u64) -> Self {
        let top_values = Vec::new();
        Self {
            entropy: 0.0,
            suppressed_count: 0,
            rare_values_suppressed: false,
            top_values,
            cardinality,
            count,
        }
    }
}
/// A categorical value paired with its frequency.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoryFrequency {
/// The category value.
pub value: String,
/// Frequency of the value — presumably a fraction in `[0, 1]`; TODO confirm.
pub frequency: f64,
/// Whether the value was generalized. Deserializes to `false` when the field
/// is missing; `CategoryFrequency::new` also sets it to `false`.
#[serde(default)]
pub generalized: bool,
}
impl CategoryFrequency {
    /// Pairs a category value with its frequency.
    ///
    /// `value` is converted into an owned `String`; the `generalized` flag
    /// starts as `false`.
    pub fn new(value: impl Into<String>, frequency: f64) -> Self {
        let value: String = value.into();
        Self {
            value,
            frequency,
            generalized: false,
        }
    }
}
/// Summary statistics for a set of temporal values.
///
/// `TemporalStats::new` takes `count`, `min` and `max` from the caller, starts
/// the day-of-week and month distributions as uniform, and leaves every effect
/// flag off.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalStats {
/// Number of values summarized.
/// NOTE(review): whether nulls are included is not visible here.
pub count: u64,
/// Earliest value, stored as a string.
/// NOTE(review): the date/time format is not visible here.
pub min: String,
/// Latest value, stored as a string.
/// NOTE(review): the date/time format is not visible here.
pub max: String,
/// Seven day-of-week weights.
/// NOTE(review): which weekday index 0 denotes is not visible here.
pub day_of_week_distribution: [f64; 7],
/// Twelve month weights — presumably index 0 is January; confirm.
pub month_distribution: [f64; 12],
/// Twenty-four hourly weights, when available; left out of the serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub hour_distribution: Option<[f64; 24]>,
/// Whether a weekend effect was flagged.
pub weekend_effect: bool,
/// Whether a month-end effect was flagged.
pub month_end_effect: bool,
/// Whether a year-end effect was flagged.
pub year_end_effect: bool,
/// Seasonality strength; `0.0` by default.
/// NOTE(review): scale/range is not visible here.
pub seasonality_strength: f64,
}
impl TemporalStats {
    /// Builds statistics from a value count and the minimum/maximum strings.
    ///
    /// The day-of-week and month distributions start uniform (`1/7` and `1/12`
    /// per slot), there is no hourly distribution, all effect flags are off,
    /// and the seasonality strength is `0.0`.
    pub fn new(count: u64, min: String, max: String) -> Self {
        let uniform_day = 1.0 / 7.0;
        let uniform_month = 1.0 / 12.0;
        Self {
            count,
            min,
            max,
            day_of_week_distribution: [uniform_day; 7],
            month_distribution: [uniform_month; 12],
            hour_distribution: None,
            weekend_effect: false,
            month_end_effect: false,
            year_end_effect: false,
            seasonality_strength: 0.0,
        }
    }
}
/// Result of a Benford first-digit analysis.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenfordStats {
/// Number of values analyzed.
pub sample_size: u64,
/// Nine observed first-digit frequencies — presumably digits 1 through 9 in
/// order; confirm.
pub observed_frequencies: [f64; 9],
/// Nine expected first-digit frequencies, in the same order as
/// `observed_frequencies` (presumably; confirm).
pub expected_frequencies: [f64; 9],
/// MAD statistic — presumably the mean absolute deviation between observed
/// and expected frequencies; confirm.
pub mad: f64,
/// Chi-squared statistic.
pub chi_squared: f64,
/// P-value — presumably of the chi-squared test; confirm.
pub p_value: f64,
/// Whether the data was judged to conform.
/// NOTE(review): the decision rule is not visible here.
pub conforms: bool,
}
/// Statistics for one account class.
///
/// `AccountClassStats::new` takes the pattern and label from the caller and
/// starts with no numeric statistics, a row count of `0` and no first-digit
/// frequencies.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AccountClassStats {
/// Pattern identifying the class (the tests in this file use values such as
/// `"1XXX"`).
pub class_pattern: String,
/// Human-readable label (the tests in this file use values such as
/// `"Assets"`).
pub class_label: String,
/// Numeric statistics for the class, when available; left out of the
/// serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub numeric: Option<NumericStats>,
/// Number of rows in the class.
pub row_count: u64,
/// Nine first-digit frequencies for the class, when available; left out of
/// the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub benford_first_digit: Option<[f64; 9]>,
}
impl AccountClassStats {
    /// Builds statistics for an account class from its pattern and label.
    ///
    /// The new value carries no numeric statistics, a row count of `0` and no
    /// first-digit frequencies.
    pub fn new(class_pattern: String, class_label: String) -> Self {
        Self {
            benford_first_digit: None,
            row_count: 0,
            numeric: None,
            class_label,
            class_pattern,
        }
    }
}
/// Rates of "round" values.
///
/// NOTE(review): nothing in this file constructs or reads this type; the rates
/// are presumably fractions in `[0, 1]` — confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoundNumberStats {
/// Rate of values that are round hundreds (presumably multiples of 100).
pub round_hundred_rate: f64,
/// Rate of values that are round tens (presumably multiples of 10).
pub round_ten_rate: f64,
/// Rate of values that are round fives (presumably multiples of 5).
pub round_five_rate: f64,
/// Rate of values that are whole numbers.
pub whole_number_rate: f64,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A new `AccountClassStats` starts empty, and a populated one is stored
    /// unchanged once added to a fingerprint.
    #[test]
    fn test_account_class_stats_creation_and_add() {
        let mut fingerprint = StatisticsFingerprint::new();
        assert!(fingerprint.account_class_stats.is_empty());

        let mut assets = AccountClassStats::new("1XXX".to_string(), "Assets".to_string());
        assert_eq!(assets.class_pattern, "1XXX");
        assert_eq!(assets.class_label, "Assets");
        assert!(assets.numeric.is_none());
        assert_eq!(assets.row_count, 0);
        assert!(assets.benford_first_digit.is_none());

        assets.row_count = 42;
        assets.numeric = Some(NumericStats::new(42, 10.0, 5000.0, 1234.5, 800.0));
        assets.benford_first_digit = Some([
            0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046,
        ]);
        fingerprint.add_account_class_stats(assets);

        assert_eq!(fingerprint.account_class_stats.len(), 1);
        let stored = &fingerprint.account_class_stats[0];
        assert_eq!(stored.class_pattern, "1XXX");
        assert_eq!(stored.row_count, 42);
        assert!(stored.numeric.is_some());
        assert!(stored.benford_first_digit.is_some());
    }

    /// `AccountClassStats` survives a JSON round trip, including an absent
    /// `benford_first_digit`.
    #[test]
    fn test_account_class_stats_json_roundtrip() {
        let mut revenue = AccountClassStats::new("4XXX".to_string(), "Revenue".to_string());
        revenue.row_count = 100;
        revenue.numeric = Some(NumericStats::new(100, 500.0, 99000.0, 12345.67, 5000.0));

        let encoded = serde_json::to_string(&revenue).expect("serialize");
        let decoded: AccountClassStats = serde_json::from_str(&encoded).expect("deserialize");

        assert_eq!(decoded.class_pattern, "4XXX");
        assert_eq!(decoded.class_label, "Revenue");
        assert_eq!(decoded.row_count, 100);
        assert!(decoded.numeric.is_some());
        assert!(decoded.benford_first_digit.is_none());
    }

    /// The `account_class_stats` key is emitted only when the list is
    /// non-empty.
    #[test]
    fn test_account_class_stats_skip_serializing_when_empty() {
        let empty = StatisticsFingerprint::new();
        let empty_json = serde_json::to_string(&empty).expect("serialize");
        assert!(
            !empty_json.contains("account_class_stats"),
            "empty account_class_stats should be skipped in serialization"
        );

        let mut populated = StatisticsFingerprint::new();
        let expenses = AccountClassStats::new("5XXX".to_string(), "Expenses".to_string());
        populated.add_account_class_stats(expenses);
        let populated_json = serde_json::to_string(&populated).expect("serialize");
        assert!(
            populated_json.contains("account_class_stats"),
            "non-empty account_class_stats should appear in serialization"
        );
    }

    /// A fingerprint with two account classes round-trips through JSON with
    /// order preserved.
    #[test]
    fn test_statistics_fingerprint_json_roundtrip_with_account_class_stats() {
        let mut fingerprint = StatisticsFingerprint::new();
        let assets = AccountClassStats::new("1XXX".to_string(), "Assets".to_string());
        fingerprint.add_account_class_stats(assets);
        let revenue = AccountClassStats::new("4XXX".to_string(), "Revenue".to_string());
        fingerprint.add_account_class_stats(revenue);

        let encoded = serde_json::to_string(&fingerprint).expect("serialize");
        let decoded: StatisticsFingerprint = serde_json::from_str(&encoded).expect("deserialize");

        let classes = &decoded.account_class_stats;
        assert_eq!(classes.len(), 2);
        assert_eq!(classes[0].class_pattern, "1XXX");
        assert_eq!(classes[1].class_label, "Revenue");
    }

    /// JSON without an `account_class_stats` key still deserializes, yielding
    /// an empty list.
    #[test]
    fn test_deserialize_without_account_class_stats_field() {
        // The raw literal is kept flush-left so its bytes match the original.
        let legacy_json = r#"{
"numeric_columns": {},
"categorical_columns": {},
"temporal_columns": {}
}"#;
        let decoded: StatisticsFingerprint = serde_json::from_str(legacy_json).expect("deserialize");
        assert!(decoded.account_class_stats.is_empty());
    }
}