use std::collections::HashMap;
use serde::{Deserialize, Serialize};
/// Per-column statistics grouped by column kind.
///
/// The `add_numeric`, `add_categorical` and `add_temporal` helpers insert entries under
/// `"<table>.<column>"` keys. The maps are public, so other key shapes are not ruled out here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticsFingerprint {
/// Statistics for numeric columns, keyed by column identifier.
pub numeric_columns: HashMap<String, NumericStats>,
/// Statistics for categorical columns, keyed by column identifier.
pub categorical_columns: HashMap<String, CategoricalStats>,
/// Statistics for temporal columns, keyed by column identifier.
pub temporal_columns: HashMap<String, TemporalStats>,
/// Optional Benford analysis; the field is left out of serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub benford_analysis: Option<BenfordStats>,
}
impl StatisticsFingerprint {
pub fn new() -> Self {
Self {
numeric_columns: HashMap::new(),
categorical_columns: HashMap::new(),
temporal_columns: HashMap::new(),
benford_analysis: None,
}
}
pub fn add_numeric(&mut self, table: &str, column: &str, stats: NumericStats) {
let key = format!("{}.{}", table, column);
self.numeric_columns.insert(key, stats);
}
pub fn add_categorical(&mut self, table: &str, column: &str, stats: CategoricalStats) {
let key = format!("{}.{}", table, column);
self.categorical_columns.insert(key, stats);
}
pub fn add_temporal(&mut self, table: &str, column: &str, stats: TemporalStats) {
let key = format!("{}.{}", table, column);
self.temporal_columns.insert(key, stats);
}
}
impl Default for StatisticsFingerprint {
fn default() -> Self {
Self::new()
}
}
/// Summary statistics for one numeric column.
///
/// NOTE(review): field meanings below are read off the field names; how each value is
/// computed is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NumericStats {
/// Number of values summarized (presumably non-null observations — confirm with the producer).
pub count: u64,
/// Smallest value.
pub min: f64,
/// Largest value.
pub max: f64,
/// Arithmetic mean.
pub mean: f64,
/// Standard deviation (population vs. sample is not stated here — TODO confirm).
pub std_dev: f64,
/// Percentile summary; `NumericStats::new` starts it at `Percentiles::default()`.
pub percentiles: Percentiles,
/// Distribution family tag; `NumericStats::new` starts it at `DistributionType::Unknown`.
pub distribution: DistributionType,
/// Parameters accompanying `distribution`; `NumericStats::new` starts with none set.
pub distribution_params: DistributionParams,
/// Presumably the fraction of values equal to zero — TODO confirm; `new` starts it at 0.0.
pub zero_rate: f64,
/// Presumably the fraction of values below zero — TODO confirm; `new` starts it at 0.0.
pub negative_rate: f64,
/// Optional nine first-digit frequencies. `follows_benford` compares element 0 against
/// 0.301, so index 0 presumably holds the share of leading digit 1. Left out of serialized
/// output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub benford_first_digit: Option<[f64; 9]>,
}
impl NumericStats {
    /// Creates numeric statistics from the basic summary values.
    ///
    /// Everything not passed in starts at a neutral value: default percentiles, an `Unknown`
    /// distribution with no parameters, zero and negative rates of `0.0`, and no first-digit
    /// frequencies.
    pub fn new(count: u64, min: f64, max: f64, mean: f64, std_dev: f64) -> Self {
        Self {
            count,
            min,
            max,
            mean,
            std_dev,
            percentiles: Percentiles::default(),
            distribution: DistributionType::Unknown,
            distribution_params: DistributionParams::empty(),
            zero_rate: 0.0,
            negative_rate: 0.0,
            benford_first_digit: None,
        }
    }

    /// Reports whether the recorded first-digit frequencies are close to Benford's law.
    ///
    /// Returns `false` when `benford_first_digit` is `None`. Otherwise every one of the nine
    /// frequencies (index 0 = leading digit 1, ..., index 8 = leading digit 9) must lie within
    /// 0.05 of its expected Benford share. Any `NaN` entry makes the result `false`.
    ///
    /// This is a coarse screen, not a statistical test.
    pub fn follows_benford(&self) -> bool {
        // Expected shares log10(1 + 1/d) for d = 1..=9, rounded to three decimals.
        const EXPECTED: [f64; 9] = [0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046];
        // Maximum absolute deviation tolerated for each individual digit.
        const TOLERANCE: f64 = 0.05;

        match self.benford_first_digit {
            // Checking all digits, not just digit 1, rejects distributions whose leading-1
            // share happens to look right while the remaining mass is piled elsewhere.
            Some(observed) => observed
                .iter()
                .zip(EXPECTED.iter())
                .all(|(seen, expected)| (seen - expected).abs() < TOLERANCE),
            None => false,
        }
    }
}
/// Nine percentile slots, `p1` through `p99`.
///
/// `from_array` and `to_array` use the field order below.
///
/// NOTE(review): each field presumably holds the value at the percentile in its name; the
/// computation is not shown in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Percentiles {
pub p1: f64,
pub p5: f64,
pub p10: f64,
pub p25: f64,
pub p50: f64,
pub p75: f64,
pub p90: f64,
pub p95: f64,
pub p99: f64,
}
impl Default for Percentiles {
fn default() -> Self {
Self {
p1: 0.0,
p5: 0.0,
p10: 0.0,
p25: 0.0,
p50: 0.0,
p75: 0.0,
p90: 0.0,
p95: 0.0,
p99: 0.0,
}
}
}
impl Percentiles {
pub fn from_array(values: [f64; 9]) -> Self {
Self {
p1: values[0],
p5: values[1],
p10: values[2],
p25: values[3],
p50: values[4],
p75: values[5],
p90: values[6],
p95: values[7],
p99: values[8],
}
}
pub fn to_array(&self) -> [f64; 9] {
[
self.p1, self.p5, self.p10, self.p25, self.p50, self.p75, self.p90, self.p95, self.p99,
]
}
pub fn iqr(&self) -> f64 {
self.p75 - self.p25
}
}
/// Tag naming the distribution family associated with a set of statistics.
///
/// Serialized in snake_case, e.g. `LogNormal` becomes `"log_normal"` and `PointMass` becomes
/// `"point_mass"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DistributionType {
Normal,
LogNormal,
Gamma,
Exponential,
Pareto,
Uniform,
PointMass,
// NOTE(review): presumably paired with `DistributionParams::mixture_components` — confirm.
Mixture,
// NOTE(review): presumably paired with `DistributionParams::histogram` — confirm.
Empirical,
/// The value `NumericStats::new` assigns before any distribution has been chosen.
Unknown,
}
/// Optional parameters describing a distribution; every field is left out of serialized
/// output when it is `None`.
///
/// The meaning of `param1`/`param2` depends on the constructor used: `normal` stores
/// (mean, std_dev), `log_normal` stores (mu, sigma), `uniform` stores (min, max) and
/// `exponential` stores the rate in `param1` only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DistributionParams {
/// First distribution-specific parameter.
#[serde(skip_serializing_if = "Option::is_none")]
pub param1: Option<f64>,
/// Second distribution-specific parameter.
#[serde(skip_serializing_if = "Option::is_none")]
pub param2: Option<f64>,
/// Optional shift; none of the constructors in this file set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub shift: Option<f64>,
/// Optional scale; none of the constructors in this file set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub scale: Option<f64>,
/// Optional histogram; none of the constructors in this file set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub histogram: Option<Histogram>,
/// Optional mixture components; none of the constructors in this file set it.
#[serde(skip_serializing_if = "Option::is_none")]
pub mixture_components: Option<Vec<MixtureComponent>>,
}
impl DistributionParams {
pub fn empty() -> Self {
Self {
param1: None,
param2: None,
shift: None,
scale: None,
histogram: None,
mixture_components: None,
}
}
pub fn normal(mean: f64, std_dev: f64) -> Self {
Self {
param1: Some(mean),
param2: Some(std_dev),
..Self::empty()
}
}
pub fn log_normal(mu: f64, sigma: f64) -> Self {
Self {
param1: Some(mu),
param2: Some(sigma),
..Self::empty()
}
}
pub fn uniform(min: f64, max: f64) -> Self {
Self {
param1: Some(min),
param2: Some(max),
..Self::empty()
}
}
pub fn exponential(rate: f64) -> Self {
Self {
param1: Some(rate),
..Self::empty()
}
}
}
/// Histogram given as bin edges and per-bin weights.
///
/// NOTE(review): presumably `bin_edges` has one more entry than `bin_weights` and the weights
/// sum to 1 — neither is enforced or shown in this file; confirm with the producer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Histogram {
/// Bin boundary values.
pub bin_edges: Vec<f64>,
/// Weight associated with each bin.
pub bin_weights: Vec<f64>,
}
/// One component of a mixture: a weight, a distribution tag and that distribution's parameters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MixtureComponent {
/// Weight of this component (presumably a share of the mixture summing to 1 — TODO confirm).
pub weight: f64,
/// Distribution family of this component.
pub distribution: DistributionType,
/// Parameters for `distribution`.
pub params: DistributionParams,
}
/// Summary statistics for one categorical column.
///
/// NOTE(review): field meanings below are read off the field names; how each value is
/// computed is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoricalStats {
/// Number of values summarized.
pub count: u64,
/// Number of distinct values.
pub cardinality: u64,
/// Recorded value frequencies; `CategoricalStats::new` starts with an empty list.
pub top_values: Vec<CategoryFrequency>,
/// Presumably set when rare values were withheld from `top_values` — TODO confirm.
pub rare_values_suppressed: bool,
/// Presumably how many values were withheld — TODO confirm; `new` starts it at 0.
pub suppressed_count: u64,
/// Entropy of the value distribution (log base not stated here); `new` starts it at 0.0.
pub entropy: f64,
}
impl CategoricalStats {
pub fn new(count: u64, cardinality: u64) -> Self {
Self {
count,
cardinality,
top_values: Vec::new(),
rare_values_suppressed: false,
suppressed_count: 0,
entropy: 0.0,
}
}
}
/// A category value paired with its frequency.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CategoryFrequency {
/// The category value.
pub value: String,
/// Frequency of `value` (presumably a relative share in 0..=1 — TODO confirm).
pub frequency: f64,
/// Flag that falls back to `false` when absent from deserialized input.
/// NOTE(review): presumably marks a value replaced by a broader label — confirm.
#[serde(default)]
pub generalized: bool,
}
impl CategoryFrequency {
pub fn new(value: impl Into<String>, frequency: f64) -> Self {
Self {
value: value.into(),
frequency,
generalized: false,
}
}
}
/// Summary statistics for one temporal column.
///
/// NOTE(review): field meanings below are read off the field names; how each value is
/// computed is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TemporalStats {
/// Number of values summarized.
pub count: u64,
/// Earliest value, stored as text (the date/time format is not shown here).
pub min: String,
/// Latest value, stored as text (the date/time format is not shown here).
pub max: String,
/// Seven weights, one per weekday; `TemporalStats::new` starts each at 1/7.
/// Which weekday index 0 refers to is not shown here — TODO confirm.
pub day_of_week_distribution: [f64; 7],
/// Twelve weights, one per month; `TemporalStats::new` starts each at 1/12.
pub month_distribution: [f64; 12],
/// Optional 24 hourly weights; left out of serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub hour_distribution: Option<[f64; 24]>,
/// Presumably set when weekends behave differently from weekdays — TODO confirm.
pub weekend_effect: bool,
/// Presumably set when activity changes near month end — TODO confirm.
pub month_end_effect: bool,
/// Presumably set when activity changes near year end — TODO confirm.
pub year_end_effect: bool,
/// Seasonality strength (scale not stated here); `new` starts it at 0.0.
pub seasonality_strength: f64,
}
impl TemporalStats {
pub fn new(count: u64, min: String, max: String) -> Self {
Self {
count,
min,
max,
day_of_week_distribution: [1.0 / 7.0; 7],
month_distribution: [1.0 / 12.0; 12],
hour_distribution: None,
weekend_effect: false,
month_end_effect: false,
year_end_effect: false,
seasonality_strength: 0.0,
}
}
}
/// Result record of a Benford's-law analysis.
///
/// NOTE(review): field meanings below are read off the field names; the test procedure and
/// any thresholds behind `conforms` are not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenfordStats {
/// Number of values analysed.
pub sample_size: u64,
/// Nine observed frequencies (presumably leading digits 1 through 9, in order — confirm).
pub observed_frequencies: [f64; 9],
/// Nine expected frequencies, in the same order as `observed_frequencies`.
pub expected_frequencies: [f64; 9],
/// Presumably the mean absolute deviation between observed and expected — TODO confirm.
pub mad: f64,
/// Chi-squared statistic.
pub chi_squared: f64,
/// p-value accompanying the test statistic.
pub p_value: f64,
/// Whether the analysis judged the data to conform.
pub conforms: bool,
}
/// Rates describing how often values are round numbers.
///
/// NOTE(review): each field is presumably the fraction of values that are a multiple of the
/// named amount (or have no fractional part) — the computation is not shown in this file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RoundNumberStats {
/// Rate of values that are round hundreds.
pub round_hundred_rate: f64,
/// Rate of values that are round tens.
pub round_ten_rate: f64,
/// Rate of values that are round fives.
pub round_five_rate: f64,
/// Rate of values that are whole numbers.
pub whole_number_rate: f64,
}