use regex::Regex;
use std::sync::LazyLock;
/// Anchored patterns for the textual date layouts accepted by date validation.
///
/// Every pattern checks only digit counts and the separator (`-` or `/`); none of
/// them verifies that the digits form a real calendar date. The list is built once,
/// on first access, and keeps the order of the table below.
pub(crate) static DATE_VALIDATION_REGEXES: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    // Each entry pairs a pattern with the panic message reported if that hardcoded
    // pattern ever fails to compile.
    const SPECS: [(&str, &str); 7] = [
        (
            r"^\d{4}-\d{2}-\d{2}$",
            "BUG: Invalid hardcoded regex for date validation YYYY-MM-DD",
        ),
        (
            r"^\d{2}/\d{2}/\d{4}$",
            "BUG: Invalid hardcoded regex for date validation DD/MM/YYYY",
        ),
        (
            r"^\d{2}-\d{2}-\d{4}$",
            "BUG: Invalid hardcoded regex for date validation DD-MM-YYYY",
        ),
        (
            r"^\d{4}/\d{2}/\d{2}$",
            "BUG: Invalid hardcoded regex for date validation YYYY/MM/DD",
        ),
        (
            r"^\d{1,2}/\d{1,2}/\d{4}$",
            "BUG: Invalid hardcoded regex for date validation M/D/YYYY",
        ),
        (
            r"^\d{4}-\d{1,2}-\d{1,2}$",
            "BUG: Invalid hardcoded regex for date validation YYYY-M-D",
        ),
        (
            r"^\d{1,2}-\d{1,2}-\d{4}$",
            "BUG: Invalid hardcoded regex for date validation M-D-YYYY",
        ),
    ];

    SPECS
        .iter()
        .map(|&(pattern, bug_message)| Regex::new(pattern).expect(bug_message))
        .collect()
});
/// Format labels paired with the anchored pattern that recognises each layout.
///
/// Only fixed-width layouts (two-digit day/month, four-digit year) are listed.
/// The patterns check digit counts and separators, not calendar validity. The list
/// is built once, on first access, and keeps the order of the table below.
pub(crate) static DATE_FORMAT_REGEXES: LazyLock<Vec<(&'static str, Regex)>> = LazyLock::new(|| {
    // (label, pattern, panic message if the hardcoded pattern fails to compile)
    const SPECS: [(&str, &str, &str); 4] = [
        (
            "YYYY-MM-DD",
            r"^\d{4}-\d{2}-\d{2}$",
            "BUG: Invalid hardcoded regex for format YYYY-MM-DD",
        ),
        (
            "DD/MM/YYYY",
            r"^\d{2}/\d{2}/\d{4}$",
            "BUG: Invalid hardcoded regex for format DD/MM/YYYY",
        ),
        (
            "DD-MM-YYYY",
            r"^\d{2}-\d{2}-\d{4}$",
            "BUG: Invalid hardcoded regex for format DD-MM-YYYY",
        ),
        (
            "YYYY/MM/DD",
            r"^\d{4}/\d{2}/\d{2}$",
            "BUG: Invalid hardcoded regex for format YYYY/MM/DD",
        ),
    ];

    SPECS
        .iter()
        .map(|&(label, pattern, bug_message)| (label, Regex::new(pattern).expect(bug_message)))
        .collect()
});
/// Outcome of checking a sample size against the minimum required for a metric.
///
/// Produced by `validate_sample_size`.
#[derive(Debug)]
pub struct StatisticalValidation {
/// `true` when `actual_sample_size` is at least `min_sample_size`.
pub sufficient_sample: bool,
/// Minimum number of samples required for the requested metric type.
pub min_sample_size: usize,
/// Number of samples that was actually supplied.
pub actual_sample_size: usize,
/// Set by `validate_sample_size` to `0.95` when the sample is sufficient and
/// `0.0` otherwise.
pub confidence_level: f64,
}
/// Checks whether `sample_size` meets the minimum required for `metric_type`.
///
/// Minimums: `"distribution_analysis"` needs 100 samples, `"pattern_detection"` 50,
/// `"outlier_detection"` 30, and `"general"` (or any unrecognised metric type) 10.
///
/// The returned [`StatisticalValidation`] reports the threshold, the supplied size,
/// whether the threshold was met, and a confidence level of `0.95` when it was met
/// or `0.0` when it was not.
pub fn validate_sample_size(sample_size: usize, metric_type: &str) -> StatisticalValidation {
    let required = match metric_type {
        "outlier_detection" => 30,
        "distribution_analysis" => 100,
        "pattern_detection" => 50,
        "general" => 10,
        // Unknown metric types fall back to the "general" threshold.
        _ => 10,
    };

    // Evaluate the threshold once; both the flag and the confidence derive from it.
    let meets_minimum = sample_size >= required;
    let confidence = if meets_minimum { 0.95 } else { 0.0 };

    StatisticalValidation {
        sufficient_sample: meets_minimum,
        min_sample_size: required,
        actual_sample_size: sample_size,
        confidence_level: confidence,
    }
}
/// Returns `true` when `value` matches at least one pattern in
/// `DATE_VALIDATION_REGEXES`.
///
/// This is a layout check only: it says nothing about whether the digits form a
/// real calendar date.
pub(crate) fn is_valid_date_format(value: &str) -> bool {
    for pattern in DATE_VALIDATION_REGEXES.iter() {
        if pattern.is_match(value) {
            return true;
        }
    }
    false
}
/// Heuristically decides whether a column name suggests date or time content.
///
/// The name is lower-cased and then searched for any of a fixed set of hint
/// substrings. Matching is by substring, so a hint anywhere inside the name counts
/// (for example `lifetime_value` matches because it contains `time`).
pub(crate) fn is_likely_date_column(column_name: &str) -> bool {
    const DATE_HINTS: [&str; 7] = [
        "date",
        "time",
        "created",
        "updated",
        "timestamp",
        "birth",
        "expiry",
    ];

    let lowered = column_name.to_lowercase();
    for hint in DATE_HINTS.iter() {
        if lowered.contains(*hint) {
            return true;
        }
    }
    false
}
/// Heuristically decides whether a column name suggests an identifier or key.
///
/// After lower-casing, the name qualifies when it is exactly `pk` or contains the
/// substring `id` or `key`. Matching is by substring, so names such as `width`
/// also qualify because they contain `id`.
pub(crate) fn is_likely_id_column(column_name: &str) -> bool {
    let lowered = column_name.to_lowercase();

    if lowered == "pk" {
        return true;
    }

    // A trailing `_id` suffix needs no separate check: any such name already
    // contains the substring `id`.
    ["id", "key"]
        .iter()
        .any(|&fragment| lowered.contains(fragment))
}
/// Extracts a plausible four-digit year from a date-like string.
///
/// Two fixed byte positions are tried, in this order:
///
/// 1. bytes `0..4` — year-first layouts such as `2024-01-15` or `2024/01/15`;
/// 2. bytes `6..10` — year-last layouts with two-digit day and month, such as
///    `15/01/2024` or `15-01-2024`.
///
/// A candidate is accepted only if it parses as an `i32` in `1900..=2100`.
/// Returns `None` when neither position yields such a year. Year-last layouts with
/// one-digit day or month (e.g. `1/5/2024`) are therefore not recognised.
///
/// Never panics: a position that is out of range or that would split a multi-byte
/// UTF-8 character is skipped rather than sliced.
pub(crate) fn extract_year(date_str: &str) -> Option<i32> {
    // Byte ranges where a four-digit year may sit, in priority order.
    const YEAR_SPANS: [(usize, usize); 2] = [(0, 4), (6, 10)];

    YEAR_SPANS.iter().find_map(|&(start, end)| {
        // `str::get` yields `None` for out-of-bounds or non-char-boundary ranges,
        // where direct indexing (`date_str[start..end]`) would panic.
        let year = date_str.get(start..end)?.parse::<i32>().ok()?;
        (1900..=2100).contains(&year).then_some(year)
    })
}
/// Computes a percentile of `sorted_values` by linear interpolation between the two
/// nearest ranks.
///
/// * `sorted_values` — the data; the caller is expected to supply it in ascending
///   order, which this function does not verify.
/// * `percentile` — the requested percentile on a 0–100 scale.
///
/// Returns `0.0` for an empty slice and the sole element for a one-element slice.
/// A percentile whose rank reaches or passes the last index yields the last element.
pub(crate) fn calculate_percentile(sorted_values: &[f64], percentile: f64) -> f64 {
    let (last_index, last_value) = match sorted_values {
        [] => return 0.0,
        [only] => return *only,
        [.., last] => (sorted_values.len() - 1, *last),
    };

    // Fractional rank of the requested percentile within `0..=last_index`.
    let rank = last_index as f64 * (percentile / 100.0);
    let below = rank.floor() as usize;
    if below >= last_index {
        return last_value;
    }
    let above = rank.ceil() as usize;

    let lower_value = sorted_values[below];
    let upper_value = sorted_values[above];
    let weight = rank - below as f64;
    lower_value + weight * (upper_value - lower_value)
}