use crate::dataframe::{Column, DataFrame, DataType};
use std::collections::{HashMap, HashSet};
#[derive(Debug, Clone)]
pub struct NumericProfile {
pub valid_count: usize,
pub min: f64,
pub max: f64,
pub mean: f64,
pub median: f64,
pub std_dev: f64,
pub variance: f64,
pub skewness: f64,
pub kurtosis: f64,
pub p5: f64,
pub q1: f64,
pub q3: f64,
pub p95: f64,
pub iqr: f64,
pub zero_count: usize,
pub negative_count: usize,
pub infinity_count: usize,
pub distinct_count: usize,
}
#[derive(Debug, Clone)]
pub struct BooleanProfile {
pub valid_count: usize,
pub true_count: usize,
pub false_count: usize,
pub true_ratio: f64,
}
#[derive(Debug, Clone)]
pub struct CategoricalProfile {
pub valid_count: usize,
pub distinct_count: usize,
pub top_values: Vec<(String, usize)>,
pub mode_ratio: f64,
pub is_constant: bool,
}
#[derive(Debug, Clone)]
pub struct TextProfile {
pub valid_count: usize,
pub distinct_count: usize,
pub min_length: usize,
pub max_length: usize,
pub mean_length: f64,
pub empty_count: usize,
}
#[derive(Debug, Clone)]
pub struct ColumnProfile {
pub name: String,
pub data_type: DataType,
pub row_count: usize,
pub null_count: usize,
pub missing_pct: f64,
pub numeric: Option<NumericProfile>,
pub boolean: Option<BooleanProfile>,
pub categorical: Option<CategoricalProfile>,
pub text: Option<TextProfile>,
}
pub fn profile_dataframe(df: &DataFrame) -> Vec<ColumnProfile> {
df.iter()
.map(|(name, col)| profile_column(name, col))
.collect()
}
pub fn profile_column(name: &str, col: &Column) -> ColumnProfile {
let row_count = col.len();
let null_count = col.null_count();
let missing_pct = if row_count > 0 {
(null_count as f64 / row_count as f64) * 100.0
} else {
0.0
};
let (numeric, boolean, categorical, text) = match col {
Column::Numeric { values, validity } => {
(Some(profile_numeric(values, validity)), None, None, None)
}
Column::Boolean { values, validity } => {
(None, Some(profile_boolean(values, validity)), None, None)
}
Column::Categorical {
dictionary,
indices,
validity,
} => (
None,
None,
Some(profile_categorical(dictionary, indices, validity)),
None,
),
Column::Text { values, validity } => {
(None, None, None, Some(profile_text(values, validity)))
}
};
ColumnProfile {
name: name.to_string(),
data_type: col.data_type(),
row_count,
null_count,
missing_pct,
numeric,
boolean,
categorical,
text,
}
}
fn profile_numeric(values: &[f64], validity: &crate::dataframe::ValidityBitmap) -> NumericProfile {
let valid: Vec<f64> = validity.valid_indices().map(|i| values[i]).collect();
let valid_count = valid.len();
if valid_count == 0 {
return NumericProfile {
valid_count: 0,
min: f64::NAN,
max: f64::NAN,
mean: f64::NAN,
median: f64::NAN,
std_dev: f64::NAN,
variance: f64::NAN,
skewness: f64::NAN,
kurtosis: f64::NAN,
p5: f64::NAN,
q1: f64::NAN,
q3: f64::NAN,
p95: f64::NAN,
iqr: f64::NAN,
zero_count: 0,
negative_count: 0,
infinity_count: 0,
distinct_count: 0,
};
}
let mean = u_numflow::stats::mean(&valid).unwrap_or(f64::NAN);
let variance = u_numflow::stats::variance(&valid).unwrap_or(f64::NAN);
let std_dev = u_numflow::stats::std_dev(&valid).unwrap_or(f64::NAN);
let min = u_numflow::stats::min(&valid).unwrap_or(f64::NAN);
let max = u_numflow::stats::max(&valid).unwrap_or(f64::NAN);
let median = u_numflow::stats::median(&valid).unwrap_or(f64::NAN);
let skewness = u_numflow::stats::skewness(&valid).unwrap_or(f64::NAN);
let kurtosis = u_numflow::stats::kurtosis(&valid).unwrap_or(f64::NAN);
let p5 = u_numflow::stats::quantile(&valid, 0.05).unwrap_or(f64::NAN);
let q1 = u_numflow::stats::quantile(&valid, 0.25).unwrap_or(f64::NAN);
let q3 = u_numflow::stats::quantile(&valid, 0.75).unwrap_or(f64::NAN);
let p95 = u_numflow::stats::quantile(&valid, 0.95).unwrap_or(f64::NAN);
let iqr = q3 - q1;
let zero_count = valid.iter().filter(|&&v| v == 0.0).count();
let negative_count = valid.iter().filter(|&&v| v < 0.0).count();
let infinity_count = valid.iter().filter(|&&v| v.is_infinite()).count();
let mut distinct_bits: std::collections::HashSet<u64> = std::collections::HashSet::new();
for &v in &valid {
distinct_bits.insert(v.to_bits());
}
let distinct_count = distinct_bits.len();
NumericProfile {
valid_count,
min,
max,
mean,
median,
std_dev,
variance,
skewness,
kurtosis,
p5,
q1,
q3,
p95,
iqr,
zero_count,
negative_count,
infinity_count,
distinct_count,
}
}
fn profile_boolean(values: &[bool], validity: &crate::dataframe::ValidityBitmap) -> BooleanProfile {
let mut true_count = 0usize;
let mut false_count = 0usize;
for idx in validity.valid_indices() {
if values[idx] {
true_count += 1;
} else {
false_count += 1;
}
}
let valid_count = true_count + false_count;
let true_ratio = if valid_count > 0 {
true_count as f64 / valid_count as f64
} else {
0.0
};
BooleanProfile {
valid_count,
true_count,
false_count,
true_ratio,
}
}
fn profile_categorical(
dictionary: &[String],
indices: &[u32],
validity: &crate::dataframe::ValidityBitmap,
) -> CategoricalProfile {
let mut freq: HashMap<u32, usize> = HashMap::new();
for idx in validity.valid_indices() {
*freq.entry(indices[idx]).or_insert(0) += 1;
}
let valid_count: usize = freq.values().sum();
let distinct_count = freq.len();
let mut freq_vec: Vec<(u32, usize)> = freq.into_iter().collect();
freq_vec.sort_by_key(|b| std::cmp::Reverse(b.1));
let top_n = 10.min(freq_vec.len());
let top_values: Vec<(String, usize)> = freq_vec[..top_n]
.iter()
.map(|&(dict_idx, count)| {
let label = dictionary
.get(dict_idx as usize)
.cloned()
.unwrap_or_default();
(label, count)
})
.collect();
let mode_count = freq_vec.first().map_or(0, |&(_, c)| c);
let mode_ratio = if valid_count > 0 {
mode_count as f64 / valid_count as f64
} else {
0.0
};
let is_constant = distinct_count <= 1;
CategoricalProfile {
valid_count,
distinct_count,
top_values,
mode_ratio,
is_constant,
}
}
fn profile_text(values: &[String], validity: &crate::dataframe::ValidityBitmap) -> TextProfile {
let mut distinct: std::collections::HashSet<&str> = std::collections::HashSet::new();
let mut lengths: Vec<usize> = Vec::new();
let mut empty_count = 0usize;
for idx in validity.valid_indices() {
let s = &values[idx];
distinct.insert(s.as_str());
lengths.push(s.len());
if s.is_empty() {
empty_count += 1;
}
}
let valid_count = lengths.len();
let distinct_count = distinct.len();
let min_length = lengths.iter().copied().min().unwrap_or(0);
let max_length = lengths.iter().copied().max().unwrap_or(0);
let mean_length = if valid_count > 0 {
lengths.iter().sum::<usize>() as f64 / valid_count as f64
} else {
0.0
};
TextProfile {
valid_count,
distinct_count,
min_length,
max_length,
mean_length,
empty_count,
}
}
#[derive(Debug, Clone)]
pub struct DatasetProfile {
pub row_count: usize,
pub column_count: usize,
pub memory_bytes: usize,
pub type_counts: TypeCounts,
pub total_nulls: usize,
pub sparsity_pct: f64,
pub duplicate_count: usize,
pub duplicate_pct: f64,
pub quality_score: DataQualityScore,
pub columns: Vec<ColumnProfile>,
}
#[derive(Debug, Clone)]
pub struct DataQualityScore {
pub completeness: f64,
pub uniqueness: f64,
pub validity: f64,
pub consistency: f64,
pub overall: f64,
}
#[derive(Debug, Clone, Default)]
pub struct TypeCounts {
pub numeric: usize,
pub boolean: usize,
pub categorical: usize,
pub text: usize,
}
pub fn profile_dataset(df: &DataFrame) -> DatasetProfile {
let columns = profile_dataframe(df);
let row_count = df.row_count();
let column_count = df.column_count();
let mut type_counts = TypeCounts::default();
let mut total_nulls = 0usize;
let mut memory_bytes = 0usize;
for (_, col) in df.iter() {
total_nulls += col.null_count();
match col {
Column::Numeric { values, .. } => {
type_counts.numeric += 1;
memory_bytes += values.len() * 8 + values.len().div_ceil(64) * 8;
}
Column::Boolean { values, .. } => {
type_counts.boolean += 1;
memory_bytes += values.len() + values.len().div_ceil(64) * 8;
}
Column::Categorical {
dictionary,
indices,
..
} => {
type_counts.categorical += 1;
let dict_size: usize = dictionary.iter().map(|s| s.len() + 24).sum(); memory_bytes += dict_size + indices.len() * 4 + indices.len().div_ceil(64) * 8;
}
Column::Text { values, .. } => {
type_counts.text += 1;
let text_size: usize = values.iter().map(|s| s.len() + 24).sum();
memory_bytes += text_size + values.len().div_ceil(64) * 8;
}
}
}
let total_cells = row_count * column_count;
let sparsity_pct = if total_cells > 0 {
(total_nulls as f64 / total_cells as f64) * 100.0
} else {
0.0
};
let duplicate_count = count_duplicate_rows(df);
let duplicate_pct = if row_count > 0 {
(duplicate_count as f64 / row_count as f64) * 100.0
} else {
0.0
};
let quality_score = compute_quality_score(
row_count,
column_count,
total_nulls,
duplicate_count,
&columns,
);
DatasetProfile {
row_count,
column_count,
memory_bytes,
type_counts,
total_nulls,
sparsity_pct,
duplicate_count,
duplicate_pct,
quality_score,
columns,
}
}
fn compute_quality_score(
row_count: usize,
column_count: usize,
total_nulls: usize,
duplicate_count: usize,
columns: &[ColumnProfile],
) -> DataQualityScore {
let total_cells = row_count * column_count;
let completeness = if total_cells > 0 {
1.0 - (total_nulls as f64 / total_cells as f64)
} else {
1.0
};
let uniqueness = if row_count > 0 {
1.0 - (duplicate_count as f64 / row_count as f64)
} else {
1.0
};
let thresholds = FlagThresholds::default();
let validity = if columns.is_empty() {
1.0
} else {
let flagged = columns
.iter()
.filter(|c| !compute_flags(c, &thresholds).is_empty())
.count();
1.0 - (flagged as f64 / columns.len() as f64)
};
let consistency = {
let numeric_cols: Vec<&ColumnProfile> =
columns.iter().filter(|c| c.numeric.is_some()).collect();
if numeric_cols.is_empty() {
1.0
} else {
let problematic = numeric_cols
.iter()
.filter(|c| {
let flags = compute_flags(c, &thresholds);
flags.iter().any(|f| {
matches!(
f,
DiagnosticFlag::ContainsInfinity
| DiagnosticFlag::ExtremeSkewness
| DiagnosticFlag::ExtremeKurtosis
)
})
})
.count();
1.0 - (problematic as f64 / numeric_cols.len() as f64)
}
};
let overall = completeness * 0.30 + uniqueness * 0.25 + validity * 0.25 + consistency * 0.20;
DataQualityScore {
completeness,
uniqueness,
validity,
consistency,
overall,
}
}
fn count_duplicate_rows(df: &DataFrame) -> usize {
let n = df.row_count();
if n <= 1 {
return 0;
}
let cols: Vec<(&str, &Column)> = df.iter().collect();
let mut seen = HashSet::with_capacity(n);
let mut dups = 0usize;
for row_idx in 0..n {
let key = row_key(&cols, row_idx);
if !seen.insert(key) {
dups += 1;
}
}
dups
}
fn row_key(cols: &[(&str, &Column)], row_idx: usize) -> String {
use std::fmt::Write;
let mut key = String::new();
for (i, (_, col)) in cols.iter().enumerate() {
if i > 0 {
key.push('\x1F'); }
if !col.is_valid(row_idx) {
key.push_str("\x00NULL");
continue;
}
match col {
Column::Numeric { values, .. } => {
let _ = write!(key, "{}", values[row_idx].to_bits());
}
Column::Boolean { values, .. } => {
key.push(if values[row_idx] { 'T' } else { 'F' });
}
Column::Categorical {
dictionary,
indices,
..
} => {
let idx = indices[row_idx] as usize;
if idx < dictionary.len() {
key.push_str(&dictionary[idx]);
}
}
Column::Text { values, .. } => {
key.push_str(&values[row_idx]);
}
}
}
key
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutlierMethod {
Iqr,
Zscore,
ModifiedZscore,
}
#[derive(Debug, Clone)]
pub struct OutlierResult {
pub method: OutlierMethod,
pub indices: Vec<usize>,
pub scores: Vec<f64>,
pub count: usize,
pub pct: f64,
pub lower_fence: f64,
pub upper_fence: f64,
pub center: f64,
pub spread: f64,
}
pub fn detect_outliers(col: &Column, method: OutlierMethod) -> Option<OutlierResult> {
let (values, validity) = match col {
Column::Numeric { values, validity } => (values, validity),
_ => return None,
};
let valid: Vec<(usize, f64)> = validity
.valid_indices()
.map(|i| (i, values[i]))
.filter(|(_, v)| v.is_finite()) .collect();
if valid.len() < 3 {
return Some(OutlierResult {
method,
indices: Vec::new(),
scores: Vec::new(),
count: 0,
pct: 0.0,
lower_fence: f64::NAN,
upper_fence: f64::NAN,
center: f64::NAN,
spread: f64::NAN,
});
}
let vals: Vec<f64> = valid.iter().map(|&(_, v)| v).collect();
match method {
OutlierMethod::Iqr => detect_iqr(&valid, &vals),
OutlierMethod::Zscore => detect_zscore(&valid, &vals),
OutlierMethod::ModifiedZscore => detect_modified_zscore(&valid, &vals),
}
}
pub fn detect_outliers_slice(data: &[f64], method: OutlierMethod) -> Option<OutlierResult> {
let valid: Vec<(usize, f64)> = data
.iter()
.copied()
.enumerate()
.filter(|(_, v)| v.is_finite())
.collect();
if valid.len() < 3 {
return Some(OutlierResult {
method,
indices: Vec::new(),
scores: Vec::new(),
count: 0,
pct: 0.0,
lower_fence: f64::NAN,
upper_fence: f64::NAN,
center: f64::NAN,
spread: f64::NAN,
});
}
let vals: Vec<f64> = valid.iter().map(|&(_, v)| v).collect();
match method {
OutlierMethod::Iqr => detect_iqr(&valid, &vals),
OutlierMethod::Zscore => detect_zscore(&valid, &vals),
OutlierMethod::ModifiedZscore => detect_modified_zscore(&valid, &vals),
}
}
fn detect_iqr(valid: &[(usize, f64)], vals: &[f64]) -> Option<OutlierResult> {
let q1 = u_numflow::stats::quantile(vals, 0.25)?;
let q3 = u_numflow::stats::quantile(vals, 0.75)?;
let median = u_numflow::stats::median(vals)?;
let iqr = q3 - q1;
let k = 1.5;
let lower = q1 - k * iqr;
let upper = q3 + k * iqr;
let mut indices = Vec::new();
let mut scores = Vec::new();
for &(idx, v) in valid {
if v < lower || v > upper {
let score = if v < lower {
(lower - v) / iqr.max(1e-15)
} else {
(v - upper) / iqr.max(1e-15)
};
indices.push(idx);
scores.push(score);
}
}
let count = indices.len();
let pct = if !valid.is_empty() {
(count as f64 / valid.len() as f64) * 100.0
} else {
0.0
};
Some(OutlierResult {
method: OutlierMethod::Iqr,
indices,
scores,
count,
pct,
lower_fence: lower,
upper_fence: upper,
center: median,
spread: iqr,
})
}
fn detect_zscore(valid: &[(usize, f64)], vals: &[f64]) -> Option<OutlierResult> {
let mean = u_numflow::stats::mean(vals)?;
let std = u_numflow::stats::std_dev(vals)?;
let threshold = 3.0;
if std < 1e-15 {
return Some(OutlierResult {
method: OutlierMethod::Zscore,
indices: Vec::new(),
scores: Vec::new(),
count: 0,
pct: 0.0,
lower_fence: f64::NAN,
upper_fence: f64::NAN,
center: mean,
spread: 0.0,
});
}
let mut indices = Vec::new();
let mut scores = Vec::new();
for &(idx, v) in valid {
let z = ((v - mean) / std).abs();
if z > threshold {
indices.push(idx);
scores.push(z);
}
}
let count = indices.len();
let pct = if !valid.is_empty() {
(count as f64 / valid.len() as f64) * 100.0
} else {
0.0
};
Some(OutlierResult {
method: OutlierMethod::Zscore,
indices,
scores,
count,
pct,
lower_fence: mean - threshold * std,
upper_fence: mean + threshold * std,
center: mean,
spread: std,
})
}
fn detect_modified_zscore(valid: &[(usize, f64)], vals: &[f64]) -> Option<OutlierResult> {
let median = u_numflow::stats::median(vals)?;
let threshold = 3.5;
let mut abs_devs: Vec<f64> = vals.iter().map(|&v| (v - median).abs()).collect();
abs_devs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let mad = u_numflow::stats::median(&abs_devs)?;
if mad < 1e-15 {
return Some(OutlierResult {
method: OutlierMethod::ModifiedZscore,
indices: Vec::new(),
scores: Vec::new(),
count: 0,
pct: 0.0,
lower_fence: f64::NAN,
upper_fence: f64::NAN,
center: median,
spread: 0.0,
});
}
let factor = 0.6745;
let mut indices = Vec::new();
let mut scores = Vec::new();
for &(idx, v) in valid {
let m = (factor * (v - median) / mad).abs();
if m > threshold {
indices.push(idx);
scores.push(m);
}
}
let count = indices.len();
let pct = if !valid.is_empty() {
(count as f64 / valid.len() as f64) * 100.0
} else {
0.0
};
let scale = mad / factor;
Some(OutlierResult {
method: OutlierMethod::ModifiedZscore,
indices,
scores,
count,
pct,
lower_fence: median - threshold * scale,
upper_fence: median + threshold * scale,
center: median,
spread: mad,
})
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DiagnosticFlag {
ConstantColumn,
ZeroVariance,
HighMissing,
ExtremeSkewness,
ExtremeKurtosis,
HighCardinality,
AllUnique,
Imbalanced,
ContainsInfinity,
}
impl std::fmt::Display for DiagnosticFlag {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::ConstantColumn => write!(f, "CONSTANT_COLUMN"),
Self::ZeroVariance => write!(f, "ZERO_VARIANCE"),
Self::HighMissing => write!(f, "HIGH_MISSING"),
Self::ExtremeSkewness => write!(f, "EXTREME_SKEWNESS"),
Self::ExtremeKurtosis => write!(f, "EXTREME_KURTOSIS"),
Self::HighCardinality => write!(f, "HIGH_CARDINALITY"),
Self::AllUnique => write!(f, "ALL_UNIQUE"),
Self::Imbalanced => write!(f, "IMBALANCED"),
Self::ContainsInfinity => write!(f, "CONTAINS_INFINITY"),
}
}
}
#[derive(Debug, Clone)]
pub struct FlagThresholds {
pub high_missing_pct: f64,
pub zero_variance_threshold: f64,
pub extreme_skewness: f64,
pub extreme_kurtosis: f64,
pub high_cardinality: usize,
pub imbalance_ratio: f64,
}
impl Default for FlagThresholds {
fn default() -> Self {
Self {
high_missing_pct: 50.0,
zero_variance_threshold: 1e-10,
extreme_skewness: 10.0,
extreme_kurtosis: 100.0,
high_cardinality: 100,
imbalance_ratio: 0.9,
}
}
}
pub fn compute_flags(profile: &ColumnProfile, thresholds: &FlagThresholds) -> Vec<DiagnosticFlag> {
let mut flags = Vec::new();
if profile.missing_pct > thresholds.high_missing_pct {
flags.push(DiagnosticFlag::HighMissing);
}
if let Some(np) = &profile.numeric {
if np.valid_count > 0 && np.distinct_count <= 1 {
flags.push(DiagnosticFlag::ConstantColumn);
} else if np.std_dev.is_finite() && np.std_dev < thresholds.zero_variance_threshold {
flags.push(DiagnosticFlag::ZeroVariance);
}
if np.skewness.is_finite() && np.skewness.abs() > thresholds.extreme_skewness {
flags.push(DiagnosticFlag::ExtremeSkewness);
}
if np.kurtosis.is_finite() && np.kurtosis.abs() > thresholds.extreme_kurtosis {
flags.push(DiagnosticFlag::ExtremeKurtosis);
}
if np.infinity_count > 0 {
flags.push(DiagnosticFlag::ContainsInfinity);
}
if np.valid_count > 1 && np.distinct_count == np.valid_count {
flags.push(DiagnosticFlag::AllUnique);
}
}
if let Some(cp) = &profile.categorical {
if cp.is_constant {
flags.push(DiagnosticFlag::ConstantColumn);
}
if cp.distinct_count > thresholds.high_cardinality {
flags.push(DiagnosticFlag::HighCardinality);
}
if cp.mode_ratio > thresholds.imbalance_ratio {
flags.push(DiagnosticFlag::Imbalanced);
}
if cp.valid_count > 1 && cp.distinct_count == cp.valid_count {
flags.push(DiagnosticFlag::AllUnique);
}
}
if let Some(bp) = &profile.boolean {
if bp.valid_count > 0 && (bp.true_count == 0 || bp.false_count == 0) {
flags.push(DiagnosticFlag::ConstantColumn);
}
let ratio = bp.true_ratio.max(1.0 - bp.true_ratio);
if ratio > thresholds.imbalance_ratio {
flags.push(DiagnosticFlag::Imbalanced);
}
}
if let Some(tp) = &profile.text {
if tp.valid_count > 0 && tp.distinct_count <= 1 {
flags.push(DiagnosticFlag::ConstantColumn);
}
if tp.distinct_count > thresholds.high_cardinality {
flags.push(DiagnosticFlag::HighCardinality);
}
if tp.valid_count > 1 && tp.distinct_count == tp.valid_count {
flags.push(DiagnosticFlag::AllUnique);
}
}
flags
}
#[cfg(test)]
mod tests {
use super::*;
use crate::csv_parser::CsvParser;
use crate::dataframe::ValidityBitmap;
#[test]
fn numeric_basic_stats() {
let col = Column::Numeric {
values: vec![1.0, 2.0, 3.0, 4.0, 5.0],
validity: ValidityBitmap::all_valid(5),
};
let prof = profile_column("x", &col);
assert_eq!(prof.null_count, 0);
assert_eq!(prof.missing_pct, 0.0);
let np = prof.numeric.expect("numeric profile");
assert_eq!(np.valid_count, 5);
assert_eq!(np.min, 1.0);
assert_eq!(np.max, 5.0);
assert!((np.mean - 3.0).abs() < 1e-10);
assert!((np.median - 3.0).abs() < 1e-10);
assert_eq!(np.zero_count, 0);
assert_eq!(np.negative_count, 0);
assert_eq!(np.distinct_count, 5);
}
#[test]
fn numeric_with_nulls() {
let mut validity = ValidityBitmap::all_valid(5);
validity.set_invalid(2);
validity.set_invalid(4);
let col = Column::Numeric {
values: vec![10.0, 20.0, 0.0, 40.0, 0.0],
validity,
};
let prof = profile_column("x", &col);
assert_eq!(prof.null_count, 2);
assert!((prof.missing_pct - 40.0).abs() < 1e-10);
let np = prof.numeric.expect("numeric profile");
assert_eq!(np.valid_count, 3);
assert_eq!(np.min, 10.0);
assert_eq!(np.max, 40.0);
}
#[test]
fn numeric_all_null() {
let col = Column::Numeric {
values: vec![0.0, 0.0, 0.0],
validity: ValidityBitmap::all_invalid(3),
};
let prof = profile_column("x", &col);
let np = prof.numeric.expect("numeric profile");
assert_eq!(np.valid_count, 0);
assert!(np.mean.is_nan());
}
#[test]
fn numeric_zeros_negatives_infinity() {
let col = Column::Numeric {
values: vec![0.0, -1.0, f64::INFINITY, -2.0, f64::NEG_INFINITY, 0.0, 3.0],
validity: ValidityBitmap::all_valid(7),
};
let np = profile_column("x", &col).numeric.expect("numeric");
assert_eq!(np.zero_count, 2);
assert_eq!(np.negative_count, 3); assert_eq!(np.infinity_count, 2);
}
#[test]
fn numeric_quantiles() {
let values: Vec<f64> = (0..=100).map(|i| i as f64).collect();
let n = values.len();
let col = Column::Numeric {
values,
validity: ValidityBitmap::all_valid(n),
};
let np = profile_column("x", &col).numeric.expect("numeric");
assert!((np.q1 - 25.0).abs() < 1.0);
assert!((np.median - 50.0).abs() < 1e-10);
assert!((np.q3 - 75.0).abs() < 1.0);
assert!((np.iqr - 50.0).abs() < 1.0);
}
#[test]
fn numeric_distinct_count() {
let col = Column::Numeric {
values: vec![1.0, 2.0, 1.0, 3.0, 2.0, 1.0],
validity: ValidityBitmap::all_valid(6),
};
let np = profile_column("x", &col).numeric.expect("numeric");
assert_eq!(np.distinct_count, 3);
}
#[test]
fn boolean_basic() {
let col = Column::Boolean {
values: vec![true, false, true, true, false],
validity: ValidityBitmap::all_valid(5),
};
let bp = profile_column("flag", &col).boolean.expect("boolean");
assert_eq!(bp.valid_count, 5);
assert_eq!(bp.true_count, 3);
assert_eq!(bp.false_count, 2);
assert!((bp.true_ratio - 0.6).abs() < 1e-10);
}
#[test]
fn boolean_with_nulls() {
let mut validity = ValidityBitmap::all_valid(4);
validity.set_invalid(1);
let col = Column::Boolean {
values: vec![true, false, false, true],
validity,
};
let bp = profile_column("flag", &col).boolean.expect("boolean");
assert_eq!(bp.valid_count, 3);
assert_eq!(bp.true_count, 2);
assert_eq!(bp.false_count, 1);
}
#[test]
fn categorical_basic() {
let dict = vec!["A".into(), "B".into(), "C".into()];
let indices = vec![0, 1, 0, 2, 0, 1, 0];
let col = Column::Categorical {
dictionary: dict,
indices,
validity: ValidityBitmap::all_valid(7),
};
let cp = profile_column("cat", &col)
.categorical
.expect("categorical");
assert_eq!(cp.valid_count, 7);
assert_eq!(cp.distinct_count, 3);
assert!(!cp.is_constant);
assert_eq!(cp.top_values[0].0, "A");
assert_eq!(cp.top_values[0].1, 4);
assert!((cp.mode_ratio - 4.0 / 7.0).abs() < 1e-10);
}
#[test]
fn categorical_constant() {
let dict = vec!["X".into()];
let indices = vec![0, 0, 0];
let col = Column::Categorical {
dictionary: dict,
indices,
validity: ValidityBitmap::all_valid(3),
};
let cp = profile_column("cat", &col)
.categorical
.expect("categorical");
assert!(cp.is_constant);
assert_eq!(cp.distinct_count, 1);
assert!((cp.mode_ratio - 1.0).abs() < 1e-10);
}
#[test]
fn categorical_with_nulls() {
let dict = vec!["A".into(), "B".into()];
let indices = vec![0, 0, 1, 0];
let mut validity = ValidityBitmap::all_valid(4);
validity.set_invalid(2);
let col = Column::Categorical {
dictionary: dict,
indices,
validity,
};
let cp = profile_column("cat", &col)
.categorical
.expect("categorical");
assert_eq!(cp.valid_count, 3); assert_eq!(cp.distinct_count, 1); assert!(cp.is_constant);
}
#[test]
fn text_basic() {
let values = vec![
"hello".into(),
"world".into(),
"hi".into(),
"".into(),
"hello".into(),
];
let col = Column::Text {
values,
validity: ValidityBitmap::all_valid(5),
};
let tp = profile_column("txt", &col).text.expect("text");
assert_eq!(tp.valid_count, 5);
assert_eq!(tp.distinct_count, 4); assert_eq!(tp.min_length, 0);
assert_eq!(tp.max_length, 5);
assert_eq!(tp.empty_count, 1);
}
#[test]
fn text_with_nulls() {
let values = vec!["abc".into(), String::new(), "de".into()];
let mut validity = ValidityBitmap::all_valid(3);
validity.set_invalid(1);
let col = Column::Text { values, validity };
let tp = profile_column("txt", &col).text.expect("text");
assert_eq!(tp.valid_count, 2);
assert_eq!(tp.min_length, 2);
assert_eq!(tp.max_length, 3);
}
#[test]
fn profile_from_csv() {
let csv = "x,y,ok\n1.0,A,true\n2.0,B,false\nNA,A,true\n4.0,A,NA\n5.0,B,false\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let profiles = profile_dataframe(&df);
assert_eq!(profiles.len(), 3);
let x = &profiles[0];
assert_eq!(x.name, "x");
assert_eq!(x.data_type, DataType::Numeric);
assert_eq!(x.null_count, 1);
assert!(x.numeric.is_some());
assert_eq!(x.numeric.as_ref().unwrap().valid_count, 4);
let y = &profiles[1];
assert_eq!(y.name, "y");
assert_eq!(y.data_type, DataType::Categorical);
assert_eq!(y.null_count, 0);
assert!(y.categorical.is_some());
assert_eq!(y.categorical.as_ref().unwrap().distinct_count, 2);
let ok = &profiles[2];
assert_eq!(ok.name, "ok");
assert_eq!(ok.data_type, DataType::Boolean);
assert_eq!(ok.null_count, 1);
assert!(ok.boolean.is_some());
}
#[test]
fn empty_dataframe_profile() {
let df = DataFrame::new();
let profiles = profile_dataframe(&df);
assert!(profiles.is_empty());
}
#[test]
fn numeric_skewness_kurtosis() {
let values: Vec<f64> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
let n = values.len();
let col = Column::Numeric {
values,
validity: ValidityBitmap::all_valid(n),
};
let np = profile_column("x", &col).numeric.expect("numeric");
assert!(
np.skewness.abs() < 0.5,
"skewness should be near zero for uniform-like data"
);
}
#[test]
fn negative_count_excludes_neg_infinity() {
let col = Column::Numeric {
values: vec![-1.0, f64::NEG_INFINITY, 0.0],
validity: ValidityBitmap::all_valid(3),
};
let np = profile_column("x", &col).numeric.expect("numeric");
assert_eq!(np.negative_count, 2);
assert_eq!(np.infinity_count, 1);
}
#[test]
fn dataset_profile_basic() {
let csv = "x,y,ok\n1.0,A,true\n2.0,B,false\n3.0,A,true\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.row_count, 3);
assert_eq!(ds.column_count, 3);
assert_eq!(ds.type_counts.numeric, 1);
assert_eq!(ds.type_counts.boolean, 1);
assert_eq!(ds.total_nulls, 0);
assert!((ds.sparsity_pct - 0.0).abs() < 1e-10);
assert!(ds.memory_bytes > 0);
assert_eq!(ds.columns.len(), 3);
}
#[test]
fn dataset_profile_with_nulls() {
let csv = "x,y\n1.0,A\nNA,B\n3.0,NA\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.total_nulls, 2);
assert!((ds.sparsity_pct - 33.333).abs() < 1.0);
}
#[test]
fn dataset_empty() {
let df = DataFrame::new();
let ds = profile_dataset(&df);
assert_eq!(ds.row_count, 0);
assert_eq!(ds.column_count, 0);
assert_eq!(ds.total_nulls, 0);
assert!((ds.sparsity_pct - 0.0).abs() < 1e-10);
}
#[test]
fn iqr_outlier_detection() {
let data = vec![1.0, 2.0, 2.5, 3.0, 2.0, 3.0, 2.5, 100.0];
let col = Column::numeric(data, ValidityBitmap::all_valid(8));
let result = detect_outliers(&col, OutlierMethod::Iqr).unwrap();
assert!(result.count >= 1);
assert!(result.indices.contains(&7)); }
#[test]
fn zscore_outlier_detection() {
let mut data: Vec<f64> = (0..50).map(|i| i as f64).collect();
data.push(500.0); let n = data.len();
let col = Column::numeric(data, ValidityBitmap::all_valid(n));
let result = detect_outliers(&col, OutlierMethod::Zscore).unwrap();
assert!(result.count >= 1);
assert!(result.indices.contains(&50)); }
#[test]
fn modified_zscore_outlier_detection() {
let data = vec![1.0, 2.0, 2.5, 3.0, 2.0, 3.0, 2.5, 50.0];
let col = Column::numeric(data, ValidityBitmap::all_valid(8));
let result = detect_outliers(&col, OutlierMethod::ModifiedZscore).unwrap();
assert!(result.count >= 1);
assert!(result.indices.contains(&7)); }
#[test]
fn outlier_with_nulls() {
let mut validity = ValidityBitmap::all_valid(5);
validity.set_invalid(2);
let col = Column::numeric(vec![1.0, 2.0, 0.0, 3.0, 100.0], validity);
let result = detect_outliers(&col, OutlierMethod::Iqr).unwrap();
assert!(result.count >= 1);
assert!(result.indices.contains(&4));
assert!(!result.indices.contains(&2)); }
#[test]
fn outlier_not_numeric() {
let col = Column::text(vec!["a".into()], ValidityBitmap::all_valid(1));
assert!(detect_outliers(&col, OutlierMethod::Iqr).is_none());
}
#[test]
fn outlier_too_few_values() {
let col = Column::numeric(vec![1.0, 2.0], ValidityBitmap::all_valid(2));
let result = detect_outliers(&col, OutlierMethod::Iqr).unwrap();
assert_eq!(result.count, 0);
}
#[test]
fn no_outliers_in_uniform_data() {
let data: Vec<f64> = (1..=10).map(|i| i as f64).collect();
let col = Column::numeric(data, ValidityBitmap::all_valid(10));
let result = detect_outliers(&col, OutlierMethod::Zscore).unwrap();
assert_eq!(result.count, 0);
}
#[test]
fn outlier_zero_variance() {
let data = vec![5.0; 10];
let col = Column::numeric(data, ValidityBitmap::all_valid(10));
let result = detect_outliers(&col, OutlierMethod::Zscore).unwrap();
assert_eq!(result.count, 0); }
#[test]
fn flag_constant_numeric() {
let col = Column::numeric(vec![5.0, 5.0, 5.0], ValidityBitmap::all_valid(3));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::ConstantColumn));
}
#[test]
fn flag_high_missing() {
let mut validity = ValidityBitmap::all_valid(10);
for i in 0..6 {
validity.set_invalid(i);
}
let col = Column::numeric(vec![0.0; 10], validity);
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::HighMissing));
}
#[test]
fn flag_extreme_skewness() {
let mut data = vec![1.0; 100];
data.push(10000.0);
let n = data.len();
let col = Column::numeric(data, ValidityBitmap::all_valid(n));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(
flags.contains(&DiagnosticFlag::ExtremeSkewness),
"skewness = {}",
prof.numeric.as_ref().unwrap().skewness
);
}
#[test]
fn flag_contains_infinity() {
let col = Column::numeric(vec![1.0, f64::INFINITY, 3.0], ValidityBitmap::all_valid(3));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::ContainsInfinity));
}
#[test]
fn flag_all_unique_numeric() {
let col = Column::numeric(vec![1.0, 2.0, 3.0, 4.0], ValidityBitmap::all_valid(4));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::AllUnique));
}
#[test]
fn flag_imbalanced_boolean() {
let mut values = vec![true; 95];
values.extend(vec![false; 5]);
let n = values.len();
let col = Column::boolean(values, ValidityBitmap::all_valid(n));
let prof = profile_column("flag", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::Imbalanced));
}
#[test]
fn flag_constant_categorical() {
let dict = vec!["X".into()];
let indices = vec![0, 0, 0, 0, 0];
let col = Column::categorical(dict, indices, ValidityBitmap::all_valid(5));
let prof = profile_column("cat", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(flags.contains(&DiagnosticFlag::ConstantColumn));
}
#[test]
fn no_flags_for_normal_data() {
let data: Vec<f64> = (1..=100).map(|i| i as f64).collect();
let col = Column::numeric(data, ValidityBitmap::all_valid(100));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(!flags.contains(&DiagnosticFlag::ExtremeSkewness));
assert!(!flags.contains(&DiagnosticFlag::ExtremeKurtosis));
assert!(!flags.contains(&DiagnosticFlag::ConstantColumn));
}
#[test]
fn custom_thresholds() {
let col = Column::numeric(vec![1.0, 2.0, 3.0], ValidityBitmap::all_valid(3));
let prof = profile_column("x", &col);
let flags = compute_flags(&prof, &FlagThresholds::default());
assert!(!flags.contains(&DiagnosticFlag::ExtremeSkewness));
let strict = FlagThresholds {
extreme_skewness: 0.0001,
..FlagThresholds::default()
};
let flags2 = compute_flags(&prof, &strict);
assert!(!flags2.contains(&DiagnosticFlag::ExtremeSkewness));
}
#[test]
fn quality_score_perfect() {
let csv = "x,y\n1.0,A\n2.0,B\n3.0,C\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
let q = &ds.quality_score;
assert!((q.completeness - 1.0).abs() < 1e-10); assert!((q.uniqueness - 1.0).abs() < 1e-10); assert!(
q.overall > 0.5,
"overall should be reasonable: {}",
q.overall
);
}
#[test]
fn quality_score_with_nulls() {
let csv = "x,y\n1.0,A\nNA,B\n3.0,NA\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
let q = &ds.quality_score;
assert!((q.completeness - 4.0 / 6.0).abs() < 1e-10);
}
#[test]
fn quality_score_with_duplicates() {
let csv = "x,y\n1.0,A\n1.0,A\n2.0,B\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
let q = &ds.quality_score;
assert!((q.uniqueness - 2.0 / 3.0).abs() < 1e-10);
}
#[test]
fn quality_score_empty() {
let df = DataFrame::new();
let ds = profile_dataset(&df);
let q = &ds.quality_score;
assert!((q.overall - 1.0).abs() < 1e-10);
}
#[test]
fn quality_score_dimensions_in_range() {
let csv = "a,b,c\n1.0,x,true\nNA,y,false\n3.0,x,true\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
let q = &ds.quality_score;
for score in [
q.completeness,
q.uniqueness,
q.validity,
q.consistency,
q.overall,
] {
assert!((0.0..=1.0).contains(&score), "score {score} out of range");
}
}
#[test]
fn duplicate_rows_none() {
let csv = "x,y\n1.0,A\n2.0,B\n3.0,C\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.duplicate_count, 0);
assert!((ds.duplicate_pct - 0.0).abs() < 1e-10);
}
#[test]
fn duplicate_rows_all_same() {
let csv = "x,y\n1.0,A\n1.0,A\n1.0,A\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.duplicate_count, 2); assert!((ds.duplicate_pct - 200.0 / 3.0).abs() < 0.1);
}
#[test]
fn duplicate_rows_partial() {
let csv = "x,y\n1.0,A\n2.0,B\n1.0,A\n3.0,C\n2.0,B\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.duplicate_count, 2); }
#[test]
fn duplicate_rows_empty() {
let df = DataFrame::new();
let ds = profile_dataset(&df);
assert_eq!(ds.duplicate_count, 0);
assert!((ds.duplicate_pct - 0.0).abs() < 1e-10);
}
#[test]
fn duplicate_rows_with_nulls() {
let csv = "x,y\n1.0,A\nNA,B\nNA,B\n2.0,C\n";
let df = CsvParser::new().parse_str(csv).unwrap();
let ds = profile_dataset(&df);
assert_eq!(ds.duplicate_count, 1); }
#[test]
fn detect_outliers_iqr_populates_fences() {
let col = Column::numeric(
vec![1.0, 2.0, 3.0, 2.5, 100.0, 2.0, 3.0, 2.0],
ValidityBitmap::all_valid(8),
);
let r = detect_outliers(&col, OutlierMethod::Iqr).unwrap();
assert!(r.lower_fence.is_finite());
assert!(r.upper_fence.is_finite());
assert!(r.upper_fence < 100.0); assert!(r.center.is_finite());
assert!(r.spread > 0.0);
}
#[test]
fn detect_outliers_slice_iqr_basic() {
let data = [1.0, 2.0, 3.0, 2.5, 100.0, 2.0, 3.0, 2.0];
let r = detect_outliers_slice(&data, OutlierMethod::Iqr).unwrap();
assert!(r.indices.contains(&4));
assert!(r.upper_fence < 100.0);
}
#[test]
fn detect_outliers_slice_skips_non_finite() {
let data = [1.0, 2.0, 3.0, f64::NAN, 100.0, 2.0, 3.0, f64::INFINITY];
let r = detect_outliers_slice(&data, OutlierMethod::Iqr).unwrap();
assert!(r.indices.contains(&4));
assert!(!r.indices.contains(&3));
assert!(!r.indices.contains(&7));
}
#[test]
fn detect_outliers_slice_zscore_fences() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let r = detect_outliers_slice(&data, OutlierMethod::Zscore).unwrap();
let expected_upper = r.center + 3.0 * r.spread;
let expected_lower = r.center - 3.0 * r.spread;
assert!((r.upper_fence - expected_upper).abs() < 1e-9);
assert!((r.lower_fence - expected_lower).abs() < 1e-9);
}
#[test]
fn detect_outliers_modified_zscore_fence_matches_threshold() {
let data: Vec<f64> = (0..100).map(|i| i as f64).collect();
let r = detect_outliers_slice(&data, OutlierMethod::ModifiedZscore).unwrap();
let scale = r.spread / 0.6745;
let expected_upper = r.center + 3.5 * scale;
assert!((r.upper_fence - expected_upper).abs() < 1e-9);
}
#[test]
fn detect_outliers_too_few_returns_nan_fences() {
let data = [1.0, 2.0];
let r = detect_outliers_slice(&data, OutlierMethod::Iqr).unwrap();
assert_eq!(r.count, 0);
assert!(r.lower_fence.is_nan());
assert!(r.upper_fence.is_nan());
}
}