use std::collections::HashSet;
/// A named bundle of data-quality thresholds and per-column expectations.
///
/// Profiles are obtained from [`Default`], [`QualityProfile::new`], one of the
/// built-in presets (`doctest_corpus`, `ml_training`, `time_series`), or by
/// name through [`QualityProfile::by_name`]. The `with_*` builder methods
/// adjust individual settings.
///
/// NOTE(review): the code that scores a dataset against these thresholds is
/// not part of this file; field notes marked "presumably" describe the
/// apparent intent and should be confirmed against that consumer.
#[derive(Debug, Clone)]
pub struct QualityProfile {
/// Identifier of the profile, e.g. `"default"` or `"ml-training"`.
pub name: String,
/// Free-form, human-readable summary of what the profile is meant for.
pub description: String,
/// Column names for which `is_expected_constant` returns `true`.
/// Presumably these columns are allowed to hold a single value without
/// being penalized — TODO confirm with the scoring code.
pub expected_constant_columns: HashSet<String>,
/// Column names for which `is_nullable` returns `true`;
/// `null_threshold_for` reports a threshold of `1.0` for them.
pub nullable_columns: HashSet<String>,
/// Null threshold reported by `null_threshold_for` for every column that is
/// not listed in `nullable_columns` (`0.1` in the default profile).
pub max_null_ratio: f64,
/// Duplicate-ratio limit (`0.5` in the default profile). Presumably the
/// maximum tolerated fraction of duplicated values within a column —
/// TODO confirm.
pub max_duplicate_ratio: f64,
/// Cardinality floor (`2` in every built-in profile). Presumably the
/// minimum number of distinct values a column should have — TODO confirm.
pub min_cardinality: usize,
/// Outlier-ratio limit (`0.05` in the default profile). Presumably the
/// maximum tolerated fraction of outlier values — TODO confirm.
pub max_outlier_ratio: f64,
/// Duplicate-row-ratio limit (`0.01` in the default profile). Presumably
/// the maximum tolerated fraction of fully duplicated rows — TODO confirm.
pub max_duplicate_row_ratio: f64,
/// Flag that is `true` in every built-in profile. Presumably enables a
/// penalty for constant columns that are not listed in
/// `expected_constant_columns` — TODO confirm.
pub penalize_unexpected_constants: bool,
/// Flag that is `false` in every built-in profile. Presumably demands a
/// populated `signature` column when set — TODO confirm.
pub require_signature: bool,
}
impl Default for QualityProfile {
    /// Builds the general-purpose profile named `"default"`.
    ///
    /// Both column sets start empty. The numeric limits are:
    /// `max_null_ratio = 0.1`, `max_duplicate_ratio = 0.5`,
    /// `min_cardinality = 2`, `max_outlier_ratio = 0.05` and
    /// `max_duplicate_row_ratio = 0.01`. `penalize_unexpected_constants` is
    /// enabled and `require_signature` is disabled.
    fn default() -> Self {
        Self {
            // Identity of the profile.
            name: String::from("default"),
            description: String::from("General-purpose quality profile"),

            // No column receives special treatment out of the box.
            expected_constant_columns: HashSet::default(),
            nullable_columns: HashSet::default(),

            // Ratio limits.
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,

            // Cardinality floor.
            min_cardinality: 2,

            // Behaviour switches.
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }
}
impl QualityProfile {
    /// Creates a profile called `name`; every other field takes the value
    /// produced by [`QualityProfile::default`].
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Self::default()
        }
    }

    /// Looks up a built-in profile by its canonical name or short alias.
    ///
    /// Recognized spellings (matched exactly, case-sensitively):
    /// `"default"`, `"doctest-corpus"` / `"doctest"`,
    /// `"ml-training"` / `"ml"` and `"time-series"` / `"timeseries"`.
    /// Any other input yields `None`.
    #[must_use]
    pub fn by_name(name: &str) -> Option<Self> {
        let preset = match name {
            "default" => Self::default(),
            "doctest-corpus" | "doctest" => Self::doctest_corpus(),
            "ml-training" | "ml" => Self::ml_training(),
            "time-series" | "timeseries" => Self::time_series(),
            // Unknown name: there is no preset to hand back.
            _ => return None,
        };
        Some(preset)
    }

    /// Returns the canonical names of the built-in profiles (aliases accepted
    /// by [`QualityProfile::by_name`] are not included).
    #[must_use]
    pub fn available_profiles() -> Vec<&'static str> {
        ["default", "doctest-corpus", "ml-training", "time-series"].to_vec()
    }

    /// Preset named `"doctest-corpus"`.
    ///
    /// Marks `source` and `version` as expected-constant columns and
    /// `signature` as nullable. Limits: `max_null_ratio = 0.05`,
    /// `max_duplicate_ratio = 0.3`, `min_cardinality = 2`,
    /// `max_outlier_ratio = 0.05`, `max_duplicate_row_ratio = 0.0`.
    #[must_use]
    pub fn doctest_corpus() -> Self {
        // Turns a list of column names into an owned set.
        let column_set = |columns: &[&str]| -> HashSet<String> {
            columns.iter().map(|column| (*column).to_string()).collect()
        };

        Self {
            name: String::from("doctest-corpus"),
            description: String::from("Profile for Python doctest extraction datasets"),
            expected_constant_columns: column_set(&["source", "version"]),
            nullable_columns: column_set(&["signature"]),
            max_null_ratio: 0.05,
            max_duplicate_ratio: 0.3,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.0,
            min_cardinality: 2,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }

    /// Preset named `"ml-training"`.
    ///
    /// No special columns. Limits: `max_null_ratio = 0.0`,
    /// `max_duplicate_ratio = 0.8`, `min_cardinality = 2`,
    /// `max_outlier_ratio = 0.1`, `max_duplicate_row_ratio = 0.01`.
    #[must_use]
    pub fn ml_training() -> Self {
        Self {
            name: String::from("ml-training"),
            description: String::from("Profile for machine learning training datasets"),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.0,
            max_duplicate_ratio: 0.8,
            max_outlier_ratio: 0.1,
            max_duplicate_row_ratio: 0.01,
            min_cardinality: 2,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }

    /// Preset named `"time-series"`.
    ///
    /// No special columns. Limits: `max_null_ratio = 0.05`,
    /// `max_duplicate_ratio = 0.5`, `min_cardinality = 2`,
    /// `max_outlier_ratio = 0.1`, `max_duplicate_row_ratio = 0.0`.
    #[must_use]
    pub fn time_series() -> Self {
        Self {
            name: String::from("time-series"),
            description: String::from("Profile for time series datasets"),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.05,
            max_duplicate_ratio: 0.5,
            max_outlier_ratio: 0.1,
            max_duplicate_row_ratio: 0.0,
            min_cardinality: 2,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }

    /// Builder: replaces the description and returns the profile.
    #[must_use]
    pub fn with_description(self, desc: impl Into<String>) -> Self {
        Self {
            description: desc.into(),
            ..self
        }
    }

    /// Builder: adds `column` to the expected-constant set and returns the
    /// profile. Adding a column that is already present changes nothing.
    #[must_use]
    pub fn with_expected_constant(mut self, column: impl Into<String>) -> Self {
        let column_name: String = column.into();
        self.expected_constant_columns.insert(column_name);
        self
    }

    /// Builder: adds `column` to the nullable set and returns the profile.
    /// Adding a column that is already present changes nothing.
    #[must_use]
    pub fn with_nullable(mut self, column: impl Into<String>) -> Self {
        let column_name: String = column.into();
        self.nullable_columns.insert(column_name);
        self
    }

    /// Builder: stores `ratio` as `max_null_ratio` (the value is taken as
    /// given, without range checking) and returns the profile.
    #[must_use]
    pub fn with_max_null_ratio(self, ratio: f64) -> Self {
        Self {
            max_null_ratio: ratio,
            ..self
        }
    }

    /// Builder: stores `ratio` as `max_duplicate_ratio` (the value is taken
    /// as given, without range checking) and returns the profile.
    #[must_use]
    pub fn with_max_duplicate_ratio(self, ratio: f64) -> Self {
        Self {
            max_duplicate_ratio: ratio,
            ..self
        }
    }

    /// Returns `true` when `column` is listed in `expected_constant_columns`.
    #[must_use]
    pub fn is_expected_constant(&self, column: &str) -> bool {
        let expected = &self.expected_constant_columns;
        expected.contains(column)
    }

    /// Returns `true` when `column` is listed in `nullable_columns`.
    #[must_use]
    pub fn is_nullable(&self, column: &str) -> bool {
        let nullable = &self.nullable_columns;
        nullable.contains(column)
    }

    /// Returns the null threshold that applies to `column`: `1.0` for columns
    /// listed as nullable, otherwise the profile-wide `max_null_ratio`.
    #[must_use]
    pub fn null_threshold_for(&self, column: &str) -> f64 {
        if !self.is_nullable(column) {
            return self.max_null_ratio;
        }
        // Nullable columns get the fixed threshold of 1.0.
        1.0
    }
}