#![allow(
clippy::unwrap_used,
clippy::expect_used,
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::cast_precision_loss,
clippy::cast_lossless,
clippy::uninlined_format_args,
clippy::too_many_lines,
clippy::similar_names,
clippy::float_cmp,
clippy::needless_late_init,
clippy::redundant_clone,
clippy::doc_markdown,
clippy::unnecessary_debug_formatting
)]
use std::sync::Arc;
use alimentar::{ArrowDataset, Dataset, QualityChecker, QualityIssue, QualityReport};
use arrow::{
array::{Float64Array, Int32Array, StringArray},
datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
};
fn create_messy_dataset() -> alimentar::Result<ArrowDataset> {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, true), Field::new("age", DataType::Float64, true), Field::new("salary", DataType::Float64, true), Field::new("department", DataType::Utf8, true), ]));
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 3, 9, 10])), Arc::new(StringArray::from(vec![
Some("Alice"),
Some("Bob"),
None,
Some("David"),
Some("Eve"),
None,
Some("Grace"),
Some("Henry"),
Some("Ivy"),
Some("Jack"),
])),
Arc::new(Float64Array::from(vec![
Some(25.0),
Some(30.0),
Some(35.0),
None,
Some(28.0),
Some(150.0), Some(32.0),
Some(29.0),
None,
Some(27.0),
])),
Arc::new(Float64Array::from(vec![
Some(50000.0),
Some(60000.0),
Some(55000.0),
Some(70000.0),
Some(-5000.0), Some(65000.0),
Some(58000.0),
Some(62000.0),
Some(54000.0),
Some(0.0), ])),
Arc::new(StringArray::from(vec![
Some("Engineering"),
Some("Sales"),
Some("Engineering"),
Some("HR"),
Some("Sales"),
Some("Marketing"),
Some("Engineering"),
Some("Sales"),
Some(""), Some("HR"),
])),
],
)?;
ArrowDataset::from_batch(batch)
}
fn create_clean_dataset() -> alimentar::Result<ArrowDataset> {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("value", DataType::Float64, false),
]));
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0, 40.0, 50.0])),
],
)?;
ArrowDataset::from_batch(batch)
}
fn print_quality_report(report: &QualityReport) {
println!(" Total rows: {}", report.row_count);
println!(" Total columns: {}", report.column_count);
println!("\n Column Quality:");
for (col_name, col_quality) in &report.columns {
println!(
" {}: null_count={}, null_ratio={:.2}%",
col_name,
col_quality.null_count,
col_quality.null_ratio * 100.0
);
}
}
fn main() -> alimentar::Result<()> {
println!("=== Alimentar Data Quality Check Example ===\n");
let messy_data = create_messy_dataset()?;
let clean_data = create_clean_dataset()?;
println!("Created datasets:");
println!(
" Messy data: {} rows, {} columns",
messy_data.len(),
messy_data.schema().fields().len()
);
println!(
" Clean data: {} rows, {} columns",
clean_data.len(),
clean_data.schema().fields().len()
);
println!("\n1. Basic quality check on messy data");
let checker = QualityChecker::new();
let report = checker.check(&messy_data)?;
print_quality_report(&report);
println!("\n2. Detected quality issues");
for issue in &report.issues {
match issue {
QualityIssue::HighNullRatio {
column,
null_ratio,
threshold,
} => {
println!(
" [HIGH NULL] Column '{}': {:.1}% nulls (threshold: {:.1}%)",
column,
null_ratio * 100.0,
threshold * 100.0
);
}
QualityIssue::HighDuplicateRatio {
column,
duplicate_ratio,
threshold,
} => {
println!(
" [DUPLICATES] Column '{}': {:.1}% duplicates (threshold: {:.1}%)",
column,
duplicate_ratio * 100.0,
threshold * 100.0
);
}
QualityIssue::LowCardinality {
column,
unique_count,
total_count,
} => {
println!(
" [LOW CARDINALITY] Column '{}': {} unique of {} total",
column, unique_count, total_count
);
}
QualityIssue::OutliersDetected {
column,
outlier_count,
outlier_ratio,
} => {
println!(
" [OUTLIERS] Column '{}': {} outliers ({:.1}%)",
column,
outlier_count,
outlier_ratio * 100.0
);
}
QualityIssue::DuplicateRows {
duplicate_count,
duplicate_ratio,
} => {
println!(
" [DUPLICATE ROWS] {} duplicate rows ({:.1}%)",
duplicate_count,
duplicate_ratio * 100.0
);
}
QualityIssue::ConstantColumn { column, value } => {
println!(
" [CONSTANT] Column '{}' has only one unique value: {:?}",
column, value
);
}
QualityIssue::EmptySchema => {
println!(" [EMPTY SCHEMA] Dataset has no columns");
}
QualityIssue::EmptyDataset => {
println!(" [EMPTY DATASET] Dataset has no rows");
}
}
}
if report.issues.is_empty() {
println!(" No quality issues detected!");
}
println!("\n3. Quality check on clean data");
let clean_checker = QualityChecker::new();
let clean_report = clean_checker.check(&clean_data)?;
print_quality_report(&clean_report);
println!(" Issues found: {}", clean_report.issues.len());
if clean_report.issues.is_empty() {
println!(" Data is clean!");
}
println!("\n4. Customized quality checker with thresholds");
let strict_checker = QualityChecker::new()
.max_null_ratio(0.05) .max_duplicate_ratio(0.5);
let strict_report = strict_checker.check(&messy_data)?;
println!(" With strict thresholds:");
println!(" Issues found: {}", strict_report.issues.len());
for issue in &strict_report.issues {
println!(" - {:?}", issue);
}
println!("\n5. Individual column analysis");
for (col_name, col_quality) in &report.columns {
println!("\n Column '{}':", col_name);
println!(" Null count: {}", col_quality.null_count);
println!(" Null ratio: {:.2}%", col_quality.null_ratio * 100.0);
println!(" Unique count: {}", col_quality.unique_count);
println!(
" Unique ratio: {:.2}%",
col_quality.unique_ratio * 100.0
);
}
println!("\n6. Quality improvement recommendations");
for (col_name, col_quality) in &report.columns {
if col_quality.null_ratio > 0.5 {
println!(" [CRITICAL] Drop column '{}' (>50% missing)", col_name);
} else if col_quality.null_ratio > 0.1 {
println!(" [ACTION] Impute missing values in '{}'", col_name);
} else if col_quality.null_ratio > 0.0 {
println!(" [MINOR] Consider imputation for '{}'", col_name);
}
}
println!("\n7. Quality summary");
let total_nulls: usize = report.columns.values().map(|c| c.null_count).sum();
let total_cells = report.row_count * report.column_count;
let overall_completeness = if total_cells > 0 {
1.0 - (total_nulls as f64 / total_cells as f64)
} else {
1.0
};
println!(
" Overall completeness: {:.1}%",
overall_completeness * 100.0
);
println!(" Total null values: {}", total_nulls);
println!(" Total cells: {}", total_cells);
println!("\n=== Example Complete ===");
Ok(())
}