//! Quality checking CLI commands.
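//!
//! Typical invocations, assuming the binary is installed as `alimentar`
//! (the name used in the usage hint printed by `quality profiles`):
//!
//! ```text
//! alimentar quality check data.parquet --format json
//! alimentar quality report data.parquet --output quality.json
//! alimentar quality score data.parquet --profile doctest-corpus --suggest
//! alimentar quality profiles
//! ```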
use std::path::PathBuf;
use clap::Subcommand;
use crate::quality::{ColumnQuality, QualityChecker};
use super::basic::load_dataset;
/// Quality checking commands.
#[derive(Subcommand)]
pub enum QualityCommands {
/// Check data quality of a dataset
Check {
/// Path to dataset file
path: PathBuf,
/// Null ratio threshold (0.0 to 1.0)
#[arg(long, default_value = "0.1")]
null_threshold: f64,
/// Duplicate ratio threshold (0.0 to 1.0)
#[arg(long, default_value = "0.05")]
duplicate_threshold: f64,
/// Enable outlier detection (defaults to true; pass `--detect-outliers false` to disable)
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
detect_outliers: bool,
/// Output format (text, json)
#[arg(short, long, default_value = "text")]
format: String,
},
/// Generate a quality report
Report {
/// Path to dataset file
path: PathBuf,
/// Output file for the report (JSON format)
#[arg(short, long)]
output: Option<PathBuf>,
},
/// Calculate 100-point quality score with letter grade (GH-6)
Score {
/// Path to dataset file
path: PathBuf,
/// Quality profile to use (default, doctest-corpus, ml-training,
/// time-series)
#[arg(short, long, default_value = "default")]
profile: String,
/// Show improvement suggestions for failed checks
#[arg(long)]
suggest: bool,
/// Output as JSON for CI/CD integration
#[arg(long)]
json: bool,
/// Output badge URL for shields.io
#[arg(long)]
badge: bool,
},
/// List available quality profiles
Profiles,
}
/// Check data quality of a dataset.
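///
/// With `--format json`, prints a single JSON document of the following
/// shape (field values here are illustrative):
///
/// ```json
/// {
///   "path": "data.parquet",
///   "rows": 100,
///   "columns": 2,
///   "has_issues": false,
///   "score": 95.0,
///   "issues": [],
///   "column_qualities": [
///     {
///       "column": "id",
///       "null_ratio": 0.0,
///       "unique_count": 100,
///       "is_constant": false,
///       "is_mostly_null": false
///     }
///   ]
/// }
/// ```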
pub(crate) fn cmd_quality_check(
path: &PathBuf,
null_threshold: f64,
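// Accepted for CLI symmetry but not yet applied; duplicate detection
// currently relies on `QualityChecker`'s built-in threshold.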
_duplicate_threshold: f64,
detect_outliers: bool,
format: &str,
) -> crate::Result<()> {
let dataset = load_dataset(path)?;
let mut checker = QualityChecker::new();
if !detect_outliers {
checker = checker.with_outlier_check(false);
}
let report = checker.check(&dataset)?;
if format == "json" {
let json = serde_json::json!({
"path": path.display().to_string(),
"rows": report.row_count,
"columns": report.column_count,
"has_issues": !report.issues.is_empty(),
"score": report.score,
"issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
"column_qualities": report.columns.iter().map(|(name, c)| {
serde_json::json!({
"column": name,
"null_ratio": c.null_ratio,
"unique_count": c.unique_count,
"is_constant": c.is_constant(),
"is_mostly_null": c.null_ratio > null_threshold,
})
}).collect::<Vec<_>>()
});
println!(
"{}",
serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?
);
} else {
println!("Data Quality Report");
println!("===================");
println!("File: {}", path.display());
println!("Rows: {}", report.row_count);
println!("Columns: {}", report.column_count);
println!();
println!("Quality Score: {:.1}%", report.score);
println!();
if report.issues.is_empty() {
println!("\u{2713} No quality issues found\n");
} else {
println!("Issues Found:");
println!("-------------");
for issue in &report.issues {
println!(" - {:?}", issue);
}
println!();
}
println!(
"{:<20} {:<12} {:<12} {:<10}",
"COLUMN", "NULL %", "UNIQUE", "STATUS"
);
println!("{}", "-".repeat(60));
for (name, col) in &report.columns {
let status = if col.is_constant() {
"CONSTANT"
} else if col.null_ratio > null_threshold {
"HIGH NULL"
} else {
"OK"
};
println!(
"{:<20} {:<12.2} {:<12} {:<10}",
name,
col.null_ratio * 100.0,
col.unique_count,
status
);
}
}
Ok(())
}
/// Generate a quality report.
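///
/// Emits the same JSON document as `quality check --format json` (minus
/// the per-column `is_mostly_null` flag), written to `output` when given,
/// otherwise printed to stdout.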
pub(crate) fn cmd_quality_report(path: &PathBuf, output: Option<&PathBuf>) -> crate::Result<()> {
let dataset = load_dataset(path)?;
let report = QualityChecker::new().check(&dataset)?;
let json = serde_json::json!({
"path": path.display().to_string(),
"rows": report.row_count,
"columns": report.column_count,
"has_issues": !report.issues.is_empty(),
"score": report.score,
"issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
"column_qualities": report.columns.iter().map(|(name, c)| {
serde_json::json!({
"column": name,
"null_ratio": c.null_ratio,
"unique_count": c.unique_count,
"is_constant": c.is_constant(),
})
}).collect::<Vec<_>>()
});
let json_str =
serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?;
if let Some(output_path) = output {
std::fs::write(output_path, &json_str).map_err(|e| crate::Error::io(e, output_path))?;
println!("Quality report written to: {}", output_path.display());
} else {
println!("{}", json_str);
}
Ok(())
}
/// Calculate 100-point quality score with letter grade (GH-6).
///
/// Implements the Doctest Corpus QA Checklist for Publication with
/// weighted scoring per Toyota Way Jidoka principles.
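///
/// Checklist items are weighted by severity (Critical 2.0x, High 1.5x,
/// Medium 1.0x, Low 0.5x; see `build_checklist_from_report`). The process
/// exits with a non-zero status when any critical check fails so CI
/// pipelines can gate on the result.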
#[allow(clippy::too_many_lines)]
pub(crate) fn cmd_quality_score(
path: &PathBuf,
profile_name: &str,
suggest: bool,
json_output: bool,
badge_output: bool,
) -> crate::Result<()> {
use crate::quality::{QualityProfile, QualityScore, Severity};
// Load the quality profile
let profile = QualityProfile::by_name(profile_name).ok_or_else(|| {
crate::Error::Format(format!(
"Unknown quality profile '{}'. Available: {:?}",
profile_name,
QualityProfile::available_profiles()
))
})?;
let dataset = load_dataset(path)?;
let report = QualityChecker::new().check(&dataset)?;
// Wire QualityReport to ChecklistItems per the 100-point checklist
let checklist = build_checklist_from_report(&report, &profile);
let score = QualityScore::from_checklist(checklist);
// Output based on flags
if badge_output {
println!("{}", score.badge_url());
} else if json_output {
println!("{}", score.to_json());
} else {
// Text output (Andon-style visual management)
let grade_symbol = match score.grade {
crate::quality::LetterGrade::A | crate::quality::LetterGrade::B => "\u{2713}",
crate::quality::LetterGrade::C => "\u{25CB}",
crate::quality::LetterGrade::D => "\u{25B3}",
crate::quality::LetterGrade::F => "\u{2717}",
};
println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
println!(
" Data Quality Score: {} {} ({:.1}%) ",
grade_symbol, score.grade, score.score
);
println!(" Profile: {} ", profile.name);
println!(" Decision: {} ", score.grade.publication_decision());
println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
println!();
println!("File: {}", path.display());
println!(
"Points: {:.1} / {:.1}",
score.points_earned, score.max_points
);
println!();
// Severity breakdown
println!("Severity Breakdown:");
for severity in [
Severity::Critical,
Severity::High,
Severity::Medium,
Severity::Low,
] {
if let Some(stats) = score.severity_breakdown.get(&severity) {
let status = if stats.failed == 0 {
"\u{2713}"
} else {
"\u{2717}"
};
println!(
" {} {:8}: {}/{} passed ({:.1}/{:.1} pts)",
status,
format!("{}", severity),
stats.passed,
stats.total,
stats.points_earned,
stats.max_points
);
}
}
println!();
// Critical failures get highlighted
let critical_failures = score.critical_failures();
if !critical_failures.is_empty() {
println!("CRITICAL FAILURES (blocks publication):");
for item in critical_failures {
println!(" \u{2717} #{}: {}", item.id, item.description);
if suggest {
if let Some(ref suggestion) = item.suggestion {
println!(" \u{2192} {}", suggestion);
}
}
}
println!();
}
// Show suggestions for all failed items if --suggest flag
if suggest {
let failed = score.failed_items();
let non_critical: Vec<_> = failed
.iter()
.filter(|i| i.severity != Severity::Critical)
.collect();
if !non_critical.is_empty() {
println!("Other Issues ({}):", non_critical.len());
for item in non_critical {
let sev = match item.severity {
Severity::High => "[HIGH]",
Severity::Medium => "[MED]",
Severity::Low => "[LOW]",
Severity::Critical => "[CRIT]",
};
println!(" {} #{}: {}", sev, item.id, item.description);
if let Some(ref suggestion) = item.suggestion {
println!(" \u{2192} {}", suggestion);
}
}
}
}
}
// Exit with non-zero code if critical failures (for CI/CD)
if score.has_critical_failures() {
std::process::exit(1);
}
Ok(())
}
/// List available quality profiles.
#[allow(clippy::unnecessary_wraps)]
pub(crate) fn cmd_quality_profiles() -> crate::Result<()> {
use crate::quality::QualityProfile;
println!("Available Quality Profiles");
println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
println!();
for name in QualityProfile::available_profiles() {
if let Some(profile) = QualityProfile::by_name(name) {
println!(" {} - {}", profile.name, profile.description);
if !profile.expected_constant_columns.is_empty() {
let cols: Vec<_> = profile.expected_constant_columns.iter().collect();
println!(" Expected constants: {:?}", cols);
}
if !profile.nullable_columns.is_empty() {
let cols: Vec<_> = profile.nullable_columns.iter().collect();
println!(" Nullable columns: {:?}", cols);
}
println!(" Max null ratio: {:.0}%", profile.max_null_ratio * 100.0);
println!(
" Max duplicate ratio: {:.0}%",
profile.max_duplicate_ratio * 100.0
);
println!();
}
}
println!("Usage: alimentar quality score <path> --profile <name>");
Ok(())
}
/// Build checklist items from `QualityReport`.
///
/// Maps `QualityReport` findings to the 100-point checklist defined in GH-6.
/// This wires the existing quality checks to the weighted scoring system.
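///
/// A minimal sketch of the intended flow, mirroring `cmd_quality_score`
/// (`ignore`d since this helper is crate-private):
///
/// ```ignore
/// let report = QualityChecker::new().check(&dataset)?;
/// let profile = QualityProfile::by_name("default").expect("default profile exists");
/// let checklist = build_checklist_from_report(&report, &profile);
/// let score = QualityScore::from_checklist(checklist);
/// ```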
#[allow(clippy::too_many_lines)]
pub(crate) fn build_checklist_from_report(
report: &crate::quality::QualityReport,
profile: &crate::quality::QualityProfile,
) -> Vec<crate::quality::ChecklistItem> {
use crate::quality::{ChecklistItem, Severity};
let mut items = Vec::new();
let mut id: u8 = 1;
// === Critical Checks (2.0x weight) ===
// Check 1: Dataset not empty
let has_rows = report.row_count > 0;
items.push(
ChecklistItem::new(id, "Dataset contains rows", Severity::Critical, has_rows)
.with_suggestion("Extract more doctests or check input source"),
);
id += 1;
// Check 2: No empty schema
let has_columns = report.column_count > 0;
items.push(
ChecklistItem::new(
id,
"Schema has columns defined",
Severity::Critical,
has_columns,
)
.with_suggestion("Verify parser is extracting fields correctly"),
);
id += 1;
// Check 3: No unexpected constant columns (would break training)
// Filter out columns that the profile expects to be constant (e.g., source,
// version) Also allow nullable columns to be all-null (constant null is OK
// for optional fields)
let unexpected_constant_cols: Vec<String> = report
.columns
.iter()
.filter(|(name, c): &(&String, &ColumnQuality)| {
c.is_constant() && !profile.is_expected_constant(name) && !profile.is_nullable(name)
})
.map(|(n, _)| n.clone())
.collect();
let no_unexpected_constants = unexpected_constant_cols.is_empty();
items.push(
ChecklistItem::new(
id,
"No unexpected constant columns (zero variance)",
Severity::Critical,
no_unexpected_constants,
)
.with_suggestion(format!(
"Remove or investigate constant columns: {:?}",
unexpected_constant_cols
)),
);
id += 1;
// === High Priority Checks (1.5x weight) ===
// Check 4: Duplicate ratio below threshold (default 5%)
let duplicate_ratio = report
.issues
.iter()
.find_map(|i| {
if let crate::quality::QualityIssue::DuplicateRows {
duplicate_ratio: dr,
..
} = i
{
Some(*dr)
} else {
None
}
})
.unwrap_or(0.0);
let low_duplicates = duplicate_ratio <= 0.05;
items.push(
ChecklistItem::new(
id,
format!(
"Duplicate ratio <= 5% (actual: {:.1}%)",
duplicate_ratio * 100.0
),
Severity::High,
low_duplicates,
)
.with_suggestion("Run deduplication: alimentar dedupe <file>"),
);
id += 1;
// Check 5: No columns with >50% nulls (except nullable columns per profile)
let high_null_cols: Vec<String> = report
.columns
.iter()
.filter(|(name, c): &(&String, &ColumnQuality)| {
c.null_ratio > 0.5 && !profile.is_nullable(name)
})
.map(|(n, _)| n.clone())
.collect();
let no_high_null = high_null_cols.is_empty();
items.push(
ChecklistItem::new(
id,
"No columns with >50% null values",
Severity::High,
no_high_null,
)
.with_suggestion(format!(
"Investigate high-null columns: {:?}",
high_null_cols
)),
);
id += 1;
// Check 6: Minimum row count (at least 100 for meaningful training)
let min_rows = report.row_count >= 100;
items.push(
ChecklistItem::new(
id,
format!("Minimum 100 rows (actual: {})", report.row_count),
Severity::High,
min_rows,
)
.with_suggestion("Extract more data or combine with other sources"),
);
id += 1;
// === Medium Priority Checks (1.0x weight) ===
// Check 7: Overall quality score from existing checker
let good_score = report.score >= 70.0;
items.push(
ChecklistItem::new(
id,
format!("Quality score >= 70% (actual: {:.1}%)", report.score),
Severity::Medium,
good_score,
)
.with_suggestion("Address issues reported by quality check"),
);
id += 1;
// Check 8: No columns with >10% nulls (stricter, except nullable columns per
// profile)
let moderate_null_cols: Vec<String> = report
.columns
.iter()
.filter(|(name, c): &(&String, &ColumnQuality)| {
c.null_ratio > 0.1 && c.null_ratio <= 0.5 && !profile.is_nullable(name)
})
.map(|(n, _)| n.clone())
.collect();
let low_null_ratio = moderate_null_cols.is_empty();
items.push(
ChecklistItem::new(
id,
"No columns with >10% null values",
Severity::Medium,
low_null_ratio,
)
.with_suggestion(format!("Consider imputation for: {:?}", moderate_null_cols)),
);
id += 1;
// Check 9: Reasonable column count (not too few for ML)
let enough_columns = report.column_count >= 2;
items.push(
ChecklistItem::new(
id,
format!("At least 2 columns (actual: {})", report.column_count),
Severity::Medium,
enough_columns,
)
.with_suggestion("Ensure input and target columns are present"),
);
id += 1;
// Check 10: No outlier issues detected
let outlier_issues: Vec<(String, f64)> = report
.issues
.iter()
.filter_map(|i| {
if let crate::quality::QualityIssue::OutliersDetected {
column,
outlier_ratio: or,
..
} = i
{
Some((column.clone(), *or))
} else {
None
}
})
.collect();
let no_severe_outliers = outlier_issues.iter().all(|(_, r)| *r < 0.1);
items.push(
ChecklistItem::new(
id,
"No columns with >10% outliers",
Severity::Medium,
no_severe_outliers,
)
.with_suggestion("Review outlier columns for data quality issues"),
);
id += 1;
// === Low Priority Checks (0.5x weight) ===
// Check 11: No warnings at all
let no_issues = report.issues.is_empty();
items.push(
ChecklistItem::new(id, "No quality warnings", Severity::Low, no_issues)
.with_suggestion("Address all warnings for best results"),
);
id += 1;
// Check 12: Good cardinality (unique values)
let low_cardinality_cols: Vec<String> = report
.columns
.iter()
.filter(|(_, c): &(&String, &ColumnQuality)| c.unique_count < 10 && !c.is_constant())
.map(|(n, _)| n.clone())
.collect();
let good_cardinality = low_cardinality_cols.is_empty();
items.push(
ChecklistItem::new(
id,
"All columns have reasonable cardinality (>10 unique)",
Severity::Low,
good_cardinality,
)
.with_suggestion(format!(
"Low cardinality columns: {:?}",
low_cardinality_cols
)),
);
let _ = id; // suppress warning
items
}
#[cfg(test)]
#[allow(
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::cast_precision_loss,
clippy::uninlined_format_args,
clippy::unwrap_used,
clippy::expect_used,
clippy::redundant_clone,
clippy::cast_lossless,
clippy::redundant_closure_for_method_calls,
clippy::too_many_lines,
clippy::float_cmp,
clippy::similar_names,
clippy::needless_late_init,
clippy::redundant_pattern_matching
)]
mod tests {
use std::sync::Arc;
use arrow::{
array::{Int32Array, StringArray},
datatypes::{DataType, Field, Schema},
};
use crate::ArrowDataset;
use super::*;
fn create_test_parquet(path: &PathBuf, rows: usize) {
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
let ids: Vec<i32> = (0..rows as i32).collect();
let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();
let batch = arrow::array::RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(StringArray::from(names)),
],
)
.ok()
.unwrap_or_else(|| panic!("Should create batch"));
let dataset = ArrowDataset::from_batch(batch)
.ok()
.unwrap_or_else(|| panic!("Should create dataset"));
dataset
.to_parquet(path)
.ok()
.unwrap_or_else(|| panic!("Should write parquet"));
}
#[test]
fn test_cmd_quality_check_text() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_check_json() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_check(&path, 0.1, 0.05, true, "json");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_check_no_outliers() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_check(&path, 0.1, 0.05, false, "text");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_report_basic() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_report(&path, None);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_report_to_file() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let data_path = temp_dir.path().join("data.parquet");
let output_path = temp_dir.path().join("quality.json");
create_test_parquet(&data_path, 100);
let result = cmd_quality_report(&data_path, Some(&output_path));
assert!(result.is_ok());
assert!(output_path.exists());
// Verify JSON is valid
let content = std::fs::read_to_string(&output_path)
.ok()
.unwrap_or_else(|| panic!("Should read file"));
let parsed: serde_json::Value = serde_json::from_str(&content)
.ok()
.unwrap_or_else(|| panic!("Should parse JSON"));
assert!(parsed.get("score").is_some());
assert!(parsed.get("has_issues").is_some());
}
#[test]
fn test_cmd_quality_check_with_constant_column() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("constant", DataType::Int32, false),
]));
let ids: Vec<i32> = (0..100).collect();
let constants: Vec<i32> = vec![42; 100];
let batch = arrow::array::RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(Int32Array::from(constants)),
],
)
.ok()
.unwrap_or_else(|| panic!("Should create batch"));
let dataset = ArrowDataset::from_batch(batch)
.ok()
.unwrap_or_else(|| panic!("Should create dataset"));
dataset
.to_parquet(&path)
.ok()
.unwrap_or_else(|| panic!("Should write parquet"));
let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_report_default_output() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 50);
let result = cmd_quality_report(&path, None);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_report_with_output() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
let output = temp_dir.path().join("report.json");
create_test_parquet(&path, 50);
let result = cmd_quality_report(&path, Some(&output));
assert!(result.is_ok());
assert!(output.exists());
}
#[test]
fn test_cmd_quality_score() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "default", false, false, false);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_with_json() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "default", false, true, false);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_with_badge() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "default", false, false, true);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_with_suggest() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "default", true, false, false);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_with_doctest_profile() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "doctest-corpus", false, false, false);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_profiles() {
let result = cmd_quality_profiles();
assert!(result.is_ok());
}
// === Additional quality CLI tests ===
#[test]
fn test_cmd_quality_check_with_high_null_threshold() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
// Use very high null threshold
let result = cmd_quality_check(&path, 0.9, 0.9, true, "text");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_check_small_dataset() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("small.parquet");
create_test_parquet(&path, 5);
let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_check_large_dataset() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("large.parquet");
create_test_parquet(&path, 500);
let result = cmd_quality_check(&path, 0.1, 0.05, false, "json");
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_ml_training_profile() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("ml.parquet");
create_test_parquet(&path, 150);
let result = cmd_quality_score(&path, "ml-training", false, false, false);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_score_invalid_profile() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
let result = cmd_quality_score(&path, "nonexistent-profile", false, false, false);
assert!(result.is_err());
}
#[test]
fn test_cmd_quality_score_all_output_modes() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 100);
// Test text output
let result = cmd_quality_score(&path, "default", false, false, false);
assert!(result.is_ok());
// Test JSON output
let result = cmd_quality_score(&path, "default", false, true, false);
assert!(result.is_ok());
// Test badge output
let result = cmd_quality_score(&path, "default", false, false, true);
assert!(result.is_ok());
}
#[test]
fn test_cmd_quality_report_to_stdout() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("data.parquet");
create_test_parquet(&path, 50);
// Output to stdout (None)
let result = cmd_quality_report(&path, None);
assert!(result.is_ok());
}
#[test]
fn test_build_checklist_from_report_empty_dataset() {
// Create empty-ish dataset scenario
use crate::quality::{QualityChecker, QualityProfile};
let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
let batch =
arrow::array::RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1]))])
.unwrap();
let dataset = ArrowDataset::from_batch(batch).unwrap();
let report = QualityChecker::new().check(&dataset).unwrap();
let profile = QualityProfile::by_name("default").unwrap();
let checklist = build_checklist_from_report(&report, &profile);
// Should have checklist items
assert!(!checklist.is_empty());
// Check that we have critical, high, medium, low items
let has_critical = checklist
.iter()
.any(|i| i.severity == crate::quality::Severity::Critical);
let has_high = checklist
.iter()
.any(|i| i.severity == crate::quality::Severity::High);
let has_medium = checklist
.iter()
.any(|i| i.severity == crate::quality::Severity::Medium);
let has_low = checklist
.iter()
.any(|i| i.severity == crate::quality::Severity::Low);
assert!(has_critical);
assert!(has_high);
assert!(has_medium);
assert!(has_low);
}
#[test]
fn test_build_checklist_high_quality_dataset() {
use crate::quality::{QualityChecker, QualityProfile};
// Create a high-quality dataset
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("name", DataType::Utf8, false),
]));
let ids: Vec<i32> = (0..200).collect();
let names: Vec<String> = ids.iter().map(|i| format!("name_{}", i)).collect();
let batch = arrow::array::RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(StringArray::from(names)),
],
)
.unwrap();
let dataset = ArrowDataset::from_batch(batch).unwrap();
let report = QualityChecker::new().check(&dataset).unwrap();
let profile = QualityProfile::by_name("default").unwrap();
let checklist = build_checklist_from_report(&report, &profile);
// Count passed items
let passed = checklist.iter().filter(|i| i.passed).count();
// A good dataset should have most checks passing
assert!(passed > checklist.len() / 2);
}
#[test]
fn test_cmd_quality_check_with_issues() {
let temp_dir = tempfile::tempdir()
.ok()
.unwrap_or_else(|| panic!("Should create temp dir"));
let path = temp_dir.path().join("issues.parquet");
// Create dataset with potential issues (constant column)
let schema = Arc::new(Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("constant", DataType::Int32, false),
]));
let ids: Vec<i32> = (0..50).collect();
let constants: Vec<i32> = vec![42; 50];
let batch = arrow::array::RecordBatch::try_new(
schema,
vec![
Arc::new(Int32Array::from(ids)),
Arc::new(Int32Array::from(constants)),
],
)
.unwrap();
let dataset = ArrowDataset::from_batch(batch).unwrap();
dataset.to_parquet(&path).unwrap();
// Should handle issues gracefully
let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
assert!(result.is_ok());
}
}