alimentar/cli/
quality.rs

1//! Quality checking CLI commands.
2
3use std::path::{Path, PathBuf};
4
5use clap::Subcommand;
6
7use super::basic::load_dataset;
8use crate::quality::{ColumnQuality, QualityChecker};
9
10/// Quality checking commands.
11#[derive(Subcommand)]
12pub enum QualityCommands {
13    /// Check data quality of a dataset
14    Check {
15        /// Path to dataset file
16        path: PathBuf,
17        /// Null ratio threshold (0.0 to 1.0)
18        #[arg(long, default_value = "0.1")]
19        null_threshold: f64,
20        /// Duplicate ratio threshold (0.0 to 1.0)
21        #[arg(long, default_value = "0.05")]
22        duplicate_threshold: f64,
23        /// Enable outlier detection
24        #[arg(long, default_value = "true")]
25        detect_outliers: bool,
26        /// Output format (text, json)
27        #[arg(short, long, default_value = "text")]
28        format: String,
29    },
30    /// Generate a quality report
31    Report {
32        /// Path to dataset file
33        path: PathBuf,
34        /// Output file for the report (JSON format)
35        #[arg(short, long)]
36        output: Option<PathBuf>,
37    },
38    /// Calculate 100-point quality score with letter grade (GH-6)
39    Score {
40        /// Path to dataset file
41        path: PathBuf,
42        /// Quality profile to use (default, doctest-corpus, ml-training,
43        /// time-series)
44        #[arg(short, long, default_value = "default")]
45        profile: String,
46        /// Show improvement suggestions for failed checks
47        #[arg(long)]
48        suggest: bool,
49        /// Output as JSON for CI/CD integration
50        #[arg(long)]
51        json: bool,
52        /// Output badge URL for shields.io
53        #[arg(long)]
54        badge: bool,
55    },
56    /// List available quality profiles
57    Profiles,
58}
59
60/// Check data quality of a dataset.
61pub(crate) fn cmd_quality_check(
62    path: &Path,
63    null_threshold: f64,
64    duplicate_threshold: f64,
65    detect_outliers: bool,
66    format: &str,
67) -> crate::Result<()> {
68    let dataset = load_dataset(path)?;
69    warn_duplicate_threshold(duplicate_threshold);
70    let checker = build_quality_checker(detect_outliers);
71    let report = checker.check(&dataset)?;
72
73    if format == "json" {
74        print_quality_json(&report, path, null_threshold)
75    } else {
76        print_quality_text(&report, path, null_threshold);
77        Ok(())
78    }
79}
80
/// Warn on stderr when a non-default duplicate threshold was requested.
///
/// The threshold is compared against the default of `0.05` with an
/// `f64::EPSILON` tolerance; nothing is printed when they match.
fn warn_duplicate_threshold(duplicate_threshold: f64) {
    let distance_from_default = (duplicate_threshold - 0.05_f64).abs();
    if distance_from_default <= f64::EPSILON {
        return;
    }
    eprintln!(
        "Warning: --duplicate-threshold {duplicate_threshold} is not yet implemented. Using default behavior."
    );
}
88
89fn build_quality_checker(detect_outliers: bool) -> QualityChecker {
90    let mut checker = QualityChecker::new();
91    if !detect_outliers {
92        checker = checker.with_outlier_check(false);
93    }
94    checker
95}
96
97fn print_quality_json(
98    report: &crate::quality::QualityReport,
99    path: &Path,
100    null_threshold: f64,
101) -> crate::Result<()> {
102    let json = serde_json::json!({
103        "path": path.display().to_string(),
104        "rows": report.row_count,
105        "columns": report.column_count,
106        "has_issues": !report.issues.is_empty(),
107        "score": report.score,
108        "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
109        "column_qualities": report.columns.iter().map(|(name, c)| {
110            serde_json::json!({
111                "column": name,
112                "null_ratio": c.null_ratio,
113                "unique_count": c.unique_count,
114                "is_constant": c.is_constant(),
115                "is_mostly_null": c.null_ratio > null_threshold,
116            })
117        }).collect::<Vec<_>>()
118    });
119    println!(
120        "{}",
121        serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?
122    );
123    Ok(())
124}
125
126fn print_quality_text(report: &crate::quality::QualityReport, path: &Path, null_threshold: f64) {
127    println!("Data Quality Report");
128    println!("===================");
129    println!("File: {}", path.display());
130    println!("Rows: {}", report.row_count);
131    println!("Columns: {}", report.column_count);
132    println!();
133
134    println!("Quality Score: {:.1}%", report.score);
135    println!();
136
137    print_quality_issues(report);
138    print_column_table(report, null_threshold);
139}
140
141fn print_quality_issues(report: &crate::quality::QualityReport) {
142    if report.issues.is_empty() {
143        println!("\u{2713} No quality issues found\n");
144    } else {
145        println!("Issues Found:");
146        println!("-------------");
147        for issue in &report.issues {
148            println!("  - {:?}", issue);
149        }
150        println!();
151    }
152}
153
154fn print_column_table(report: &crate::quality::QualityReport, null_threshold: f64) {
155    println!(
156        "{:<20} {:<12} {:<12} {:<10}",
157        "COLUMN", "NULL %", "UNIQUE", "STATUS"
158    );
159    println!("{}", "-".repeat(60));
160
161    for (name, col) in &report.columns {
162        println!(
163            "{:<20} {:<12.2} {:<12} {:<10}",
164            name,
165            col.null_ratio * 100.0,
166            col.unique_count,
167            column_status(col, null_threshold)
168        );
169    }
170}
171
172fn column_status(col: &ColumnQuality, null_threshold: f64) -> &'static str {
173    if col.is_constant() {
174        "CONSTANT"
175    } else if col.null_ratio > null_threshold {
176        "HIGH NULL"
177    } else {
178        "OK"
179    }
180}
181
182/// Generate a quality report.
183pub(crate) fn cmd_quality_report(path: &Path, output: Option<&Path>) -> crate::Result<()> {
184    let dataset = load_dataset(path)?;
185    let report = QualityChecker::new().check(&dataset)?;
186
187    let json = serde_json::json!({
188        "path": path.display().to_string(),
189        "rows": report.row_count,
190        "columns": report.column_count,
191        "has_issues": !report.issues.is_empty(),
192        "score": report.score,
193        "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
194        "column_qualities": report.columns.iter().map(|(name, c)| {
195            serde_json::json!({
196                "column": name,
197                "null_ratio": c.null_ratio,
198                "unique_count": c.unique_count,
199                "is_constant": c.is_constant(),
200            })
201        }).collect::<Vec<_>>()
202    });
203
204    let json_str =
205        serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?;
206
207    if let Some(output_path) = output {
208        std::fs::write(output_path, &json_str).map_err(|e| crate::Error::io(e, output_path))?;
209        println!("Quality report written to: {}", output_path.display());
210    } else {
211        println!("{}", json_str);
212    }
213
214    Ok(())
215}
216
217/// Calculate 100-point quality score with letter grade (GH-6).
218///
219/// Implements the Doctest Corpus QA Checklist for Publication with
220/// weighted scoring per Toyota Way Jidoka principles.
221#[allow(clippy::too_many_lines)]
222pub(crate) fn cmd_quality_score(
223    path: &Path,
224    profile_name: &str,
225    suggest: bool,
226    json_output: bool,
227    badge_output: bool,
228) -> crate::Result<()> {
229    use crate::quality::{QualityProfile, QualityScore};
230
231    // Load the quality profile
232    let profile = QualityProfile::by_name(profile_name).ok_or_else(|| {
233        crate::Error::Format(format!(
234            "Unknown quality profile '{}'. Available: {:?}",
235            profile_name,
236            QualityProfile::available_profiles()
237        ))
238    })?;
239
240    let dataset = load_dataset(path)?;
241    let report = QualityChecker::new().check(&dataset)?;
242
243    // Wire QualityReport to ChecklistItems per the 100-point checklist
244    let checklist = build_checklist_from_report(&report, &profile);
245    let score = QualityScore::from_checklist(checklist);
246
247    // Output based on flags
248    if badge_output {
249        println!("{}", score.badge_url());
250    } else if json_output {
251        println!("{}", score.to_json());
252    } else {
253        print_text_report(&score, &profile, path, suggest);
254    }
255
256    // Exit with non-zero code if critical failures (for CI/CD)
257    if score.has_critical_failures() {
258        std::process::exit(1);
259    }
260
261    Ok(())
262}
263
264/// Print the text quality report (Andon-style visual management).
265fn print_text_report(
266    score: &crate::quality::QualityScore,
267    profile: &crate::quality::QualityProfile,
268    path: &Path,
269    suggest: bool,
270) {
271    let grade_symbol = match score.grade {
272        crate::quality::LetterGrade::A | crate::quality::LetterGrade::B => "\u{2713}",
273        crate::quality::LetterGrade::C => "\u{25CB}",
274        crate::quality::LetterGrade::D => "\u{25B3}",
275        crate::quality::LetterGrade::F => "\u{2717}",
276    };
277
278    let separator = "\u{2550}".repeat(63);
279    println!("{separator}");
280    println!(
281        "  Data Quality Score: {} {} ({:.1}%)  ",
282        grade_symbol, score.grade, score.score
283    );
284    println!("  Profile: {}  ", profile.name);
285    println!("  Decision: {}  ", score.grade.publication_decision());
286    println!("{separator}");
287    println!();
288    println!("File: {}", path.display());
289    println!(
290        "Points: {:.1} / {:.1}",
291        score.points_earned, score.max_points
292    );
293    println!();
294
295    print_severity_breakdown(score);
296    print_critical_failures(score, suggest);
297
298    if suggest {
299        print_other_issues(score);
300    }
301}
302
303/// Print the severity breakdown section.
304fn print_severity_breakdown(score: &crate::quality::QualityScore) {
305    use crate::quality::Severity;
306
307    println!("Severity Breakdown:");
308    for severity in [
309        Severity::Critical,
310        Severity::High,
311        Severity::Medium,
312        Severity::Low,
313    ] {
314        if let Some(stats) = score.severity_breakdown.get(&severity) {
315            let status = if stats.failed == 0 {
316                "\u{2713}"
317            } else {
318                "\u{2717}"
319            };
320            println!(
321                "  {} {:8}: {}/{} passed ({:.1}/{:.1} pts)",
322                status,
323                format!("{}", severity),
324                stats.passed,
325                stats.total,
326                stats.points_earned,
327                stats.max_points
328            );
329        }
330    }
331    println!();
332}
333
334/// Print critical failures section.
335fn print_critical_failures(score: &crate::quality::QualityScore, suggest: bool) {
336    let critical_failures = score.critical_failures();
337    if !critical_failures.is_empty() {
338        println!("CRITICAL FAILURES (blocks publication):");
339        for item in critical_failures {
340            println!("  \u{2717} #{}: {}", item.id, item.description);
341            if suggest {
342                if let Some(ref suggestion) = item.suggestion {
343                    println!("    \u{2192} {}", suggestion);
344                }
345            }
346        }
347        println!();
348    }
349}
350
351/// Print non-critical issues with suggestions.
352fn print_other_issues(score: &crate::quality::QualityScore) {
353    use crate::quality::Severity;
354
355    let failed = score.failed_items();
356    let non_critical: Vec<_> = failed
357        .iter()
358        .filter(|i| i.severity != Severity::Critical)
359        .collect();
360
361    if !non_critical.is_empty() {
362        println!("Other Issues ({}):", non_critical.len());
363        for item in non_critical {
364            let sev = match item.severity {
365                Severity::High => "[HIGH]",
366                Severity::Medium => "[MED]",
367                Severity::Low => "[LOW]",
368                Severity::Critical => "[CRIT]",
369            };
370            println!("  {} #{}: {}", sev, item.id, item.description);
371            if let Some(ref suggestion) = item.suggestion {
372                println!("      \u{2192} {}", suggestion);
373            }
374        }
375    }
376}
377
378/// List available quality profiles.
379#[allow(clippy::unnecessary_wraps)]
380pub(crate) fn cmd_quality_profiles() -> crate::Result<()> {
381    use crate::quality::QualityProfile;
382
383    println!("Available Quality Profiles");
384    println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
385    println!();
386
387    for name in QualityProfile::available_profiles() {
388        if let Some(profile) = QualityProfile::by_name(name) {
389            println!("  {} - {}", profile.name, profile.description);
390            if !profile.expected_constant_columns.is_empty() {
391                let cols: Vec<_> = profile.expected_constant_columns.iter().collect();
392                println!("    Expected constants: {:?}", cols);
393            }
394            if !profile.nullable_columns.is_empty() {
395                let cols: Vec<_> = profile.nullable_columns.iter().collect();
396                println!("    Nullable columns: {:?}", cols);
397            }
398            println!("    Max null ratio: {:.0}%", profile.max_null_ratio * 100.0);
399            println!(
400                "    Max duplicate ratio: {:.0}%",
401                profile.max_duplicate_ratio * 100.0
402            );
403            println!();
404        }
405    }
406
407    println!("Usage: alimentar quality score <path> --profile <name>");
408    Ok(())
409}
410
411/// Build checklist items from `QualityReport`.
412///
413/// Maps `QualityReport` findings to the 100-point checklist defined in GH-6.
414/// This wires the existing quality checks to the weighted scoring system.
415#[allow(clippy::too_many_lines)]
416pub(crate) fn build_checklist_from_report(
417    report: &crate::quality::QualityReport,
418    profile: &crate::quality::QualityProfile,
419) -> Vec<crate::quality::ChecklistItem> {
420    use crate::quality::{ChecklistItem, Severity};
421
422    let mut items = Vec::new();
423    let mut id: u8 = 1;
424
425    // === Critical Checks (2.0x weight) ===
426
427    // Check 1: Dataset not empty
428    let has_rows = report.row_count > 0;
429    items.push(
430        ChecklistItem::new(id, "Dataset contains rows", Severity::Critical, has_rows)
431            .with_suggestion("Extract more doctests or check input source"),
432    );
433    id += 1;
434
435    // Check 2: No empty schema
436    let has_columns = report.column_count > 0;
437    items.push(
438        ChecklistItem::new(
439            id,
440            "Schema has columns defined",
441            Severity::Critical,
442            has_columns,
443        )
444        .with_suggestion("Verify parser is extracting fields correctly"),
445    );
446    id += 1;
447
448    // Check 3: No unexpected constant columns (would break training)
449    // Filter out columns that the profile expects to be constant (e.g., source,
450    // version) Also allow nullable columns to be all-null (constant null is OK
451    // for optional fields)
452    let unexpected_constant_cols: Vec<String> = report
453        .columns
454        .iter()
455        .filter(|(name, c): &(&String, &ColumnQuality)| {
456            c.is_constant() && !profile.is_expected_constant(name) && !profile.is_nullable(name)
457        })
458        .map(|(n, _)| n.clone())
459        .collect();
460    let no_unexpected_constants = unexpected_constant_cols.is_empty();
461    items.push(
462        ChecklistItem::new(
463            id,
464            "No unexpected constant columns (zero variance)",
465            Severity::Critical,
466            no_unexpected_constants,
467        )
468        .with_suggestion(format!(
469            "Remove or investigate constant columns: {:?}",
470            unexpected_constant_cols
471        )),
472    );
473    id += 1;
474
475    // === High Priority Checks (1.5x weight) ===
476
477    // Check 4: Duplicate ratio below threshold (default 5%)
478    let duplicate_ratio = report
479        .issues
480        .iter()
481        .find_map(|i| {
482            if let crate::quality::QualityIssue::DuplicateRows {
483                duplicate_ratio: dr,
484                ..
485            } = i
486            {
487                Some(*dr)
488            } else {
489                None
490            }
491        })
492        .unwrap_or(0.0);
493    let low_duplicates = duplicate_ratio <= 0.05;
494    items.push(
495        ChecklistItem::new(
496            id,
497            format!(
498                "Duplicate ratio <= 5% (actual: {:.1}%)",
499                duplicate_ratio * 100.0
500            ),
501            Severity::High,
502            low_duplicates,
503        )
504        .with_suggestion("Run deduplication: alimentar dedupe <file>"),
505    );
506    id += 1;
507
508    // Check 5: No columns with >50% nulls (except nullable columns per profile)
509    let high_null_cols: Vec<String> = report
510        .columns
511        .iter()
512        .filter(|(name, c): &(&String, &ColumnQuality)| {
513            c.null_ratio > 0.5 && !profile.is_nullable(name)
514        })
515        .map(|(n, _)| n.clone())
516        .collect();
517    let no_high_null = high_null_cols.is_empty();
518    items.push(
519        ChecklistItem::new(
520            id,
521            "No columns with >50% null values",
522            Severity::High,
523            no_high_null,
524        )
525        .with_suggestion(format!(
526            "Investigate high-null columns: {:?}",
527            high_null_cols
528        )),
529    );
530    id += 1;
531
532    // Check 6: Minimum row count (at least 100 for meaningful training)
533    let min_rows = report.row_count >= 100;
534    items.push(
535        ChecklistItem::new(
536            id,
537            format!("Minimum 100 rows (actual: {})", report.row_count),
538            Severity::High,
539            min_rows,
540        )
541        .with_suggestion("Extract more data or combine with other sources"),
542    );
543    id += 1;
544
545    // === Medium Priority Checks (1.0x weight) ===
546
547    // Check 7: Overall quality score from existing checker
548    let good_score = report.score >= 70.0;
549    items.push(
550        ChecklistItem::new(
551            id,
552            format!("Quality score >= 70% (actual: {:.1}%)", report.score),
553            Severity::Medium,
554            good_score,
555        )
556        .with_suggestion("Address issues reported by quality check"),
557    );
558    id += 1;
559
560    // Check 8: No columns with >10% nulls (stricter, except nullable columns per
561    // profile)
562    let moderate_null_cols: Vec<String> = report
563        .columns
564        .iter()
565        .filter(|(name, c): &(&String, &ColumnQuality)| {
566            c.null_ratio > 0.1 && c.null_ratio <= 0.5 && !profile.is_nullable(name)
567        })
568        .map(|(n, _)| n.clone())
569        .collect();
570    let low_null_ratio = moderate_null_cols.is_empty();
571    items.push(
572        ChecklistItem::new(
573            id,
574            "No columns with >10% null values",
575            Severity::Medium,
576            low_null_ratio,
577        )
578        .with_suggestion(format!("Consider imputation for: {:?}", moderate_null_cols)),
579    );
580    id += 1;
581
582    // Check 9: Reasonable column count (not too few for ML)
583    let enough_columns = report.column_count >= 2;
584    items.push(
585        ChecklistItem::new(
586            id,
587            format!("At least 2 columns (actual: {})", report.column_count),
588            Severity::Medium,
589            enough_columns,
590        )
591        .with_suggestion("Ensure input and target columns are present"),
592    );
593    id += 1;
594
595    // Check 10: No outlier issues detected
596    let outlier_issues: Vec<(String, f64)> = report
597        .issues
598        .iter()
599        .filter_map(|i| {
600            if let crate::quality::QualityIssue::OutliersDetected {
601                column,
602                outlier_ratio: or,
603                ..
604            } = i
605            {
606                Some((column.clone(), *or))
607            } else {
608                None
609            }
610        })
611        .collect();
612    let no_severe_outliers = outlier_issues.iter().all(|(_, r)| *r < 0.1);
613    items.push(
614        ChecklistItem::new(
615            id,
616            "No columns with >10% outliers",
617            Severity::Medium,
618            no_severe_outliers,
619        )
620        .with_suggestion("Review outlier columns for data quality issues"),
621    );
622    id += 1;
623
624    // === Low Priority Checks (0.5x weight) ===
625
626    // Check 11: No warnings at all
627    let no_issues = report.issues.is_empty();
628    items.push(
629        ChecklistItem::new(id, "No quality warnings", Severity::Low, no_issues)
630            .with_suggestion("Address all warnings for best results"),
631    );
632    id += 1;
633
634    // Check 12: Good cardinality (unique values)
635    let low_cardinality_cols: Vec<String> = report
636        .columns
637        .iter()
638        .filter(|(_, c): &(&String, &ColumnQuality)| c.unique_count < 10 && !c.is_constant())
639        .map(|(n, _)| n.clone())
640        .collect();
641    let good_cardinality = low_cardinality_cols.is_empty();
642    items.push(
643        ChecklistItem::new(
644            id,
645            "All columns have reasonable cardinality (>10 unique)",
646            Severity::Low,
647            good_cardinality,
648        )
649        .with_suggestion(format!(
650            "Low cardinality columns: {:?}",
651            low_cardinality_cols
652        )),
653    );
654    let _ = id; // suppress warning
655
656    items
657}
658
659#[cfg(test)]
660#[allow(
661    clippy::cast_possible_truncation,
662    clippy::cast_possible_wrap,
663    clippy::cast_precision_loss,
664    clippy::uninlined_format_args,
665    clippy::unwrap_used,
666    clippy::expect_used,
667    clippy::redundant_clone,
668    clippy::cast_lossless,
669    clippy::redundant_closure_for_method_calls,
670    clippy::too_many_lines,
671    clippy::float_cmp,
672    clippy::similar_names,
673    clippy::needless_late_init,
674    clippy::redundant_pattern_matching
675)]
676mod tests {
677    use std::sync::Arc;
678
679    use arrow::{
680        array::{Int32Array, StringArray},
681        datatypes::{DataType, Field, Schema},
682    };
683
684    use super::*;
685    use crate::ArrowDataset;
686
687    fn create_test_parquet(path: &PathBuf, rows: usize) {
688        let schema = Arc::new(Schema::new(vec![
689            Field::new("id", DataType::Int32, false),
690            Field::new("name", DataType::Utf8, false),
691        ]));
692
693        let ids: Vec<i32> = (0..rows as i32).collect();
694        let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();
695
696        let batch = arrow::array::RecordBatch::try_new(
697            schema,
698            vec![
699                Arc::new(Int32Array::from(ids)),
700                Arc::new(StringArray::from(names)),
701            ],
702        )
703        .ok()
704        .unwrap_or_else(|| panic!("Should create batch"));
705
706        let dataset = ArrowDataset::from_batch(batch)
707            .ok()
708            .unwrap_or_else(|| panic!("Should create dataset"));
709
710        dataset
711            .to_parquet(path)
712            .ok()
713            .unwrap_or_else(|| panic!("Should write parquet"));
714    }
715
716    #[test]
717    fn test_cmd_quality_check_text() {
718        let temp_dir = tempfile::tempdir()
719            .ok()
720            .unwrap_or_else(|| panic!("Should create temp dir"));
721        let path = temp_dir.path().join("data.parquet");
722        create_test_parquet(&path, 100);
723
724        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
725        assert!(result.is_ok());
726    }
727
728    #[test]
729    fn test_cmd_quality_check_json() {
730        let temp_dir = tempfile::tempdir()
731            .ok()
732            .unwrap_or_else(|| panic!("Should create temp dir"));
733        let path = temp_dir.path().join("data.parquet");
734        create_test_parquet(&path, 100);
735
736        let result = cmd_quality_check(&path, 0.1, 0.05, true, "json");
737        assert!(result.is_ok());
738    }
739
740    #[test]
741    fn test_cmd_quality_check_no_outliers() {
742        let temp_dir = tempfile::tempdir()
743            .ok()
744            .unwrap_or_else(|| panic!("Should create temp dir"));
745        let path = temp_dir.path().join("data.parquet");
746        create_test_parquet(&path, 100);
747
748        let result = cmd_quality_check(&path, 0.1, 0.05, false, "text");
749        assert!(result.is_ok());
750    }
751
752    #[test]
753    fn test_cmd_quality_report_basic() {
754        let temp_dir = tempfile::tempdir()
755            .ok()
756            .unwrap_or_else(|| panic!("Should create temp dir"));
757        let path = temp_dir.path().join("data.parquet");
758        create_test_parquet(&path, 100);
759
760        let result = cmd_quality_report(&path, None);
761        assert!(result.is_ok());
762    }
763
764    #[test]
765    fn test_cmd_quality_report_to_file() {
766        let temp_dir = tempfile::tempdir()
767            .ok()
768            .unwrap_or_else(|| panic!("Should create temp dir"));
769        let data_path = temp_dir.path().join("data.parquet");
770        let output_path = temp_dir.path().join("quality.json");
771        create_test_parquet(&data_path, 100);
772
773        let result = cmd_quality_report(&data_path, Some(&output_path));
774        assert!(result.is_ok());
775        assert!(output_path.exists());
776
777        // Verify JSON is valid
778        let content = std::fs::read_to_string(&output_path)
779            .ok()
780            .unwrap_or_else(|| panic!("Should read file"));
781        let parsed: serde_json::Value = serde_json::from_str(&content)
782            .ok()
783            .unwrap_or_else(|| panic!("Should parse JSON"));
784        assert!(parsed.get("score").is_some());
785        assert!(parsed.get("has_issues").is_some());
786    }
787
788    #[test]
789    fn test_cmd_quality_check_with_constant_column() {
790        let temp_dir = tempfile::tempdir()
791            .ok()
792            .unwrap_or_else(|| panic!("Should create temp dir"));
793        let path = temp_dir.path().join("data.parquet");
794
795        let schema = Arc::new(Schema::new(vec![
796            Field::new("id", DataType::Int32, false),
797            Field::new("constant", DataType::Int32, false),
798        ]));
799
800        let ids: Vec<i32> = (0..100).collect();
801        let constants: Vec<i32> = vec![42; 100];
802
803        let batch = arrow::array::RecordBatch::try_new(
804            schema,
805            vec![
806                Arc::new(Int32Array::from(ids)),
807                Arc::new(Int32Array::from(constants)),
808            ],
809        )
810        .ok()
811        .unwrap_or_else(|| panic!("Should create batch"));
812
813        let dataset = ArrowDataset::from_batch(batch)
814            .ok()
815            .unwrap_or_else(|| panic!("Should create dataset"));
816
817        dataset
818            .to_parquet(&path)
819            .ok()
820            .unwrap_or_else(|| panic!("Should write parquet"));
821
822        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
823        assert!(result.is_ok());
824    }
825
826    #[test]
827    fn test_cmd_quality_report_default_output() {
828        let temp_dir = tempfile::tempdir()
829            .ok()
830            .unwrap_or_else(|| panic!("Should create temp dir"));
831        let path = temp_dir.path().join("data.parquet");
832        create_test_parquet(&path, 50);
833
834        let result = cmd_quality_report(&path, None);
835        assert!(result.is_ok());
836    }
837
838    #[test]
839    fn test_cmd_quality_report_with_output() {
840        let temp_dir = tempfile::tempdir()
841            .ok()
842            .unwrap_or_else(|| panic!("Should create temp dir"));
843        let path = temp_dir.path().join("data.parquet");
844        let output = temp_dir.path().join("report.html");
845        create_test_parquet(&path, 50);
846
847        let result = cmd_quality_report(&path, Some(&output));
848        assert!(result.is_ok());
849        assert!(output.exists());
850    }
851
852    #[test]
853    fn test_cmd_quality_score() {
854        let temp_dir = tempfile::tempdir()
855            .ok()
856            .unwrap_or_else(|| panic!("Should create temp dir"));
857        let path = temp_dir.path().join("data.parquet");
858        create_test_parquet(&path, 100);
859
860        let result = cmd_quality_score(&path, "default", false, false, false);
861        assert!(result.is_ok());
862    }
863
864    #[test]
865    fn test_cmd_quality_score_with_json() {
866        let temp_dir = tempfile::tempdir()
867            .ok()
868            .unwrap_or_else(|| panic!("Should create temp dir"));
869        let path = temp_dir.path().join("data.parquet");
870        create_test_parquet(&path, 100);
871
872        let result = cmd_quality_score(&path, "default", false, true, false);
873        assert!(result.is_ok());
874    }
875
876    #[test]
877    fn test_cmd_quality_score_with_badge() {
878        let temp_dir = tempfile::tempdir()
879            .ok()
880            .unwrap_or_else(|| panic!("Should create temp dir"));
881        let path = temp_dir.path().join("data.parquet");
882        create_test_parquet(&path, 100);
883
884        let result = cmd_quality_score(&path, "default", false, false, true);
885        assert!(result.is_ok());
886    }
887
888    #[test]
889    fn test_cmd_quality_score_with_suggest() {
890        let temp_dir = tempfile::tempdir()
891            .ok()
892            .unwrap_or_else(|| panic!("Should create temp dir"));
893        let path = temp_dir.path().join("data.parquet");
894        create_test_parquet(&path, 100);
895
896        let result = cmd_quality_score(&path, "default", true, false, false);
897        assert!(result.is_ok());
898    }
899
900    #[test]
901    fn test_cmd_quality_score_with_doctest_profile() {
902        let temp_dir = tempfile::tempdir()
903            .ok()
904            .unwrap_or_else(|| panic!("Should create temp dir"));
905        let path = temp_dir.path().join("data.parquet");
906        create_test_parquet(&path, 100);
907
908        let result = cmd_quality_score(&path, "doctest-corpus", false, false, false);
909        assert!(result.is_ok());
910    }
911
912    #[test]
913    fn test_cmd_quality_profiles() {
914        let result = cmd_quality_profiles();
915        assert!(result.is_ok());
916    }
917
918    // === Additional quality CLI tests ===
919
920    #[test]
921    fn test_cmd_quality_check_with_high_null_threshold() {
922        let temp_dir = tempfile::tempdir()
923            .ok()
924            .unwrap_or_else(|| panic!("Should create temp dir"));
925        let path = temp_dir.path().join("data.parquet");
926        create_test_parquet(&path, 100);
927
928        // Use very high null threshold
929        let result = cmd_quality_check(&path, 0.9, 0.9, true, "text");
930        assert!(result.is_ok());
931    }
932
933    #[test]
934    fn test_cmd_quality_check_small_dataset() {
935        let temp_dir = tempfile::tempdir()
936            .ok()
937            .unwrap_or_else(|| panic!("Should create temp dir"));
938        let path = temp_dir.path().join("small.parquet");
939        create_test_parquet(&path, 5);
940
941        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
942        assert!(result.is_ok());
943    }
944
945    #[test]
946    fn test_cmd_quality_check_large_dataset() {
947        let temp_dir = tempfile::tempdir()
948            .ok()
949            .unwrap_or_else(|| panic!("Should create temp dir"));
950        let path = temp_dir.path().join("large.parquet");
951        create_test_parquet(&path, 500);
952
953        let result = cmd_quality_check(&path, 0.1, 0.05, false, "json");
954        assert!(result.is_ok());
955    }
956
957    #[test]
958    fn test_cmd_quality_score_ml_training_profile() {
959        let temp_dir = tempfile::tempdir()
960            .ok()
961            .unwrap_or_else(|| panic!("Should create temp dir"));
962        let path = temp_dir.path().join("ml.parquet");
963        create_test_parquet(&path, 150);
964
965        let result = cmd_quality_score(&path, "ml-training", false, false, false);
966        assert!(result.is_ok());
967    }
968
969    #[test]
970    fn test_cmd_quality_score_invalid_profile() {
971        let temp_dir = tempfile::tempdir()
972            .ok()
973            .unwrap_or_else(|| panic!("Should create temp dir"));
974        let path = temp_dir.path().join("data.parquet");
975        create_test_parquet(&path, 100);
976
977        let result = cmd_quality_score(&path, "nonexistent-profile", false, false, false);
978        assert!(result.is_err());
979    }
980
981    #[test]
982    fn test_cmd_quality_score_all_output_modes() {
983        let temp_dir = tempfile::tempdir()
984            .ok()
985            .unwrap_or_else(|| panic!("Should create temp dir"));
986        let path = temp_dir.path().join("data.parquet");
987        create_test_parquet(&path, 100);
988
989        // Test text output
990        let result = cmd_quality_score(&path, "default", false, false, false);
991        assert!(result.is_ok());
992
993        // Test JSON output
994        let result = cmd_quality_score(&path, "default", false, true, false);
995        assert!(result.is_ok());
996
997        // Test badge output
998        let result = cmd_quality_score(&path, "default", false, false, true);
999        assert!(result.is_ok());
1000    }
1001
1002    #[test]
1003    fn test_cmd_quality_report_to_stdout() {
1004        let temp_dir = tempfile::tempdir()
1005            .ok()
1006            .unwrap_or_else(|| panic!("Should create temp dir"));
1007        let path = temp_dir.path().join("data.parquet");
1008        create_test_parquet(&path, 50);
1009
1010        // Output to stdout (None)
1011        let result = cmd_quality_report(&path, None);
1012        assert!(result.is_ok());
1013    }
1014
1015    #[test]
1016    fn test_build_checklist_from_report_empty_dataset() {
1017        // Create empty-ish dataset scenario
1018        use crate::quality::{QualityChecker, QualityProfile};
1019
1020        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
1021
1022        let batch =
1023            arrow::array::RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1]))])
1024                .unwrap();
1025
1026        let dataset = ArrowDataset::from_batch(batch).unwrap();
1027        let report = QualityChecker::new().check(&dataset).unwrap();
1028        let profile = QualityProfile::by_name("default").unwrap();
1029
1030        let checklist = build_checklist_from_report(&report, &profile);
1031
1032        // Should have checklist items
1033        assert!(!checklist.is_empty());
1034
1035        // Check that we have critical, high, medium, low items
1036        let has_critical = checklist
1037            .iter()
1038            .any(|i| i.severity == crate::quality::Severity::Critical);
1039        let has_high = checklist
1040            .iter()
1041            .any(|i| i.severity == crate::quality::Severity::High);
1042        let has_medium = checklist
1043            .iter()
1044            .any(|i| i.severity == crate::quality::Severity::Medium);
1045        let has_low = checklist
1046            .iter()
1047            .any(|i| i.severity == crate::quality::Severity::Low);
1048
1049        assert!(has_critical);
1050        assert!(has_high);
1051        assert!(has_medium);
1052        assert!(has_low);
1053    }
1054
1055    #[test]
1056    fn test_build_checklist_high_quality_dataset() {
1057        use crate::quality::{QualityChecker, QualityProfile};
1058
1059        // Create a high-quality dataset
1060        let schema = Arc::new(Schema::new(vec![
1061            Field::new("id", DataType::Int32, false),
1062            Field::new("name", DataType::Utf8, false),
1063        ]));
1064
1065        let ids: Vec<i32> = (0..200).collect();
1066        let names: Vec<String> = ids.iter().map(|i| format!("name_{}", i)).collect();
1067
1068        let batch = arrow::array::RecordBatch::try_new(
1069            schema,
1070            vec![
1071                Arc::new(Int32Array::from(ids)),
1072                Arc::new(StringArray::from(names)),
1073            ],
1074        )
1075        .unwrap();
1076
1077        let dataset = ArrowDataset::from_batch(batch).unwrap();
1078        let report = QualityChecker::new().check(&dataset).unwrap();
1079        let profile = QualityProfile::by_name("default").unwrap();
1080
1081        let checklist = build_checklist_from_report(&report, &profile);
1082
1083        // Count passed items
1084        let passed = checklist.iter().filter(|i| i.passed).count();
1085        // A good dataset should have most checks passing
1086        assert!(passed > checklist.len() / 2);
1087    }
1088
1089    #[test]
1090    fn test_cmd_quality_check_with_issues() {
1091        let temp_dir = tempfile::tempdir()
1092            .ok()
1093            .unwrap_or_else(|| panic!("Should create temp dir"));
1094        let path = temp_dir.path().join("issues.parquet");
1095
1096        // Create dataset with potential issues (constant column)
1097        let schema = Arc::new(Schema::new(vec![
1098            Field::new("id", DataType::Int32, false),
1099            Field::new("constant", DataType::Int32, false),
1100        ]));
1101
1102        let ids: Vec<i32> = (0..50).collect();
1103        let constants: Vec<i32> = vec![42; 50];
1104
1105        let batch = arrow::array::RecordBatch::try_new(
1106            schema,
1107            vec![
1108                Arc::new(Int32Array::from(ids)),
1109                Arc::new(Int32Array::from(constants)),
1110            ],
1111        )
1112        .unwrap();
1113
1114        let dataset = ArrowDataset::from_batch(batch).unwrap();
1115        dataset.to_parquet(&path).unwrap();
1116
1117        // Should handle issues gracefully
1118        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
1119        assert!(result.is_ok());
1120    }
1121}
alimentar/cli/quality.rs

alimentar/cli/
quality.rs