alimentar/cli/
quality.rs

1//! Quality checking CLI commands.
2
3use std::path::{Path, PathBuf};
4
5use clap::Subcommand;
6
7use super::basic::load_dataset;
8use crate::quality::{ColumnQuality, QualityChecker};
9
10/// Quality checking commands.
11#[derive(Subcommand)]
12pub enum QualityCommands {
13    /// Check data quality of a dataset
14    Check {
15        /// Path to dataset file
16        path: PathBuf,
17        /// Null ratio threshold (0.0 to 1.0)
18        #[arg(long, default_value = "0.1")]
19        null_threshold: f64,
20        /// Duplicate ratio threshold (0.0 to 1.0)
21        #[arg(long, default_value = "0.05")]
22        duplicate_threshold: f64,
23        /// Enable outlier detection
24        #[arg(long, default_value = "true")]
25        detect_outliers: bool,
26        /// Output format (text, json)
27        #[arg(short, long, default_value = "text")]
28        format: String,
29    },
30    /// Generate a quality report
31    Report {
32        /// Path to dataset file
33        path: PathBuf,
34        /// Output file for the report (JSON format)
35        #[arg(short, long)]
36        output: Option<PathBuf>,
37    },
38    /// Calculate 100-point quality score with letter grade (GH-6)
39    Score {
40        /// Path to dataset file
41        path: PathBuf,
42        /// Quality profile to use (default, doctest-corpus, ml-training,
43        /// time-series)
44        #[arg(short, long, default_value = "default")]
45        profile: String,
46        /// Show improvement suggestions for failed checks
47        #[arg(long)]
48        suggest: bool,
49        /// Output as JSON for CI/CD integration
50        #[arg(long)]
51        json: bool,
52        /// Output badge URL for shields.io
53        #[arg(long)]
54        badge: bool,
55    },
56    /// List available quality profiles
57    Profiles,
58}
59
60/// Check data quality of a dataset.
61pub(crate) fn cmd_quality_check(
62    path: &Path,
63    null_threshold: f64,
64    duplicate_threshold: f64,
65    detect_outliers: bool,
66    format: &str,
67) -> crate::Result<()> {
68    let dataset = load_dataset(path)?;
69
70    // GH-38: --duplicate-threshold is not yet used by QualityChecker
71    if (duplicate_threshold - 0.05_f64).abs() > f64::EPSILON {
72        eprintln!(
73            "Warning: --duplicate-threshold {duplicate_threshold} is not yet implemented. Using default behavior."
74        );
75    }
76    let _ = duplicate_threshold;
77    let mut checker = QualityChecker::new();
78
79    if !detect_outliers {
80        checker = checker.with_outlier_check(false);
81    }
82
83    let report = checker.check(&dataset)?;
84
85    if format == "json" {
86        let json = serde_json::json!({
87            "path": path.display().to_string(),
88            "rows": report.row_count,
89            "columns": report.column_count,
90            "has_issues": !report.issues.is_empty(),
91            "score": report.score,
92            "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
93            "column_qualities": report.columns.iter().map(|(name, c)| {
94                serde_json::json!({
95                    "column": name,
96                    "null_ratio": c.null_ratio,
97                    "unique_count": c.unique_count,
98                    "is_constant": c.is_constant(),
99                    "is_mostly_null": c.null_ratio > null_threshold,
100                })
101            }).collect::<Vec<_>>()
102        });
103        println!(
104            "{}",
105            serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?
106        );
107    } else {
108        println!("Data Quality Report");
109        println!("===================");
110        println!("File: {}", path.display());
111        println!("Rows: {}", report.row_count);
112        println!("Columns: {}", report.column_count);
113        println!();
114
115        println!("Quality Score: {:.1}%", report.score);
116        println!();
117
118        if report.issues.is_empty() {
119            println!("\u{2713} No quality issues found\n");
120        } else {
121            println!("Issues Found:");
122            println!("-------------");
123            for issue in &report.issues {
124                println!("  - {:?}", issue);
125            }
126            println!();
127        }
128
129        println!(
130            "{:<20} {:<12} {:<12} {:<10}",
131            "COLUMN", "NULL %", "UNIQUE", "STATUS"
132        );
133        println!("{}", "-".repeat(60));
134
135        for (name, col) in &report.columns {
136            let status = if col.is_constant() {
137                "CONSTANT"
138            } else if col.null_ratio > null_threshold {
139                "HIGH NULL"
140            } else {
141                "OK"
142            };
143
144            println!(
145                "{:<20} {:<12.2} {:<12} {:<10}",
146                name,
147                col.null_ratio * 100.0,
148                col.unique_count,
149                status
150            );
151        }
152    }
153
154    Ok(())
155}
156
157/// Generate a quality report.
158pub(crate) fn cmd_quality_report(path: &Path, output: Option<&Path>) -> crate::Result<()> {
159    let dataset = load_dataset(path)?;
160    let report = QualityChecker::new().check(&dataset)?;
161
162    let json = serde_json::json!({
163        "path": path.display().to_string(),
164        "rows": report.row_count,
165        "columns": report.column_count,
166        "has_issues": !report.issues.is_empty(),
167        "score": report.score,
168        "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
169        "column_qualities": report.columns.iter().map(|(name, c)| {
170            serde_json::json!({
171                "column": name,
172                "null_ratio": c.null_ratio,
173                "unique_count": c.unique_count,
174                "is_constant": c.is_constant(),
175            })
176        }).collect::<Vec<_>>()
177    });
178
179    let json_str =
180        serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?;
181
182    if let Some(output_path) = output {
183        std::fs::write(output_path, &json_str).map_err(|e| crate::Error::io(e, output_path))?;
184        println!("Quality report written to: {}", output_path.display());
185    } else {
186        println!("{}", json_str);
187    }
188
189    Ok(())
190}
191
192/// Calculate 100-point quality score with letter grade (GH-6).
193///
194/// Implements the Doctest Corpus QA Checklist for Publication with
195/// weighted scoring per Toyota Way Jidoka principles.
196#[allow(clippy::too_many_lines)]
197pub(crate) fn cmd_quality_score(
198    path: &Path,
199    profile_name: &str,
200    suggest: bool,
201    json_output: bool,
202    badge_output: bool,
203) -> crate::Result<()> {
204    use crate::quality::{QualityProfile, QualityScore};
205
206    // Load the quality profile
207    let profile = QualityProfile::by_name(profile_name).ok_or_else(|| {
208        crate::Error::Format(format!(
209            "Unknown quality profile '{}'. Available: {:?}",
210            profile_name,
211            QualityProfile::available_profiles()
212        ))
213    })?;
214
215    let dataset = load_dataset(path)?;
216    let report = QualityChecker::new().check(&dataset)?;
217
218    // Wire QualityReport to ChecklistItems per the 100-point checklist
219    let checklist = build_checklist_from_report(&report, &profile);
220    let score = QualityScore::from_checklist(checklist);
221
222    // Output based on flags
223    if badge_output {
224        println!("{}", score.badge_url());
225    } else if json_output {
226        println!("{}", score.to_json());
227    } else {
228        print_text_report(&score, &profile, path, suggest);
229    }
230
231    // Exit with non-zero code if critical failures (for CI/CD)
232    if score.has_critical_failures() {
233        std::process::exit(1);
234    }
235
236    Ok(())
237}
238
239/// Print the text quality report (Andon-style visual management).
240fn print_text_report(
241    score: &crate::quality::QualityScore,
242    profile: &crate::quality::QualityProfile,
243    path: &Path,
244    suggest: bool,
245) {
246    let grade_symbol = match score.grade {
247        crate::quality::LetterGrade::A | crate::quality::LetterGrade::B => "\u{2713}",
248        crate::quality::LetterGrade::C => "\u{25CB}",
249        crate::quality::LetterGrade::D => "\u{25B3}",
250        crate::quality::LetterGrade::F => "\u{2717}",
251    };
252
253    let separator = "\u{2550}".repeat(63);
254    println!("{separator}");
255    println!(
256        "  Data Quality Score: {} {} ({:.1}%)  ",
257        grade_symbol, score.grade, score.score
258    );
259    println!("  Profile: {}  ", profile.name);
260    println!("  Decision: {}  ", score.grade.publication_decision());
261    println!("{separator}");
262    println!();
263    println!("File: {}", path.display());
264    println!(
265        "Points: {:.1} / {:.1}",
266        score.points_earned, score.max_points
267    );
268    println!();
269
270    print_severity_breakdown(score);
271    print_critical_failures(score, suggest);
272
273    if suggest {
274        print_other_issues(score);
275    }
276}
277
278/// Print the severity breakdown section.
279fn print_severity_breakdown(score: &crate::quality::QualityScore) {
280    use crate::quality::Severity;
281
282    println!("Severity Breakdown:");
283    for severity in [
284        Severity::Critical,
285        Severity::High,
286        Severity::Medium,
287        Severity::Low,
288    ] {
289        if let Some(stats) = score.severity_breakdown.get(&severity) {
290            let status = if stats.failed == 0 {
291                "\u{2713}"
292            } else {
293                "\u{2717}"
294            };
295            println!(
296                "  {} {:8}: {}/{} passed ({:.1}/{:.1} pts)",
297                status,
298                format!("{}", severity),
299                stats.passed,
300                stats.total,
301                stats.points_earned,
302                stats.max_points
303            );
304        }
305    }
306    println!();
307}
308
309/// Print critical failures section.
310fn print_critical_failures(score: &crate::quality::QualityScore, suggest: bool) {
311    let critical_failures = score.critical_failures();
312    if !critical_failures.is_empty() {
313        println!("CRITICAL FAILURES (blocks publication):");
314        for item in critical_failures {
315            println!("  \u{2717} #{}: {}", item.id, item.description);
316            if suggest {
317                if let Some(ref suggestion) = item.suggestion {
318                    println!("    \u{2192} {}", suggestion);
319                }
320            }
321        }
322        println!();
323    }
324}
325
326/// Print non-critical issues with suggestions.
327fn print_other_issues(score: &crate::quality::QualityScore) {
328    use crate::quality::Severity;
329
330    let failed = score.failed_items();
331    let non_critical: Vec<_> = failed
332        .iter()
333        .filter(|i| i.severity != Severity::Critical)
334        .collect();
335
336    if !non_critical.is_empty() {
337        println!("Other Issues ({}):", non_critical.len());
338        for item in non_critical {
339            let sev = match item.severity {
340                Severity::High => "[HIGH]",
341                Severity::Medium => "[MED]",
342                Severity::Low => "[LOW]",
343                Severity::Critical => "[CRIT]",
344            };
345            println!("  {} #{}: {}", sev, item.id, item.description);
346            if let Some(ref suggestion) = item.suggestion {
347                println!("      \u{2192} {}", suggestion);
348            }
349        }
350    }
351}
352
353/// List available quality profiles.
354#[allow(clippy::unnecessary_wraps)]
355pub(crate) fn cmd_quality_profiles() -> crate::Result<()> {
356    use crate::quality::QualityProfile;
357
358    println!("Available Quality Profiles");
359    println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
360    println!();
361
362    for name in QualityProfile::available_profiles() {
363        if let Some(profile) = QualityProfile::by_name(name) {
364            println!("  {} - {}", profile.name, profile.description);
365            if !profile.expected_constant_columns.is_empty() {
366                let cols: Vec<_> = profile.expected_constant_columns.iter().collect();
367                println!("    Expected constants: {:?}", cols);
368            }
369            if !profile.nullable_columns.is_empty() {
370                let cols: Vec<_> = profile.nullable_columns.iter().collect();
371                println!("    Nullable columns: {:?}", cols);
372            }
373            println!("    Max null ratio: {:.0}%", profile.max_null_ratio * 100.0);
374            println!(
375                "    Max duplicate ratio: {:.0}%",
376                profile.max_duplicate_ratio * 100.0
377            );
378            println!();
379        }
380    }
381
382    println!("Usage: alimentar quality score <path> --profile <name>");
383    Ok(())
384}
385
386/// Build checklist items from `QualityReport`.
387///
388/// Maps `QualityReport` findings to the 100-point checklist defined in GH-6.
389/// This wires the existing quality checks to the weighted scoring system.
390#[allow(clippy::too_many_lines)]
391pub(crate) fn build_checklist_from_report(
392    report: &crate::quality::QualityReport,
393    profile: &crate::quality::QualityProfile,
394) -> Vec<crate::quality::ChecklistItem> {
395    use crate::quality::{ChecklistItem, Severity};
396
397    let mut items = Vec::new();
398    let mut id: u8 = 1;
399
400    // === Critical Checks (2.0x weight) ===
401
402    // Check 1: Dataset not empty
403    let has_rows = report.row_count > 0;
404    items.push(
405        ChecklistItem::new(id, "Dataset contains rows", Severity::Critical, has_rows)
406            .with_suggestion("Extract more doctests or check input source"),
407    );
408    id += 1;
409
410    // Check 2: No empty schema
411    let has_columns = report.column_count > 0;
412    items.push(
413        ChecklistItem::new(
414            id,
415            "Schema has columns defined",
416            Severity::Critical,
417            has_columns,
418        )
419        .with_suggestion("Verify parser is extracting fields correctly"),
420    );
421    id += 1;
422
423    // Check 3: No unexpected constant columns (would break training)
424    // Filter out columns that the profile expects to be constant (e.g., source,
425    // version) Also allow nullable columns to be all-null (constant null is OK
426    // for optional fields)
427    let unexpected_constant_cols: Vec<String> = report
428        .columns
429        .iter()
430        .filter(|(name, c): &(&String, &ColumnQuality)| {
431            c.is_constant() && !profile.is_expected_constant(name) && !profile.is_nullable(name)
432        })
433        .map(|(n, _)| n.clone())
434        .collect();
435    let no_unexpected_constants = unexpected_constant_cols.is_empty();
436    items.push(
437        ChecklistItem::new(
438            id,
439            "No unexpected constant columns (zero variance)",
440            Severity::Critical,
441            no_unexpected_constants,
442        )
443        .with_suggestion(format!(
444            "Remove or investigate constant columns: {:?}",
445            unexpected_constant_cols
446        )),
447    );
448    id += 1;
449
450    // === High Priority Checks (1.5x weight) ===
451
452    // Check 4: Duplicate ratio below threshold (default 5%)
453    let duplicate_ratio = report
454        .issues
455        .iter()
456        .find_map(|i| {
457            if let crate::quality::QualityIssue::DuplicateRows {
458                duplicate_ratio: dr,
459                ..
460            } = i
461            {
462                Some(*dr)
463            } else {
464                None
465            }
466        })
467        .unwrap_or(0.0);
468    let low_duplicates = duplicate_ratio <= 0.05;
469    items.push(
470        ChecklistItem::new(
471            id,
472            format!(
473                "Duplicate ratio <= 5% (actual: {:.1}%)",
474                duplicate_ratio * 100.0
475            ),
476            Severity::High,
477            low_duplicates,
478        )
479        .with_suggestion("Run deduplication: alimentar dedupe <file>"),
480    );
481    id += 1;
482
483    // Check 5: No columns with >50% nulls (except nullable columns per profile)
484    let high_null_cols: Vec<String> = report
485        .columns
486        .iter()
487        .filter(|(name, c): &(&String, &ColumnQuality)| {
488            c.null_ratio > 0.5 && !profile.is_nullable(name)
489        })
490        .map(|(n, _)| n.clone())
491        .collect();
492    let no_high_null = high_null_cols.is_empty();
493    items.push(
494        ChecklistItem::new(
495            id,
496            "No columns with >50% null values",
497            Severity::High,
498            no_high_null,
499        )
500        .with_suggestion(format!(
501            "Investigate high-null columns: {:?}",
502            high_null_cols
503        )),
504    );
505    id += 1;
506
507    // Check 6: Minimum row count (at least 100 for meaningful training)
508    let min_rows = report.row_count >= 100;
509    items.push(
510        ChecklistItem::new(
511            id,
512            format!("Minimum 100 rows (actual: {})", report.row_count),
513            Severity::High,
514            min_rows,
515        )
516        .with_suggestion("Extract more data or combine with other sources"),
517    );
518    id += 1;
519
520    // === Medium Priority Checks (1.0x weight) ===
521
522    // Check 7: Overall quality score from existing checker
523    let good_score = report.score >= 70.0;
524    items.push(
525        ChecklistItem::new(
526            id,
527            format!("Quality score >= 70% (actual: {:.1}%)", report.score),
528            Severity::Medium,
529            good_score,
530        )
531        .with_suggestion("Address issues reported by quality check"),
532    );
533    id += 1;
534
535    // Check 8: No columns with >10% nulls (stricter, except nullable columns per
536    // profile)
537    let moderate_null_cols: Vec<String> = report
538        .columns
539        .iter()
540        .filter(|(name, c): &(&String, &ColumnQuality)| {
541            c.null_ratio > 0.1 && c.null_ratio <= 0.5 && !profile.is_nullable(name)
542        })
543        .map(|(n, _)| n.clone())
544        .collect();
545    let low_null_ratio = moderate_null_cols.is_empty();
546    items.push(
547        ChecklistItem::new(
548            id,
549            "No columns with >10% null values",
550            Severity::Medium,
551            low_null_ratio,
552        )
553        .with_suggestion(format!("Consider imputation for: {:?}", moderate_null_cols)),
554    );
555    id += 1;
556
557    // Check 9: Reasonable column count (not too few for ML)
558    let enough_columns = report.column_count >= 2;
559    items.push(
560        ChecklistItem::new(
561            id,
562            format!("At least 2 columns (actual: {})", report.column_count),
563            Severity::Medium,
564            enough_columns,
565        )
566        .with_suggestion("Ensure input and target columns are present"),
567    );
568    id += 1;
569
570    // Check 10: No outlier issues detected
571    let outlier_issues: Vec<(String, f64)> = report
572        .issues
573        .iter()
574        .filter_map(|i| {
575            if let crate::quality::QualityIssue::OutliersDetected {
576                column,
577                outlier_ratio: or,
578                ..
579            } = i
580            {
581                Some((column.clone(), *or))
582            } else {
583                None
584            }
585        })
586        .collect();
587    let no_severe_outliers = outlier_issues.iter().all(|(_, r)| *r < 0.1);
588    items.push(
589        ChecklistItem::new(
590            id,
591            "No columns with >10% outliers",
592            Severity::Medium,
593            no_severe_outliers,
594        )
595        .with_suggestion("Review outlier columns for data quality issues"),
596    );
597    id += 1;
598
599    // === Low Priority Checks (0.5x weight) ===
600
601    // Check 11: No warnings at all
602    let no_issues = report.issues.is_empty();
603    items.push(
604        ChecklistItem::new(id, "No quality warnings", Severity::Low, no_issues)
605            .with_suggestion("Address all warnings for best results"),
606    );
607    id += 1;
608
609    // Check 12: Good cardinality (unique values)
610    let low_cardinality_cols: Vec<String> = report
611        .columns
612        .iter()
613        .filter(|(_, c): &(&String, &ColumnQuality)| c.unique_count < 10 && !c.is_constant())
614        .map(|(n, _)| n.clone())
615        .collect();
616    let good_cardinality = low_cardinality_cols.is_empty();
617    items.push(
618        ChecklistItem::new(
619            id,
620            "All columns have reasonable cardinality (>10 unique)",
621            Severity::Low,
622            good_cardinality,
623        )
624        .with_suggestion(format!(
625            "Low cardinality columns: {:?}",
626            low_cardinality_cols
627        )),
628    );
629    let _ = id; // suppress warning
630
631    items
632}
633
634#[cfg(test)]
635#[allow(
636    clippy::cast_possible_truncation,
637    clippy::cast_possible_wrap,
638    clippy::cast_precision_loss,
639    clippy::uninlined_format_args,
640    clippy::unwrap_used,
641    clippy::expect_used,
642    clippy::redundant_clone,
643    clippy::cast_lossless,
644    clippy::redundant_closure_for_method_calls,
645    clippy::too_many_lines,
646    clippy::float_cmp,
647    clippy::similar_names,
648    clippy::needless_late_init,
649    clippy::redundant_pattern_matching
650)]
651mod tests {
652    use std::sync::Arc;
653
654    use arrow::{
655        array::{Int32Array, StringArray},
656        datatypes::{DataType, Field, Schema},
657    };
658
659    use super::*;
660    use crate::ArrowDataset;
661
662    fn create_test_parquet(path: &PathBuf, rows: usize) {
663        let schema = Arc::new(Schema::new(vec![
664            Field::new("id", DataType::Int32, false),
665            Field::new("name", DataType::Utf8, false),
666        ]));
667
668        let ids: Vec<i32> = (0..rows as i32).collect();
669        let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();
670
671        let batch = arrow::array::RecordBatch::try_new(
672            schema,
673            vec![
674                Arc::new(Int32Array::from(ids)),
675                Arc::new(StringArray::from(names)),
676            ],
677        )
678        .ok()
679        .unwrap_or_else(|| panic!("Should create batch"));
680
681        let dataset = ArrowDataset::from_batch(batch)
682            .ok()
683            .unwrap_or_else(|| panic!("Should create dataset"));
684
685        dataset
686            .to_parquet(path)
687            .ok()
688            .unwrap_or_else(|| panic!("Should write parquet"));
689    }
690
691    #[test]
692    fn test_cmd_quality_check_text() {
693        let temp_dir = tempfile::tempdir()
694            .ok()
695            .unwrap_or_else(|| panic!("Should create temp dir"));
696        let path = temp_dir.path().join("data.parquet");
697        create_test_parquet(&path, 100);
698
699        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
700        assert!(result.is_ok());
701    }
702
703    #[test]
704    fn test_cmd_quality_check_json() {
705        let temp_dir = tempfile::tempdir()
706            .ok()
707            .unwrap_or_else(|| panic!("Should create temp dir"));
708        let path = temp_dir.path().join("data.parquet");
709        create_test_parquet(&path, 100);
710
711        let result = cmd_quality_check(&path, 0.1, 0.05, true, "json");
712        assert!(result.is_ok());
713    }
714
715    #[test]
716    fn test_cmd_quality_check_no_outliers() {
717        let temp_dir = tempfile::tempdir()
718            .ok()
719            .unwrap_or_else(|| panic!("Should create temp dir"));
720        let path = temp_dir.path().join("data.parquet");
721        create_test_parquet(&path, 100);
722
723        let result = cmd_quality_check(&path, 0.1, 0.05, false, "text");
724        assert!(result.is_ok());
725    }
726
727    #[test]
728    fn test_cmd_quality_report_basic() {
729        let temp_dir = tempfile::tempdir()
730            .ok()
731            .unwrap_or_else(|| panic!("Should create temp dir"));
732        let path = temp_dir.path().join("data.parquet");
733        create_test_parquet(&path, 100);
734
735        let result = cmd_quality_report(&path, None);
736        assert!(result.is_ok());
737    }
738
739    #[test]
740    fn test_cmd_quality_report_to_file() {
741        let temp_dir = tempfile::tempdir()
742            .ok()
743            .unwrap_or_else(|| panic!("Should create temp dir"));
744        let data_path = temp_dir.path().join("data.parquet");
745        let output_path = temp_dir.path().join("quality.json");
746        create_test_parquet(&data_path, 100);
747
748        let result = cmd_quality_report(&data_path, Some(&output_path));
749        assert!(result.is_ok());
750        assert!(output_path.exists());
751
752        // Verify JSON is valid
753        let content = std::fs::read_to_string(&output_path)
754            .ok()
755            .unwrap_or_else(|| panic!("Should read file"));
756        let parsed: serde_json::Value = serde_json::from_str(&content)
757            .ok()
758            .unwrap_or_else(|| panic!("Should parse JSON"));
759        assert!(parsed.get("score").is_some());
760        assert!(parsed.get("has_issues").is_some());
761    }
762
763    #[test]
764    fn test_cmd_quality_check_with_constant_column() {
765        let temp_dir = tempfile::tempdir()
766            .ok()
767            .unwrap_or_else(|| panic!("Should create temp dir"));
768        let path = temp_dir.path().join("data.parquet");
769
770        let schema = Arc::new(Schema::new(vec![
771            Field::new("id", DataType::Int32, false),
772            Field::new("constant", DataType::Int32, false),
773        ]));
774
775        let ids: Vec<i32> = (0..100).collect();
776        let constants: Vec<i32> = vec![42; 100];
777
778        let batch = arrow::array::RecordBatch::try_new(
779            schema,
780            vec![
781                Arc::new(Int32Array::from(ids)),
782                Arc::new(Int32Array::from(constants)),
783            ],
784        )
785        .ok()
786        .unwrap_or_else(|| panic!("Should create batch"));
787
788        let dataset = ArrowDataset::from_batch(batch)
789            .ok()
790            .unwrap_or_else(|| panic!("Should create dataset"));
791
792        dataset
793            .to_parquet(&path)
794            .ok()
795            .unwrap_or_else(|| panic!("Should write parquet"));
796
797        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
798        assert!(result.is_ok());
799    }
800
801    #[test]
802    fn test_cmd_quality_report_default_output() {
803        let temp_dir = tempfile::tempdir()
804            .ok()
805            .unwrap_or_else(|| panic!("Should create temp dir"));
806        let path = temp_dir.path().join("data.parquet");
807        create_test_parquet(&path, 50);
808
809        let result = cmd_quality_report(&path, None);
810        assert!(result.is_ok());
811    }
812
813    #[test]
814    fn test_cmd_quality_report_with_output() {
815        let temp_dir = tempfile::tempdir()
816            .ok()
817            .unwrap_or_else(|| panic!("Should create temp dir"));
818        let path = temp_dir.path().join("data.parquet");
819        let output = temp_dir.path().join("report.html");
820        create_test_parquet(&path, 50);
821
822        let result = cmd_quality_report(&path, Some(&output));
823        assert!(result.is_ok());
824        assert!(output.exists());
825    }
826
827    #[test]
828    fn test_cmd_quality_score() {
829        let temp_dir = tempfile::tempdir()
830            .ok()
831            .unwrap_or_else(|| panic!("Should create temp dir"));
832        let path = temp_dir.path().join("data.parquet");
833        create_test_parquet(&path, 100);
834
835        let result = cmd_quality_score(&path, "default", false, false, false);
836        assert!(result.is_ok());
837    }
838
839    #[test]
840    fn test_cmd_quality_score_with_json() {
841        let temp_dir = tempfile::tempdir()
842            .ok()
843            .unwrap_or_else(|| panic!("Should create temp dir"));
844        let path = temp_dir.path().join("data.parquet");
845        create_test_parquet(&path, 100);
846
847        let result = cmd_quality_score(&path, "default", false, true, false);
848        assert!(result.is_ok());
849    }
850
851    #[test]
852    fn test_cmd_quality_score_with_badge() {
853        let temp_dir = tempfile::tempdir()
854            .ok()
855            .unwrap_or_else(|| panic!("Should create temp dir"));
856        let path = temp_dir.path().join("data.parquet");
857        create_test_parquet(&path, 100);
858
859        let result = cmd_quality_score(&path, "default", false, false, true);
860        assert!(result.is_ok());
861    }
862
863    #[test]
864    fn test_cmd_quality_score_with_suggest() {
865        let temp_dir = tempfile::tempdir()
866            .ok()
867            .unwrap_or_else(|| panic!("Should create temp dir"));
868        let path = temp_dir.path().join("data.parquet");
869        create_test_parquet(&path, 100);
870
871        let result = cmd_quality_score(&path, "default", true, false, false);
872        assert!(result.is_ok());
873    }
874
875    #[test]
876    fn test_cmd_quality_score_with_doctest_profile() {
877        let temp_dir = tempfile::tempdir()
878            .ok()
879            .unwrap_or_else(|| panic!("Should create temp dir"));
880        let path = temp_dir.path().join("data.parquet");
881        create_test_parquet(&path, 100);
882
883        let result = cmd_quality_score(&path, "doctest-corpus", false, false, false);
884        assert!(result.is_ok());
885    }
886
887    #[test]
888    fn test_cmd_quality_profiles() {
889        let result = cmd_quality_profiles();
890        assert!(result.is_ok());
891    }
892
893    // === Additional quality CLI tests ===
894
895    #[test]
896    fn test_cmd_quality_check_with_high_null_threshold() {
897        let temp_dir = tempfile::tempdir()
898            .ok()
899            .unwrap_or_else(|| panic!("Should create temp dir"));
900        let path = temp_dir.path().join("data.parquet");
901        create_test_parquet(&path, 100);
902
903        // Use very high null threshold
904        let result = cmd_quality_check(&path, 0.9, 0.9, true, "text");
905        assert!(result.is_ok());
906    }
907
908    #[test]
909    fn test_cmd_quality_check_small_dataset() {
910        let temp_dir = tempfile::tempdir()
911            .ok()
912            .unwrap_or_else(|| panic!("Should create temp dir"));
913        let path = temp_dir.path().join("small.parquet");
914        create_test_parquet(&path, 5);
915
916        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
917        assert!(result.is_ok());
918    }
919
920    #[test]
921    fn test_cmd_quality_check_large_dataset() {
922        let temp_dir = tempfile::tempdir()
923            .ok()
924            .unwrap_or_else(|| panic!("Should create temp dir"));
925        let path = temp_dir.path().join("large.parquet");
926        create_test_parquet(&path, 500);
927
928        let result = cmd_quality_check(&path, 0.1, 0.05, false, "json");
929        assert!(result.is_ok());
930    }
931
932    #[test]
933    fn test_cmd_quality_score_ml_training_profile() {
934        let temp_dir = tempfile::tempdir()
935            .ok()
936            .unwrap_or_else(|| panic!("Should create temp dir"));
937        let path = temp_dir.path().join("ml.parquet");
938        create_test_parquet(&path, 150);
939
940        let result = cmd_quality_score(&path, "ml-training", false, false, false);
941        assert!(result.is_ok());
942    }
943
944    #[test]
945    fn test_cmd_quality_score_invalid_profile() {
946        let temp_dir = tempfile::tempdir()
947            .ok()
948            .unwrap_or_else(|| panic!("Should create temp dir"));
949        let path = temp_dir.path().join("data.parquet");
950        create_test_parquet(&path, 100);
951
952        let result = cmd_quality_score(&path, "nonexistent-profile", false, false, false);
953        assert!(result.is_err());
954    }
955
956    #[test]
957    fn test_cmd_quality_score_all_output_modes() {
958        let temp_dir = tempfile::tempdir()
959            .ok()
960            .unwrap_or_else(|| panic!("Should create temp dir"));
961        let path = temp_dir.path().join("data.parquet");
962        create_test_parquet(&path, 100);
963
964        // Test text output
965        let result = cmd_quality_score(&path, "default", false, false, false);
966        assert!(result.is_ok());
967
968        // Test JSON output
969        let result = cmd_quality_score(&path, "default", false, true, false);
970        assert!(result.is_ok());
971
972        // Test badge output
973        let result = cmd_quality_score(&path, "default", false, false, true);
974        assert!(result.is_ok());
975    }
976
977    #[test]
978    fn test_cmd_quality_report_to_stdout() {
979        let temp_dir = tempfile::tempdir()
980            .ok()
981            .unwrap_or_else(|| panic!("Should create temp dir"));
982        let path = temp_dir.path().join("data.parquet");
983        create_test_parquet(&path, 50);
984
985        // Output to stdout (None)
986        let result = cmd_quality_report(&path, None);
987        assert!(result.is_ok());
988    }
989
990    #[test]
991    fn test_build_checklist_from_report_empty_dataset() {
992        // Create empty-ish dataset scenario
993        use crate::quality::{QualityChecker, QualityProfile};
994
995        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
996
997        let batch =
998            arrow::array::RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1]))])
999                .unwrap();
1000
1001        let dataset = ArrowDataset::from_batch(batch).unwrap();
1002        let report = QualityChecker::new().check(&dataset).unwrap();
1003        let profile = QualityProfile::by_name("default").unwrap();
1004
1005        let checklist = build_checklist_from_report(&report, &profile);
1006
1007        // Should have checklist items
1008        assert!(!checklist.is_empty());
1009
1010        // Check that we have critical, high, medium, low items
1011        let has_critical = checklist
1012            .iter()
1013            .any(|i| i.severity == crate::quality::Severity::Critical);
1014        let has_high = checklist
1015            .iter()
1016            .any(|i| i.severity == crate::quality::Severity::High);
1017        let has_medium = checklist
1018            .iter()
1019            .any(|i| i.severity == crate::quality::Severity::Medium);
1020        let has_low = checklist
1021            .iter()
1022            .any(|i| i.severity == crate::quality::Severity::Low);
1023
1024        assert!(has_critical);
1025        assert!(has_high);
1026        assert!(has_medium);
1027        assert!(has_low);
1028    }
1029
1030    #[test]
1031    fn test_build_checklist_high_quality_dataset() {
1032        use crate::quality::{QualityChecker, QualityProfile};
1033
1034        // Create a high-quality dataset
1035        let schema = Arc::new(Schema::new(vec![
1036            Field::new("id", DataType::Int32, false),
1037            Field::new("name", DataType::Utf8, false),
1038        ]));
1039
1040        let ids: Vec<i32> = (0..200).collect();
1041        let names: Vec<String> = ids.iter().map(|i| format!("name_{}", i)).collect();
1042
1043        let batch = arrow::array::RecordBatch::try_new(
1044            schema,
1045            vec![
1046                Arc::new(Int32Array::from(ids)),
1047                Arc::new(StringArray::from(names)),
1048            ],
1049        )
1050        .unwrap();
1051
1052        let dataset = ArrowDataset::from_batch(batch).unwrap();
1053        let report = QualityChecker::new().check(&dataset).unwrap();
1054        let profile = QualityProfile::by_name("default").unwrap();
1055
1056        let checklist = build_checklist_from_report(&report, &profile);
1057
1058        // Count passed items
1059        let passed = checklist.iter().filter(|i| i.passed).count();
1060        // A good dataset should have most checks passing
1061        assert!(passed > checklist.len() / 2);
1062    }
1063
1064    #[test]
1065    fn test_cmd_quality_check_with_issues() {
1066        let temp_dir = tempfile::tempdir()
1067            .ok()
1068            .unwrap_or_else(|| panic!("Should create temp dir"));
1069        let path = temp_dir.path().join("issues.parquet");
1070
1071        // Create dataset with potential issues (constant column)
1072        let schema = Arc::new(Schema::new(vec![
1073            Field::new("id", DataType::Int32, false),
1074            Field::new("constant", DataType::Int32, false),
1075        ]));
1076
1077        let ids: Vec<i32> = (0..50).collect();
1078        let constants: Vec<i32> = vec![42; 50];
1079
1080        let batch = arrow::array::RecordBatch::try_new(
1081            schema,
1082            vec![
1083                Arc::new(Int32Array::from(ids)),
1084                Arc::new(Int32Array::from(constants)),
1085            ],
1086        )
1087        .unwrap();
1088
1089        let dataset = ArrowDataset::from_batch(batch).unwrap();
1090        dataset.to_parquet(&path).unwrap();
1091
1092        // Should handle issues gracefully
1093        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
1094        assert!(result.is_ok());
1095    }
1096}
alimentar/cli/quality.rs

alimentar/cli/
quality.rs