1use std::path::{Path, PathBuf};
4
5use clap::Subcommand;
6
7use super::basic::load_dataset;
8use crate::quality::{ColumnQuality, QualityChecker};
9
10#[derive(Subcommand)]
12pub enum QualityCommands {
13 Check {
15 path: PathBuf,
17 #[arg(long, default_value = "0.1")]
19 null_threshold: f64,
20 #[arg(long, default_value = "0.05")]
22 duplicate_threshold: f64,
23 #[arg(long, default_value = "true")]
25 detect_outliers: bool,
26 #[arg(short, long, default_value = "text")]
28 format: String,
29 },
30 Report {
32 path: PathBuf,
34 #[arg(short, long)]
36 output: Option<PathBuf>,
37 },
38 Score {
40 path: PathBuf,
42 #[arg(short, long, default_value = "default")]
45 profile: String,
46 #[arg(long)]
48 suggest: bool,
49 #[arg(long)]
51 json: bool,
52 #[arg(long)]
54 badge: bool,
55 },
56 Profiles,
58}
59
60pub(crate) fn cmd_quality_check(
62 path: &Path,
63 null_threshold: f64,
64 duplicate_threshold: f64,
65 detect_outliers: bool,
66 format: &str,
67) -> crate::Result<()> {
68 let dataset = load_dataset(path)?;
69 warn_duplicate_threshold(duplicate_threshold);
70 let checker = build_quality_checker(detect_outliers);
71 let report = checker.check(&dataset)?;
72
73 if format == "json" {
74 print_quality_json(&report, path, null_threshold)
75 } else {
76 print_quality_text(&report, path, null_threshold);
77 Ok(())
78 }
79}
80
/// Warns on stderr when `duplicate_threshold` differs from the `0.05` default.
///
/// The value is compared against the default with an `f64::EPSILON` tolerance;
/// values within that tolerance (and NaN, for which the comparison is false)
/// produce no output.
fn warn_duplicate_threshold(duplicate_threshold: f64) {
    let distance_from_default = (duplicate_threshold - 0.05_f64).abs();
    let is_customized = distance_from_default > f64::EPSILON;
    if !is_customized {
        return;
    }

    eprintln!(
        "Warning: --duplicate-threshold {duplicate_threshold} is not yet implemented. Using default behavior."
    );
}
88
89fn build_quality_checker(detect_outliers: bool) -> QualityChecker {
90 let mut checker = QualityChecker::new();
91 if !detect_outliers {
92 checker = checker.with_outlier_check(false);
93 }
94 checker
95}
96
97fn print_quality_json(
98 report: &crate::quality::QualityReport,
99 path: &Path,
100 null_threshold: f64,
101) -> crate::Result<()> {
102 let json = serde_json::json!({
103 "path": path.display().to_string(),
104 "rows": report.row_count,
105 "columns": report.column_count,
106 "has_issues": !report.issues.is_empty(),
107 "score": report.score,
108 "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
109 "column_qualities": report.columns.iter().map(|(name, c)| {
110 serde_json::json!({
111 "column": name,
112 "null_ratio": c.null_ratio,
113 "unique_count": c.unique_count,
114 "is_constant": c.is_constant(),
115 "is_mostly_null": c.null_ratio > null_threshold,
116 })
117 }).collect::<Vec<_>>()
118 });
119 println!(
120 "{}",
121 serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?
122 );
123 Ok(())
124}
125
126fn print_quality_text(report: &crate::quality::QualityReport, path: &Path, null_threshold: f64) {
127 println!("Data Quality Report");
128 println!("===================");
129 println!("File: {}", path.display());
130 println!("Rows: {}", report.row_count);
131 println!("Columns: {}", report.column_count);
132 println!();
133
134 println!("Quality Score: {:.1}%", report.score);
135 println!();
136
137 print_quality_issues(report);
138 print_column_table(report, null_threshold);
139}
140
141fn print_quality_issues(report: &crate::quality::QualityReport) {
142 if report.issues.is_empty() {
143 println!("\u{2713} No quality issues found\n");
144 } else {
145 println!("Issues Found:");
146 println!("-------------");
147 for issue in &report.issues {
148 println!(" - {:?}", issue);
149 }
150 println!();
151 }
152}
153
154fn print_column_table(report: &crate::quality::QualityReport, null_threshold: f64) {
155 println!(
156 "{:<20} {:<12} {:<12} {:<10}",
157 "COLUMN", "NULL %", "UNIQUE", "STATUS"
158 );
159 println!("{}", "-".repeat(60));
160
161 for (name, col) in &report.columns {
162 println!(
163 "{:<20} {:<12.2} {:<12} {:<10}",
164 name,
165 col.null_ratio * 100.0,
166 col.unique_count,
167 column_status(col, null_threshold)
168 );
169 }
170}
171
172fn column_status(col: &ColumnQuality, null_threshold: f64) -> &'static str {
173 if col.is_constant() {
174 "CONSTANT"
175 } else if col.null_ratio > null_threshold {
176 "HIGH NULL"
177 } else {
178 "OK"
179 }
180}
181
182pub(crate) fn cmd_quality_report(path: &Path, output: Option<&Path>) -> crate::Result<()> {
184 let dataset = load_dataset(path)?;
185 let report = QualityChecker::new().check(&dataset)?;
186
187 let json = serde_json::json!({
188 "path": path.display().to_string(),
189 "rows": report.row_count,
190 "columns": report.column_count,
191 "has_issues": !report.issues.is_empty(),
192 "score": report.score,
193 "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
194 "column_qualities": report.columns.iter().map(|(name, c)| {
195 serde_json::json!({
196 "column": name,
197 "null_ratio": c.null_ratio,
198 "unique_count": c.unique_count,
199 "is_constant": c.is_constant(),
200 })
201 }).collect::<Vec<_>>()
202 });
203
204 let json_str =
205 serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?;
206
207 if let Some(output_path) = output {
208 std::fs::write(output_path, &json_str).map_err(|e| crate::Error::io(e, output_path))?;
209 println!("Quality report written to: {}", output_path.display());
210 } else {
211 println!("{}", json_str);
212 }
213
214 Ok(())
215}
216
217#[allow(clippy::too_many_lines)]
222pub(crate) fn cmd_quality_score(
223 path: &Path,
224 profile_name: &str,
225 suggest: bool,
226 json_output: bool,
227 badge_output: bool,
228) -> crate::Result<()> {
229 use crate::quality::{QualityProfile, QualityScore};
230
231 let profile = QualityProfile::by_name(profile_name).ok_or_else(|| {
233 crate::Error::Format(format!(
234 "Unknown quality profile '{}'. Available: {:?}",
235 profile_name,
236 QualityProfile::available_profiles()
237 ))
238 })?;
239
240 let dataset = load_dataset(path)?;
241 let report = QualityChecker::new().check(&dataset)?;
242
243 let checklist = build_checklist_from_report(&report, &profile);
245 let score = QualityScore::from_checklist(checklist);
246
247 if badge_output {
249 println!("{}", score.badge_url());
250 } else if json_output {
251 println!("{}", score.to_json());
252 } else {
253 print_text_report(&score, &profile, path, suggest);
254 }
255
256 if score.has_critical_failures() {
258 std::process::exit(1);
259 }
260
261 Ok(())
262}
263
264fn print_text_report(
266 score: &crate::quality::QualityScore,
267 profile: &crate::quality::QualityProfile,
268 path: &Path,
269 suggest: bool,
270) {
271 let grade_symbol = match score.grade {
272 crate::quality::LetterGrade::A | crate::quality::LetterGrade::B => "\u{2713}",
273 crate::quality::LetterGrade::C => "\u{25CB}",
274 crate::quality::LetterGrade::D => "\u{25B3}",
275 crate::quality::LetterGrade::F => "\u{2717}",
276 };
277
278 let separator = "\u{2550}".repeat(63);
279 println!("{separator}");
280 println!(
281 " Data Quality Score: {} {} ({:.1}%) ",
282 grade_symbol, score.grade, score.score
283 );
284 println!(" Profile: {} ", profile.name);
285 println!(" Decision: {} ", score.grade.publication_decision());
286 println!("{separator}");
287 println!();
288 println!("File: {}", path.display());
289 println!(
290 "Points: {:.1} / {:.1}",
291 score.points_earned, score.max_points
292 );
293 println!();
294
295 print_severity_breakdown(score);
296 print_critical_failures(score, suggest);
297
298 if suggest {
299 print_other_issues(score);
300 }
301}
302
303fn print_severity_breakdown(score: &crate::quality::QualityScore) {
305 use crate::quality::Severity;
306
307 println!("Severity Breakdown:");
308 for severity in [
309 Severity::Critical,
310 Severity::High,
311 Severity::Medium,
312 Severity::Low,
313 ] {
314 if let Some(stats) = score.severity_breakdown.get(&severity) {
315 let status = if stats.failed == 0 {
316 "\u{2713}"
317 } else {
318 "\u{2717}"
319 };
320 println!(
321 " {} {:8}: {}/{} passed ({:.1}/{:.1} pts)",
322 status,
323 format!("{}", severity),
324 stats.passed,
325 stats.total,
326 stats.points_earned,
327 stats.max_points
328 );
329 }
330 }
331 println!();
332}
333
334fn print_critical_failures(score: &crate::quality::QualityScore, suggest: bool) {
336 let critical_failures = score.critical_failures();
337 if !critical_failures.is_empty() {
338 println!("CRITICAL FAILURES (blocks publication):");
339 for item in critical_failures {
340 println!(" \u{2717} #{}: {}", item.id, item.description);
341 if suggest {
342 if let Some(ref suggestion) = item.suggestion {
343 println!(" \u{2192} {}", suggestion);
344 }
345 }
346 }
347 println!();
348 }
349}
350
351fn print_other_issues(score: &crate::quality::QualityScore) {
353 use crate::quality::Severity;
354
355 let failed = score.failed_items();
356 let non_critical: Vec<_> = failed
357 .iter()
358 .filter(|i| i.severity != Severity::Critical)
359 .collect();
360
361 if !non_critical.is_empty() {
362 println!("Other Issues ({}):", non_critical.len());
363 for item in non_critical {
364 let sev = match item.severity {
365 Severity::High => "[HIGH]",
366 Severity::Medium => "[MED]",
367 Severity::Low => "[LOW]",
368 Severity::Critical => "[CRIT]",
369 };
370 println!(" {} #{}: {}", sev, item.id, item.description);
371 if let Some(ref suggestion) = item.suggestion {
372 println!(" \u{2192} {}", suggestion);
373 }
374 }
375 }
376}
377
378#[allow(clippy::unnecessary_wraps)]
380pub(crate) fn cmd_quality_profiles() -> crate::Result<()> {
381 use crate::quality::QualityProfile;
382
383 println!("Available Quality Profiles");
384 println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
385 println!();
386
387 for name in QualityProfile::available_profiles() {
388 if let Some(profile) = QualityProfile::by_name(name) {
389 println!(" {} - {}", profile.name, profile.description);
390 if !profile.expected_constant_columns.is_empty() {
391 let cols: Vec<_> = profile.expected_constant_columns.iter().collect();
392 println!(" Expected constants: {:?}", cols);
393 }
394 if !profile.nullable_columns.is_empty() {
395 let cols: Vec<_> = profile.nullable_columns.iter().collect();
396 println!(" Nullable columns: {:?}", cols);
397 }
398 println!(" Max null ratio: {:.0}%", profile.max_null_ratio * 100.0);
399 println!(
400 " Max duplicate ratio: {:.0}%",
401 profile.max_duplicate_ratio * 100.0
402 );
403 println!();
404 }
405 }
406
407 println!("Usage: alimentar quality score <path> --profile <name>");
408 Ok(())
409}
410
411#[allow(clippy::too_many_lines)]
416pub(crate) fn build_checklist_from_report(
417 report: &crate::quality::QualityReport,
418 profile: &crate::quality::QualityProfile,
419) -> Vec<crate::quality::ChecklistItem> {
420 use crate::quality::{ChecklistItem, Severity};
421
422 let mut items = Vec::new();
423 let mut id: u8 = 1;
424
425 let has_rows = report.row_count > 0;
429 items.push(
430 ChecklistItem::new(id, "Dataset contains rows", Severity::Critical, has_rows)
431 .with_suggestion("Extract more doctests or check input source"),
432 );
433 id += 1;
434
435 let has_columns = report.column_count > 0;
437 items.push(
438 ChecklistItem::new(
439 id,
440 "Schema has columns defined",
441 Severity::Critical,
442 has_columns,
443 )
444 .with_suggestion("Verify parser is extracting fields correctly"),
445 );
446 id += 1;
447
448 let unexpected_constant_cols: Vec<String> = report
453 .columns
454 .iter()
455 .filter(|(name, c): &(&String, &ColumnQuality)| {
456 c.is_constant() && !profile.is_expected_constant(name) && !profile.is_nullable(name)
457 })
458 .map(|(n, _)| n.clone())
459 .collect();
460 let no_unexpected_constants = unexpected_constant_cols.is_empty();
461 items.push(
462 ChecklistItem::new(
463 id,
464 "No unexpected constant columns (zero variance)",
465 Severity::Critical,
466 no_unexpected_constants,
467 )
468 .with_suggestion(format!(
469 "Remove or investigate constant columns: {:?}",
470 unexpected_constant_cols
471 )),
472 );
473 id += 1;
474
475 let duplicate_ratio = report
479 .issues
480 .iter()
481 .find_map(|i| {
482 if let crate::quality::QualityIssue::DuplicateRows {
483 duplicate_ratio: dr,
484 ..
485 } = i
486 {
487 Some(*dr)
488 } else {
489 None
490 }
491 })
492 .unwrap_or(0.0);
493 let low_duplicates = duplicate_ratio <= 0.05;
494 items.push(
495 ChecklistItem::new(
496 id,
497 format!(
498 "Duplicate ratio <= 5% (actual: {:.1}%)",
499 duplicate_ratio * 100.0
500 ),
501 Severity::High,
502 low_duplicates,
503 )
504 .with_suggestion("Run deduplication: alimentar dedupe <file>"),
505 );
506 id += 1;
507
508 let high_null_cols: Vec<String> = report
510 .columns
511 .iter()
512 .filter(|(name, c): &(&String, &ColumnQuality)| {
513 c.null_ratio > 0.5 && !profile.is_nullable(name)
514 })
515 .map(|(n, _)| n.clone())
516 .collect();
517 let no_high_null = high_null_cols.is_empty();
518 items.push(
519 ChecklistItem::new(
520 id,
521 "No columns with >50% null values",
522 Severity::High,
523 no_high_null,
524 )
525 .with_suggestion(format!(
526 "Investigate high-null columns: {:?}",
527 high_null_cols
528 )),
529 );
530 id += 1;
531
532 let min_rows = report.row_count >= 100;
534 items.push(
535 ChecklistItem::new(
536 id,
537 format!("Minimum 100 rows (actual: {})", report.row_count),
538 Severity::High,
539 min_rows,
540 )
541 .with_suggestion("Extract more data or combine with other sources"),
542 );
543 id += 1;
544
545 let good_score = report.score >= 70.0;
549 items.push(
550 ChecklistItem::new(
551 id,
552 format!("Quality score >= 70% (actual: {:.1}%)", report.score),
553 Severity::Medium,
554 good_score,
555 )
556 .with_suggestion("Address issues reported by quality check"),
557 );
558 id += 1;
559
560 let moderate_null_cols: Vec<String> = report
563 .columns
564 .iter()
565 .filter(|(name, c): &(&String, &ColumnQuality)| {
566 c.null_ratio > 0.1 && c.null_ratio <= 0.5 && !profile.is_nullable(name)
567 })
568 .map(|(n, _)| n.clone())
569 .collect();
570 let low_null_ratio = moderate_null_cols.is_empty();
571 items.push(
572 ChecklistItem::new(
573 id,
574 "No columns with >10% null values",
575 Severity::Medium,
576 low_null_ratio,
577 )
578 .with_suggestion(format!("Consider imputation for: {:?}", moderate_null_cols)),
579 );
580 id += 1;
581
582 let enough_columns = report.column_count >= 2;
584 items.push(
585 ChecklistItem::new(
586 id,
587 format!("At least 2 columns (actual: {})", report.column_count),
588 Severity::Medium,
589 enough_columns,
590 )
591 .with_suggestion("Ensure input and target columns are present"),
592 );
593 id += 1;
594
595 let outlier_issues: Vec<(String, f64)> = report
597 .issues
598 .iter()
599 .filter_map(|i| {
600 if let crate::quality::QualityIssue::OutliersDetected {
601 column,
602 outlier_ratio: or,
603 ..
604 } = i
605 {
606 Some((column.clone(), *or))
607 } else {
608 None
609 }
610 })
611 .collect();
612 let no_severe_outliers = outlier_issues.iter().all(|(_, r)| *r < 0.1);
613 items.push(
614 ChecklistItem::new(
615 id,
616 "No columns with >10% outliers",
617 Severity::Medium,
618 no_severe_outliers,
619 )
620 .with_suggestion("Review outlier columns for data quality issues"),
621 );
622 id += 1;
623
624 let no_issues = report.issues.is_empty();
628 items.push(
629 ChecklistItem::new(id, "No quality warnings", Severity::Low, no_issues)
630 .with_suggestion("Address all warnings for best results"),
631 );
632 id += 1;
633
634 let low_cardinality_cols: Vec<String> = report
636 .columns
637 .iter()
638 .filter(|(_, c): &(&String, &ColumnQuality)| c.unique_count < 10 && !c.is_constant())
639 .map(|(n, _)| n.clone())
640 .collect();
641 let good_cardinality = low_cardinality_cols.is_empty();
642 items.push(
643 ChecklistItem::new(
644 id,
645 "All columns have reasonable cardinality (>10 unique)",
646 Severity::Low,
647 good_cardinality,
648 )
649 .with_suggestion(format!(
650 "Low cardinality columns: {:?}",
651 low_cardinality_cols
652 )),
653 );
654 let _ = id; items
657}
658
#[cfg(test)]
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::uninlined_format_args,
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::redundant_clone,
    clippy::cast_lossless,
    clippy::redundant_closure_for_method_calls,
    clippy::too_many_lines,
    clippy::float_cmp,
    clippy::similar_names,
    clippy::needless_late_init,
    clippy::redundant_pattern_matching
)]
mod tests {
    //! Smoke tests for the `quality` CLI commands and the checklist builder.
    //!
    //! Fixtures are small Parquet files written into a temporary directory.
    //! Setup failures use `expect` so the underlying error is shown in the
    //! panic message.

    use std::sync::Arc;

    use arrow::{
        array::{Int32Array, StringArray},
        datatypes::{DataType, Field, Schema},
    };

    use super::*;
    use crate::ArrowDataset;

    /// Creates the temporary directory that holds a test's fixture files.
    /// The returned guard must stay alive for as long as those files are used.
    fn temp_dir() -> tempfile::TempDir {
        tempfile::tempdir().expect("Should create temp dir")
    }

    /// Builds a batch with an `id` column (0..rows) and a `name` column whose
    /// values are `<name_prefix>_<id>`, so every row is distinct.
    fn id_name_batch(rows: usize, name_prefix: &str) -> arrow::array::RecordBatch {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, false),
        ]));

        let ids: Vec<i32> = (0..rows as i32).collect();
        let names: Vec<String> = ids
            .iter()
            .map(|i| format!("{}_{}", name_prefix, i))
            .collect();

        arrow::array::RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int32Array::from(ids)),
                Arc::new(StringArray::from(names)),
            ],
        )
        .expect("Should create batch")
    }

    /// Builds a batch with an `id` column (0..rows) and a `constant` column
    /// that holds 42 in every row.
    fn id_constant_batch(rows: usize) -> arrow::array::RecordBatch {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("constant", DataType::Int32, false),
        ]));

        let ids: Vec<i32> = (0..rows as i32).collect();
        let constants: Vec<i32> = vec![42; rows];

        arrow::array::RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int32Array::from(ids)),
                Arc::new(Int32Array::from(constants)),
            ],
        )
        .expect("Should create batch")
    }

    /// Writes `batch` to `path` as a Parquet file.
    fn write_parquet(batch: arrow::array::RecordBatch, path: &Path) {
        ArrowDataset::from_batch(batch)
            .expect("Should create dataset")
            .to_parquet(path)
            .expect("Should write parquet");
    }

    /// Writes the standard `id`/`name` fixture with `rows` rows to `path`.
    fn create_test_parquet(path: &Path, rows: usize) {
        write_parquet(id_name_batch(rows, "item"), path);
    }

    /// Creates a temp dir containing the standard fixture under `file_name`.
    /// Returns the directory guard together with the file path.
    fn parquet_fixture(file_name: &str, rows: usize) -> (tempfile::TempDir, PathBuf) {
        let dir = temp_dir();
        let path = dir.path().join(file_name);
        create_test_parquet(&path, rows);
        (dir, path)
    }

    /// Runs the quality checker on `batch` and builds the checklist for the
    /// `default` profile.
    fn default_profile_checklist(
        batch: arrow::array::RecordBatch,
    ) -> Vec<crate::quality::ChecklistItem> {
        use crate::quality::QualityProfile;

        let dataset = ArrowDataset::from_batch(batch).unwrap();
        let report = QualityChecker::new().check(&dataset).unwrap();
        let profile = QualityProfile::by_name("default").unwrap();
        build_checklist_from_report(&report, &profile)
    }

    #[test]
    fn test_cmd_quality_check_text() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_check(&path, 0.1, 0.05, true, "text").is_ok());
    }

    #[test]
    fn test_cmd_quality_check_json() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_check(&path, 0.1, 0.05, true, "json").is_ok());
    }

    #[test]
    fn test_cmd_quality_check_no_outliers() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_check(&path, 0.1, 0.05, false, "text").is_ok());
    }

    #[test]
    fn test_cmd_quality_report_basic() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_report(&path, None).is_ok());
    }

    #[test]
    fn test_cmd_quality_report_to_file() {
        let (dir, data_path) = parquet_fixture("data.parquet", 100);
        let output_path = dir.path().join("quality.json");

        let result = cmd_quality_report(&data_path, Some(&output_path));
        assert!(result.is_ok());
        assert!(output_path.exists());

        // The written file must be valid JSON with the top-level summary keys.
        let content = std::fs::read_to_string(&output_path).expect("Should read file");
        let parsed: serde_json::Value =
            serde_json::from_str(&content).expect("Should parse JSON");
        assert!(parsed.get("score").is_some());
        assert!(parsed.get("has_issues").is_some());
    }

    #[test]
    fn test_cmd_quality_check_with_constant_column() {
        let dir = temp_dir();
        let path = dir.path().join("data.parquet");
        write_parquet(id_constant_batch(100), &path);

        assert!(cmd_quality_check(&path, 0.1, 0.05, true, "text").is_ok());
    }

    #[test]
    fn test_cmd_quality_report_default_output() {
        let (_dir, path) = parquet_fixture("data.parquet", 50);
        assert!(cmd_quality_report(&path, None).is_ok());
    }

    #[test]
    fn test_cmd_quality_report_with_output() {
        let (dir, path) = parquet_fixture("data.parquet", 50);
        // The report content is JSON whatever the file extension is.
        let output = dir.path().join("report.html");

        let result = cmd_quality_report(&path, Some(&output));
        assert!(result.is_ok());
        assert!(output.exists());
    }

    #[test]
    fn test_cmd_quality_score() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "default", false, false, false).is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_json() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "default", false, true, false).is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_badge() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "default", false, false, true).is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_suggest() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "default", true, false, false).is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_doctest_profile() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "doctest-corpus", false, false, false).is_ok());
    }

    #[test]
    fn test_cmd_quality_profiles() {
        assert!(cmd_quality_profiles().is_ok());
    }

    #[test]
    fn test_cmd_quality_check_with_high_null_threshold() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        // A non-default duplicate threshold also exercises the warning path.
        assert!(cmd_quality_check(&path, 0.9, 0.9, true, "text").is_ok());
    }

    #[test]
    fn test_cmd_quality_check_small_dataset() {
        let (_dir, path) = parquet_fixture("small.parquet", 5);
        assert!(cmd_quality_check(&path, 0.1, 0.05, true, "text").is_ok());
    }

    #[test]
    fn test_cmd_quality_check_large_dataset() {
        let (_dir, path) = parquet_fixture("large.parquet", 500);
        assert!(cmd_quality_check(&path, 0.1, 0.05, false, "json").is_ok());
    }

    #[test]
    fn test_cmd_quality_score_ml_training_profile() {
        let (_dir, path) = parquet_fixture("ml.parquet", 150);
        assert!(cmd_quality_score(&path, "ml-training", false, false, false).is_ok());
    }

    #[test]
    fn test_cmd_quality_score_invalid_profile() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);
        assert!(cmd_quality_score(&path, "nonexistent-profile", false, false, false).is_err());
    }

    #[test]
    fn test_cmd_quality_score_all_output_modes() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        // Text, JSON, and badge output in turn.
        assert!(cmd_quality_score(&path, "default", false, false, false).is_ok());
        assert!(cmd_quality_score(&path, "default", false, true, false).is_ok());
        assert!(cmd_quality_score(&path, "default", false, false, true).is_ok());
    }

    #[test]
    fn test_cmd_quality_report_to_stdout() {
        let (_dir, path) = parquet_fixture("data.parquet", 50);
        assert!(cmd_quality_report(&path, None).is_ok());
    }

    // Despite the name, the dataset here has a single row rather than none.
    #[test]
    fn test_build_checklist_from_report_empty_dataset() {
        use crate::quality::Severity;

        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
        let batch =
            arrow::array::RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1]))])
                .unwrap();

        let checklist = default_profile_checklist(batch);
        assert!(!checklist.is_empty());

        // The checklist must contain at least one item of every severity.
        for severity in [
            Severity::Critical,
            Severity::High,
            Severity::Medium,
            Severity::Low,
        ] {
            assert!(checklist.iter().any(|item| item.severity == severity));
        }
    }

    #[test]
    fn test_build_checklist_high_quality_dataset() {
        let checklist = default_profile_checklist(id_name_batch(200, "name"));

        // A clean dataset should pass more than half of the checks.
        let passed = checklist.iter().filter(|item| item.passed).count();
        assert!(passed > checklist.len() / 2);
    }

    #[test]
    fn test_cmd_quality_check_with_issues() {
        let dir = temp_dir();
        let path = dir.path().join("issues.parquet");
        write_parquet(id_constant_batch(50), &path);

        assert!(cmd_quality_check(&path, 0.1, 0.05, true, "text").is_ok());
    }
}