1use std::path::{Path, PathBuf};
4
5use clap::Subcommand;
6
7use super::basic::load_dataset;
8use crate::quality::{ColumnQuality, QualityChecker};
9
10#[derive(Subcommand)]
12pub enum QualityCommands {
13 Check {
15 path: PathBuf,
17 #[arg(long, default_value = "0.1")]
19 null_threshold: f64,
20 #[arg(long, default_value = "0.05")]
22 duplicate_threshold: f64,
23 #[arg(long, default_value = "true")]
25 detect_outliers: bool,
26 #[arg(short, long, default_value = "text")]
28 format: String,
29 },
30 Report {
32 path: PathBuf,
34 #[arg(short, long)]
36 output: Option<PathBuf>,
37 },
38 Score {
40 path: PathBuf,
42 #[arg(short, long, default_value = "default")]
45 profile: String,
46 #[arg(long)]
48 suggest: bool,
49 #[arg(long)]
51 json: bool,
52 #[arg(long)]
54 badge: bool,
55 },
56 Profiles,
58}
59
60pub(crate) fn cmd_quality_check(
62 path: &Path,
63 null_threshold: f64,
64 duplicate_threshold: f64,
65 detect_outliers: bool,
66 format: &str,
67) -> crate::Result<()> {
68 let dataset = load_dataset(path)?;
69
70 if (duplicate_threshold - 0.05_f64).abs() > f64::EPSILON {
72 eprintln!(
73 "Warning: --duplicate-threshold {duplicate_threshold} is not yet implemented. Using default behavior."
74 );
75 }
76 let _ = duplicate_threshold;
77 let mut checker = QualityChecker::new();
78
79 if !detect_outliers {
80 checker = checker.with_outlier_check(false);
81 }
82
83 let report = checker.check(&dataset)?;
84
85 if format == "json" {
86 let json = serde_json::json!({
87 "path": path.display().to_string(),
88 "rows": report.row_count,
89 "columns": report.column_count,
90 "has_issues": !report.issues.is_empty(),
91 "score": report.score,
92 "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
93 "column_qualities": report.columns.iter().map(|(name, c)| {
94 serde_json::json!({
95 "column": name,
96 "null_ratio": c.null_ratio,
97 "unique_count": c.unique_count,
98 "is_constant": c.is_constant(),
99 "is_mostly_null": c.null_ratio > null_threshold,
100 })
101 }).collect::<Vec<_>>()
102 });
103 println!(
104 "{}",
105 serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?
106 );
107 } else {
108 println!("Data Quality Report");
109 println!("===================");
110 println!("File: {}", path.display());
111 println!("Rows: {}", report.row_count);
112 println!("Columns: {}", report.column_count);
113 println!();
114
115 println!("Quality Score: {:.1}%", report.score);
116 println!();
117
118 if report.issues.is_empty() {
119 println!("\u{2713} No quality issues found\n");
120 } else {
121 println!("Issues Found:");
122 println!("-------------");
123 for issue in &report.issues {
124 println!(" - {:?}", issue);
125 }
126 println!();
127 }
128
129 println!(
130 "{:<20} {:<12} {:<12} {:<10}",
131 "COLUMN", "NULL %", "UNIQUE", "STATUS"
132 );
133 println!("{}", "-".repeat(60));
134
135 for (name, col) in &report.columns {
136 let status = if col.is_constant() {
137 "CONSTANT"
138 } else if col.null_ratio > null_threshold {
139 "HIGH NULL"
140 } else {
141 "OK"
142 };
143
144 println!(
145 "{:<20} {:<12.2} {:<12} {:<10}",
146 name,
147 col.null_ratio * 100.0,
148 col.unique_count,
149 status
150 );
151 }
152 }
153
154 Ok(())
155}
156
157pub(crate) fn cmd_quality_report(path: &Path, output: Option<&Path>) -> crate::Result<()> {
159 let dataset = load_dataset(path)?;
160 let report = QualityChecker::new().check(&dataset)?;
161
162 let json = serde_json::json!({
163 "path": path.display().to_string(),
164 "rows": report.row_count,
165 "columns": report.column_count,
166 "has_issues": !report.issues.is_empty(),
167 "score": report.score,
168 "issues": report.issues.iter().map(|i| format!("{:?}", i)).collect::<Vec<_>>(),
169 "column_qualities": report.columns.iter().map(|(name, c)| {
170 serde_json::json!({
171 "column": name,
172 "null_ratio": c.null_ratio,
173 "unique_count": c.unique_count,
174 "is_constant": c.is_constant(),
175 })
176 }).collect::<Vec<_>>()
177 });
178
179 let json_str =
180 serde_json::to_string_pretty(&json).map_err(|e| crate::Error::Format(e.to_string()))?;
181
182 if let Some(output_path) = output {
183 std::fs::write(output_path, &json_str).map_err(|e| crate::Error::io(e, output_path))?;
184 println!("Quality report written to: {}", output_path.display());
185 } else {
186 println!("{}", json_str);
187 }
188
189 Ok(())
190}
191
192#[allow(clippy::too_many_lines)]
197pub(crate) fn cmd_quality_score(
198 path: &Path,
199 profile_name: &str,
200 suggest: bool,
201 json_output: bool,
202 badge_output: bool,
203) -> crate::Result<()> {
204 use crate::quality::{QualityProfile, QualityScore};
205
206 let profile = QualityProfile::by_name(profile_name).ok_or_else(|| {
208 crate::Error::Format(format!(
209 "Unknown quality profile '{}'. Available: {:?}",
210 profile_name,
211 QualityProfile::available_profiles()
212 ))
213 })?;
214
215 let dataset = load_dataset(path)?;
216 let report = QualityChecker::new().check(&dataset)?;
217
218 let checklist = build_checklist_from_report(&report, &profile);
220 let score = QualityScore::from_checklist(checklist);
221
222 if badge_output {
224 println!("{}", score.badge_url());
225 } else if json_output {
226 println!("{}", score.to_json());
227 } else {
228 print_text_report(&score, &profile, path, suggest);
229 }
230
231 if score.has_critical_failures() {
233 std::process::exit(1);
234 }
235
236 Ok(())
237}
238
239fn print_text_report(
241 score: &crate::quality::QualityScore,
242 profile: &crate::quality::QualityProfile,
243 path: &Path,
244 suggest: bool,
245) {
246 let grade_symbol = match score.grade {
247 crate::quality::LetterGrade::A | crate::quality::LetterGrade::B => "\u{2713}",
248 crate::quality::LetterGrade::C => "\u{25CB}",
249 crate::quality::LetterGrade::D => "\u{25B3}",
250 crate::quality::LetterGrade::F => "\u{2717}",
251 };
252
253 let separator = "\u{2550}".repeat(63);
254 println!("{separator}");
255 println!(
256 " Data Quality Score: {} {} ({:.1}%) ",
257 grade_symbol, score.grade, score.score
258 );
259 println!(" Profile: {} ", profile.name);
260 println!(" Decision: {} ", score.grade.publication_decision());
261 println!("{separator}");
262 println!();
263 println!("File: {}", path.display());
264 println!(
265 "Points: {:.1} / {:.1}",
266 score.points_earned, score.max_points
267 );
268 println!();
269
270 print_severity_breakdown(score);
271 print_critical_failures(score, suggest);
272
273 if suggest {
274 print_other_issues(score);
275 }
276}
277
278fn print_severity_breakdown(score: &crate::quality::QualityScore) {
280 use crate::quality::Severity;
281
282 println!("Severity Breakdown:");
283 for severity in [
284 Severity::Critical,
285 Severity::High,
286 Severity::Medium,
287 Severity::Low,
288 ] {
289 if let Some(stats) = score.severity_breakdown.get(&severity) {
290 let status = if stats.failed == 0 {
291 "\u{2713}"
292 } else {
293 "\u{2717}"
294 };
295 println!(
296 " {} {:8}: {}/{} passed ({:.1}/{:.1} pts)",
297 status,
298 format!("{}", severity),
299 stats.passed,
300 stats.total,
301 stats.points_earned,
302 stats.max_points
303 );
304 }
305 }
306 println!();
307}
308
309fn print_critical_failures(score: &crate::quality::QualityScore, suggest: bool) {
311 let critical_failures = score.critical_failures();
312 if !critical_failures.is_empty() {
313 println!("CRITICAL FAILURES (blocks publication):");
314 for item in critical_failures {
315 println!(" \u{2717} #{}: {}", item.id, item.description);
316 if suggest {
317 if let Some(ref suggestion) = item.suggestion {
318 println!(" \u{2192} {}", suggestion);
319 }
320 }
321 }
322 println!();
323 }
324}
325
326fn print_other_issues(score: &crate::quality::QualityScore) {
328 use crate::quality::Severity;
329
330 let failed = score.failed_items();
331 let non_critical: Vec<_> = failed
332 .iter()
333 .filter(|i| i.severity != Severity::Critical)
334 .collect();
335
336 if !non_critical.is_empty() {
337 println!("Other Issues ({}):", non_critical.len());
338 for item in non_critical {
339 let sev = match item.severity {
340 Severity::High => "[HIGH]",
341 Severity::Medium => "[MED]",
342 Severity::Low => "[LOW]",
343 Severity::Critical => "[CRIT]",
344 };
345 println!(" {} #{}: {}", sev, item.id, item.description);
346 if let Some(ref suggestion) = item.suggestion {
347 println!(" \u{2192} {}", suggestion);
348 }
349 }
350 }
351}
352
353#[allow(clippy::unnecessary_wraps)]
355pub(crate) fn cmd_quality_profiles() -> crate::Result<()> {
356 use crate::quality::QualityProfile;
357
358 println!("Available Quality Profiles");
359 println!("\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}\u{2550}");
360 println!();
361
362 for name in QualityProfile::available_profiles() {
363 if let Some(profile) = QualityProfile::by_name(name) {
364 println!(" {} - {}", profile.name, profile.description);
365 if !profile.expected_constant_columns.is_empty() {
366 let cols: Vec<_> = profile.expected_constant_columns.iter().collect();
367 println!(" Expected constants: {:?}", cols);
368 }
369 if !profile.nullable_columns.is_empty() {
370 let cols: Vec<_> = profile.nullable_columns.iter().collect();
371 println!(" Nullable columns: {:?}", cols);
372 }
373 println!(" Max null ratio: {:.0}%", profile.max_null_ratio * 100.0);
374 println!(
375 " Max duplicate ratio: {:.0}%",
376 profile.max_duplicate_ratio * 100.0
377 );
378 println!();
379 }
380 }
381
382 println!("Usage: alimentar quality score <path> --profile <name>");
383 Ok(())
384}
385
386#[allow(clippy::too_many_lines)]
391pub(crate) fn build_checklist_from_report(
392 report: &crate::quality::QualityReport,
393 profile: &crate::quality::QualityProfile,
394) -> Vec<crate::quality::ChecklistItem> {
395 use crate::quality::{ChecklistItem, Severity};
396
397 let mut items = Vec::new();
398 let mut id: u8 = 1;
399
400 let has_rows = report.row_count > 0;
404 items.push(
405 ChecklistItem::new(id, "Dataset contains rows", Severity::Critical, has_rows)
406 .with_suggestion("Extract more doctests or check input source"),
407 );
408 id += 1;
409
410 let has_columns = report.column_count > 0;
412 items.push(
413 ChecklistItem::new(
414 id,
415 "Schema has columns defined",
416 Severity::Critical,
417 has_columns,
418 )
419 .with_suggestion("Verify parser is extracting fields correctly"),
420 );
421 id += 1;
422
423 let unexpected_constant_cols: Vec<String> = report
428 .columns
429 .iter()
430 .filter(|(name, c): &(&String, &ColumnQuality)| {
431 c.is_constant() && !profile.is_expected_constant(name) && !profile.is_nullable(name)
432 })
433 .map(|(n, _)| n.clone())
434 .collect();
435 let no_unexpected_constants = unexpected_constant_cols.is_empty();
436 items.push(
437 ChecklistItem::new(
438 id,
439 "No unexpected constant columns (zero variance)",
440 Severity::Critical,
441 no_unexpected_constants,
442 )
443 .with_suggestion(format!(
444 "Remove or investigate constant columns: {:?}",
445 unexpected_constant_cols
446 )),
447 );
448 id += 1;
449
450 let duplicate_ratio = report
454 .issues
455 .iter()
456 .find_map(|i| {
457 if let crate::quality::QualityIssue::DuplicateRows {
458 duplicate_ratio: dr,
459 ..
460 } = i
461 {
462 Some(*dr)
463 } else {
464 None
465 }
466 })
467 .unwrap_or(0.0);
468 let low_duplicates = duplicate_ratio <= 0.05;
469 items.push(
470 ChecklistItem::new(
471 id,
472 format!(
473 "Duplicate ratio <= 5% (actual: {:.1}%)",
474 duplicate_ratio * 100.0
475 ),
476 Severity::High,
477 low_duplicates,
478 )
479 .with_suggestion("Run deduplication: alimentar dedupe <file>"),
480 );
481 id += 1;
482
483 let high_null_cols: Vec<String> = report
485 .columns
486 .iter()
487 .filter(|(name, c): &(&String, &ColumnQuality)| {
488 c.null_ratio > 0.5 && !profile.is_nullable(name)
489 })
490 .map(|(n, _)| n.clone())
491 .collect();
492 let no_high_null = high_null_cols.is_empty();
493 items.push(
494 ChecklistItem::new(
495 id,
496 "No columns with >50% null values",
497 Severity::High,
498 no_high_null,
499 )
500 .with_suggestion(format!(
501 "Investigate high-null columns: {:?}",
502 high_null_cols
503 )),
504 );
505 id += 1;
506
507 let min_rows = report.row_count >= 100;
509 items.push(
510 ChecklistItem::new(
511 id,
512 format!("Minimum 100 rows (actual: {})", report.row_count),
513 Severity::High,
514 min_rows,
515 )
516 .with_suggestion("Extract more data or combine with other sources"),
517 );
518 id += 1;
519
520 let good_score = report.score >= 70.0;
524 items.push(
525 ChecklistItem::new(
526 id,
527 format!("Quality score >= 70% (actual: {:.1}%)", report.score),
528 Severity::Medium,
529 good_score,
530 )
531 .with_suggestion("Address issues reported by quality check"),
532 );
533 id += 1;
534
535 let moderate_null_cols: Vec<String> = report
538 .columns
539 .iter()
540 .filter(|(name, c): &(&String, &ColumnQuality)| {
541 c.null_ratio > 0.1 && c.null_ratio <= 0.5 && !profile.is_nullable(name)
542 })
543 .map(|(n, _)| n.clone())
544 .collect();
545 let low_null_ratio = moderate_null_cols.is_empty();
546 items.push(
547 ChecklistItem::new(
548 id,
549 "No columns with >10% null values",
550 Severity::Medium,
551 low_null_ratio,
552 )
553 .with_suggestion(format!("Consider imputation for: {:?}", moderate_null_cols)),
554 );
555 id += 1;
556
557 let enough_columns = report.column_count >= 2;
559 items.push(
560 ChecklistItem::new(
561 id,
562 format!("At least 2 columns (actual: {})", report.column_count),
563 Severity::Medium,
564 enough_columns,
565 )
566 .with_suggestion("Ensure input and target columns are present"),
567 );
568 id += 1;
569
570 let outlier_issues: Vec<(String, f64)> = report
572 .issues
573 .iter()
574 .filter_map(|i| {
575 if let crate::quality::QualityIssue::OutliersDetected {
576 column,
577 outlier_ratio: or,
578 ..
579 } = i
580 {
581 Some((column.clone(), *or))
582 } else {
583 None
584 }
585 })
586 .collect();
587 let no_severe_outliers = outlier_issues.iter().all(|(_, r)| *r < 0.1);
588 items.push(
589 ChecklistItem::new(
590 id,
591 "No columns with >10% outliers",
592 Severity::Medium,
593 no_severe_outliers,
594 )
595 .with_suggestion("Review outlier columns for data quality issues"),
596 );
597 id += 1;
598
599 let no_issues = report.issues.is_empty();
603 items.push(
604 ChecklistItem::new(id, "No quality warnings", Severity::Low, no_issues)
605 .with_suggestion("Address all warnings for best results"),
606 );
607 id += 1;
608
609 let low_cardinality_cols: Vec<String> = report
611 .columns
612 .iter()
613 .filter(|(_, c): &(&String, &ColumnQuality)| c.unique_count < 10 && !c.is_constant())
614 .map(|(n, _)| n.clone())
615 .collect();
616 let good_cardinality = low_cardinality_cols.is_empty();
617 items.push(
618 ChecklistItem::new(
619 id,
620 "All columns have reasonable cardinality (>10 unique)",
621 Severity::Low,
622 good_cardinality,
623 )
624 .with_suggestion(format!(
625 "Low cardinality columns: {:?}",
626 low_cardinality_cols
627 )),
628 );
629 let _ = id; items
632}
633
// Smoke tests for the quality command handlers plus checks on the generated
// checklist. Each command test writes a small Parquet fixture into a
// temporary directory.
#[cfg(test)]
#[allow(
    clippy::cast_possible_truncation,
    clippy::cast_possible_wrap,
    clippy::cast_precision_loss,
    clippy::uninlined_format_args,
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::redundant_clone,
    clippy::cast_lossless,
    clippy::redundant_closure_for_method_calls,
    clippy::too_many_lines,
    clippy::float_cmp,
    clippy::similar_names,
    clippy::needless_late_init,
    clippy::redundant_pattern_matching
)]
mod tests {
    use std::sync::Arc;

    use arrow::{
        array::{Int32Array, StringArray},
        datatypes::{DataType, Field, Schema},
    };

    use super::*;
    use crate::ArrowDataset;

    /// Builds a dataset with `rows` rows: a sequential `id` column and a
    /// `name` column holding `item_<id>`.
    fn id_name_dataset(rows: usize) -> ArrowDataset {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, false),
        ]));

        let ids: Vec<i32> = (0..rows as i32).collect();
        let names: Vec<String> = ids.iter().map(|i| format!("item_{}", i)).collect();

        let batch = arrow::array::RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int32Array::from(ids)),
                Arc::new(StringArray::from(names)),
            ],
        )
        .expect("Should create batch");

        ArrowDataset::from_batch(batch).expect("Should create dataset")
    }

    /// Builds a dataset with `rows` rows: a sequential `id` column and a
    /// `constant` column that is always 42.
    fn constant_column_dataset(rows: usize) -> ArrowDataset {
        let schema = Arc::new(Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("constant", DataType::Int32, false),
        ]));

        let ids: Vec<i32> = (0..rows as i32).collect();
        let constants: Vec<i32> = vec![42; rows];

        let batch = arrow::array::RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int32Array::from(ids)),
                Arc::new(Int32Array::from(constants)),
            ],
        )
        .expect("Should create batch");

        ArrowDataset::from_batch(batch).expect("Should create dataset")
    }

    /// Writes the id/name fixture with `rows` rows to `path`.
    fn create_test_parquet(path: &Path, rows: usize) {
        id_name_dataset(rows)
            .to_parquet(path)
            .expect("Should write parquet");
    }

    /// Creates a temporary directory that is removed when the value drops.
    fn temp_dir() -> tempfile::TempDir {
        tempfile::tempdir().expect("Should create temp dir")
    }

    /// Creates a temp dir containing the id/name fixture under `file_name`.
    ///
    /// The returned `TempDir` must be kept alive for as long as the path is
    /// used; bind it to a named variable such as `_dir`, not to `_`.
    fn parquet_fixture(file_name: &str, rows: usize) -> (tempfile::TempDir, PathBuf) {
        let dir = temp_dir();
        let path = dir.path().join(file_name);
        create_test_parquet(&path, rows);
        (dir, path)
    }

    /// Same as `parquet_fixture`, but with the id/constant dataset.
    fn constant_parquet_fixture(file_name: &str, rows: usize) -> (tempfile::TempDir, PathBuf) {
        let dir = temp_dir();
        let path = dir.path().join(file_name);
        constant_column_dataset(rows)
            .to_parquet(&path)
            .expect("Should write parquet");
        (dir, path)
    }

    /// Runs the default checker on `dataset` and builds the checklist for the
    /// `default` profile.
    fn default_checklist(dataset: &ArrowDataset) -> Vec<crate::quality::ChecklistItem> {
        use crate::quality::QualityProfile;

        let report = QualityChecker::new()
            .check(dataset)
            .expect("Should check dataset");
        let profile = QualityProfile::by_name("default").expect("Should find default profile");
        build_checklist_from_report(&report, &profile)
    }

    #[test]
    fn test_cmd_quality_check_text() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_check_json() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_check(&path, 0.1, 0.05, true, "json");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_check_no_outliers() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_check(&path, 0.1, 0.05, false, "text");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_report_basic() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_report(&path, None);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_report_to_file() {
        let (dir, data_path) = parquet_fixture("data.parquet", 100);
        let output_path = dir.path().join("quality.json");

        let result = cmd_quality_report(&data_path, Some(&output_path));
        assert!(result.is_ok());
        assert!(output_path.exists());

        // The file must contain valid JSON with the top-level report keys.
        let content = std::fs::read_to_string(&output_path).expect("Should read file");
        let parsed: serde_json::Value =
            serde_json::from_str(&content).expect("Should parse JSON");
        assert!(parsed.get("score").is_some());
        assert!(parsed.get("has_issues").is_some());
    }

    #[test]
    fn test_cmd_quality_check_with_constant_column() {
        let (_dir, path) = constant_parquet_fixture("data.parquet", 100);

        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_report_default_output() {
        let (_dir, path) = parquet_fixture("data.parquet", 50);

        let result = cmd_quality_report(&path, None);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_report_with_output() {
        let (dir, path) = parquet_fixture("data.parquet", 50);
        // The extension is irrelevant: the report is always written as JSON.
        let output = dir.path().join("report.html");

        let result = cmd_quality_report(&path, Some(&output));
        assert!(result.is_ok());
        assert!(output.exists());
    }

    #[test]
    fn test_cmd_quality_score() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "default", false, false, false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_json() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "default", false, true, false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_badge() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "default", false, false, true);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_suggest() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "default", true, false, false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_with_doctest_profile() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "doctest-corpus", false, false, false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_profiles() {
        let result = cmd_quality_profiles();
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_check_with_high_null_threshold() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        // A non-default duplicate threshold also exercises the warning path.
        let result = cmd_quality_check(&path, 0.9, 0.9, true, "text");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_check_small_dataset() {
        let (_dir, path) = parquet_fixture("small.parquet", 5);

        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_check_large_dataset() {
        let (_dir, path) = parquet_fixture("large.parquet", 500);

        let result = cmd_quality_check(&path, 0.1, 0.05, false, "json");
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_ml_training_profile() {
        let (_dir, path) = parquet_fixture("ml.parquet", 150);

        let result = cmd_quality_score(&path, "ml-training", false, false, false);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_score_invalid_profile() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        let result = cmd_quality_score(&path, "nonexistent-profile", false, false, false);
        assert!(result.is_err());
    }

    #[test]
    fn test_cmd_quality_score_all_output_modes() {
        let (_dir, path) = parquet_fixture("data.parquet", 100);

        // Text output.
        let result = cmd_quality_score(&path, "default", false, false, false);
        assert!(result.is_ok());

        // JSON output.
        let result = cmd_quality_score(&path, "default", false, true, false);
        assert!(result.is_ok());

        // Badge output.
        let result = cmd_quality_score(&path, "default", false, false, true);
        assert!(result.is_ok());
    }

    #[test]
    fn test_cmd_quality_report_to_stdout() {
        let (_dir, path) = parquet_fixture("data.parquet", 50);

        let result = cmd_quality_report(&path, None);
        assert!(result.is_ok());
    }

    #[test]
    fn test_build_checklist_from_report_empty_dataset() {
        use crate::quality::Severity;

        // Despite the test name, the dataset holds a single row in a single
        // column, not zero rows.
        let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
        let batch =
            arrow::array::RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1]))])
                .unwrap();
        let dataset = ArrowDataset::from_batch(batch).unwrap();

        let checklist = default_checklist(&dataset);

        assert!(!checklist.is_empty());

        // The checklist must contain at least one item of every severity.
        let has_critical = checklist
            .iter()
            .any(|i| i.severity == Severity::Critical);
        let has_high = checklist.iter().any(|i| i.severity == Severity::High);
        let has_medium = checklist.iter().any(|i| i.severity == Severity::Medium);
        let has_low = checklist.iter().any(|i| i.severity == Severity::Low);

        assert!(has_critical);
        assert!(has_high);
        assert!(has_medium);
        assert!(has_low);
    }

    #[test]
    fn test_build_checklist_high_quality_dataset() {
        // 200 rows with unique values in both columns.
        let dataset = id_name_dataset(200);

        let checklist = default_checklist(&dataset);

        // A clean dataset should pass more than half of the checks.
        let passed = checklist.iter().filter(|i| i.passed).count();
        assert!(passed > checklist.len() / 2);
    }

    #[test]
    fn test_cmd_quality_check_with_issues() {
        // The constant column is what makes the checker report an issue.
        let (_dir, path) = constant_parquet_fixture("issues.parquet", 50);

        let result = cmd_quality_check(&path, 0.1, 0.05, true, "text");
        assert!(result.is_ok());
    }
}