1#![allow(clippy::cast_precision_loss)]
44#![allow(clippy::suboptimal_flops)]
45#![allow(clippy::unused_self)]
46#![allow(clippy::if_not_else)]
47
48use std::{
49 collections::{HashMap, HashSet},
50 fmt,
51};
52
53use crate::{
54 dataset::{ArrowDataset, Dataset},
55 error::Result,
56};
57
/// Severity level attached to a checklist item.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Severity {
    Critical,
    High,
    Medium,
    Low,
}

impl Severity {
    /// Multiplier applied when weighting items of this severity.
    #[must_use]
    pub fn weight(&self) -> f64 {
        // Currently the same scale as `base_points`; kept as a separate
        // method so the two notions can diverge independently later.
        self.base_points()
    }

    /// Points a checklist item of this severity is worth when it passes.
    #[must_use]
    pub fn base_points(&self) -> f64 {
        match self {
            Self::Critical => 2.0,
            Self::High => 1.5,
            Self::Medium => 1.0,
            Self::Low => 0.5,
        }
    }
}

impl fmt::Display for Severity {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Critical => "Critical",
            Self::High => "High",
            Self::Medium => "Medium",
            Self::Low => "Low",
        };
        f.write_str(label)
    }
}
109
/// Letter grade derived from a 0-100 quality score.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum LetterGrade {
    A,
    B,
    C,
    D,
    F,
}

impl LetterGrade {
    /// Maps a 0-100 score to a grade: A >= 95, B >= 85, C >= 70, D >= 50,
    /// everything else (including NaN) is F.
    #[must_use]
    pub fn from_score(score: f64) -> Self {
        if score >= 95.0 {
            Self::A
        } else if score >= 85.0 {
            Self::B
        } else if score >= 70.0 {
            Self::C
        } else if score >= 50.0 {
            Self::D
        } else {
            Self::F
        }
    }

    /// Human-readable publication recommendation for this grade.
    #[must_use]
    pub fn publication_decision(&self) -> &'static str {
        match self {
            Self::A => "Publish immediately",
            Self::B => "Publish with documented caveats",
            Self::C => "Remediation required before publication",
            Self::D => "Major rework needed",
            Self::F => "Do not publish",
        }
    }

    /// Only A and B grades are considered publishable as-is.
    #[must_use]
    pub fn is_publishable(&self) -> bool {
        matches!(self, Self::A | Self::B)
    }
}

impl fmt::Display for LetterGrade {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let letter = match self {
            Self::A => "A",
            Self::B => "B",
            Self::C => "C",
            Self::D => "D",
            Self::F => "F",
        };
        f.write_str(letter)
    }
}
168
169#[derive(Debug, Clone)]
171pub struct ChecklistItem {
172 pub id: u8,
174 pub description: String,
176 pub passed: bool,
178 pub severity: Severity,
180 pub suggestion: Option<String>,
182}
183
184impl ChecklistItem {
185 #[must_use]
187 pub fn new(id: u8, description: impl Into<String>, severity: Severity, passed: bool) -> Self {
188 Self {
189 id,
190 description: description.into(),
191 passed,
192 severity,
193 suggestion: None,
194 }
195 }
196
197 #[must_use]
199 pub fn with_suggestion(mut self, suggestion: impl Into<String>) -> Self {
200 self.suggestion = Some(suggestion.into());
201 self
202 }
203
204 #[must_use]
206 pub fn points_earned(&self) -> f64 {
207 if self.passed {
208 self.severity.base_points()
209 } else {
210 0.0
211 }
212 }
213
214 #[must_use]
216 pub fn max_points(&self) -> f64 {
217 self.severity.base_points()
218 }
219}
220
/// Overall quality result computed from a checklist of pass/fail items.
#[derive(Debug, Clone)]
pub struct QualityScore {
    /// Final score on a 0-100 scale (points earned / max points * 100).
    pub score: f64,
    /// Letter grade derived from `score`.
    pub grade: LetterGrade,
    /// Sum of points earned by items that passed.
    pub points_earned: f64,
    /// Sum of points available across all items, passed or not.
    pub max_points: f64,
    /// The evaluated checklist items, in their original order.
    pub checklist: Vec<ChecklistItem>,
    /// Pass/fail and point totals keyed by severity level.
    pub severity_breakdown: HashMap<Severity, SeverityStats>,
}

/// Pass/fail counts and point totals for a single severity level.
#[derive(Debug, Clone, Default)]
pub struct SeverityStats {
    /// Number of checklist items at this severity.
    pub total: usize,
    /// Items that passed.
    pub passed: usize,
    /// Items that failed.
    pub failed: usize,
    /// Points earned by the passing items.
    pub points_earned: f64,
    /// Maximum points available at this severity.
    pub max_points: f64,
}
252
253impl QualityScore {
254 #[must_use]
256 pub fn from_checklist(checklist: Vec<ChecklistItem>) -> Self {
257 let mut severity_breakdown: HashMap<Severity, SeverityStats> = HashMap::new();
258
259 let mut points_earned = 0.0;
260 let mut max_points = 0.0;
261
262 for item in &checklist {
263 let stats = severity_breakdown.entry(item.severity).or_default();
264
265 stats.total += 1;
266 stats.max_points += item.max_points();
267
268 if item.passed {
269 stats.passed += 1;
270 stats.points_earned += item.points_earned();
271 points_earned += item.points_earned();
272 } else {
273 stats.failed += 1;
274 }
275
276 max_points += item.max_points();
277 }
278
279 let score = if max_points > 0.0 {
280 (points_earned / max_points * 100.0).clamp(0.0, 100.0)
281 } else {
282 100.0
283 };
284
285 let grade = LetterGrade::from_score(score);
286
287 Self {
288 score,
289 grade,
290 points_earned,
291 max_points,
292 checklist,
293 severity_breakdown,
294 }
295 }
296
297 #[must_use]
299 pub fn failed_items(&self) -> Vec<&ChecklistItem> {
300 self.checklist.iter().filter(|item| !item.passed).collect()
301 }
302
303 #[must_use]
305 pub fn critical_failures(&self) -> Vec<&ChecklistItem> {
306 self.checklist
307 .iter()
308 .filter(|item| !item.passed && item.severity == Severity::Critical)
309 .collect()
310 }
311
312 #[must_use]
314 pub fn has_critical_failures(&self) -> bool {
315 self.checklist
316 .iter()
317 .any(|item| !item.passed && item.severity == Severity::Critical)
318 }
319
320 #[must_use]
322 pub fn badge_url(&self) -> String {
323 let color = match self.grade {
324 LetterGrade::A => "brightgreen",
325 LetterGrade::B => "green",
326 LetterGrade::C => "yellow",
327 LetterGrade::D => "orange",
328 LetterGrade::F => "red",
329 };
330 format!(
331 "https://img.shields.io/badge/data_quality-{}_({:.0}%25)-{}",
332 self.grade, self.score, color
333 )
334 }
335
336 #[must_use]
338 pub fn to_json(&self) -> String {
339 let failed_items: Vec<_> = self
340 .failed_items()
341 .iter()
342 .map(|item| {
343 format!(
344 r#" {{"id": {}, "description": "{}", "severity": "{}", "suggestion": {}}}"#,
345 item.id,
346 item.description.replace('"', "\\\""),
347 item.severity,
348 item.suggestion
349 .as_ref()
350 .map(|s| format!("\"{}\"", s.replace('"', "\\\"")))
351 .unwrap_or_else(|| "null".to_string())
352 )
353 })
354 .collect();
355
356 format!(
357 r#"{{
358 "score": {:.2},
359 "grade": "{}",
360 "is_publishable": {},
361 "decision": "{}",
362 "points_earned": {:.2},
363 "max_points": {:.2},
364 "critical_failures": {},
365 "failed_items": [
366{}
367 ],
368 "badge_url": "{}"
369}}"#,
370 self.score,
371 self.grade,
372 self.grade.is_publishable(),
373 self.grade.publication_decision(),
374 self.points_earned,
375 self.max_points,
376 self.has_critical_failures(),
377 failed_items.join(",\n"),
378 self.badge_url()
379 )
380 }
381}
382
/// Named bundle of thresholds and column expectations for a dataset domain.
#[derive(Debug, Clone)]
pub struct QualityProfile {
    pub name: String,
    pub description: String,
    pub expected_constant_columns: HashSet<String>,
    pub nullable_columns: HashSet<String>,
    pub max_null_ratio: f64,
    pub max_duplicate_ratio: f64,
    pub min_cardinality: usize,
    pub max_outlier_ratio: f64,
    pub max_duplicate_row_ratio: f64,
    pub penalize_unexpected_constants: bool,
    pub require_signature: bool,
}

impl Default for QualityProfile {
    /// General-purpose defaults: 10% nulls, 50% duplicates, 5% outliers,
    /// 1% duplicate rows, cardinality of at least 2.
    fn default() -> Self {
        Self {
            name: "default".to_string(),
            description: "General-purpose quality profile".to_string(),
            expected_constant_columns: HashSet::new(),
            nullable_columns: HashSet::new(),
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,
            penalize_unexpected_constants: true,
            require_signature: false,
        }
    }
}

impl QualityProfile {
    /// Default profile under a custom name.
    #[must_use]
    pub fn new(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Self::default()
        }
    }

    /// Looks up a built-in profile by canonical name or short alias.
    #[must_use]
    pub fn by_name(name: &str) -> Option<Self> {
        match name {
            "default" => Some(Self::default()),
            "doctest" | "doctest-corpus" => Some(Self::doctest_corpus()),
            "ml" | "ml-training" => Some(Self::ml_training()),
            "timeseries" | "time-series" => Some(Self::time_series()),
            _ => None,
        }
    }

    /// Canonical names of every built-in profile.
    #[must_use]
    pub fn available_profiles() -> Vec<&'static str> {
        vec!["default", "doctest-corpus", "ml-training", "time-series"]
    }

    /// Profile for Python doctest extraction corpora: stricter null and
    /// duplicate limits, zero tolerance for duplicate rows, and known
    /// constant ("source", "version") / nullable ("signature") columns.
    #[must_use]
    pub fn doctest_corpus() -> Self {
        let expected: HashSet<String> = ["source", "version"]
            .iter()
            .map(|s| (*s).to_string())
            .collect();
        let nullable: HashSet<String> = std::iter::once("signature".to_string()).collect();

        Self {
            name: "doctest-corpus".to_string(),
            description: "Profile for Python doctest extraction datasets".to_string(),
            expected_constant_columns: expected,
            nullable_columns: nullable,
            max_null_ratio: 0.05,
            max_duplicate_ratio: 0.3,
            max_duplicate_row_ratio: 0.0,
            ..Self::default()
        }
    }

    /// Profile for machine learning training data: no nulls tolerated,
    /// generous duplicate allowance, looser outlier limit.
    #[must_use]
    pub fn ml_training() -> Self {
        Self {
            name: "ml-training".to_string(),
            description: "Profile for machine learning training datasets".to_string(),
            max_null_ratio: 0.0,
            max_duplicate_ratio: 0.8,
            max_outlier_ratio: 0.1,
            ..Self::default()
        }
    }

    /// Profile for time series: small null allowance, no duplicate rows,
    /// looser outlier limit.
    #[must_use]
    pub fn time_series() -> Self {
        Self {
            name: "time-series".to_string(),
            description: "Profile for time series datasets".to_string(),
            max_null_ratio: 0.05,
            max_outlier_ratio: 0.1,
            max_duplicate_row_ratio: 0.0,
            ..Self::default()
        }
    }

    /// Overrides the description (builder-style).
    #[must_use]
    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
        self.description = desc.into();
        self
    }

    /// Registers a column that is allowed to be constant.
    #[must_use]
    pub fn with_expected_constant(mut self, column: impl Into<String>) -> Self {
        self.expected_constant_columns.insert(column.into());
        self
    }

    /// Registers a column that is allowed to contain nulls.
    #[must_use]
    pub fn with_nullable(mut self, column: impl Into<String>) -> Self {
        self.nullable_columns.insert(column.into());
        self
    }

    /// Overrides the maximum null ratio.
    #[must_use]
    pub fn with_max_null_ratio(mut self, ratio: f64) -> Self {
        self.max_null_ratio = ratio;
        self
    }

    /// Overrides the maximum duplicate ratio.
    #[must_use]
    pub fn with_max_duplicate_ratio(mut self, ratio: f64) -> Self {
        self.max_duplicate_ratio = ratio;
        self
    }

    /// Whether `column` is expected to hold a single constant value.
    #[must_use]
    pub fn is_expected_constant(&self, column: &str) -> bool {
        self.expected_constant_columns.contains(column)
    }

    /// Whether `column` may legitimately contain nulls.
    #[must_use]
    pub fn is_nullable(&self, column: &str) -> bool {
        self.nullable_columns.contains(column)
    }

    /// Effective null threshold for a column: nullable columns get 1.0
    /// (effectively no limit); everything else the profile-wide maximum.
    #[must_use]
    pub fn null_threshold_for(&self, column: &str) -> f64 {
        if self.is_nullable(column) {
            1.0
        } else {
            self.max_null_ratio
        }
    }
}
605
/// A specific quality defect detected in a dataset.
#[derive(Debug, Clone, PartialEq)]
pub enum QualityIssue {
    HighNullRatio {
        column: String,
        null_ratio: f64,
        threshold: f64,
    },
    HighDuplicateRatio {
        column: String,
        duplicate_ratio: f64,
        threshold: f64,
    },
    LowCardinality {
        column: String,
        unique_count: usize,
        total_count: usize,
    },
    OutliersDetected {
        column: String,
        outlier_count: usize,
        outlier_ratio: f64,
    },
    DuplicateRows {
        duplicate_count: usize,
        duplicate_ratio: f64,
    },
    ConstantColumn {
        column: String,
        value: String,
    },
    EmptySchema,
    EmptyDataset,
}

impl QualityIssue {
    /// Numeric severity from 1 (minor) to 5 (fatal).
    pub fn severity(&self) -> u8 {
        match self {
            Self::EmptySchema | Self::EmptyDataset => 5,
            Self::ConstantColumn { .. } => 4,
            // More than half null is treated as severely as a constant column.
            Self::HighNullRatio { null_ratio, .. } => {
                if *null_ratio > 0.5 {
                    4
                } else {
                    3
                }
            }
            Self::OutliersDetected { outlier_ratio, .. } => {
                if *outlier_ratio > 0.1 {
                    3
                } else {
                    2
                }
            }
            Self::HighDuplicateRatio { .. } | Self::DuplicateRows { .. } => 2,
            Self::LowCardinality { .. } => 1,
        }
    }

    /// Name of the affected column, when the issue is column-scoped.
    pub fn column(&self) -> Option<&str> {
        match self {
            Self::HighNullRatio { column, .. }
            | Self::HighDuplicateRatio { column, .. }
            | Self::LowCardinality { column, .. }
            | Self::OutliersDetected { column, .. }
            | Self::ConstantColumn { column, .. } => Some(column),
            Self::DuplicateRows { .. } | Self::EmptySchema | Self::EmptyDataset => None,
        }
    }
}
697
/// Per-column quality measurements.
#[derive(Debug, Clone)]
pub struct ColumnQuality {
    pub name: String,
    pub total_count: usize,
    pub null_count: usize,
    pub null_ratio: f64,
    pub unique_count: usize,
    pub unique_ratio: f64,
    pub duplicate_count: usize,
    pub duplicate_ratio: f64,
    pub outlier_count: Option<usize>,
    pub numeric_stats: Option<NumericStats>,
}

impl ColumnQuality {
    /// A non-empty column with at most one distinct non-null value.
    pub fn is_constant(&self) -> bool {
        self.total_count > 0 && self.unique_count <= 1
    }

    /// True when the null fraction meets or exceeds `threshold`.
    pub fn is_mostly_null(&self, threshold: f64) -> bool {
        self.null_ratio >= threshold
    }
}

/// Summary statistics for a numeric column.
#[derive(Debug, Clone)]
pub struct NumericStats {
    pub min: f64,
    pub max: f64,
    pub mean: f64,
    pub std_dev: f64,
    pub q1: f64,
    pub median: f64,
    pub q3: f64,
}

impl NumericStats {
    /// Interquartile range (Q3 - Q1).
    pub fn iqr(&self) -> f64 {
        self.q3 - self.q1
    }

    /// Lower Tukey fence: Q1 - 1.5 * IQR.
    pub fn outlier_lower_bound(&self) -> f64 {
        let spread = 1.5 * self.iqr();
        self.q1 - spread
    }

    /// Upper Tukey fence: Q3 + 1.5 * IQR.
    pub fn outlier_upper_bound(&self) -> f64 {
        let spread = 1.5 * self.iqr();
        self.q3 + spread
    }
}
770
771#[derive(Debug, Clone)]
773pub struct QualityReport {
774 pub row_count: usize,
776 pub column_count: usize,
778 pub columns: HashMap<String, ColumnQuality>,
780 pub issues: Vec<QualityIssue>,
782 pub score: f64,
784 pub duplicate_row_count: usize,
786}
787
788impl QualityReport {
789 pub fn has_issues(&self) -> bool {
791 !self.issues.is_empty()
792 }
793
794 pub fn column_issues(&self, column: &str) -> Vec<&QualityIssue> {
796 self.issues
797 .iter()
798 .filter(|i| i.column() == Some(column))
799 .collect()
800 }
801
802 pub fn max_severity(&self) -> u8 {
804 self.issues.iter().map(|i| i.severity()).max().unwrap_or(0)
805 }
806
807 pub fn problematic_columns(&self) -> Vec<&str> {
809 self.issues
810 .iter()
811 .filter_map(|i| i.column())
812 .collect::<HashSet<_>>()
813 .into_iter()
814 .collect()
815 }
816}
817
/// Tunable limits consulted by `QualityChecker`.
#[derive(Debug, Clone)]
pub struct QualityThresholds {
    pub max_null_ratio: f64,
    pub max_duplicate_ratio: f64,
    pub min_cardinality: usize,
    pub max_outlier_ratio: f64,
    pub max_duplicate_row_ratio: f64,
}

impl Default for QualityThresholds {
    /// Moderate defaults suitable for general-purpose datasets.
    fn default() -> Self {
        QualityThresholds {
            max_null_ratio: 0.1,
            max_duplicate_ratio: 0.5,
            min_cardinality: 2,
            max_outlier_ratio: 0.05,
            max_duplicate_row_ratio: 0.01,
        }
    }
}
844
/// Runs configurable quality checks against an `ArrowDataset`.
pub struct QualityChecker {
    // Limits that decide when a measurement becomes a `QualityIssue`.
    thresholds: QualityThresholds,
    // When false, IQR-based numeric outlier detection is skipped.
    check_outliers: bool,
    // When false, whole-row duplicate counting is skipped.
    check_duplicates: bool,
}
851
impl Default for QualityChecker {
    /// Equivalent to `QualityChecker::new()`.
    fn default() -> Self {
        Self::new()
    }
}
857
impl QualityChecker {
    /// Creates a checker with default thresholds and both optional checks
    /// (outliers, duplicate rows) enabled.
    pub fn new() -> Self {
        Self {
            thresholds: QualityThresholds::default(),
            check_outliers: true,
            check_duplicates: true,
        }
    }

    /// Sets the maximum tolerated per-column null ratio (builder-style).
    #[must_use]
    pub fn max_null_ratio(mut self, ratio: f64) -> Self {
        self.thresholds.max_null_ratio = ratio;
        self
    }

    /// Sets the maximum tolerated per-column duplicate-value ratio.
    #[must_use]
    pub fn max_duplicate_ratio(mut self, ratio: f64) -> Self {
        self.thresholds.max_duplicate_ratio = ratio;
        self
    }

    /// Sets the minimum number of distinct values expected per column.
    #[must_use]
    pub fn min_cardinality(mut self, min: usize) -> Self {
        self.thresholds.min_cardinality = min;
        self
    }

    /// Sets the maximum tolerated numeric outlier ratio.
    #[must_use]
    pub fn max_outlier_ratio(mut self, ratio: f64) -> Self {
        self.thresholds.max_outlier_ratio = ratio;
        self
    }

    /// Enables or disables IQR-based outlier detection.
    #[must_use]
    pub fn with_outlier_check(mut self, enabled: bool) -> Self {
        self.check_outliers = enabled;
        self
    }

    /// Enables or disables whole-row duplicate detection.
    #[must_use]
    pub fn with_duplicate_check(mut self, enabled: bool) -> Self {
        self.check_duplicates = enabled;
        self
    }

    /// Runs all enabled checks against `dataset` and builds a report.
    ///
    /// Short-circuits with a single fatal issue (and score 0) for an empty
    /// schema or an empty dataset.
    ///
    /// # Errors
    ///
    /// No fallible operations appear below, so this currently always
    /// returns `Ok`; the `Result` mirrors the crate's API convention.
    pub fn check(&self, dataset: &ArrowDataset) -> Result<QualityReport> {
        let schema = dataset.schema();
        let mut issues = Vec::new();

        // A dataset with no columns cannot be analyzed at all.
        if schema.fields().is_empty() {
            issues.push(QualityIssue::EmptySchema);
            return Ok(QualityReport {
                row_count: 0,
                column_count: 0,
                columns: HashMap::new(),
                issues,
                score: 0.0,
                duplicate_row_count: 0,
            });
        }

        // Materialize every column as stringified values (None = null).
        let (column_data, row_count) = self.collect_data(dataset);

        if row_count == 0 {
            issues.push(QualityIssue::EmptyDataset);
            return Ok(QualityReport {
                row_count: 0,
                column_count: schema.fields().len(),
                columns: HashMap::new(),
                issues,
                score: 0.0,
                duplicate_row_count: 0,
            });
        }

        let mut columns = HashMap::new();
        for (col_name, values) in &column_data {
            let quality = self.analyze_column(col_name, values, row_count);

            if quality.null_ratio > self.thresholds.max_null_ratio {
                issues.push(QualityIssue::HighNullRatio {
                    column: col_name.clone(),
                    null_ratio: quality.null_ratio,
                    threshold: self.thresholds.max_null_ratio,
                });
            }

            if quality.duplicate_ratio > self.thresholds.max_duplicate_ratio {
                issues.push(QualityIssue::HighDuplicateRatio {
                    column: col_name.clone(),
                    duplicate_ratio: quality.duplicate_ratio,
                    threshold: self.thresholds.max_duplicate_ratio,
                });
            }

            // Cardinality only matters once there is more than one row.
            if quality.unique_count < self.thresholds.min_cardinality && row_count > 1 {
                issues.push(QualityIssue::LowCardinality {
                    column: col_name.clone(),
                    unique_count: quality.unique_count,
                    total_count: row_count,
                });
            }

            if quality.is_constant() {
                // Report the single repeated value; falls back to "" when
                // the column is entirely null.
                let value = values
                    .iter()
                    .find(|v| v.is_some())
                    .map(|v| v.clone().unwrap_or_default())
                    .unwrap_or_default();
                issues.push(QualityIssue::ConstantColumn {
                    column: col_name.clone(),
                    value,
                });
            }

            if let Some(outlier_count) = quality.outlier_count {
                let outlier_ratio = outlier_count as f64 / row_count as f64;
                if outlier_ratio > self.thresholds.max_outlier_ratio {
                    issues.push(QualityIssue::OutliersDetected {
                        column: col_name.clone(),
                        outlier_count,
                        outlier_ratio,
                    });
                }
            }

            columns.insert(col_name.clone(), quality);
        }

        let duplicate_row_count = if self.check_duplicates {
            self.count_duplicate_rows(&column_data, row_count)
        } else {
            0
        };

        let duplicate_row_ratio = duplicate_row_count as f64 / row_count as f64;
        if duplicate_row_ratio > self.thresholds.max_duplicate_row_ratio {
            issues.push(QualityIssue::DuplicateRows {
                duplicate_count: duplicate_row_count,
                duplicate_ratio: duplicate_row_ratio,
            });
        }

        let score = self.calculate_score(&columns, &issues, row_count);

        Ok(QualityReport {
            row_count,
            column_count: schema.fields().len(),
            columns,
            issues,
            score,
            duplicate_row_count,
        })
    }

    /// Flattens the dataset into per-column `Vec<Option<String>>` plus the
    /// total row count. Values of unsupported Arrow types are recorded as
    /// the "?" placeholder so row alignment is preserved.
    fn collect_data(
        &self,
        dataset: &ArrowDataset,
    ) -> (HashMap<String, Vec<Option<String>>>, usize) {
        use arrow::array::{Array, Float64Array, Int32Array, Int64Array, StringArray};

        let schema = dataset.schema();
        let mut data: HashMap<String, Vec<Option<String>>> = HashMap::new();
        let mut row_count = 0;

        for field in schema.fields() {
            data.insert(field.name().clone(), Vec::new());
        }

        for batch in dataset.iter() {
            row_count += batch.num_rows();

            for (col_idx, field) in schema.fields().iter().enumerate() {
                if let Some(col_data) = data.get_mut(field.name()) {
                    let array = batch.column(col_idx);

                    for i in 0..array.len() {
                        if array.is_null(i) {
                            col_data.push(None);
                        } else if let Some(arr) = array.as_any().downcast_ref::<StringArray>() {
                            col_data.push(Some(arr.value(i).to_string()));
                        } else if let Some(arr) = array.as_any().downcast_ref::<Int32Array>() {
                            col_data.push(Some(arr.value(i).to_string()));
                        } else if let Some(arr) = array.as_any().downcast_ref::<Int64Array>() {
                            col_data.push(Some(arr.value(i).to_string()));
                        } else if let Some(arr) = array.as_any().downcast_ref::<Float64Array>() {
                            col_data.push(Some(arr.value(i).to_string()));
                        } else {
                            // Unsupported Arrow type: keep a placeholder.
                            col_data.push(Some("?".to_string()));
                        }
                    }
                }
            }
        }

        (data, row_count)
    }

    /// Computes null/unique/duplicate ratios (and, when enabled, numeric
    /// stats) for one column of stringified values.
    fn analyze_column(
        &self,
        name: &str,
        values: &[Option<String>],
        total_count: usize,
    ) -> ColumnQuality {
        let null_count = values.iter().filter(|v| v.is_none()).count();
        let null_ratio = if total_count > 0 {
            null_count as f64 / total_count as f64
        } else {
            0.0
        };

        let non_null_values: Vec<&str> = values.iter().filter_map(|v| v.as_deref()).collect();
        let unique_set: HashSet<&str> = non_null_values.iter().copied().collect();
        let unique_count = unique_set.len();
        // Unique/duplicate ratios are relative to non-null values, not rows.
        let unique_ratio = if !non_null_values.is_empty() {
            unique_count as f64 / non_null_values.len() as f64
        } else {
            0.0
        };

        let duplicate_count = non_null_values.len().saturating_sub(unique_count);
        let duplicate_ratio = if !non_null_values.is_empty() {
            duplicate_count as f64 / non_null_values.len() as f64
        } else {
            0.0
        };

        let (outlier_count, numeric_stats) = if self.check_outliers {
            self.analyze_numeric(&non_null_values)
        } else {
            (None, None)
        };

        ColumnQuality {
            name: name.to_string(),
            total_count,
            null_count,
            null_ratio,
            unique_count,
            unique_ratio,
            duplicate_count,
            duplicate_ratio,
            outlier_count,
            numeric_stats,
        }
    }

    /// Parses values as `f64` and, given at least four finite numbers,
    /// returns the Tukey-fence outlier count plus summary statistics.
    fn analyze_numeric(&self, values: &[&str]) -> (Option<usize>, Option<NumericStats>) {
        let numeric_values: Vec<f64> = values
            .iter()
            .filter_map(|v| v.parse::<f64>().ok())
            .filter(|v| v.is_finite())
            .collect();

        // Quartiles are meaningless on fewer than four points.
        if numeric_values.len() < 4 {
            return (None, None);
        }

        let mut sorted = numeric_values.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));

        let n = sorted.len();
        let min = sorted[0];
        let max = sorted[n - 1];
        let mean = numeric_values.iter().sum::<f64>() / n as f64;

        // Population variance (divides by n, not n - 1).
        let variance = numeric_values
            .iter()
            .map(|v| (v - mean).powi(2))
            .sum::<f64>()
            / n as f64;
        let std_dev = variance.sqrt();

        // Nearest-rank quartiles; approximate but adequate for fencing.
        let q1 = sorted[n / 4];
        let median = sorted[n / 2];
        let q3 = sorted[3 * n / 4];

        let stats = NumericStats {
            min,
            max,
            mean,
            std_dev,
            q1,
            median,
            q3,
        };

        let lower = stats.outlier_lower_bound();
        let upper = stats.outlier_upper_bound();
        let outlier_count = numeric_values
            .iter()
            .filter(|&&v| v < lower || v > upper)
            .count();

        (Some(outlier_count), Some(stats))
    }

    /// Counts rows whose full value tuple has already been seen.
    ///
    /// Rows are keyed by joining column values with '|' ("NULL" for nulls);
    /// NOTE(review): a value containing '|' could in principle collide with
    /// an adjacent column boundary.
    fn count_duplicate_rows(
        &self,
        data: &HashMap<String, Vec<Option<String>>>,
        row_count: usize,
    ) -> usize {
        if data.is_empty() || row_count == 0 {
            return 0;
        }

        let mut row_set: HashSet<String> = HashSet::new();
        let mut duplicates = 0;

        let columns: Vec<&String> = data.keys().collect();

        for i in 0..row_count {
            let row_key: String = columns
                .iter()
                .map(|col| {
                    data.get(*col)
                        .and_then(|v| v.get(i))
                        .map(|v| v.clone().unwrap_or_else(|| "NULL".to_string()))
                        .unwrap_or_else(|| "NULL".to_string())
                })
                .collect::<Vec<_>>()
                .join("|");

            // `insert` returns false when the key was already present.
            if !row_set.insert(row_key) {
                duplicates += 1;
            }
        }

        duplicates
    }

    /// Derives a 0-100 score: start at 100, subtract up to 30 points for
    /// the average null ratio, then a fixed penalty per issue by severity.
    fn calculate_score(
        &self,
        columns: &HashMap<String, ColumnQuality>,
        issues: &[QualityIssue],
        row_count: usize,
    ) -> f64 {
        if row_count == 0 || columns.is_empty() {
            return 0.0;
        }

        let mut score = 100.0;

        let avg_null_ratio: f64 =
            columns.values().map(|c| c.null_ratio).sum::<f64>() / columns.len() as f64;
        score -= avg_null_ratio * 30.0;

        for issue in issues {
            score -= match issue.severity() {
                5 => 25.0,
                4 => 15.0,
                3 => 10.0,
                2 => 5.0,
                1 => 2.0,
                _ => 0.0,
            };
        }

        score.clamp(0.0, 100.0)
    }
}
1246
1247#[cfg(test)]
1248mod tests {
1249 use std::sync::Arc;
1250
1251 use arrow::{
1252 array::{Float64Array, Int32Array, StringArray},
1253 datatypes::{DataType, Field, Schema},
1254 record_batch::RecordBatch,
1255 };
1256
1257 use super::*;
1258
    // EmptySchema/EmptyDataset are fatal (5); constant columns and >50%
    // null columns rank 4; milder null ratios rank 3.
    #[test]
    fn test_issue_severity() {
        assert_eq!(QualityIssue::EmptySchema.severity(), 5);
        assert_eq!(QualityIssue::EmptyDataset.severity(), 5);

        let constant = QualityIssue::ConstantColumn {
            column: "x".to_string(),
            value: "1".to_string(),
        };
        assert_eq!(constant.severity(), 4);

        let high_null = QualityIssue::HighNullRatio {
            column: "x".to_string(),
            null_ratio: 0.6,
            threshold: 0.1,
        };
        assert_eq!(high_null.severity(), 4);

        let low_null = QualityIssue::HighNullRatio {
            column: "x".to_string(),
            null_ratio: 0.3,
            threshold: 0.1,
        };
        assert_eq!(low_null.severity(), 3);
    }

    // Column-scoped issues report their column; dataset-wide ones do not.
    #[test]
    fn test_issue_column() {
        let issue = QualityIssue::HighNullRatio {
            column: "test".to_string(),
            null_ratio: 0.5,
            threshold: 0.1,
        };
        assert_eq!(issue.column(), Some("test"));

        assert_eq!(QualityIssue::EmptySchema.column(), None);
    }

    // A column is "constant" only while it has at most one distinct value.
    #[test]
    fn test_column_quality_is_constant() {
        let mut quality = ColumnQuality {
            name: "test".to_string(),
            total_count: 100,
            null_count: 0,
            null_ratio: 0.0,
            unique_count: 1,
            unique_ratio: 0.01,
            duplicate_count: 99,
            duplicate_ratio: 0.99,
            outlier_count: None,
            numeric_stats: None,
        };

        assert!(quality.is_constant());

        quality.unique_count = 5;
        assert!(!quality.is_constant());
    }

    // is_mostly_null compares the null ratio against the given threshold
    // (inclusive at the boundary).
    #[test]
    fn test_column_quality_mostly_null() {
        let quality = ColumnQuality {
            name: "test".to_string(),
            total_count: 100,
            null_count: 80,
            null_ratio: 0.8,
            unique_count: 5,
            unique_ratio: 0.25,
            duplicate_count: 15,
            duplicate_ratio: 0.75,
            outlier_count: None,
            numeric_stats: None,
        };

        assert!(quality.is_mostly_null(0.5));
        assert!(!quality.is_mostly_null(0.9));
    }

    // IQR = Q3 - Q1; Tukey fences sit 1.5 * IQR beyond the quartiles.
    #[test]
    fn test_numeric_stats_iqr() {
        let stats = NumericStats {
            min: 0.0,
            max: 100.0,
            mean: 50.0,
            std_dev: 25.0,
            q1: 25.0,
            median: 50.0,
            q3: 75.0,
        };

        assert!((stats.iqr() - 50.0).abs() < 0.01);
        assert!((stats.outlier_lower_bound() - (-50.0)).abs() < 0.01);
        assert!((stats.outlier_upper_bound() - 150.0).abs() < 0.01);
    }

    // has_issues reflects whether the issue list is non-empty.
    #[test]
    fn test_report_has_issues() {
        let report = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![],
            score: 100.0,
            duplicate_row_count: 0,
        };
        assert!(!report.has_issues());

        let report_with_issues = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![QualityIssue::EmptySchema],
            score: 50.0,
            duplicate_row_count: 0,
        };
        assert!(report_with_issues.has_issues());
    }

    // max_severity picks the worst issue (ConstantColumn = 4 beats
    // LowCardinality = 1 here).
    #[test]
    fn test_report_max_severity() {
        let report = QualityReport {
            row_count: 100,
            column_count: 2,
            columns: HashMap::new(),
            issues: vec![
                QualityIssue::LowCardinality {
                    column: "x".to_string(),
                    unique_count: 1,
                    total_count: 100,
                },
                QualityIssue::ConstantColumn {
                    column: "y".to_string(),
                    value: "1".to_string(),
                },
            ],
            score: 80.0,
            duplicate_row_count: 0,
        };

        assert_eq!(report.max_severity(), 4);
    }
1408
    // Builds a single-batch dataset with a nullable Utf8 "name" column and
    // a nullable Int32 "value" column.
    fn make_dataset(col1: Vec<Option<&str>>, col2: Vec<Option<i32>>) -> ArrowDataset {
        let schema = Arc::new(Schema::new(vec![
            Field::new("name", DataType::Utf8, true),
            Field::new("value", DataType::Int32, true),
        ]));

        let names: Vec<Option<&str>> = col1;
        let values: Vec<Option<i32>> = col2;

        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![
                Arc::new(StringArray::from(names)),
                Arc::new(Int32Array::from(values)),
            ],
        )
        .expect("batch");

        ArrowDataset::from_batch(batch).expect("dataset")
    }

    // Builds a single-batch dataset with one nullable Float64 "value" column.
    fn make_float_dataset(values: Vec<Option<f64>>) -> ArrowDataset {
        let schema = Arc::new(Schema::new(vec![Field::new(
            "value",
            DataType::Float64,
            true,
        )]));

        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![Arc::new(Float64Array::from(values))],
        )
        .expect("batch");

        ArrowDataset::from_batch(batch).expect("dataset")
    }
1447
    // A fresh checker starts from the default thresholds.
    #[test]
    fn test_checker_new() {
        let checker = QualityChecker::new();
        assert!((checker.thresholds.max_null_ratio - 0.1).abs() < 0.01);
    }

    // Builder methods override individual thresholds.
    #[test]
    fn test_checker_builder() {
        let checker = QualityChecker::new()
            .max_null_ratio(0.2)
            .max_duplicate_ratio(0.3)
            .min_cardinality(5);

        assert!((checker.thresholds.max_null_ratio - 0.2).abs() < 0.01);
        assert!((checker.thresholds.max_duplicate_ratio - 0.3).abs() < 0.01);
        assert_eq!(checker.thresholds.min_cardinality, 5);
    }

    // Clean, all-distinct data keeps a high score.
    #[test]
    fn test_checker_clean_data() {
        let dataset = make_dataset(
            vec![Some("a"), Some("b"), Some("c"), Some("d")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        assert_eq!(report.row_count, 4);
        assert_eq!(report.column_count, 2);
        assert!(report.score > 80.0);
    }

    // 80% nulls in "name" trips the 50% threshold exactly once.
    #[test]
    fn test_checker_detects_nulls() {
        let dataset = make_dataset(
            vec![Some("a"), None, None, None, None],
            vec![Some(1), Some(2), Some(3), Some(4), Some(5)],
        );

        let checker = QualityChecker::new().max_null_ratio(0.5);
        let report = checker.check(&dataset).expect("check");

        let null_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::HighNullRatio { .. }))
            .collect();

        assert_eq!(null_issues.len(), 1);
    }

    // A single-valued column is reported as constant (once).
    #[test]
    fn test_checker_detects_constant() {
        let dataset = make_dataset(
            vec![Some("same"), Some("same"), Some("same"), Some("same")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        let constant_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::ConstantColumn { .. }))
            .collect();

        assert_eq!(constant_issues.len(), 1);
    }

    // Identical (name, value) tuples count as whole-row duplicates.
    #[test]
    fn test_checker_detects_duplicates() {
        let dataset = make_dataset(
            vec![Some("a"), Some("a"), Some("a"), Some("b")],
            vec![Some(1), Some(1), Some(1), Some(2)],
        );

        let checker = QualityChecker::new().max_duplicate_ratio(0.01);
        let report = checker.check(&dataset).expect("check");

        assert!(report.duplicate_row_count > 0);
    }

    // Extreme values far outside the 0..100 ramp are flagged as outliers.
    #[test]
    fn test_checker_detects_outliers() {
        let mut values: Vec<Option<f64>> = (0..100).map(|i| Some(i as f64)).collect();
        values.push(Some(10000.0));
        values.push(Some(-10000.0));
        let dataset = make_float_dataset(values);

        let checker = QualityChecker::new().max_outlier_ratio(0.01);
        let report = checker.check(&dataset).expect("check");

        let outlier_issues: Vec<_> = report
            .issues
            .iter()
            .filter(|i| matches!(i, QualityIssue::OutliersDetected { .. }))
            .collect();

        assert!(!outlier_issues.is_empty());
    }

    // A valid schema with zero rows yields EmptyDataset and a zero score.
    #[test]
    fn test_checker_empty_dataset() {
        let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)]));
        let batch = RecordBatch::try_new(
            Arc::clone(&schema),
            vec![Arc::new(Int32Array::from(Vec::<i32>::new()))],
        )
        .expect("batch");
        let dataset = ArrowDataset::from_batch(batch).expect("dataset");

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        assert!(report.issues.contains(&QualityIssue::EmptyDataset));
        assert_eq!(report.score, 0.0);
    }

    // More issues (constants + nulls) means a strictly lower score.
    #[test]
    fn test_checker_score_decreases_with_issues() {
        let clean_dataset = make_dataset(
            vec![Some("a"), Some("b"), Some("c"), Some("d")],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let dirty_dataset = make_dataset(
            vec![Some("same"), Some("same"), None, None],
            vec![Some(1), Some(1), None, None],
        );

        let checker = QualityChecker::new();
        let clean_report = checker.check(&clean_dataset).expect("check");
        let dirty_report = checker.check(&dirty_dataset).expect("check");

        assert!(clean_report.score > dirty_report.score);
    }

    // column_issues filters the report down to one column's problems.
    #[test]
    fn test_checker_column_issues() {
        let dataset = make_dataset(
            vec![None, None, None, None],
            vec![Some(1), Some(2), Some(3), Some(4)],
        );

        let checker = QualityChecker::new();
        let report = checker.check(&dataset).expect("check");

        let name_issues = report.column_issues("name");
        assert!(!name_issues.is_empty());

        let value_issues = report.column_issues("value");
        assert!(value_issues.len() < name_issues.len());
    }
1607
1608 #[test]
1609 fn test_checker_problematic_columns() {
1610 let dataset = make_dataset(
1611 vec![None, None, None, None],
1612 vec![Some(1), Some(1), Some(1), Some(1)],
1613 );
1614
1615 let checker = QualityChecker::new();
1616 let report = checker.check(&dataset).expect("check");
1617
1618 let problematic = report.problematic_columns();
1619 assert!(problematic.contains(&"name"));
1620 assert!(problematic.contains(&"value"));
1621 }
1622
1623 #[test]
1624 fn test_checker_disable_outliers() {
1625 let mut values: Vec<Option<f64>> = (0..100).map(|i| Some(i as f64)).collect();
1626 values.push(Some(10000.0));
1627
1628 let dataset = make_float_dataset(values);
1629
1630 let checker = QualityChecker::new()
1631 .with_outlier_check(false)
1632 .max_outlier_ratio(0.001);
1633 let report = checker.check(&dataset).expect("check");
1634
1635 let outlier_issues: Vec<_> = report
1636 .issues
1637 .iter()
1638 .filter(|i| matches!(i, QualityIssue::OutliersDetected { .. }))
1639 .collect();
1640
1641 assert!(outlier_issues.is_empty());
1642 }
1643
1644 #[test]
1647 fn test_severity_weights() {
1648 assert!((Severity::Critical.weight() - 2.0).abs() < 0.01);
1649 assert!((Severity::High.weight() - 1.5).abs() < 0.01);
1650 assert!((Severity::Medium.weight() - 1.0).abs() < 0.01);
1651 assert!((Severity::Low.weight() - 0.5).abs() < 0.01);
1652 }
1653
1654 #[test]
1655 fn test_severity_base_points() {
1656 assert!((Severity::Critical.base_points() - 2.0).abs() < 0.01);
1657 assert!((Severity::High.base_points() - 1.5).abs() < 0.01);
1658 assert!((Severity::Medium.base_points() - 1.0).abs() < 0.01);
1659 assert!((Severity::Low.base_points() - 0.5).abs() < 0.01);
1660 }
1661
1662 #[test]
1663 fn test_severity_display() {
1664 assert_eq!(format!("{}", Severity::Critical), "Critical");
1665 assert_eq!(format!("{}", Severity::High), "High");
1666 assert_eq!(format!("{}", Severity::Medium), "Medium");
1667 assert_eq!(format!("{}", Severity::Low), "Low");
1668 }
1669
1670 #[test]
1671 fn test_letter_grade_from_score() {
1672 assert_eq!(LetterGrade::from_score(100.0), LetterGrade::A);
1673 assert_eq!(LetterGrade::from_score(95.0), LetterGrade::A);
1674 assert_eq!(LetterGrade::from_score(94.9), LetterGrade::B);
1675 assert_eq!(LetterGrade::from_score(85.0), LetterGrade::B);
1676 assert_eq!(LetterGrade::from_score(84.9), LetterGrade::C);
1677 assert_eq!(LetterGrade::from_score(70.0), LetterGrade::C);
1678 assert_eq!(LetterGrade::from_score(69.9), LetterGrade::D);
1679 assert_eq!(LetterGrade::from_score(50.0), LetterGrade::D);
1680 assert_eq!(LetterGrade::from_score(49.9), LetterGrade::F);
1681 assert_eq!(LetterGrade::from_score(0.0), LetterGrade::F);
1682 }
1683
1684 #[test]
1685 fn test_letter_grade_publication_decision() {
1686 assert_eq!(LetterGrade::A.publication_decision(), "Publish immediately");
1687 assert_eq!(
1688 LetterGrade::B.publication_decision(),
1689 "Publish with documented caveats"
1690 );
1691 assert_eq!(
1692 LetterGrade::C.publication_decision(),
1693 "Remediation required before publication"
1694 );
1695 assert_eq!(LetterGrade::D.publication_decision(), "Major rework needed");
1696 assert_eq!(LetterGrade::F.publication_decision(), "Do not publish");
1697 }
1698
1699 #[test]
1700 fn test_letter_grade_is_publishable() {
1701 assert!(LetterGrade::A.is_publishable());
1702 assert!(LetterGrade::B.is_publishable());
1703 assert!(!LetterGrade::C.is_publishable());
1704 assert!(!LetterGrade::D.is_publishable());
1705 assert!(!LetterGrade::F.is_publishable());
1706 }
1707
1708 #[test]
1709 fn test_letter_grade_display() {
1710 assert_eq!(format!("{}", LetterGrade::A), "A");
1711 assert_eq!(format!("{}", LetterGrade::B), "B");
1712 assert_eq!(format!("{}", LetterGrade::C), "C");
1713 assert_eq!(format!("{}", LetterGrade::D), "D");
1714 assert_eq!(format!("{}", LetterGrade::F), "F");
1715 }
1716
1717 #[test]
1718 fn test_checklist_item_new() {
1719 let item = ChecklistItem::new(1, "Schema version documented", Severity::Critical, true);
1720 assert_eq!(item.id, 1);
1721 assert_eq!(item.description, "Schema version documented");
1722 assert!(item.passed);
1723 assert_eq!(item.severity, Severity::Critical);
1724 assert!(item.suggestion.is_none());
1725 }
1726
1727 #[test]
1728 fn test_checklist_item_with_suggestion() {
1729 let item = ChecklistItem::new(1, "Schema version documented", Severity::Critical, false)
1730 .with_suggestion("Add schema_version field to metadata");
1731 assert!(item.suggestion.is_some());
1732 assert_eq!(
1733 item.suggestion.unwrap(),
1734 "Add schema_version field to metadata"
1735 );
1736 }
1737
1738 #[test]
1739 fn test_checklist_item_points() {
1740 let passed_critical = ChecklistItem::new(1, "Test", Severity::Critical, true);
1741 assert!((passed_critical.points_earned() - 2.0).abs() < 0.01);
1742 assert!((passed_critical.max_points() - 2.0).abs() < 0.01);
1743
1744 let failed_critical = ChecklistItem::new(2, "Test", Severity::Critical, false);
1745 assert!((failed_critical.points_earned() - 0.0).abs() < 0.01);
1746 assert!((failed_critical.max_points() - 2.0).abs() < 0.01);
1747
1748 let passed_low = ChecklistItem::new(3, "Test", Severity::Low, true);
1749 assert!((passed_low.points_earned() - 0.5).abs() < 0.01);
1750 }
1751
1752 #[test]
1753 fn test_quality_score_perfect() {
1754 let checklist = vec![
1755 ChecklistItem::new(1, "Critical check", Severity::Critical, true),
1756 ChecklistItem::new(2, "High check", Severity::High, true),
1757 ChecklistItem::new(3, "Medium check", Severity::Medium, true),
1758 ChecklistItem::new(4, "Low check", Severity::Low, true),
1759 ];
1760 let score = QualityScore::from_checklist(checklist);
1761
1762 assert!((score.score - 100.0).abs() < 0.01);
1766 assert_eq!(score.grade, LetterGrade::A);
1767 assert!(score.grade.is_publishable());
1768 assert!(!score.has_critical_failures());
1769 }
1770
1771 #[test]
1772 fn test_quality_score_with_critical_failure() {
1773 let checklist = vec![
1774 ChecklistItem::new(1, "Critical check", Severity::Critical, false),
1775 ChecklistItem::new(2, "High check", Severity::High, true),
1776 ChecklistItem::new(3, "Medium check", Severity::Medium, true),
1777 ChecklistItem::new(4, "Low check", Severity::Low, true),
1778 ];
1779 let score = QualityScore::from_checklist(checklist);
1780
1781 assert!((score.score - 60.0).abs() < 0.01);
1783 assert_eq!(score.grade, LetterGrade::D);
1784 assert!(score.has_critical_failures());
1785 assert!(!score.grade.is_publishable());
1786 }
1787
1788 #[test]
1789 fn test_quality_score_failed_items() {
1790 let checklist = vec![
1791 ChecklistItem::new(1, "Critical check", Severity::Critical, false),
1792 ChecklistItem::new(2, "High check", Severity::High, true),
1793 ChecklistItem::new(3, "Medium check", Severity::Medium, false),
1794 ];
1795 let score = QualityScore::from_checklist(checklist);
1796
1797 let failed = score.failed_items();
1798 assert_eq!(failed.len(), 2);
1799 assert_eq!(failed[0].id, 1);
1800 assert_eq!(failed[1].id, 3);
1801
1802 let critical = score.critical_failures();
1803 assert_eq!(critical.len(), 1);
1804 assert_eq!(critical[0].id, 1);
1805 }
1806
1807 #[test]
1808 fn test_quality_score_severity_breakdown() {
1809 let checklist = vec![
1810 ChecklistItem::new(1, "C1", Severity::Critical, true),
1811 ChecklistItem::new(2, "C2", Severity::Critical, false),
1812 ChecklistItem::new(3, "H1", Severity::High, true),
1813 ];
1814 let score = QualityScore::from_checklist(checklist);
1815
1816 let critical_stats = score.severity_breakdown.get(&Severity::Critical).unwrap();
1817 assert_eq!(critical_stats.total, 2);
1818 assert_eq!(critical_stats.passed, 1);
1819 assert_eq!(critical_stats.failed, 1);
1820
1821 let high_stats = score.severity_breakdown.get(&Severity::High).unwrap();
1822 assert_eq!(high_stats.total, 1);
1823 assert_eq!(high_stats.passed, 1);
1824 }
1825
1826 #[test]
1827 fn test_quality_score_badge_url() {
1828 let checklist = vec![ChecklistItem::new(1, "Test", Severity::Critical, true)];
1829 let score = QualityScore::from_checklist(checklist);
1830
1831 let badge = score.badge_url();
1832 assert!(badge.contains("shields.io"));
1833 assert!(badge.contains("data_quality"));
1834 assert!(badge.contains("brightgreen")); }
1836
1837 #[test]
1838 fn test_quality_score_badge_colors() {
1839 let grades_colors = vec![
1841 (100.0, "brightgreen"), (90.0, "green"), (75.0, "yellow"), (55.0, "orange"), (30.0, "red"), ];
1847
1848 for (target_score, expected_color) in grades_colors {
1849 let target: f64 = target_score;
1851 #[allow(clippy::cast_sign_loss)] let passed = (target / 100.0 * 10.0).round() as usize;
1853 let failed = 10 - passed;
1854 let mut checklist: Vec<ChecklistItem> = (0..passed)
1855 .map(|i| ChecklistItem::new(i as u8, "Test", Severity::Medium, true))
1856 .collect();
1857 checklist.extend(
1858 (0..failed).map(|i| {
1859 ChecklistItem::new((passed + i) as u8, "Test", Severity::Medium, false)
1860 }),
1861 );
1862
1863 let score = QualityScore::from_checklist(checklist);
1864 let badge = score.badge_url();
1865 assert!(
1866 badge.contains(expected_color),
1867 "Score {:.0} should have color {} but badge was {}",
1868 score.score,
1869 expected_color,
1870 badge
1871 );
1872 }
1873 }
1874
1875 #[test]
1876 fn test_quality_score_json_output() {
1877 let checklist = vec![
1878 ChecklistItem::new(1, "Schema check", Severity::Critical, true),
1879 ChecklistItem::new(2, "Column check", Severity::High, false)
1880 .with_suggestion("Add missing columns"),
1881 ];
1882 let score = QualityScore::from_checklist(checklist);
1883
1884 let json = score.to_json();
1885 assert!(json.contains("\"score\":"));
1886 assert!(json.contains("\"grade\":"));
1887 assert!(json.contains("\"is_publishable\":"));
1888 assert!(json.contains("\"failed_items\":"));
1889 assert!(json.contains("\"badge_url\":"));
1890 assert!(json.contains("Add missing columns"));
1891 }
1892
1893 #[test]
1894 fn test_quality_score_empty_checklist() {
1895 let checklist: Vec<ChecklistItem> = vec![];
1896 let score = QualityScore::from_checklist(checklist);
1897
1898 assert!((score.score - 100.0).abs() < 0.01);
1900 assert_eq!(score.grade, LetterGrade::A);
1901 }
1902
1903 #[test]
1906 fn test_quality_profile_default() {
1907 let profile = QualityProfile::default();
1908 assert_eq!(profile.name, "default");
1909 assert!(profile.expected_constant_columns.is_empty());
1910 assert!(profile.nullable_columns.is_empty());
1911 assert!((profile.max_null_ratio - 0.1).abs() < 0.001);
1912 }
1913
1914 #[test]
1915 fn test_quality_profile_doctest_corpus() {
1916 let profile = QualityProfile::doctest_corpus();
1917 assert_eq!(profile.name, "doctest-corpus");
1918 assert!(profile.is_expected_constant("source"));
1919 assert!(profile.is_expected_constant("version"));
1920 assert!(!profile.is_expected_constant("function"));
1921 assert!(profile.is_nullable("signature"));
1922 assert!(!profile.is_nullable("input"));
1923 }
1924
1925 #[test]
1926 fn test_quality_profile_ml_training() {
1927 let profile = QualityProfile::ml_training();
1928 assert_eq!(profile.name, "ml-training");
1929 assert!((profile.max_null_ratio - 0.0).abs() < 0.001);
1930 assert!((profile.max_duplicate_ratio - 0.8).abs() < 0.001);
1931 }
1932
1933 #[test]
1934 fn test_quality_profile_time_series() {
1935 let profile = QualityProfile::time_series();
1936 assert_eq!(profile.name, "time-series");
1937 assert!((profile.max_duplicate_row_ratio - 0.0).abs() < 0.001);
1938 }
1939
1940 #[test]
1941 fn test_quality_profile_by_name() {
1942 assert!(QualityProfile::by_name("default").is_some());
1943 assert!(QualityProfile::by_name("doctest-corpus").is_some());
1944 assert!(QualityProfile::by_name("doctest").is_some());
1945 assert!(QualityProfile::by_name("ml-training").is_some());
1946 assert!(QualityProfile::by_name("ml").is_some());
1947 assert!(QualityProfile::by_name("time-series").is_some());
1948 assert!(QualityProfile::by_name("timeseries").is_some());
1949 assert!(QualityProfile::by_name("nonexistent").is_none());
1950 }
1951
1952 #[test]
1953 fn test_quality_profile_available_profiles() {
1954 let profiles = QualityProfile::available_profiles();
1955 assert!(profiles.contains(&"default"));
1956 assert!(profiles.contains(&"doctest-corpus"));
1957 assert!(profiles.contains(&"ml-training"));
1958 assert!(profiles.contains(&"time-series"));
1959 }
1960
1961 #[test]
1962 fn test_quality_profile_builders() {
1963 let profile = QualityProfile::new("custom")
1964 .with_description("Custom profile")
1965 .with_expected_constant("id")
1966 .with_nullable("optional_field")
1967 .with_max_null_ratio(0.2)
1968 .with_max_duplicate_ratio(0.6);
1969
1970 assert_eq!(profile.name, "custom");
1971 assert_eq!(profile.description, "Custom profile");
1972 assert!(profile.is_expected_constant("id"));
1973 assert!(profile.is_nullable("optional_field"));
1974 assert!((profile.max_null_ratio - 0.2).abs() < 0.001);
1975 assert!((profile.max_duplicate_ratio - 0.6).abs() < 0.001);
1976 }
1977
1978 #[test]
1979 fn test_quality_profile_null_threshold_for() {
1980 let profile = QualityProfile::doctest_corpus();
1981
1982 assert!((profile.null_threshold_for("signature") - 1.0).abs() < 0.001);
1984
1985 assert!((profile.null_threshold_for("input") - profile.max_null_ratio).abs() < 0.001);
1987 }
1988
1989 #[test]
1990 fn test_quality_profile_clone() {
1991 let profile = QualityProfile::doctest_corpus();
1992 let cloned = profile.clone();
1993 assert_eq!(profile.name, cloned.name);
1994 assert_eq!(
1995 profile.expected_constant_columns,
1996 cloned.expected_constant_columns
1997 );
1998 }
1999
2000 #[test]
2001 fn test_quality_profile_debug() {
2002 let profile = QualityProfile::default();
2003 let debug = format!("{:?}", profile);
2004 assert!(debug.contains("QualityProfile"));
2005 assert!(debug.contains("default"));
2006 }
2007}