term_guard/analyzers/
suggestions.rs

1//! Constraint suggestion system that analyzes column profiles to recommend data quality checks.
2//!
3//! This module implements a rule-based system that examines column profiles and suggests
4//! appropriate data quality constraints with confidence scores. The system supports
5//! various types of constraints including completeness, uniqueness, patterns, and ranges.
6//!
7//! ## Architecture
8//!
9//! The suggestion system consists of:
10//! - `ConstraintSuggestionRule` trait for implementing specific rule logic
11//! - `SuggestedConstraint` struct representing a recommendation
12//! - Individual rule implementations for different constraint types
13//! - `SuggestionEngine` for orchestrating rule evaluation
14//!
15//! ## Example Usage
16//!
17//! ```rust,ignore
18//! use term_guard::analyzers::{SuggestionEngine, CompletenessRule, ColumnProfile, BasicStatistics, DetectedDataType};
19//! use term_guard::test_fixtures::create_minimal_tpc_h_context;
20//!
21//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
22//! let ctx = create_minimal_tpc_h_context().await.unwrap();
23//! // In a real scenario, profile would come from ColumnProfiler
24//! let profile = ColumnProfile {
25//!     column_name: "l_orderkey".to_string(),
26//!     data_type: DetectedDataType::Integer,
27//!     basic_stats: BasicStatistics {
28//!         row_count: 1000,
29//!         null_count: 10,
30//!         null_percentage: 0.01,
31//!         approximate_cardinality: 980,
32//!         min_value: Some("1".to_string()),
33//!         max_value: Some("1000".to_string()),
34//!         sample_values: vec!["1".to_string(), "500".to_string(), "1000".to_string()],
35//!     },
36//!     categorical_histogram: None,
37//!     numeric_distribution: None,
38//!     passes_executed: vec![1, 2, 3],
39//!     profiling_time_ms: 50,
40//! };
41//!
42//! let engine = SuggestionEngine::new()
43//!     .add_rule(Box::new(CompletenessRule::new()));
44//!
45//! let suggestions = engine.suggest_constraints(&profile);
46//! for suggestion in suggestions {
47//!     println!("Suggested: {} (confidence: {:.2})",
48//!              suggestion.check_type, suggestion.confidence);
49//! }
50//! # })
51//! ```
52
53use serde::{Deserialize, Serialize};
54use std::collections::HashMap;
55use tracing::{debug, instrument};
56
57use crate::analyzers::profiler::{ColumnProfile, DetectedDataType};
58
/// A suggested constraint with confidence and rationale.
///
/// This is the value produced by [`ConstraintSuggestionRule::apply`] and returned by
/// [`SuggestionEngine::suggest_constraints`]; each instance describes one recommended
/// check for one column.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SuggestedConstraint {
    /// Type of check to apply (e.g., "is_complete", "has_uniqueness")
    pub check_type: String,
    /// Column name this constraint applies to
    pub column: String,
    /// Parameters for the constraint, keyed by parameter name (e.g., a `"threshold"`
    /// value). Left empty when the check needs no configuration.
    pub parameters: HashMap<String, ConstraintParameter>,
    /// Confidence score, intended to lie between 0.0 and 1.0. The type itself does not
    /// enforce that range.
    pub confidence: f64,
    /// Human-readable explanation for the suggestion
    pub rationale: String,
    /// Priority level for implementation
    pub priority: SuggestionPriority,
}
75
/// Parameter value for constraint configuration.
///
/// A small tagged union so that a single `parameters` map can carry values of
/// different primitive types.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ConstraintParameter {
    /// Floating-point value, e.g. a ratio or numeric threshold.
    Float(f64),
    /// Signed integer value, e.g. a count.
    Integer(i64),
    /// Free-form text value.
    String(String),
    /// Boolean flag.
    Boolean(bool),
}
84
/// Priority levels for constraint suggestions, declared from most to least urgent.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum SuggestionPriority {
    /// Data quality issues likely to cause failures
    Critical,
    /// Important constraints for data integrity
    High,
    /// Useful constraints for monitoring
    Medium,
    /// Optional constraints for completeness
    Low,
}
93
/// Trait for implementing constraint suggestion rules.
///
/// A rule inspects one [`ColumnProfile`] and proposes zero or more
/// [`SuggestedConstraint`]s. The `Send + Sync` bounds are required of every
/// implementor, so boxed rules can be shared across threads.
pub trait ConstraintSuggestionRule: Send + Sync {
    /// Apply this rule to a column profile and return suggested constraints.
    ///
    /// An empty vector means the rule has nothing to recommend for this column.
    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint>;

    /// Get a human-readable name for this rule
    fn name(&self) -> &str;

    /// Get a description of what this rule analyzes
    fn description(&self) -> &str;
}
105
/// Engine that orchestrates multiple suggestion rules.
///
/// Configure it with the builder-style methods (`add_rule`, `confidence_threshold`,
/// `max_suggestions_per_column`) and then call `suggest_constraints`.
pub struct SuggestionEngine {
    /// Registered rules, kept in the order they were added.
    rules: Vec<Box<dyn ConstraintSuggestionRule>>,
    /// Minimum confidence a suggestion needs in order to be returned.
    confidence_threshold: f64,
    /// Upper bound on the number of suggestions returned for one column.
    max_suggestions_per_column: usize,
}
112
113impl SuggestionEngine {
114    /// Create a new suggestion engine with default configuration
115    pub fn new() -> Self {
116        Self {
117            rules: Vec::new(),
118            confidence_threshold: 0.5,
119            max_suggestions_per_column: 10,
120        }
121    }
122
123    /// Add a suggestion rule to the engine
124    pub fn add_rule(mut self, rule: Box<dyn ConstraintSuggestionRule>) -> Self {
125        self.rules.push(rule);
126        self
127    }
128
129    /// Set the minimum confidence threshold for suggestions
130    pub fn confidence_threshold(mut self, threshold: f64) -> Self {
131        self.confidence_threshold = threshold.clamp(0.0, 1.0);
132        self
133    }
134
135    /// Set the maximum number of suggestions per column
136    pub fn max_suggestions_per_column(mut self, max: usize) -> Self {
137        self.max_suggestions_per_column = max;
138        self
139    }
140
141    /// Generate constraint suggestions for a column profile
142    #[instrument(skip(self, profile))]
143    pub fn suggest_constraints(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
144        debug!(
145            column = profile.column_name,
146            rules_count = self.rules.len(),
147            "Generating constraint suggestions"
148        );
149
150        let mut all_suggestions = Vec::new();
151
152        // Apply each rule to the profile
153        for rule in &self.rules {
154            let rule_suggestions = rule.apply(profile);
155            debug!(
156                rule = rule.name(),
157                suggestions_count = rule_suggestions.len(),
158                "Applied suggestion rule"
159            );
160            all_suggestions.extend(rule_suggestions);
161        }
162
163        // Filter by confidence threshold
164        all_suggestions.retain(|s| s.confidence >= self.confidence_threshold);
165
166        // Sort by confidence (descending) and priority
167        all_suggestions.sort_by(|a, b| {
168            b.confidence
169                .partial_cmp(&a.confidence)
170                .unwrap_or(std::cmp::Ordering::Equal)
171                .then_with(|| priority_order(&b.priority).cmp(&priority_order(&a.priority)))
172        });
173
174        // Limit the number of suggestions
175        all_suggestions.truncate(self.max_suggestions_per_column);
176
177        debug!(
178            column = profile.column_name,
179            suggestions_count = all_suggestions.len(),
180            "Generated constraint suggestions"
181        );
182
183        all_suggestions
184    }
185
186    /// Generate suggestions for multiple column profiles
187    pub fn suggest_constraints_batch(
188        &self,
189        profiles: &[ColumnProfile],
190    ) -> HashMap<String, Vec<SuggestedConstraint>> {
191        profiles
192            .iter()
193            .map(|profile| {
194                (
195                    profile.column_name.clone(),
196                    self.suggest_constraints(profile),
197                )
198            })
199            .collect()
200    }
201}
202
203impl Default for SuggestionEngine {
204    fn default() -> Self {
205        Self::new()
206    }
207}
208
209/// Helper function to get priority order for sorting
210fn priority_order(priority: &SuggestionPriority) -> u8 {
211    match priority {
212        SuggestionPriority::Critical => 0,
213        SuggestionPriority::High => 1,
214        SuggestionPriority::Medium => 2,
215        SuggestionPriority::Low => 3,
216    }
217}
218
/// Rule that derives completeness suggestions from a column's null percentage.
pub struct CompletenessRule {
    /// Completeness at or above which the column is treated as fully complete.
    high_completeness_threshold: f64,
    /// Completeness at or above which (but below the high threshold) a thresholded
    /// completeness check is suggested.
    medium_completeness_threshold: f64,
}

impl CompletenessRule {
    /// Build the rule with the stock thresholds: 0.98 (high) and 0.90 (medium).
    pub fn new() -> Self {
        CompletenessRule::with_thresholds(0.98, 0.90)
    }

    /// Build the rule with caller-supplied thresholds.
    ///
    /// Each threshold is clamped into `0.0..=1.0` independently.
    pub fn with_thresholds(high: f64, medium: f64) -> Self {
        let high_completeness_threshold = high.clamp(0.0, 1.0);
        let medium_completeness_threshold = medium.clamp(0.0, 1.0);

        CompletenessRule {
            high_completeness_threshold,
            medium_completeness_threshold,
        }
    }
}

impl Default for CompletenessRule {
    /// Same thresholds as [`CompletenessRule::new`].
    fn default() -> Self {
        CompletenessRule::new()
    }
}
248
249impl ConstraintSuggestionRule for CompletenessRule {
250    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
251        let completeness = 1.0 - profile.basic_stats.null_percentage;
252        let mut suggestions = Vec::new();
253
254        if completeness >= self.high_completeness_threshold {
255            suggestions.push(SuggestedConstraint {
256                check_type: "is_complete".to_string(),
257                column: profile.column_name.clone(),
258                parameters: HashMap::new(),
259                confidence: 0.9,
260                rationale: format!(
261                    "Column is {:.1}%+ complete, suggesting completeness constraint",
262                    completeness * 100.0
263                ),
264                priority: SuggestionPriority::High,
265            });
266        } else if completeness >= self.medium_completeness_threshold {
267            let mut params = HashMap::new();
268            params.insert(
269                "threshold".to_string(),
270                ConstraintParameter::Float(completeness - 0.02),
271            );
272
273            suggestions.push(SuggestedConstraint {
274                check_type: "has_completeness".to_string(),
275                column: profile.column_name.clone(),
276                parameters: params,
277                confidence: 0.8,
278                rationale: format!(
279                    "Column has {:.1}% completeness, suggesting threshold constraint",
280                    completeness * 100.0
281                ),
282                priority: SuggestionPriority::Medium,
283            });
284        } else if completeness < 0.5 {
285            // Very low completeness - might indicate data quality issues
286            suggestions.push(SuggestedConstraint {
287                check_type: "monitor_completeness".to_string(),
288                column: profile.column_name.clone(),
289                parameters: HashMap::new(),
290                confidence: 0.7,
291                rationale: format!(
292                    "Column has only {:.1}% completeness, suggesting monitoring",
293                    completeness * 100.0
294                ),
295                priority: SuggestionPriority::Critical,
296            });
297        }
298
299        suggestions
300    }
301
302    fn name(&self) -> &str {
303        "CompletenessRule"
304    }
305
306    fn description(&self) -> &str {
307        "Analyzes null percentage to suggest completeness constraints"
308    }
309}
310
/// Rule that derives uniqueness suggestions for columns that look like keys.
pub struct UniquenessRule {
    /// Unique-value ratio at or above which a strict uniqueness check is suggested.
    high_uniqueness_threshold: f64,
    /// Unique-value ratio at or above which (but below the high threshold) a
    /// thresholded uniqueness check is suggested.
    medium_uniqueness_threshold: f64,
}

impl UniquenessRule {
    /// Build the rule with the stock thresholds: 0.95 (high) and 0.80 (medium).
    pub fn new() -> Self {
        UniquenessRule::with_thresholds(0.95, 0.80)
    }

    /// Build the rule with caller-supplied thresholds.
    ///
    /// Each threshold is clamped into `0.0..=1.0` independently.
    pub fn with_thresholds(high: f64, medium: f64) -> Self {
        let high_uniqueness_threshold = high.clamp(0.0, 1.0);
        let medium_uniqueness_threshold = medium.clamp(0.0, 1.0);

        UniquenessRule {
            high_uniqueness_threshold,
            medium_uniqueness_threshold,
        }
    }
}

impl Default for UniquenessRule {
    /// Same thresholds as [`UniquenessRule::new`].
    fn default() -> Self {
        UniquenessRule::new()
    }
}
340
341impl ConstraintSuggestionRule for UniquenessRule {
342    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
343        let total_rows = profile.basic_stats.row_count as f64;
344        let unique_ratio = if total_rows > 0.0 {
345            profile.basic_stats.approximate_cardinality as f64 / total_rows
346        } else {
347            0.0
348        };
349
350        let mut suggestions = Vec::new();
351
352        if unique_ratio >= self.high_uniqueness_threshold {
353            suggestions.push(SuggestedConstraint {
354                check_type: "is_unique".to_string(),
355                column: profile.column_name.clone(),
356                parameters: HashMap::new(),
357                confidence: 0.9,
358                rationale: format!(
359                    "Column has {:.1}% unique values, suggesting uniqueness constraint",
360                    unique_ratio * 100.0
361                ),
362                priority: SuggestionPriority::High,
363            });
364        } else if unique_ratio >= self.medium_uniqueness_threshold {
365            let mut params = HashMap::new();
366            params.insert(
367                "threshold".to_string(),
368                ConstraintParameter::Float(unique_ratio - 0.05),
369            );
370
371            suggestions.push(SuggestedConstraint {
372                check_type: "has_uniqueness".to_string(),
373                column: profile.column_name.clone(),
374                parameters: params,
375                confidence: 0.7,
376                rationale: format!(
377                    "Column has {:.1}% unique values, suggesting uniqueness monitoring",
378                    unique_ratio * 100.0
379                ),
380                priority: SuggestionPriority::Medium,
381            });
382        }
383
384        // Special case: if the column looks like an ID based on naming patterns
385        let column_lower = profile.column_name.to_lowercase();
386        if (column_lower.contains("id") || column_lower.contains("key")) && unique_ratio > 0.7 {
387            suggestions.push(SuggestedConstraint {
388                check_type: "primary_key_candidate".to_string(),
389                column: profile.column_name.clone(),
390                parameters: HashMap::new(),
391                confidence: 0.8,
392                rationale: "Column name suggests identifier and has high uniqueness".to_string(),
393                priority: SuggestionPriority::High,
394            });
395        }
396
397        suggestions
398    }
399
400    fn name(&self) -> &str {
401        "UniquenessRule"
402    }
403
404    fn description(&self) -> &str {
405        "Analyzes cardinality to suggest uniqueness constraints for potential keys"
406    }
407}
408
/// Pattern rule that identifies common data formats
pub struct PatternRule;

impl PatternRule {
    /// Number of leading sample values inspected by each pattern check.
    const SAMPLE_LIMIT: usize = 10;

    /// Create a new pattern rule
    pub fn new() -> Self {
        Self
    }

    /// Check if values match email pattern.
    ///
    /// Every inspected sample must contain both `@` and `.`. Returns `false` for an
    /// empty sample list.
    fn is_email_pattern(&self, samples: &[String]) -> bool {
        Self::inspected_samples_all(samples, |s| s.contains('@') && s.contains('.'))
    }

    /// Check if values match date pattern.
    ///
    /// An inspected sample counts as date-like when it either
    /// * contains a `-` or `/` separator together with at least one numeric
    ///   character, or
    /// * consists of exactly eight ASCII digits (e.g. `YYYYMMDD`).
    ///
    /// Returns `false` for an empty sample list.
    fn is_date_pattern(&self, samples: &[String]) -> bool {
        Self::inspected_samples_all(samples, |s| {
            let has_separator = s.contains('-') || s.contains('/');
            let has_digit = s.chars().any(char::is_numeric);
            // Length alone is not evidence of a date: require the eight bytes to be
            // digits so arbitrary 8-character words are not classified as dates.
            let is_compact_date = s.len() == 8 && s.bytes().all(|b| b.is_ascii_digit());

            (has_separator && has_digit) || is_compact_date
        })
    }

    /// Check if values match phone pattern.
    ///
    /// Every inspected sample must contain at least ten numeric characters. Returns
    /// `false` for an empty sample list.
    fn is_phone_pattern(&self, samples: &[String]) -> bool {
        Self::inspected_samples_all(samples, |s| {
            s.chars().filter(|c| c.is_numeric()).count() >= 10
        })
    }

    /// Returns `true` when `samples` is non-empty and `predicate` holds for each of
    /// its first [`Self::SAMPLE_LIMIT`] entries.
    ///
    /// The explicit emptiness check avoids the vacuous truth of `Iterator::all` on an
    /// empty iterator, which would otherwise report every pattern as matching.
    fn inspected_samples_all(samples: &[String], predicate: impl Fn(&str) -> bool) -> bool {
        !samples.is_empty()
            && samples
                .iter()
                .take(Self::SAMPLE_LIMIT)
                .all(|sample| predicate(sample.as_str()))
    }
}

impl Default for PatternRule {
    fn default() -> Self {
        Self::new()
    }
}
448
449impl ConstraintSuggestionRule for PatternRule {
450    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
451        let mut suggestions = Vec::new();
452
453        if profile.data_type == DetectedDataType::String
454            && !profile.basic_stats.sample_values.is_empty()
455        {
456            let samples = &profile.basic_stats.sample_values;
457
458            if self.is_email_pattern(samples) {
459                suggestions.push(SuggestedConstraint {
460                    check_type: "matches_email_pattern".to_string(),
461                    column: profile.column_name.clone(),
462                    parameters: HashMap::new(),
463                    confidence: 0.85,
464                    rationale: "Sample values suggest email format".to_string(),
465                    priority: SuggestionPriority::Medium,
466                });
467            }
468
469            if self.is_date_pattern(samples) {
470                suggestions.push(SuggestedConstraint {
471                    check_type: "matches_date_pattern".to_string(),
472                    column: profile.column_name.clone(),
473                    parameters: HashMap::new(),
474                    confidence: 0.75,
475                    rationale: "Sample values suggest date format".to_string(),
476                    priority: SuggestionPriority::Medium,
477                });
478            }
479
480            if self.is_phone_pattern(samples) {
481                suggestions.push(SuggestedConstraint {
482                    check_type: "matches_phone_pattern".to_string(),
483                    column: profile.column_name.clone(),
484                    parameters: HashMap::new(),
485                    confidence: 0.70,
486                    rationale: "Sample values suggest phone number format".to_string(),
487                    priority: SuggestionPriority::Low,
488                });
489            }
490        }
491
492        suggestions
493    }
494
495    fn name(&self) -> &str {
496        "PatternRule"
497    }
498
499    fn description(&self) -> &str {
500        "Identifies common data patterns like emails, dates, and phone numbers"
501    }
502}
503
/// Rule proposing min/max bounds (and related checks) for numeric columns.
///
/// The rule carries no configuration, so `Default` is derived.
#[derive(Default)]
pub struct RangeRule;

impl RangeRule {
    /// Construct the (stateless) range rule.
    pub fn new() -> Self {
        RangeRule
    }
}
519
520impl ConstraintSuggestionRule for RangeRule {
521    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
522        let mut suggestions = Vec::new();
523
524        match profile.data_type {
525            DetectedDataType::Integer | DetectedDataType::Double => {
526                // Try to get min/max from basic_stats
527                if let (Some(ref min_str), Some(ref max_str)) = (
528                    &profile.basic_stats.min_value,
529                    &profile.basic_stats.max_value,
530                ) {
531                    if let (Ok(min_val), Ok(max_val)) =
532                        (min_str.parse::<f64>(), max_str.parse::<f64>())
533                    {
534                        let range = max_val - min_val;
535
536                        // Suggest range constraints if the range is reasonable
537                        if range > 0.0 && min_val >= 0.0 {
538                            let mut min_params = HashMap::new();
539                            min_params.insert(
540                                "threshold".to_string(),
541                                ConstraintParameter::Float(min_val),
542                            );
543
544                            suggestions.push(SuggestedConstraint {
545                                check_type: "has_min".to_string(),
546                                column: profile.column_name.clone(),
547                                parameters: min_params,
548                                confidence: 0.8,
549                                rationale: format!("Minimum value observed: {min_val}"),
550                                priority: SuggestionPriority::Medium,
551                            });
552
553                            let mut max_params = HashMap::new();
554                            max_params.insert(
555                                "threshold".to_string(),
556                                ConstraintParameter::Float(max_val),
557                            );
558
559                            suggestions.push(SuggestedConstraint {
560                                check_type: "has_max".to_string(),
561                                column: profile.column_name.clone(),
562                                parameters: max_params,
563                                confidence: 0.8,
564                                rationale: format!("Maximum value observed: {max_val}"),
565                                priority: SuggestionPriority::Medium,
566                            });
567
568                            // Suggest positive values constraint if applicable
569                            if min_val >= 0.0 {
570                                suggestions.push(SuggestedConstraint {
571                                    check_type: "is_positive".to_string(),
572                                    column: profile.column_name.clone(),
573                                    parameters: HashMap::new(),
574                                    confidence: 0.9,
575                                    rationale: "All observed values are non-negative".to_string(),
576                                    priority: SuggestionPriority::High,
577                                });
578                            }
579                        }
580                    }
581                }
582
583                // Check for potential quantile-based constraints from numeric distribution
584                if let Some(distribution) = &profile.numeric_distribution {
585                    if let Some(quantiles) = distribution.quantiles.get("P99") {
586                        let mut outlier_params = HashMap::new();
587                        outlier_params.insert(
588                            "threshold".to_string(),
589                            ConstraintParameter::Float(*quantiles),
590                        );
591
592                        suggestions.push(SuggestedConstraint {
593                            check_type: "has_no_outliers".to_string(),
594                            column: profile.column_name.clone(),
595                            parameters: outlier_params,
596                            confidence: 0.7,
597                            rationale: "Suggests outlier detection based on P99".to_string(),
598                            priority: SuggestionPriority::Low,
599                        });
600                    }
601                }
602            }
603            _ => {}
604        }
605
606        suggestions
607    }
608
609    fn name(&self) -> &str {
610        "RangeRule"
611    }
612
613    fn description(&self) -> &str {
614        "Suggests min/max constraints and outlier detection for numeric columns"
615    }
616}
617
/// Rule proposing type-consistency checks based on the detected column type.
///
/// The rule carries no configuration, so `Default` is derived.
#[derive(Default)]
pub struct DataTypeRule;

impl DataTypeRule {
    /// Construct the (stateless) data type rule.
    pub fn new() -> Self {
        DataTypeRule
    }
}
633
634impl ConstraintSuggestionRule for DataTypeRule {
635    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
636        let mut suggestions = Vec::new();
637
638        match &profile.data_type {
639            DetectedDataType::Mixed => {
640                suggestions.push(SuggestedConstraint {
641                    check_type: "has_consistent_type".to_string(),
642                    column: profile.column_name.clone(),
643                    parameters: HashMap::new(),
644                    confidence: 0.9,
645                    rationale: "Column has mixed data types, suggesting type consistency check"
646                        .to_string(),
647                    priority: SuggestionPriority::Critical,
648                });
649            }
650            DetectedDataType::Unknown => {
651                suggestions.push(SuggestedConstraint {
652                    check_type: "validate_data_type".to_string(),
653                    column: profile.column_name.clone(),
654                    parameters: HashMap::new(),
655                    confidence: 0.8,
656                    rationale: "Unable to determine data type, suggesting validation".to_string(),
657                    priority: SuggestionPriority::High,
658                });
659            }
660            detected_type => {
661                let mut params = HashMap::new();
662                params.insert(
663                    "expected_type".to_string(),
664                    ConstraintParameter::String(format!("{detected_type:?}")),
665                );
666
667                suggestions.push(SuggestedConstraint {
668                    check_type: "has_data_type".to_string(),
669                    column: profile.column_name.clone(),
670                    parameters: params,
671                    confidence: 0.85,
672                    rationale: format!("Column consistently contains {detected_type:?} values"),
673                    priority: SuggestionPriority::Medium,
674                });
675            }
676        }
677
678        suggestions
679    }
680
681    fn name(&self) -> &str {
682        "DataTypeRule"
683    }
684
685    fn description(&self) -> &str {
686        "Suggests data type validation constraints based on detected types"
687    }
688}
689
/// Rule that spots categorical columns from their distinct-value count.
pub struct CardinalityRule {
    /// Distinct-value count up to which cardinality monitoring is suggested.
    categorical_threshold: u64,
    /// Distinct-value count up to which the column is treated as categorical.
    low_cardinality_threshold: u64,
}

impl CardinalityRule {
    /// Build the rule with the stock thresholds: 50 (categorical) and 10 (low).
    pub fn new() -> Self {
        CardinalityRule::with_thresholds(50, 10)
    }

    /// Build the rule with caller-supplied thresholds; the values are stored as given.
    pub fn with_thresholds(categorical: u64, low_cardinality: u64) -> Self {
        CardinalityRule {
            low_cardinality_threshold: low_cardinality,
            categorical_threshold: categorical,
        }
    }
}

impl Default for CardinalityRule {
    /// Same thresholds as [`CardinalityRule::new`].
    fn default() -> Self {
        CardinalityRule::new()
    }
}
719
720impl ConstraintSuggestionRule for CardinalityRule {
721    fn apply(&self, profile: &ColumnProfile) -> Vec<SuggestedConstraint> {
722        let mut suggestions = Vec::new();
723        let cardinality = profile.basic_stats.approximate_cardinality;
724        let total_rows = profile.basic_stats.row_count;
725
726        if cardinality <= self.low_cardinality_threshold {
727            suggestions.push(SuggestedConstraint {
728                check_type: "is_categorical".to_string(),
729                column: profile.column_name.clone(),
730                parameters: HashMap::new(),
731                confidence: 0.9,
732                rationale: format!(
733                    "Column has only {cardinality} distinct values, suggesting categorical constraint"
734                ),
735                priority: SuggestionPriority::High,
736            });
737
738            // Suggest specific value constraints if we have histogram data
739            if let Some(histogram) = &profile.categorical_histogram {
740                let valid_values: Vec<String> =
741                    histogram.buckets.iter().map(|b| b.value.clone()).collect();
742
743                let mut params = HashMap::new();
744                params.insert(
745                    "valid_values".to_string(),
746                    ConstraintParameter::String(valid_values.join(",")),
747                );
748
749                suggestions.push(SuggestedConstraint {
750                    check_type: "is_in_set".to_string(),
751                    column: profile.column_name.clone(),
752                    parameters: params,
753                    confidence: 0.85,
754                    rationale: "Column has well-defined categorical values".to_string(),
755                    priority: SuggestionPriority::Medium,
756                });
757            }
758        } else if cardinality <= self.categorical_threshold {
759            let mut params = HashMap::new();
760            params.insert(
761                "threshold".to_string(),
762                ConstraintParameter::Integer(cardinality as i64),
763            );
764
765            suggestions.push(SuggestedConstraint {
766                check_type: "has_max_cardinality".to_string(),
767                column: profile.column_name.clone(),
768                parameters: params,
769                confidence: 0.7,
770                rationale: format!(
771                    "Column has {cardinality} distinct values, suggesting cardinality monitoring"
772                ),
773                priority: SuggestionPriority::Medium,
774            });
775        }
776
777        // Check for very high cardinality that might indicate data quality issues
778        if total_rows > 0 && cardinality as f64 / total_rows as f64 > 0.8 {
779            suggestions.push(SuggestedConstraint {
780                check_type: "monitor_cardinality".to_string(),
781                column: profile.column_name.clone(),
782                parameters: HashMap::new(),
783                confidence: 0.6,
784                rationale: "High cardinality might indicate data quality issues".to_string(),
785                priority: SuggestionPriority::Low,
786            });
787        }
788
789        suggestions
790    }
791
792    fn name(&self) -> &str {
793        "CardinalityRule"
794    }
795
796    fn description(&self) -> &str {
797        "Detects categorical columns and suggests cardinality constraints"
798    }
799}
800
801#[cfg(test)]
802mod tests {
803    use super::*;
804    use crate::analyzers::profiler::{
805        BasicStatistics, CategoricalBucket, CategoricalHistogram, DetectedDataType,
806        NumericDistribution,
807    };
808    use std::collections::HashMap;
809
810    fn create_test_profile(column_name: &str, null_percentage: f64) -> ColumnProfile {
811        ColumnProfile {
812            column_name: column_name.to_string(),
813            data_type: DetectedDataType::String,
814            basic_stats: BasicStatistics {
815                row_count: 1000,
816                null_count: (1000.0 * null_percentage) as u64,
817                null_percentage,
818                approximate_cardinality: 500,
819                min_value: None,
820                max_value: None,
821                sample_values: vec!["A".to_string(), "B".to_string()],
822            },
823            categorical_histogram: None,
824            numeric_distribution: None,
825            profiling_time_ms: 100,
826            passes_executed: vec![1],
827        }
828    }
829
830    fn create_numeric_profile(column_name: &str, min_val: f64, max_val: f64) -> ColumnProfile {
831        let mut quantiles = HashMap::new();
832        quantiles.insert("P99".to_string(), max_val * 0.99);
833
834        ColumnProfile {
835            column_name: column_name.to_string(),
836            data_type: DetectedDataType::Double,
837            basic_stats: BasicStatistics {
838                row_count: 1000,
839                null_count: 0,
840                null_percentage: 0.0,
841                approximate_cardinality: 800,
842                min_value: Some(min_val.to_string()),
843                max_value: Some(max_val.to_string()),
844                sample_values: vec![min_val.to_string(), max_val.to_string()],
845            },
846            categorical_histogram: None,
847            numeric_distribution: Some(NumericDistribution {
848                mean: Some((min_val + max_val) / 2.0),
849                std_dev: Some(10.0),
850                variance: Some(100.0),
851                quantiles,
852                outlier_count: 0,
853                skewness: None,
854                kurtosis: None,
855            }),
856            profiling_time_ms: 100,
857            passes_executed: vec![1, 3],
858        }
859    }
860
861    fn create_categorical_profile(column_name: &str, cardinality: u64) -> ColumnProfile {
862        let buckets = vec![
863            CategoricalBucket {
864                value: "A".to_string(),
865                count: 400,
866            },
867            CategoricalBucket {
868                value: "B".to_string(),
869                count: 300,
870            },
871            CategoricalBucket {
872                value: "C".to_string(),
873                count: 200,
874            },
875            CategoricalBucket {
876                value: "D".to_string(),
877                count: 100,
878            },
879        ];
880
881        ColumnProfile {
882            column_name: column_name.to_string(),
883            data_type: DetectedDataType::String,
884            basic_stats: BasicStatistics {
885                row_count: 1000,
886                null_count: 0,
887                null_percentage: 0.0,
888                approximate_cardinality: cardinality,
889                min_value: None,
890                max_value: None,
891                sample_values: vec!["A".to_string(), "B".to_string()],
892            },
893            categorical_histogram: Some(CategoricalHistogram {
894                buckets,
895                total_count: 1000,
896                entropy: 1.5,
897                top_values: vec![("A".to_string(), 400), ("B".to_string(), 300)],
898            }),
899            numeric_distribution: None,
900            profiling_time_ms: 100,
901            passes_executed: vec![1, 2],
902        }
903    }
904
905    #[test]
906    fn test_completeness_rule_high_completeness() {
907        let rule = CompletenessRule::new();
908        let profile = create_test_profile("test_col", 0.01); // 99% complete
909
910        let suggestions = rule.apply(&profile);
911        assert_eq!(suggestions.len(), 1);
912        assert_eq!(suggestions[0].check_type, "is_complete");
913        assert_eq!(suggestions[0].confidence, 0.9);
914        assert_eq!(suggestions[0].priority, SuggestionPriority::High);
915    }
916
917    #[test]
918    fn test_completeness_rule_medium_completeness() {
919        let rule = CompletenessRule::new();
920        let profile = create_test_profile("test_col", 0.05); // 95% complete
921
922        let suggestions = rule.apply(&profile);
923        assert_eq!(suggestions.len(), 1);
924        assert_eq!(suggestions[0].check_type, "has_completeness");
925        assert_eq!(suggestions[0].confidence, 0.8);
926        assert_eq!(suggestions[0].priority, SuggestionPriority::Medium);
927
928        // Check threshold parameter
929        if let Some(ConstraintParameter::Float(threshold)) =
930            suggestions[0].parameters.get("threshold")
931        {
932            assert!(*threshold < 0.95);
933            assert!(*threshold > 0.90);
934        } else {
935            panic!("Expected threshold parameter");
936        }
937    }
938
939    #[test]
940    fn test_completeness_rule_low_completeness() {
941        let rule = CompletenessRule::new();
942        let profile = create_test_profile("test_col", 0.6); // 40% complete
943
944        let suggestions = rule.apply(&profile);
945        assert_eq!(suggestions.len(), 1);
946        assert_eq!(suggestions[0].check_type, "monitor_completeness");
947        assert_eq!(suggestions[0].confidence, 0.7);
948        assert_eq!(suggestions[0].priority, SuggestionPriority::Critical);
949    }
950
951    #[test]
952    fn test_suggestion_engine_confidence_filtering() {
953        let engine = SuggestionEngine::new()
954            .confidence_threshold(0.85)
955            .add_rule(Box::new(CompletenessRule::new()));
956
957        let profile = create_test_profile("test_col", 0.05); // 95% complete, confidence 0.8
958        let suggestions = engine.suggest_constraints(&profile);
959
960        // Should be filtered out due to confidence threshold
961        assert_eq!(suggestions.len(), 0);
962    }
963
964    #[test]
965    fn test_suggestion_engine_max_suggestions() {
966        let engine = SuggestionEngine::new()
967            .max_suggestions_per_column(1)
968            .add_rule(Box::new(CompletenessRule::new()));
969
970        let profile = create_test_profile("test_col", 0.01); // High completeness
971        let suggestions = engine.suggest_constraints(&profile);
972
973        assert!(suggestions.len() <= 1);
974    }
975
976    #[test]
977    fn test_uniqueness_rule_high_uniqueness() {
978        let rule = UniquenessRule::new();
979        let mut profile = create_test_profile("test_col", 0.0);
980        profile.basic_stats.approximate_cardinality = 980; // 98% unique
981
982        let suggestions = rule.apply(&profile);
983        assert_eq!(suggestions.len(), 1);
984        assert_eq!(suggestions[0].check_type, "is_unique");
985        assert_eq!(suggestions[0].confidence, 0.9);
986    }
987
988    #[test]
989    fn test_uniqueness_rule_id_column() {
990        let rule = UniquenessRule::new();
991        let mut profile = create_test_profile("user_id", 0.0);
992        profile.basic_stats.approximate_cardinality = 800; // 80% unique
993
994        let suggestions = rule.apply(&profile);
995        assert!(suggestions
996            .iter()
997            .any(|s| s.check_type == "primary_key_candidate"));
998    }
999
1000    #[test]
1001    fn test_pattern_rule_email() {
1002        let rule = PatternRule::new();
1003        let mut profile = create_test_profile("email", 0.0);
1004        profile.basic_stats.sample_values = vec![
1005            "user@example.com".to_string(),
1006            "test@domain.org".to_string(),
1007        ];
1008
1009        let suggestions = rule.apply(&profile);
1010        assert!(suggestions
1011            .iter()
1012            .any(|s| s.check_type == "matches_email_pattern"));
1013    }
1014
1015    #[test]
1016    fn test_range_rule_numeric() {
1017        let rule = RangeRule::new();
1018        let profile = create_numeric_profile("age", 0.0, 100.0);
1019
1020        let suggestions = rule.apply(&profile);
1021        assert!(suggestions.iter().any(|s| s.check_type == "has_min"));
1022        assert!(suggestions.iter().any(|s| s.check_type == "has_max"));
1023        assert!(suggestions.iter().any(|s| s.check_type == "is_positive"));
1024    }
1025
1026    #[test]
1027    fn test_data_type_rule_mixed() {
1028        let rule = DataTypeRule::new();
1029        let mut profile = create_test_profile("mixed_col", 0.0);
1030        profile.data_type = DetectedDataType::Mixed;
1031
1032        let suggestions = rule.apply(&profile);
1033        assert_eq!(suggestions.len(), 1);
1034        assert_eq!(suggestions[0].check_type, "has_consistent_type");
1035        assert_eq!(suggestions[0].priority, SuggestionPriority::Critical);
1036    }
1037
1038    #[test]
1039    fn test_cardinality_rule_categorical() {
1040        let rule = CardinalityRule::new();
1041        let profile = create_categorical_profile("status", 4);
1042
1043        let suggestions = rule.apply(&profile);
1044        assert!(suggestions.iter().any(|s| s.check_type == "is_categorical"));
1045        assert!(suggestions.iter().any(|s| s.check_type == "is_in_set"));
1046    }
1047
1048    #[test]
1049    fn test_suggestion_engine_with_all_rules() {
1050        let engine = SuggestionEngine::new()
1051            .add_rule(Box::new(CompletenessRule::new()))
1052            .add_rule(Box::new(UniquenessRule::new()))
1053            .add_rule(Box::new(PatternRule::new()))
1054            .add_rule(Box::new(RangeRule::new()))
1055            .add_rule(Box::new(DataTypeRule::new()))
1056            .add_rule(Box::new(CardinalityRule::new()));
1057
1058        let profile = create_numeric_profile("price", 0.0, 999.99);
1059        let suggestions = engine.suggest_constraints(&profile);
1060
1061        // Should get suggestions from multiple rules
1062        assert!(!suggestions.is_empty());
1063
1064        // Verify they're sorted by confidence
1065        for i in 1..suggestions.len() {
1066            assert!(suggestions[i - 1].confidence >= suggestions[i].confidence);
1067        }
1068    }
1069
1070    #[test]
1071    fn test_suggestion_batch_processing() {
1072        let engine = SuggestionEngine::new().add_rule(Box::new(CompletenessRule::new()));
1073
1074        let profiles = vec![
1075            create_test_profile("col1", 0.01),
1076            create_test_profile("col2", 0.05),
1077        ];
1078
1079        let batch_results = engine.suggest_constraints_batch(&profiles);
1080        assert_eq!(batch_results.len(), 2);
1081        assert!(batch_results.contains_key("col1"));
1082        assert!(batch_results.contains_key("col2"));
1083    }
1084}