Skip to main content

xls_rs/
validation.rs

//! Data validation operations
//!
//! Provides comprehensive data validation capabilities including
//! rule-based validation, data quality checks, and reporting.
5
6use crate::common::string;
7use anyhow::Result;
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10
11/// Validation rule types
12#[derive(Debug, Clone, Serialize, Deserialize)]
13#[serde(tag = "type")]
14pub enum ValidationRule {
15    /// Check if value is not null/empty
16    NotNull,
17    /// Check if value matches a regex pattern
18    Regex { pattern: String },
19    /// Check if value is within a numeric range
20    Range { min: Option<f64>, max: Option<f64> },
21    /// Check if value is one of allowed values
22    Enum { values: Vec<String> },
23    /// Check if value has specific length
24    Length {
25        min: Option<usize>,
26        max: Option<usize>,
27    },
28    /// Check if value is a valid email
29    Email,
30    /// Check if value is a valid URL
31    Url,
32    /// Check if value is numeric
33    Numeric,
34    /// Check if value is a valid date
35    Date { format: String },
36    /// Custom validation using expression
37    Custom { expression: String },
38}
39
40/// Validation result
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct ValidationResult {
43    pub is_valid: bool,
44    pub errors: Vec<ValidationError>,
45    pub warnings: Vec<ValidationWarning>,
46    pub stats: ValidationStats,
47}
48
49/// Validation error
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub struct ValidationError {
52    pub row: usize,
53    pub column: String,
54    pub value: String,
55    pub rule: String,
56    pub message: String,
57}
58
59/// Validation warning
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct ValidationWarning {
62    pub row: usize,
63    pub column: String,
64    pub value: String,
65    pub message: String,
66}
67
68/// Validation statistics
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct ValidationStats {
71    pub total_rows: usize,
72    pub valid_rows: usize,
73    pub invalid_rows: usize,
74    pub total_errors: usize,
75    pub total_warnings: usize,
76    pub columns_validated: usize,
77}
78
79/// Validation configuration
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct ValidationConfig {
82    pub rules: HashMap<String, Vec<ValidationRule>>,
83    pub strict_mode: bool,
84    pub stop_on_first_error: bool,
85    pub max_errors: Option<usize>,
86}
87
88impl Default for ValidationConfig {
89    fn default() -> Self {
90        Self {
91            rules: HashMap::new(),
92            strict_mode: false,
93            stop_on_first_error: false,
94            max_errors: None,
95        }
96    }
97}
98
99/// Data validator
100pub struct DataValidator {
101    config: ValidationConfig,
102}
103
104impl DataValidator {
105    /// Create a new validator with configuration
106    pub fn new(config: ValidationConfig) -> Self {
107        Self { config }
108    }
109
110    /// Create a validator from JSON configuration file
111    pub fn from_config_file(path: &str) -> Result<Self> {
112        let content = std::fs::read_to_string(path)?;
113        let config: ValidationConfig = serde_json::from_str(&content)?;
114        Ok(Self::new(config))
115    }
116
117    /// Validate data rows
118    pub fn validate(&self, data: &[Vec<String>]) -> Result<ValidationResult> {
119        if data.is_empty() {
120            return Ok(ValidationResult {
121                is_valid: true,
122                errors: Vec::new(),
123                warnings: Vec::new(),
124                stats: ValidationStats {
125                    total_rows: 0,
126                    valid_rows: 0,
127                    invalid_rows: 0,
128                    total_errors: 0,
129                    total_warnings: 0,
130                    columns_validated: 0,
131                },
132            });
133        }
134
135        let header = &data[0];
136        let mut errors = Vec::new();
137        let warnings = Vec::new();
138        let mut valid_rows = 0;
139
140        for (row_idx, row) in data.iter().enumerate().skip(1) {
141            let mut row_valid = true;
142
143            for (col_idx, cell_value) in row.iter().enumerate() {
144                if let Some(column_name) = header.get(col_idx) {
145                    if let Some(rules) = self.config.rules.get(column_name) {
146                        for rule in rules {
147                            match self.validate_value(cell_value, rule) {
148                                Ok(()) => {} // Valid
149                                Err(e) => {
150                                    let error = ValidationError {
151                                        row: row_idx,
152                                        column: column_name.clone(),
153                                        value: cell_value.clone(),
154                                        rule: format!("{:?}", rule),
155                                        message: e.to_string(),
156                                    };
157                                    errors.push(error);
158                                    row_valid = false;
159
160                                    if self.config.stop_on_first_error {
161                                        break;
162                                    }
163
164                                    if let Some(max) = self.config.max_errors {
165                                        if errors.len() >= max {
166                                            break;
167                                        }
168                                    }
169                                }
170                            }
171                        }
172                    }
173                }
174            }
175
176            if row_valid {
177                valid_rows += 1;
178            }
179
180            if self.config.stop_on_first_error && !errors.is_empty() {
181                break;
182            }
183
184            if let Some(max) = self.config.max_errors {
185                if errors.len() >= max {
186                    break;
187                }
188            }
189        }
190
191        let total_rows = data.len() - 1; // Exclude header
192        let invalid_rows = total_rows - valid_rows;
193        let is_valid = if self.config.strict_mode {
194            errors.is_empty() && warnings.is_empty()
195        } else {
196            errors.is_empty()
197        };
198
199        let total_errors = errors.len();
200        let total_warnings = warnings.len();
201
202        Ok(ValidationResult {
203            is_valid,
204            errors,
205            warnings,
206            stats: ValidationStats {
207                total_rows,
208                valid_rows,
209                invalid_rows,
210                total_errors,
211                total_warnings,
212                columns_validated: self.config.rules.len(),
213            },
214        })
215    }
216
217    /// Validate a single value against a rule
218    fn validate_value(&self, value: &str, rule: &ValidationRule) -> Result<()> {
219        match rule {
220            ValidationRule::NotNull => {
221                if string::is_empty_or_whitespace(value) {
222                    return Err(anyhow::anyhow!("Value cannot be null or empty"));
223                }
224            }
225            ValidationRule::Regex { pattern } => {
226                let re = regex::Regex::new(pattern)?;
227                if !re.is_match(value) {
228                    return Err(anyhow::anyhow!("Value does not match pattern: {}", pattern));
229                }
230            }
231            ValidationRule::Range { min, max } => {
232                if let Some(num) = string::to_number(value) {
233                    if let Some(min_val) = min {
234                        if num < *min_val {
235                            return Err(anyhow::anyhow!(
236                                "Value {} is below minimum {}",
237                                num,
238                                min_val
239                            ));
240                        }
241                    }
242                    if let Some(max_val) = max {
243                        if num > *max_val {
244                            return Err(anyhow::anyhow!(
245                                "Value {} is above maximum {}",
246                                num,
247                                max_val
248                            ));
249                        }
250                    }
251                } else {
252                    return Err(anyhow::anyhow!("Value is not numeric"));
253                }
254            }
255            ValidationRule::Enum { values } => {
256                if !values.contains(&value.to_string()) {
257                    return Err(anyhow::anyhow!(
258                        "Value '{}' is not in allowed values: {:?}",
259                        value,
260                        values
261                    ));
262                }
263            }
264            ValidationRule::Length { min, max } => {
265                let len = value.len();
266                if let Some(min_len) = min {
267                    if len < *min_len {
268                        return Err(anyhow::anyhow!(
269                            "Length {} is below minimum {}",
270                            len,
271                            min_len
272                        ));
273                    }
274                }
275                if let Some(max_len) = max {
276                    if len > *max_len {
277                        return Err(anyhow::anyhow!(
278                            "Length {} is above maximum {}",
279                            len,
280                            max_len
281                        ));
282                    }
283                }
284            }
285            ValidationRule::Email => {
286                let email_regex =
287                    regex::Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$")?;
288                if !email_regex.is_match(value) {
289                    return Err(anyhow::anyhow!("Invalid email format"));
290                }
291            }
292            ValidationRule::Url => {
293                let url_regex = regex::Regex::new(r"^https?://[^\s/$.?#].[^\s]*$")?;
294                if !url_regex.is_match(value) {
295                    return Err(anyhow::anyhow!("Invalid URL format"));
296                }
297            }
298            ValidationRule::Numeric => {
299                if !string::is_numeric(value) {
300                    return Err(anyhow::anyhow!("Value is not numeric"));
301                }
302            }
303            ValidationRule::Date { format } => {
304                chrono::NaiveDate::parse_from_str(value, format)
305                    .map_err(|_| anyhow::anyhow!("Invalid date format for {}", format))?;
306            }
307            ValidationRule::Custom { expression } => {
308                // Simple custom expression evaluation
309                // In a real implementation, this would use a proper expression parser
310                if expression.contains("not_empty") && string::is_empty_or_whitespace(value) {
311                    return Err(anyhow::anyhow!("Custom validation failed: {}", expression));
312                }
313            }
314        }
315
316        Ok(())
317    }
318
319    /// Generate validation report
320    pub fn generate_report(&self, result: &ValidationResult) -> String {
321        let mut report = String::new();
322
323        report.push_str("# Data Validation Report\n\n");
324
325        // Summary
326        report.push_str("## Summary\n\n");
327        report.push_str(&format!(
328            "- **Total Rows**: {}\n\
329             - **Valid Rows**: {}\n\
330             - **Invalid Rows**: {}\n\
331             - **Total Errors**: {}\n\
332             - **Total Warnings**: {}\n\
333             - **Columns Validated**: {}\n\
334             - **Overall Status**: {}\n\n",
335            result.stats.total_rows,
336            result.stats.valid_rows,
337            result.stats.invalid_rows,
338            result.stats.total_errors,
339            result.stats.total_warnings,
340            result.stats.columns_validated,
341            if result.is_valid {
342                "✅ PASSED"
343            } else {
344                "❌ FAILED"
345            }
346        ));
347
348        // Errors
349        if !result.errors.is_empty() {
350            report.push_str("## Errors\n\n");
351            for error in &result.errors {
352                report.push_str(&format!(
353                    "- **Row {}**, Column `{}`: {} (value: `{}`)\n",
354                    error.row + 1,
355                    error.column,
356                    error.message,
357                    error.value
358                ));
359            }
360            report.push('\n');
361        }
362
363        // Warnings
364        if !result.warnings.is_empty() {
365            report.push_str("## Warnings\n\n");
366            for warning in &result.warnings {
367                report.push_str(&format!(
368                    "- **Row {}**, Column `{}`: {} (value: `{}`)\n",
369                    warning.row + 1,
370                    warning.column,
371                    warning.message,
372                    warning.value
373                ));
374            }
375            report.push('\n');
376        }
377
378        report
379    }
380
381    /// Save validation result to file
382    pub fn save_result(&self, result: &ValidationResult, path: &str) -> Result<()> {
383        let json = serde_json::to_string_pretty(result)?;
384        std::fs::write(path, json)?;
385        Ok(())
386    }
387}
388
389/// Create a sample validation configuration
390pub fn create_sample_config() -> ValidationConfig {
391    let mut rules = HashMap::new();
392
393    // Email validation
394    rules.insert(
395        "email".to_string(),
396        vec![ValidationRule::Email, ValidationRule::NotNull],
397    );
398
399    // Age validation
400    rules.insert(
401        "age".to_string(),
402        vec![
403            ValidationRule::Numeric,
404            ValidationRule::Range {
405                min: Some(0.0),
406                max: Some(150.0),
407            },
408        ],
409    );
410
411    // Name validation
412    rules.insert(
413        "name".to_string(),
414        vec![
415            ValidationRule::NotNull,
416            ValidationRule::Length {
417                min: Some(1),
418                max: Some(100),
419            },
420        ],
421    );
422
423    // Status validation
424    rules.insert(
425        "status".to_string(),
426        vec![ValidationRule::Enum {
427            values: vec![
428                "active".to_string(),
429                "inactive".to_string(),
430                "pending".to_string(),
431            ],
432        }],
433    );
434
435    ValidationConfig {
436        rules,
437        strict_mode: false,
438        stop_on_first_error: false,
439        max_errors: Some(1000),
440    }
441}
442
#[cfg(test)]
mod tests {
    use super::*;

    /// Reports whether `value` satisfies `rule`, using a validator built from
    /// the default configuration.
    fn passes(value: &str, rule: &ValidationRule) -> bool {
        DataValidator::new(ValidationConfig::default())
            .validate_value(value, rule)
            .is_ok()
    }

    #[test]
    fn test_validation_not_null() {
        let rule = ValidationRule::NotNull;

        // A non-blank value is accepted.
        assert!(passes("test", &rule));

        // Empty and whitespace-only values are rejected.
        for blank in ["", "   "] {
            assert!(!passes(blank, &rule), "{:?} should be rejected", blank);
        }
    }

    #[test]
    fn test_validation_numeric() {
        let rule = ValidationRule::Numeric;

        // Integers and signed decimals are accepted.
        for number in ["123", "-45.67"] {
            assert!(passes(number, &rule), "{:?} should be accepted", number);
        }

        // Text and the empty string are rejected.
        for not_a_number in ["abc", ""] {
            assert!(
                !passes(not_a_number, &rule),
                "{:?} should be rejected",
                not_a_number
            );
        }
    }

    #[test]
    fn test_validation_range() {
        let rule = ValidationRule::Range {
            min: Some(0.0),
            max: Some(100.0),
        };

        // Both bounds are inclusive.
        for inside in ["50", "0", "100"] {
            assert!(passes(inside, &rule), "{:?} should be accepted", inside);
        }

        // Values just outside either bound are rejected.
        for outside in ["-1", "101"] {
            assert!(!passes(outside, &rule), "{:?} should be rejected", outside);
        }
    }

    #[test]
    fn test_validation_enum() {
        let rule = ValidationRule::Enum {
            values: vec!["red".to_string(), "green".to_string(), "blue".to_string()],
        };

        // Listed values are accepted.
        for listed in ["red", "green"] {
            assert!(passes(listed, &rule), "{:?} should be accepted", listed);
        }

        // Anything not listed is rejected.
        assert!(!passes("yellow", &rule));
    }
}