term_guard/constraints/
format.rs

1//! Unified format validation constraint for pattern matching and content validation.
2//!
3//! This module provides a single flexible constraint that consolidates all pattern-based
4//! validation including email, URL, credit card detection, phone numbers, postal codes,
5//! UUIDs, IP addresses, JSON, and custom regex patterns.
6//!
7//! ## Overview
8//!
9//! The `FormatConstraint` replaces multiple individual constraint types with a single,
10//! powerful constraint that supports:
11//!
12//! - **Built-in formats**: Email, URL, phone, postal codes, UUIDs, IP addresses, JSON, dates
13//! - **Custom regex patterns**: Full regex support with security validation
14//! - **Rich configuration**: Case sensitivity, trimming, null handling
//! - **Performance optimization**: Generated pattern strings are cached and reused
16//! - **Security**: ReDoS protection and SQL injection prevention
17//!
18//! ## Quick Start Examples
19//!
20//! ### Basic Format Validation
21//!
22//! ```rust
23//! use term_guard::constraints::FormatConstraint;
24//!
25//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
26//! // Email validation - require 95% of values to be valid emails
27//! let email_check = FormatConstraint::email("email", 0.95)?;
28//!
29//! // URL validation with localhost support
30//! let url_check = FormatConstraint::url("website", 0.90, true)?;
31//!
32//! // US phone number validation
33//! let phone_check = FormatConstraint::phone("phone", 0.98, Some("US".to_string()))?;
34//!
//! // UUID validation (hyphenated form, version digits 1-5)
36//! let uuid_check = FormatConstraint::uuid("session_id", 1.0)?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! ### Advanced Configuration with FormatOptions
42//!
43//! ```rust
44//! use term_guard::constraints::{FormatConstraint, FormatType, FormatOptions};
45//!
46//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
47//! // Case-insensitive email validation with trimming
48//! let flexible_email = FormatConstraint::new(
49//!     "email",
50//!     FormatType::Email,
51//!     0.95,
52//!     FormatOptions::lenient()  // case insensitive + trimming + allows nulls
53//! )?;
54//!
55//! // Strict phone validation (no nulls, case sensitive)
56//! let strict_phone = FormatConstraint::new(
57//!     "phone",
58//!     FormatType::Phone { country: Some("US".to_string()) },
59//!     0.99,
60//!     FormatOptions::strict()  // null_is_valid = false
61//! )?;
62//!
63//! // Custom regex with options
64//! let product_code = FormatConstraint::new(
65//!     "product_code",
66//!     FormatType::Regex(r"^[A-Z]{2}\d{4}$".to_string()),
67//!     0.98,
68//!     FormatOptions::new()
69//!         .case_sensitive(false)
70//!         .trim_before_check(true)
71//! )?;
72//! # Ok(())
73//! # }
74//! ```
75//!
76//! ### Specialized Format Types
77//!
78//! ```rust
79//! use term_guard::constraints::FormatConstraint;
80//!
81//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
82//! // Postal codes for different countries
83//! let us_zip = FormatConstraint::postal_code("zip", 0.95, "US")?;
84//! let uk_postcode = FormatConstraint::postal_code("postcode", 0.95, "UK")?;
85//! let ca_postal = FormatConstraint::postal_code("postal", 0.95, "CA")?;
86//!
87//! // IP address validation
88//! let ipv4_check = FormatConstraint::ipv4("client_ip", 0.99)?;
89//! let ipv6_check = FormatConstraint::ipv6("server_ip", 0.99)?;
90//!
91//! // JSON format validation
92//! let json_check = FormatConstraint::json("config", 0.98)?;
93//!
94//! // ISO 8601 datetime validation
95//! let datetime_check = FormatConstraint::iso8601_datetime("order_date", 1.0)?;
96//! # Ok(())
97//! # }
98//! ```
99//!
100//! ## Migration from Individual Constraints
101//!
102//! ### Before (Deprecated)
103//! ```rust,ignore
104//! use term_guard::constraints::{PatternConstraint, EmailConstraint, UrlConstraint};
105//!
106//! let email_old = EmailConstraint::new("email", 0.95);
107//! let pattern_old = PatternConstraint::new("phone", r"^\d{3}-\d{3}-\d{4}$", 0.90)?;
108//! let url_old = UrlConstraint::new("website", 0.85);
109//! ```
110//!
111//! ### After (Unified API)
112//! ```rust
113//! use term_guard::constraints::FormatConstraint;
114//!
115//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
116//! let email_new = FormatConstraint::email("email", 0.95)?;
117//! let phone_new = FormatConstraint::phone("phone", 0.90, Some("US".to_string()))?;
118//! let url_new = FormatConstraint::url("website", 0.85, false)?;
119//! # Ok(())
120//! # }
121//! ```
122//!
123//! ## Performance Considerations
124//!
//! - **Pattern Caching**: Regex pattern strings are cached per format type for reuse
126//! - **Built-in Patterns**: Predefined patterns are optimized and tested
127//! - **Security**: All patterns are validated to prevent ReDoS attacks
128//! - **Memory Efficiency**: Single constraint type reduces memory overhead
129//!
130//! ## Common Patterns and Use Cases
131//!
132//! ### Data Quality Checks
133//! ```rust
134//! use term_guard::constraints::{FormatConstraint, FormatOptions};
135//!
136//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
137//! // Customer data validation
138//! let email_quality = FormatConstraint::new(
139//!     "customer_email",
140//!     term_guard::constraints::FormatType::Email,
141//!     0.98,  // 98% must be valid emails
142//!     FormatOptions::lenient()  // Allow some flexibility
143//! )?;
144//!
145//! // Credit card detection (for PII scanning)
146//! let cc_detection = FormatConstraint::credit_card("description", 0.01, true)?; // Detect if > 1% contain CCs
147//! # Ok(())
148//! # }
149//! ```
150//!
151//! ### International Data
152//! ```rust
153//! use term_guard::constraints::FormatConstraint;
154//!
155//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
156//! // Multi-region phone validation
157//! let us_phones = FormatConstraint::phone("us_phone", 0.95, Some("US".to_string()))?;
158//! let uk_phones = FormatConstraint::phone("uk_phone", 0.95, Some("UK".to_string()))?;
159//! let intl_phones = FormatConstraint::phone("intl_phone", 0.90, None)?; // E.164 format
160//!
161//! // Multi-country postal codes
162//! let postal_codes = vec![
163//!     FormatConstraint::postal_code("us_zip", 0.99, "US")?,
164//!     FormatConstraint::postal_code("ca_postal", 0.99, "CA")?,
165//!     FormatConstraint::postal_code("uk_postcode", 0.99, "UK")?,
166//! ];
167//! # Ok(())
168//! # }
169//! ```
170
171use crate::core::{Constraint, ConstraintMetadata, ConstraintResult};
172use crate::prelude::*;
173use crate::security::SqlSecurity;
174use arrow::array::Array;
175use async_trait::async_trait;
176use datafusion::prelude::*;
177use once_cell::sync::Lazy;
178use serde::{Deserialize, Serialize};
179use std::collections::HashMap;
180use std::sync::RwLock;
181use tracing::instrument;
182
183/// Lazy static pattern cache for compiled regex patterns
184static PATTERN_CACHE: Lazy<RwLock<HashMap<String, String>>> =
185    Lazy::new(|| RwLock::new(HashMap::new()));
186
/// Types of format validation that can be performed.
///
/// Every variant is translated into a regex pattern string that is matched
/// against the column values in SQL; none of the variants parse the value.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FormatType {
    /// Custom regular expression pattern.
    ///
    /// The pattern is checked with `SqlSecurity::validate_regex_pattern`
    /// before it is used.
    Regex(String),
    /// Email address validation
    Email,
    /// URL validation (`http`/`https` only) with optional localhost support.
    ///
    /// With `allow_localhost = true` the host may also be `localhost` or a
    /// dotted-quad IP address.
    Url { allow_localhost: bool },
    /// Credit card number detection with optional detection-only mode.
    ///
    /// With `detect_only = true` the constraint succeeds when the match ratio
    /// is *at most* the threshold instead of at least the threshold.
    CreditCard { detect_only: bool },
    /// Phone number validation with optional country specification.
    ///
    /// `US`, `CA`, `UK`, `DE` and `FR` have dedicated patterns; any other
    /// value (or `None`) falls back to an E.164-style international pattern.
    Phone { country: Option<String> },
    /// Postal code validation for a specific country.
    ///
    /// `US`, `CA`, `UK`, `DE`, `FR`, `JP` and `AU` have dedicated patterns;
    /// any other value uses a generic 3-10 character alphanumeric pattern.
    PostalCode { country: String },
    /// UUID validation: hyphenated 8-4-4-4-12 hex form whose version digit is
    /// 1-5 and whose variant digit is 8, 9, a or b.
    UUID,
    /// IPv4 address validation
    IPv4,
    /// IPv6 address validation
    IPv6,
    /// JSON format validation.
    ///
    /// This is a shape check only: the value must start with `{` or `[` and
    /// end with `}` or `]` (surrounding whitespace allowed).
    Json,
    /// ISO 8601 date-time format validation
    /// (`YYYY-MM-DDTHH:MM:SS`, optional fractional seconds, then `Z` or a
    /// `±HH:MM` offset).
    Iso8601DateTime,
}
213
214impl FormatType {
215    /// Returns the regex pattern for this format type.
216    fn get_pattern(&self) -> Result<String> {
217        let cache_key = format!("{self:?}");
218
219        // Check cache first
220        {
221            let cache = PATTERN_CACHE.read().map_err(|_| {
222                TermError::Internal("Failed to acquire read lock on pattern cache".to_string())
223            })?;
224            if let Some(pattern) = cache.get(&cache_key) {
225                return Ok(pattern.clone());
226            }
227        }
228
229        let pattern = match self {
230            FormatType::Regex(pattern) => {
231                SqlSecurity::validate_regex_pattern(pattern)?;
232                pattern.clone()
233            }
234            FormatType::Email => {
235                // More comprehensive email pattern
236                r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$".to_string()
237            }
238            FormatType::Url { allow_localhost } => {
239                if *allow_localhost {
240                    r"^https?://(?:localhost|(?:[a-zA-Z0-9.-]+\.?[a-zA-Z]{2,}|(?:\d{1,3}\.){3}\d{1,3}))(?::\d+)?(?:/[^\s]*)?$".to_string()
241                } else {
242                    r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?$".to_string()
243                }
244            }
245            FormatType::CreditCard { .. } => {
246                // Pattern for major credit card formats (Visa, MasterCard, Amex, Discover)
247                r"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})$|^(?:\d{4}[-\s]?){3}\d{4}$".to_string()
248            }
249            FormatType::Phone { country } => {
250                match country.as_deref() {
251                    Some("US") | Some("CA") => r"^(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$".to_string(),
252                    Some("UK") => r"^(\+44\s?)?(?:\(?0\d{4}\)?\s?\d{6}|\(?0\d{3}\)?\s?\d{7}|\(?0\d{2}\)?\s?\d{8})$".to_string(),
253                    Some("DE") => r"^(\+49\s?)?(?:\(?0\d{2,5}\)?\s?\d{4,12})$".to_string(),
254                    Some("FR") => r"^(\+33\s?)?(?:\(?0\d{1}\)?\s?\d{8})$".to_string(),
255                    _ => r"^[\+]?[1-9][\d]{0,15}$".to_string(), // E.164 international format
256                }
257            }
258            FormatType::PostalCode { country } => {
259                match country.as_str() {
260                    "US" => r"^\d{5}(-\d{4})?$".to_string(),
261                    "CA" => r"^[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$".to_string(),
262                    "UK" => r"^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$".to_string(),
263                    "DE" => r"^\d{5}$".to_string(),
264                    "FR" => r"^\d{5}$".to_string(),
265                    "JP" => r"^\d{3}-\d{4}$".to_string(),
266                    "AU" => r"^\d{4}$".to_string(),
267                    _ => r"^[A-Za-z0-9\s-]{3,10}$".to_string(), // Generic postal code
268                }
269            }
270            FormatType::UUID => {
271                r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$".to_string()
272            }
273            FormatType::IPv4 => {
274                r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$".to_string()
275            }
276            FormatType::IPv6 => {
277                // Simplified IPv6 pattern that handles most common cases
278                r"^([0-9a-fA-F]{0,4}:){1,7}([0-9a-fA-F]{0,4})?$|^::$|^::1$|^([0-9a-fA-F]{1,4}:)*::([0-9a-fA-F]{1,4}:)*[0-9a-fA-F]{1,4}$".to_string()
279            }
280            FormatType::Json => {
281                // Simple JSON structure validation - starts with { or [
282                r"^\s*[\{\[].*[\}\]]\s*$".to_string()
283            }
284            FormatType::Iso8601DateTime => {
285                // ISO 8601 date-time format (basic validation)
286                r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$".to_string()
287            }
288        };
289
290        // Cache the pattern
291        {
292            let mut cache = PATTERN_CACHE.write().map_err(|_| {
293                TermError::Internal("Failed to acquire write lock on pattern cache".to_string())
294            })?;
295            cache.insert(cache_key, pattern.clone());
296        }
297
298        Ok(pattern)
299    }
300
301    /// Returns a human-readable name for this format type.
302    pub fn name(&self) -> &str {
303        match self {
304            FormatType::Regex(_) => "regex",
305            FormatType::Email => "email",
306            FormatType::Url { .. } => "url",
307            FormatType::CreditCard { .. } => "credit_card",
308            FormatType::Phone { .. } => "phone",
309            FormatType::PostalCode { .. } => "postal_code",
310            FormatType::UUID => "uuid",
311            FormatType::IPv4 => "ipv4",
312            FormatType::IPv6 => "ipv6",
313            FormatType::Json => "json",
314            FormatType::Iso8601DateTime => "iso8601_datetime",
315        }
316    }
317
318    /// Returns a human-readable description for this format type.
319    pub fn description(&self) -> String {
320        match self {
321            FormatType::Regex(pattern) => format!("matches pattern '{pattern}'"),
322            FormatType::Email => "are valid email addresses".to_string(),
323            FormatType::Url { allow_localhost } => {
324                if *allow_localhost {
325                    "are valid URLs (including localhost)".to_string()
326                } else {
327                    "are valid URLs".to_string()
328                }
329            }
330            FormatType::CreditCard { detect_only } => {
331                if *detect_only {
332                    "contain credit card number patterns".to_string()
333                } else {
334                    "are valid credit card numbers".to_string()
335                }
336            }
337            FormatType::Phone { country } => match country.as_deref() {
338                Some(c) => format!("are valid {c} phone numbers"),
339                None => "are valid phone numbers".to_string(),
340            },
341            FormatType::PostalCode { country } => {
342                format!("are valid {country} postal codes")
343            }
344            FormatType::UUID => "are valid UUIDs".to_string(),
345            FormatType::IPv4 => "are valid IPv4 addresses".to_string(),
346            FormatType::IPv6 => "are valid IPv6 addresses".to_string(),
347            FormatType::Json => "are valid JSON documents".to_string(),
348            FormatType::Iso8601DateTime => "are valid ISO 8601 date-time strings".to_string(),
349        }
350    }
351}
352
/// Options for format constraint behavior.
///
/// The defaults (see the `Default` impl) are case-sensitive matching, no
/// trimming, and NULL values treated as valid.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FormatOptions {
    /// Whether pattern matching should be case sensitive.
    /// When `false`, the case-insensitive regex operator is used.
    pub case_sensitive: bool,
    /// Whether to trim whitespace before checking format.
    /// When `true`, the column is wrapped in SQL `TRIM(...)` before matching.
    pub trim_before_check: bool,
    /// Whether NULL values should be considered valid.
    /// When `false`, NULL rows still count toward the total but never toward
    /// the matches.
    pub null_is_valid: bool,
}
363
364impl Default for FormatOptions {
365    fn default() -> Self {
366        Self {
367            case_sensitive: true,
368            trim_before_check: false,
369            null_is_valid: true, // NULL values are typically considered valid in data quality
370        }
371    }
372}
373
374impl FormatOptions {
375    /// Creates new format options with default values.
376    pub fn new() -> Self {
377        Self::default()
378    }
379
380    /// Sets case sensitivity for pattern matching.
381    pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
382        self.case_sensitive = case_sensitive;
383        self
384    }
385
386    /// Sets whether to trim whitespace before format checking.
387    pub fn trim_before_check(mut self, trim: bool) -> Self {
388        self.trim_before_check = trim;
389        self
390    }
391
392    /// Sets whether NULL values should be considered valid.
393    pub fn null_is_valid(mut self, null_valid: bool) -> Self {
394        self.null_is_valid = null_valid;
395        self
396    }
397
398    /// Creates format options for case-insensitive matching.
399    ///
400    /// This is a convenience method that sets case_sensitive to false.
401    ///
402    /// # Examples
403    ///
404    /// ```rust
405    /// use term_guard::constraints::FormatOptions;
406    ///
407    /// let options = FormatOptions::case_insensitive();
408    /// assert_eq!(options.case_sensitive, false);
409    /// ```
410    pub fn case_insensitive() -> Self {
411        Self::new().case_sensitive(false)
412    }
413
414    /// Creates format options for strict validation (no nulls, case sensitive, no trimming).
415    ///
416    /// This is a convenience method for the most restrictive validation.
417    ///
418    /// # Examples
419    ///
420    /// ```rust
421    /// use term_guard::constraints::FormatOptions;
422    ///
423    /// let options = FormatOptions::strict();
424    /// assert_eq!(options.case_sensitive, true);
425    /// assert_eq!(options.trim_before_check, false);
426    /// assert_eq!(options.null_is_valid, false);
427    /// ```
428    pub fn strict() -> Self {
429        Self::new().null_is_valid(false)
430    }
431
432    /// Creates format options for lenient validation (case insensitive, trimming, nulls allowed).
433    ///
434    /// This is a convenience method for the most permissive validation.
435    ///
436    /// # Examples
437    ///
438    /// ```rust
439    /// use term_guard::constraints::FormatOptions;
440    ///
441    /// let options = FormatOptions::lenient();
442    /// assert_eq!(options.case_sensitive, false);
443    /// assert_eq!(options.trim_before_check, true);
444    /// assert_eq!(options.null_is_valid, true);
445    /// ```
446    pub fn lenient() -> Self {
447        Self::new()
448            .case_sensitive(false)
449            .trim_before_check(true)
450            .null_is_valid(true)
451    }
452
453    /// Creates format options with trimming enabled.
454    ///
455    /// This is a convenience method that enables whitespace trimming before validation.
456    ///
457    /// # Examples
458    ///
459    /// ```rust
460    /// use term_guard::constraints::FormatOptions;
461    ///
462    /// let options = FormatOptions::with_trimming();
463    /// assert_eq!(options.trim_before_check, true);
464    /// ```
465    pub fn with_trimming() -> Self {
466        Self::new().trim_before_check(true)
467    }
468}
469
/// A unified constraint that validates data formats and patterns.
///
/// This constraint replaces individual format constraints (PatternConstraint,
/// EmailConstraint, UrlConstraint, CreditCardConstraint) and adds support
/// for many additional formats.
///
/// Instances are created through [`FormatConstraint::new`] or one of the
/// per-format convenience constructors; the fields are private.
///
/// # Examples
///
/// ```rust
/// use term_guard::constraints::{FormatConstraint, FormatType, FormatOptions};
/// use term_guard::core::Constraint;
///
/// // Email validation
/// let email_constraint = FormatConstraint::new(
///     "email",
///     FormatType::Email,
///     0.95,
///     FormatOptions::default()
/// ).unwrap();
///
/// // Phone number validation for US
/// let phone_constraint = FormatConstraint::new(
///     "phone",
///     FormatType::Phone { country: Some("US".to_string()) },
///     0.90,
///     FormatOptions::new().trim_before_check(true)
/// ).unwrap();
///
/// // Custom regex pattern
/// let code_constraint = FormatConstraint::new(
///     "product_code",
///     FormatType::Regex(r"^[A-Z]{2}\d{4}$".to_string()),
///     1.0,
///     FormatOptions::default()
/// ).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct FormatConstraint {
    /// The column to validate
    column: String,
    /// The format type to check
    format: FormatType,
    /// The minimum ratio of values that must match the format (0.0 to 1.0).
    ///
    /// For `FormatType::CreditCard { detect_only: true }` the comparison is
    /// reversed: the ratio must be at most this value.
    threshold: f64,
    /// Options for format validation behavior
    options: FormatOptions,
}
517
518impl FormatConstraint {
519    /// Creates a new format constraint.
520    ///
521    /// # Arguments
522    ///
523    /// * `column` - The column to check
524    /// * `format` - The format type to validate
525    /// * `threshold` - The minimum ratio of values that must match (0.0 to 1.0)
526    /// * `options` - Format validation options
527    ///
528    /// # Errors
529    ///
530    /// Returns error if column name is invalid or threshold is out of range
531    pub fn new(
532        column: impl Into<String>,
533        format: FormatType,
534        threshold: f64,
535        options: FormatOptions,
536    ) -> Result<Self> {
537        let column_str = column.into();
538
539        // Validate inputs
540        SqlSecurity::validate_identifier(&column_str)?;
541
542        if !(0.0..=1.0).contains(&threshold) {
543            return Err(TermError::SecurityError(
544                "Threshold must be between 0.0 and 1.0".to_string(),
545            ));
546        }
547
548        // Validate that the format can generate a pattern
549        format.get_pattern()?;
550
551        Ok(Self {
552            column: column_str,
553            format,
554            threshold,
555            options,
556        })
557    }
558
559    /// Creates a format constraint for email validation.
560    pub fn email(column: impl Into<String>, threshold: f64) -> Result<Self> {
561        Self::new(
562            column,
563            FormatType::Email,
564            threshold,
565            FormatOptions::default(),
566        )
567    }
568
569    /// Creates a format constraint for URL validation.
570    pub fn url(column: impl Into<String>, threshold: f64, allow_localhost: bool) -> Result<Self> {
571        Self::new(
572            column,
573            FormatType::Url { allow_localhost },
574            threshold,
575            FormatOptions::default(),
576        )
577    }
578
579    /// Creates a format constraint for credit card detection.
580    pub fn credit_card(
581        column: impl Into<String>,
582        threshold: f64,
583        detect_only: bool,
584    ) -> Result<Self> {
585        Self::new(
586            column,
587            FormatType::CreditCard { detect_only },
588            threshold,
589            FormatOptions::default(),
590        )
591    }
592
593    /// Creates a format constraint for phone number validation.
594    pub fn phone(
595        column: impl Into<String>,
596        threshold: f64,
597        country: Option<String>,
598    ) -> Result<Self> {
599        Self::new(
600            column,
601            FormatType::Phone { country },
602            threshold,
603            FormatOptions::new().trim_before_check(true),
604        )
605    }
606
607    /// Creates a format constraint for postal code validation.
608    pub fn postal_code(
609        column: impl Into<String>,
610        threshold: f64,
611        country: impl Into<String>,
612    ) -> Result<Self> {
613        Self::new(
614            column,
615            FormatType::PostalCode {
616                country: country.into(),
617            },
618            threshold,
619            FormatOptions::new().trim_before_check(true),
620        )
621    }
622
623    /// Creates a format constraint for UUID validation.
624    pub fn uuid(column: impl Into<String>, threshold: f64) -> Result<Self> {
625        Self::new(
626            column,
627            FormatType::UUID,
628            threshold,
629            FormatOptions::default(),
630        )
631    }
632
633    /// Creates a format constraint for IPv4 address validation.
634    pub fn ipv4(column: impl Into<String>, threshold: f64) -> Result<Self> {
635        Self::new(
636            column,
637            FormatType::IPv4,
638            threshold,
639            FormatOptions::default(),
640        )
641    }
642
643    /// Creates a format constraint for IPv6 address validation.
644    pub fn ipv6(column: impl Into<String>, threshold: f64) -> Result<Self> {
645        Self::new(
646            column,
647            FormatType::IPv6,
648            threshold,
649            FormatOptions::default(),
650        )
651    }
652
653    /// Creates a format constraint for JSON validation.
654    pub fn json(column: impl Into<String>, threshold: f64) -> Result<Self> {
655        Self::new(
656            column,
657            FormatType::Json,
658            threshold,
659            FormatOptions::default(),
660        )
661    }
662
663    /// Creates a format constraint for ISO 8601 date-time validation.
664    pub fn iso8601_datetime(column: impl Into<String>, threshold: f64) -> Result<Self> {
665        Self::new(
666            column,
667            FormatType::Iso8601DateTime,
668            threshold,
669            FormatOptions::default(),
670        )
671    }
672
673    /// Creates a format constraint for custom regex pattern validation.
674    pub fn regex(
675        column: impl Into<String>,
676        pattern: impl Into<String>,
677        threshold: f64,
678    ) -> Result<Self> {
679        Self::new(
680            column,
681            FormatType::Regex(pattern.into()),
682            threshold,
683            FormatOptions::default(),
684        )
685    }
686}
687
688#[async_trait]
689impl Constraint for FormatConstraint {
690    #[instrument(skip(self, ctx), fields(
691        column = %self.column,
692        format = %self.format.name(),
693        threshold = %self.threshold
694    ))]
695    async fn evaluate(&self, ctx: &SessionContext) -> Result<ConstraintResult> {
696        let column_identifier = SqlSecurity::escape_identifier(&self.column)?;
697        let pattern = self.format.get_pattern()?;
698        let escaped_pattern = SqlSecurity::validate_regex_pattern(&pattern)?;
699
700        // Build the SQL based on options
701        let column_expr = if self.options.trim_before_check {
702            format!("TRIM({column_identifier})")
703        } else {
704            column_identifier.clone()
705        };
706
707        let pattern_operator = if self.options.case_sensitive {
708            "~"
709        } else {
710            "~*"
711        };
712
713        let sql = if self.options.null_is_valid {
714            format!(
715                "SELECT 
716                    COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' OR {column_identifier} IS NULL THEN 1 END) as matches,
717                    COUNT(*) as total
718                 FROM data"
719            )
720        } else {
721            format!(
722                "SELECT 
723                    COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' THEN 1 END) as matches,
724                    COUNT(*) as total
725                 FROM data"
726            )
727        };
728
729        let df = ctx.sql(&sql).await?;
730        let batches = df.collect().await?;
731
732        if batches.is_empty() {
733            return Ok(ConstraintResult::skipped("No data to validate"));
734        }
735
736        let batch = &batches[0];
737        if batch.num_rows() == 0 {
738            return Ok(ConstraintResult::skipped("No data to validate"));
739        }
740
741        let matches = batch
742            .column(0)
743            .as_any()
744            .downcast_ref::<arrow::array::Int64Array>()
745            .ok_or_else(|| TermError::Internal("Failed to extract match count".to_string()))?
746            .value(0) as f64;
747
748        let total = batch
749            .column(1)
750            .as_any()
751            .downcast_ref::<arrow::array::Int64Array>()
752            .ok_or_else(|| TermError::Internal("Failed to extract total count".to_string()))?
753            .value(0) as f64;
754
755        if total == 0.0 {
756            return Ok(ConstraintResult::skipped("No data to validate"));
757        }
758
759        let match_ratio = matches / total;
760
761        // Determine success based on format type and threshold
762        let is_success = match &self.format {
763            FormatType::CreditCard { detect_only: true } => {
764                // For credit card detection, we want the ratio to be <= threshold
765                match_ratio <= self.threshold
766            }
767            _ => {
768                // For other formats, we want the ratio to be >= threshold
769                match_ratio >= self.threshold
770            }
771        };
772
773        if is_success {
774            Ok(ConstraintResult::success_with_metric(match_ratio))
775        } else {
776            let message = match &self.format {
777                FormatType::CreditCard { detect_only: true } => {
778                    format!(
779                        "Credit card detection ratio {match_ratio:.3} exceeds threshold {:.3}",
780                        self.threshold
781                    )
782                }
783                _ => {
784                    let desc = self.format.description();
785                    format!(
786                        "Format validation ratio {match_ratio:.3} is below threshold {:.3} - values that {desc}",
787                        self.threshold
788                    )
789                }
790            };
791
792            Ok(ConstraintResult::failure_with_metric(match_ratio, message))
793        }
794    }
795
796    fn name(&self) -> &str {
797        self.format.name()
798    }
799
800    fn column(&self) -> Option<&str> {
801        Some(&self.column)
802    }
803
804    fn metadata(&self) -> ConstraintMetadata {
805        let description = match &self.format {
806            FormatType::CreditCard { detect_only: true } => {
807                let threshold_pct = self.threshold * 100.0;
808                let desc = self.format.description();
809                format!(
810                    "Checks that no more than {threshold_pct:.1}% of values in '{}' {desc}",
811                    self.column
812                )
813            }
814            _ => {
815                let threshold_pct = self.threshold * 100.0;
816                let desc = self.format.description();
817                format!(
818                    "Checks that at least {threshold_pct:.1}% of values in '{}' {desc}",
819                    self.column
820                )
821            }
822        };
823
824        ConstraintMetadata::for_column(&self.column)
825            .with_description(description)
826            .with_custom("format_type", self.format.name())
827            .with_custom("threshold", self.threshold.to_string())
828            .with_custom("case_sensitive", self.options.case_sensitive.to_string())
829            .with_custom(
830                "trim_before_check",
831                self.options.trim_before_check.to_string(),
832            )
833            .with_custom("null_is_valid", self.options.null_is_valid.to_string())
834            .with_custom("constraint_type", "format")
835    }
836}
837
838#[cfg(test)]
839mod tests {
840    use super::*;
841    use crate::core::ConstraintStatus;
842    use arrow::array::StringArray;
843    use arrow::datatypes::{DataType, Field, Schema};
844    use arrow::record_batch::RecordBatch;
845    use datafusion::datasource::MemTable;
846    use std::sync::Arc;
847
848    async fn create_test_context(values: Vec<Option<&str>>) -> SessionContext {
849        let ctx = SessionContext::new();
850
851        let schema = Arc::new(Schema::new(vec![Field::new(
852            "text_col",
853            DataType::Utf8,
854            true,
855        )]));
856
857        let array = StringArray::from(values);
858        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
859
860        let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
861        ctx.register_table("data", Arc::new(provider)).unwrap();
862
863        ctx
864    }
865
866    #[tokio::test]
867    async fn test_email_format_constraint() {
868        let values = vec![
869            Some("test@example.com"),
870            Some("user@domain.org"),
871            Some("invalid-email"),
872            Some("another@test.net"),
873        ];
874        let ctx = create_test_context(values).await;
875
876        let constraint = FormatConstraint::email("text_col", 0.7).unwrap();
877
878        let result = constraint.evaluate(&ctx).await.unwrap();
879        assert_eq!(result.status, ConstraintStatus::Success);
880        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are emails
881        assert_eq!(constraint.name(), "email");
882    }
883
884    #[tokio::test]
885    async fn test_url_format_constraint() {
886        let values = vec![
887            Some("https://example.com"),
888            Some("http://test.org"),
889            Some("not-a-url"),
890            Some("https://another.site.net/path"),
891        ];
892        let ctx = create_test_context(values).await;
893
894        let constraint = FormatConstraint::url("text_col", 0.7, false).unwrap();
895
896        let result = constraint.evaluate(&ctx).await.unwrap();
897        assert_eq!(result.status, ConstraintStatus::Success);
898        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are URLs
899        assert_eq!(constraint.name(), "url");
900    }
901
902    #[tokio::test]
903    async fn test_url_with_localhost() {
904        let values = vec![
905            Some("https://localhost:3000"),
906            Some("http://localhost"),
907            Some("https://example.com"),
908            Some("not-a-url"),
909        ];
910        let ctx = create_test_context(values).await;
911
912        let constraint = FormatConstraint::url("text_col", 0.7, true).unwrap();
913
914        let result = constraint.evaluate(&ctx).await.unwrap();
915        assert_eq!(result.status, ConstraintStatus::Success);
916        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are URLs (including localhost)
917    }
918
919    #[tokio::test]
920    async fn test_credit_card_detection() {
921        let values = vec![
922            Some("4111-1111-1111-1111"),
923            Some("5555 5555 5555 4444"),
924            Some("normal text"),
925            Some("4111111111111111"), // Visa format
926        ];
927        let ctx = create_test_context(values).await;
928
929        // Expect no more than 80% to be credit card numbers
930        let constraint = FormatConstraint::credit_card("text_col", 0.8, true).unwrap();
931
932        let result = constraint.evaluate(&ctx).await.unwrap();
933        assert_eq!(result.status, ConstraintStatus::Success);
934        assert_eq!(constraint.name(), "credit_card");
935    }
936
937    #[tokio::test]
938    async fn test_phone_number_us() {
939        let values = vec![
940            Some("(555) 123-4567"),
941            Some("555-123-4567"),
942            Some("5551234567"),
943            Some("invalid-phone"),
944        ];
945        let ctx = create_test_context(values).await;
946
947        let constraint = FormatConstraint::phone("text_col", 0.7, Some("US".to_string())).unwrap();
948
949        let result = constraint.evaluate(&ctx).await.unwrap();
950        assert_eq!(result.status, ConstraintStatus::Success);
951        assert_eq!(constraint.name(), "phone");
952    }
953
954    #[tokio::test]
955    async fn test_postal_code_us() {
956        let values = vec![
957            Some("12345"),
958            Some("12345-6789"),
959            Some("invalid"),
960            Some("98765"),
961        ];
962        let ctx = create_test_context(values).await;
963
964        let constraint = FormatConstraint::postal_code("text_col", 0.7, "US").unwrap();
965
966        let result = constraint.evaluate(&ctx).await.unwrap();
967        assert_eq!(result.status, ConstraintStatus::Success);
968        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid US postal codes
969        assert_eq!(constraint.name(), "postal_code");
970    }
971
972    #[tokio::test]
973    async fn test_uuid_format() {
974        let values = vec![
975            Some("550e8400-e29b-41d4-a716-446655440000"),
976            Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
977            Some("invalid-uuid"),
978            Some("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
979        ];
980        let ctx = create_test_context(values).await;
981
982        let constraint = FormatConstraint::uuid("text_col", 0.7).unwrap();
983
984        let result = constraint.evaluate(&ctx).await.unwrap();
985        assert_eq!(result.status, ConstraintStatus::Success);
986        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are UUIDs
987        assert_eq!(constraint.name(), "uuid");
988    }
989
990    #[tokio::test]
991    async fn test_ipv4_format() {
992        let values = vec![
993            Some("192.168.1.1"),
994            Some("10.0.0.1"),
995            Some("256.256.256.256"), // Invalid - out of range
996            Some("172.16.0.1"),
997        ];
998        let ctx = create_test_context(values).await;
999
1000        let constraint = FormatConstraint::ipv4("text_col", 0.7).unwrap();
1001
1002        let result = constraint.evaluate(&ctx).await.unwrap();
1003        assert_eq!(result.status, ConstraintStatus::Success);
1004        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid IPv4
1005        assert_eq!(constraint.name(), "ipv4");
1006    }
1007
1008    #[tokio::test]
1009    async fn test_ipv6_format() {
1010        let values = vec![
1011            Some("2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
1012            Some("2001:db8:85a3::8a2e:370:7334"),
1013            Some("invalid-ipv6"),
1014            Some("::1"),
1015        ];
1016        let ctx = create_test_context(values).await;
1017
1018        let constraint = FormatConstraint::ipv6("text_col", 0.7).unwrap();
1019
1020        let result = constraint.evaluate(&ctx).await.unwrap();
1021        assert_eq!(result.status, ConstraintStatus::Success);
1022        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid IPv6
1023        assert_eq!(constraint.name(), "ipv6");
1024    }
1025
1026    #[tokio::test]
1027    async fn test_json_format() {
1028        let values = vec![
1029            Some(r#"{"key": "value"}"#),
1030            Some(r#"[1, 2, 3]"#),
1031            Some("not json"),
1032            Some(r#"{"nested": {"key": "value"}}"#),
1033        ];
1034        let ctx = create_test_context(values).await;
1035
1036        let constraint = FormatConstraint::json("text_col", 0.7).unwrap();
1037
1038        let result = constraint.evaluate(&ctx).await.unwrap();
1039        assert_eq!(result.status, ConstraintStatus::Success);
1040        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 look like JSON
1041        assert_eq!(constraint.name(), "json");
1042    }
1043
1044    #[tokio::test]
1045    async fn test_iso8601_datetime_format() {
1046        let values = vec![
1047            Some("2023-12-25T10:30:00Z"),
1048            Some("2023-12-25T10:30:00.123Z"),
1049            Some("invalid-datetime"),
1050            Some("2023-12-25T10:30:00+05:30"),
1051        ];
1052        let ctx = create_test_context(values).await;
1053
1054        let constraint = FormatConstraint::iso8601_datetime("text_col", 0.7).unwrap();
1055
1056        let result = constraint.evaluate(&ctx).await.unwrap();
1057        assert_eq!(result.status, ConstraintStatus::Success);
1058        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are ISO 8601
1059        assert_eq!(constraint.name(), "iso8601_datetime");
1060    }
1061
1062    #[tokio::test]
1063    async fn test_custom_regex_format() {
1064        let values = vec![
1065            Some("ABC123"),
1066            Some("DEF456"),
1067            Some("invalid"),
1068            Some("GHI789"),
1069        ];
1070        let ctx = create_test_context(values).await;
1071
1072        // Pattern to match 3 letters followed by 3 digits
1073        let constraint = FormatConstraint::regex("text_col", r"^[A-Z]{3}\d{3}$", 0.7).unwrap();
1074
1075        let result = constraint.evaluate(&ctx).await.unwrap();
1076        assert_eq!(result.status, ConstraintStatus::Success);
1077        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 match
1078        assert_eq!(constraint.name(), "regex");
1079    }
1080
1081    #[tokio::test]
1082    async fn test_format_options_case_insensitive() {
1083        let values = vec![
1084            Some("abc123"),
1085            Some("DEF456"),
1086            Some("invalid"),
1087            Some("ghi789"),
1088        ];
1089        let ctx = create_test_context(values).await;
1090
1091        // Pattern should match both upper and lower case when case_insensitive is true
1092        let constraint = FormatConstraint::new(
1093            "text_col",
1094            FormatType::Regex(r"^[A-Z]{3}\d{3}$".to_string()),
1095            0.7,
1096            FormatOptions::new().case_sensitive(false),
1097        )
1098        .unwrap();
1099
1100        let result = constraint.evaluate(&ctx).await.unwrap();
1101        assert_eq!(result.status, ConstraintStatus::Success);
1102        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 match (case insensitive)
1103    }
1104
1105    #[tokio::test]
1106    async fn test_format_options_trim_whitespace() {
1107        let values = vec![
1108            Some("  test@example.com  "),
1109            Some("user@domain.org"),
1110            Some("  invalid-email  "),
1111            Some(" another@test.net "),
1112        ];
1113        let ctx = create_test_context(values).await;
1114
1115        let constraint = FormatConstraint::new(
1116            "text_col",
1117            FormatType::Email,
1118            0.7,
1119            FormatOptions::new().trim_before_check(true),
1120        )
1121        .unwrap();
1122
1123        let result = constraint.evaluate(&ctx).await.unwrap();
1124        assert_eq!(result.status, ConstraintStatus::Success);
1125        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are emails after trimming
1126    }
1127
1128    #[tokio::test]
1129    async fn test_format_options_null_handling() {
1130        let values = vec![Some("test@example.com"), None, Some("invalid-email"), None];
1131        let ctx = create_test_context(values).await;
1132
1133        // With null_is_valid = true (default)
1134        let constraint1 = FormatConstraint::new(
1135            "text_col",
1136            FormatType::Email,
1137            0.6,
1138            FormatOptions::new().null_is_valid(true),
1139        )
1140        .unwrap();
1141
1142        let result1 = constraint1.evaluate(&ctx).await.unwrap();
1143        assert_eq!(result1.status, ConstraintStatus::Success);
1144        assert_eq!(result1.metric, Some(0.75)); // 1 email + 2 nulls out of 4
1145
1146        // With null_is_valid = false
1147        let constraint2 = FormatConstraint::new(
1148            "text_col",
1149            FormatType::Email,
1150            0.2,
1151            FormatOptions::new().null_is_valid(false),
1152        )
1153        .unwrap();
1154
1155        let result2 = constraint2.evaluate(&ctx).await.unwrap();
1156        assert_eq!(result2.status, ConstraintStatus::Success);
1157        assert_eq!(result2.metric, Some(0.25)); // Only 1 email out of 4
1158    }
1159
1160    #[tokio::test]
1161    async fn test_constraint_failure() {
1162        let values = vec![
1163            Some("invalid"),
1164            Some("also_invalid"),
1165            Some("nope"),
1166            Some("still_invalid"),
1167        ];
1168        let ctx = create_test_context(values).await;
1169
1170        let constraint = FormatConstraint::email("text_col", 0.5).unwrap();
1171
1172        let result = constraint.evaluate(&ctx).await.unwrap();
1173        assert_eq!(result.status, ConstraintStatus::Failure);
1174        assert_eq!(result.metric, Some(0.0)); // No values are emails
1175        assert!(result.message.is_some());
1176    }
1177
1178    #[tokio::test]
1179    async fn test_empty_data() {
1180        let ctx = create_test_context(vec![]).await;
1181        let constraint = FormatConstraint::email("text_col", 0.9).unwrap();
1182
1183        let result = constraint.evaluate(&ctx).await.unwrap();
1184        assert_eq!(result.status, ConstraintStatus::Skipped);
1185    }
1186
1187    #[test]
1188    fn test_invalid_threshold() {
1189        let result = FormatConstraint::email("col", 1.5);
1190        assert!(result.is_err());
1191        assert!(result
1192            .unwrap_err()
1193            .to_string()
1194            .contains("Threshold must be between 0.0 and 1.0"));
1195    }
1196
1197    #[test]
1198    fn test_pattern_caching() {
1199        // Test that patterns are cached for performance
1200        let format1 = FormatType::Email;
1201        let format2 = FormatType::Email;
1202
1203        let pattern1 = format1.get_pattern().unwrap();
1204        let pattern2 = format2.get_pattern().unwrap();
1205
1206        assert_eq!(pattern1, pattern2);
1207
1208        // Accessing cache multiple times should be fast
1209        for _ in 0..100 {
1210            let _ = format1.get_pattern().unwrap();
1211        }
1212    }
1213
1214    #[test]
1215    fn test_format_type_descriptions() {
1216        assert_eq!(FormatType::Email.description(), "are valid email addresses");
1217        assert_eq!(
1218            FormatType::Url {
1219                allow_localhost: true
1220            }
1221            .description(),
1222            "are valid URLs (including localhost)"
1223        );
1224        assert_eq!(
1225            FormatType::Phone {
1226                country: Some("US".to_string())
1227            }
1228            .description(),
1229            "are valid US phone numbers"
1230        );
1231        assert_eq!(
1232            FormatType::PostalCode {
1233                country: "CA".to_string()
1234            }
1235            .description(),
1236            "are valid CA postal codes"
1237        );
1238    }
1239
1240    #[test]
1241    fn test_all_format_types_have_patterns() {
1242        // Ensure all format types can generate valid patterns
1243        let formats = vec![
1244            FormatType::Email,
1245            FormatType::Url {
1246                allow_localhost: false,
1247            },
1248            FormatType::Url {
1249                allow_localhost: true,
1250            },
1251            FormatType::CreditCard { detect_only: false },
1252            FormatType::Phone { country: None },
1253            FormatType::Phone {
1254                country: Some("US".to_string()),
1255            },
1256            FormatType::PostalCode {
1257                country: "US".to_string(),
1258            },
1259            FormatType::UUID,
1260            FormatType::IPv4,
1261            FormatType::IPv6,
1262            FormatType::Json,
1263            FormatType::Iso8601DateTime,
1264            FormatType::Regex(r"^\d+$".to_string()),
1265        ];
1266
1267        for format in formats {
1268            assert!(
1269                format.get_pattern().is_ok(),
1270                "Format {format:?} should have a valid pattern"
1271            );
1272        }
1273    }
1274
1275    #[test]
1276    fn test_format_options_convenience_methods() {
1277        // Test case_insensitive()
1278        let options = FormatOptions::case_insensitive();
1279        assert!(!options.case_sensitive);
1280        assert!(!options.trim_before_check);
1281        assert!(options.null_is_valid);
1282
1283        // Test strict()
1284        let options = FormatOptions::strict();
1285        assert!(options.case_sensitive);
1286        assert!(!options.trim_before_check);
1287        assert!(!options.null_is_valid);
1288
1289        // Test lenient()
1290        let options = FormatOptions::lenient();
1291        assert!(!options.case_sensitive);
1292        assert!(options.trim_before_check);
1293        assert!(options.null_is_valid);
1294
1295        // Test with_trimming()
1296        let options = FormatOptions::with_trimming();
1297        assert!(options.case_sensitive);
1298        assert!(options.trim_before_check);
1299        assert!(options.null_is_valid);
1300    }
1301}