term_guard/constraints/
format.rs

1//! Unified format validation constraint for pattern matching and content validation.
2//!
3//! This module provides a single flexible constraint that consolidates all pattern-based
4//! validation including email, URL, credit card detection, phone numbers, postal codes,
5//! UUIDs, IP addresses, JSON, and custom regex patterns.
6//!
7//! ## Overview
8//!
9//! The `FormatConstraint` replaces multiple individual constraint types with a single,
10//! powerful constraint that supports:
11//!
12//! - **Built-in formats**: Email, URL, phone, postal codes, UUIDs, IP addresses, JSON, dates
13//! - **Custom regex patterns**: Full regex support with security validation
14//! - **Rich configuration**: Case sensitivity, trimming, null handling
15//! - **Performance optimization**: Pattern caching and compiled regex reuse
16//! - **Security**: ReDoS protection and SQL injection prevention
17//!
18//! ## Quick Start Examples
19//!
20//! ### Basic Format Validation
21//!
22//! ```rust
23//! use term_guard::constraints::FormatConstraint;
24//!
25//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
26//! // Email validation - require 95% of values to be valid emails
27//! let email_check = FormatConstraint::email("email", 0.95)?;
28//!
29//! // URL validation with localhost support
30//! let url_check = FormatConstraint::url("website", 0.90, true)?;
31//!
32//! // US phone number validation
33//! let phone_check = FormatConstraint::phone("phone", 0.98, Some("US".to_string()))?;
34//!
//! // UUID validation (hyphenated form, version digits 1-5)
36//! let uuid_check = FormatConstraint::uuid("session_id", 1.0)?;
37//! # Ok(())
38//! # }
39//! ```
40//!
41//! ### Advanced Configuration with FormatOptions
42//!
43//! ```rust
44//! use term_guard::constraints::{FormatConstraint, FormatType, FormatOptions};
45//!
46//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
47//! // Case-insensitive email validation with trimming
48//! let flexible_email = FormatConstraint::new(
49//!     "email",
50//!     FormatType::Email,
51//!     0.95,
52//!     FormatOptions::lenient()  // case insensitive + trimming + allows nulls
53//! )?;
54//!
55//! // Strict phone validation (no nulls, case sensitive)
56//! let strict_phone = FormatConstraint::new(
57//!     "phone",
58//!     FormatType::Phone { country: Some("US".to_string()) },
59//!     0.99,
60//!     FormatOptions::strict()  // null_is_valid = false
61//! )?;
62//!
63//! // Custom regex with options
64//! let product_code = FormatConstraint::new(
65//!     "product_code",
66//!     FormatType::Regex(r"^[A-Z]{2}\d{4}$".to_string()),
67//!     0.98,
68//!     FormatOptions::new()
69//!         .case_sensitive(false)
70//!         .trim_before_check(true)
71//! )?;
72//! # Ok(())
73//! # }
74//! ```
75//!
76//! ### Specialized Format Types
77//!
78//! ```rust
79//! use term_guard::constraints::FormatConstraint;
80//!
81//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
82//! // Postal codes for different countries
83//! let us_zip = FormatConstraint::postal_code("zip", 0.95, "US")?;
84//! let uk_postcode = FormatConstraint::postal_code("postcode", 0.95, "UK")?;
85//! let ca_postal = FormatConstraint::postal_code("postal", 0.95, "CA")?;
86//!
87//! // IP address validation
88//! let ipv4_check = FormatConstraint::ipv4("client_ip", 0.99)?;
89//! let ipv6_check = FormatConstraint::ipv6("server_ip", 0.99)?;
90//!
91//! // JSON format validation
92//! let json_check = FormatConstraint::json("config", 0.98)?;
93//!
94//! // ISO 8601 datetime validation
95//! let datetime_check = FormatConstraint::iso8601_datetime("order_date", 1.0)?;
96//! # Ok(())
97//! # }
98//! ```
99//!
100//! ## Migration from Individual Constraints
101//!
102//! ### Before (Deprecated)
103//! ```rust,ignore
104//! use term_guard::constraints::{PatternConstraint, EmailConstraint, UrlConstraint};
105//!
106//! let email_old = EmailConstraint::new("email", 0.95);
107//! let pattern_old = PatternConstraint::new("phone", r"^\d{3}-\d{3}-\d{4}$", 0.90)?;
108//! let url_old = UrlConstraint::new("website", 0.85);
109//! ```
110//!
111//! ### After (Unified API)
112//! ```rust
113//! use term_guard::constraints::FormatConstraint;
114//!
115//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
116//! let email_new = FormatConstraint::email("email", 0.95)?;
117//! let phone_new = FormatConstraint::phone("phone", 0.90, Some("US".to_string()))?;
118//! let url_new = FormatConstraint::url("website", 0.85, false)?;
119//! # Ok(())
120//! # }
121//! ```
122//!
123//! ## Performance Considerations
124//!
//! - **Pattern Caching**: Generated regex pattern strings are cached for reuse
126//! - **Built-in Patterns**: Predefined patterns are optimized and tested
127//! - **Security**: All patterns are validated to prevent ReDoS attacks
128//! - **Memory Efficiency**: Single constraint type reduces memory overhead
129//!
130//! ## Common Patterns and Use Cases
131//!
132//! ### Data Quality Checks
133//! ```rust
134//! use term_guard::constraints::{FormatConstraint, FormatOptions};
135//!
136//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
137//! // Customer data validation
138//! let email_quality = FormatConstraint::new(
139//!     "customer_email",
140//!     term_guard::constraints::FormatType::Email,
141//!     0.98,  // 98% must be valid emails
142//!     FormatOptions::lenient()  // Allow some flexibility
143//! )?;
144//!
145//! // Credit card detection (for PII scanning)
146//! let cc_detection = FormatConstraint::credit_card("description", 0.01, true)?; // Detect if > 1% contain CCs
147//! # Ok(())
148//! # }
149//! ```
150//!
151//! ### International Data
152//! ```rust
153//! use term_guard::constraints::FormatConstraint;
154//!
155//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
156//! // Multi-region phone validation
157//! let us_phones = FormatConstraint::phone("us_phone", 0.95, Some("US".to_string()))?;
158//! let uk_phones = FormatConstraint::phone("uk_phone", 0.95, Some("UK".to_string()))?;
159//! let intl_phones = FormatConstraint::phone("intl_phone", 0.90, None)?; // E.164 format
160//!
161//! // Multi-country postal codes
162//! let postal_codes = vec![
163//!     FormatConstraint::postal_code("us_zip", 0.99, "US")?,
164//!     FormatConstraint::postal_code("ca_postal", 0.99, "CA")?,
165//!     FormatConstraint::postal_code("uk_postcode", 0.99, "UK")?,
166//! ];
167//! # Ok(())
168//! # }
169//! ```
170
171use crate::core::{current_validation_context, Constraint, ConstraintMetadata, ConstraintResult};
172use crate::prelude::*;
173use crate::security::SqlSecurity;
174use arrow::array::Array;
175use async_trait::async_trait;
176use datafusion::prelude::*;
177use once_cell::sync::Lazy;
178use serde::{Deserialize, Serialize};
179use std::collections::HashMap;
180use std::sync::RwLock;
181use tracing::instrument;
/// Lazily initialised, lock-protected cache of regex pattern strings.
///
/// Keys are the `Debug` rendering of a `FormatType` and values are the pattern
/// text produced for it by `FormatType::get_pattern`. Note that the cache holds
/// pattern *strings*, not compiled regex objects.
static PATTERN_CACHE: Lazy<RwLock<HashMap<String, String>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
185
/// Types of format validation that can be performed.
///
/// Every variant is translated into a regular expression by `get_pattern`;
/// the notes on each variant summarise what that expression accepts.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FormatType {
    /// Custom regular expression pattern (checked with
    /// `SqlSecurity::validate_regex_pattern` before it is used)
    Regex(String),
    /// Email address validation
    Email,
    /// URL validation (`http`/`https` schemes). With `allow_localhost` set, the
    /// host may also be `localhost` or a dotted-quad numeric address.
    Url { allow_localhost: bool },
    /// Credit card number detection. With `detect_only` set, the constraint
    /// passes when the match ratio is at or below the threshold instead of at
    /// or above it.
    CreditCard { detect_only: bool },
    /// Phone number validation with optional country specification.
    /// `US`, `CA`, `UK`, `DE` and `FR` have dedicated patterns; any other value
    /// (or `None`) uses a generic international pattern.
    Phone { country: Option<String> },
    /// Postal code validation for a specific country.
    /// `US`, `CA`, `UK`, `DE`, `FR`, `JP` and `AU` have dedicated patterns; any
    /// other value uses a generic 3-10 character pattern.
    PostalCode { country: String },
    /// UUID validation: hyphenated 8-4-4-4-12 hex form with a version digit of
    /// 1-5 and a variant digit of 8, 9, a or b
    UUID,
    /// IPv4 address validation
    IPv4,
    /// IPv6 address validation (simplified pattern covering common notations)
    IPv6,
    /// JSON format validation. This is a structural check only: the value must
    /// start with `{` or `[` and end with `}` or `]`.
    Json,
    /// ISO 8601 date-time format validation (`YYYY-MM-DDTHH:MM:SS`, optional
    /// fractional seconds, followed by `Z` or a `+HH:MM` / `-HH:MM` offset)
    Iso8601DateTime,
    /// Social Security Number (SSN) pattern detection
    SocialSecurityNumber,
}
214
215impl FormatType {
216    /// Returns the regex pattern for this format type.
217    fn get_pattern(&self) -> Result<String> {
218        let cache_key = format!("{self:?}");
219
220        // Check cache first
221        {
222            let cache = PATTERN_CACHE.read().map_err(|_| {
223                TermError::Internal("Failed to acquire read lock on pattern cache".to_string())
224            })?;
225            if let Some(pattern) = cache.get(&cache_key) {
226                return Ok(pattern.clone());
227            }
228        }
229
230        let pattern = match self {
231            FormatType::Regex(pattern) => {
232                SqlSecurity::validate_regex_pattern(pattern)?;
233                pattern.clone()
234            }
235            FormatType::Email => {
236                // More comprehensive email pattern
237                r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$".to_string()
238            }
239            FormatType::Url { allow_localhost } => {
240                if *allow_localhost {
241                    r"^https?://(?:localhost|(?:[a-zA-Z0-9.-]+\.?[a-zA-Z]{2,}|(?:\d{1,3}\.){3}\d{1,3}))(?::\d+)?(?:/[^\s]*)?$".to_string()
242                } else {
243                    r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?$".to_string()
244                }
245            }
246            FormatType::CreditCard { .. } => {
247                // Pattern for major credit card formats (Visa, MasterCard, Amex, Discover)
248                r"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})$|^(?:\d{4}[-\s]?){3}\d{4}$".to_string()
249            }
250            FormatType::Phone { country } => {
251                match country.as_deref() {
252                    Some("US") | Some("CA") => r"^(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$".to_string(),
253                    Some("UK") => r"^(\+44\s?)?(?:\(?0\d{4}\)?\s?\d{6}|\(?0\d{3}\)?\s?\d{7}|\(?0\d{2}\)?\s?\d{8})$".to_string(),
254                    Some("DE") => r"^(\+49\s?)?(?:\(?0\d{2,5}\)?\s?\d{4,12})$".to_string(),
255                    Some("FR") => r"^(\+33\s?)?(?:\(?0\d{1}\)?\s?\d{8})$".to_string(),
256                    _ => r"^[\+]?[1-9][\d]{0,15}$".to_string(), // E.164 international format
257                }
258            }
259            FormatType::PostalCode { country } => {
260                match country.as_str() {
261                    "US" => r"^\d{5}(-\d{4})?$".to_string(),
262                    "CA" => r"^[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$".to_string(),
263                    "UK" => r"^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$".to_string(),
264                    "DE" => r"^\d{5}$".to_string(),
265                    "FR" => r"^\d{5}$".to_string(),
266                    "JP" => r"^\d{3}-\d{4}$".to_string(),
267                    "AU" => r"^\d{4}$".to_string(),
268                    _ => r"^[A-Za-z0-9\s-]{3,10}$".to_string(), // Generic postal code
269                }
270            }
271            FormatType::UUID => {
272                r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$".to_string()
273            }
274            FormatType::IPv4 => {
275                r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$".to_string()
276            }
277            FormatType::IPv6 => {
278                // Simplified IPv6 pattern that handles most common cases
279                r"^([0-9a-fA-F]{0,4}:){1,7}([0-9a-fA-F]{0,4})?$|^::$|^::1$|^([0-9a-fA-F]{1,4}:)*::([0-9a-fA-F]{1,4}:)*[0-9a-fA-F]{1,4}$".to_string()
280            }
281            FormatType::Json => {
282                // Simple JSON structure validation - starts with { or [
283                r"^\s*[\{\[].*[\}\]]\s*$".to_string()
284            }
285            FormatType::Iso8601DateTime => {
286                // ISO 8601 date-time format (basic validation)
287                r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$".to_string()
288            }
289            FormatType::SocialSecurityNumber => {
290                // SSN patterns: XXX-XX-XXXX or XXXXXXXXX
291                // Matches valid SSN ranges (001-899 except 666) in first 3 digits
292                // Middle 2 digits must be 01-99, last 4 must be 0001-9999
293                // This regex avoids look-ahead by explicitly listing valid ranges
294                r"^(00[1-9]|0[1-9][0-9]|[1-5][0-9]{2}|6[0-5][0-9]|66[0-5]|667|66[89]|6[7-9][0-9]|[7-8][0-9]{2})-?(0[1-9]|[1-9][0-9])-?(000[1-9]|00[1-9][0-9]|0[1-9][0-9]{2}|[1-9][0-9]{3})$".to_string()
295            }
296        };
297
298        // Cache the pattern
299        {
300            let mut cache = PATTERN_CACHE.write().map_err(|_| {
301                TermError::Internal("Failed to acquire write lock on pattern cache".to_string())
302            })?;
303            cache.insert(cache_key, pattern.clone());
304        }
305
306        Ok(pattern)
307    }
308
309    /// Returns a human-readable name for this format type.
310    pub fn name(&self) -> &str {
311        match self {
312            FormatType::Regex(_) => "regex",
313            FormatType::Email => "email",
314            FormatType::Url { .. } => "url",
315            FormatType::CreditCard { .. } => "credit_card",
316            FormatType::Phone { .. } => "phone",
317            FormatType::PostalCode { .. } => "postal_code",
318            FormatType::UUID => "uuid",
319            FormatType::IPv4 => "ipv4",
320            FormatType::IPv6 => "ipv6",
321            FormatType::Json => "json",
322            FormatType::Iso8601DateTime => "iso8601_datetime",
323            FormatType::SocialSecurityNumber => "social_security_number",
324        }
325    }
326
327    /// Returns a human-readable description for this format type.
328    pub fn description(&self) -> String {
329        match self {
330            FormatType::Regex(pattern) => format!("matches pattern '{pattern}'"),
331            FormatType::Email => "are valid email addresses".to_string(),
332            FormatType::Url { allow_localhost } => {
333                if *allow_localhost {
334                    "are valid URLs (including localhost)".to_string()
335                } else {
336                    "are valid URLs".to_string()
337                }
338            }
339            FormatType::CreditCard { detect_only } => {
340                if *detect_only {
341                    "contain credit card number patterns".to_string()
342                } else {
343                    "are valid credit card numbers".to_string()
344                }
345            }
346            FormatType::Phone { country } => match country.as_deref() {
347                Some(c) => format!("are valid {c} phone numbers"),
348                None => "are valid phone numbers".to_string(),
349            },
350            FormatType::PostalCode { country } => {
351                format!("are valid {country} postal codes")
352            }
353            FormatType::UUID => "are valid UUIDs".to_string(),
354            FormatType::IPv4 => "are valid IPv4 addresses".to_string(),
355            FormatType::IPv6 => "are valid IPv6 addresses".to_string(),
356            FormatType::Json => "are valid JSON documents".to_string(),
357            FormatType::Iso8601DateTime => "are valid ISO 8601 date-time strings".to_string(),
358            FormatType::SocialSecurityNumber => {
359                "contain Social Security Number patterns".to_string()
360            }
361        }
362    }
363}
364
/// Options for format constraint behavior.
///
/// The defaults (see the `Default` impl) are case-sensitive matching, no
/// trimming, and NULL values treated as valid.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FormatOptions {
    /// Whether pattern matching should be case sensitive
    /// (selects between the `~` and `~*` match operators in the generated SQL)
    pub case_sensitive: bool,
    /// Whether to trim whitespace before checking format
    /// (wraps the column in `TRIM(...)` in the generated SQL)
    pub trim_before_check: bool,
    /// Whether NULL values should be considered valid
    pub null_is_valid: bool,
}
375
376impl Default for FormatOptions {
377    fn default() -> Self {
378        Self {
379            case_sensitive: true,
380            trim_before_check: false,
381            null_is_valid: true, // NULL values are typically considered valid in data quality
382        }
383    }
384}
385
386impl FormatOptions {
387    /// Creates new format options with default values.
388    pub fn new() -> Self {
389        Self::default()
390    }
391
392    /// Sets case sensitivity for pattern matching.
393    pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
394        self.case_sensitive = case_sensitive;
395        self
396    }
397
398    /// Sets whether to trim whitespace before format checking.
399    pub fn trim_before_check(mut self, trim: bool) -> Self {
400        self.trim_before_check = trim;
401        self
402    }
403
404    /// Sets whether NULL values should be considered valid.
405    pub fn null_is_valid(mut self, null_valid: bool) -> Self {
406        self.null_is_valid = null_valid;
407        self
408    }
409
410    /// Creates format options for case-insensitive matching.
411    ///
412    /// This is a convenience method that sets case_sensitive to false.
413    ///
414    /// # Examples
415    ///
416    /// ```rust
417    /// use term_guard::constraints::FormatOptions;
418    ///
419    /// let options = FormatOptions::case_insensitive();
420    /// assert_eq!(options.case_sensitive, false);
421    /// ```
422    pub fn case_insensitive() -> Self {
423        Self::new().case_sensitive(false)
424    }
425
426    /// Creates format options for strict validation (no nulls, case sensitive, no trimming).
427    ///
428    /// This is a convenience method for the most restrictive validation.
429    ///
430    /// # Examples
431    ///
432    /// ```rust
433    /// use term_guard::constraints::FormatOptions;
434    ///
435    /// let options = FormatOptions::strict();
436    /// assert_eq!(options.case_sensitive, true);
437    /// assert_eq!(options.trim_before_check, false);
438    /// assert_eq!(options.null_is_valid, false);
439    /// ```
440    pub fn strict() -> Self {
441        Self::new().null_is_valid(false)
442    }
443
444    /// Creates format options for lenient validation (case insensitive, trimming, nulls allowed).
445    ///
446    /// This is a convenience method for the most permissive validation.
447    ///
448    /// # Examples
449    ///
450    /// ```rust
451    /// use term_guard::constraints::FormatOptions;
452    ///
453    /// let options = FormatOptions::lenient();
454    /// assert_eq!(options.case_sensitive, false);
455    /// assert_eq!(options.trim_before_check, true);
456    /// assert_eq!(options.null_is_valid, true);
457    /// ```
458    pub fn lenient() -> Self {
459        Self::new()
460            .case_sensitive(false)
461            .trim_before_check(true)
462            .null_is_valid(true)
463    }
464
465    /// Creates format options with trimming enabled.
466    ///
467    /// This is a convenience method that enables whitespace trimming before validation.
468    ///
469    /// # Examples
470    ///
471    /// ```rust
472    /// use term_guard::constraints::FormatOptions;
473    ///
474    /// let options = FormatOptions::with_trimming();
475    /// assert_eq!(options.trim_before_check, true);
476    /// ```
477    pub fn with_trimming() -> Self {
478        Self::new().trim_before_check(true)
479    }
480}
481
/// A unified constraint that validates data formats and patterns.
///
/// This constraint replaces individual format constraints (PatternConstraint,
/// EmailConstraint, UrlConstraint, CreditCardConstraint) and adds support
/// for many additional formats.
///
/// Instances are built through `FormatConstraint::new` or one of the
/// per-format convenience constructors, all of which validate their inputs.
///
/// # Examples
///
/// ```rust
/// use term_guard::constraints::{FormatConstraint, FormatType, FormatOptions};
/// use term_guard::core::Constraint;
///
/// // Email validation
/// let email_constraint = FormatConstraint::new(
///     "email",
///     FormatType::Email,
///     0.95,
///     FormatOptions::default()
/// ).unwrap();
///
/// // Phone number validation for US
/// let phone_constraint = FormatConstraint::new(
///     "phone",
///     FormatType::Phone { country: Some("US".to_string()) },
///     0.90,
///     FormatOptions::new().trim_before_check(true)
/// ).unwrap();
///
/// // Custom regex pattern
/// let code_constraint = FormatConstraint::new(
///     "product_code",
///     FormatType::Regex(r"^[A-Z]{2}\d{4}$".to_string()),
///     1.0,
///     FormatOptions::default()
/// ).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct FormatConstraint {
    /// The column to validate
    column: String,
    /// The format type to check
    format: FormatType,
    /// The ratio (0.0 to 1.0) the match ratio is compared against: a minimum for
    /// regular formats, a maximum for `CreditCard { detect_only: true }`
    threshold: f64,
    /// Options for format validation behavior
    options: FormatOptions,
}
529
530impl FormatConstraint {
531    /// Creates a new format constraint.
532    ///
533    /// # Arguments
534    ///
535    /// * `column` - The column to check
536    /// * `format` - The format type to validate
537    /// * `threshold` - The minimum ratio of values that must match (0.0 to 1.0)
538    /// * `options` - Format validation options
539    ///
540    /// # Errors
541    ///
542    /// Returns error if column name is invalid or threshold is out of range
543    pub fn new(
544        column: impl Into<String>,
545        format: FormatType,
546        threshold: f64,
547        options: FormatOptions,
548    ) -> Result<Self> {
549        let column_str = column.into();
550
551        // Validate inputs
552        SqlSecurity::validate_identifier(&column_str)?;
553
554        if !(0.0..=1.0).contains(&threshold) {
555            return Err(TermError::SecurityError(
556                "Threshold must be between 0.0 and 1.0".to_string(),
557            ));
558        }
559
560        // Validate that the format can generate a pattern
561        format.get_pattern()?;
562
563        Ok(Self {
564            column: column_str,
565            format,
566            threshold,
567            options,
568        })
569    }
570
571    /// Creates a format constraint for email validation.
572    pub fn email(column: impl Into<String>, threshold: f64) -> Result<Self> {
573        Self::new(
574            column,
575            FormatType::Email,
576            threshold,
577            FormatOptions::default(),
578        )
579    }
580
581    /// Creates a format constraint for URL validation.
582    pub fn url(column: impl Into<String>, threshold: f64, allow_localhost: bool) -> Result<Self> {
583        Self::new(
584            column,
585            FormatType::Url { allow_localhost },
586            threshold,
587            FormatOptions::default(),
588        )
589    }
590
591    /// Creates a format constraint for credit card detection.
592    pub fn credit_card(
593        column: impl Into<String>,
594        threshold: f64,
595        detect_only: bool,
596    ) -> Result<Self> {
597        Self::new(
598            column,
599            FormatType::CreditCard { detect_only },
600            threshold,
601            FormatOptions::default(),
602        )
603    }
604
605    /// Creates a format constraint for phone number validation.
606    pub fn phone(
607        column: impl Into<String>,
608        threshold: f64,
609        country: Option<String>,
610    ) -> Result<Self> {
611        Self::new(
612            column,
613            FormatType::Phone { country },
614            threshold,
615            FormatOptions::new().trim_before_check(true),
616        )
617    }
618
619    /// Creates a format constraint for postal code validation.
620    pub fn postal_code(
621        column: impl Into<String>,
622        threshold: f64,
623        country: impl Into<String>,
624    ) -> Result<Self> {
625        Self::new(
626            column,
627            FormatType::PostalCode {
628                country: country.into(),
629            },
630            threshold,
631            FormatOptions::new().trim_before_check(true),
632        )
633    }
634
635    /// Creates a format constraint for UUID validation.
636    pub fn uuid(column: impl Into<String>, threshold: f64) -> Result<Self> {
637        Self::new(
638            column,
639            FormatType::UUID,
640            threshold,
641            FormatOptions::default(),
642        )
643    }
644
645    /// Creates a format constraint for IPv4 address validation.
646    pub fn ipv4(column: impl Into<String>, threshold: f64) -> Result<Self> {
647        Self::new(
648            column,
649            FormatType::IPv4,
650            threshold,
651            FormatOptions::default(),
652        )
653    }
654
655    /// Creates a format constraint for IPv6 address validation.
656    pub fn ipv6(column: impl Into<String>, threshold: f64) -> Result<Self> {
657        Self::new(
658            column,
659            FormatType::IPv6,
660            threshold,
661            FormatOptions::default(),
662        )
663    }
664
665    /// Creates a format constraint for JSON validation.
666    pub fn json(column: impl Into<String>, threshold: f64) -> Result<Self> {
667        Self::new(
668            column,
669            FormatType::Json,
670            threshold,
671            FormatOptions::default(),
672        )
673    }
674
675    /// Creates a format constraint for ISO 8601 date-time validation.
676    pub fn iso8601_datetime(column: impl Into<String>, threshold: f64) -> Result<Self> {
677        Self::new(
678            column,
679            FormatType::Iso8601DateTime,
680            threshold,
681            FormatOptions::default(),
682        )
683    }
684
685    /// Creates a format constraint for custom regex pattern validation.
686    pub fn regex(
687        column: impl Into<String>,
688        pattern: impl Into<String>,
689        threshold: f64,
690    ) -> Result<Self> {
691        Self::new(
692            column,
693            FormatType::Regex(pattern.into()),
694            threshold,
695            FormatOptions::default(),
696        )
697    }
698
699    /// Creates a format constraint for Social Security Number pattern detection.
700    ///
701    /// This method checks for SSN patterns (XXX-XX-XXXX or XXXXXXXXX) and excludes
702    /// known invalid SSNs such as those starting with 000, 666, or 900-999.
703    ///
704    /// # Arguments
705    ///
706    /// * `column` - The column to check for SSN patterns
707    /// * `threshold` - The minimum ratio of values that must match the SSN pattern (0.0 to 1.0)
708    ///
709    /// # Examples
710    ///
711    /// ```rust
712    /// use term_guard::constraints::FormatConstraint;
713    ///
714    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
715    /// // Check that at least 95% of values are valid SSN patterns
716    /// let ssn_check = FormatConstraint::social_security_number("ssn", 0.95)?;
717    ///
718    /// // For PII detection - flag if more than 1% contain SSN patterns
719    /// let ssn_detection = FormatConstraint::social_security_number("description", 0.01)?;
720    /// # Ok(())
721    /// # }
722    /// ```
723    pub fn social_security_number(column: impl Into<String>, threshold: f64) -> Result<Self> {
724        Self::new(
725            column,
726            FormatType::SocialSecurityNumber,
727            threshold,
728            FormatOptions::new().trim_before_check(true),
729        )
730    }
731}
732
733#[async_trait]
734impl Constraint for FormatConstraint {
735    #[instrument(skip(self, ctx), fields(
736        column = %self.column,
737        format = %self.format.name(),
738        threshold = %self.threshold
739    ))]
740    async fn evaluate(&self, ctx: &SessionContext) -> Result<ConstraintResult> {
741        // Get the table name from the validation context
742        let validation_ctx = current_validation_context();
743        let table_name = validation_ctx.table_name();
744
745        let column_identifier = SqlSecurity::escape_identifier(&self.column)?;
746        let pattern = self.format.get_pattern()?;
747        let escaped_pattern = SqlSecurity::validate_regex_pattern(&pattern)?;
748
749        // Build the SQL based on options
750        let column_expr = if self.options.trim_before_check {
751            format!("TRIM({column_identifier})")
752        } else {
753            column_identifier.clone()
754        };
755
756        let pattern_operator = if self.options.case_sensitive {
757            "~"
758        } else {
759            "~*"
760        };
761
762        let sql = if self.options.null_is_valid {
763            format!(
764                "SELECT 
765                    COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' OR {column_identifier} IS NULL THEN 1 END) as matches,
766                    COUNT(*) as total
767                 FROM {table_name}"
768            )
769        } else {
770            format!(
771                "SELECT 
772                    COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' THEN 1 END) as matches,
773                    COUNT(*) as total
774                 FROM {table_name}"
775            )
776        };
777
778        let df = ctx.sql(&sql).await?;
779        let batches = df.collect().await?;
780
781        if batches.is_empty() {
782            return Ok(ConstraintResult::skipped("No data to validate"));
783        }
784
785        let batch = &batches[0];
786        if batch.num_rows() == 0 {
787            return Ok(ConstraintResult::skipped("No data to validate"));
788        }
789
790        let matches = batch
791            .column(0)
792            .as_any()
793            .downcast_ref::<arrow::array::Int64Array>()
794            .ok_or_else(|| TermError::Internal("Failed to extract match count".to_string()))?
795            .value(0) as f64;
796
797        let total = batch
798            .column(1)
799            .as_any()
800            .downcast_ref::<arrow::array::Int64Array>()
801            .ok_or_else(|| TermError::Internal("Failed to extract total count".to_string()))?
802            .value(0) as f64;
803
804        if total == 0.0 {
805            return Ok(ConstraintResult::skipped("No data to validate"));
806        }
807
808        let match_ratio = matches / total;
809
810        // Determine success based on format type and threshold
811        let is_success = match &self.format {
812            FormatType::CreditCard { detect_only: true } => {
813                // For credit card detection, we want the ratio to be <= threshold
814                match_ratio <= self.threshold
815            }
816            _ => {
817                // For other formats, we want the ratio to be >= threshold
818                match_ratio >= self.threshold
819            }
820        };
821
822        if is_success {
823            Ok(ConstraintResult::success_with_metric(match_ratio))
824        } else {
825            let message = match &self.format {
826                FormatType::CreditCard { detect_only: true } => {
827                    format!(
828                        "Credit card detection ratio {match_ratio:.3} exceeds threshold {:.3}",
829                        self.threshold
830                    )
831                }
832                _ => {
833                    let desc = self.format.description();
834                    format!(
835                        "Format validation ratio {match_ratio:.3} is below threshold {:.3} - values that {desc}",
836                        self.threshold
837                    )
838                }
839            };
840
841            Ok(ConstraintResult::failure_with_metric(match_ratio, message))
842        }
843    }
844
845    fn name(&self) -> &str {
846        self.format.name()
847    }
848
849    fn column(&self) -> Option<&str> {
850        Some(&self.column)
851    }
852
853    fn metadata(&self) -> ConstraintMetadata {
854        let description = match &self.format {
855            FormatType::CreditCard { detect_only: true } => {
856                let threshold_pct = self.threshold * 100.0;
857                let desc = self.format.description();
858                format!(
859                    "Checks that no more than {threshold_pct:.1}% of values in '{}' {desc}",
860                    self.column
861                )
862            }
863            _ => {
864                let threshold_pct = self.threshold * 100.0;
865                let desc = self.format.description();
866                format!(
867                    "Checks that at least {threshold_pct:.1}% of values in '{}' {desc}",
868                    self.column
869                )
870            }
871        };
872
873        ConstraintMetadata::for_column(&self.column)
874            .with_description(description)
875            .with_custom("format_type", self.format.name())
876            .with_custom("threshold", self.threshold.to_string())
877            .with_custom("case_sensitive", self.options.case_sensitive.to_string())
878            .with_custom(
879                "trim_before_check",
880                self.options.trim_before_check.to_string(),
881            )
882            .with_custom("null_is_valid", self.options.null_is_valid.to_string())
883            .with_custom("constraint_type", "format")
884    }
885}
886
887#[cfg(test)]
888mod tests {
889    use super::*;
890    use crate::core::ConstraintStatus;
891    use arrow::array::StringArray;
892    use arrow::datatypes::{DataType, Field, Schema};
893    use arrow::record_batch::RecordBatch;
894    use datafusion::datasource::MemTable;
895    use std::sync::Arc;
896
897    use crate::test_helpers::evaluate_constraint_with_context;
898    async fn create_test_context(values: Vec<Option<&str>>) -> SessionContext {
899        let ctx = SessionContext::new();
900
901        let schema = Arc::new(Schema::new(vec![Field::new(
902            "text_col",
903            DataType::Utf8,
904            true,
905        )]));
906
907        let array = StringArray::from(values);
908        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
909
910        let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
911        ctx.register_table("data", Arc::new(provider)).unwrap();
912
913        ctx
914    }
915
916    #[tokio::test]
917    async fn test_email_format_constraint() {
918        let values = vec![
919            Some("test@example.com"),
920            Some("user@domain.org"),
921            Some("invalid-email"),
922            Some("another@test.net"),
923        ];
924        let ctx = create_test_context(values).await;
925
926        let constraint = FormatConstraint::email("text_col", 0.7).unwrap();
927
928        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
929            .await
930            .unwrap();
931        assert_eq!(result.status, ConstraintStatus::Success);
932        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are emails
933        assert_eq!(constraint.name(), "email");
934    }
935
936    #[tokio::test]
937    async fn test_url_format_constraint() {
938        let values = vec![
939            Some("https://example.com"),
940            Some("http://test.org"),
941            Some("not-a-url"),
942            Some("https://another.site.net/path"),
943        ];
944        let ctx = create_test_context(values).await;
945
946        let constraint = FormatConstraint::url("text_col", 0.7, false).unwrap();
947
948        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
949            .await
950            .unwrap();
951        assert_eq!(result.status, ConstraintStatus::Success);
952        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are URLs
953        assert_eq!(constraint.name(), "url");
954    }
955
956    #[tokio::test]
957    async fn test_url_with_localhost() {
958        let values = vec![
959            Some("https://localhost:3000"),
960            Some("http://localhost"),
961            Some("https://example.com"),
962            Some("not-a-url"),
963        ];
964        let ctx = create_test_context(values).await;
965
966        let constraint = FormatConstraint::url("text_col", 0.7, true).unwrap();
967
968        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
969            .await
970            .unwrap();
971        assert_eq!(result.status, ConstraintStatus::Success);
972        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are URLs (including localhost)
973    }
974
975    #[tokio::test]
976    async fn test_credit_card_detection() {
977        let values = vec![
978            Some("4111-1111-1111-1111"),
979            Some("5555 5555 5555 4444"),
980            Some("normal text"),
981            Some("4111111111111111"), // Visa format
982        ];
983        let ctx = create_test_context(values).await;
984
985        // Expect no more than 80% to be credit card numbers
986        let constraint = FormatConstraint::credit_card("text_col", 0.8, true).unwrap();
987
988        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
989            .await
990            .unwrap();
991        assert_eq!(result.status, ConstraintStatus::Success);
992        assert_eq!(constraint.name(), "credit_card");
993    }
994
995    #[tokio::test]
996    async fn test_phone_number_us() {
997        let values = vec![
998            Some("(555) 123-4567"),
999            Some("555-123-4567"),
1000            Some("5551234567"),
1001            Some("invalid-phone"),
1002        ];
1003        let ctx = create_test_context(values).await;
1004
1005        let constraint = FormatConstraint::phone("text_col", 0.7, Some("US".to_string())).unwrap();
1006
1007        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1008            .await
1009            .unwrap();
1010        assert_eq!(result.status, ConstraintStatus::Success);
1011        assert_eq!(constraint.name(), "phone");
1012    }
1013
1014    #[tokio::test]
1015    async fn test_postal_code_us() {
1016        let values = vec![
1017            Some("12345"),
1018            Some("12345-6789"),
1019            Some("invalid"),
1020            Some("98765"),
1021        ];
1022        let ctx = create_test_context(values).await;
1023
1024        let constraint = FormatConstraint::postal_code("text_col", 0.7, "US").unwrap();
1025
1026        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1027            .await
1028            .unwrap();
1029        assert_eq!(result.status, ConstraintStatus::Success);
1030        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid US postal codes
1031        assert_eq!(constraint.name(), "postal_code");
1032    }
1033
1034    #[tokio::test]
1035    async fn test_uuid_format() {
1036        let values = vec![
1037            Some("550e8400-e29b-41d4-a716-446655440000"),
1038            Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
1039            Some("invalid-uuid"),
1040            Some("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
1041        ];
1042        let ctx = create_test_context(values).await;
1043
1044        let constraint = FormatConstraint::uuid("text_col", 0.7).unwrap();
1045
1046        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1047            .await
1048            .unwrap();
1049        assert_eq!(result.status, ConstraintStatus::Success);
1050        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are UUIDs
1051        assert_eq!(constraint.name(), "uuid");
1052    }
1053
1054    #[tokio::test]
1055    async fn test_ipv4_format() {
1056        let values = vec![
1057            Some("192.168.1.1"),
1058            Some("10.0.0.1"),
1059            Some("256.256.256.256"), // Invalid - out of range
1060            Some("172.16.0.1"),
1061        ];
1062        let ctx = create_test_context(values).await;
1063
1064        let constraint = FormatConstraint::ipv4("text_col", 0.7).unwrap();
1065
1066        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1067            .await
1068            .unwrap();
1069        assert_eq!(result.status, ConstraintStatus::Success);
1070        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid IPv4
1071        assert_eq!(constraint.name(), "ipv4");
1072    }
1073
1074    #[tokio::test]
1075    async fn test_ipv6_format() {
1076        let values = vec![
1077            Some("2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
1078            Some("2001:db8:85a3::8a2e:370:7334"),
1079            Some("invalid-ipv6"),
1080            Some("::1"),
1081        ];
1082        let ctx = create_test_context(values).await;
1083
1084        let constraint = FormatConstraint::ipv6("text_col", 0.7).unwrap();
1085
1086        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1087            .await
1088            .unwrap();
1089        assert_eq!(result.status, ConstraintStatus::Success);
1090        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are valid IPv6
1091        assert_eq!(constraint.name(), "ipv6");
1092    }
1093
1094    #[tokio::test]
1095    async fn test_json_format() {
1096        let values = vec![
1097            Some(r#"{"key": "value"}"#),
1098            Some(r#"[1, 2, 3]"#),
1099            Some("not json"),
1100            Some(r#"{"nested": {"key": "value"}}"#),
1101        ];
1102        let ctx = create_test_context(values).await;
1103
1104        let constraint = FormatConstraint::json("text_col", 0.7).unwrap();
1105
1106        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1107            .await
1108            .unwrap();
1109        assert_eq!(result.status, ConstraintStatus::Success);
1110        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 look like JSON
1111        assert_eq!(constraint.name(), "json");
1112    }
1113
1114    #[tokio::test]
1115    async fn test_iso8601_datetime_format() {
1116        let values = vec![
1117            Some("2023-12-25T10:30:00Z"),
1118            Some("2023-12-25T10:30:00.123Z"),
1119            Some("invalid-datetime"),
1120            Some("2023-12-25T10:30:00+05:30"),
1121        ];
1122        let ctx = create_test_context(values).await;
1123
1124        let constraint = FormatConstraint::iso8601_datetime("text_col", 0.7).unwrap();
1125
1126        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1127            .await
1128            .unwrap();
1129        assert_eq!(result.status, ConstraintStatus::Success);
1130        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are ISO 8601
1131        assert_eq!(constraint.name(), "iso8601_datetime");
1132    }
1133
1134    #[tokio::test]
1135    async fn test_custom_regex_format() {
1136        let values = vec![
1137            Some("ABC123"),
1138            Some("DEF456"),
1139            Some("invalid"),
1140            Some("GHI789"),
1141        ];
1142        let ctx = create_test_context(values).await;
1143
1144        // Pattern to match 3 letters followed by 3 digits
1145        let constraint = FormatConstraint::regex("text_col", r"^[A-Z]{3}\d{3}$", 0.7).unwrap();
1146
1147        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1148            .await
1149            .unwrap();
1150        assert_eq!(result.status, ConstraintStatus::Success);
1151        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 match
1152        assert_eq!(constraint.name(), "regex");
1153    }
1154
1155    #[tokio::test]
1156    async fn test_format_options_case_insensitive() {
1157        let values = vec![
1158            Some("abc123"),
1159            Some("DEF456"),
1160            Some("invalid"),
1161            Some("ghi789"),
1162        ];
1163        let ctx = create_test_context(values).await;
1164
1165        // Pattern should match both upper and lower case when case_insensitive is true
1166        let constraint = FormatConstraint::new(
1167            "text_col",
1168            FormatType::Regex(r"^[A-Z]{3}\d{3}$".to_string()),
1169            0.7,
1170            FormatOptions::new().case_sensitive(false),
1171        )
1172        .unwrap();
1173
1174        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1175            .await
1176            .unwrap();
1177        assert_eq!(result.status, ConstraintStatus::Success);
1178        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 match (case insensitive)
1179    }
1180
1181    #[tokio::test]
1182    async fn test_format_options_trim_whitespace() {
1183        let values = vec![
1184            Some("  test@example.com  "),
1185            Some("user@domain.org"),
1186            Some("  invalid-email  "),
1187            Some(" another@test.net "),
1188        ];
1189        let ctx = create_test_context(values).await;
1190
1191        let constraint = FormatConstraint::new(
1192            "text_col",
1193            FormatType::Email,
1194            0.7,
1195            FormatOptions::new().trim_before_check(true),
1196        )
1197        .unwrap();
1198
1199        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1200            .await
1201            .unwrap();
1202        assert_eq!(result.status, ConstraintStatus::Success);
1203        assert_eq!(result.metric, Some(0.75)); // 3 out of 4 are emails after trimming
1204    }
1205
1206    #[tokio::test]
1207    async fn test_format_options_null_handling() {
1208        let values = vec![Some("test@example.com"), None, Some("invalid-email"), None];
1209        let ctx = create_test_context(values).await;
1210
1211        // With null_is_valid = true (default)
1212        let constraint1 = FormatConstraint::new(
1213            "text_col",
1214            FormatType::Email,
1215            0.6,
1216            FormatOptions::new().null_is_valid(true),
1217        )
1218        .unwrap();
1219
1220        let result1 = evaluate_constraint_with_context(&constraint1, &ctx, "data")
1221            .await
1222            .unwrap();
1223        assert_eq!(result1.status, ConstraintStatus::Success);
1224        assert_eq!(result1.metric, Some(0.75)); // 1 email + 2 nulls out of 4
1225
1226        // With null_is_valid = false
1227        let constraint2 = FormatConstraint::new(
1228            "text_col",
1229            FormatType::Email,
1230            0.2,
1231            FormatOptions::new().null_is_valid(false),
1232        )
1233        .unwrap();
1234
1235        let result2 = evaluate_constraint_with_context(&constraint2, &ctx, "data")
1236            .await
1237            .unwrap();
1238        assert_eq!(result2.status, ConstraintStatus::Success);
1239        assert_eq!(result2.metric, Some(0.25)); // Only 1 email out of 4
1240    }
1241
1242    #[tokio::test]
1243    async fn test_constraint_failure() {
1244        let values = vec![
1245            Some("invalid"),
1246            Some("also_invalid"),
1247            Some("nope"),
1248            Some("still_invalid"),
1249        ];
1250        let ctx = create_test_context(values).await;
1251
1252        let constraint = FormatConstraint::email("text_col", 0.5).unwrap();
1253
1254        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1255            .await
1256            .unwrap();
1257        assert_eq!(result.status, ConstraintStatus::Failure);
1258        assert_eq!(result.metric, Some(0.0)); // No values are emails
1259        assert!(result.message.is_some());
1260    }
1261
1262    #[tokio::test]
1263    async fn test_empty_data() {
1264        let ctx = create_test_context(vec![]).await;
1265        let constraint = FormatConstraint::email("text_col", 0.9).unwrap();
1266
1267        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1268            .await
1269            .unwrap();
1270        assert_eq!(result.status, ConstraintStatus::Skipped);
1271    }
1272
1273    #[test]
1274    fn test_invalid_threshold() {
1275        let result = FormatConstraint::email("col", 1.5);
1276        assert!(result.is_err());
1277        assert!(result
1278            .unwrap_err()
1279            .to_string()
1280            .contains("Threshold must be between 0.0 and 1.0"));
1281    }
1282
1283    #[test]
1284    fn test_pattern_caching() {
1285        // Test that patterns are cached for performance
1286        let format1 = FormatType::Email;
1287        let format2 = FormatType::Email;
1288
1289        let pattern1 = format1.get_pattern().unwrap();
1290        let pattern2 = format2.get_pattern().unwrap();
1291
1292        assert_eq!(pattern1, pattern2);
1293
1294        // Accessing cache multiple times should be fast
1295        for _ in 0..100 {
1296            let _ = format1.get_pattern().unwrap();
1297        }
1298    }
1299
1300    #[test]
1301    fn test_format_type_descriptions() {
1302        assert_eq!(FormatType::Email.description(), "are valid email addresses");
1303        assert_eq!(
1304            FormatType::Url {
1305                allow_localhost: true
1306            }
1307            .description(),
1308            "are valid URLs (including localhost)"
1309        );
1310        assert_eq!(
1311            FormatType::Phone {
1312                country: Some("US".to_string())
1313            }
1314            .description(),
1315            "are valid US phone numbers"
1316        );
1317        assert_eq!(
1318            FormatType::PostalCode {
1319                country: "CA".to_string()
1320            }
1321            .description(),
1322            "are valid CA postal codes"
1323        );
1324    }
1325
1326    #[test]
1327    fn test_all_format_types_have_patterns() {
1328        // Ensure all format types can generate valid patterns
1329        let formats = vec![
1330            FormatType::Email,
1331            FormatType::Url {
1332                allow_localhost: false,
1333            },
1334            FormatType::Url {
1335                allow_localhost: true,
1336            },
1337            FormatType::CreditCard { detect_only: false },
1338            FormatType::Phone { country: None },
1339            FormatType::Phone {
1340                country: Some("US".to_string()),
1341            },
1342            FormatType::PostalCode {
1343                country: "US".to_string(),
1344            },
1345            FormatType::UUID,
1346            FormatType::IPv4,
1347            FormatType::IPv6,
1348            FormatType::Json,
1349            FormatType::Iso8601DateTime,
1350            FormatType::Regex(r"^\d+$".to_string()),
1351        ];
1352
1353        for format in formats {
1354            assert!(
1355                format.get_pattern().is_ok(),
1356                "Format {format:?} should have a valid pattern"
1357            );
1358        }
1359    }
1360
1361    #[test]
1362    fn test_format_options_convenience_methods() {
1363        // Test case_insensitive()
1364        let options = FormatOptions::case_insensitive();
1365        assert!(!options.case_sensitive);
1366        assert!(!options.trim_before_check);
1367        assert!(options.null_is_valid);
1368
1369        // Test strict()
1370        let options = FormatOptions::strict();
1371        assert!(options.case_sensitive);
1372        assert!(!options.trim_before_check);
1373        assert!(!options.null_is_valid);
1374
1375        // Test lenient()
1376        let options = FormatOptions::lenient();
1377        assert!(!options.case_sensitive);
1378        assert!(options.trim_before_check);
1379        assert!(options.null_is_valid);
1380
1381        // Test with_trimming()
1382        let options = FormatOptions::with_trimming();
1383        assert!(options.case_sensitive);
1384        assert!(options.trim_before_check);
1385        assert!(options.null_is_valid);
1386    }
1387
1388    #[tokio::test]
1389    async fn test_ssn_format_valid() {
1390        let values = vec![
1391            Some("123-45-6789"), // Valid hyphenated
1392            Some("123456789"),   // Valid non-hyphenated
1393            Some("456-78-9012"), // Valid hyphenated
1394            Some("789012345"),   // Valid non-hyphenated
1395        ];
1396        let ctx = create_test_context(values).await;
1397
1398        let constraint = FormatConstraint::social_security_number("text_col", 0.95).unwrap();
1399
1400        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1401            .await
1402            .unwrap();
1403        assert_eq!(result.status, ConstraintStatus::Success);
1404        assert_eq!(result.metric, Some(1.0)); // All are valid SSNs
1405        assert_eq!(constraint.name(), "social_security_number");
1406    }
1407
1408    #[tokio::test]
1409    async fn test_ssn_format_invalid_patterns() {
1410        let values = vec![
1411            Some("000-12-3456"), // Invalid: starts with 000
1412            Some("666-12-3456"), // Invalid: starts with 666
1413            Some("900-12-3456"), // Invalid: starts with 9xx
1414            Some("123-00-4567"), // Invalid: middle is 00
1415            Some("123-45-0000"), // Invalid: last four are 0000
1416        ];
1417        let ctx = create_test_context(values).await;
1418
1419        // Threshold means we expect AT LEAST that percentage to be valid
1420        // Since none are valid (0.0), and we're expecting at least 0.0, it's Success
1421        let constraint = FormatConstraint::social_security_number("text_col", 0.0).unwrap();
1422
1423        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1424            .await
1425            .unwrap();
1426        // None of these should be valid with our regex
1427        // 0.0 >= 0.0 means Success
1428        assert_eq!(result.status, ConstraintStatus::Success);
1429        assert_eq!(result.metric, Some(0.0)); // None should be valid
1430    }
1431
1432    #[tokio::test]
1433    async fn test_ssn_format_mixed() {
1434        let values = vec![
1435            Some("123-45-6789"), // Valid
1436            Some("not-an-ssn"),  // Invalid format
1437            Some("666-12-3456"), // Invalid: starts with 666
1438            Some("456789012"),   // Valid non-hyphenated
1439            Some("123 45 6789"), // Invalid: spaces instead of hyphens
1440            Some("789-01-2345"), // Valid
1441            None,                // Null value
1442            Some("234-56-7890"), // Valid
1443        ];
1444        let ctx = create_test_context(values).await;
1445
1446        let constraint = FormatConstraint::social_security_number("text_col", 0.5).unwrap();
1447
1448        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1449            .await
1450            .unwrap();
1451        assert_eq!(result.status, ConstraintStatus::Success);
1452        // Valid SSNs: 123-45-6789, 456789012, 789-01-2345, 234-56-7890
1453        // Invalid: not-an-ssn, 666-12-3456, "123 45 6789" (spaces)
1454        // Null is handled by default options (null_is_valid = true in FormatOptions)
1455        // So we have 4 valid + 1 null = 5 matches out of 8 total = 0.625
1456        assert_eq!(result.metric, Some(0.625));
1457    }
1458
1459    #[tokio::test]
1460    async fn test_ssn_format_threshold() {
1461        let values = vec![
1462            Some("123-45-6789"), // Valid
1463            Some("invalid"),     // Invalid
1464            Some("234-56-7890"), // Valid
1465            Some("not-ssn"),     // Invalid
1466        ];
1467        let ctx = create_test_context(values).await;
1468
1469        // Test with threshold that should fail
1470        let constraint = FormatConstraint::social_security_number("text_col", 0.8).unwrap();
1471        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1472            .await
1473            .unwrap();
1474        assert_eq!(result.status, ConstraintStatus::Failure); // 0.5 < 0.8
1475        assert_eq!(result.metric, Some(0.5));
1476
1477        // Test with threshold that should pass
1478        let constraint = FormatConstraint::social_security_number("text_col", 0.4).unwrap();
1479        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1480            .await
1481            .unwrap();
1482        assert_eq!(result.status, ConstraintStatus::Success); // 0.5 >= 0.4
1483        assert_eq!(result.metric, Some(0.5));
1484    }
1485
1486    #[tokio::test]
1487    async fn test_ssn_edge_cases() {
1488        let values = vec![
1489            Some("078-05-1120"),  // Valid: Woolworth's SSN (historically used in examples)
1490            Some("219-09-9999"),  // Valid: Used in advertisements
1491            Some("457-55-5462"),  // Valid: Used in LifeLock ads
1492            Some("999-99-9999"),  // Invalid: starts with 9xx
1493            Some("123-45-67890"), // Invalid: too many digits
1494            Some("12-345-6789"),  // Invalid: wrong format
1495            Some("ABC-DE-FGHI"),  // Invalid: letters
1496            Some(""),             // Invalid: empty string
1497        ];
1498        let ctx = create_test_context(values).await;
1499
1500        let constraint = FormatConstraint::social_security_number("text_col", 0.3).unwrap();
1501
1502        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
1503            .await
1504            .unwrap();
1505        // Only first 3 are valid
1506        assert_eq!(result.metric, Some(0.375)); // 3 out of 8
1507        assert_eq!(result.status, ConstraintStatus::Success); // 0.375 >= 0.3
1508    }
1509}