1use crate::core::{current_validation_context, Constraint, ConstraintMetadata, ConstraintResult};
172use crate::prelude::*;
173use crate::security::SqlSecurity;
174use arrow::array::Array;
175use async_trait::async_trait;
176use datafusion::prelude::*;
177use once_cell::sync::Lazy;
178use serde::{Deserialize, Serialize};
179use std::collections::HashMap;
180use std::sync::RwLock;
181use tracing::instrument;
/// Process-wide cache of resolved regex pattern strings.
///
/// Keys are the `Debug` rendering of a `FormatType` (see `FormatType::get_pattern`);
/// values are the pattern source strings. The map is created lazily on first
/// access and guarded by an `RwLock`.
static PATTERN_CACHE: Lazy<RwLock<HashMap<String, String>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
185
/// The kind of textual format a column's values are checked against.
///
/// Each variant resolves to a regular expression in `FormatType::get_pattern`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FormatType {
    /// A caller-supplied regular expression; validated through
    /// `SqlSecurity::validate_regex_pattern` before use.
    Regex(String),
    /// E-mail address syntax.
    Email,
    /// `http`/`https` URL. With `allow_localhost` the pattern additionally
    /// accepts `localhost` and dotted-quad numeric hosts.
    Url { allow_localhost: bool },
    /// Credit card number shapes (bare digits or four groups of four digits).
    /// When `detect_only` is `true`, evaluation succeeds if the match ratio is
    /// at or *below* the threshold instead of at or above it.
    CreditCard { detect_only: bool },
    /// Phone number. `"US"`, `"CA"`, `"UK"`, `"DE"` and `"FR"` have dedicated
    /// patterns; any other value (or `None`) uses a generic digits-only pattern.
    Phone { country: Option<String> },
    /// Postal code. `"US"`, `"CA"`, `"UK"`, `"DE"`, `"FR"`, `"JP"` and `"AU"`
    /// have dedicated patterns; any other value uses a generic
    /// 3–10 character alphanumeric pattern.
    PostalCode { country: String },
    /// Hyphenated UUID with version digit 1–5 and variant digit 8, 9, a or b.
    UUID,
    /// Dotted-quad IPv4 address with each octet in 0–255.
    IPv4,
    /// IPv6 address (full or `::`-compressed colon-separated hex groups).
    IPv6,
    /// Text that starts with `{` or `[` and ends with `}` or `]`.
    /// The pattern only checks these delimiters, not the content in between.
    Json,
    /// `YYYY-MM-DDTHH:MM:SS` with optional fractional seconds and a mandatory
    /// `Z` or `±HH:MM` offset.
    Iso8601DateTime,
    /// US Social Security Number, with or without hyphens.
    SocialSecurityNumber,
}
214
215impl FormatType {
216 fn get_pattern(&self) -> Result<String> {
218 let cache_key = format!("{self:?}");
219
220 {
222 let cache = PATTERN_CACHE.read().map_err(|_| {
223 TermError::Internal("Failed to acquire read lock on pattern cache".to_string())
224 })?;
225 if let Some(pattern) = cache.get(&cache_key) {
226 return Ok(pattern.clone());
227 }
228 }
229
230 let pattern = match self {
231 FormatType::Regex(pattern) => {
232 SqlSecurity::validate_regex_pattern(pattern)?;
233 pattern.clone()
234 }
235 FormatType::Email => {
236 r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$".to_string()
238 }
239 FormatType::Url { allow_localhost } => {
240 if *allow_localhost {
241 r"^https?://(?:localhost|(?:[a-zA-Z0-9.-]+\.?[a-zA-Z]{2,}|(?:\d{1,3}\.){3}\d{1,3}))(?::\d+)?(?:/[^\s]*)?$".to_string()
242 } else {
243 r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?$".to_string()
244 }
245 }
246 FormatType::CreditCard { .. } => {
247 r"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})$|^(?:\d{4}[-\s]?){3}\d{4}$".to_string()
249 }
250 FormatType::Phone { country } => {
251 match country.as_deref() {
252 Some("US") | Some("CA") => r"^(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$".to_string(),
253 Some("UK") => r"^(\+44\s?)?(?:\(?0\d{4}\)?\s?\d{6}|\(?0\d{3}\)?\s?\d{7}|\(?0\d{2}\)?\s?\d{8})$".to_string(),
254 Some("DE") => r"^(\+49\s?)?(?:\(?0\d{2,5}\)?\s?\d{4,12})$".to_string(),
255 Some("FR") => r"^(\+33\s?)?(?:\(?0\d{1}\)?\s?\d{8})$".to_string(),
256 _ => r"^[\+]?[1-9][\d]{0,15}$".to_string(), }
258 }
259 FormatType::PostalCode { country } => {
260 match country.as_str() {
261 "US" => r"^\d{5}(-\d{4})?$".to_string(),
262 "CA" => r"^[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$".to_string(),
263 "UK" => r"^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$".to_string(),
264 "DE" => r"^\d{5}$".to_string(),
265 "FR" => r"^\d{5}$".to_string(),
266 "JP" => r"^\d{3}-\d{4}$".to_string(),
267 "AU" => r"^\d{4}$".to_string(),
268 _ => r"^[A-Za-z0-9\s-]{3,10}$".to_string(), }
270 }
271 FormatType::UUID => {
272 r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$".to_string()
273 }
274 FormatType::IPv4 => {
275 r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$".to_string()
276 }
277 FormatType::IPv6 => {
278 r"^([0-9a-fA-F]{0,4}:){1,7}([0-9a-fA-F]{0,4})?$|^::$|^::1$|^([0-9a-fA-F]{1,4}:)*::([0-9a-fA-F]{1,4}:)*[0-9a-fA-F]{1,4}$".to_string()
280 }
281 FormatType::Json => {
282 r"^\s*[\{\[].*[\}\]]\s*$".to_string()
284 }
285 FormatType::Iso8601DateTime => {
286 r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$".to_string()
288 }
289 FormatType::SocialSecurityNumber => {
290 r"^(00[1-9]|0[1-9][0-9]|[1-5][0-9]{2}|6[0-5][0-9]|66[0-5]|667|66[89]|6[7-9][0-9]|[7-8][0-9]{2})-?(0[1-9]|[1-9][0-9])-?(000[1-9]|00[1-9][0-9]|0[1-9][0-9]{2}|[1-9][0-9]{3})$".to_string()
295 }
296 };
297
298 {
300 let mut cache = PATTERN_CACHE.write().map_err(|_| {
301 TermError::Internal("Failed to acquire write lock on pattern cache".to_string())
302 })?;
303 cache.insert(cache_key, pattern.clone());
304 }
305
306 Ok(pattern)
307 }
308
309 pub fn name(&self) -> &str {
311 match self {
312 FormatType::Regex(_) => "regex",
313 FormatType::Email => "email",
314 FormatType::Url { .. } => "url",
315 FormatType::CreditCard { .. } => "credit_card",
316 FormatType::Phone { .. } => "phone",
317 FormatType::PostalCode { .. } => "postal_code",
318 FormatType::UUID => "uuid",
319 FormatType::IPv4 => "ipv4",
320 FormatType::IPv6 => "ipv6",
321 FormatType::Json => "json",
322 FormatType::Iso8601DateTime => "iso8601_datetime",
323 FormatType::SocialSecurityNumber => "social_security_number",
324 }
325 }
326
327 pub fn description(&self) -> String {
329 match self {
330 FormatType::Regex(pattern) => format!("matches pattern '{pattern}'"),
331 FormatType::Email => "are valid email addresses".to_string(),
332 FormatType::Url { allow_localhost } => {
333 if *allow_localhost {
334 "are valid URLs (including localhost)".to_string()
335 } else {
336 "are valid URLs".to_string()
337 }
338 }
339 FormatType::CreditCard { detect_only } => {
340 if *detect_only {
341 "contain credit card number patterns".to_string()
342 } else {
343 "are valid credit card numbers".to_string()
344 }
345 }
346 FormatType::Phone { country } => match country.as_deref() {
347 Some(c) => format!("are valid {c} phone numbers"),
348 None => "are valid phone numbers".to_string(),
349 },
350 FormatType::PostalCode { country } => {
351 format!("are valid {country} postal codes")
352 }
353 FormatType::UUID => "are valid UUIDs".to_string(),
354 FormatType::IPv4 => "are valid IPv4 addresses".to_string(),
355 FormatType::IPv6 => "are valid IPv6 addresses".to_string(),
356 FormatType::Json => "are valid JSON documents".to_string(),
357 FormatType::Iso8601DateTime => "are valid ISO 8601 date-time strings".to_string(),
358 FormatType::SocialSecurityNumber => {
359 "contain Social Security Number patterns".to_string()
360 }
361 }
362 }
363}
364
/// Tuning knobs applied when a `FormatConstraint` builds its SQL check.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FormatOptions {
    /// `true` uses the case-sensitive regex operator `~`; `false` uses `~*`.
    pub case_sensitive: bool,
    /// When `true`, the column is wrapped in `TRIM(...)` before matching.
    pub trim_before_check: bool,
    /// When `true`, rows where the column is NULL are counted as matches.
    pub null_is_valid: bool,
}
375
376impl Default for FormatOptions {
377 fn default() -> Self {
378 Self {
379 case_sensitive: true,
380 trim_before_check: false,
381 null_is_valid: true, }
383 }
384}
385
386impl FormatOptions {
387 pub fn new() -> Self {
389 Self::default()
390 }
391
392 pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
394 self.case_sensitive = case_sensitive;
395 self
396 }
397
398 pub fn trim_before_check(mut self, trim: bool) -> Self {
400 self.trim_before_check = trim;
401 self
402 }
403
404 pub fn null_is_valid(mut self, null_valid: bool) -> Self {
406 self.null_is_valid = null_valid;
407 self
408 }
409
410 pub fn case_insensitive() -> Self {
423 Self::new().case_sensitive(false)
424 }
425
426 pub fn strict() -> Self {
441 Self::new().null_is_valid(false)
442 }
443
444 pub fn lenient() -> Self {
459 Self::new()
460 .case_sensitive(false)
461 .trim_before_check(true)
462 .null_is_valid(true)
463 }
464
465 pub fn with_trimming() -> Self {
478 Self::new().trim_before_check(true)
479 }
480}
481
/// Constraint checking what fraction of a column's values match a format.
///
/// Instances are built through `FormatConstraint::new` or one of the
/// per-format convenience constructors, all of which validate their inputs.
#[derive(Debug, Clone)]
pub struct FormatConstraint {
    /// Name of the column to check (validated as an identifier on construction).
    column: String,
    /// Format whose regex pattern is applied to the column.
    format: FormatType,
    /// Ratio in `0.0..=1.0` that the match ratio is compared against.
    threshold: f64,
    /// Matching options (case sensitivity, trimming, NULL handling).
    options: FormatOptions,
}
529
530impl FormatConstraint {
531 pub fn new(
544 column: impl Into<String>,
545 format: FormatType,
546 threshold: f64,
547 options: FormatOptions,
548 ) -> Result<Self> {
549 let column_str = column.into();
550
551 SqlSecurity::validate_identifier(&column_str)?;
553
554 if !(0.0..=1.0).contains(&threshold) {
555 return Err(TermError::SecurityError(
556 "Threshold must be between 0.0 and 1.0".to_string(),
557 ));
558 }
559
560 format.get_pattern()?;
562
563 Ok(Self {
564 column: column_str,
565 format,
566 threshold,
567 options,
568 })
569 }
570
571 pub fn email(column: impl Into<String>, threshold: f64) -> Result<Self> {
573 Self::new(
574 column,
575 FormatType::Email,
576 threshold,
577 FormatOptions::default(),
578 )
579 }
580
581 pub fn url(column: impl Into<String>, threshold: f64, allow_localhost: bool) -> Result<Self> {
583 Self::new(
584 column,
585 FormatType::Url { allow_localhost },
586 threshold,
587 FormatOptions::default(),
588 )
589 }
590
591 pub fn credit_card(
593 column: impl Into<String>,
594 threshold: f64,
595 detect_only: bool,
596 ) -> Result<Self> {
597 Self::new(
598 column,
599 FormatType::CreditCard { detect_only },
600 threshold,
601 FormatOptions::default(),
602 )
603 }
604
605 pub fn phone(
607 column: impl Into<String>,
608 threshold: f64,
609 country: Option<String>,
610 ) -> Result<Self> {
611 Self::new(
612 column,
613 FormatType::Phone { country },
614 threshold,
615 FormatOptions::new().trim_before_check(true),
616 )
617 }
618
619 pub fn postal_code(
621 column: impl Into<String>,
622 threshold: f64,
623 country: impl Into<String>,
624 ) -> Result<Self> {
625 Self::new(
626 column,
627 FormatType::PostalCode {
628 country: country.into(),
629 },
630 threshold,
631 FormatOptions::new().trim_before_check(true),
632 )
633 }
634
635 pub fn uuid(column: impl Into<String>, threshold: f64) -> Result<Self> {
637 Self::new(
638 column,
639 FormatType::UUID,
640 threshold,
641 FormatOptions::default(),
642 )
643 }
644
645 pub fn ipv4(column: impl Into<String>, threshold: f64) -> Result<Self> {
647 Self::new(
648 column,
649 FormatType::IPv4,
650 threshold,
651 FormatOptions::default(),
652 )
653 }
654
655 pub fn ipv6(column: impl Into<String>, threshold: f64) -> Result<Self> {
657 Self::new(
658 column,
659 FormatType::IPv6,
660 threshold,
661 FormatOptions::default(),
662 )
663 }
664
665 pub fn json(column: impl Into<String>, threshold: f64) -> Result<Self> {
667 Self::new(
668 column,
669 FormatType::Json,
670 threshold,
671 FormatOptions::default(),
672 )
673 }
674
675 pub fn iso8601_datetime(column: impl Into<String>, threshold: f64) -> Result<Self> {
677 Self::new(
678 column,
679 FormatType::Iso8601DateTime,
680 threshold,
681 FormatOptions::default(),
682 )
683 }
684
685 pub fn regex(
687 column: impl Into<String>,
688 pattern: impl Into<String>,
689 threshold: f64,
690 ) -> Result<Self> {
691 Self::new(
692 column,
693 FormatType::Regex(pattern.into()),
694 threshold,
695 FormatOptions::default(),
696 )
697 }
698
699 pub fn social_security_number(column: impl Into<String>, threshold: f64) -> Result<Self> {
724 Self::new(
725 column,
726 FormatType::SocialSecurityNumber,
727 threshold,
728 FormatOptions::new().trim_before_check(true),
729 )
730 }
731}
732
733#[async_trait]
734impl Constraint for FormatConstraint {
735 #[instrument(skip(self, ctx), fields(
736 column = %self.column,
737 format = %self.format.name(),
738 threshold = %self.threshold
739 ))]
740 async fn evaluate(&self, ctx: &SessionContext) -> Result<ConstraintResult> {
741 let validation_ctx = current_validation_context();
743 let table_name = validation_ctx.table_name();
744
745 let column_identifier = SqlSecurity::escape_identifier(&self.column)?;
746 let pattern = self.format.get_pattern()?;
747 let escaped_pattern = SqlSecurity::validate_regex_pattern(&pattern)?;
748
749 let column_expr = if self.options.trim_before_check {
751 format!("TRIM({column_identifier})")
752 } else {
753 column_identifier.clone()
754 };
755
756 let pattern_operator = if self.options.case_sensitive {
757 "~"
758 } else {
759 "~*"
760 };
761
762 let sql = if self.options.null_is_valid {
763 format!(
764 "SELECT
765 COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' OR {column_identifier} IS NULL THEN 1 END) as matches,
766 COUNT(*) as total
767 FROM {table_name}"
768 )
769 } else {
770 format!(
771 "SELECT
772 COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' THEN 1 END) as matches,
773 COUNT(*) as total
774 FROM {table_name}"
775 )
776 };
777
778 let df = ctx.sql(&sql).await?;
779 let batches = df.collect().await?;
780
781 if batches.is_empty() {
782 return Ok(ConstraintResult::skipped("No data to validate"));
783 }
784
785 let batch = &batches[0];
786 if batch.num_rows() == 0 {
787 return Ok(ConstraintResult::skipped("No data to validate"));
788 }
789
790 let matches = batch
791 .column(0)
792 .as_any()
793 .downcast_ref::<arrow::array::Int64Array>()
794 .ok_or_else(|| TermError::Internal("Failed to extract match count".to_string()))?
795 .value(0) as f64;
796
797 let total = batch
798 .column(1)
799 .as_any()
800 .downcast_ref::<arrow::array::Int64Array>()
801 .ok_or_else(|| TermError::Internal("Failed to extract total count".to_string()))?
802 .value(0) as f64;
803
804 if total == 0.0 {
805 return Ok(ConstraintResult::skipped("No data to validate"));
806 }
807
808 let match_ratio = matches / total;
809
810 let is_success = match &self.format {
812 FormatType::CreditCard { detect_only: true } => {
813 match_ratio <= self.threshold
815 }
816 _ => {
817 match_ratio >= self.threshold
819 }
820 };
821
822 if is_success {
823 Ok(ConstraintResult::success_with_metric(match_ratio))
824 } else {
825 let message = match &self.format {
826 FormatType::CreditCard { detect_only: true } => {
827 format!(
828 "Credit card detection ratio {match_ratio:.3} exceeds threshold {:.3}",
829 self.threshold
830 )
831 }
832 _ => {
833 let desc = self.format.description();
834 format!(
835 "Format validation ratio {match_ratio:.3} is below threshold {:.3} - values that {desc}",
836 self.threshold
837 )
838 }
839 };
840
841 Ok(ConstraintResult::failure_with_metric(match_ratio, message))
842 }
843 }
844
845 fn name(&self) -> &str {
846 self.format.name()
847 }
848
849 fn column(&self) -> Option<&str> {
850 Some(&self.column)
851 }
852
853 fn metadata(&self) -> ConstraintMetadata {
854 let description = match &self.format {
855 FormatType::CreditCard { detect_only: true } => {
856 let threshold_pct = self.threshold * 100.0;
857 let desc = self.format.description();
858 format!(
859 "Checks that no more than {threshold_pct:.1}% of values in '{}' {desc}",
860 self.column
861 )
862 }
863 _ => {
864 let threshold_pct = self.threshold * 100.0;
865 let desc = self.format.description();
866 format!(
867 "Checks that at least {threshold_pct:.1}% of values in '{}' {desc}",
868 self.column
869 )
870 }
871 };
872
873 ConstraintMetadata::for_column(&self.column)
874 .with_description(description)
875 .with_custom("format_type", self.format.name())
876 .with_custom("threshold", self.threshold.to_string())
877 .with_custom("case_sensitive", self.options.case_sensitive.to_string())
878 .with_custom(
879 "trim_before_check",
880 self.options.trim_before_check.to_string(),
881 )
882 .with_custom("null_is_valid", self.options.null_is_valid.to_string())
883 .with_custom("constraint_type", "format")
884 }
885}
886
#[cfg(test)]
mod tests {
    //! Tests for `FormatConstraint`: each async test registers a one-column
    //! in-memory table named `data` and checks the reported status and ratio.

    use super::*;
    use crate::core::ConstraintStatus;
    use arrow::array::StringArray;
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion::datasource::MemTable;
    use std::sync::Arc;

    use crate::test_helpers::evaluate_constraint_with_context;

    /// Builds a session with a table `data` holding one nullable Utf8 column
    /// `text_col` populated from `values`.
    async fn create_test_context(values: Vec<Option<&str>>) -> SessionContext {
        let ctx = SessionContext::new();

        let schema = Arc::new(Schema::new(vec![Field::new(
            "text_col",
            DataType::Utf8,
            true,
        )]));

        let array = StringArray::from(values);
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();

        let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap();
        ctx.register_table("data", Arc::new(provider)).unwrap();

        ctx
    }

    #[tokio::test]
    async fn test_email_format_constraint() {
        let values = vec![
            Some("test@example.com"),
            Some("user@domain.org"),
            Some("invalid-email"),
            Some("another@test.net"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::email("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // 3 of 4 values are well-formed.
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "email");
    }

    #[tokio::test]
    async fn test_url_format_constraint() {
        let values = vec![
            Some("https://example.com"),
            Some("http://test.org"),
            Some("not-a-url"),
            Some("https://another.site.net/path"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::url("text_col", 0.7, false).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // 3 of 4 values are URLs.
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "url");
    }

    #[tokio::test]
    async fn test_url_with_localhost() {
        let values = vec![
            Some("https://localhost:3000"),
            Some("http://localhost"),
            Some("https://example.com"),
            Some("not-a-url"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::url("text_col", 0.7, true).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // Both localhost forms plus the regular URL match.
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_credit_card_detection() {
        let values = vec![
            Some("4111-1111-1111-1111"),
            Some("5555 5555 5555 4444"),
            Some("normal text"),
            Some("4111111111111111"),
        ];
        let ctx = create_test_context(values).await;

        // Detect-only mode: passes while the detected ratio (0.75) stays at or
        // below the 0.8 threshold.
        let constraint = FormatConstraint::credit_card("text_col", 0.8, true).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "credit_card");
    }

    #[tokio::test]
    async fn test_phone_number_us() {
        let values = vec![
            Some("(555) 123-4567"),
            Some("555-123-4567"),
            Some("5551234567"),
            Some("invalid-phone"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::phone("text_col", 0.7, Some("US".to_string())).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "phone");
    }

    #[tokio::test]
    async fn test_postal_code_us() {
        let values = vec![
            Some("12345"),
            Some("12345-6789"),
            Some("invalid"),
            Some("98765"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::postal_code("text_col", 0.7, "US").unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // ZIP and ZIP+4 forms are both accepted.
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "postal_code");
    }

    #[tokio::test]
    async fn test_uuid_format() {
        let values = vec![
            Some("550e8400-e29b-41d4-a716-446655440000"),
            Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
            Some("invalid-uuid"),
            Some("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::uuid("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "uuid");
    }

    #[tokio::test]
    async fn test_ipv4_format() {
        let values = vec![
            Some("192.168.1.1"),
            Some("10.0.0.1"),
            // Octets above 255 must be rejected.
            Some("256.256.256.256"),
            Some("172.16.0.1"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::ipv4("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv4");
    }

    #[tokio::test]
    async fn test_ipv6_format() {
        let values = vec![
            Some("2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
            Some("2001:db8:85a3::8a2e:370:7334"),
            Some("invalid-ipv6"),
            Some("::1"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::ipv6("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // Full, compressed and loopback forms match.
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv6");
    }

    #[tokio::test]
    async fn test_json_format() {
        let values = vec![
            Some(r#"{"key": "value"}"#),
            Some(r#"[1, 2, 3]"#),
            Some("not json"),
            Some(r#"{"nested": {"key": "value"}}"#),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::json("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "json");
    }

    #[tokio::test]
    async fn test_iso8601_datetime_format() {
        let values = vec![
            Some("2023-12-25T10:30:00Z"),
            Some("2023-12-25T10:30:00.123Z"),
            Some("invalid-datetime"),
            Some("2023-12-25T10:30:00+05:30"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::iso8601_datetime("text_col", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "iso8601_datetime");
    }

    #[tokio::test]
    async fn test_custom_regex_format() {
        let values = vec![
            Some("ABC123"),
            Some("DEF456"),
            Some("invalid"),
            Some("GHI789"),
        ];
        let ctx = create_test_context(values).await;

        // Three uppercase letters followed by three digits.
        let constraint = FormatConstraint::regex("text_col", r"^[A-Z]{3}\d{3}$", 0.7).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "regex");
    }

    #[tokio::test]
    async fn test_format_options_case_insensitive() {
        let values = vec![
            Some("abc123"),
            Some("DEF456"),
            Some("invalid"),
            Some("ghi789"),
        ];
        let ctx = create_test_context(values).await;

        // The pattern is uppercase-only; case-insensitive matching lets the
        // lowercase rows through.
        let constraint = FormatConstraint::new(
            "text_col",
            FormatType::Regex(r"^[A-Z]{3}\d{3}$".to_string()),
            0.7,
            FormatOptions::new().case_sensitive(false),
        )
        .unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_trim_whitespace() {
        let values = vec![
            Some(" test@example.com "),
            Some("user@domain.org"),
            Some(" invalid-email "),
            Some(" another@test.net "),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.7,
            FormatOptions::new().trim_before_check(true),
        )
        .unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // Padded addresses match once trimmed.
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_null_handling() {
        let values = vec![Some("test@example.com"), None, Some("invalid-email"), None];
        let ctx = create_test_context(values).await;

        // NULLs counted as valid: 1 valid address + 2 NULLs out of 4 rows.
        let constraint1 = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.6,
            FormatOptions::new().null_is_valid(true),
        )
        .unwrap();

        let result1 = evaluate_constraint_with_context(&constraint1, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result1.status, ConstraintStatus::Success);
        assert_eq!(result1.metric, Some(0.75));

        // NULLs not counted: only the single valid address out of 4 rows.
        let constraint2 = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.2,
            FormatOptions::new().null_is_valid(false),
        )
        .unwrap();

        let result2 = evaluate_constraint_with_context(&constraint2, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result2.status, ConstraintStatus::Success);
        assert_eq!(result2.metric, Some(0.25));
    }

    #[tokio::test]
    async fn test_constraint_failure() {
        let values = vec![
            Some("invalid"),
            Some("also_invalid"),
            Some("nope"),
            Some("still_invalid"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::email("text_col", 0.5).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Failure);
        // Nothing matches, and a failure carries an explanatory message.
        assert_eq!(result.metric, Some(0.0));
        assert!(result.message.is_some());
    }

    #[tokio::test]
    async fn test_empty_data() {
        let ctx = create_test_context(vec![]).await;
        let constraint = FormatConstraint::email("text_col", 0.9).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        // An empty table is skipped rather than passed or failed.
        assert_eq!(result.status, ConstraintStatus::Skipped);
    }

    #[test]
    fn test_invalid_threshold() {
        let result = FormatConstraint::email("col", 1.5);
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Threshold must be between 0.0 and 1.0"));
    }

    #[test]
    fn test_pattern_caching() {
        let format1 = FormatType::Email;
        let format2 = FormatType::Email;

        let pattern1 = format1.get_pattern().unwrap();
        let pattern2 = format2.get_pattern().unwrap();

        assert_eq!(pattern1, pattern2);

        // Repeated lookups keep succeeding once the entry is cached.
        for _ in 0..100 {
            let _ = format1.get_pattern().unwrap();
        }

        // The resolved pattern must actually be stored under the Debug key.
        // The guard is taken only after all `get_pattern` calls so this test
        // never holds the lock while re-entering it.
        let key = format!("{:?}", FormatType::Email);
        let cache = PATTERN_CACHE
            .read()
            .expect("pattern cache lock should not be poisoned");
        assert_eq!(cache.get(&key), Some(&pattern1));
    }

    #[test]
    fn test_format_type_descriptions() {
        assert_eq!(FormatType::Email.description(), "are valid email addresses");
        assert_eq!(
            FormatType::Url {
                allow_localhost: true
            }
            .description(),
            "are valid URLs (including localhost)"
        );
        assert_eq!(
            FormatType::Phone {
                country: Some("US".to_string())
            }
            .description(),
            "are valid US phone numbers"
        );
        assert_eq!(
            FormatType::PostalCode {
                country: "CA".to_string()
            }
            .description(),
            "are valid CA postal codes"
        );
    }

    #[test]
    fn test_all_format_types_have_patterns() {
        let mut formats = vec![
            FormatType::Email,
            FormatType::Url {
                allow_localhost: false,
            },
            FormatType::Url {
                allow_localhost: true,
            },
            FormatType::CreditCard { detect_only: false },
            FormatType::CreditCard { detect_only: true },
            FormatType::Phone { country: None },
            FormatType::UUID,
            FormatType::IPv4,
            FormatType::IPv6,
            FormatType::Json,
            FormatType::Iso8601DateTime,
            FormatType::SocialSecurityNumber,
            FormatType::Regex(r"^\d+$".to_string()),
        ];

        // Cover every country-specific branch plus the fallback ("ZZ").
        formats.extend(
            ["US", "CA", "UK", "DE", "FR", "ZZ"]
                .iter()
                .map(|country| FormatType::Phone {
                    country: Some(country.to_string()),
                }),
        );
        formats.extend(
            ["US", "CA", "UK", "DE", "FR", "JP", "AU", "ZZ"]
                .iter()
                .map(|country| FormatType::PostalCode {
                    country: country.to_string(),
                }),
        );

        for format in formats {
            assert!(
                format.get_pattern().is_ok(),
                "Format {format:?} should have a valid pattern"
            );
        }
    }

    #[test]
    fn test_format_options_convenience_methods() {
        let options = FormatOptions::case_insensitive();
        assert!(!options.case_sensitive);
        assert!(!options.trim_before_check);
        assert!(options.null_is_valid);

        let options = FormatOptions::strict();
        assert!(options.case_sensitive);
        assert!(!options.trim_before_check);
        assert!(!options.null_is_valid);

        let options = FormatOptions::lenient();
        assert!(!options.case_sensitive);
        assert!(options.trim_before_check);
        assert!(options.null_is_valid);

        let options = FormatOptions::with_trimming();
        assert!(options.case_sensitive);
        assert!(options.trim_before_check);
        assert!(options.null_is_valid);
    }

    #[tokio::test]
    async fn test_ssn_format_valid() {
        // Hyphenated and bare nine-digit forms are both accepted.
        let values = vec![
            Some("123-45-6789"),
            Some("123456789"),
            Some("456-78-9012"),
            Some("789012345"),
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::social_security_number("text_col", 0.95).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(1.0));
        assert_eq!(constraint.name(), "social_security_number");
    }

    #[tokio::test]
    async fn test_ssn_format_invalid_patterns() {
        let values = vec![
            Some("000-12-3456"), // area 000
            Some("666-12-3456"), // area 666
            Some("900-12-3456"), // area 9xx
            Some("123-00-4567"), // group 00
            Some("123-45-0000"), // serial 0000
        ];
        let ctx = create_test_context(values).await;

        // Threshold 0.0 lets the constraint pass so the ratio itself can be checked.
        let constraint = FormatConstraint::social_security_number("text_col", 0.0).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.0));
    }

    #[tokio::test]
    async fn test_ssn_format_mixed() {
        let values = vec![
            Some("123-45-6789"), // match
            Some("not-an-ssn"),  // no match
            Some("666-12-3456"), // no match: area 666
            Some("456789012"),   // match
            Some("123 45 6789"), // no match: space separators
            Some("789-01-2345"), // match
            None,                // counted as a match (null_is_valid defaults to true)
            Some("234-56-7890"), // match
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::social_security_number("text_col", 0.5).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        // 4 matching values + 1 NULL out of 8 rows.
        assert_eq!(result.metric, Some(0.625));
    }

    #[tokio::test]
    async fn test_ssn_format_threshold() {
        let values = vec![
            Some("123-45-6789"),
            Some("invalid"),
            Some("234-56-7890"),
            Some("not-ssn"),
        ];
        let ctx = create_test_context(values).await;

        // Ratio 0.5 is below a 0.8 threshold.
        let constraint = FormatConstraint::social_security_number("text_col", 0.8).unwrap();
        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Failure);
        assert_eq!(result.metric, Some(0.5));

        // The same ratio clears a 0.4 threshold.
        let constraint = FormatConstraint::social_security_number("text_col", 0.4).unwrap();
        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.5));
    }

    #[tokio::test]
    async fn test_ssn_edge_cases() {
        let values = vec![
            Some("078-05-1120"),  // match
            Some("219-09-9999"),  // match
            Some("457-55-5462"),  // match
            Some("999-99-9999"),  // no match: area 9xx
            Some("123-45-67890"), // no match: too many digits
            Some("12-345-6789"),  // no match: wrong grouping
            Some("ABC-DE-FGHI"),  // no match: letters
            Some(""),             // no match: empty string
        ];
        let ctx = create_test_context(values).await;

        let constraint = FormatConstraint::social_security_number("text_col", 0.3).unwrap();

        let result = evaluate_constraint_with_context(&constraint, &ctx, "data")
            .await
            .unwrap();
        // 3 of 8 rows match.
        assert_eq!(result.metric, Some(0.375));
        assert_eq!(result.status, ConstraintStatus::Success);
    }
}