1use crate::core::{Constraint, ConstraintMetadata, ConstraintResult};
172use crate::prelude::*;
173use crate::security::SqlSecurity;
174use arrow::array::Array;
175use async_trait::async_trait;
176use datafusion::prelude::*;
177use once_cell::sync::Lazy;
178use serde::{Deserialize, Serialize};
179use std::collections::HashMap;
180use std::sync::RwLock;
181use tracing::instrument;
182
183static PATTERN_CACHE: Lazy<RwLock<HashMap<String, String>>> =
185 Lazy::new(|| RwLock::new(HashMap::new()));
186
187#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
189pub enum FormatType {
190 Regex(String),
192 Email,
194 Url { allow_localhost: bool },
196 CreditCard { detect_only: bool },
198 Phone { country: Option<String> },
200 PostalCode { country: String },
202 UUID,
204 IPv4,
206 IPv6,
208 Json,
210 Iso8601DateTime,
212}
213
214impl FormatType {
215 fn get_pattern(&self) -> Result<String> {
217 let cache_key = format!("{self:?}");
218
219 {
221 let cache = PATTERN_CACHE.read().map_err(|_| {
222 TermError::Internal("Failed to acquire read lock on pattern cache".to_string())
223 })?;
224 if let Some(pattern) = cache.get(&cache_key) {
225 return Ok(pattern.clone());
226 }
227 }
228
229 let pattern = match self {
230 FormatType::Regex(pattern) => {
231 SqlSecurity::validate_regex_pattern(pattern)?;
232 pattern.clone()
233 }
234 FormatType::Email => {
235 r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$".to_string()
237 }
238 FormatType::Url { allow_localhost } => {
239 if *allow_localhost {
240 r"^https?://(?:localhost|(?:[a-zA-Z0-9.-]+\.?[a-zA-Z]{2,}|(?:\d{1,3}\.){3}\d{1,3}))(?::\d+)?(?:/[^\s]*)?$".to_string()
241 } else {
242 r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?$".to_string()
243 }
244 }
245 FormatType::CreditCard { .. } => {
246 r"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})$|^(?:\d{4}[-\s]?){3}\d{4}$".to_string()
248 }
249 FormatType::Phone { country } => {
250 match country.as_deref() {
251 Some("US") | Some("CA") => r"^(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$".to_string(),
252 Some("UK") => r"^(\+44\s?)?(?:\(?0\d{4}\)?\s?\d{6}|\(?0\d{3}\)?\s?\d{7}|\(?0\d{2}\)?\s?\d{8})$".to_string(),
253 Some("DE") => r"^(\+49\s?)?(?:\(?0\d{2,5}\)?\s?\d{4,12})$".to_string(),
254 Some("FR") => r"^(\+33\s?)?(?:\(?0\d{1}\)?\s?\d{8})$".to_string(),
255 _ => r"^[\+]?[1-9][\d]{0,15}$".to_string(), }
257 }
258 FormatType::PostalCode { country } => {
259 match country.as_str() {
260 "US" => r"^\d{5}(-\d{4})?$".to_string(),
261 "CA" => r"^[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$".to_string(),
262 "UK" => r"^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$".to_string(),
263 "DE" => r"^\d{5}$".to_string(),
264 "FR" => r"^\d{5}$".to_string(),
265 "JP" => r"^\d{3}-\d{4}$".to_string(),
266 "AU" => r"^\d{4}$".to_string(),
267 _ => r"^[A-Za-z0-9\s-]{3,10}$".to_string(), }
269 }
270 FormatType::UUID => {
271 r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$".to_string()
272 }
273 FormatType::IPv4 => {
274 r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$".to_string()
275 }
276 FormatType::IPv6 => {
277 r"^([0-9a-fA-F]{0,4}:){1,7}([0-9a-fA-F]{0,4})?$|^::$|^::1$|^([0-9a-fA-F]{1,4}:)*::([0-9a-fA-F]{1,4}:)*[0-9a-fA-F]{1,4}$".to_string()
279 }
280 FormatType::Json => {
281 r"^\s*[\{\[].*[\}\]]\s*$".to_string()
283 }
284 FormatType::Iso8601DateTime => {
285 r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$".to_string()
287 }
288 };
289
290 {
292 let mut cache = PATTERN_CACHE.write().map_err(|_| {
293 TermError::Internal("Failed to acquire write lock on pattern cache".to_string())
294 })?;
295 cache.insert(cache_key, pattern.clone());
296 }
297
298 Ok(pattern)
299 }
300
301 pub fn name(&self) -> &str {
303 match self {
304 FormatType::Regex(_) => "regex",
305 FormatType::Email => "email",
306 FormatType::Url { .. } => "url",
307 FormatType::CreditCard { .. } => "credit_card",
308 FormatType::Phone { .. } => "phone",
309 FormatType::PostalCode { .. } => "postal_code",
310 FormatType::UUID => "uuid",
311 FormatType::IPv4 => "ipv4",
312 FormatType::IPv6 => "ipv6",
313 FormatType::Json => "json",
314 FormatType::Iso8601DateTime => "iso8601_datetime",
315 }
316 }
317
318 pub fn description(&self) -> String {
320 match self {
321 FormatType::Regex(pattern) => format!("matches pattern '{pattern}'"),
322 FormatType::Email => "are valid email addresses".to_string(),
323 FormatType::Url { allow_localhost } => {
324 if *allow_localhost {
325 "are valid URLs (including localhost)".to_string()
326 } else {
327 "are valid URLs".to_string()
328 }
329 }
330 FormatType::CreditCard { detect_only } => {
331 if *detect_only {
332 "contain credit card number patterns".to_string()
333 } else {
334 "are valid credit card numbers".to_string()
335 }
336 }
337 FormatType::Phone { country } => match country.as_deref() {
338 Some(c) => format!("are valid {c} phone numbers"),
339 None => "are valid phone numbers".to_string(),
340 },
341 FormatType::PostalCode { country } => {
342 format!("are valid {country} postal codes")
343 }
344 FormatType::UUID => "are valid UUIDs".to_string(),
345 FormatType::IPv4 => "are valid IPv4 addresses".to_string(),
346 FormatType::IPv6 => "are valid IPv6 addresses".to_string(),
347 FormatType::Json => "are valid JSON documents".to_string(),
348 FormatType::Iso8601DateTime => "are valid ISO 8601 date-time strings".to_string(),
349 }
350 }
351}
352
353#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
355pub struct FormatOptions {
356 pub case_sensitive: bool,
358 pub trim_before_check: bool,
360 pub null_is_valid: bool,
362}
363
364impl Default for FormatOptions {
365 fn default() -> Self {
366 Self {
367 case_sensitive: true,
368 trim_before_check: false,
369 null_is_valid: true, }
371 }
372}
373
374impl FormatOptions {
375 pub fn new() -> Self {
377 Self::default()
378 }
379
380 pub fn case_sensitive(mut self, case_sensitive: bool) -> Self {
382 self.case_sensitive = case_sensitive;
383 self
384 }
385
386 pub fn trim_before_check(mut self, trim: bool) -> Self {
388 self.trim_before_check = trim;
389 self
390 }
391
392 pub fn null_is_valid(mut self, null_valid: bool) -> Self {
394 self.null_is_valid = null_valid;
395 self
396 }
397
398 pub fn case_insensitive() -> Self {
411 Self::new().case_sensitive(false)
412 }
413
414 pub fn strict() -> Self {
429 Self::new().null_is_valid(false)
430 }
431
432 pub fn lenient() -> Self {
447 Self::new()
448 .case_sensitive(false)
449 .trim_before_check(true)
450 .null_is_valid(true)
451 }
452
453 pub fn with_trimming() -> Self {
466 Self::new().trim_before_check(true)
467 }
468}
469
470#[derive(Debug, Clone)]
507pub struct FormatConstraint {
508 column: String,
510 format: FormatType,
512 threshold: f64,
514 options: FormatOptions,
516}
517
518impl FormatConstraint {
519 pub fn new(
532 column: impl Into<String>,
533 format: FormatType,
534 threshold: f64,
535 options: FormatOptions,
536 ) -> Result<Self> {
537 let column_str = column.into();
538
539 SqlSecurity::validate_identifier(&column_str)?;
541
542 if !(0.0..=1.0).contains(&threshold) {
543 return Err(TermError::SecurityError(
544 "Threshold must be between 0.0 and 1.0".to_string(),
545 ));
546 }
547
548 format.get_pattern()?;
550
551 Ok(Self {
552 column: column_str,
553 format,
554 threshold,
555 options,
556 })
557 }
558
559 pub fn email(column: impl Into<String>, threshold: f64) -> Result<Self> {
561 Self::new(
562 column,
563 FormatType::Email,
564 threshold,
565 FormatOptions::default(),
566 )
567 }
568
569 pub fn url(column: impl Into<String>, threshold: f64, allow_localhost: bool) -> Result<Self> {
571 Self::new(
572 column,
573 FormatType::Url { allow_localhost },
574 threshold,
575 FormatOptions::default(),
576 )
577 }
578
579 pub fn credit_card(
581 column: impl Into<String>,
582 threshold: f64,
583 detect_only: bool,
584 ) -> Result<Self> {
585 Self::new(
586 column,
587 FormatType::CreditCard { detect_only },
588 threshold,
589 FormatOptions::default(),
590 )
591 }
592
593 pub fn phone(
595 column: impl Into<String>,
596 threshold: f64,
597 country: Option<String>,
598 ) -> Result<Self> {
599 Self::new(
600 column,
601 FormatType::Phone { country },
602 threshold,
603 FormatOptions::new().trim_before_check(true),
604 )
605 }
606
607 pub fn postal_code(
609 column: impl Into<String>,
610 threshold: f64,
611 country: impl Into<String>,
612 ) -> Result<Self> {
613 Self::new(
614 column,
615 FormatType::PostalCode {
616 country: country.into(),
617 },
618 threshold,
619 FormatOptions::new().trim_before_check(true),
620 )
621 }
622
623 pub fn uuid(column: impl Into<String>, threshold: f64) -> Result<Self> {
625 Self::new(
626 column,
627 FormatType::UUID,
628 threshold,
629 FormatOptions::default(),
630 )
631 }
632
633 pub fn ipv4(column: impl Into<String>, threshold: f64) -> Result<Self> {
635 Self::new(
636 column,
637 FormatType::IPv4,
638 threshold,
639 FormatOptions::default(),
640 )
641 }
642
643 pub fn ipv6(column: impl Into<String>, threshold: f64) -> Result<Self> {
645 Self::new(
646 column,
647 FormatType::IPv6,
648 threshold,
649 FormatOptions::default(),
650 )
651 }
652
653 pub fn json(column: impl Into<String>, threshold: f64) -> Result<Self> {
655 Self::new(
656 column,
657 FormatType::Json,
658 threshold,
659 FormatOptions::default(),
660 )
661 }
662
663 pub fn iso8601_datetime(column: impl Into<String>, threshold: f64) -> Result<Self> {
665 Self::new(
666 column,
667 FormatType::Iso8601DateTime,
668 threshold,
669 FormatOptions::default(),
670 )
671 }
672
673 pub fn regex(
675 column: impl Into<String>,
676 pattern: impl Into<String>,
677 threshold: f64,
678 ) -> Result<Self> {
679 Self::new(
680 column,
681 FormatType::Regex(pattern.into()),
682 threshold,
683 FormatOptions::default(),
684 )
685 }
686}
687
688#[async_trait]
689impl Constraint for FormatConstraint {
690 #[instrument(skip(self, ctx), fields(
691 column = %self.column,
692 format = %self.format.name(),
693 threshold = %self.threshold
694 ))]
695 async fn evaluate(&self, ctx: &SessionContext) -> Result<ConstraintResult> {
696 let column_identifier = SqlSecurity::escape_identifier(&self.column)?;
697 let pattern = self.format.get_pattern()?;
698 let escaped_pattern = SqlSecurity::validate_regex_pattern(&pattern)?;
699
700 let column_expr = if self.options.trim_before_check {
702 format!("TRIM({column_identifier})")
703 } else {
704 column_identifier.clone()
705 };
706
707 let pattern_operator = if self.options.case_sensitive {
708 "~"
709 } else {
710 "~*"
711 };
712
713 let sql = if self.options.null_is_valid {
714 format!(
715 "SELECT
716 COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' OR {column_identifier} IS NULL THEN 1 END) as matches,
717 COUNT(*) as total
718 FROM data"
719 )
720 } else {
721 format!(
722 "SELECT
723 COUNT(CASE WHEN {column_expr} {pattern_operator} '{escaped_pattern}' THEN 1 END) as matches,
724 COUNT(*) as total
725 FROM data"
726 )
727 };
728
729 let df = ctx.sql(&sql).await?;
730 let batches = df.collect().await?;
731
732 if batches.is_empty() {
733 return Ok(ConstraintResult::skipped("No data to validate"));
734 }
735
736 let batch = &batches[0];
737 if batch.num_rows() == 0 {
738 return Ok(ConstraintResult::skipped("No data to validate"));
739 }
740
741 let matches = batch
742 .column(0)
743 .as_any()
744 .downcast_ref::<arrow::array::Int64Array>()
745 .ok_or_else(|| TermError::Internal("Failed to extract match count".to_string()))?
746 .value(0) as f64;
747
748 let total = batch
749 .column(1)
750 .as_any()
751 .downcast_ref::<arrow::array::Int64Array>()
752 .ok_or_else(|| TermError::Internal("Failed to extract total count".to_string()))?
753 .value(0) as f64;
754
755 if total == 0.0 {
756 return Ok(ConstraintResult::skipped("No data to validate"));
757 }
758
759 let match_ratio = matches / total;
760
761 let is_success = match &self.format {
763 FormatType::CreditCard { detect_only: true } => {
764 match_ratio <= self.threshold
766 }
767 _ => {
768 match_ratio >= self.threshold
770 }
771 };
772
773 if is_success {
774 Ok(ConstraintResult::success_with_metric(match_ratio))
775 } else {
776 let message = match &self.format {
777 FormatType::CreditCard { detect_only: true } => {
778 format!(
779 "Credit card detection ratio {match_ratio:.3} exceeds threshold {:.3}",
780 self.threshold
781 )
782 }
783 _ => {
784 let desc = self.format.description();
785 format!(
786 "Format validation ratio {match_ratio:.3} is below threshold {:.3} - values that {desc}",
787 self.threshold
788 )
789 }
790 };
791
792 Ok(ConstraintResult::failure_with_metric(match_ratio, message))
793 }
794 }
795
796 fn name(&self) -> &str {
797 self.format.name()
798 }
799
800 fn column(&self) -> Option<&str> {
801 Some(&self.column)
802 }
803
804 fn metadata(&self) -> ConstraintMetadata {
805 let description = match &self.format {
806 FormatType::CreditCard { detect_only: true } => {
807 let threshold_pct = self.threshold * 100.0;
808 let desc = self.format.description();
809 format!(
810 "Checks that no more than {threshold_pct:.1}% of values in '{}' {desc}",
811 self.column
812 )
813 }
814 _ => {
815 let threshold_pct = self.threshold * 100.0;
816 let desc = self.format.description();
817 format!(
818 "Checks that at least {threshold_pct:.1}% of values in '{}' {desc}",
819 self.column
820 )
821 }
822 };
823
824 ConstraintMetadata::for_column(&self.column)
825 .with_description(description)
826 .with_custom("format_type", self.format.name())
827 .with_custom("threshold", self.threshold.to_string())
828 .with_custom("case_sensitive", self.options.case_sensitive.to_string())
829 .with_custom(
830 "trim_before_check",
831 self.options.trim_before_check.to_string(),
832 )
833 .with_custom("null_is_valid", self.options.null_is_valid.to_string())
834 .with_custom("constraint_type", "format")
835 }
836}
837
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::ConstraintStatus;
    use arrow::array::StringArray;
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion::datasource::MemTable;
    use std::sync::Arc;

    /// Builds a session with a table `data` holding one nullable Utf8 column
    /// `text_col` filled with `values`.
    async fn create_test_context(values: Vec<Option<&str>>) -> SessionContext {
        let fields = vec![Field::new("text_col", DataType::Utf8, true)];
        let schema = Arc::new(Schema::new(fields));

        let array = StringArray::from(values);
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
        let provider = MemTable::try_new(schema, vec![vec![batch]]).unwrap();

        let ctx = SessionContext::new();
        ctx.register_table("data", Arc::new(provider)).unwrap();
        ctx
    }

    /// Wraps every string in `Some`, for columns without NULLs.
    fn present<'a>(items: &[&'a str]) -> Vec<Option<&'a str>> {
        items.iter().copied().map(Some).collect()
    }

    /// Evaluates `constraint` against a fresh context containing `values`.
    async fn evaluate_over(
        constraint: &FormatConstraint,
        values: Vec<Option<&str>>,
    ) -> ConstraintResult {
        let ctx = create_test_context(values).await;
        constraint.evaluate(&ctx).await.unwrap()
    }

    #[tokio::test]
    async fn test_email_format_constraint() {
        let constraint = FormatConstraint::email("text_col", 0.7).unwrap();
        let rows = present(&[
            "test@example.com",
            "user@domain.org",
            "invalid-email",
            "another@test.net",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "email");
    }

    #[tokio::test]
    async fn test_url_format_constraint() {
        let constraint = FormatConstraint::url("text_col", 0.7, false).unwrap();
        let rows = present(&[
            "https://example.com",
            "http://test.org",
            "not-a-url",
            "https://another.site.net/path",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "url");
    }

    #[tokio::test]
    async fn test_url_with_localhost() {
        let constraint = FormatConstraint::url("text_col", 0.7, true).unwrap();
        let rows = present(&[
            "https://localhost:3000",
            "http://localhost",
            "https://example.com",
            "not-a-url",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_credit_card_detection() {
        // Detection mode: 3 of 4 rows look like card numbers, which is within 0.8.
        let constraint = FormatConstraint::credit_card("text_col", 0.8, true).unwrap();
        let rows = present(&[
            "4111-1111-1111-1111",
            "5555 5555 5555 4444",
            "normal text",
            "4111111111111111",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "credit_card");
    }

    #[tokio::test]
    async fn test_phone_number_us() {
        let constraint = FormatConstraint::phone("text_col", 0.7, Some("US".to_string())).unwrap();
        let rows = present(&[
            "(555) 123-4567",
            "555-123-4567",
            "5551234567",
            "invalid-phone",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "phone");
    }

    #[tokio::test]
    async fn test_postal_code_us() {
        let constraint = FormatConstraint::postal_code("text_col", 0.7, "US").unwrap();
        let rows = present(&["12345", "12345-6789", "invalid", "98765"]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "postal_code");
    }

    #[tokio::test]
    async fn test_uuid_format() {
        let constraint = FormatConstraint::uuid("text_col", 0.7).unwrap();
        let rows = present(&[
            "550e8400-e29b-41d4-a716-446655440000",
            "6ba7b810-9dad-11d1-80b4-00c04fd430c8",
            "invalid-uuid",
            "6ba7b811-9dad-11d1-80b4-00c04fd430c8",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "uuid");
    }

    #[tokio::test]
    async fn test_ipv4_format() {
        let constraint = FormatConstraint::ipv4("text_col", 0.7).unwrap();
        // "256.256.256.256" is out of range and must not match.
        let rows = present(&["192.168.1.1", "10.0.0.1", "256.256.256.256", "172.16.0.1"]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv4");
    }

    #[tokio::test]
    async fn test_ipv6_format() {
        let constraint = FormatConstraint::ipv6("text_col", 0.7).unwrap();
        let rows = present(&[
            "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
            "2001:db8:85a3::8a2e:370:7334",
            "invalid-ipv6",
            "::1",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv6");
    }

    #[tokio::test]
    async fn test_json_format() {
        let constraint = FormatConstraint::json("text_col", 0.7).unwrap();
        let rows = present(&[
            r#"{"key": "value"}"#,
            r#"[1, 2, 3]"#,
            "not json",
            r#"{"nested": {"key": "value"}}"#,
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "json");
    }

    #[tokio::test]
    async fn test_iso8601_datetime_format() {
        let constraint = FormatConstraint::iso8601_datetime("text_col", 0.7).unwrap();
        let rows = present(&[
            "2023-12-25T10:30:00Z",
            "2023-12-25T10:30:00.123Z",
            "invalid-datetime",
            "2023-12-25T10:30:00+05:30",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "iso8601_datetime");
    }

    #[tokio::test]
    async fn test_custom_regex_format() {
        let constraint = FormatConstraint::regex("text_col", r"^[A-Z]{3}\d{3}$", 0.7).unwrap();
        let rows = present(&["ABC123", "DEF456", "invalid", "GHI789"]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "regex");
    }

    #[tokio::test]
    async fn test_format_options_case_insensitive() {
        let options = FormatOptions::new().case_sensitive(false);
        let format = FormatType::Regex(r"^[A-Z]{3}\d{3}$".to_string());
        let constraint = FormatConstraint::new("text_col", format, 0.7, options).unwrap();
        let rows = present(&["abc123", "DEF456", "invalid", "ghi789"]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_trim_whitespace() {
        let options = FormatOptions::new().trim_before_check(true);
        let constraint =
            FormatConstraint::new("text_col", FormatType::Email, 0.7, options).unwrap();
        let rows = present(&[
            " test@example.com ",
            "user@domain.org",
            " invalid-email ",
            " another@test.net ",
        ]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_null_handling() {
        let ctx = create_test_context(vec![
            Some("test@example.com"),
            None,
            Some("invalid-email"),
            None,
        ])
        .await;

        // NULLs accepted: one valid address plus two NULLs out of four rows.
        let accepting = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.6,
            FormatOptions::new().null_is_valid(true),
        )
        .unwrap();
        let accepted = accepting.evaluate(&ctx).await.unwrap();
        assert_eq!(accepted.status, ConstraintStatus::Success);
        assert_eq!(accepted.metric, Some(0.75));

        // NULLs rejected: only the single valid address counts.
        let rejecting = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.2,
            FormatOptions::new().null_is_valid(false),
        )
        .unwrap();
        let rejected = rejecting.evaluate(&ctx).await.unwrap();
        assert_eq!(rejected.status, ConstraintStatus::Success);
        assert_eq!(rejected.metric, Some(0.25));
    }

    #[tokio::test]
    async fn test_constraint_failure() {
        let constraint = FormatConstraint::email("text_col", 0.5).unwrap();
        let rows = present(&["invalid", "also_invalid", "nope", "still_invalid"]);

        let result = evaluate_over(&constraint, rows).await;
        assert_eq!(result.status, ConstraintStatus::Failure);
        assert_eq!(result.metric, Some(0.0));
        assert!(result.message.is_some());
    }

    #[tokio::test]
    async fn test_empty_data() {
        let constraint = FormatConstraint::email("text_col", 0.9).unwrap();

        let result = evaluate_over(&constraint, vec![]).await;
        assert_eq!(result.status, ConstraintStatus::Skipped);
    }

    #[test]
    fn test_invalid_threshold() {
        let outcome = FormatConstraint::email("col", 1.5);
        assert!(outcome.is_err());

        let rendered = outcome.unwrap_err().to_string();
        assert!(rendered.contains("Threshold must be between 0.0 and 1.0"));
    }

    #[test]
    fn test_pattern_caching() {
        let first = FormatType::Email.get_pattern().unwrap();
        let second = FormatType::Email.get_pattern().unwrap();
        assert_eq!(first, second);

        // Repeated lookups must keep succeeding once the entry is cached.
        let format = FormatType::Email;
        (0..100).for_each(|_| {
            let _ = format.get_pattern().unwrap();
        });
    }

    #[test]
    fn test_format_type_descriptions() {
        assert_eq!(FormatType::Email.description(), "are valid email addresses");

        let url = FormatType::Url {
            allow_localhost: true,
        };
        assert_eq!(url.description(), "are valid URLs (including localhost)");

        let phone = FormatType::Phone {
            country: Some("US".to_string()),
        };
        assert_eq!(phone.description(), "are valid US phone numbers");

        let postal = FormatType::PostalCode {
            country: "CA".to_string(),
        };
        assert_eq!(postal.description(), "are valid CA postal codes");
    }

    #[test]
    fn test_all_format_types_have_patterns() {
        let formats = [
            FormatType::Email,
            FormatType::Url {
                allow_localhost: false,
            },
            FormatType::Url {
                allow_localhost: true,
            },
            FormatType::CreditCard { detect_only: false },
            FormatType::Phone { country: None },
            FormatType::Phone {
                country: Some("US".to_string()),
            },
            FormatType::PostalCode {
                country: "US".to_string(),
            },
            FormatType::UUID,
            FormatType::IPv4,
            FormatType::IPv6,
            FormatType::Json,
            FormatType::Iso8601DateTime,
            FormatType::Regex(r"^\d+$".to_string()),
        ];

        for format in &formats {
            assert!(
                format.get_pattern().is_ok(),
                "Format {format:?} should have a valid pattern"
            );
        }
    }

    #[test]
    fn test_format_options_convenience_methods() {
        let insensitive = FormatOptions::case_insensitive();
        assert!(!insensitive.case_sensitive);
        assert!(!insensitive.trim_before_check);
        assert!(insensitive.null_is_valid);

        let strict = FormatOptions::strict();
        assert!(strict.case_sensitive);
        assert!(!strict.trim_before_check);
        assert!(!strict.null_is_valid);

        let lenient = FormatOptions::lenient();
        assert!(!lenient.case_sensitive);
        assert!(lenient.trim_before_check);
        assert!(lenient.null_is_valid);

        let trimming = FormatOptions::with_trimming();
        assert!(trimming.case_sensitive);
        assert!(trimming.trim_before_check);
        assert!(trimming.null_is_valid);
    }
}