term_guard/lib.rs
1//! # Term - Data Validation for Rust
2//!
3//! Term is a powerful data validation library inspired by AWS Deequ, providing
4//! comprehensive data quality checks without requiring Apache Spark. It leverages
5//! DataFusion for efficient query execution and includes built-in observability
6//! through OpenTelemetry.
7//!
8//! ## Overview
9//!
10//! Term enables you to define and run data quality validations on your datasets,
11//! helping you ensure data correctness, completeness, and consistency. Whether
12//! you're validating data in ETL pipelines, ensuring data quality in analytics
13//! workflows, or monitoring data drift in production, Term provides the tools
14//! you need.
15//!
16//! ## Quick Start
17//!
18//! ```rust
19//! use term_guard::prelude::*;
20//! use term_guard::core::{ValidationSuite, Check, Level, ConstraintStatus, builder_extensions::CompletenessOptions};
21//! use term_guard::constraints::Assertion;
22//! use datafusion::prelude::*;
23//!
24//! # async fn example() -> std::result::Result<(), Box<dyn std::error::Error>> {
25//! // Create a validation suite
26//! let suite = ValidationSuite::builder("user_data_validation")
27//!     .check(
28//!         Check::builder("critical_checks")
29//!             .level(Level::Error)
30//!             .completeness("user_id", CompletenessOptions::full().into_constraint_options()) // No nulls allowed
31//!             .validates_uniqueness(vec!["user_id"], 1.0) // Must be unique
32//!             .completeness("email", CompletenessOptions::threshold(0.95).into_constraint_options()) // 95% non-null
33//!             .build()
34//!     )
35//!     .check(
36//!         Check::builder("data_quality")
37//!             .level(Level::Warning)
38//!             .validates_regex("email", r"^[^@]+@[^@]+$", 0.98)
39//!             .statistic("age", term_guard::constraints::StatisticType::Min, Assertion::GreaterThanOrEqual(0.0))
40//!             .statistic("age", term_guard::constraints::StatisticType::Max, Assertion::LessThanOrEqual(120.0))
41//!             .build()
42//!     )
43//!     .build();
44//!
45//! // Create a DataFusion context with your data
46//! let ctx = SessionContext::new();
47//! // ... register your data tables ...
48//!
49//! // Run validation
50//! let results = suite.run(&ctx).await?;
51//!
52//! // Check results
53//! match &results {
54//!     term_guard::core::ValidationResult::Success { report, .. } => {
55//!         println!("Validation succeeded!");
56//!         println!("Total checks: {}", report.metrics.total_checks);
57//!     }
58//!     term_guard::core::ValidationResult::Failure { report } => {
59//!         println!("Validation failed!");
60//!         for issue in &report.issues {
61//!             println!("{}: {}", issue.check_name, issue.message);
62//!         }
63//!     }
64//! }
65//! # Ok(())
66//! # }
67//! ```
68//!
69//! ## Key Features
70//!
71//! ### Comprehensive Validation Constraints
72//!
73//! - **Completeness**: Check for null values and missing data
74//! - **Uniqueness**: Ensure values are unique (single or multi-column)
75//! - **Patterns**: Validate data against regex patterns
76//! - **Statistics**: Min, max, mean, sum, and standard deviation checks
77//! - **Data Types**: Ensure consistent data types
78//! - **Custom SQL**: Define complex validation logic with SQL expressions
79//!
80//! ### Performance Optimization
81//!
82//! Term includes a query optimizer that dramatically improves performance:
83//!
84//! ```rust,no_run
85//! use term_guard::core::ValidationSuite;
86//!
87//! let suite = ValidationSuite::builder("optimized_validation")
88//!     .with_optimizer(true) // Enable query optimization
89//!     // .check(/* your checks */)
90//!     .build();
91//! ```
92//!
93//! The optimizer combines multiple constraints into single queries when possible,
94//! reducing table scans and improving performance by up to 15x for suites with
95//! many constraints.
96//!
97//! ### Multiple Data Sources
98//!
99//! Term supports various data sources through the `sources` module:
100//!
101//! - CSV files
102//! - Parquet files
103//! - JSON files
104//! - PostgreSQL databases
105//! - Cloud storage (S3, Azure Blob, Google Cloud Storage)
106//!
107//! ### Observability
108//!
109//! Built-in OpenTelemetry integration provides:
110//!
111//! - Distributed tracing for validation runs
112//! - Metrics for constraint evaluation performance
113//! - Structured logging with the `tracing` crate
114//!
115//! ```rust,ignore
116//! use term_guard::telemetry::TermTelemetry;
117//! use opentelemetry::trace::Tracer;
118//!
119//! // User configures their own tracer
120//! let tracer = opentelemetry_jaeger::new_agent_pipeline()
121//!     .with_service_name("data-validation")
122//!     .install_simple()?;
123//!
124//! let telemetry = TermTelemetry::new(tracer);
125//! ```
126//!
127//! ## Architecture
128//!
129//! Term is built on a modular architecture:
130//!
131//! - **`analyzers`**: Advanced data analysis framework including:
132//!   - Type Inference Engine: Automatic data type detection with confidence scores
133//!   - Column Profiler: Three-pass algorithm for comprehensive column analysis
134//!   - Basic & Advanced Analyzers: Metrics computation (mean, entropy, correlation, etc.)
135//! - **`core`**: Core types like `Check`, `ValidationSuite`, and `ConstraintResult`
136//! - **`constraints`**: All validation constraint implementations
137//! - **`sources`**: Data source connectors and loaders
138//! - **`optimizer`**: Query optimization engine
139//! - **`telemetry`**: OpenTelemetry integration
140//! - **`formatters`**: Result formatting utilities
141//!
142//! ## Examples
143//!
144//! See the `examples` directory for complete examples:
145//!
146//! - `basic_validation.rs`: Simple validation example
147//! - `tpc_h_validation.rs`: TPC-H benchmark data validation
148//! - `cloud_storage_example.rs`: Validating data in cloud storage
149//! - `deequ_migration.rs`: Migrating from Deequ to Term
150//!
151//! ## Migration from Deequ
152//!
153//! Term provides APIs similar to Deequ's, making migration straightforward:
154//!
155//! ```rust
156//! use term_guard::core::{Check, builder_extensions::CompletenessOptions};
157//! use term_guard::constraints::Assertion;
158//!
159//! // Deequ-style checks in Term
160//! let check = Check::builder("data_quality")
161//!     .has_size(Assertion::GreaterThan(1000.0))
162//!     .completeness("id", CompletenessOptions::full().into_constraint_options())
163//!     .completeness("name", CompletenessOptions::threshold(0.98).into_constraint_options())
164//!     .validates_uniqueness(vec!["id"], 1.0)
165//!     .build();
166//! ```
167
168pub mod analyzers; // data analysis framework: type inference, column profiling, basic & advanced analyzers
169pub mod constraints; // validation constraint implementations
170pub mod core; // core types such as `Check`, `ValidationSuite`, and `ConstraintResult`
171pub mod error;
172pub mod formatters; // result formatting utilities
173pub mod logging;
174pub mod optimizer; // query optimization engine
175pub mod prelude; // glob-imported by the Quick Start example (`use term_guard::prelude::*`)
176pub mod repository;
177pub mod security;
178pub mod sources; // data source connectors and loaders
179pub mod telemetry; // OpenTelemetry integration
180
181#[cfg(test)] // compiled only when building tests
182pub mod test_helpers;
183
184#[cfg(any(test, feature = "test-utils"))] // compiled for tests, or when the `test-utils` feature is enabled
185pub mod test_utils;
186
187#[cfg(test)] // compiled only when building tests
188pub mod test_utils_cached;
189
190#[cfg(any(test, feature = "test-utils"))] // compiled for tests, or when the `test-utils` feature is enabled
191pub mod test_fixtures;