Skip to main content

datasynth_fingerprint/
lib.rs

1// Crate-wide lint policy: allow some clippy lints that are common in numerical/matrix code, and deny `unwrap()` in non-test builds
2#![allow(clippy::needless_range_loop)]
3#![allow(clippy::explicit_counter_loop)]
4#![cfg_attr(not(test), deny(clippy::unwrap_used))]
5
6//! DataSynth Fingerprint - Privacy-preserving synthetic data fingerprinting.
7//!
8//! This crate provides functionality for:
9//! - **Extracting** statistical fingerprints from real data
10//! - **Applying privacy** mechanisms (differential privacy, k-anonymity)
11//! - **Storing** fingerprints in `.dsf` files
12//! - **Synthesizing** generator configurations from fingerprints
13//! - **Evaluating** fidelity of generated data
14//!
15//! # Overview
16//!
17//! A fingerprint captures the statistical properties of a dataset without storing
18//! any individual records, enabling privacy-preserving synthetic data generation.
19//!
20//! ```text
21//! Real Data → Extract → .dsf File → Generate → Synthetic Data → Evaluate
22//! ```
23//!
24//! # Quick Start
25//!
26//! ## Basic Extraction and Storage
27//!
28//! ```ignore
29//! use datasynth_fingerprint::{
30//!     extraction::{FingerprintExtractor, ExtractionConfig},
31//!     io::{FingerprintReader, FingerprintWriter},
32//!     models::PrivacyLevel,
33//! };
34//! use std::path::Path;
35//!
36//! // Extract fingerprint from CSV data with standard privacy
37//! let extractor = FingerprintExtractor::new(PrivacyLevel::Standard);
38//! let fingerprint = extractor.extract_from_csv(Path::new("data.csv"))?;
39//!
40//! // Write to .dsf file
41//! let writer = FingerprintWriter::new();
42//! writer.write_to_file(&fingerprint, Path::new("output.dsf"))?;
43//!
44//! // Read back from .dsf file
45//! let reader = FingerprintReader::new();
46//! let loaded = reader.read_from_file(Path::new("output.dsf"))?;
47//!
48//! // Check privacy audit
49//! println!("Epsilon spent: {}", loaded.epsilon_spent());
50//! ```
51//!
52//! ## Signed Fingerprints
53//!
54//! ```ignore
55//! use datasynth_fingerprint::io::{SigningKey, DsfSigner, DsfVerifier};
56//!
57//! // Generate a signing key (`fingerprint`, `writer`, `reader` and `Path` come from the previous example)
58//! let key = SigningKey::generate("my-org-key");
59//!
60//! // Sign when writing
61//! let signer = DsfSigner::new(key.clone());
62//! writer.write_to_file_signed(&fingerprint, Path::new("signed.dsf"), &signer)?;
63//!
64//! // Verify when reading
65//! let verifier = DsfVerifier::new(key);
66//! let verified = reader.read_from_file_verified(Path::new("signed.dsf"), &verifier)?;
67//! ```
68//!
69//! ## Streaming Extraction for Large Files
70//!
71//! ```ignore
72//! use datasynth_fingerprint::extraction::{FingerprintExtractor, ExtractionConfig};
73//!
74//! // Configure for streaming (memory-efficient for large files)
75//! let config = ExtractionConfig {
76//!     streaming: true,
77//!     stream_batch_size: 100_000,
78//!     ..ExtractionConfig::default()
79//! };
80//!
81//! let extractor = FingerprintExtractor::with_config(config);
82//! let fingerprint = extractor.extract_streaming_csv(Path::new("large_data.csv"))?;
83//! ```
84//!
85//! ## Config Synthesis
86//!
87//! ```ignore
88//! use datasynth_fingerprint::synthesis::{ConfigSynthesizer, SynthesisOptions};
89//!
90//! let options = SynthesisOptions {
91//!     scale: 2.0,              // Generate 2x original row count
92//!     seed: Some(42),          // Reproducible generation
93//!     preserve_correlations: true,
94//!     inject_anomalies: true,
95//! };
96//!
97//! let synthesizer = ConfigSynthesizer::with_options(options);
98//! let result = synthesizer.synthesize_full(&fingerprint, 42)?;
99//!
100//! // result.config_patch - configuration values for generators
101//! // result.copula_generators - for preserving correlations
102//! ```
103//!
104//! ## Fidelity Evaluation
105//!
106//! ```ignore
107//! use datasynth_fingerprint::evaluation::FidelityEvaluator;
108//!
109//! let evaluator = FidelityEvaluator::new();
110//! let report = evaluator.evaluate(&original_fingerprint, &synthetic_fingerprint)?;
111//!
112//! println!("Overall fidelity: {:.2}", report.overall_score);
113//! println!("Statistical fidelity: {:.2}", report.statistical_fidelity);
114//! println!("Correlation fidelity: {:.2}", report.correlation_fidelity);
115//! ```
116//!
117//! # DSF File Format
118//!
119//! A `.dsf` (DataSynth Fingerprint) file is a ZIP archive containing:
120//!
121//! | File | Format | Description |
122//! |------|--------|-------------|
123//! | `manifest.json` | JSON | Version, checksums, privacy config, optional signature |
124//! | `schema.yaml` | YAML | Tables, columns, types, relationships |
125//! | `statistics.yaml` | YAML | Distributions, percentiles, Benford analysis |
126//! | `correlations.yaml` | YAML | Correlation matrices, copulas (optional) |
127//! | `integrity.yaml` | YAML | FK relationships, cardinality (optional) |
128//! | `rules.yaml` | YAML | Balance constraints, approval thresholds (optional) |
129//! | `anomalies.yaml` | YAML | Anomaly rates, type distribution (optional) |
130//! | `privacy_audit.json` | JSON | Privacy decisions, epsilon spent |
131//!
132//! # Privacy Levels
133//!
134//! The crate supports four privacy levels with different tradeoffs:
135//!
136//! | Level | Epsilon | K (k-anonymity) | Description |
137//! |-------|---------|---|-------------|
138//! | [`PrivacyLevel::Minimal`] | 5.0 | 3 | Low privacy, high utility |
139//! | [`PrivacyLevel::Standard`] | 1.0 | 5 | Balanced (default) |
140//! | [`PrivacyLevel::High`] | 0.5 | 10 | Higher privacy for sensitive data |
141//! | [`PrivacyLevel::Maximum`] | 0.1 | 20 | Maximum privacy, reduced utility |
142//!
143//! # Privacy Mechanisms
144//!
145//! The fingerprinting process applies multiple privacy mechanisms:
146//!
147//! - **Differential Privacy**: Laplace noise is calibrated to the sensitivity of each statistic
148//!   under a configurable epsilon budget, enforced by tracking cumulative (composed) epsilon spend.
149//!
150//! - **K-Anonymity**: Categorical values appearing fewer than k times are suppressed so that
151//!   rare values cannot be used to re-identify individual records.
152//!
153//! - **Outlier Handling**: Extreme values are winsorized at configurable percentiles to
154//!   prevent leakage of unusual records.
155//!
156//! - **Privacy Audit Trail**: Every privacy decision (noise addition, suppression,
157//!   generalization) is logged in the fingerprint's `privacy_audit` field.
158//!
159//! # Supported Data Sources
160//!
161//! | Source | Method | Notes |
162//! |--------|--------|-------|
163//! | CSV | `extract_from_csv()` | Auto-infers column types |
164//! | Parquet | `extract_from_parquet()` | Preserves type information |
165//! | JSON/JSONL | `extract_from_json()` | Array or newline-delimited |
166//! | Directory | `extract_from_directory()` | Multi-table fingerprints |
167//! | Memory | `DataSource::Memory` | For in-memory data |
168//!
169//! # Module Overview
170//!
171//! ## [`models`] - Data Structures
172//!
173//! Core data structures for fingerprints:
174//! - [`Fingerprint`] - Root structure containing all components
175//! - [`SchemaFingerprint`] - Table schemas, column types, relationships
176//! - [`StatisticsFingerprint`] - Distribution parameters, percentiles
177//! - [`CorrelationFingerprint`] - Correlation matrices, Gaussian copulas
178//! - [`PrivacyAudit`] - Privacy action tracking
179//!
180//! ## [`io`] - File I/O
181//!
182//! Reading and writing `.dsf` files:
183//! - [`FingerprintWriter`] - Write fingerprints to `.dsf` files
184//! - [`FingerprintReader`] - Read fingerprints from `.dsf` files
185//! - [`FingerprintValidator`] - Validate `.dsf` file integrity
186//! - [`SigningKey`], [`DsfSigner`], [`DsfVerifier`] - Digital signatures
187//!
188//! ## [`extraction`] - Data Extraction
189//!
190//! Extract fingerprints from data sources:
191//! - [`FingerprintExtractor`] - Main extraction coordinator
192//! - [`DataSource`] - Input data source types
193//! - [`ExtractionConfig`] - Extraction settings
194//! - Streaming extraction for large files
195//!
196//! ## [`privacy`] - Privacy Mechanisms
197//!
198//! Privacy-preserving transformations:
199//! - Laplace noise for differential privacy
200//! - K-anonymity suppression
201//! - Privacy budget tracking
202//!
203//! ## [`synthesis`] - Config Synthesis
204//!
205//! Convert fingerprints to generator configurations:
206//! - [`ConfigSynthesizer`] - Synthesis coordinator
207//! - [`ConfigPatch`] - Configuration values to apply
208//! - Gaussian copula generation for correlations
209//!
210//! ## [`evaluation`] - Fidelity Evaluation
211//!
212//! Evaluate synthetic data quality:
213//! - [`FidelityEvaluator`] - Comparison engine
214//! - Statistical, correlation, and schema metrics
215//!
216//! # CLI Integration
217//!
218//! The fingerprint crate integrates with the `datasynth-data` CLI:
219//!
220//! ```bash
221//! # Extract fingerprint from data
222//! datasynth-data fingerprint extract \
223//!     --input ./real_data/ \
224//!     --output ./fingerprint.dsf \
225//!     --privacy-level standard
226//!
227//! # Validate fingerprint file
228//! datasynth-data fingerprint validate ./fingerprint.dsf
229//!
230//! # Generate from fingerprint
231//! datasynth-data generate \
232//!     --fingerprint ./fingerprint.dsf \
233//!     --output ./synthetic/ \
234//!     --scale 1.0
235//!
236//! # Evaluate fidelity
237//! datasynth-data fingerprint evaluate \
238//!     --fingerprint ./fingerprint.dsf \
239//!     --synthetic ./synthetic/
240//! ```
241//!
242//! [`Fingerprint`]: models::Fingerprint
243//! [`SchemaFingerprint`]: models::SchemaFingerprint
244//! [`StatisticsFingerprint`]: models::StatisticsFingerprint
245//! [`CorrelationFingerprint`]: models::CorrelationFingerprint
246//! [`PrivacyAudit`]: models::PrivacyAudit
247//! [`FingerprintWriter`]: io::FingerprintWriter
248//! [`FingerprintReader`]: io::FingerprintReader
249//! [`FingerprintValidator`]: io::FingerprintValidator
250//! [`SigningKey`]: io::SigningKey
251//! [`DsfSigner`]: io::DsfSigner
252//! [`DsfVerifier`]: io::DsfVerifier
253//! [`FingerprintExtractor`]: extraction::FingerprintExtractor
254//! [`DataSource`]: extraction::DataSource
255//! [`ExtractionConfig`]: extraction::ExtractionConfig
256//! [`ConfigSynthesizer`]: synthesis::ConfigSynthesizer
257//! [`ConfigPatch`]: synthesis::ConfigPatch
258//! [`FidelityEvaluator`]: evaluation::FidelityEvaluator
259//! [`PrivacyLevel::Minimal`]: models::PrivacyLevel::Minimal
260//! [`PrivacyLevel::Standard`]: models::PrivacyLevel::Standard
261//! [`PrivacyLevel::High`]: models::PrivacyLevel::High
262//! [`PrivacyLevel::Maximum`]: models::PrivacyLevel::Maximum
263
264pub mod aggregation;
265pub mod certificates;
266pub mod error;
267pub mod evaluation;
268pub mod extraction;
269pub mod federated;
270pub mod io;
271pub mod models;
272pub mod privacy;
273pub mod synthesis;
274
275// Re-export commonly used types
276pub use error::{FingerprintError, FingerprintResult};
277pub use io::{FingerprintReader, FingerprintValidator, FingerprintWriter};
278pub use models::{Fingerprint, Manifest, PrivacyLevel, PrivacyMetadata, SchemaFingerprint};