1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// Allow some clippy lints that are common in numerical/matrix code
//! DataSynth Fingerprint - Privacy-preserving synthetic data fingerprinting.
//!
//! This crate provides functionality for:
//! - **Extracting** statistical fingerprints from real data
//! - **Applying privacy** mechanisms (differential privacy, k-anonymity)
//! - **Storing** fingerprints in `.dsf` files
//! - **Synthesizing** generator configurations from fingerprints
//! - **Evaluating** fidelity of generated data
//!
//! # Overview
//!
//! A fingerprint captures the statistical properties of a dataset without storing
//! any individual records, enabling privacy-preserving synthetic data generation.
//!
//! ```text
//! Real Data → Extract → .dsf File → Generate → Synthetic Data → Evaluate
//! ```
//!
//! # Quick Start
//!
//! ## Basic Extraction and Storage
//!
//! ```ignore
//! use datasynth_fingerprint::{
//!     extraction::{FingerprintExtractor, ExtractionConfig},
//!     io::{FingerprintReader, FingerprintWriter},
//!     models::PrivacyLevel,
//! };
//! use std::path::Path;
//!
//! // Extract fingerprint from CSV data with standard privacy
//! let extractor = FingerprintExtractor::new(PrivacyLevel::Standard);
//! let fingerprint = extractor.extract_from_csv(Path::new("data.csv"))?;
//!
//! // Write to .dsf file
//! let writer = FingerprintWriter::new();
//! writer.write_to_file(&fingerprint, Path::new("output.dsf"))?;
//!
//! // Read back from .dsf file
//! let reader = FingerprintReader::new();
//! let loaded = reader.read_from_file(Path::new("output.dsf"))?;
//!
//! // Check privacy audit
//! println!("Epsilon spent: {}", loaded.epsilon_spent());
//! ```
//!
//! ## Signed Fingerprints
//!
//! ```ignore
//! // Reuses `writer`, `reader`, `fingerprint`, and `Path` from the previous example.
//! use datasynth_fingerprint::io::{SigningKey, DsfSigner, DsfVerifier};
//!
//! // Generate a signing key
//! let key = SigningKey::generate("my-org-key");
//!
//! // Sign when writing
//! let signer = DsfSigner::new(key.clone());
//! writer.write_to_file_signed(&fingerprint, Path::new("signed.dsf"), &signer)?;
//!
//! // Verify when reading
//! let verifier = DsfVerifier::new(key);
//! let verified = reader.read_from_file_verified(Path::new("signed.dsf"), &verifier)?;
//! ```
//!
//! ## Streaming Extraction for Large Files
//!
//! ```ignore
//! use datasynth_fingerprint::extraction::{FingerprintExtractor, ExtractionConfig};
//! use std::path::Path;
//!
//! // Configure for streaming (memory-efficient for large files)
//! let config = ExtractionConfig {
//!     streaming: true,
//!     stream_batch_size: 100_000,
//!     ..ExtractionConfig::default()
//! };
//!
//! let extractor = FingerprintExtractor::with_config(config);
//! let fingerprint = extractor.extract_streaming_csv(Path::new("large_data.csv"))?;
//! ```
//!
//! ## Config Synthesis
//!
//! ```ignore
//! use datasynth_fingerprint::synthesis::{ConfigSynthesizer, SynthesisOptions};
//!
//! let options = SynthesisOptions {
//!     scale: 2.0, // Generate 2x original row count
//!     seed: Some(42), // Reproducible generation
//!     preserve_correlations: true,
//!     inject_anomalies: true,
//! };
//!
//! let synthesizer = ConfigSynthesizer::with_options(options);
//! let result = synthesizer.synthesize_full(&fingerprint, 42)?;
//!
//! // result.config_patch - configuration values for generators
//! // result.copula_generators - for preserving correlations
//! ```
//!
//! ## Fidelity Evaluation
//!
//! ```ignore
//! use datasynth_fingerprint::evaluation::FidelityEvaluator;
//!
//! let evaluator = FidelityEvaluator::new();
//! let report = evaluator.evaluate(&original_fingerprint, &synthetic_fingerprint)?;
//!
//! println!("Overall fidelity: {:.2}", report.overall_score);
//! println!("Statistical fidelity: {:.2}", report.statistical_fidelity);
//! println!("Correlation fidelity: {:.2}", report.correlation_fidelity);
//! ```
//!
//! # DSF File Format
//!
//! A `.dsf` (DataSynth Fingerprint) file is a ZIP archive containing:
//!
//! | File | Format | Description |
//! |------|--------|-------------|
//! | `manifest.json` | JSON | Version, checksums, privacy config, optional signature |
//! | `schema.yaml` | YAML | Tables, columns, types, relationships |
//! | `statistics.yaml` | YAML | Distributions, percentiles, Benford analysis |
//! | `correlations.yaml` | YAML | Correlation matrices, copulas (optional) |
//! | `integrity.yaml` | YAML | FK relationships, cardinality (optional) |
//! | `rules.yaml` | YAML | Balance constraints, approval thresholds (optional) |
//! | `anomalies.yaml` | YAML | Anomaly rates, type distribution (optional) |
//! | `privacy_audit.json` | JSON | Privacy decisions, epsilon spent |
//!
//! # Privacy Levels
//!
//! The crate supports four privacy levels with different tradeoffs:
//!
//! | Level | Epsilon | K | Description |
//! |-------|---------|---|-------------|
//! | [`PrivacyLevel::Minimal`] | 5.0 | 3 | Low privacy, high utility |
//! | [`PrivacyLevel::Standard`] | 1.0 | 5 | Balanced (default) |
//! | [`PrivacyLevel::High`] | 0.5 | 10 | Higher privacy for sensitive data |
//! | [`PrivacyLevel::Maximum`] | 0.1 | 20 | Maximum privacy, reduced utility |
//!
//! # Privacy Mechanisms
//!
//! The fingerprinting process applies multiple privacy mechanisms:
//!
//! - **Differential Privacy**: Laplace noise calibrated to the sensitivity of each statistic,
//! with a configurable epsilon budget. The budget is enforced by tracking the cumulative
//! epsilon spent across statistics (composition).
//!
//! - **K-Anonymity**: Categorical values appearing fewer than k times are suppressed to
//! prevent re-identification of rare values.
//!
//! - **Outlier Handling**: Extreme values are winsorized at configurable percentiles to
//! prevent leakage of unusual records.
//!
//! - **Privacy Audit Trail**: Every privacy decision (noise addition, suppression,
//! generalization) is logged in the fingerprint's `privacy_audit` field.
//!
//! # Supported Data Sources
//!
//! | Source | Method | Notes |
//! |--------|--------|-------|
//! | CSV | `extract_from_csv()` | Auto-infers column types |
//! | Parquet | `extract_from_parquet()` | Preserves type information |
//! | JSON/JSONL | `extract_from_json()` | Array or newline-delimited |
//! | Directory | `extract_from_directory()` | Multi-table fingerprints |
//! | Memory | `DataSource::Memory` | For in-memory data |
//!
//! # Module Overview
//!
//! ## [`models`] - Data Structures
//!
//! Core data structures for fingerprints:
//! - [`Fingerprint`] - Root structure containing all components
//! - [`SchemaFingerprint`] - Table schemas, column types, relationships
//! - [`StatisticsFingerprint`] - Distribution parameters, percentiles
//! - [`CorrelationFingerprint`] - Correlation matrices, Gaussian copulas
//! - [`PrivacyAudit`] - Privacy action tracking
//!
//! ## [`io`] - File I/O
//!
//! Reading and writing `.dsf` files:
//! - [`FingerprintWriter`] - Write fingerprints to `.dsf` files
//! - [`FingerprintReader`] - Read fingerprints from `.dsf` files
//! - [`FingerprintValidator`] - Validate `.dsf` file integrity
//! - [`SigningKey`], [`DsfSigner`], [`DsfVerifier`] - Digital signatures
//!
//! ## [`extraction`] - Data Extraction
//!
//! Extract fingerprints from data sources:
//! - [`FingerprintExtractor`] - Main extraction coordinator
//! - [`DataSource`] - Input data source types
//! - [`ExtractionConfig`] - Extraction settings
//! - Streaming extraction for large files
//!
//! ## [`privacy`] - Privacy Mechanisms
//!
//! Privacy-preserving transformations:
//! - Laplace noise for differential privacy
//! - K-anonymity suppression
//! - Privacy budget tracking
//!
//! ## [`synthesis`] - Config Synthesis
//!
//! Convert fingerprints to generator configurations:
//! - [`ConfigSynthesizer`] - Synthesis coordinator
//! - [`ConfigPatch`] - Configuration values to apply
//! - Gaussian copula generation for correlations
//!
//! ## [`evaluation`] - Fidelity Evaluation
//!
//! Evaluate synthetic data quality:
//! - [`FidelityEvaluator`] - Comparison engine
//! - Statistical, correlation, and schema metrics
//!
//! # CLI Integration
//!
//! The fingerprint crate integrates with the `datasynth-data` CLI:
//!
//! ```bash
//! # Extract fingerprint from data
//! datasynth-data fingerprint extract \
//!     --input ./real_data/ \
//!     --output ./fingerprint.dsf \
//!     --privacy-level standard
//!
//! # Validate fingerprint file
//! datasynth-data fingerprint validate ./fingerprint.dsf
//!
//! # Generate from fingerprint
//! datasynth-data generate \
//!     --fingerprint ./fingerprint.dsf \
//!     --output ./synthetic/ \
//!     --scale 1.0
//!
//! # Evaluate fidelity
//! datasynth-data fingerprint evaluate \
//!     --fingerprint ./fingerprint.dsf \
//!     --synthetic ./synthetic/
//! ```
//!
//! [`Fingerprint`]: models::Fingerprint
//! [`SchemaFingerprint`]: models::SchemaFingerprint
//! [`StatisticsFingerprint`]: models::StatisticsFingerprint
//! [`CorrelationFingerprint`]: models::CorrelationFingerprint
//! [`PrivacyAudit`]: models::PrivacyAudit
//! [`FingerprintWriter`]: io::FingerprintWriter
//! [`FingerprintReader`]: io::FingerprintReader
//! [`FingerprintValidator`]: io::FingerprintValidator
//! [`SigningKey`]: io::SigningKey
//! [`DsfSigner`]: io::DsfSigner
//! [`DsfVerifier`]: io::DsfVerifier
//! [`FingerprintExtractor`]: extraction::FingerprintExtractor
//! [`DataSource`]: extraction::DataSource
//! [`ExtractionConfig`]: extraction::ExtractionConfig
//! [`ConfigSynthesizer`]: synthesis::ConfigSynthesizer
//! [`ConfigPatch`]: synthesis::ConfigPatch
//! [`FidelityEvaluator`]: evaluation::FidelityEvaluator
//! [`PrivacyLevel::Minimal`]: models::PrivacyLevel::Minimal
//! [`PrivacyLevel::Standard`]: models::PrivacyLevel::Standard
//! [`PrivacyLevel::High`]: models::PrivacyLevel::High
//! [`PrivacyLevel::Maximum`]: models::PrivacyLevel::Maximum
// Re-export commonly used types
pub use ;
pub use ;
pub use ;