datasynth-fingerprint 5.36.0

Privacy-preserving synthetic data fingerprinting for DataSynth
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
//! Config synthesizer - converts fingerprints to generator configs.

use std::collections::HashMap;

use super::CopulaGenerator;
use crate::error::FingerprintResult;
use crate::models::{
    CorrelationMatrix, DistributionType, Fingerprint, GaussianCopula, NumericStats,
};

/// Knobs that control how a fingerprint is turned into a generator config.
#[derive(Debug, Clone)]
pub struct SynthesisOptions {
    /// Multiplier applied to the fingerprinted row count (1.0 keeps the size, 2.0 doubles it).
    pub scale: f64,
    /// Optional random seed to write into the generated config.
    pub seed: Option<u64>,
    /// When `true`, correlation structure from the fingerprint is carried over.
    pub preserve_correlations: bool,
    /// When `true`, anomaly rates found in the fingerprint are written to the config.
    pub inject_anomalies: bool,
}

impl Default for SynthesisOptions {
    /// Same-size output, no fixed seed, correlations and anomaly injection both enabled.
    fn default() -> Self {
        let (preserve_correlations, inject_anomalies) = (true, true);
        SynthesisOptions {
            scale: 1.0,
            seed: None,
            preserve_correlations,
            inject_anomalies,
        }
    }
}

/// Synthesizer that converts fingerprints to generator configurations.
///
/// Holds the [`SynthesisOptions`] that every synthesis call on this instance reads.
pub struct ConfigSynthesizer {
    /// Options consulted by the synthesis methods (scale, seed, correlation and anomaly flags).
    options: SynthesisOptions,
}

impl ConfigSynthesizer {
    /// Create a new config synthesizer.
    pub fn new() -> Self {
        Self {
            options: SynthesisOptions::default(),
        }
    }

    /// Create a synthesizer that uses the given `options` for every synthesis call.
    ///
    /// The options are stored as-is; no validation is performed here.
    pub fn with_options(options: SynthesisOptions) -> Self {
        Self { options }
    }

    /// Synthesize a partial config from a fingerprint.
    ///
    /// Returns a [`ConfigPatch`] that can be merged with a base configuration. The patch
    /// may contain:
    ///
    /// * `transactions.count` — the sum of all table row counts multiplied by
    ///   `options.scale` (always set);
    /// * `global.seed` — only when `options.seed` is set;
    /// * `transactions.amounts.*` — distribution parameters derived from a single
    ///   amount-like numeric column, when one exists;
    /// * `anomaly_injection.overall_rate` and `anomaly_injection.enabled` — only when
    ///   `options.inject_anomalies` is set and the fingerprint carries anomaly data.
    ///
    /// # Errors
    ///
    /// The current implementation has no failing step and always returns `Ok`.
    pub fn synthesize(&self, fingerprint: &Fingerprint) -> FingerprintResult<ConfigPatch> {
        /// Column-name fragments (lowercase) that mark a column as carrying amounts.
        const AMOUNT_MARKERS: [&str; 5] = ["amount", "value", "price", "dmbtr", "wrbtr"];

        let mut patch = ConfigPatch::new();

        // Extract row count with scaling.
        let total_rows: u64 = fingerprint
            .schema
            .tables
            .values()
            .map(|t| t.row_count)
            .sum();
        // Float -> u64 `as` saturates: a negative or NaN product becomes 0 and an oversized
        // one becomes u64::MAX.
        let scaled_rows = (total_rows as f64 * self.options.scale) as u64;
        // Clamp before narrowing: a plain `as i64` would wrap anything above i64::MAX to a
        // negative row count.
        let row_count = scaled_rows.min(i64::MAX as u64) as i64;

        patch.set("transactions.count", ConfigValue::Integer(row_count));

        // Set seed if specified.
        if let Some(seed) = self.options.seed {
            // NOTE(review): `ConfigValue` has no unsigned variant, so seeds above i64::MAX
            // are stored as their two's-complement (negative) reinterpretation. Confirm the
            // consumer of `global.seed` accepts that.
            patch.set("global.seed", ConfigValue::Integer(seed as i64));
        }

        // Map numeric distributions to amount config. Match is case-INSENSITIVE — the
        // audit-data-standard column is `FunctionalAmount` (capital A), which a
        // case-sensitive `contains("amount")` silently missed, so amount params were
        // never synthesized and twins always fell back to engine defaults. Selection is
        // deterministic (HashMap order is not): prefer functional, then reporting, then
        // the alphabetically-first matching column.
        //
        // Each name is lowercased once and the winner is found in a single pass; the
        // ordering is (rank, original name), so the result does not depend on iteration
        // order.
        let amount_stats = fingerprint
            .statistics
            .numeric_columns
            .iter()
            .filter_map(|(name, stats)| {
                let lower = name.to_lowercase();
                if !AMOUNT_MARKERS.iter().any(|marker| lower.contains(*marker)) {
                    return None;
                }
                let rank: u8 = if lower.contains("functional") {
                    0
                } else if lower.contains("reporting") {
                    1
                } else {
                    2
                };
                Some((rank, name, stats))
            })
            .min_by(|a, b| a.0.cmp(&b.0).then_with(|| a.1.cmp(b.1)))
            .map(|(_, _, stats)| stats);

        if let Some(stats) = amount_stats {
            for (key, value) in self.map_numeric_distribution(stats) {
                patch.set(&format!("transactions.amounts.{key}"), value);
            }
        }

        // Map anomaly rates if present and enabled.
        if self.options.inject_anomalies {
            if let Some(ref anomalies) = fingerprint.anomalies {
                let rate = anomalies.overall.anomaly_rate;
                patch.set("anomaly_injection.overall_rate", ConfigValue::Float(rate));
                // Injection is switched on only for a strictly positive rate.
                patch.set("anomaly_injection.enabled", ConfigValue::Bool(rate > 0.0));
            }
        }

        Ok(patch)
    }

    /// Map numeric statistics to amount distribution config.
    ///
    /// Returns flat `key -> value` pairs; the caller decides which config prefix they go under.
    ///
    /// Two paths exist:
    ///
    /// * **Robust path** — taken when `stats.log_magnitude_percentiles` is present. Emits a
    ///   six-component log-normal mixture anchored at the p10/p25/p50/p75/p90/p99
    ///   log-magnitude quantiles (`mixture_components`, `comp{i}_weight`, `comp{i}_mu`,
    ///   `comp{i}_sigma`), plus single-component `lognormal_mu` / `lognormal_sigma` and
    ///   `min_amount` / `max_amount` taken from `exp(p1)` / `exp(p99)`.
    /// * **Legacy path** — otherwise. `min_amount` / `max_amount` come from `stats.min` /
    ///   `stats.max`, and `lognormal_mu` / `lognormal_sigma` are derived from the declared
    ///   distribution when that is possible. When it is not (missing parameters,
    ///   non-positive mean or median, or a sigma that is not finite) the log-normal keys
    ///   are left unset.
    ///
    /// `round_number_probability` is added on either path when Benford first-digit data
    /// is available.
    fn map_numeric_distribution(&self, stats: &NumericStats) -> HashMap<String, ConfigValue> {
        // Interquartile range of a normal distribution, expressed in units of sigma.
        const IQR_PER_SIGMA: f64 = 1.349;
        // Lower bound applied to every sigma produced on the robust path.
        const MIN_SIGMA: f64 = 0.1;

        let mut config = HashMap::new();

        if let Some(lmp) = &stats.log_magnitude_percentiles {
            // Robust path (preferred): fit the amount log-normal from log-magnitude quantiles
            // (ln|x| space). These are sign-agnostic and survive DP noise / the zero-median
            // problem that ruins mean/min/max-based fits for heavy-tailed, possibly-signed
            // columns. A SINGLE log-normal off the central IQR underestimates a real GL tail,
            // so emit a quantile-anchored mixture spanning body → shoulder → tail:
            // one component per available log-magnitude quantile, weighted by the
            // probability mass around it (trapezoidal), sigma from local quantile spacing.
            // Matches body AND tail with no hand-tuned weights, and self-adapts to any
            // column shape.
            let anchors: [(f64, f64); 6] = [
                (0.10, lmp.p10),
                (0.25, lmp.p25),
                (0.50, lmp.p50),
                (0.75, lmp.p75),
                (0.90, lmp.p90),
                (0.99, lmp.p99),
            ];
            let n = anchors.len();
            // (unnormalised weight, mu, sigma) per component, in ascending quantile order.
            let mut comps: Vec<(f64, f64, f64)> = Vec::with_capacity(n);
            let mut wsum = 0.0_f64;
            for i in 0..n {
                let (_, v) = anchors[i];
                // Probability levels of the neighbours; the ends use 0.0 and 1.0.
                let p_prev = if i == 0 { 0.0 } else { anchors[i - 1].0 };
                let p_next = if i == n - 1 { 1.0 } else { anchors[i + 1].0 };
                // Half the probability gap between the neighbours, floored at 1e-3.
                let w = ((p_next - p_prev) / 2.0).max(1e-3);
                let v_lo = if i == 0 { v } else { anchors[i - 1].1 };
                let v_hi = if i == n - 1 { v } else { anchors[i + 1].1 };
                // Local spacing in log space: one-sided at the ends, averaged in the middle.
                let span = if i == 0 {
                    v_hi - v
                } else if i == n - 1 {
                    v - v_lo
                } else {
                    (v_hi - v_lo) / 2.0
                };
                comps.push((w, v, (span / 2.0).abs().max(MIN_SIGMA)));
                wsum += w;
            }
            config.insert(
                "mixture_components".to_string(),
                ConfigValue::Integer(n as i64),
            );
            for (i, (w, mu, sg)) in comps.iter().enumerate() {
                // Weights are normalised here so the emitted weights sum to 1.
                config.insert(format!("comp{i}_weight"), ConfigValue::Float(w / wsum));
                config.insert(format!("comp{i}_mu"), ConfigValue::Float(*mu));
                config.insert(format!("comp{i}_sigma"), ConfigValue::Float(*sg));
            }
            // Single-component params retained for back-compat / non-mixture consumers.
            config.insert("lognormal_mu".to_string(), ConfigValue::Float(lmp.p50));
            config.insert(
                "lognormal_sigma".to_string(),
                ConfigValue::Float(((lmp.p75 - lmp.p25) / IQR_PER_SIGMA).abs().max(MIN_SIGMA)),
            );
            config.insert("min_amount".to_string(), ConfigValue::Float(lmp.p1.exp()));
            config.insert("max_amount".to_string(), ConfigValue::Float(lmp.p99.exp()));
        } else {
            config.insert("min_amount".to_string(), ConfigValue::Float(stats.min));
            config.insert("max_amount".to_string(), ConfigValue::Float(stats.max));

            match stats.distribution {
                DistributionType::LogNormal => {
                    // Fitted parameters are used directly; param1 is read as mu and param2
                    // as sigma.
                    if let (Some(mu), Some(sigma)) = (
                        stats.distribution_params.param1,
                        stats.distribution_params.param2,
                    ) {
                        config.insert("lognormal_mu".to_string(), ConfigValue::Float(mu));
                        config.insert("lognormal_sigma".to_string(), ConfigValue::Float(sigma));
                    }
                }
                DistributionType::Normal => {
                    // Convert normal to log-normal approximation for amounts by matching
                    // mean and variance; only defined for a positive mean.
                    if stats.mean > 0.0 {
                        let variance = stats.std_dev.powi(2);
                        let sigma_sq = (1.0 + variance / stats.mean.powi(2)).ln();
                        let mu = stats.mean.ln() - sigma_sq / 2.0;

                        config.insert("lognormal_mu".to_string(), ConfigValue::Float(mu));
                        config.insert(
                            "lognormal_sigma".to_string(),
                            ConfigValue::Float(sigma_sq.sqrt()),
                        );
                    }
                }
                _ => {
                    // Use empirical parameters based on percentiles.
                    let pct = &stats.percentiles;
                    if pct.p50 > 0.0 {
                        let mu = pct.p50.ln();
                        let sigma = ((pct.p75 / pct.p25).ln() / IQR_PER_SIGMA).abs();
                        // A zero or negative p25 makes the ratio infinite or negative, so
                        // sigma would be inf/NaN. Emit nothing rather than a parameter that
                        // cannot be represented as a number downstream.
                        if sigma.is_finite() {
                            config.insert("lognormal_mu".to_string(), ConfigValue::Float(mu));
                            config.insert(
                                "lognormal_sigma".to_string(),
                                ConfigValue::Float(sigma),
                            );
                        }
                    }
                }
            }
        }

        // Round number bias.
        if let Some(benford) = stats.benford_first_digit {
            // A first entry below 0.25 selects the higher round-number probability (0.3);
            // anything else selects 0.15. Presumably index 0 is the share of leading digit 1.
            // NOTE(review): the earlier comment here said a *higher* digit-1 frequency
            // suggests round-number bias, which is the opposite of this condition —
            // confirm which direction is intended.
            let round_bias = if benford[0] < 0.25 { 0.3 } else { 0.15 };
            config.insert(
                "round_number_probability".to_string(),
                ConfigValue::Float(round_bias),
            );
        }

        config
    }
}

impl Default for ConfigSynthesizer {
    fn default() -> Self {
        Self::new()
    }
}

/// Result of config synthesis including optional copula generators.
///
/// Produced by `ConfigSynthesizer::synthesize_full`.
#[derive(Debug)]
pub struct SynthesisResult {
    /// Configuration patch to apply.
    pub config_patch: ConfigPatch,
    /// Copula generators for preserving correlations (if enabled and correlations present).
    /// Empty when correlation preservation is off, when the fingerprint has no correlation
    /// data, or when no generator could be built.
    pub copula_generators: Vec<CopulaGeneratorSpec>,
}

/// Specification for a copula generator.
///
/// Pairs a ready-to-use [`CopulaGenerator`] with the table and columns it was built for.
#[derive(Debug)]
pub struct CopulaGeneratorSpec {
    /// Name identifier (the copula's own name, or `"{table}_copula"` when built from a
    /// correlation matrix).
    pub name: String,
    /// Table this copula applies to.
    pub table: String,
    /// Column names.
    pub columns: Vec<String>,
    /// The copula generator (ready to use).
    pub generator: CopulaGenerator,
}

impl ConfigSynthesizer {
    /// Synthesize config and copula generators from a fingerprint.
    ///
    /// This is the full synthesis method that also creates copula generators
    /// for preserving correlations.
    pub fn synthesize_full(
        &self,
        fingerprint: &Fingerprint,
        seed: u64,
    ) -> FingerprintResult<SynthesisResult> {
        let config_patch = self.synthesize(fingerprint)?;

        let mut copula_generators = Vec::new();

        if self.options.preserve_correlations {
            // Create copula generators from fingerprint
            if let Some(ref correlations) = fingerprint.correlations {
                // First, try to use pre-built copulas
                for copula in &correlations.copulas {
                    if let Some(generator) = CopulaGenerator::from_copula(copula, seed) {
                        copula_generators.push(CopulaGeneratorSpec {
                            name: copula.name.clone(),
                            table: copula.table.clone(),
                            columns: copula.columns.clone(),
                            generator,
                        });
                    }
                }

                // If no copulas, create from correlation matrices
                if copula_generators.is_empty() {
                    for (table_name, matrix) in &correlations.matrices {
                        if matrix.columns.len() >= 2 {
                            if let Some(generator) =
                                CopulaGenerator::from_correlation_matrix(matrix, seed)
                            {
                                copula_generators.push(CopulaGeneratorSpec {
                                    name: format!("{table_name}_copula"),
                                    table: table_name.clone(),
                                    columns: matrix.columns.clone(),
                                    generator,
                                });
                            }
                        }
                    }
                }
            }
        }

        Ok(SynthesisResult {
            config_patch,
            copula_generators,
        })
    }

    /// Create a copula generator from a Gaussian copula specification.
    ///
    /// Thin wrapper around `CopulaGenerator::from_copula`; returns `None` whenever that
    /// constructor does.
    pub fn create_copula_generator(copula: &GaussianCopula, seed: u64) -> Option<CopulaGenerator> {
        CopulaGenerator::from_copula(copula, seed)
    }

    /// Create a copula generator from a correlation matrix.
    ///
    /// Thin wrapper around `CopulaGenerator::from_correlation_matrix`; returns `None`
    /// whenever that constructor does. Unlike `synthesize_full`, no minimum column count
    /// is checked here.
    pub fn create_copula_from_matrix(
        matrix: &CorrelationMatrix,
        seed: u64,
    ) -> Option<CopulaGenerator> {
        CopulaGenerator::from_correlation_matrix(matrix, seed)
    }
}

/// A patch of configuration values to be merged.
///
/// Values are stored flat, keyed by dotted path (for example `"global.seed"`); the
/// nesting implied by the dots is only materialised when the patch is rendered to YAML.
#[derive(Debug, Clone, Default)]
pub struct ConfigPatch {
    /// Configuration values keyed by dotted path.
    values: HashMap<String, ConfigValue>,
}

impl ConfigPatch {
    /// Create a new empty patch.
    pub fn new() -> Self {
        Self {
            values: HashMap::new(),
        }
    }

    /// Set a configuration value, replacing any value already stored under `path`.
    pub fn set(&mut self, path: &str, value: ConfigValue) {
        self.values.insert(path.to_string(), value);
    }

    /// Get the configuration value stored under the exact dotted `path`, if any.
    pub fn get(&self, path: &str) -> Option<&ConfigValue> {
        self.values.get(path)
    }

    /// Get all values, keyed by dotted path.
    pub fn values(&self) -> &HashMap<String, ConfigValue> {
        &self.values
    }

    /// Merge another patch (other takes precedence on conflicting paths).
    pub fn merge(&mut self, other: ConfigPatch) {
        self.values.extend(other.values);
    }

    /// Convert to YAML string.
    ///
    /// Dotted paths are expanded into nested mappings. Paths are processed in sorted
    /// order so the same patch always renders to the same text; iterating the backing
    /// `HashMap` directly would make the key order vary from run to run.
    ///
    /// If one path is a strict dotted prefix of another (e.g. both `a` and `a.b` are
    /// set), the shorter one is written first and the longer one is dropped.
    ///
    /// # Errors
    ///
    /// Returns an error if `serde_yaml` fails to serialize the assembled mapping.
    pub fn to_yaml(&self) -> FingerprintResult<String> {
        // `serde_yaml::Mapping` keeps insertion order, so inserting in sorted path order
        // fixes the output order.
        let mut entries: Vec<(&String, &ConfigValue)> = self.values.iter().collect();
        entries.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

        // Build nested structure from dotted paths.
        let mut root = serde_yaml::Mapping::new();
        for (path, value) in entries {
            let parts: Vec<&str> = path.split('.').collect();
            set_nested_value(&mut root, &parts, value);
        }

        Ok(serde_yaml::to_string(&root)?)
    }
}

/// Configuration value types.
///
/// A small, self-contained value model for patch entries; converted to a
/// `serde_yaml::Value` when a patch is rendered.
#[derive(Debug, Clone)]
pub enum ConfigValue {
    /// A boolean flag.
    Bool(bool),
    /// A signed 64-bit integer.
    Integer(i64),
    /// A 64-bit float. Non-finite values are rendered as YAML null.
    Float(f64),
    /// A text value.
    String(String),
    /// An ordered list of values.
    Array(Vec<ConfigValue>),
}

impl ConfigValue {
    /// Convert to YAML value.
    fn to_yaml_value(&self) -> serde_yaml::Value {
        match self {
            Self::Bool(b) => serde_yaml::Value::Bool(*b),
            Self::Integer(i) => serde_yaml::Value::Number(serde_yaml::Number::from(*i)),
            Self::Float(f) => {
                if f.is_finite() {
                    serde_yaml::Value::Number(serde_yaml::Number::from(*f))
                } else {
                    serde_yaml::Value::Null
                }
            }
            Self::String(s) => serde_yaml::Value::String(s.clone()),
            Self::Array(arr) => {
                serde_yaml::Value::Sequence(arr.iter().map(ConfigValue::to_yaml_value).collect())
            }
        }
    }
}

/// Write `value` into `root` at the location described by the `path` segments.
///
/// Intermediate segments become nested mappings, created on demand. An empty path is a
/// no-op. If an intermediate segment already holds something that is not a mapping, the
/// value is not written.
fn set_nested_value(root: &mut serde_yaml::Mapping, path: &[&str], value: &ConfigValue) {
    match path {
        [] => {}
        [leaf] => {
            root.insert(
                serde_yaml::Value::String((*leaf).to_string()),
                value.to_yaml_value(),
            );
        }
        [head, rest @ ..] => {
            let child = root
                .entry(serde_yaml::Value::String((*head).to_string()))
                .or_insert_with(|| serde_yaml::Value::Mapping(serde_yaml::Mapping::new()));

            // Only descend into mappings; any other existing value blocks the write.
            if let serde_yaml::Value::Mapping(nested) = child {
                set_nested_value(nested, rest, value);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Values set under dotted paths are retrievable by the same path, and the path
    /// segments show up in the rendered YAML.
    #[test]
    fn test_config_patch() {
        let mut patch = ConfigPatch::new();
        patch.set("global.seed", ConfigValue::Integer(42));
        patch.set("transactions.count", ConfigValue::Integer(1000));

        assert!(patch.get("global.seed").is_some());

        // Only substring presence is checked; the exact YAML layout is not pinned here.
        let yaml = patch.to_yaml().unwrap();
        assert!(yaml.contains("global"));
        assert!(yaml.contains("seed"));
    }

    /// Test fixture: build a `Percentiles` with caller-chosen quartiles.
    ///
    /// p1/p5/p10 and p99 are fixed constants, and p90/p95 are placed at fixed offsets
    /// above `p75`.
    fn logmag_percentiles(p25: f64, p50: f64, p75: f64) -> crate::models::Percentiles {
        crate::models::Percentiles {
            p1: 0.0,
            p5: 1.0,
            p10: 2.0,
            p25,
            p50,
            p75,
            p90: p75 + 2.0,
            p95: p75 + 3.0,
            // Fixed tail anchor; the mixture test below asserts against this exact value.
            p99: 14.0,
        }
    }

    /// The robust path must derive log-normal mu/sigma from log-magnitude percentiles,
    /// IGNORING a DP-corrupted mean + winsorized min/max (the signed-amount failure mode).
    ///
    /// Also checks the amount range and the shape of the emitted mixture: component count,
    /// tail component location, and weight normalisation.
    #[test]
    fn log_magnitude_percentiles_override_corrupted_mean() {
        let synth = ConfigSynthesizer::new();
        let mut stats = NumericStats::new(1_000_000, -3062.36, 3062.36, 378_348.3, 237_409.8);
        stats.distribution = DistributionType::Normal; // would trigger the corrupted-mean path
        stats.percentiles = logmag_percentiles(-36.0, 0.0, 37.7); // signed: median 0
        stats.log_magnitude_percentiles = Some(logmag_percentiles(3.0, 4.856, 6.5));

        let cfg = synth.map_numeric_distribution(&stats);
        let mu = match cfg.get("lognormal_mu") {
            Some(ConfigValue::Float(v)) => *v,
            other => panic!("expected lognormal_mu float, got {other:?}"),
        };
        let sigma = match cfg.get("lognormal_sigma") {
            Some(ConfigValue::Float(v)) => *v,
            other => panic!("expected lognormal_sigma float, got {other:?}"),
        };
        // mu == p50(ln|x|); sigma == IQR/1.349. NOT ln(378348)≈12.8 (the corrupted path).
        assert!(
            (mu - 4.856).abs() < 1e-9,
            "mu={mu} should equal log-mag p50"
        );
        assert!(
            (sigma - (6.5 - 3.0) / 1.349).abs() < 1e-9,
            "sigma={sigma} should be IQR/1.349"
        );
        // amount range comes from exp(log-mag p1/p99), not the winsorized signed min/max.
        match cfg.get("max_amount") {
            Some(ConfigValue::Float(v)) => assert!(
                *v > 1e5,
                "max_amount={v} should reflect the heavy tail exp(p99)"
            ),
            other => panic!("expected max_amount float, got {other:?}"),
        }
        // A quantile-anchored mixture (one component per log-mag percentile) is emitted,
        // spanning body → tail so the synthesized tail is heavier than a single log-normal.
        let nc = match cfg.get("mixture_components") {
            Some(ConfigValue::Integer(n)) => *n as usize,
            other => panic!("expected mixture_components integer, got {other:?}"),
        };
        assert!(nc >= 3, "expected a multi-component mixture, got {nc}");
        // Helper: fetch a float entry by key, panicking with the key name if it is absent
        // or has another type.
        let f = |k: &str| match cfg.get(k) {
            Some(ConfigValue::Float(v)) => *v,
            other => panic!("expected {k} float, got {other:?}"),
        };
        // ascending anchors: first component is the lowest, last is the p99 tail.
        let last = nc - 1;
        assert!(
            (f(&format!("comp{last}_mu")) - 14.0).abs() < 1e-9,
            "tail component centered at p99"
        );
        assert!(
            f(&format!("comp{last}_mu")) > f("comp0_mu"),
            "tail mu must exceed the first component"
        );
        // Emitted weights are normalised, so they must add up to one.
        let wsum: f64 = (0..nc).map(|i| f(&format!("comp{i}_weight"))).sum();
        assert!(
            (wsum - 1.0).abs() < 1e-9,
            "component weights sum to 1.0, got {wsum}"
        );
    }

    /// Without log-magnitude percentiles, the legacy distribution-based path still runs.
    ///
    /// Uses a `Normal` distribution with a positive mean, which is the precondition for
    /// the legacy normal branch to emit `lognormal_mu`.
    #[test]
    fn legacy_path_used_when_log_magnitude_absent() {
        let synth = ConfigSynthesizer::new();
        let mut stats = NumericStats::new(1000, 1.0, 1000.0, 100.0, 50.0);
        stats.distribution = DistributionType::Normal;
        // Explicitly cleared so the robust path cannot be taken.
        stats.log_magnitude_percentiles = None;
        let cfg = synth.map_numeric_distribution(&stats);
        assert!(
            cfg.contains_key("lognormal_mu"),
            "legacy normal path should still fit"
        );
    }
}