ipfrs_semantic/
privacy.rs

1//! Differential privacy for embeddings
2//!
3//! This module provides privacy-preserving mechanisms for embedding-based search:
4//! - Noise injection (Laplacian, Gaussian)
5//! - Privacy budget tracking (epsilon-delta)
6//! - Utility-privacy trade-off analysis
7//! - Secure embedding release
8
9use ipfrs_core::{Error, Result};
10use serde::{Deserialize, Serialize};
11use std::sync::{Arc, Mutex};
12
13/// Noise distribution for differential privacy
14#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
15pub enum NoiseDistribution {
16    /// Laplacian noise (for epsilon-DP)
17    Laplacian { scale: f32 },
18    /// Gaussian noise (for (epsilon, delta)-DP)
19    Gaussian { sigma: f32 },
20}
21
22/// Privacy mechanism for adding noise to embeddings
23#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct PrivacyMechanism {
25    /// Noise distribution
26    distribution: NoiseDistribution,
27    /// Privacy budget (epsilon)
28    epsilon: f32,
29    /// Privacy parameter delta (for Gaussian)
30    delta: f32,
31    /// Sensitivity of the query (L2 norm bound)
32    sensitivity: f32,
33}
34
35impl PrivacyMechanism {
36    /// Create a new Laplacian mechanism (epsilon-DP)
37    pub fn laplacian(epsilon: f32, sensitivity: f32) -> Result<Self> {
38        if epsilon <= 0.0 {
39            return Err(Error::InvalidInput("Epsilon must be positive".into()));
40        }
41        if sensitivity <= 0.0 {
42            return Err(Error::InvalidInput("Sensitivity must be positive".into()));
43        }
44
45        let scale = sensitivity / epsilon;
46
47        Ok(Self {
48            distribution: NoiseDistribution::Laplacian { scale },
49            epsilon,
50            delta: 0.0,
51            sensitivity,
52        })
53    }
54
55    /// Create a new Gaussian mechanism ((epsilon, delta)-DP)
56    pub fn gaussian(epsilon: f32, delta: f32, sensitivity: f32) -> Result<Self> {
57        if epsilon <= 0.0 {
58            return Err(Error::InvalidInput("Epsilon must be positive".into()));
59        }
60        if delta <= 0.0 || delta >= 1.0 {
61            return Err(Error::InvalidInput("Delta must be in (0, 1)".into()));
62        }
63        if sensitivity <= 0.0 {
64            return Err(Error::InvalidInput("Sensitivity must be positive".into()));
65        }
66
67        // Calculate sigma using the Gaussian mechanism formula
68        // sigma = sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon
69        let sigma = sensitivity * (2.0 * (1.25 / delta).ln()).sqrt() / epsilon;
70
71        Ok(Self {
72            distribution: NoiseDistribution::Gaussian { sigma },
73            epsilon,
74            delta,
75            sensitivity,
76        })
77    }
78
79    /// Add noise to an embedding
80    pub fn add_noise(&self, embedding: &[f32]) -> Vec<f32> {
81        use rand::Rng;
82        let mut rng = rand::rng();
83
84        match self.distribution {
85            NoiseDistribution::Laplacian { scale } => embedding
86                .iter()
87                .map(|&x| x + sample_laplacian(&mut rng, scale))
88                .collect(),
89            NoiseDistribution::Gaussian { sigma } => {
90                embedding
91                    .iter()
92                    .map(|&x| {
93                        // Sample from N(0, sigma^2)
94                        let noise: f32 = rng.random_range(-1.0..1.0);
95                        x + noise * sigma
96                    })
97                    .collect()
98            }
99        }
100    }
101
102    /// Get the privacy budget (epsilon)
103    pub fn epsilon(&self) -> f32 {
104        self.epsilon
105    }
106
107    /// Get the delta parameter
108    pub fn delta(&self) -> f32 {
109        self.delta
110    }
111
112    /// Calculate expected utility loss (L2 distance to original)
113    pub fn expected_utility_loss(&self, dimension: usize) -> f32 {
114        match self.distribution {
115            NoiseDistribution::Laplacian { scale } => {
116                // Expected L2 norm of Laplacian noise in d dimensions
117                // E[||noise||_2] ≈ scale * sqrt(d)
118                scale * (dimension as f32).sqrt()
119            }
120            NoiseDistribution::Gaussian { sigma } => {
121                // Expected L2 norm of Gaussian noise in d dimensions
122                // E[||noise||_2] = sigma * sqrt(d)
123                sigma * (dimension as f32).sqrt()
124            }
125        }
126    }
127}
128
129/// Sample from Laplacian distribution with scale parameter
130fn sample_laplacian<R: rand::Rng>(rng: &mut R, scale: f32) -> f32 {
131    let u: f32 = rng.random_range(-0.5..0.5);
132    if u >= 0.0 {
133        -scale * (1.0 - 2.0 * u).ln()
134    } else {
135        scale * (1.0 + 2.0 * u).ln()
136    }
137}
138
139/// Privacy budget tracker for managing cumulative privacy loss
140pub struct PrivacyBudget {
141    /// Total budget (epsilon)
142    total_epsilon: f32,
143    /// Remaining budget
144    remaining_epsilon: Arc<Mutex<f32>>,
145    /// Total delta
146    total_delta: f32,
147    /// Queries made
148    queries: Arc<Mutex<Vec<QueryRecord>>>,
149}
150
151/// Record of a privacy-consuming query
152#[derive(Debug, Clone, Serialize, Deserialize)]
153pub struct QueryRecord {
154    /// Epsilon consumed
155    pub epsilon: f32,
156    /// Delta consumed
157    pub delta: f32,
158    /// Timestamp
159    pub timestamp: std::time::SystemTime,
160}
161
162impl PrivacyBudget {
163    /// Create a new privacy budget
164    pub fn new(total_epsilon: f32, total_delta: f32) -> Result<Self> {
165        if total_epsilon <= 0.0 {
166            return Err(Error::InvalidInput("Total epsilon must be positive".into()));
167        }
168
169        Ok(Self {
170            total_epsilon,
171            remaining_epsilon: Arc::new(Mutex::new(total_epsilon)),
172            total_delta,
173            queries: Arc::new(Mutex::new(Vec::new())),
174        })
175    }
176
177    /// Check if we can afford a query with given epsilon/delta
178    pub fn can_afford(&self, epsilon: f32, delta: f32) -> bool {
179        let remaining = self.remaining_epsilon.lock().unwrap();
180        *remaining >= epsilon && self.total_delta >= delta
181    }
182
183    /// Consume budget for a query
184    pub fn consume(&self, epsilon: f32, delta: f32) -> Result<()> {
185        if !self.can_afford(epsilon, delta) {
186            return Err(Error::InvalidInput("Insufficient privacy budget".into()));
187        }
188
189        let mut remaining = self.remaining_epsilon.lock().unwrap();
190        *remaining -= epsilon;
191
192        let mut queries = self.queries.lock().unwrap();
193        queries.push(QueryRecord {
194            epsilon,
195            delta,
196            timestamp: std::time::SystemTime::now(),
197        });
198
199        Ok(())
200    }
201
202    /// Get remaining epsilon
203    pub fn remaining(&self) -> f32 {
204        *self.remaining_epsilon.lock().unwrap()
205    }
206
207    /// Get statistics
208    pub fn stats(&self) -> PrivacyBudgetStats {
209        let remaining = *self.remaining_epsilon.lock().unwrap();
210        let queries = self.queries.lock().unwrap();
211
212        PrivacyBudgetStats {
213            total_epsilon: self.total_epsilon,
214            remaining_epsilon: remaining,
215            consumed_epsilon: self.total_epsilon - remaining,
216            total_delta: self.total_delta,
217            num_queries: queries.len(),
218        }
219    }
220}
221
222/// Statistics about privacy budget usage
223#[derive(Debug, Clone, Serialize, Deserialize)]
224pub struct PrivacyBudgetStats {
225    /// Total privacy budget
226    pub total_epsilon: f32,
227    /// Remaining budget
228    pub remaining_epsilon: f32,
229    /// Consumed budget
230    pub consumed_epsilon: f32,
231    /// Total delta
232    pub total_delta: f32,
233    /// Number of queries made
234    pub num_queries: usize,
235}
236
237/// Privacy-preserving embedding wrapper
238pub struct PrivateEmbedding {
239    /// Original embedding (kept private)
240    #[allow(dead_code)]
241    original: Vec<f32>,
242    /// Noisy version (public)
243    pub noisy: Vec<f32>,
244    /// Privacy mechanism used
245    mechanism: PrivacyMechanism,
246}
247
248impl PrivateEmbedding {
249    /// Create a private embedding with noise
250    pub fn new(embedding: Vec<f32>, mechanism: PrivacyMechanism) -> Self {
251        let noisy = mechanism.add_noise(&embedding);
252
253        Self {
254            original: embedding,
255            noisy,
256            mechanism,
257        }
258    }
259
260    /// Get the noisy (public) embedding
261    pub fn public_embedding(&self) -> &[f32] {
262        &self.noisy
263    }
264
265    /// Get the privacy parameters
266    pub fn privacy_params(&self) -> (f32, f32) {
267        (self.mechanism.epsilon(), self.mechanism.delta())
268    }
269
270    /// Get expected utility loss
271    pub fn utility_loss(&self) -> f32 {
272        self.mechanism.expected_utility_loss(self.noisy.len())
273    }
274}
275
276/// Utility-privacy trade-off analyzer
277pub struct TradeoffAnalyzer {
278    /// Different epsilon values to test
279    epsilons: Vec<f32>,
280    /// Sensitivity
281    sensitivity: f32,
282}
283
284impl TradeoffAnalyzer {
285    /// Create a new analyzer
286    pub fn new(sensitivity: f32) -> Self {
287        // Test a range of epsilon values from 0.1 to 10.0
288        let epsilons = vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0];
289
290        Self {
291            epsilons,
292            sensitivity,
293        }
294    }
295
296    /// Analyze trade-offs for different epsilon values
297    pub fn analyze(&self, dimension: usize) -> Vec<TradeoffPoint> {
298        self.epsilons
299            .iter()
300            .map(|&epsilon| {
301                let mechanism = PrivacyMechanism::laplacian(epsilon, self.sensitivity).unwrap();
302                let utility_loss = mechanism.expected_utility_loss(dimension);
303
304                TradeoffPoint {
305                    epsilon,
306                    delta: 0.0,
307                    utility_loss,
308                }
309            })
310            .collect()
311    }
312
313    /// Find the best epsilon for a target utility loss
314    pub fn find_epsilon_for_utility(&self, dimension: usize, max_utility_loss: f32) -> Option<f32> {
315        let points = self.analyze(dimension);
316
317        points
318            .into_iter()
319            .filter(|p| p.utility_loss <= max_utility_loss)
320            .map(|p| p.epsilon)
321            .min_by(|a, b| a.partial_cmp(b).unwrap())
322    }
323}
324
325/// A point in the utility-privacy trade-off space
326#[derive(Debug, Clone, Serialize, Deserialize)]
327pub struct TradeoffPoint {
328    /// Privacy budget (epsilon)
329    pub epsilon: f32,
330    /// Delta parameter
331    pub delta: f32,
332    /// Expected utility loss
333    pub utility_loss: f32,
334}
335
336#[cfg(test)]
337mod tests {
338    use super::*;
339
340    #[test]
341    fn test_laplacian_mechanism() {
342        let mechanism = PrivacyMechanism::laplacian(1.0, 1.0).unwrap();
343        assert_eq!(mechanism.epsilon(), 1.0);
344        assert_eq!(mechanism.delta(), 0.0);
345
346        let embedding = vec![1.0, 2.0, 3.0];
347        let noisy = mechanism.add_noise(&embedding);
348
349        assert_eq!(noisy.len(), embedding.len());
350        // Noisy embedding should be different from original
351        assert_ne!(noisy, embedding);
352    }
353
354    #[test]
355    fn test_gaussian_mechanism() {
356        let mechanism = PrivacyMechanism::gaussian(1.0, 0.001, 1.0).unwrap();
357        assert_eq!(mechanism.epsilon(), 1.0);
358        assert!(mechanism.delta() > 0.0);
359
360        let embedding = vec![1.0, 2.0, 3.0];
361        let noisy = mechanism.add_noise(&embedding);
362
363        assert_eq!(noisy.len(), embedding.len());
364    }
365
366    #[test]
367    fn test_privacy_budget() {
368        let budget = PrivacyBudget::new(10.0, 0.001).unwrap();
369
370        assert!(budget.can_afford(1.0, 0.0001));
371        assert_eq!(budget.remaining(), 10.0);
372
373        budget.consume(1.0, 0.0001).unwrap();
374        assert_eq!(budget.remaining(), 9.0);
375
376        let stats = budget.stats();
377        assert_eq!(stats.consumed_epsilon, 1.0);
378        assert_eq!(stats.num_queries, 1);
379    }
380
381    #[test]
382    fn test_budget_exhaustion() {
383        let budget = PrivacyBudget::new(1.0, 0.001).unwrap();
384
385        budget.consume(0.5, 0.0001).unwrap();
386        budget.consume(0.5, 0.0001).unwrap();
387
388        // Should fail - budget exhausted
389        assert!(budget.consume(0.1, 0.0001).is_err());
390    }
391
392    #[test]
393    fn test_private_embedding() {
394        let embedding = vec![1.0, 2.0, 3.0];
395        let mechanism = PrivacyMechanism::laplacian(1.0, 1.0).unwrap();
396
397        let private_emb = PrivateEmbedding::new(embedding.clone(), mechanism);
398
399        assert_eq!(private_emb.public_embedding().len(), embedding.len());
400        assert_eq!(private_emb.privacy_params().0, 1.0);
401        assert!(private_emb.utility_loss() > 0.0);
402    }
403
404    #[test]
405    fn test_tradeoff_analyzer() {
406        let analyzer = TradeoffAnalyzer::new(1.0);
407        let points = analyzer.analyze(768);
408
409        assert!(!points.is_empty());
410        // Higher epsilon should give lower utility loss
411        assert!(points[0].utility_loss > points.last().unwrap().utility_loss);
412    }
413
414    #[test]
415    fn test_find_epsilon_for_utility() {
416        let analyzer = TradeoffAnalyzer::new(1.0);
417        let epsilon = analyzer.find_epsilon_for_utility(768, 10.0);
418
419        assert!(epsilon.is_some());
420        assert!(epsilon.unwrap() > 0.0);
421    }
422
423    #[test]
424    fn test_utility_loss_estimation() {
425        let mechanism = PrivacyMechanism::laplacian(1.0, 1.0).unwrap();
426        let loss = mechanism.expected_utility_loss(768);
427
428        // Should be roughly sqrt(768) for unit scale
429        assert!(loss > 20.0 && loss < 30.0);
430    }
431}