memscope_rs/core/
fast_data_deduplicator.rs

1//! Simple Data Deduplicator - High Performance Version
2//!
3//! Simplified version for better performance in demos
4
5use crate::core::types::TrackingResult;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::sync::Arc;
9
/// Simple deduplicated string reference
///
/// A lightweight handle returned by `SimpleDataDeduplicator::deduplicate_string`.
/// It does not own the text; pass it to `SimpleDataDeduplicator::get_string`
/// to look the stored string up again via `hash`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct SimpleDeduplicatedString {
    /// Hash of the string contents; the key under which the string is stored.
    pub hash: u64,
    /// Length of the original string in bytes (as reported by `str::len`).
    pub length: usize,
    /// `1` when the string was stored for the first time, `2` when its hash
    /// was already present. This is a first-seen/seen-again marker, not a
    /// live reference count.
    pub ref_count: u32,
}
17
/// Simple deduplication statistics
///
/// All fields start at zero through the derived `Default`.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct SimpleDeduplicationStats {
    /// Number of `deduplicate_string` calls made so far.
    pub total_operations: u64,
    /// `0.0` until the first duplicate is seen, `1.0` afterwards.
    /// NOTE(review): despite the name this is a flag, not a hits/operations ratio.
    pub cache_hit_rate: f64,
    /// Intended to hold the number of bytes saved by deduplication.
    /// NOTE(review): confirm against `deduplicate_string` how this is maintained.
    pub memory_saved_bytes: u64,
}
25
/// Simple high-performance data deduplicator
///
/// Stores each distinct string once, keyed by a 64-bit hash of its contents,
/// and keeps running statistics about the calls it has served.
pub struct SimpleDataDeduplicator {
    /// Stored strings, keyed by the hash of their contents.
    strings: HashMap<u64, Arc<String>>,
    /// Running counters returned (as a copy) by `get_stats`.
    stats: SimpleDeduplicationStats,
}
31
32impl SimpleDataDeduplicator {
33    pub fn new() -> Self {
34        Self {
35            strings: HashMap::new(),
36            stats: SimpleDeduplicationStats::default(),
37        }
38    }
39
40    pub fn deduplicate_string(&mut self, input: &str) -> TrackingResult<SimpleDeduplicatedString> {
41        let hash = self.calculate_hash(input);
42        self.stats.total_operations += 1;
43
44        if self.strings.contains_key(&hash) {
45            self.stats.cache_hit_rate = 1.0;
46            return Ok(SimpleDeduplicatedString {
47                hash,
48                length: input.len(),
49                ref_count: 2,
50            });
51        }
52
53        self.strings.insert(hash, Arc::new(input.to_string()));
54        Ok(SimpleDeduplicatedString {
55            hash,
56            length: input.len(),
57            ref_count: 1,
58        })
59    }
60
61    pub fn get_string(&self, dedup_ref: &SimpleDeduplicatedString) -> TrackingResult<Arc<String>> {
62        match self.strings.get(&dedup_ref.hash) {
63            Some(s) => Ok(Arc::clone(s)),
64            None => Ok(Arc::new("not found".to_string())),
65        }
66    }
67
68    pub fn get_stats(&self) -> TrackingResult<SimpleDeduplicationStats> {
69        Ok(self.stats.clone())
70    }
71
72    fn calculate_hash(&self, input: &str) -> u64 {
73        use std::collections::hash_map::DefaultHasher;
74        use std::hash::{Hash, Hasher};
75        let mut hasher = DefaultHasher::new();
76        input.hash(&mut hasher);
77        hasher.finish()
78    }
79}
80
81impl Default for SimpleDataDeduplicator {
82    fn default() -> Self {
83        Self::new()
84    }
85}
86
87/// Global simple deduplicator
88static mut GLOBAL_SIMPLE_DEDUPLICATOR: Option<SimpleDataDeduplicator> = None;
89static INIT: std::sync::Once = std::sync::Once::new();
90
91pub fn get_global_simple_data_deduplicator() -> &'static mut SimpleDataDeduplicator {
92    #[allow(static_mut_refs)]
93    unsafe {
94        INIT.call_once(|| {
95            GLOBAL_SIMPLE_DEDUPLICATOR = Some(SimpleDataDeduplicator::new());
96        });
97        GLOBAL_SIMPLE_DEDUPLICATOR.as_mut().unwrap()
98    }
99}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `dedup` has not recorded any work yet.
    fn assert_untouched(dedup: &SimpleDataDeduplicator) {
        let stats = dedup.get_stats().expect("Failed to get stats");

        assert_eq!(stats.total_operations, 0);
        assert_eq!(stats.cache_hit_rate, 0.0);
        assert_eq!(stats.memory_saved_bytes, 0);
    }

    #[test]
    fn test_new_deduplicator() {
        // A freshly created deduplicator reports all-zero statistics.
        assert_untouched(&SimpleDataDeduplicator::new());
    }

    #[test]
    fn test_deduplicate_string_first_time() {
        // First sighting of a string: stored, ref_count 1.
        let mut dedup = SimpleDataDeduplicator::new();
        let test_str = "test string for deduplication";

        let result = dedup
            .deduplicate_string(test_str)
            .expect("Failed to deduplicate string");

        assert_eq!(result.length, test_str.len());
        assert_eq!(result.ref_count, 1);
        // The reference must carry the content hash. Any u64 (including 0) is
        // a valid hash, so compare against the hasher instead of a range.
        assert_eq!(result.hash, dedup.calculate_hash(test_str));

        let stats = dedup.get_stats().expect("Failed to get stats");
        assert_eq!(stats.total_operations, 1);
    }

    #[test]
    fn test_deduplicate_string_duplicate() {
        // Submitting the same string twice yields matching references.
        let mut dedup = SimpleDataDeduplicator::new();
        let test_str = "duplicate test string";

        let result1 = dedup
            .deduplicate_string(test_str)
            .expect("Failed to deduplicate string first time");
        let result2 = dedup
            .deduplicate_string(test_str)
            .expect("Failed to deduplicate string second time");

        assert_eq!(result1.hash, result2.hash);
        assert_eq!(result1.length, result2.length);
        assert_eq!(result1.ref_count, 1);
        assert_eq!(result2.ref_count, 2); // Second time should have ref_count 2

        let stats = dedup.get_stats().expect("Failed to get stats");
        assert_eq!(stats.total_operations, 2);
        assert_eq!(stats.cache_hit_rate, 1.0); // Last operation was a cache hit
    }

    #[test]
    fn test_get_string_existing() {
        // A stored string can be fetched back through its reference.
        let mut dedup = SimpleDataDeduplicator::new();
        let test_str = "retrievable string";

        let dedup_ref = dedup
            .deduplicate_string(test_str)
            .expect("Failed to deduplicate string");

        let retrieved = dedup
            .get_string(&dedup_ref)
            .expect("Failed to retrieve string");

        assert_eq!(*retrieved, test_str);
    }

    #[test]
    fn test_get_string_non_existing() {
        // An unknown hash resolves to the "not found" placeholder, not an error.
        let dedup = SimpleDataDeduplicator::new();

        let fake_ref = SimpleDeduplicatedString {
            hash: 999999,
            length: 10,
            ref_count: 1,
        };

        let retrieved = dedup
            .get_string(&fake_ref)
            .expect("Failed to retrieve string");

        assert_eq!(*retrieved, "not found");
    }

    #[test]
    fn test_calculate_hash_consistency() {
        // Hashing is repeatable, both on one instance and across instances.
        let dedup = SimpleDataDeduplicator::new();
        let other = SimpleDataDeduplicator::new();
        let test_str = "consistent hash test";

        let hash1 = dedup.calculate_hash(test_str);
        let hash2 = dedup.calculate_hash(test_str);

        assert_eq!(hash1, hash2);
        assert_eq!(hash1, other.calculate_hash(test_str));
    }

    #[test]
    fn test_calculate_hash_different_strings() {
        // Different strings produce different hashes (for these fixtures).
        let dedup = SimpleDataDeduplicator::new();

        let hash1 = dedup.calculate_hash("string one");
        let hash2 = dedup.calculate_hash("string two");

        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_default_implementation() {
        // `Default` behaves like `new`.
        assert_untouched(&SimpleDataDeduplicator::default());
    }

    #[test]
    fn test_multiple_string_deduplication() {
        // Mix of unique and repeated strings.
        let mut dedup = SimpleDataDeduplicator::new();

        let strings = [
            "first string",
            "second string",
            "first string", // duplicate
            "third string",
            "second string", // duplicate
            "first string",  // duplicate
        ];

        let results: Vec<SimpleDeduplicatedString> = strings
            .iter()
            .map(|s| {
                dedup
                    .deduplicate_string(s)
                    .expect("Failed to deduplicate string")
            })
            .collect();

        // Duplicates share a hash.
        assert_eq!(results[0].hash, results[2].hash);
        assert_eq!(results[0].hash, results[5].hash);
        assert_eq!(results[1].hash, results[4].hash);

        // Distinct strings do not.
        assert_ne!(results[0].hash, results[1].hash);
        assert_ne!(results[0].hash, results[3].hash);
        assert_ne!(results[1].hash, results[3].hash);

        // First sightings report 1, every repeat reports 2.
        let ref_counts: Vec<u32> = results.iter().map(|r| r.ref_count).collect();
        assert_eq!(ref_counts, [1, 1, 2, 1, 2, 2]);

        let stats = dedup.get_stats().expect("Failed to get stats");
        assert_eq!(stats.total_operations, 6);
    }

    #[test]
    fn test_empty_string_deduplication() {
        // The empty string is a valid input and round-trips.
        let mut dedup = SimpleDataDeduplicator::new();

        let result = dedup
            .deduplicate_string("")
            .expect("Failed to deduplicate empty string");

        assert_eq!(result.length, 0);
        assert_eq!(result.ref_count, 1);
        assert_eq!(result.hash, dedup.calculate_hash(""));

        let retrieved = dedup
            .get_string(&result)
            .expect("Failed to retrieve empty string");
        assert_eq!(*retrieved, "");
    }

    #[test]
    fn test_long_string_deduplication() {
        // Very long strings behave like short ones.
        let mut dedup = SimpleDataDeduplicator::new();
        let long_str = "a".repeat(10000);

        let result = dedup
            .deduplicate_string(&long_str)
            .expect("Failed to deduplicate long string");

        assert_eq!(result.length, 10000);
        assert_eq!(result.ref_count, 1);

        // Deduplicate again to verify caching works
        let result2 = dedup
            .deduplicate_string(&long_str)
            .expect("Failed to deduplicate long string again");

        assert_eq!(result.hash, result2.hash);
        assert_eq!(result2.ref_count, 2);
    }

    #[test]
    fn test_unicode_string_deduplication() {
        // Length is measured in bytes, and non-ASCII text round-trips intact.
        let mut dedup = SimpleDataDeduplicator::new();
        let unicode_str = "Hello World 🌍";

        let result = dedup
            .deduplicate_string(unicode_str)
            .expect("Failed to deduplicate unicode string");

        assert_eq!(result.length, unicode_str.len());

        let retrieved = dedup
            .get_string(&result)
            .expect("Failed to retrieve unicode string");

        assert_eq!(*retrieved, unicode_str);
    }

    #[test]
    fn test_stats_accuracy() {
        // Three unique strings followed by two repeats.
        let mut dedup = SimpleDataDeduplicator::new();

        for input in ["unique1", "unique2", "unique3", "unique1", "unique2"] {
            dedup
                .deduplicate_string(input)
                .expect("Failed to deduplicate");
        }

        let stats = dedup.get_stats().expect("Failed to get stats");
        assert_eq!(stats.total_operations, 5);
        // Last operation was a cache hit, so cache_hit_rate should be 1.0
        assert_eq!(stats.cache_hit_rate, 1.0);
    }
}
348}