1use std::collections::HashMap;
2use sha2::{Sha256, Digest};
3use serde::{Deserialize, Serialize};
4use anyhow::Result;
5
/// In-memory index used to detect identical content and count how many
/// logical files share one stored copy.
///
/// The three maps are kept in step by the methods of this type: a content
/// hash resolves to a storage id, a storage id resolves back to its hash, and
/// every tracked storage id carries a reference count.
#[derive(Debug)]
pub struct ContentDeduplicator {
    /// Content hash -> storage id that holds the bytes for that hash.
    hash_to_storage: HashMap<String, String>,
    /// Storage id -> number of references currently held on it.
    ref_counts: HashMap<String, u32>,
    /// Storage id -> content hash (reverse of `hash_to_storage`).
    storage_to_hash: HashMap<String, String>,
}
19
20#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct DedupInfo {
23 pub is_reference: bool,
25 pub original_storage_id: Option<String>,
27 pub hash: String,
29 pub ref_count: u32,
31}
32
33impl ContentDeduplicator {
34 pub fn new() -> Self {
36 Self {
37 hash_to_storage: HashMap::new(),
38 ref_counts: HashMap::new(),
39 storage_to_hash: HashMap::new(),
40 }
41 }
42
43 pub fn calculate_hash(data: &[u8]) -> String {
45 let mut hasher = Sha256::new();
46 hasher.update(data);
47 format!("{:x}", hasher.finalize())
48 }
49
50 pub fn check_duplicate(&mut self, hash: &str) -> Option<String> {
54 if let Some(storage_id) = self.hash_to_storage.get(hash) {
55 *self.ref_counts.entry(storage_id.clone()).or_insert(0) += 1;
57 Some(storage_id.clone())
58 } else {
59 None
60 }
61 }
62
63 pub fn register_file(&mut self, hash: String, storage_id: String) {
67 self.hash_to_storage.insert(hash.clone(), storage_id.clone());
68 self.storage_to_hash.insert(storage_id.clone(), hash);
69 self.ref_counts.insert(storage_id, 1);
70 }
71
72 pub fn remove_reference(&mut self, storage_id: &str) -> bool {
77 if let Some(count) = self.ref_counts.get_mut(storage_id) {
78 *count -= 1;
79 if *count == 0 {
80 self.ref_counts.remove(storage_id);
82 if let Some(hash) = self.storage_to_hash.remove(storage_id) {
83 self.hash_to_storage.remove(&hash);
84 }
85 true } else {
87 false }
89 } else {
90 true }
92 }
93
94 pub fn remove_hash_reference(&mut self, hash: &str) -> bool {
99 if let Some(storage_id) = self.hash_to_storage.get(hash) {
100 let storage_id = storage_id.clone(); self.remove_reference(&storage_id)
102 } else {
103 true }
105 }
106
107 pub fn add_hash_reference(&mut self, hash: &str, storage_id: &str) {
111 if let Some(existing_storage_id) = self.hash_to_storage.get(hash) {
112 if existing_storage_id == storage_id {
114 *self.ref_counts.entry(storage_id.to_string()).or_insert(0) += 1;
116 }
117 } else {
118 self.hash_to_storage.insert(hash.to_string(), storage_id.to_string());
120 self.storage_to_hash.insert(storage_id.to_string(), hash.to_string());
121 *self.ref_counts.entry(storage_id.to_string()).or_insert(0) += 1;
122 }
123 }
124
125 pub fn get_dedup_info(&self, storage_id: &str) -> Option<DedupInfo> {
127 if let Some(hash) = self.storage_to_hash.get(storage_id) {
128 let ref_count = self.ref_counts.get(storage_id).copied().unwrap_or(0);
129 Some(DedupInfo {
130 is_reference: ref_count > 1,
131 original_storage_id: None, hash: hash.clone(),
133 ref_count,
134 })
135 } else {
136 None
137 }
138 }
139
140 pub fn get_reference_info(&self, hash: &str) -> Option<DedupInfo> {
142 if let Some(storage_id) = self.hash_to_storage.get(hash) {
143 let ref_count = self.ref_counts.get(storage_id).copied().unwrap_or(0);
144 Some(DedupInfo {
145 is_reference: true,
146 original_storage_id: Some(storage_id.clone()),
147 hash: hash.to_string(),
148 ref_count,
149 })
150 } else {
151 None
152 }
153 }
154
155 pub fn get_stats(&self) -> DedupStats {
157 let total_files = self.ref_counts.values().sum::<u32>();
158 let unique_files = self.ref_counts.len() as u32;
159 let duplicate_files = total_files.saturating_sub(unique_files);
160
161 DedupStats {
162 total_files,
163 unique_files,
164 duplicate_files,
165 dedup_ratio: if total_files > 0 {
166 duplicate_files as f32 / total_files as f32
167 } else {
168 0.0
169 },
170 }
171 }
172
173 pub fn rebuild_from_index(&mut self, entries: Vec<(String, String, u32)>) -> Result<()> {
175 self.hash_to_storage.clear();
177 self.ref_counts.clear();
178 self.storage_to_hash.clear();
179
180 for (storage_id, hash, ref_count) in entries {
181 self.hash_to_storage.insert(hash.clone(), storage_id.clone());
182 self.storage_to_hash.insert(storage_id.clone(), hash);
183 self.ref_counts.insert(storage_id, ref_count);
184 }
185
186 Ok(())
187 }
188}
189
190impl Default for ContentDeduplicator {
191 fn default() -> Self {
192 Self::new()
193 }
194}
195
/// Aggregate figures produced by `ContentDeduplicator::get_stats`.
#[derive(Debug, Clone)]
pub struct DedupStats {
    /// Sum of the reference counts of every tracked storage id.
    pub total_files: u32,
    /// Number of tracked storage ids.
    pub unique_files: u32,
    /// `total_files` minus `unique_files`, floored at zero.
    pub duplicate_files: u32,
    /// `duplicate_files / total_files`, or `0.0` when nothing is tracked.
    pub dedup_ratio: f32,
}
208
#[cfg(test)]
mod tests {
    use super::*;

    /// A hash is only reported as a duplicate once registered, and each hit
    /// takes an extra reference.
    #[test]
    fn test_deduplicator_basic() {
        let hash = String::from("abc123");
        let mut dedup = ContentDeduplicator::new();

        // Nothing registered yet, so there is nothing to deduplicate against.
        assert!(dedup.check_duplicate(&hash).is_none());

        dedup.register_file(hash.clone(), String::from("storage1"));

        // The lookup now hits and counts as a second reference.
        let hit = dedup.check_duplicate(&hash);
        assert_eq!(hit, Some(String::from("storage1")));

        let info = dedup
            .get_dedup_info("storage1")
            .expect("storage1 was registered above");
        assert_eq!(info.ref_count, 2);
    }

    /// The digest is rendered as 64 hexadecimal characters.
    #[test]
    fn test_hash_calculation() {
        let hash = ContentDeduplicator::calculate_hash(b"Hello, World!");
        assert!(!hash.is_empty());
        assert_eq!(hash.len(), 64);
    }

    /// With two references held, only the second removal reports that the
    /// content is no longer referenced.
    #[test]
    fn test_remove_reference() {
        let mut dedup = ContentDeduplicator::new();
        dedup.register_file(String::from("hash1"), String::from("storage1"));
        dedup.check_duplicate("hash1");

        let after_first = dedup.remove_reference("storage1");
        assert!(!after_first);

        let after_second = dedup.remove_reference("storage1");
        assert!(after_second);
    }

    /// Same as `test_remove_reference`, but addressing the content by hash.
    #[test]
    fn test_remove_reference_by_hash() {
        let mut dedup = ContentDeduplicator::new();
        dedup.register_file(String::from("hash1"), String::from("storage1"));
        dedup.check_duplicate("hash1");

        let after_first = dedup.remove_hash_reference("hash1");
        assert!(!after_first);

        let after_second = dedup.remove_hash_reference("hash1");
        assert!(after_second);
    }

    /// Adding a reference by hash bumps an existing entry and creates the
    /// mapping for an unknown hash.
    #[test]
    fn test_add_reference_by_hash() {
        let mut dedup = ContentDeduplicator::new();
        dedup.register_file(String::from("hash1"), String::from("storage1"));

        // Known hash at the same storage id: one more reference.
        dedup.add_hash_reference("hash1", "storage1");
        let info = dedup
            .get_dedup_info("storage1")
            .expect("storage1 was registered above");
        assert_eq!(info.ref_count, 2);

        // Unknown hash: a new mapping is recorded.
        dedup.add_hash_reference("hash2", "storage2");
        let expected = String::from("storage2");
        assert_eq!(dedup.hash_to_storage.get("hash2"), Some(&expected));
    }
}