use anyhow::{Context, Result};
use flate2::read::GzDecoder;
use flate2::write::GzEncoder;
use flate2::Compression;
use glob::glob;
use std::fs::{self, File};
use std::io;
use std::path::{Path, PathBuf};
use uuid::Uuid;

use crate::config::Config;
use crate::index::{FileEntry, IndexStore};
use crate::dedup::ContentDeduplicator;
use crate::delta::DeltaStorage;

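/// Coordinates compressed storage, content deduplication, and delta
/// compression behind a single `IndexStore`-backed catalog of files.
///
/// A minimal usage sketch (hypothetical: assumes `Config` implements
/// `Default` and that a `SqliteIndex` type implementing `IndexStore`
/// exists elsewhere in the crate):
///
/// ```ignore
/// let config = Config::default();
/// let index = Box::new(SqliteIndex::open(&config.index_path)?);
/// let mut manager = StorageManager::new(config, index);
/// manager.store_file(Path::new("notes.txt"), false)?;
/// ```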
pub struct StorageManager {
    config: Config,
    index: Box<dyn IndexStore>,
    deduplicator: ContentDeduplicator,
    delta_storage: DeltaStorage,
}

impl StorageManager {
    pub fn new(config: Config, index: Box<dyn IndexStore>) -> Self {
        let deduplicator = ContentDeduplicator::new();
        let delta_storage = DeltaStorage::new(
            config.similarity_threshold,
            config.delta_algorithm.clone(),
        );

        let mut manager = Self {
            config,
            index,
            deduplicator,
            delta_storage,
        };

        if let Err(e) = manager.rebuild_dedup_state() {
            eprintln!("Warning: Failed to rebuild deduplication state: {}", e);
        }

        manager
    }

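    /// Compresses `file_path` into storage. Exact-duplicate content becomes a
    /// reference entry, sufficiently similar content becomes a delta against
    /// an existing base file, and everything else is stored as a new base file.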
    pub fn store_file(&mut self, file_path: &Path, delete_source: bool) -> Result<()> {
        if !file_path.exists() {
            return Err(anyhow::anyhow!("File does not exist: {}", file_path.display()));
        }

        if !file_path.is_file() {
            return Err(anyhow::anyhow!("Path is not a file: {}", file_path.display()));
        }

        if self.index.get_file(file_path)?.is_some() {
            println!("File already stored: {}", file_path.display());
            if delete_source {
                fs::remove_file(file_path)
                    .context("Failed to delete source file")?;
                println!("Source file deleted: {}", file_path.display());
            }
            return Ok(());
        }

        let file_content = fs::read(file_path)
            .context("Failed to read file for hashing")?;
        let file_hash = ContentDeduplicator::calculate_hash(&file_content);

        if self.config.enable_deduplication {
            if let Some(existing_entry) = self.find_file_by_hash(&file_hash)? {
                let entry = self.create_reference_entry(file_path, &existing_entry)?;
                self.index.add_file(entry)?;

                self.deduplicator.add_hash_reference(&file_hash, &existing_entry.id);

                if delete_source {
                    fs::remove_file(file_path)
                        .context("Failed to delete source file")?;
                    println!("Source file deleted: {}", file_path.display());
                }

                println!("File deduplicated (reference created): {}", file_path.display());
                println!("References existing file with hash: {}", file_hash);
                return Ok(());
            }
        }

        if self.config.enable_delta_compression {
            if let Some((base_entry, similarity)) = self.find_similar_file(&file_content)? {
                if similarity >= self.config.similarity_threshold {
                    return self.store_as_delta(file_path, &file_content, &base_entry, similarity, delete_source);
                }
            }
        }

        self.store_as_base_file(file_path, &file_content, file_hash, delete_source)
    }

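    /// Extracts a stored file back to its original path ("owe" is the inverse
    /// of `store_file`) and removes it from the index, deleting the stored
    /// artifact when no other entries still reference it.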
    pub fn owe_file(&mut self, file_path: &Path) -> Result<()> {
        let entry = self.index.get_file(file_path)?
            .ok_or_else(|| anyhow::anyhow!("File not found in storage: {}", file_path.display()))?;

        if entry.is_reference.unwrap_or(false) {
            self.extract_reference_file(&entry)?;
        } else if entry.is_delta.unwrap_or(false) {
            self.extract_delta_file(&entry)?;
        } else {
            self.decompress_file(&entry.stored_path, &entry.original_path)
                .context("Failed to decompress file")?;

            let should_delete_from_dedup = if let Some(hash) = &entry.hash {
                self.deduplicator.remove_hash_reference(hash)
            } else {
                true
            };

            let has_references = self.has_references_to_storage(&entry.id)?;

            if should_delete_from_dedup && !has_references && entry.stored_path.exists() {
                fs::remove_file(&entry.stored_path)
                    .context("Failed to remove stored file")?;
            }
        }

        self.index.remove_file(file_path)?;

        println!("File extracted successfully: {}", file_path.display());
        Ok(())
    }

    pub fn list_files(&self) -> Result<Vec<FileEntry>> {
        self.index.list_files()
    }

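    /// Returns stored entries whose original path matches `pattern`, treating
    /// it as a glob when it parses and as a plain substring otherwise.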
    pub fn search_files(&self, pattern: &str) -> Result<Vec<FileEntry>> {
        let all_files = self.index.list_files()?;
        let mut matching_files = Vec::new();

        for file_entry in all_files {
            let path_str = file_entry.original_path.to_string_lossy();

            if let Ok(matcher) = glob::Pattern::new(pattern) {
                if matcher.matches(&path_str) {
                    matching_files.push(file_entry);
                }
            } else if path_str.contains(pattern) {
                matching_files.push(file_entry);
            }
        }

        Ok(matching_files)
    }

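    /// Renames a stored file's original path in the index; the stored
    /// artifact on disk is untouched.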
    pub fn rename_file(&mut self, old_path: &Path, new_path: &Path) -> Result<()> {
        if self.index.get_file(old_path)?.is_none() {
            return Err(anyhow::anyhow!("File not found in storage: {}", old_path.display()));
        }

        if self.index.get_file(new_path)?.is_some() {
            return Err(anyhow::anyhow!("Target file already exists: {}", new_path.display()));
        }

        self.index.rename_file(old_path, new_path)
            .context("Failed to rename file in index")?;

        println!("File renamed: {} -> {}", old_path.display(), new_path.display());
        Ok(())
    }

    pub fn move_file(&mut self, file_path: &Path, new_location: &Path) -> Result<()> {
        if self.index.get_file(file_path)?.is_none() {
            return Err(anyhow::anyhow!("File not found in storage: {}", file_path.display()));
        }

        let filename = file_path.file_name()
            .ok_or_else(|| anyhow::anyhow!("Invalid file path"))?;
        let new_path = new_location.join(filename);

        if self.index.get_file(&new_path)?.is_some() {
            return Err(anyhow::anyhow!("Target file already exists: {}", new_path.display()));
        }

        self.index.move_file(file_path, &new_path)
            .context("Failed to move file in index")?;

        println!("File moved: {} -> {}", file_path.display(), new_path.display());
        Ok(())
    }

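    /// Removes a file from the index and deletes its stored artifact without
    /// extracting it first.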
    pub fn delete_file(&mut self, file_path: &Path) -> Result<()> {
        let entry = self.index.remove_file(file_path)?
            .ok_or_else(|| anyhow::anyhow!("File not found in storage: {}", file_path.display()))?;

        if entry.stored_path.exists() {
            fs::remove_file(&entry.stored_path)
                .context("Failed to remove stored file")?;
        }

        println!("File deleted from storage: {}", file_path.display());
        Ok(())
    }

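    /// Decompresses a stored artifact to `output_path`, picking the algorithm
    /// from the artifact's file extension (`gz`, `zst`, or `lz4`).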
    fn decompress_file(&self, input_path: &Path, output_path: &Path) -> Result<()> {
        let algorithm = if let Some(ext) = input_path.extension() {
            match ext.to_str() {
                Some("gz") => crate::config::CompressionAlgorithm::Gzip,
                Some("zst") => crate::config::CompressionAlgorithm::Zstd,
                Some("lz4") => crate::config::CompressionAlgorithm::Lz4,
                _ => return Err(anyhow::anyhow!("Unsupported file extension: {:?}", ext)),
            }
        } else {
            return Err(anyhow::anyhow!("No file extension found"));
        };

        match algorithm {
            crate::config::CompressionAlgorithm::Gzip => {
                self.decompress_file_gzip(input_path, output_path)
            }
            crate::config::CompressionAlgorithm::Zstd => {
                self.decompress_file_zstd(input_path, output_path)
            }
            crate::config::CompressionAlgorithm::Lz4 => {
                self.decompress_file_lz4(input_path, output_path)
            }
        }
    }

    fn decompress_file_gzip(&self, input_path: &Path, output_path: &Path) -> Result<()> {
        let input_file = File::open(input_path)
            .context("Failed to open compressed file")?;
        let mut decoder = GzDecoder::new(input_file);

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        let mut output_file = File::create(output_path)
            .context("Failed to create output file")?;

        io::copy(&mut decoder, &mut output_file)
            .context("Failed to decompress file")?;

        Ok(())
    }

    fn decompress_file_zstd(&self, input_path: &Path, output_path: &Path) -> Result<()> {
        let compressed_data = fs::read(input_path)
            .context("Failed to read compressed file")?;

        let decompressed_data = zstd::decode_all(compressed_data.as_slice())
            .context("Failed to decompress with zstd")?;

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        fs::write(output_path, decompressed_data)
            .context("Failed to write decompressed file")?;

        Ok(())
    }

    fn decompress_file_lz4(&self, input_path: &Path, output_path: &Path) -> Result<()> {
        let compressed_data = fs::read(input_path)
            .context("Failed to read compressed file")?;

        let decompressed_data = lz4_flex::decompress_size_prepended(&compressed_data)
            .context("Failed to decompress with lz4")?;

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        fs::write(output_path, decompressed_data)
            .context("Failed to write decompressed file")?;

        Ok(())
    }

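    /// Stores every file named by a list file. Each non-empty line is an
    /// include path or glob pattern; lines starting with `#` are comments and
    /// lines starting with `!` are exclude patterns.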
    pub fn store_files_from_list(&mut self, list_file: &Path, delete_source: bool) -> Result<()> {
        let content = fs::read_to_string(list_file)
            .context("Failed to read file list")?;

        let mut include_patterns = Vec::new();
        let mut exclude_patterns = Vec::new();

        for line in content.lines() {
            let line = line.trim();
            if !line.is_empty() && !line.starts_with('#') {
                if line.starts_with('!') {
                    exclude_patterns.push(&line[1..]);
                } else {
                    include_patterns.push(line);
                }
            }
        }

        let mut all_files = Vec::new();

        for pattern in include_patterns {
            if pattern.contains('*') || pattern.contains('?') || pattern.contains('[') {
                match self.process_glob_pattern(pattern) {
                    Ok(files) => {
                        all_files.extend(files);
                    }
                    Err(e) => {
                        eprintln!("Failed to process glob pattern '{}': {}", pattern, e);
                    }
                }
            } else {
                let file_path = PathBuf::from(pattern);
                if file_path.exists() {
                    all_files.push(file_path);
                }
            }
        }

        let filtered_files = self.apply_exclude_patterns(all_files, &exclude_patterns)?;

        if self.config.multithread > 1 && filtered_files.len() > 1 {
            self.store_files_parallel(filtered_files, delete_source)?;
        } else {
            for file_path in filtered_files {
                if let Err(e) = self.store_file(&file_path, delete_source) {
                    eprintln!("Failed to store {}: {}", file_path.display(), e);
                }
            }
        }

        Ok(())
    }

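    /// Extracts every stored file named by a list file, using the same line
    /// format as `store_files_from_list` but matching patterns against the
    /// index instead of the filesystem.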
    pub fn owe_files_from_list(&mut self, list_file: &Path) -> Result<()> {
        let content = fs::read_to_string(list_file)
            .context("Failed to read file list")?;

        let mut include_patterns = Vec::new();
        let mut exclude_patterns = Vec::new();

        for line in content.lines() {
            let line = line.trim();
            if !line.is_empty() && !line.starts_with('#') {
                if line.starts_with('!') {
                    exclude_patterns.push(&line[1..]);
                } else {
                    include_patterns.push(line);
                }
            }
        }

        let mut all_files = Vec::new();

        for pattern in include_patterns {
            if pattern.contains('*') || pattern.contains('?') || pattern.contains('[') {
                match self.find_stored_files_by_pattern(pattern) {
                    Ok(files) => {
                        all_files.extend(files);
                    }
                    Err(e) => {
                        eprintln!("Failed to process pattern '{}': {}", pattern, e);
                    }
                }
            } else {
                let file_path = PathBuf::from(pattern);
                if self.index.get_file(&file_path)?.is_some() {
                    all_files.push(file_path);
                }
            }
        }

        let filtered_files = self.apply_exclude_patterns_to_stored(all_files, &exclude_patterns)?;

        if self.config.multithread > 1 && filtered_files.len() > 1 {
            self.owe_files_parallel(filtered_files)?;
        } else {
            for file_path in filtered_files {
                if let Err(e) = self.owe_file(&file_path) {
                    eprintln!("Failed to owe {}: {}", file_path.display(), e);
                }
            }
        }

        Ok(())
    }

    fn process_glob_pattern(&self, pattern: &str) -> Result<Vec<PathBuf>> {
        let mut files = Vec::new();

        for entry in glob(pattern).context("Failed to parse glob pattern")? {
            match entry {
                Ok(path) => {
                    if path.is_file() {
                        files.push(path);
                    }
                }
                Err(e) => {
                    eprintln!("Error reading path: {}", e);
                }
            }
        }

        if files.is_empty() {
            println!("No files matched pattern: {}", pattern);
        } else {
            println!("Found {} files matching pattern: {}", files.len(), pattern);
        }

        Ok(files)
    }

    fn find_stored_files_by_pattern(&self, pattern: &str) -> Result<Vec<PathBuf>> {
        let stored_files = self.index.list_files()?;
        let mut matching_files = Vec::new();

        let regex_pattern = self.glob_to_regex(pattern)?;
        let regex = regex::Regex::new(&regex_pattern)
            .context("Failed to compile regex pattern")?;

        for entry in stored_files {
            let path_str = entry.original_path.to_string_lossy();
            if regex.is_match(&path_str) {
                matching_files.push(entry.original_path);
            }
        }

        if matching_files.is_empty() {
            println!("No stored files matched pattern: {}", pattern);
        } else {
            println!("Found {} stored files matching pattern: {}", matching_files.len(), pattern);
        }

        Ok(matching_files)
    }

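    /// Translates a glob pattern into an anchored regex string: `**` matches
    /// across path separators (`.*`), while `*` and `?` stop at `/` or `\`,
    /// so e.g. `src/**/*.rs` becomes `^src[/\\].*[/\\][^/\\]*\.rs$`.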
    pub fn glob_to_regex(&self, pattern: &str) -> Result<String> {
        let mut regex = String::new();
        let chars: Vec<char> = pattern.chars().collect();
        let mut i = 0;

        regex.push('^');

        while i < chars.len() {
            match chars[i] {
                '*' => {
                    if i + 1 < chars.len() && chars[i + 1] == '*' {
                        regex.push_str(".*");
                        i += 1;
                    } else {
                        regex.push_str(r"[^/\\]*");
                    }
                }
                '?' => {
                    regex.push_str(r"[^/\\]");
                }
                '[' => {
                    regex.push('[');
                }
                ']' => {
                    regex.push(']');
                }
                '\\' | '/' => {
                    regex.push_str(r"[/\\]");
                }
                c if "^$(){}|+.".contains(c) => {
                    regex.push('\\');
                    regex.push(c);
                }
                c => {
                    regex.push(c);
                }
            }
            i += 1;
        }

        regex.push('$');
        Ok(regex)
    }

    fn apply_exclude_patterns(&self, files: Vec<PathBuf>, exclude_patterns: &[&str]) -> Result<Vec<PathBuf>> {
        if exclude_patterns.is_empty() {
            return Ok(files);
        }

        let original_count = files.len();
        let mut filtered_files = Vec::new();

        for file_path in files {
            let mut should_exclude = false;

            for pattern in exclude_patterns {
                if self.matches_pattern(&file_path, pattern)? {
                    should_exclude = true;
                    break;
                }
            }

            if !should_exclude {
                filtered_files.push(file_path);
            }
        }

        if original_count != filtered_files.len() {
            println!("Excluded {} files based on exclude patterns", original_count - filtered_files.len());
        }

        Ok(filtered_files)
    }

    fn apply_exclude_patterns_to_stored(&self, files: Vec<PathBuf>, exclude_patterns: &[&str]) -> Result<Vec<PathBuf>> {
        if exclude_patterns.is_empty() {
            return Ok(files);
        }

        let original_count = files.len();
        let mut filtered_files = Vec::new();

        for file_path in files {
            let mut should_exclude = false;

            for pattern in exclude_patterns {
                let regex_pattern = self.glob_to_regex(pattern)?;
                let regex = regex::Regex::new(&regex_pattern)
                    .context("Failed to compile exclude regex pattern")?;

                let path_str = file_path.to_string_lossy();
                if regex.is_match(&path_str) {
                    should_exclude = true;
                    break;
                }
            }

            if !should_exclude {
                filtered_files.push(file_path);
            }
        }

        if original_count != filtered_files.len() {
            println!("Excluded {} stored files based on exclude patterns", original_count - filtered_files.len());
        }

        Ok(filtered_files)
    }

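    /// Checks whether `file_path` is excluded by `pattern` by expanding the
    /// glob against the filesystem and testing `file_path` for membership.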
    fn matches_pattern(&self, file_path: &Path, pattern: &str) -> Result<bool> {
        for entry in glob(pattern).context("Failed to parse glob pattern")? {
            match entry {
                Ok(path) => {
                    if path == file_path {
                        return Ok(true);
                    }
                }
                Err(_) => continue,
            }
        }
        Ok(false)
    }

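    /// Extracts every stored file back to its original location, reporting
    /// per-file successes and failures as it goes.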
    pub fn owe_all_files(&mut self) -> Result<()> {
        let files = self.index.list_files()?;

        if files.is_empty() {
            println!("No files stored.");
            return Ok(());
        }

        println!("Extracting {} stored files...", files.len());

        for entry in files {
            match self.owe_file(&entry.original_path) {
                Ok(()) => {
                    println!("✓ Extracted: {}", entry.original_path.display());
                }
                Err(e) => {
                    eprintln!("✗ Failed to extract {}: {}", entry.original_path.display(), e);
                }
            }
        }

        println!("Extraction complete.");
        Ok(())
    }

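    /// Despite its name, this processes files sequentially: deduplication and
    /// delta compression depend on earlier files already being indexed, so
    /// storing cannot safely run in parallel.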
    fn store_files_parallel(&mut self, files: Vec<PathBuf>, delete_source: bool) -> Result<()> {
        println!("Processing {} files sequentially to enable deduplication and delta compression...", files.len());

        let mut success_count = 0;
        for file_path in files {
            match self.store_file(&file_path, delete_source) {
                Ok(()) => {
                    success_count += 1;
                }
                Err(e) => {
                    eprintln!("Failed to store {}: {}", file_path.display(), e);
                }
            }
        }

        println!("Stored {} files with deduplication and delta compression enabled", success_count);
        Ok(())
    }

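    /// Extracts files concurrently on a rayon thread pool. Decompression runs
    /// in parallel via the static helpers, which handle plain compressed
    /// entries only; index updates happen afterwards on the calling thread.
    /// The global pool build result is ignored because it fails if a pool has
    /// already been initialized.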
    fn owe_files_parallel(&mut self, files: Vec<PathBuf>) -> Result<()> {
        use rayon::prelude::*;

        let _ = rayon::ThreadPoolBuilder::new()
            .num_threads(self.config.multithread)
            .build_global();

        let mut entries = Vec::new();
        for file_path in &files {
            if let Some(entry) = self.index.get_file(file_path)? {
                entries.push(entry);
            }
        }

        let results: Vec<Result<PathBuf>> = entries
            .par_iter()
            .map(|entry| {
                Self::decompress_file_static(&entry.stored_path, &entry.original_path)
                    .map(|_| entry.original_path.clone())
            })
            .collect();

        let mut success_count = 0;
        for (i, result) in results.into_iter().enumerate() {
            match result {
                Ok(file_path) => {
                    if let Err(e) = fs::remove_file(&entries[i].stored_path) {
                        eprintln!("Failed to remove stored file {}: {}", entries[i].stored_path.display(), e);
                    }

                    if let Err(e) = self.index.remove_file(&file_path) {
                        eprintln!("Failed to remove from index {}: {}", file_path.display(), e);
                    } else {
                        success_count += 1;
                        println!("File extracted successfully: {}", file_path.display());
                    }
                }
                Err(e) => {
                    eprintln!("Failed to extract file: {}", e);
                }
            }
        }

        println!("Extracted {} files using {} threads", success_count, self.config.multithread);
        Ok(())
    }

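    /// Static counterpart of `decompress_file` for use inside rayon workers,
    /// where no `&self` is available; dispatches on the artifact's extension.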
    fn decompress_file_static(input_path: &Path, output_path: &Path) -> Result<()> {
        let algorithm = if let Some(ext) = input_path.extension() {
            match ext.to_str() {
                Some("gz") => crate::config::CompressionAlgorithm::Gzip,
                Some("zst") => crate::config::CompressionAlgorithm::Zstd,
                Some("lz4") => crate::config::CompressionAlgorithm::Lz4,
                _ => return Err(anyhow::anyhow!("Unsupported file extension: {:?}", ext)),
            }
        } else {
            return Err(anyhow::anyhow!("No file extension found"));
        };

        match algorithm {
            crate::config::CompressionAlgorithm::Gzip => {
                Self::decompress_file_gzip_static(input_path, output_path)
            }
            crate::config::CompressionAlgorithm::Zstd => {
                Self::decompress_file_zstd_static(input_path, output_path)
            }
            crate::config::CompressionAlgorithm::Lz4 => {
                Self::decompress_file_lz4_static(input_path, output_path)
            }
        }
    }

    fn decompress_file_gzip_static(input_path: &Path, output_path: &Path) -> Result<()> {
        let input_file = File::open(input_path)
            .context("Failed to open compressed file")?;
        let mut decoder = GzDecoder::new(input_file);

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        let mut output_file = File::create(output_path)
            .context("Failed to create output file")?;

        io::copy(&mut decoder, &mut output_file)
            .context("Failed to decompress file")?;

        Ok(())
    }

    fn decompress_file_zstd_static(input_path: &Path, output_path: &Path) -> Result<()> {
        let compressed_data = fs::read(input_path)
            .context("Failed to read compressed file")?;

        let decompressed_data = zstd::decode_all(compressed_data.as_slice())
            .context("Failed to decompress with zstd")?;

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        fs::write(output_path, decompressed_data)
            .context("Failed to write decompressed file")?;

        Ok(())
    }

    fn decompress_file_lz4_static(input_path: &Path, output_path: &Path) -> Result<()> {
        let compressed_data = fs::read(input_path)
            .context("Failed to read compressed file")?;

        let decompressed_data = lz4_flex::decompress_size_prepended(&compressed_data)
            .context("Failed to decompress with lz4")?;

        if let Some(parent) = output_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        fs::write(output_path, decompressed_data)
            .context("Failed to write decompressed file")?;

        Ok(())
    }

    pub fn get_dedup_stats(&self) -> crate::dedup::DedupStats {
        self.deduplicator.get_stats()
    }

    pub fn get_delta_stats(&self) -> crate::delta::DeltaStats {
        self.delta_storage.get_stats()
    }

    pub fn is_dedup_enabled(&self) -> bool {
        self.config.enable_deduplication
    }

    pub fn is_delta_enabled(&self) -> bool {
        self.config.enable_delta_compression
    }

    pub fn get_similarity_threshold(&self) -> f32 {
        self.config.similarity_threshold
    }

    fn find_file_by_hash(&self, hash: &str) -> Result<Option<FileEntry>> {
        let all_files = self.index.list_files()?;
        for file in all_files {
            if let Some(file_hash) = &file.hash {
                if file_hash == hash {
                    if !file.is_reference.unwrap_or(false) && !file.is_delta.unwrap_or(false) {
                        return Ok(Some(file));
                    }
                }
            }
        }
        Ok(None)
    }

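    /// Scans all stored base files (skipping references and deltas) and
    /// returns the one most similar to `content`, together with its
    /// similarity score; the caller checks the score against the threshold.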
    fn find_similar_file(&self, content: &[u8]) -> Result<Option<(FileEntry, f32)>> {
        let all_files = self.index.list_files()?;
        let mut best_match: Option<(FileEntry, f32)> = None;

        for file in all_files {
            if file.is_reference.unwrap_or(false) || file.is_delta.unwrap_or(false) {
                continue;
            }

            if let Ok(stored_content) = self.read_stored_file_content(&file) {
                let similarity = self.delta_storage.calculate_similarity(content, &stored_content);

                if let Some((_, current_best)) = &best_match {
                    if similarity > *current_best {
                        best_match = Some((file, similarity));
                    }
                } else {
                    best_match = Some((file, similarity));
                }
            }
        }

        Ok(best_match)
    }

    fn read_stored_file_content(&self, entry: &FileEntry) -> Result<Vec<u8>> {
        let compressed_data = fs::read(&entry.stored_path)
            .context("Failed to read stored file")?;

        match entry.compression_algorithm {
            crate::config::CompressionAlgorithm::Gzip => {
                let mut decoder = GzDecoder::new(compressed_data.as_slice());
                let mut content = Vec::new();
                std::io::Read::read_to_end(&mut decoder, &mut content)
                    .context("Failed to decompress gzip file")?;
                Ok(content)
            }
            crate::config::CompressionAlgorithm::Zstd => {
                zstd::decode_all(compressed_data.as_slice())
                    .context("Failed to decompress zstd file")
            }
            crate::config::CompressionAlgorithm::Lz4 => {
                lz4_flex::decompress_size_prepended(&compressed_data)
                    .context("Failed to decompress lz4 file")
            }
        }
    }

    fn create_reference_entry(&self, file_path: &Path, existing_entry: &FileEntry) -> Result<FileEntry> {
        let id = Uuid::new_v4().to_string();
        let mut entry = FileEntry::new(
            id,
            file_path.to_path_buf(),
            existing_entry.stored_path.clone(),
            existing_entry.file_size,
            0,
            existing_entry.compression_algorithm.clone(),
        );

        entry.is_reference = Some(true);
        entry.base_storage_id = Some(existing_entry.id.clone());
        entry.hash = existing_entry.hash.clone();

        Ok(entry)
    }

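    /// Stores `content` as a compressed binary delta against `base_entry`
    /// rather than as a full copy, recording the base's storage ID and the
    /// similarity score so the file can be reconstructed on extraction.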
    fn store_as_delta(
        &mut self,
        file_path: &Path,
        content: &[u8],
        base_entry: &FileEntry,
        similarity: f32,
        delete_source: bool,
    ) -> Result<()> {
        let base_content = self.read_stored_file_content(base_entry)?;

        let delta_data = self.delta_storage.create_delta(&base_content, content)?;

        let id = Uuid::new_v4().to_string();
        let extension = self.config.compression_algorithm.file_extension();
        let stored_filename = format!("{}.{}", id, extension);
        let stored_path = self.config.storage_path.join(&stored_filename);

        fs::create_dir_all(&self.config.storage_path)
            .context("Failed to create storage directory")?;

        let compressed_size = self.compress_data(&delta_data, &stored_path)
            .context("Failed to compress delta data")?;

        let mut entry = FileEntry::new(
            id,
            file_path.to_path_buf(),
            stored_path,
            content.len() as u64,
            compressed_size,
            self.config.compression_algorithm.clone(),
        );

        entry.is_delta = Some(true);
        entry.base_storage_id = Some(base_entry.id.clone());
        entry.similarity_score = Some(similarity);
        entry.hash = Some(ContentDeduplicator::calculate_hash(content));

        self.index.add_file(entry)
            .context("Failed to add delta file to index")?;

        if delete_source {
            fs::remove_file(file_path)
                .context("Failed to delete source file")?;
            println!("Source file deleted: {}", file_path.display());
        }

        println!("File stored as delta: {}", file_path.display());
        println!("Similarity: {:.1}%, Delta size: {:.1}%",
            similarity * 100.0,
            (compressed_size as f64 / content.len() as f64) * 100.0);

        Ok(())
    }

    fn store_as_base_file(
        &mut self,
        file_path: &Path,
        content: &[u8],
        hash: String,
        delete_source: bool,
    ) -> Result<()> {
        let id = Uuid::new_v4().to_string();
        let extension = self.config.compression_algorithm.file_extension();
        let stored_filename = format!("{}.{}", id, extension);
        let stored_path = self.config.storage_path.join(&stored_filename);

        fs::create_dir_all(&self.config.storage_path)
            .context("Failed to create storage directory")?;

        let compressed_size = self.compress_data(content, &stored_path)
            .context("Failed to compress file")?;

        let mut entry = FileEntry::new(
            id.clone(),
            file_path.to_path_buf(),
            stored_path,
            content.len() as u64,
            compressed_size,
            self.config.compression_algorithm.clone(),
        );

        entry.hash = Some(hash.clone());

        if self.config.enable_deduplication {
            self.deduplicator.register_file(hash, id);
        }

        self.index.add_file(entry)
            .context("Failed to add file to index")?;

        if delete_source {
            fs::remove_file(file_path)
                .context("Failed to delete source file")?;
            println!("Source file deleted: {}", file_path.display());
        }

        println!("File stored successfully: {}", file_path.display());
        println!("Compression ratio: {:.1}%",
            (compressed_size as f64 / content.len() as f64) * 100.0);

        Ok(())
    }

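    /// Compresses `data` to `output_path` with the configured algorithm and
    /// returns the compressed size in bytes.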
    fn compress_data(&self, data: &[u8], output_path: &Path) -> Result<u64> {
        match self.config.compression_algorithm {
            crate::config::CompressionAlgorithm::Gzip => {
                let output_file = File::create(output_path)
                    .context("Failed to create output file")?;
                let mut encoder = GzEncoder::new(output_file, Compression::new(self.config.compression_level as u32));
                std::io::Write::write_all(&mut encoder, data)
                    .context("Failed to write compressed data")?;
                encoder.finish()
                    .context("Failed to finish compression")?;

                Ok(fs::metadata(output_path)?.len())
            }
            crate::config::CompressionAlgorithm::Zstd => {
                let compressed_data = zstd::encode_all(data, self.config.compression_level as i32)
                    .context("Failed to compress with zstd")?;
                fs::write(output_path, &compressed_data)
                    .context("Failed to write compressed file")?;

                Ok(compressed_data.len() as u64)
            }
            crate::config::CompressionAlgorithm::Lz4 => {
                let compressed_data = lz4_flex::compress_prepend_size(data);
                fs::write(output_path, &compressed_data)
                    .context("Failed to write compressed file")?;

                Ok(compressed_data.len() as u64)
            }
        }
    }

    fn extract_reference_file(&mut self, entry: &FileEntry) -> Result<()> {
        self.decompress_file(&entry.stored_path, &entry.original_path)
            .context("Failed to decompress reference file")?;

        if let Some(base_storage_id) = &entry.base_storage_id {
            let has_other_references = self.has_other_references_to_storage(base_storage_id, &entry.original_path)?;

            let should_delete_from_dedup = if let Some(hash) = &entry.hash {
                self.deduplicator.remove_hash_reference(hash)
            } else {
                false
            };

            if !has_other_references && should_delete_from_dedup && entry.stored_path.exists() {
                fs::remove_file(&entry.stored_path)
                    .context("Failed to remove stored file")?;
            }
        }

        Ok(())
    }

    fn extract_delta_file(&mut self, entry: &FileEntry) -> Result<()> {
        let base_storage_id = entry.base_storage_id.as_ref()
            .ok_or_else(|| anyhow::anyhow!("Delta file missing base storage ID"))?;

        let base_entry = self.find_file_by_storage_id(base_storage_id)?
            .ok_or_else(|| anyhow::anyhow!("Base file not found for delta: {}", base_storage_id))?;

        let base_content = self.read_stored_file_content(&base_entry)?;

        let delta_data = self.read_stored_file_content(entry)?;

        let reconstructed_content = self.delta_storage.apply_delta(&base_content, &delta_data)?;

        if let Some(parent) = entry.original_path.parent() {
            fs::create_dir_all(parent)
                .context("Failed to create output directory")?;
        }

        fs::write(&entry.original_path, reconstructed_content)
            .context("Failed to write reconstructed file")?;

        if entry.stored_path.exists() {
            fs::remove_file(&entry.stored_path)
                .context("Failed to remove delta file")?;
        }

        Ok(())
    }

    fn find_file_by_storage_id(&self, storage_id: &str) -> Result<Option<FileEntry>> {
        let all_files = self.index.list_files()?;
        for file in all_files {
            if file.id == storage_id {
                return Ok(Some(file));
            }
        }
        Ok(None)
    }

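    /// Rebuilds the in-memory deduplication table from the index at startup,
    /// counting how many entries reference each base file's content hash.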
    fn rebuild_dedup_state(&mut self) -> Result<()> {
        let all_files = self.index.list_files()?;
        let mut dedup_entries = Vec::new();

        for file in all_files {
            if let Some(hash) = &file.hash {
                if !file.is_reference.unwrap_or(false) && !file.is_delta.unwrap_or(false) {
                    let ref_count = self.count_references_for_hash(hash)?;
                    dedup_entries.push((file.id.clone(), hash.clone(), ref_count));
                }
            }
        }

        self.deduplicator.rebuild_from_index(dedup_entries)?;
        Ok(())
    }

    fn count_references_for_hash(&self, target_hash: &str) -> Result<u32> {
        let all_files = self.index.list_files()?;
        let mut count = 0;

        for file in all_files {
            if let Some(hash) = &file.hash {
                if hash == target_hash {
                    count += 1;
                }
            }
        }

        Ok(count)
    }

    fn has_references_to_storage(&self, storage_id: &str) -> Result<bool> {
        let all_files = self.index.list_files()?;

        for file in all_files {
            if file.is_reference.unwrap_or(false) || file.is_delta.unwrap_or(false) {
                if let Some(base_id) = &file.base_storage_id {
                    if base_id == storage_id {
                        return Ok(true);
                    }
                }
            }
        }

        Ok(false)
    }

    fn has_other_references_to_storage(&self, storage_id: &str, exclude_path: &Path) -> Result<bool> {
        let all_files = self.index.list_files()?;

        for file in all_files {
            if file.original_path == exclude_path {
                continue;
            }

            if file.is_reference.unwrap_or(false) || file.is_delta.unwrap_or(false) {
                if let Some(base_id) = &file.base_storage_id {
                    if base_id == storage_id {
                        return Ok(true);
                    }
                }
            }
        }

        Ok(false)
    }
}