cooklang_sync_client/
chunker.rs

1use log::trace;
2use quick_cache::{sync::Cache, Weighter};
3use sha2::{Digest, Sha256};
4use std::path::{Path, PathBuf};
5use tokio::fs::{self, create_dir_all, File};
6use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader, BufWriter};
7
8use crate::errors::SyncError;
9
/// Size of the read buffer used when splitting binary files into chunks (1 MiB).
const BINARY_CHUNK_SIZE: usize = 1_024 * 1_024; // 1 MB
/// Number of leading hex characters of the SHA-256 digest kept as the id of a binary chunk.
const BINARY_HASH_SIZE: usize = 32;
/// Number of leading hex characters of the SHA-256 digest kept as the id of a text (line) chunk.
const TEXT_HASH_SIZE: usize = 10;
13
/// Splits files below a base directory into hashed chunks and rebuilds files from chunks.
pub struct Chunker {
    /// Store holding chunk contents keyed by chunk hash.
    cache: InMemoryCache,
    /// Directory that every relative path handed to the chunker is resolved against.
    base_path: PathBuf,
}
18
/// Module-local result alias whose error type defaults to [`SyncError`].
type Result<T, E = SyncError> = std::result::Result<T, E>;
20
21impl Chunker {
22    pub fn new(cache: InMemoryCache, base_path: PathBuf) -> Chunker {
23        Chunker { cache, base_path }
24    }
25
26    fn full_path(&self, path: &str) -> PathBuf {
27        let mut base = self.base_path.clone();
28        base.push(path);
29        base
30    }
31
32    pub async fn hashify(&mut self, path: &str) -> Result<Vec<String>> {
33        let p = Path::new(path);
34
35        // TODO probably there's a better way to check if file is binary
36        if is_text(p) {
37            self.hashify_text(path).await
38        } else if is_binary(p) {
39            self.hashify_binary(path).await
40        } else {
41            Err(SyncError::UnlistedFileFormat(path.to_string()))
42        }
43    }
44
45    async fn hashify_binary(&mut self, path: &str) -> Result<Vec<String>> {
46        let file = File::open(self.full_path(path))
47            .await
48            .map_err(|e| SyncError::from_io_error(path, e))?;
49        let mut reader = BufReader::new(file);
50        let mut hashes = Vec::new();
51        let mut buffer = vec![0u8; BINARY_CHUNK_SIZE];
52
53        loop {
54            let bytes_read = reader
55                .read(&mut buffer)
56                .await
57                .map_err(|e| SyncError::from_io_error(path, e))?;
58            if bytes_read == 0 {
59                break;
60            }
61
62            let data = &buffer[..bytes_read].to_vec();
63            let hash = self.hash(data, BINARY_HASH_SIZE);
64            self.save_chunk(&hash, data.to_vec())?;
65            hashes.push(hash);
66        }
67
68        Ok(hashes)
69    }
70
71    async fn hashify_text(&mut self, path: &str) -> Result<Vec<String>> {
72        let file = File::open(self.full_path(path))
73            .await
74            .map_err(|e| SyncError::from_io_error(path, e))?;
75        let mut reader = BufReader::new(file);
76        let mut buffer = Vec::new();
77        let mut hashes = Vec::new();
78
79        while reader
80            .read_until(b'\n', &mut buffer)
81            .await
82            .map_err(|e| SyncError::from_io_error(path, e))?
83            > 0
84        {
85            let data: Vec<u8> = buffer.clone();
86            let hash = self.hash(&data, TEXT_HASH_SIZE);
87            self.save_chunk(&hash, data)?;
88            hashes.push(hash);
89
90            // Clear the buffer for the next line
91            buffer.clear();
92        }
93
94        Ok(hashes)
95    }
96
97    pub fn hash(&self, data: &Vec<u8>, size: usize) -> String {
98        let mut hasher = Sha256::new();
99
100        hasher.update(data);
101
102        let result = hasher.finalize();
103        let hex_string = format!("{:x}", result);
104
105        hex_string[0..size].to_string()
106    }
107
108    pub fn exists(&mut self, path: &str) -> bool {
109        let full_path = self.full_path(path);
110
111        full_path.exists()
112    }
113
114    // TODO can be a problem as it expects cache to contain all chunks
115    pub async fn save(&mut self, path: &str, hashes: Vec<&str>) -> Result<()> {
116        trace!("saving {:?}", path);
117        let full_path = self.full_path(path);
118
119        if let Some(parent) = full_path.parent() {
120            create_dir_all(parent)
121                .await
122                .map_err(|e| SyncError::from_io_error(path, e))?;
123        }
124
125        let file = File::create(full_path)
126            .await
127            .map_err(|e| SyncError::from_io_error(path, e))?;
128        let mut writer = BufWriter::new(file);
129
130        for hash in hashes {
131            let chunk = self.cache.get(hash)?;
132
133            writer
134                .write_all(&chunk)
135                .await
136                .map_err(|e| SyncError::from_io_error(path, e))?;
137        }
138
139        writer
140            .flush()
141            .await
142            .map_err(|e| SyncError::from_io_error(path, e))?;
143
144        Ok(())
145    }
146
147    pub async fn delete(&mut self, path: &str) -> Result<()> {
148        trace!("deleting {:?}", path);
149        let full_path = self.full_path(path);
150
151        // TODO delete folders up too if empty
152        fs::remove_file(full_path)
153            .await
154            .map_err(|e| SyncError::from_io_error(path, e))?;
155
156        Ok(())
157    }
158
159    pub fn read_chunk(&self, chunk_hash: &str) -> Result<Vec<u8>> {
160        self.cache.get(chunk_hash)
161    }
162
163    pub fn save_chunk(&mut self, chunk_hash: &str, content: Vec<u8>) -> Result<()> {
164        self.cache.set(chunk_hash, content)
165    }
166
167    pub fn check_chunk(&self, chunk_hash: &str) -> bool {
168        if chunk_hash.is_empty() {
169            true
170        } else {
171            self.cache.contains(chunk_hash)
172        }
173    }
174}
175
/// Cache weighter that weighs each entry by the byte length of its value.
#[derive(Clone)]
pub struct BytesWeighter;
178
179impl Weighter<String, Vec<u8>> for BytesWeighter {
180    fn weight(&self, _key: &String, val: &Vec<u8>) -> u64 {
181        // Be cautions out about zero weights!
182        val.len().clamp(1, u64::MAX as usize) as u64
183    }
184}
185
/// In-memory chunk store: maps a chunk hash to the chunk's bytes.
pub struct InMemoryCache {
    /// Underlying cache; entries are weighed by `BytesWeighter`.
    cache: Cache<String, Vec<u8>, BytesWeighter>,
}
189
190impl InMemoryCache {
191    pub fn new(total_keys: usize, total_weight: u64) -> InMemoryCache {
192        InMemoryCache {
193            cache: Cache::with_weighter(total_keys, total_weight, BytesWeighter),
194        }
195    }
196
197    fn get(&self, chunk_hash: &str) -> Result<Vec<u8>> {
198        if chunk_hash.is_empty() {
199            return Ok(vec![]);
200        }
201
202        match self.cache.get(chunk_hash) {
203            Some(content) => Ok(content.clone()),
204            None => Err(SyncError::GetFromCacheError),
205        }
206    }
207
208    fn set(&mut self, chunk_hash: &str, content: Vec<u8>) -> Result<()> {
209        // trace!("setting hash {:?} data  {:?}", chunk_hash, content.len());
210        self.cache.insert(chunk_hash.to_string(), content);
211        Ok(())
212    }
213
214    fn contains(&self, chunk_hash: &str) -> bool {
215        match self.cache.get(chunk_hash) {
216            Some(_content) => true,
217            None => false,
218        }
219    }
220}
221
/// Returns `true` when the extension of `p` is one of the supported image formats
/// (`jpg`, `jpeg`, `png`), compared ASCII case-insensitively.
///
/// Paths without an extension are never considered binary.
pub fn is_binary(p: &Path) -> bool {
    const BINARY_EXTENSIONS: [&str; 3] = ["jpg", "jpeg", "png"];

    match p.extension() {
        Some(ext) => BINARY_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(*known)),
        None => false,
    }
}
231
/// Returns `true` when `p` names a file that is synced as line-based text.
///
/// A path qualifies either through its exact file name (a few extension-less dot files)
/// or through its extension, which is compared ASCII case-insensitively.
pub fn is_text(p: &Path) -> bool {
    const TEXT_FILE_NAMES: [&str; 3] = [".shopping-list", ".shopping-checked", ".bookmarks"];
    const TEXT_EXTENSIONS: [&str; 8] = [
        "cook", "conf", "yaml", "yml", "md", "menu", "jinja", "j2",
    ];

    // Well-known file names that carry no extension.
    let has_known_name = match p.file_name() {
        Some(name) => {
            let name = name.to_string_lossy();
            TEXT_FILE_NAMES.iter().any(|known| name == *known)
        }
        None => false,
    };
    if has_known_name {
        return true;
    }

    // Otherwise decide by extension.
    match p.extension() {
        Some(ext) => TEXT_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(*known)),
        None => false,
    }
}
260
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;
    use tempfile::TempDir;
    use tokio::fs::File;
    use tokio::io::AsyncWriteExt;

    /// Chunker rooted at `/tmp` with a small cache, for tests that never touch the disk.
    fn small_chunker() -> Chunker {
        Chunker::new(InMemoryCache::new(100, 1000), PathBuf::from("/tmp"))
    }

    /// Chunker rooted at `dir` with a roomier cache, for tests that read or write files.
    fn chunker_in(dir: &TempDir) -> Chunker {
        Chunker::new(InMemoryCache::new(1000, 100000), dir.path().to_path_buf())
    }

    /// Creates `name` inside `dir` holding exactly `bytes`.
    async fn write_fixture(dir: &TempDir, name: &str, bytes: &[u8]) {
        let mut handle = File::create(dir.path().join(name)).await.unwrap();
        handle.write_all(bytes).await.unwrap();
        handle.flush().await.unwrap();
    }

    // ---- file type detection ----

    #[test]
    fn test_is_binary_with_jpg() {
        assert!(is_binary(Path::new("image.jpg")));
    }

    #[test]
    fn test_is_binary_with_jpeg() {
        assert!(is_binary(Path::new("image.JPEG")));
    }

    #[test]
    fn test_is_binary_with_png() {
        assert!(is_binary(Path::new("image.png")));
    }

    #[test]
    fn test_is_binary_returns_false_for_text() {
        assert!(!is_binary(Path::new("recipe.cook")));
    }

    #[test]
    fn test_is_text_with_cook_extension() {
        assert!(is_text(Path::new("recipe.cook")));
    }

    #[test]
    fn test_is_text_with_md_extension() {
        assert!(is_text(Path::new("README.md")));
    }

    #[test]
    fn test_is_text_with_yaml_extension() {
        assert!(is_text(Path::new("config.yaml")));
    }

    #[test]
    fn test_is_text_with_yml_extension() {
        assert!(is_text(Path::new("config.yml")));
    }

    #[test]
    fn test_is_text_with_special_filenames() {
        for name in [".shopping-list", ".shopping-checked", ".bookmarks"] {
            assert!(is_text(Path::new(name)));
        }
    }

    #[test]
    fn test_is_text_returns_false_for_unknown() {
        assert!(!is_text(Path::new("file.unknown")));
    }

    // ---- hashing ----

    #[test]
    fn test_hash_consistency() {
        let chunker = small_chunker();
        let payload = b"Hello, World!".to_vec();

        // Hashing the same bytes twice must give the same id.
        let first = chunker.hash(&payload, 10);
        let second = chunker.hash(&payload, 10);
        assert_eq!(first, second);
    }

    #[test]
    fn test_hash_different_data_produces_different_hash() {
        let chunker = small_chunker();
        let hello = b"Hello, World!".to_vec();
        let goodbye = b"Goodbye, World!".to_vec();

        // Distinct payloads must not share an id.
        assert_ne!(chunker.hash(&hello, 10), chunker.hash(&goodbye, 10));
    }

    #[test]
    fn test_hash_respects_size_parameter() {
        let chunker = small_chunker();
        let payload = b"Hello, World!".to_vec();

        let short = chunker.hash(&payload, 5);
        let long = chunker.hash(&payload, 10);

        assert_eq!(short.len(), 5);
        assert_eq!(long.len(), 10);
        // The short id is a prefix of the long one.
        assert!(long.starts_with(&short));
    }

    // ---- in-memory cache ----

    #[test]
    fn test_inmemory_cache_set_and_get() {
        let mut cache = InMemoryCache::new(100, 1000);
        let payload = vec![1, 2, 3, 4, 5];

        cache.set("testhash123", payload.clone()).unwrap();

        assert_eq!(payload, cache.get("testhash123").unwrap());
    }

    #[test]
    fn test_inmemory_cache_get_nonexistent() {
        let cache = InMemoryCache::new(100, 1000);

        assert!(cache.get("nonexistent").is_err());
    }

    #[test]
    fn test_inmemory_cache_contains() {
        let mut cache = InMemoryCache::new(100, 1000);
        let key = "testhash456";

        assert!(!cache.contains(key));
        cache.set(key, vec![1, 2, 3]).unwrap();
        assert!(cache.contains(key));
    }

    #[test]
    fn test_inmemory_cache_empty_hash() {
        let cache = InMemoryCache::new(100, 1000);

        // The empty hash resolves to an empty chunk.
        assert_eq!(cache.get("").unwrap(), Vec::<u8>::new());
    }

    // ---- chunk presence ----

    #[test]
    fn test_chunker_check_chunk_empty_hash() {
        // The empty hash always counts as present.
        assert!(small_chunker().check_chunk(""));
    }

    #[test]
    fn test_chunker_check_chunk_existing() {
        let mut cache = InMemoryCache::new(100, 1000);
        cache.set("existinghash", vec![1, 2, 3]).unwrap();
        let chunker = Chunker::new(cache, PathBuf::from("/tmp"));

        assert!(chunker.check_chunk("existinghash"));
    }

    #[test]
    fn test_chunker_check_chunk_nonexistent() {
        assert!(!small_chunker().check_chunk("nonexistent"));
    }

    // ---- file operations ----

    #[tokio::test]
    async fn test_chunker_hashify_text_round_trip() {
        let dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&dir);

        let name = "test.cook";
        write_fixture(&dir, name, "Line 1\nLine 2\nLine 3\n".as_bytes()).await;

        let hashes = chunker.hashify(name).await.unwrap();

        // One hash per line.
        assert_eq!(hashes.len(), 3);

        // Every chunk must have landed in the cache.
        for hash in &hashes {
            assert!(chunker.check_chunk(hash));
        }
    }

    #[tokio::test]
    async fn test_chunker_save_and_read() {
        let dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&dir);

        // Seed the cache with two chunks.
        let head = b"Hello ".to_vec();
        let tail = b"World!".to_vec();
        let head_hash = chunker.hash(&head, 10);
        let tail_hash = chunker.hash(&tail, 10);
        chunker.save_chunk(&head_hash, head).unwrap();
        chunker.save_chunk(&tail_hash, tail).unwrap();

        // Assemble them into a file.
        let name = "output.txt";
        chunker
            .save(name, vec![&head_hash, &tail_hash])
            .await
            .unwrap();
        assert!(chunker.exists(name));

        // The file holds both chunks in order.
        let written = tokio::fs::read(dir.path().join(name)).await.unwrap();
        assert_eq!(written, b"Hello World!");
    }

    #[tokio::test]
    async fn test_chunker_delete() {
        let dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&dir);

        let name = "to_delete.txt";
        write_fixture(&dir, name, b"test content").await;
        assert!(chunker.exists(name));

        chunker.delete(name).await.unwrap();

        assert!(!chunker.exists(name));
    }

    // ---- weighter ----

    #[test]
    fn test_bytes_weighter() {
        let weighter = BytesWeighter;
        let key = "test".to_string();

        assert_eq!(weighter.weight(&key, &vec![1, 2, 3]), 3);
        assert_eq!(weighter.weight(&key, &vec![0u8; 1000]), 1000);
    }

    #[test]
    fn test_bytes_weighter_empty_vec() {
        let weighter = BytesWeighter;
        let key = "test".to_string();

        // An empty value is clamped up to the minimum weight of 1.
        assert_eq!(weighter.weight(&key, &Vec::new()), 1);
    }
}