1use log::trace;
2use quick_cache::{sync::Cache, Weighter};
3use sha2::{Digest, Sha256};
4use std::path::{Path, PathBuf};
5use tokio::fs::{self, create_dir_all, File};
6use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader, BufWriter};
7
8use crate::errors::SyncError;
9
/// Size in bytes of the buffer used when reading binary files chunk by chunk (1 MiB).
const BINARY_CHUNK_SIZE: usize = 1 << 20;
/// Number of leading hex characters of the SHA-256 digest kept for binary chunk hashes.
const BINARY_HASH_SIZE: usize = 32;
/// Number of leading hex characters of the SHA-256 digest kept for text chunk hashes.
const TEXT_HASH_SIZE: usize = 10;
13
14pub struct Chunker {
15 cache: InMemoryCache,
16 base_path: PathBuf,
17}
18
19type Result<T, E = SyncError> = std::result::Result<T, E>;
20
21impl Chunker {
22 pub fn new(cache: InMemoryCache, base_path: PathBuf) -> Chunker {
23 Chunker { cache, base_path }
24 }
25
26 fn full_path(&self, path: &str) -> PathBuf {
27 let mut base = self.base_path.clone();
28 base.push(path);
29 base
30 }
31
32 pub async fn hashify(&mut self, path: &str) -> Result<Vec<String>> {
33 let p = Path::new(path);
34
35 if is_text(p) {
37 self.hashify_text(path).await
38 } else if is_binary(p) {
39 self.hashify_binary(path).await
40 } else {
41 Err(SyncError::UnlistedFileFormat(path.to_string()))
42 }
43 }
44
45 async fn hashify_binary(&mut self, path: &str) -> Result<Vec<String>> {
46 let file = File::open(self.full_path(path))
47 .await
48 .map_err(|e| SyncError::from_io_error(path, e))?;
49 let mut reader = BufReader::new(file);
50 let mut hashes = Vec::new();
51 let mut buffer = vec![0u8; BINARY_CHUNK_SIZE];
52
53 loop {
54 let bytes_read = reader
55 .read(&mut buffer)
56 .await
57 .map_err(|e| SyncError::from_io_error(path, e))?;
58 if bytes_read == 0 {
59 break;
60 }
61
62 let data = &buffer[..bytes_read].to_vec();
63 let hash = self.hash(data, BINARY_HASH_SIZE);
64 self.save_chunk(&hash, data.to_vec())?;
65 hashes.push(hash);
66 }
67
68 Ok(hashes)
69 }
70
71 async fn hashify_text(&mut self, path: &str) -> Result<Vec<String>> {
72 let file = File::open(self.full_path(path))
73 .await
74 .map_err(|e| SyncError::from_io_error(path, e))?;
75 let mut reader = BufReader::new(file);
76 let mut buffer = Vec::new();
77 let mut hashes = Vec::new();
78
79 while reader
80 .read_until(b'\n', &mut buffer)
81 .await
82 .map_err(|e| SyncError::from_io_error(path, e))?
83 > 0
84 {
85 let data: Vec<u8> = buffer.clone();
86 let hash = self.hash(&data, TEXT_HASH_SIZE);
87 self.save_chunk(&hash, data)?;
88 hashes.push(hash);
89
90 buffer.clear();
92 }
93
94 Ok(hashes)
95 }
96
97 pub fn hash(&self, data: &Vec<u8>, size: usize) -> String {
98 let mut hasher = Sha256::new();
99
100 hasher.update(data);
101
102 let result = hasher.finalize();
103 let hex_string = format!("{:x}", result);
104
105 hex_string[0..size].to_string()
106 }
107
108 pub fn exists(&mut self, path: &str) -> bool {
109 let full_path = self.full_path(path);
110
111 full_path.exists()
112 }
113
114 pub async fn save(&mut self, path: &str, hashes: Vec<&str>) -> Result<()> {
116 trace!("saving {:?}", path);
117 let full_path = self.full_path(path);
118
119 if let Some(parent) = full_path.parent() {
120 create_dir_all(parent)
121 .await
122 .map_err(|e| SyncError::from_io_error(path, e))?;
123 }
124
125 let file = File::create(full_path)
126 .await
127 .map_err(|e| SyncError::from_io_error(path, e))?;
128 let mut writer = BufWriter::new(file);
129
130 for hash in hashes {
131 let chunk = self.cache.get(hash)?;
132
133 writer
134 .write_all(&chunk)
135 .await
136 .map_err(|e| SyncError::from_io_error(path, e))?;
137 }
138
139 writer
140 .flush()
141 .await
142 .map_err(|e| SyncError::from_io_error(path, e))?;
143
144 Ok(())
145 }
146
147 pub async fn delete(&mut self, path: &str) -> Result<()> {
148 trace!("deleting {:?}", path);
149 let full_path = self.full_path(path);
150
151 fs::remove_file(full_path)
153 .await
154 .map_err(|e| SyncError::from_io_error(path, e))?;
155
156 Ok(())
157 }
158
159 pub fn read_chunk(&self, chunk_hash: &str) -> Result<Vec<u8>> {
160 self.cache.get(chunk_hash)
161 }
162
163 pub fn save_chunk(&mut self, chunk_hash: &str, content: Vec<u8>) -> Result<()> {
164 self.cache.set(chunk_hash, content)
165 }
166
167 pub fn check_chunk(&self, chunk_hash: &str) -> bool {
168 if chunk_hash.is_empty() {
169 true
170 } else {
171 self.cache.contains(chunk_hash)
172 }
173 }
174}
175
/// Stateless marker type used as the weighter of the chunk cache.
#[derive(Clone)]
pub struct BytesWeighter;
178
179impl Weighter<String, Vec<u8>> for BytesWeighter {
180 fn weight(&self, _key: &String, val: &Vec<u8>) -> u64 {
181 val.len().clamp(1, u64::MAX as usize) as u64
183 }
184}
185
186pub struct InMemoryCache {
187 cache: Cache<String, Vec<u8>, BytesWeighter>,
188}
189
190impl InMemoryCache {
191 pub fn new(total_keys: usize, total_weight: u64) -> InMemoryCache {
192 InMemoryCache {
193 cache: Cache::with_weighter(total_keys, total_weight, BytesWeighter),
194 }
195 }
196
197 fn get(&self, chunk_hash: &str) -> Result<Vec<u8>> {
198 if chunk_hash.is_empty() {
199 return Ok(vec![]);
200 }
201
202 match self.cache.get(chunk_hash) {
203 Some(content) => Ok(content.clone()),
204 None => Err(SyncError::GetFromCacheError),
205 }
206 }
207
208 fn set(&mut self, chunk_hash: &str, content: Vec<u8>) -> Result<()> {
209 self.cache.insert(chunk_hash.to_string(), content);
211 Ok(())
212 }
213
214 fn contains(&self, chunk_hash: &str) -> bool {
215 match self.cache.get(chunk_hash) {
216 Some(_content) => true,
217 None => false,
218 }
219 }
220}
221
/// Reports whether `p` has one of the extensions handled as binary
/// (`jpg`, `jpeg`, `png`), compared ASCII case-insensitively.
///
/// Paths without an extension are never binary.
pub fn is_binary(p: &Path) -> bool {
    const BINARY_EXTENSIONS: [&str; 3] = ["jpg", "jpeg", "png"];

    p.extension().map_or(false, |raw| {
        let lowered = raw.to_ascii_lowercase();
        BINARY_EXTENSIONS.iter().any(|known| lowered == *known)
    })
}
231
/// Reports whether `p` is handled as a text file.
///
/// A path qualifies when its file name is exactly one of the special dotfiles
/// (`.shopping-list`, `.shopping-checked`, `.bookmarks`; case-sensitive), or
/// when its extension, compared ASCII case-insensitively, is one of the known
/// text extensions.
pub fn is_text(p: &Path) -> bool {
    const SPECIAL_FILE_NAMES: [&str; 3] = [".shopping-list", ".shopping-checked", ".bookmarks"];
    const TEXT_EXTENSIONS: [&str; 8] = [
        "cook", "conf", "yaml", "yml", "md", "menu", "jinja", "j2",
    ];

    let has_special_name = p.file_name().map_or(false, |raw| {
        let name = raw.to_string_lossy();
        SPECIAL_FILE_NAMES.iter().any(|special| name == *special)
    });
    if has_special_name {
        return true;
    }

    match p.extension() {
        Some(raw) => {
            let lowered = raw.to_ascii_lowercase();
            TEXT_EXTENSIONS.iter().any(|known| lowered == *known)
        }
        None => false,
    }
}
260
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;
    use tempfile::TempDir;
    use tokio::fs::File;
    use tokio::io::AsyncWriteExt;

    /// Small cache for tests that never touch the filesystem.
    fn small_cache() -> InMemoryCache {
        InMemoryCache::new(100, 1000)
    }

    /// Chunker whose base path is never used for I/O.
    fn detached_chunker() -> Chunker {
        Chunker::new(small_cache(), PathBuf::from("/tmp"))
    }

    /// Chunker rooted in a temporary directory, backed by a larger cache.
    fn chunker_in(dir: &TempDir) -> Chunker {
        Chunker::new(InMemoryCache::new(1000, 100000), dir.path().to_path_buf())
    }

    /// Writes `bytes` to `name` inside `dir` and flushes the file.
    async fn write_fixture(dir: &TempDir, name: &str, bytes: &[u8]) {
        let mut handle = File::create(dir.path().join(name)).await.unwrap();
        handle.write_all(bytes).await.unwrap();
        handle.flush().await.unwrap();
    }

    // --- file classification -------------------------------------------------

    #[test]
    fn test_is_binary_with_jpg() {
        assert!(is_binary(Path::new("image.jpg")));
    }

    #[test]
    fn test_is_binary_with_jpeg() {
        assert!(is_binary(Path::new("image.JPEG")));
    }

    #[test]
    fn test_is_binary_with_png() {
        assert!(is_binary(Path::new("image.png")));
    }

    #[test]
    fn test_is_binary_returns_false_for_text() {
        assert!(!is_binary(Path::new("recipe.cook")));
    }

    #[test]
    fn test_is_text_with_cook_extension() {
        assert!(is_text(Path::new("recipe.cook")));
    }

    #[test]
    fn test_is_text_with_md_extension() {
        assert!(is_text(Path::new("README.md")));
    }

    #[test]
    fn test_is_text_with_yaml_extension() {
        assert!(is_text(Path::new("config.yaml")));
    }

    #[test]
    fn test_is_text_with_yml_extension() {
        assert!(is_text(Path::new("config.yml")));
    }

    #[test]
    fn test_is_text_with_special_filenames() {
        for name in [".shopping-list", ".shopping-checked", ".bookmarks"] {
            assert!(is_text(Path::new(name)));
        }
    }

    #[test]
    fn test_is_text_returns_false_for_unknown() {
        assert!(!is_text(Path::new("file.unknown")));
    }

    // --- hashing -------------------------------------------------------------

    #[test]
    fn test_hash_consistency() {
        let chunker = detached_chunker();
        let payload = b"Hello, World!".to_vec();

        let first = chunker.hash(&payload, 10);
        let second = chunker.hash(&payload, 10);

        assert_eq!(first, second);
    }

    #[test]
    fn test_hash_different_data_produces_different_hash() {
        let chunker = detached_chunker();
        let hello = b"Hello, World!".to_vec();
        let goodbye = b"Goodbye, World!".to_vec();

        assert_ne!(chunker.hash(&hello, 10), chunker.hash(&goodbye, 10));
    }

    #[test]
    fn test_hash_respects_size_parameter() {
        let chunker = detached_chunker();
        let payload = b"Hello, World!".to_vec();

        let short = chunker.hash(&payload, 5);
        let long = chunker.hash(&payload, 10);

        assert_eq!(short.len(), 5);
        assert_eq!(long.len(), 10);
        assert!(long.starts_with(&short));
    }

    // --- in-memory cache -----------------------------------------------------

    #[test]
    fn test_inmemory_cache_set_and_get() {
        let mut cache = small_cache();
        let key = "testhash123";
        let stored = vec![1, 2, 3, 4, 5];

        cache.set(key, stored.clone()).unwrap();

        assert_eq!(stored, cache.get(key).unwrap());
    }

    #[test]
    fn test_inmemory_cache_get_nonexistent() {
        let cache = small_cache();

        assert!(cache.get("nonexistent").is_err());
    }

    #[test]
    fn test_inmemory_cache_contains() {
        let mut cache = small_cache();
        let key = "testhash456";

        assert!(!cache.contains(key));
        cache.set(key, vec![1, 2, 3]).unwrap();
        assert!(cache.contains(key));
    }

    #[test]
    fn test_inmemory_cache_empty_hash() {
        let cache = small_cache();

        let fetched = cache.get("").unwrap();

        assert_eq!(fetched, Vec::<u8>::new());
    }

    // --- chunk availability --------------------------------------------------

    #[test]
    fn test_chunker_check_chunk_empty_hash() {
        let chunker = detached_chunker();

        assert!(chunker.check_chunk(""));
    }

    #[test]
    fn test_chunker_check_chunk_existing() {
        let mut cache = small_cache();
        cache.set("existinghash", vec![1, 2, 3]).unwrap();
        let chunker = Chunker::new(cache, PathBuf::from("/tmp"));

        assert!(chunker.check_chunk("existinghash"));
    }

    #[test]
    fn test_chunker_check_chunk_nonexistent() {
        let chunker = detached_chunker();

        assert!(!chunker.check_chunk("nonexistent"));
    }

    // --- filesystem round trips ----------------------------------------------

    #[tokio::test]
    async fn test_chunker_hashify_text_round_trip() {
        let temp_dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&temp_dir);

        let file_name = "test.cook";
        write_fixture(&temp_dir, file_name, "Line 1\nLine 2\nLine 3\n".as_bytes()).await;

        let hashes = chunker.hashify(file_name).await.unwrap();

        // One chunk per line.
        assert_eq!(hashes.len(), 3);
        // Every produced hash must be resolvable from the cache.
        for hash in &hashes {
            assert!(chunker.check_chunk(hash));
        }
    }

    #[tokio::test]
    async fn test_chunker_save_and_read() {
        let temp_dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&temp_dir);

        let head = b"Hello ".to_vec();
        let tail = b"World!".to_vec();
        let head_hash = chunker.hash(&head, 10);
        let tail_hash = chunker.hash(&tail, 10);

        chunker.save_chunk(&head_hash, head).unwrap();
        chunker.save_chunk(&tail_hash, tail).unwrap();

        let file_name = "output.txt";
        chunker
            .save(file_name, vec![head_hash.as_str(), tail_hash.as_str()])
            .await
            .unwrap();

        assert!(chunker.exists(file_name));

        let written = tokio::fs::read(temp_dir.path().join(file_name))
            .await
            .unwrap();
        assert_eq!(written, b"Hello World!");
    }

    #[tokio::test]
    async fn test_chunker_delete() {
        let temp_dir = TempDir::new().unwrap();
        let mut chunker = chunker_in(&temp_dir);

        let file_name = "to_delete.txt";
        write_fixture(&temp_dir, file_name, b"test content").await;
        assert!(chunker.exists(file_name));

        chunker.delete(file_name).await.unwrap();

        assert!(!chunker.exists(file_name));
    }

    // --- weighter ------------------------------------------------------------

    #[test]
    fn test_bytes_weighter() {
        let weighter = BytesWeighter;
        let key = "test".to_string();

        let three_bytes = vec![1, 2, 3];
        let thousand_bytes = vec![0u8; 1000];

        assert_eq!(weighter.weight(&key, &three_bytes), 3);
        assert_eq!(weighter.weight(&key, &thousand_bytes), 1000);
    }

    #[test]
    fn test_bytes_weighter_empty_vec() {
        let weighter = BytesWeighter;
        let key = "test".to_string();
        let nothing = vec![];

        // Empty values are still charged a weight of 1.
        assert_eq!(weighter.weight(&key, &nothing), 1);
    }
}