1#![allow(dead_code)]
2
3use std::collections::VecDeque;
20use std::io::{self, Read};
21
22#[derive(Debug, Clone)]
24pub struct ChunkerConfig {
25 pub min_chunk: usize,
27 pub max_chunk: usize,
29 pub target_chunk: usize,
31 pub window_size: usize,
33 pub mask_bits: u32,
36}
37
38impl Default for ChunkerConfig {
39 fn default() -> Self {
40 Self {
41 min_chunk: 2048,
42 max_chunk: 65536,
43 target_chunk: 8192,
44 window_size: 48,
45 mask_bits: 13, }
47 }
48}
49
50impl ChunkerConfig {
51 #[must_use]
53 pub fn small() -> Self {
54 Self {
55 min_chunk: 512,
56 max_chunk: 8192,
57 target_chunk: 2048,
58 window_size: 32,
59 mask_bits: 11,
60 }
61 }
62
63 #[must_use]
65 pub fn large() -> Self {
66 Self {
67 min_chunk: 16384,
68 max_chunk: 524_288,
69 target_chunk: 65536,
70 window_size: 64,
71 mask_bits: 16,
72 }
73 }
74
75 #[must_use]
77 pub fn boundary_mask(&self) -> u64 {
78 (1u64 << self.mask_bits) - 1
79 }
80
81 #[must_use]
83 pub fn is_valid(&self) -> bool {
84 self.min_chunk > 0
85 && self.max_chunk >= self.min_chunk
86 && self.target_chunk >= self.min_chunk
87 && self.target_chunk <= self.max_chunk
88 && self.window_size > 0
89 && self.mask_bits > 0
90 && self.mask_bits <= 32
91 }
92}
93
94const BUZHASH_TABLE: [u64; 256] = {
96 let mut table = [0u64; 256];
97 let mut state: u64 = 0x5555_5555_5555_5555;
99 let mut i = 0;
100 while i < 256 {
101 state ^= state << 13;
102 state ^= state >> 7;
103 state ^= state << 17;
104 table[i] = state;
105 i += 1;
106 }
107 table
108};
109
110#[derive(Clone)]
115pub struct BuzHash {
116 hash: u64,
118 window: Vec<u8>,
120 window_pos: usize,
122 window_size: usize,
124 count: usize,
126}
127
128impl BuzHash {
129 #[must_use]
131 pub fn new(window_size: usize) -> Self {
132 Self {
133 hash: 0,
134 window: vec![0u8; window_size],
135 window_pos: 0,
136 window_size,
137 count: 0,
138 }
139 }
140
141 pub fn update(&mut self, byte: u8) -> u64 {
143 let out_byte = self.window[self.window_pos];
144 self.window[self.window_pos] = byte;
145 self.window_pos = (self.window_pos + 1) % self.window_size;
146
147 self.hash = self.hash.rotate_left(1);
149 self.hash ^= BUZHASH_TABLE[byte as usize];
151
152 if self.count >= self.window_size {
153 self.hash ^= BUZHASH_TABLE[out_byte as usize].rotate_left(self.window_size as u32);
155 } else {
156 self.count += 1;
157 }
158
159 self.hash
160 }
161
162 #[must_use]
164 pub fn value(&self) -> u64 {
165 self.hash
166 }
167
168 #[must_use]
170 pub fn count(&self) -> usize {
171 self.count
172 }
173
174 pub fn reset(&mut self) {
176 self.hash = 0;
177 self.window.fill(0);
178 self.window_pos = 0;
179 self.count = 0;
180 }
181}
182
183impl std::fmt::Debug for BuzHash {
184 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
185 f.debug_struct("BuzHash")
186 .field("hash", &format_args!("0x{:016x}", self.hash))
187 .field("window_size", &self.window_size)
188 .field("count", &self.count)
189 .finish()
190 }
191}
192
193#[derive(Debug, Clone, PartialEq, Eq)]
195pub struct ChunkBoundary {
196 pub offset: usize,
198 pub hash: u64,
200 pub chunk_len: usize,
202}
203
204pub struct ContentChunker {
206 config: ChunkerConfig,
208 hasher: BuzHash,
210 position: usize,
212 last_boundary: usize,
214 boundaries: Vec<ChunkBoundary>,
216}
217
218impl ContentChunker {
219 #[must_use]
221 pub fn new(config: ChunkerConfig) -> Self {
222 let hasher = BuzHash::new(config.window_size);
223 Self {
224 config,
225 hasher,
226 position: 0,
227 last_boundary: 0,
228 boundaries: Vec::new(),
229 }
230 }
231
232 #[must_use]
234 pub fn with_defaults() -> Self {
235 Self::new(ChunkerConfig::default())
236 }
237
238 pub fn feed(&mut self, data: &[u8]) -> Vec<ChunkBoundary> {
242 let mask = self.config.boundary_mask();
243 let mut found = Vec::new();
244
245 for &byte in data {
246 let h = self.hasher.update(byte);
247 self.position += 1;
248
249 let chunk_len = self.position - self.last_boundary;
250
251 if chunk_len < self.config.min_chunk {
253 continue;
254 }
255
256 let is_boundary = (h & mask) == 0 || chunk_len >= self.config.max_chunk;
258
259 if is_boundary {
260 let boundary = ChunkBoundary {
261 offset: self.position,
262 hash: h,
263 chunk_len,
264 };
265 found.push(boundary.clone());
266 self.boundaries.push(boundary);
267 self.last_boundary = self.position;
268 }
269 }
270
271 found
272 }
273
274 pub fn finish(&mut self) -> Option<ChunkBoundary> {
276 let chunk_len = self.position - self.last_boundary;
277 if chunk_len > 0 {
278 let boundary = ChunkBoundary {
279 offset: self.position,
280 hash: self.hasher.value(),
281 chunk_len,
282 };
283 self.boundaries.push(boundary.clone());
284 self.last_boundary = self.position;
285 Some(boundary)
286 } else {
287 None
288 }
289 }
290
291 #[must_use]
293 pub fn boundaries(&self) -> &[ChunkBoundary] {
294 &self.boundaries
295 }
296
297 #[must_use]
299 pub fn position(&self) -> usize {
300 self.position
301 }
302
303 pub fn reset(&mut self) {
305 self.hasher.reset();
306 self.position = 0;
307 self.last_boundary = 0;
308 self.boundaries.clear();
309 }
310}
311
312#[must_use]
314pub fn chunk_bytes(data: &[u8], config: ChunkerConfig) -> Vec<ChunkBoundary> {
315 let mut chunker = ContentChunker::new(config);
316 let mut all = chunker.feed(data);
317 if let Some(last) = chunker.finish() {
318 all.push(last);
319 }
320 all
321}
322
323const CHUNK_SIZE: usize = 65_536;
327
328const RABIN_BASE: u64 = 0x08D3_B1B9_ADFA_BC4D;
330
331pub struct RollingHashStream<R: Read> {
343 inner: R,
345 window: VecDeque<u8>,
347 window_size: usize,
349 hash: u64,
351 pos: u64,
353 pow_table: u64,
355 buf: Box<[u8]>,
357 buf_len: usize,
359 buf_pos: usize,
361 eof: bool,
363 done: bool,
365}
366
367impl<R: Read> RollingHashStream<R> {
368 #[must_use]
373 pub fn new(inner: R, window_size: usize) -> Self {
374 let window_size = window_size.max(1);
375 let pow_table = (0..window_size).fold(1u64, |acc, _| acc.wrapping_mul(RABIN_BASE));
377 Self {
378 inner,
379 window: VecDeque::with_capacity(window_size),
380 window_size,
381 hash: 0,
382 pos: 0,
383 pow_table,
384 buf: vec![0u8; CHUNK_SIZE].into_boxed_slice(),
385 buf_len: 0,
386 buf_pos: 0,
387 eof: false,
388 done: false,
389 }
390 }
391
392 fn read_byte(&mut self) -> io::Result<Option<u8>> {
396 if self.buf_pos >= self.buf_len {
397 if self.eof {
398 return Ok(None);
399 }
400 let n = self.inner.read(&mut self.buf[..])?;
401 if n == 0 {
402 self.eof = true;
403 return Ok(None);
404 }
405 self.buf_len = n;
406 self.buf_pos = 0;
407 }
408 let byte = self.buf[self.buf_pos];
409 self.buf_pos += 1;
410 Ok(Some(byte))
411 }
412}
413
414impl<R: Read> Iterator for RollingHashStream<R> {
415 type Item = io::Result<(u64, u64)>;
417
418 fn next(&mut self) -> Option<Self::Item> {
419 if self.done {
420 return None;
421 }
422 loop {
424 let byte = match self.read_byte() {
425 Ok(Some(b)) => b,
426 Ok(None) => {
427 self.done = true;
428 return None;
429 }
430 Err(e) => {
431 self.done = true;
432 return Some(Err(e));
433 }
434 };
435
436 let byte_out = if self.window.len() == self.window_size {
438 self.window.pop_front()
439 } else {
440 None
441 };
442
443 self.window.push_back(byte);
444
445 self.hash = self
449 .hash
450 .wrapping_mul(RABIN_BASE)
451 .wrapping_add(u64::from(byte));
452 if let Some(out) = byte_out {
453 self.hash ^= self.pow_table.wrapping_mul(u64::from(out));
454 }
455
456 self.pos += 1;
457
458 if self.window.len() == self.window_size {
460 let window_start = self.pos - self.window_size as u64;
461 return Some(Ok((window_start, self.hash)));
462 }
463 }
464 }
465}
466
467#[must_use]
472pub fn rolling_hash_slice(data: &[u8], window_size: usize) -> Vec<(u64, u64)> {
473 let window_size = window_size.max(1);
474 let pow_table = (0..window_size).fold(1u64, |acc, _| acc.wrapping_mul(RABIN_BASE));
475 let mut window: VecDeque<u8> = VecDeque::with_capacity(window_size);
476 let mut hash: u64 = 0;
477 let mut results = Vec::with_capacity(data.len().saturating_sub(window_size) + 1);
478
479 for (i, &byte) in data.iter().enumerate() {
480 let byte_out = if window.len() == window_size {
481 window.pop_front()
482 } else {
483 None
484 };
485 window.push_back(byte);
486 hash = hash.wrapping_mul(RABIN_BASE).wrapping_add(u64::from(byte));
487 if let Some(out) = byte_out {
488 hash ^= pow_table.wrapping_mul(u64::from(out));
489 }
490 if window.len() == window_size {
491 let offset = (i + 1 - window_size) as u64;
492 results.push((offset, hash));
493 }
494 }
495 results
496}
497
498#[cfg(test)]
499mod tests {
500 use super::*;
501
502 #[test]
503 fn test_chunker_config_default() {
504 let cfg = ChunkerConfig::default();
505 assert_eq!(cfg.min_chunk, 2048);
506 assert_eq!(cfg.max_chunk, 65536);
507 assert_eq!(cfg.target_chunk, 8192);
508 assert!(cfg.is_valid());
509 }
510
511 #[test]
512 fn test_chunker_config_small() {
513 let cfg = ChunkerConfig::small();
514 assert_eq!(cfg.min_chunk, 512);
515 assert!(cfg.is_valid());
516 }
517
518 #[test]
519 fn test_chunker_config_large() {
520 let cfg = ChunkerConfig::large();
521 assert_eq!(cfg.min_chunk, 16384);
522 assert!(cfg.is_valid());
523 }
524
525 #[test]
526 fn test_chunker_config_boundary_mask() {
527 let cfg = ChunkerConfig::default(); assert_eq!(cfg.boundary_mask(), (1 << 13) - 1);
529 }
530
531 #[test]
532 fn test_buzhash_new() {
533 let h = BuzHash::new(32);
534 assert_eq!(h.value(), 0);
535 assert_eq!(h.count(), 0);
536 }
537
538 #[test]
539 fn test_buzhash_deterministic() {
540 let mut h1 = BuzHash::new(16);
541 let mut h2 = BuzHash::new(16);
542 for b in b"identical input" {
543 h1.update(*b);
544 h2.update(*b);
545 }
546 assert_eq!(h1.value(), h2.value());
547 }
548
549 #[test]
550 fn test_buzhash_different_input() {
551 let mut h1 = BuzHash::new(16);
552 let mut h2 = BuzHash::new(16);
553 for b in b"input A" {
554 h1.update(*b);
555 }
556 for b in b"input B" {
557 h2.update(*b);
558 }
559 assert_ne!(h1.value(), h2.value());
560 }
561
562 #[test]
563 fn test_buzhash_reset() {
564 let mut h = BuzHash::new(8);
565 for b in b"some data" {
566 h.update(*b);
567 }
568 assert_ne!(h.value(), 0);
569 h.reset();
570 assert_eq!(h.value(), 0);
571 assert_eq!(h.count(), 0);
572 }
573
574 #[test]
575 fn test_content_chunker_small_input() {
576 let config = ChunkerConfig {
578 min_chunk: 100,
579 max_chunk: 1000,
580 target_chunk: 500,
581 window_size: 8,
582 mask_bits: 3,
583 };
584 let mut chunker = ContentChunker::new(config);
585 let data = vec![0x42u8; 50];
586 let during = chunker.feed(&data);
587 assert!(during.is_empty()); let last = chunker.finish();
589 assert!(last.is_some());
590 assert_eq!(last.expect("operation should succeed").chunk_len, 50);
591 }
592
593 #[test]
594 fn test_content_chunker_max_chunk() {
595 let config = ChunkerConfig {
597 min_chunk: 4,
598 max_chunk: 16,
599 target_chunk: 8,
600 window_size: 4,
601 mask_bits: 30, };
603 let mut chunker = ContentChunker::new(config);
604 let data = vec![0u8; 100];
605 let boundaries = chunker.feed(&data);
606 assert!(!boundaries.is_empty());
608 for b in &boundaries {
609 assert!(b.chunk_len <= 16);
610 }
611 }
612
613 #[test]
614 fn test_chunk_bytes_convenience() {
615 let data = vec![0xABu8; 200];
616 let config = ChunkerConfig {
617 min_chunk: 10,
618 max_chunk: 50,
619 target_chunk: 30,
620 window_size: 4,
621 mask_bits: 30,
622 };
623 let boundaries = chunk_bytes(&data, config);
624 assert!(!boundaries.is_empty());
625
626 let total: usize = boundaries.iter().map(|b| b.chunk_len).sum();
628 assert_eq!(total, 200);
629 }
630
631 #[test]
632 fn test_content_chunker_reset() {
633 let mut chunker = ContentChunker::with_defaults();
634 chunker.feed(&vec![1u8; 100_000]);
635 assert!(chunker.position() > 0);
636 chunker.reset();
637 assert_eq!(chunker.position(), 0);
638 assert!(chunker.boundaries().is_empty());
639 }
640
641 #[test]
642 fn test_chunk_boundary_equality() {
643 let a = ChunkBoundary {
644 offset: 100,
645 hash: 42,
646 chunk_len: 50,
647 };
648 let b = ChunkBoundary {
649 offset: 100,
650 hash: 42,
651 chunk_len: 50,
652 };
653 assert_eq!(a, b);
654 }
655
656 #[test]
659 fn test_rolling_hash_stream_matches_slice() {
660 let data: Vec<u8> = (0u8..=255).cycle().take(1024).collect();
662 let window_size = 32;
663
664 let expected = rolling_hash_slice(&data, window_size);
666
667 let cursor = std::io::Cursor::new(&data);
669 let stream = RollingHashStream::new(cursor, window_size);
670 let actual: Vec<(u64, u64)> = stream
671 .map(|r| r.expect("stream should not error"))
672 .collect();
673
674 assert_eq!(
675 expected.len(),
676 actual.len(),
677 "number of hash pairs must match"
678 );
679 for (i, (exp, got)) in expected.iter().zip(actual.iter()).enumerate() {
680 assert_eq!(
681 exp, got,
682 "hash mismatch at position {i}: expected {exp:?}, got {got:?}"
683 );
684 }
685 }
686
687 #[test]
688 fn test_rolling_hash_stream_large_data() {
689 const MB4: usize = 4 * 1024 * 1024;
691 let data: Vec<u8> = (0u8..=255).cycle().take(MB4).collect();
692 let window_size = 64;
693
694 let cursor = std::io::Cursor::new(&data);
695 let stream = RollingHashStream::new(cursor, window_size);
696 let mut count = 0usize;
697 for item in stream {
698 let _ = item.expect("stream should not error");
699 count += 1;
700 }
701
702 let expected_count = MB4 - window_size + 1;
704 assert_eq!(
705 count, expected_count,
706 "expected {expected_count} hash pairs, got {count}"
707 );
708 }
709}