1use oxiarc_core::error::Result;
24
25#[derive(Debug, Clone, Copy)]
27pub struct Match {
28 pub offset: usize,
31 pub length: usize,
33}
34
35pub const MIN_MATCH: usize = 3;
37
38pub const MAX_MATCH: usize = 65539;
40
41const HASH_PRIME: u32 = 0x9E3779B1;
43
44#[derive(Debug, Clone)]
50pub struct LevelConfig {
51 pub hash_log: u8,
53 pub chain_log: u8,
55 pub search_depth: u32,
57 pub lazy_matching: bool,
59 pub lazy_min_gain: usize,
61 pub target_block_size: usize,
63}
64
65impl LevelConfig {
66 pub fn for_level(level: i32) -> Self {
71 let level = level.clamp(1, 22);
72 match level {
73 1 => LevelConfig {
74 hash_log: 17,
75 chain_log: 0,
76 search_depth: 1,
77 lazy_matching: false,
78 lazy_min_gain: 0,
79 target_block_size: 128 * 1024,
80 },
81 2 => LevelConfig {
82 hash_log: 17,
83 chain_log: 0,
84 search_depth: 2,
85 lazy_matching: false,
86 lazy_min_gain: 0,
87 target_block_size: 128 * 1024,
88 },
89 3 => LevelConfig {
90 hash_log: 17,
91 chain_log: 16,
92 search_depth: 6,
93 lazy_matching: false,
94 lazy_min_gain: 0,
95 target_block_size: 128 * 1024,
96 },
97 4..=6 => LevelConfig {
98 hash_log: 18,
99 chain_log: 17,
100 search_depth: (level * 4) as u32,
101 lazy_matching: true,
102 lazy_min_gain: 1,
103 target_block_size: 128 * 1024,
104 },
105 7..=9 => LevelConfig {
106 hash_log: 19,
107 chain_log: 18,
108 search_depth: (level * 8) as u32,
109 lazy_matching: true,
110 lazy_min_gain: 1,
111 target_block_size: 128 * 1024,
112 },
113 10..=15 => LevelConfig {
114 hash_log: 20,
115 chain_log: 19,
116 search_depth: (level * 16) as u32,
117 lazy_matching: true,
118 lazy_min_gain: 2,
119 target_block_size: 128 * 1024,
120 },
121 _ => LevelConfig {
122 hash_log: 20,
123 chain_log: 20,
124 search_depth: (level * 32) as u32,
125 lazy_matching: true,
126 lazy_min_gain: 3,
127 target_block_size: 128 * 1024,
128 },
129 }
130 }
131}
132
133#[derive(Debug, Clone)]
139pub struct Lz77Sequence {
140 pub literals: Vec<u8>,
142 pub offset: usize,
144 pub match_length: usize,
146}
147
148pub struct MatchFinder {
154 hash_table: Vec<u32>,
157 chain_table: Vec<u32>,
160 hash_mask: u32,
162 chain_mask: u32,
164 config: LevelConfig,
166}
167
168impl MatchFinder {
169 pub fn new(config: &LevelConfig) -> Self {
171 let hash_size = 1usize << config.hash_log;
172 let chain_size = if config.chain_log > 0 {
173 1usize << config.chain_log
174 } else {
175 0
176 };
177
178 Self {
179 hash_table: vec![0u32; hash_size],
180 chain_table: vec![0u32; chain_size],
181 hash_mask: (hash_size as u32).wrapping_sub(1),
182 chain_mask: if chain_size > 0 {
183 (chain_size as u32).wrapping_sub(1)
184 } else {
185 0
186 },
187 config: config.clone(),
188 }
189 }
190
191 pub fn find_sequences(&mut self, data: &[u8], dict: &[u8]) -> Result<Vec<Lz77Sequence>> {
202 if data.is_empty() {
203 return Ok(Vec::new());
204 }
205
206 let dict_len = dict.len();
207 let combined_len = dict_len + data.len();
211 let combined = CombinedBuffer::new(dict, data);
212
213 self.seed_dictionary(&combined, dict_len);
215
216 let mut sequences = Vec::new();
217 let mut pos = dict_len; let mut literal_start = dict_len; while pos < combined_len {
221 if pos + MIN_MATCH > combined_len {
223 break;
224 }
225
226 let best_match = self.find_best_match(&combined, pos, dict_len);
227
228 let (final_match, advance_one) = if self.config.lazy_matching {
230 if let Some(m1) = best_match {
231 if pos + 1 + MIN_MATCH <= combined_len {
232 self.insert_position(&combined, pos);
234 let m2 = self.find_best_match(&combined, pos + 1, dict_len);
235 if let Some(m2) = m2 {
236 if m2.length > m1.length + self.config.lazy_min_gain {
237 (Some(m2), true)
239 } else {
240 (Some(m1), false)
241 }
242 } else {
243 (Some(m1), false)
244 }
245 } else {
246 (Some(m1), false)
247 }
248 } else {
249 (None, false)
250 }
251 } else {
252 (best_match, false)
253 };
254
255 if advance_one {
256 pos += 1;
258 }
259
260 if let Some(m) = final_match {
261 let literals: Vec<u8> = (literal_start..pos).map(|p| combined.get(p)).collect();
263
264 sequences.push(Lz77Sequence {
267 literals,
268 offset: m.offset,
269 match_length: m.length,
270 });
271
272 let match_end = (pos + m.length).min(combined_len);
275 if !advance_one {
277 self.insert_position(&combined, pos);
278 }
279 for insert_pos in (pos + 1)..match_end {
281 if insert_pos + MIN_MATCH <= combined_len {
282 self.insert_position(&combined, insert_pos);
283 }
284 }
285
286 pos = match_end;
287 literal_start = pos;
288 } else {
289 self.insert_position(&combined, pos);
291 pos += 1;
292 }
293 }
294
295 if literal_start < combined_len {
297 let literals: Vec<u8> = (literal_start..combined_len)
298 .map(|p| combined.get(p))
299 .collect();
300
301 sequences.push(Lz77Sequence {
302 literals,
303 offset: 0,
304 match_length: 0,
305 });
306 }
307
308 Ok(sequences)
309 }
310
311 fn hash(&self, combined: &CombinedBuffer<'_>, pos: usize) -> u32 {
315 let b0 = combined.get(pos) as u32;
316 let b1 = combined.get(pos + 1) as u32;
317 let b2 = combined.get(pos + 2) as u32;
318 let b3 = combined.get(pos + 3) as u32;
319 let val = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
320 let h = val.wrapping_mul(HASH_PRIME);
321 (h >> (32 - self.config.hash_log)) & self.hash_mask
322 }
323
324 fn insert_position(&mut self, combined: &CombinedBuffer<'_>, pos: usize) {
326 if pos + 4 > combined.len() {
327 return;
328 }
329
330 let h = self.hash(combined, pos) as usize;
331 let prev = self.hash_table[h];
332 self.hash_table[h] = pos as u32;
333
334 if !self.chain_table.is_empty() {
335 let chain_idx = (pos as u32) & self.chain_mask;
336 self.chain_table[chain_idx as usize] = prev;
337 }
338 }
339
340 fn seed_dictionary(&mut self, combined: &CombinedBuffer<'_>, dict_len: usize) {
342 if dict_len < 4 {
343 return;
344 }
345 for pos in 0..=(dict_len.saturating_sub(4)) {
346 self.insert_position(combined, pos);
347 }
348 }
349
350 fn find_best_match(
353 &self,
354 combined: &CombinedBuffer<'_>,
355 pos: usize,
356 dict_len: usize,
357 ) -> Option<Match> {
358 if pos + 4 > combined.len() {
359 return None;
360 }
361
362 let h = self.hash(combined, pos) as usize;
363 let mut candidate = self.hash_table[h] as usize;
364
365 if candidate >= pos {
367 return None;
368 }
369
370 let max_distance = if pos >= dict_len {
371 pos
373 } else {
374 return None;
376 };
377
378 let mut best: Option<Match> = None;
379 let mut steps = 0u32;
380 let max_steps = self.config.search_depth;
381
382 loop {
383 if steps >= max_steps {
384 break;
385 }
386 if candidate >= pos {
387 break;
388 }
389
390 let distance = pos - candidate;
391 if distance > max_distance || distance == 0 {
392 break;
393 }
394
395 let match_len = self.compute_match_length(combined, candidate, pos);
397
398 if match_len >= MIN_MATCH {
399 let is_better = match best {
400 Some(ref b) => {
401 match_len > b.length || (match_len == b.length && distance < b.offset)
402 }
403 None => true,
404 };
405 if is_better {
406 let clamped_len = match_len.min(MAX_MATCH);
407 best = Some(Match {
408 offset: distance,
409 length: clamped_len,
410 });
411 if clamped_len >= 128 {
413 break;
414 }
415 }
416 }
417
418 steps += 1;
419
420 if self.chain_table.is_empty() {
422 break;
423 }
424 let chain_idx = (candidate as u32) & self.chain_mask;
425 let next_candidate = self.chain_table[chain_idx as usize] as usize;
426 if next_candidate >= candidate || next_candidate == 0 {
427 break;
429 }
430 candidate = next_candidate;
431 }
432
433 best
434 }
435
436 fn compute_match_length(&self, combined: &CombinedBuffer<'_>, a: usize, b: usize) -> usize {
438 let max_len = (combined.len() - b).min(MAX_MATCH);
439 let mut len = 0usize;
440
441 while len + 8 <= max_len {
443 let va = combined.get_u64(a + len);
444 let vb = combined.get_u64(b + len);
445 if va != vb {
446 let diff = va ^ vb;
448 len += (diff.trailing_zeros() / 8) as usize;
449 return len;
450 }
451 len += 8;
452 }
453
454 while len < max_len {
456 if combined.get(a + len) != combined.get(b + len) {
457 break;
458 }
459 len += 1;
460 }
461
462 len
463 }
464
465 pub fn reset(&mut self) {
470 for entry in self.hash_table.iter_mut() {
471 *entry = 0;
472 }
473 for entry in self.chain_table.iter_mut() {
474 *entry = 0;
475 }
476 }
477}
478
479struct CombinedBuffer<'a> {
484 dict: &'a [u8],
486 data: &'a [u8],
488 total_len: usize,
490}
491
492impl<'a> CombinedBuffer<'a> {
493 fn new(dict: &'a [u8], data: &'a [u8]) -> Self {
495 Self {
496 dict,
497 data,
498 total_len: dict.len() + data.len(),
499 }
500 }
501
502 fn len(&self) -> usize {
504 self.total_len
505 }
506
507 fn get(&self, pos: usize) -> u8 {
509 if pos < self.dict.len() {
510 self.dict[pos]
511 } else {
512 self.data[pos - self.dict.len()]
513 }
514 }
515
516 fn get_u64(&self, pos: usize) -> u64 {
521 let dict_len = self.dict.len();
522
523 if pos + 8 <= dict_len {
525 let slice = &self.dict[pos..pos + 8];
527 return u64::from_le_bytes([
528 slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
529 ]);
530 }
531
532 let data_pos = pos.wrapping_sub(dict_len);
533 if pos >= dict_len && data_pos + 8 <= self.data.len() {
534 let slice = &self.data[data_pos..data_pos + 8];
536 return u64::from_le_bytes([
537 slice[0], slice[1], slice[2], slice[3], slice[4], slice[5], slice[6], slice[7],
538 ]);
539 }
540
541 let mut bytes = [0u8; 8];
543 for (i, byte) in bytes.iter_mut().enumerate() {
544 let p = pos + i;
545 if p < self.total_len {
546 *byte = self.get(p);
547 }
548 }
549 u64::from_le_bytes(bytes)
550 }
551}
552
553#[cfg(test)]
554mod tests {
555 use super::*;
556
557 #[test]
558 fn test_level_config_clamping() {
559 let c = LevelConfig::for_level(-5);
560 assert_eq!(c.hash_log, 17); let c = LevelConfig::for_level(100);
563 assert_eq!(c.hash_log, 20); }
565
566 #[test]
567 fn test_level_config_ranges() {
568 for level in 1..=22 {
569 let c = LevelConfig::for_level(level);
570 assert!(c.hash_log >= 17);
571 assert!(c.hash_log <= 20);
572 assert!(c.search_depth >= 1);
573 assert!(c.target_block_size > 0);
574 }
575 }
576
577 #[test]
578 fn test_find_sequences_empty() {
579 let config = LevelConfig::for_level(1);
580 let mut finder = MatchFinder::new(&config);
581 let seqs = finder.find_sequences(&[], &[]).expect("should succeed");
582 assert!(seqs.is_empty());
583 }
584
585 #[test]
586 fn test_find_sequences_no_matches() {
587 let config = LevelConfig::for_level(1);
588 let mut finder = MatchFinder::new(&config);
589 let data: Vec<u8> = (0..64).collect();
591 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
592
593 let total_literals: usize = seqs.iter().map(|s| s.literals.len()).sum();
595 let total_match_bytes: usize = seqs.iter().map(|s| s.match_length).sum();
596 assert_eq!(total_literals + total_match_bytes, data.len());
597 }
598
599 #[test]
600 fn test_find_sequences_with_repeats() {
601 let config = LevelConfig::for_level(3);
602 let mut finder = MatchFinder::new(&config);
603 let mut data = Vec::new();
605 for _ in 0..10 {
606 data.extend_from_slice(b"ABCDEFGHIJ");
607 }
608 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
609
610 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
612 assert_eq!(total_bytes, data.len());
613
614 let has_match = seqs.iter().any(|s| s.match_length > 0);
616 assert!(has_match, "Expected at least one match in repeated data");
617 }
618
619 #[test]
620 fn test_find_sequences_all_same_byte() {
621 let config = LevelConfig::for_level(1);
622 let mut finder = MatchFinder::new(&config);
623 let data = vec![0xAAu8; 1000];
624 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
625
626 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
627 assert_eq!(total_bytes, data.len());
628
629 let total_match_bytes: usize = seqs.iter().map(|s| s.match_length).sum();
631 assert!(
632 total_match_bytes > 0,
633 "Expected matches in all-same-byte data"
634 );
635 }
636
637 #[test]
638 fn test_find_sequences_with_dictionary() {
639 let config = LevelConfig::for_level(3);
640 let mut finder = MatchFinder::new(&config);
641 let dict = b"Hello, World! This is a dictionary.";
642 let data = b"Hello, World! This is actual data.";
643 let seqs = finder
644 .find_sequences(data.as_slice(), dict.as_slice())
645 .expect("should succeed");
646
647 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
648 assert_eq!(total_bytes, data.len());
649
650 let has_match = seqs.iter().any(|s| s.match_length > 0);
652 assert!(has_match, "Expected match referencing dictionary");
653 }
654
655 #[test]
656 fn test_find_sequences_lazy_matching() {
657 let config = LevelConfig::for_level(5); let mut finder = MatchFinder::new(&config);
659 let mut data = Vec::new();
662 data.push(b'X');
663 data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
664 data.push(b'Y');
665 data.extend_from_slice(b"ABCDEFGHIJKLMNOP");
666 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
667
668 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
669 assert_eq!(total_bytes, data.len());
670 }
671
672 #[test]
673 fn test_match_finder_reset() {
674 let config = LevelConfig::for_level(1);
675 let mut finder = MatchFinder::new(&config);
676
677 let data = vec![0xBBu8; 100];
678 let _ = finder.find_sequences(&data, &[]).expect("should succeed");
679
680 finder.reset();
681
682 assert!(finder.hash_table.iter().all(|&v| v == 0));
684 }
685
686 #[test]
687 fn test_find_sequences_short_data() {
688 let config = LevelConfig::for_level(1);
689 let mut finder = MatchFinder::new(&config);
690
691 let data = b"AB";
693 let seqs = finder
694 .find_sequences(data.as_slice(), &[])
695 .expect("should succeed");
696 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
697 assert_eq!(total_bytes, data.len());
698 assert!(seqs.iter().all(|s| s.match_length == 0));
699 }
700
701 #[test]
702 fn test_find_sequences_exact_min_match() {
703 let config = LevelConfig::for_level(3);
704 let mut finder = MatchFinder::new(&config);
705
706 let mut data = Vec::new();
708 data.extend_from_slice(b"ABCXYZ");
709 data.extend_from_slice(b"ABCQRS");
710 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
711
712 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
713 assert_eq!(total_bytes, data.len());
714 }
715
716 #[test]
717 fn test_combined_buffer_get() {
718 let dict = b"DICT";
719 let data = b"DATA";
720 let combined = CombinedBuffer::new(dict.as_slice(), data.as_slice());
721
722 assert_eq!(combined.len(), 8);
723 assert_eq!(combined.get(0), b'D');
724 assert_eq!(combined.get(3), b'T');
725 assert_eq!(combined.get(4), b'D');
726 assert_eq!(combined.get(7), b'A');
727 }
728
729 #[test]
730 fn test_combined_buffer_get_u64() {
731 let dict = b"ABCD";
732 let data = b"EFGHIJKL";
733 let combined = CombinedBuffer::new(dict.as_slice(), data.as_slice());
734
735 let val = combined.get_u64(2);
737 let expected = u64::from_le_bytes([b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J']);
738 assert_eq!(val, expected);
739
740 let val = combined.get_u64(4);
742 let expected = u64::from_le_bytes([b'E', b'F', b'G', b'H', b'I', b'J', b'K', b'L']);
743 assert_eq!(val, expected);
744 }
745
746 #[test]
747 fn test_all_levels_produce_valid_output() {
748 let data = b"The quick brown fox jumps over the lazy dog. The quick brown fox.";
749 for level in 1..=22 {
750 let config = LevelConfig::for_level(level);
751 let mut finder = MatchFinder::new(&config);
752 let seqs = finder
753 .find_sequences(data.as_slice(), &[])
754 .expect("should succeed");
755
756 let total_bytes: usize = seqs.iter().map(|s| s.literals.len() + s.match_length).sum();
757 assert_eq!(
758 total_bytes,
759 data.len(),
760 "Level {} produced incorrect coverage",
761 level
762 );
763 }
764 }
765
766 #[test]
767 fn test_match_offset_validity() {
768 let config = LevelConfig::for_level(3);
769 let mut finder = MatchFinder::new(&config);
770
771 let mut data = Vec::new();
772 for _ in 0..5 {
773 data.extend_from_slice(b"REPEATED_PATTERN_");
774 }
775
776 let seqs = finder.find_sequences(&data, &[]).expect("should succeed");
777
778 let mut pos = 0usize;
780 for seq in &seqs {
781 pos += seq.literals.len();
782 if seq.match_length > 0 {
783 assert!(seq.offset > 0, "Match offset must be positive");
784 assert!(
785 seq.offset <= pos,
786 "Match offset {} exceeds position {}",
787 seq.offset,
788 pos
789 );
790 }
791 pos += seq.match_length;
792 }
793 }
794}