Skip to main content

haagenti_zstd/compress/
repeat_offset_finder.rs

1//! Repeat Offset-Aware Match Finding
2//!
3//! This module enhances match finding by exploiting zstd's repeat offset mechanism.
4//! Repeat offsets (rep0/rep1/rep2) encode with far fewer bits than new offsets:
5//!
6//! | Offset Type | FSE Code | Extra Bits | Total Bits |
7//! |-------------|----------|------------|------------|
8//! | rep0        | ~1 bit   | 0          | ~1 bit     |
9//! | rep1        | ~1 bit   | 0          | ~1 bit     |
10//! | rep2        | ~1 bit   | 0          | ~1 bit     |
11//! | new (100)   | ~4 bits  | 6 bits     | ~10 bits   |
12//!
13//! A match at a repeat offset saves ~8 bits compared to a new offset.
14//! This means a 4-byte match at rep0 is as good as a 5-byte match at a new offset!
15//!
16//! ## Strategy
17//!
18//! 1. **Proactive Probing**: Before hash chain search, probe all 3 repeat offsets
19//! 2. **Bonus Scoring**: Give repeat offset matches a length bonus
20//! 3. **State Tracking**: Keep repeat offsets synchronized with encoder
21
22use super::match_finder::{LazyMatchFinder, Match, MAX_MATCH_LENGTH, MIN_MATCH_LENGTH};
23
24/// Bonus (in bytes) for matches at repeat offsets.
25/// A match at rep0 saves ~8 bits = 1 byte, so a bonus of 1-2 is appropriate.
26const REP_OFFSET_BONUS: usize = 2;
27
28/// Match finder with repeat offset awareness.
29///
30/// Tracks the same repeat offset state as the encoder and proactively searches
31/// for matches at repeat offsets before falling back to hash chain search.
32#[derive(Debug)]
33pub struct RepeatOffsetMatchFinder {
34    /// Inner match finder for hash chain search
35    inner: LazyMatchFinder,
36    /// Current repeat offsets [rep0, rep1, rep2]
37    /// Initialized to [1, 4, 8] per RFC 8878
38    rep_offsets: [usize; 3],
39}
40
41impl RepeatOffsetMatchFinder {
42    /// Create a new repeat offset-aware match finder.
43    pub fn new(search_depth: usize) -> Self {
44        Self {
45            inner: LazyMatchFinder::new(search_depth),
46            rep_offsets: [1, 4, 8], // RFC 8878 initial values
47        }
48    }
49
50    /// Reset repeat offsets to initial state.
51    fn reset_rep_offsets(&mut self) {
52        self.rep_offsets = [1, 4, 8];
53    }
54
55    /// Update repeat offsets after using an offset.
56    ///
57    /// This must match the encoder's RepeatOffsetsEncoder logic exactly
58    /// to keep state synchronized.
59    fn update_rep_offsets(&mut self, actual_offset: usize, literal_length: usize) {
60        if literal_length > 0 {
61            // Normal case
62            if actual_offset == self.rep_offsets[0] {
63                // rep0 - no change
64            } else if actual_offset == self.rep_offsets[1] {
65                // rep1 -> rotate to front
66                self.rep_offsets.swap(1, 0);
67            } else if actual_offset == self.rep_offsets[2] {
68                // rep2 -> rotate to front
69                let temp = self.rep_offsets[2];
70                self.rep_offsets[2] = self.rep_offsets[1];
71                self.rep_offsets[1] = self.rep_offsets[0];
72                self.rep_offsets[0] = temp;
73            } else {
74                // New offset -> push to front
75                self.rep_offsets[2] = self.rep_offsets[1];
76                self.rep_offsets[1] = self.rep_offsets[0];
77                self.rep_offsets[0] = actual_offset;
78            }
79        } else {
80            // LL = 0: special case
81            if actual_offset == self.rep_offsets[1] {
82                self.rep_offsets.swap(0, 1);
83            } else if actual_offset == self.rep_offsets[2] {
84                let temp = self.rep_offsets[2];
85                self.rep_offsets[2] = self.rep_offsets[1];
86                self.rep_offsets[1] = self.rep_offsets[0];
87                self.rep_offsets[0] = temp;
88            } else if actual_offset == self.rep_offsets[0].saturating_sub(1).max(1) {
89                let new_offset = self.rep_offsets[0].saturating_sub(1).max(1);
90                self.rep_offsets[2] = self.rep_offsets[1];
91                self.rep_offsets[1] = self.rep_offsets[0];
92                self.rep_offsets[0] = new_offset;
93            } else {
94                // New offset
95                self.rep_offsets[2] = self.rep_offsets[1];
96                self.rep_offsets[1] = self.rep_offsets[0];
97                self.rep_offsets[0] = actual_offset;
98            }
99        }
100    }
101
102    /// Check if offset is a repeat offset and return its index (0, 1, 2) or None.
103    #[inline]
104    fn rep_offset_index(&self, offset: usize) -> Option<usize> {
105        if offset == self.rep_offsets[0] {
106            Some(0)
107        } else if offset == self.rep_offsets[1] {
108            Some(1)
109        } else if offset == self.rep_offsets[2] {
110            Some(2)
111        } else {
112            None
113        }
114    }
115
116    /// Probe for a match at a specific offset.
117    ///
118    /// Returns the match length if a valid match exists, 0 otherwise.
119    #[inline]
120    fn probe_offset(&self, input: &[u8], pos: usize, offset: usize) -> usize {
121        if offset == 0 || pos < offset {
122            return 0;
123        }
124
125        let match_pos = pos - offset;
126        let remaining = input.len() - pos;
127        let max_len = remaining.min(MAX_MATCH_LENGTH);
128
129        if max_len < MIN_MATCH_LENGTH {
130            return 0;
131        }
132
133        // Quick 4-byte prefix check
134        if pos + 4 <= input.len() && match_pos + 4 <= input.len() {
135            let cur = unsafe { std::ptr::read_unaligned(input.as_ptr().add(pos) as *const u32) };
136            let prev =
137                unsafe { std::ptr::read_unaligned(input.as_ptr().add(match_pos) as *const u32) };
138
139            if cur != prev {
140                return 0;
141            }
142
143            // Extend match
144            let mut len = 4;
145            while len < max_len && input[match_pos + len] == input[pos + len] {
146                len += 1;
147            }
148            len
149        } else {
150            // Byte-by-byte for edge cases
151            let mut len = 0;
152            while len < max_len && input[match_pos + len] == input[pos + len] {
153                len += 1;
154            }
155            if len >= MIN_MATCH_LENGTH {
156                len
157            } else {
158                0
159            }
160        }
161    }
162
163    /// Find the best match considering repeat offsets.
164    ///
165    /// Probes all 3 repeat offsets first, then falls back to hash chain search.
166    /// Returns the match with the best "effective length" (length + bonus for rep offsets).
167    fn find_best_match_with_rep(
168        &mut self,
169        input: &[u8],
170        pos: usize,
171        _literal_length: usize,
172    ) -> Option<Match> {
173        let mut best_match: Option<Match> = None;
174        let mut best_score: usize = 0;
175
176        // Probe repeat offsets first (cheap - no hash lookup)
177        for (rep_idx, &rep_offset) in self.rep_offsets.iter().enumerate() {
178            let len = self.probe_offset(input, pos, rep_offset);
179            if len >= MIN_MATCH_LENGTH {
180                // Score = length + bonus for repeat offset
181                // rep0 gets slightly more bonus than rep1/rep2
182                let bonus = REP_OFFSET_BONUS + (2 - rep_idx);
183                let score = len + bonus;
184
185                if score > best_score {
186                    best_score = score;
187                    best_match = Some(Match::new(pos, rep_offset, len));
188                }
189            }
190        }
191
192        // If we found a really good repeat offset match, skip hash search
193        if best_score >= MIN_MATCH_LENGTH + REP_OFFSET_BONUS + 8 {
194            return best_match;
195        }
196
197        // Fall back to hash chain search via inner match finder
198        // We need to check if hash search finds something better
199        let hash = if pos + 4 <= input.len() {
200            self.inner.inner.hash4(input, pos)
201        } else if pos + 3 <= input.len() {
202            self.inner.inner.hash3(input, pos)
203        } else {
204            return best_match;
205        };
206
207        if let Some(hash_match) = self.inner.inner.find_best_match(input, pos, hash as usize) {
208            // Score without repeat bonus
209            let _hash_score = hash_match.length;
210
211            // Check if hash match is at a repeat offset (gets bonus retroactively)
212            let hash_is_rep = self.rep_offset_index(hash_match.offset);
213            let hash_score = if hash_is_rep.is_some() {
214                hash_match.length + REP_OFFSET_BONUS
215            } else {
216                hash_match.length
217            };
218
219            if hash_score > best_score {
220                best_match = Some(hash_match);
221            }
222        }
223
224        best_match
225    }
226
227    /// Find all matches in the input with repeat offset awareness.
228    pub fn find_matches(&mut self, input: &[u8]) -> Vec<Match> {
229        if input.len() < MIN_MATCH_LENGTH {
230            return Vec::new();
231        }
232
233        self.reset_rep_offsets();
234        self.inner.inner.reset(input.len());
235        self.inner.configure_for_size(input.len());
236
237        let mut matches = Vec::with_capacity(input.len() / 16);
238        let mut pos = 0;
239        let end = input.len().saturating_sub(MIN_MATCH_LENGTH);
240        let mut literal_run = 0usize; // Track literals since last match
241
242        while pos <= end {
243            if let Some(m) = self.find_best_match_with_rep(input, pos, literal_run) {
244                matches.push(m);
245
246                // Update repeat offsets (must match encoder)
247                self.update_rep_offsets(m.offset, literal_run);
248
249                // Update hash table for skipped positions
250                let skip_end = (pos + m.length).min(end);
251                for update_pos in pos..skip_end.min(pos + 8) {
252                    if update_pos + 4 <= input.len() {
253                        let h = self.inner.inner.hash4(input, update_pos);
254                        self.inner.inner.update_hash(input, update_pos, h as usize);
255                    }
256                }
257
258                pos += m.length;
259                literal_run = 0;
260            } else {
261                // No match - update hash and advance
262                if pos + 4 <= input.len() {
263                    let h = self.inner.inner.hash4(input, pos);
264                    self.inner.inner.update_hash(input, pos, h as usize);
265                }
266                pos += 1;
267                literal_run += 1;
268            }
269        }
270
271        matches
272    }
273}
274
275#[cfg(test)]
276mod tests {
277    use super::*;
278
279    #[test]
280    fn test_repeat_offset_finder_basic() {
281        let mut finder = RepeatOffsetMatchFinder::new(16);
282
283        // Pattern with obvious repeat at offset 4
284        let input = b"abcdabcdabcdabcd";
285        let matches = finder.find_matches(input);
286
287        // Should find matches
288        assert!(!matches.is_empty());
289
290        // First match should be at offset 4 (which is rep1 initially!)
291        if let Some(m) = matches.first() {
292            assert_eq!(m.offset, 4);
293        }
294    }
295
296    #[test]
297    fn test_repeat_offset_tracking() {
298        let mut finder = RepeatOffsetMatchFinder::new(16);
299
300        // Check initial state
301        assert_eq!(finder.rep_offsets, [1, 4, 8]);
302
303        // Simulate using offset 100 with LL > 0
304        finder.update_rep_offsets(100, 5);
305        assert_eq!(finder.rep_offsets, [100, 1, 4]);
306
307        // Use offset 100 again (should stay at front)
308        finder.update_rep_offsets(100, 3);
309        assert_eq!(finder.rep_offsets, [100, 1, 4]);
310
311        // Use offset 1 (was rep1, should rotate to front)
312        finder.update_rep_offsets(1, 2);
313        assert_eq!(finder.rep_offsets, [1, 100, 4]);
314    }
315
316    #[test]
317    fn test_rep_offset_bonus() {
318        let mut finder = RepeatOffsetMatchFinder::new(16);
319
320        // Long pattern that can match at offset 1 (rep0)
321        let input = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
322        let matches = finder.find_matches(input);
323
324        // Should find a match at offset 1 (rep0)
325        if let Some(m) = matches.first() {
326            assert_eq!(m.offset, 1);
327            assert!(m.length > 3);
328        }
329    }
330}