lzma_rust2/lz/lz_encoder.rs

use alloc::{vec, vec::Vec};
use core::ops::Deref;

use super::{bt4::Bt4, extend_match, hc4::Hc4};
use crate::Write;

/// Align to a 64-byte cache line.
const MOVE_BLOCK_ALIGN: i32 = 64;
const MOVE_BLOCK_ALIGN_MASK: i32 = !(MOVE_BLOCK_ALIGN - 1);

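/// Operations the encoder needs from a match finder: finding matches at the
/// current position and skipping over bytes that will not be searched.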
pub(crate) trait MatchFind {
    fn find_matches(&mut self, encoder: &mut LzEncoderData, matches: &mut Matches);
    fn skip(&mut self, encoder: &mut LzEncoderData, len: usize);
}

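/// Enum dispatch over the supported match finder implementations.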
pub(crate) enum MatchFinders {
    Hc4(Hc4),
    Bt4(Bt4),
}

impl MatchFind for MatchFinders {
    fn find_matches(&mut self, encoder: &mut LzEncoderData, matches: &mut Matches) {
        match self {
            MatchFinders::Hc4(m) => m.find_matches(encoder, matches),
            MatchFinders::Bt4(m) => m.find_matches(encoder, matches),
        }
    }

    fn skip(&mut self, encoder: &mut LzEncoderData, len: usize) {
        match self {
            MatchFinders::Hc4(m) => m.skip(encoder, len),
            MatchFinders::Bt4(m) => m.skip(encoder, len),
        }
    }
}

/// Match finders to use when encoding.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub enum MfType {
    /// Hash chain with 4-byte entries (lower quality but faster).
    #[default]
    Hc4,
    /// Binary tree with 4-byte entries (higher quality but slower).
    Bt4,
}

impl MfType {
    #[inline]
    fn get_memory_usage(self, dict_size: u32) -> u32 {
        match self {
            MfType::Hc4 => Hc4::get_mem_usage(dict_size),
            MfType::Bt4 => Bt4::get_mem_usage(dict_size),
        }
    }
}

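/// Sliding LZ window (`LzEncoderData`) bundled with its match finder and the
/// buffer for the matches it reports.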
pub(crate) struct LzEncoder {
    pub(crate) data: LzEncoderData,
    pub(crate) matches: Matches,
    pub(crate) match_finder: MatchFinders,
}

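/// Sliding window state: the backing buffer, read/write positions and limits,
/// and the amount of data that must be kept before and after the read position.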
pub(crate) struct LzEncoderData {
    pub(crate) keep_size_before: u32,
    pub(crate) keep_size_after: u32,
    pub(crate) match_len_max: u32,
    pub(crate) nice_len: u32,
    pub(crate) buf: Vec<u8>,
    pub(crate) buf_size: usize,
    pub(crate) buf_limit_u16: usize,
    pub(crate) read_pos: i32,
    pub(crate) read_limit: i32,
    pub(crate) finishing: bool,
    pub(crate) write_pos: i32,
    pub(crate) pending_size: u32,
}

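/// Lengths and distances of the matches found at the current position; only
/// the first `count` entries are valid.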
pub(crate) struct Matches {
    pub(crate) len: Vec<u32>,
    pub(crate) dist: Vec<i32>,
    pub(crate) count: u32,
}

impl Matches {
    pub(crate) fn new(count_max: usize) -> Self {
        Self {
            len: vec![0; count_max],
            dist: vec![0; count_max],
            count: 0,
        }
    }
}

impl LzEncoder {
    pub(crate) fn get_memory_usage(
        dict_size: u32,
        extra_size_before: u32,
        extra_size_after: u32,
        match_len_max: u32,
        mf: MfType,
    ) -> u32 {
        get_buf_size(
            dict_size,
            extra_size_before,
            extra_size_after,
            match_len_max,
        ) + mf.get_memory_usage(dict_size)
    }

    pub(crate) fn new_hc4(
        dict_size: u32,
        extra_size_before: u32,
        extra_size_after: u32,
        nice_len: u32,
        match_len_max: u32,
        depth_limit: i32,
    ) -> Self {
        Self::new(
            dict_size,
            extra_size_before,
            extra_size_after,
            nice_len,
            match_len_max,
            MatchFinders::Hc4(Hc4::new(dict_size, nice_len, depth_limit)),
        )
    }

    pub(crate) fn new_bt4(
        dict_size: u32,
        extra_size_before: u32,
        extra_size_after: u32,
        nice_len: u32,
        match_len_max: u32,
        depth_limit: i32,
    ) -> Self {
        Self::new(
            dict_size,
            extra_size_before,
            extra_size_after,
            nice_len,
            match_len_max,
            MatchFinders::Bt4(Bt4::new(dict_size, nice_len, depth_limit)),
        )
    }

    fn new(
        dict_size: u32,
        extra_size_before: u32,
        extra_size_after: u32,
        nice_len: u32,
        match_len_max: u32,
        match_finder: MatchFinders,
    ) -> Self {
        let buf_size = get_buf_size(
            dict_size,
            extra_size_before,
            extra_size_after,
            match_len_max,
        );
        let buf_size = buf_size as usize;
        let buf = vec![0; buf_size];
        let buf_limit_u16 = buf_size.checked_sub(size_of::<u16>()).unwrap();

        let keep_size_before = extra_size_before + dict_size;
        let keep_size_after = extra_size_after + match_len_max;

        Self {
            data: LzEncoderData {
                keep_size_before,
                keep_size_after,
                match_len_max,
                nice_len,
                buf,
                buf_size,
                buf_limit_u16,
                read_pos: -1,
                read_limit: -1,
                finishing: false,
                write_pos: 0,
                pending_size: 0,
            },
            matches: Matches::new(nice_len as usize - 1),
            match_finder,
        }
    }

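    /// Subtracts `norm_offset` from every position. On x86_64 and aarch64 with
    /// the `std` and `optimization` features enabled, a SIMD implementation is
    /// selected at runtime; otherwise the scalar loop is used.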
    pub(crate) fn normalize(positions: &mut [i32], norm_offset: i32) {
        #[cfg(all(feature = "std", feature = "optimization", target_arch = "x86_64"))]
        {
            if std::arch::is_x86_feature_detected!("avx2") {
                // SAFETY: We've checked that the CPU supports AVX2.
                return unsafe { normalize_avx2(positions, norm_offset) };
            }
            if std::arch::is_x86_feature_detected!("sse4.1") {
                // SAFETY: We've checked that the CPU supports SSE4.1.
                return unsafe { normalize_sse41(positions, norm_offset) };
            }
        }

        #[cfg(all(feature = "std", feature = "optimization", target_arch = "aarch64"))]
        {
            if std::arch::is_aarch64_feature_detected!("neon") {
                // SAFETY: We've checked that the CPU supports NEON.
                return unsafe { normalize_neon(positions, norm_offset) };
            }
        }

        normalize_scalar(positions, norm_offset);
    }

    pub(crate) fn find_matches(&mut self) {
        self.match_finder
            .find_matches(&mut self.data, &mut self.matches)
    }

    pub(crate) fn matches(&mut self) -> &mut Matches {
        &mut self.matches
    }

    pub(crate) fn skip(&mut self, len: usize) {
        self.match_finder.skip(&mut self.data, len)
    }

    pub(crate) fn set_preset_dict(&mut self, dict_size: u32, preset_dict: &[u8]) {
        self.data
            .set_preset_dict(dict_size, preset_dict, &mut self.match_finder)
    }

    pub(crate) fn set_finishing(&mut self) {
        self.data.set_finishing(&mut self.match_finder)
    }

    pub(crate) fn fill_window(&mut self, input: &[u8]) -> usize {
        self.data.fill_window(input, &mut self.match_finder)
    }

    pub(crate) fn set_flushing(&mut self) {
        self.data.set_flushing(&mut self.match_finder)
    }

    pub(crate) fn verify_matches(&self) -> bool {
        self.data.verify_matches(&self.matches)
    }
}

impl LzEncoderData {
    pub(crate) fn is_started(&self) -> bool {
        self.read_pos != -1
    }

    pub(crate) fn read_buffer(&self) -> &[u8] {
        &self.buf[self.read_pos as usize..]
    }

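    /// Copies the tail of the preset dictionary (at most `dict_size` bytes)
    /// into the start of the window and lets the match finder index it.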
    fn set_preset_dict(
        &mut self,
        dict_size: u32,
        preset_dict: &[u8],
        match_finder: &mut dyn MatchFind,
    ) {
        debug_assert!(!self.is_started());
        debug_assert_eq!(self.write_pos, 0);
        let copy_size = preset_dict.len().min(dict_size as usize);
        let offset = preset_dict.len() - copy_size;
        self.buf[0..copy_size].copy_from_slice(&preset_dict[offset..(offset + copy_size)]);
        self.write_pos += copy_size as i32;
        match_finder.skip(self, copy_size);
    }

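    /// Slides the window: drops data that is no longer needed in front of the
    /// dictionary and moves the rest to the start of the buffer. The move
    /// offset is rounded down to a multiple of `MOVE_BLOCK_ALIGN`.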
    fn move_window(&mut self) {
        let move_offset =
            (self.read_pos + 1 - self.keep_size_before as i32) & MOVE_BLOCK_ALIGN_MASK;
        let move_size = self.write_pos - move_offset;

        debug_assert!(move_size >= 0);
        debug_assert!(move_offset >= 0);

        let move_size = move_size as usize;
        let offset = move_offset as usize;

        self.buf.copy_within(offset..offset + move_size, 0);

        self.read_pos -= move_offset;
        self.read_limit -= move_offset;
        self.write_pos -= move_offset;
    }

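    /// Copies as much of `input` as fits into the window, sliding the window
    /// first if the read position is too close to the end of the buffer, then
    /// updates the read limit and processes any pending bytes. Returns the
    /// number of bytes consumed from `input`.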
    fn fill_window(&mut self, input: &[u8], match_finder: &mut dyn MatchFind) -> usize {
        debug_assert!(!self.finishing);
        if self.read_pos >= (self.buf_size as i32 - self.keep_size_after as i32) {
            self.move_window();
        }
        let len = if input.len() as i32 > self.buf_size as i32 - self.write_pos {
            (self.buf_size as i32 - self.write_pos) as usize
        } else {
            input.len()
        };
        let d_start = self.write_pos as usize;
        let d_end = d_start + len;
        self.buf[d_start..d_end].copy_from_slice(&input[..len]);
        self.write_pos += len as i32;
        if self.write_pos >= self.keep_size_after as i32 {
            self.read_limit = self.write_pos - self.keep_size_after as i32;
        }
        self.process_pending_bytes(match_finder);
        len
    }

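    /// Re-runs the match finder over bytes that were counted as pending
    /// earlier because not enough lookahead was available at the time.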
    fn process_pending_bytes(&mut self, match_finder: &mut dyn MatchFind) {
        if self.pending_size > 0 && self.read_pos < self.read_limit {
            self.read_pos -= self.pending_size as i32;
            let old_pending = self.pending_size;
            self.pending_size = 0;
            match_finder.skip(self, old_pending as _);
            debug_assert!(self.pending_size < old_pending);
        }
    }

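    /// Allows reading up to the last byte received so far; unlike
    /// `set_finishing`, more input may still follow.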
    fn set_flushing(&mut self, match_finder: &mut dyn MatchFind) {
        self.read_limit = self.write_pos - 1;
        self.process_pending_bytes(match_finder);
    }

    fn set_finishing(&mut self, match_finder: &mut dyn MatchFind) {
        self.read_limit = self.write_pos - 1;
        self.finishing = true;
        self.process_pending_bytes(match_finder);
    }

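    /// Returns `true` if, after accounting for `already_read_len` bytes of
    /// lookahead, the read position is still below the read limit.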
    pub fn has_enough_data(&self, already_read_len: i32) -> bool {
        self.read_pos - already_read_len < self.read_limit
    }

    pub(crate) fn copy_uncompressed<W: Write>(
        &self,
        out: &mut W,
        backward: i32,
        len: usize,
    ) -> crate::Result<()> {
        let start = (self.read_pos + 1 - backward) as usize;
        out.write_all(&self.buf[start..(start + len)])
    }

    #[inline(always)]
    pub(crate) fn get_avail(&self) -> i32 {
        debug_assert_ne!(self.read_pos, -1);
        self.write_pos - self.read_pos
    }

    #[inline(always)]
    pub(crate) fn get_pos(&self) -> i32 {
        self.read_pos
    }

    #[inline(always)]
    pub(crate) fn get_byte(&self, forward: i32, backward: i32) -> u8 {
        self.buf[(self.read_pos + forward - backward) as usize]
    }

    #[inline(always)]
    pub(crate) fn get_byte_by_pos(&self, pos: i32) -> u8 {
        self.buf[pos as usize]
    }

    #[inline(always)]
    pub(crate) fn get_byte_backward(&self, backward: i32) -> u8 {
        self.buf[(self.read_pos - backward) as usize]
    }

    #[inline(always)]
    pub(crate) fn get_current_byte(&self) -> u8 {
        self.buf[self.read_pos as usize]
    }

    #[inline(always)]
    pub(crate) fn get_match_len(&self, dist: i32, len_limit: i32) -> usize {
        extend_match(&self.buf, self.read_pos, 0, dist + 1, len_limit) as usize
    }

    #[inline(always)]
    pub(crate) fn get_match_len2(&self, forward: i32, dist: i32, len_limit: i32) -> u32 {
        if len_limit <= 0 {
            return 0;
        }
        extend_match(&self.buf, self.read_pos + forward, 0, dist + 1, len_limit) as u32
    }

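    /// Like `get_match_len`, but first compares the two bytes at the current
    /// position against the match candidate with a single unaligned `u16` read
    /// (or two byte compares without the `optimization` feature) and bails out
    /// early on a mismatch.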
    #[inline(always)]
    pub(crate) fn get_match_len_fast_reject<const MATCH_LEN_MIN: usize>(
        &self,
        dist: i32,
        len_limit: i32,
    ) -> usize {
        let match_dist = dist + 1;
        let read_pos = self.read_pos as usize;

        // Fast rejection
        #[cfg(feature = "optimization")]
        unsafe {
            // SAFETY: Both read positions are clamped to stay in range of the buffer.
            let clamped0 = read_pos.min(self.buf_limit_u16);
            let clamped1 = (read_pos - match_dist as usize).min(self.buf_limit_u16);

            if core::ptr::read_unaligned(self.buf.as_ptr().add(clamped0) as *const u16)
                != core::ptr::read_unaligned(self.buf.as_ptr().add(clamped1) as *const u16)
            {
                return 0;
            }
        }
        #[cfg(not(feature = "optimization"))]
        if self.buf[read_pos] != self.buf[read_pos - match_dist as usize]
            || self.buf[read_pos + 1] != self.buf[read_pos + 1 - match_dist as usize]
        {
            return 0;
        }

        extend_match(&self.buf, self.read_pos, 2, match_dist, len_limit) as usize
    }

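    /// Debug helper: re-extends every reported match and checks that its
    /// length matches what the match finder returned.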
    fn verify_matches(&self, matches: &Matches) -> bool {
        let len_limit = self.get_avail().min(self.match_len_max as i32);

        for i in 0..matches.count as usize {
            let match_distance = matches.dist[i] + 1;
            let actual_len = extend_match(&self.buf, self.read_pos, 0, match_distance, len_limit);

            if actual_len as u32 != matches.len[i] {
                return false;
            }
        }

        true
    }

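    /// Advances the read position by one byte and returns the number of bytes
    /// available after it. If too few bytes are available for the current
    /// flushing/finishing requirements, the byte is recorded as pending and 0
    /// is returned.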
    pub(crate) fn move_pos(
        &mut self,
        required_for_flushing: i32,
        required_for_finishing: i32,
    ) -> i32 {
        debug_assert!(required_for_flushing >= required_for_finishing);
        self.read_pos += 1;
        let mut avail = self.write_pos - self.read_pos;
        if avail < required_for_flushing && (avail < required_for_finishing || !self.finishing) {
            self.pending_size += 1;
            avail = 0;
        }
        avail
    }
}

impl Deref for LzEncoder {
    type Target = LzEncoderData;

    fn deref(&self) -> &Self::Target {
        &self.data
    }
}

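/// Total window buffer size: the data kept before and after the read position
/// plus a reserve that limits how often the window has to be moved.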
fn get_buf_size(
    dict_size: u32,
    extra_size_before: u32,
    extra_size_after: u32,
    match_len_max: u32,
) -> u32 {
    let keep_size_before = extra_size_before + dict_size;
    let keep_size_after = extra_size_after + match_len_max;
    let reserve_size = (dict_size / 2 + (256 << 10)).min(512 << 20);
    keep_size_before + keep_size_after + reserve_size
}

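/// Portable scalar implementation, also used for the unaligned prefix and
/// suffix in the SIMD paths.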
#[inline(always)]
fn normalize_scalar(positions: &mut [i32], norm_offset: i32) {
    positions
        .iter_mut()
        .for_each(|p| *p = p.saturating_sub(norm_offset));
}

/// Normalization implementation using ARM NEON for 128-bit SIMD processing.
#[cfg(all(feature = "std", feature = "optimization", target_arch = "aarch64"))]
#[target_feature(enable = "neon")]
unsafe fn normalize_neon(positions: &mut [i32], norm_offset: i32) {
    use core::arch::aarch64::*;

    // Create a 128-bit vector with the offset broadcast to all 4 lanes.
    let norm_v = vdupq_n_s32(norm_offset);

    // Split the slice into a 16-byte aligned middle part and unaligned ends.
    // `int32x4_t` is the NEON vector type for 4 x i32, which is 16 bytes.
    let (prefix, chunks, suffix) = positions.align_to_mut::<int32x4_t>();

    normalize_scalar(prefix, norm_offset);

    for chunk in chunks {
        let ptr = chunk as *mut int32x4_t as *mut i32;

        let data = vld1q_s32(ptr);

        // Perform saturated subtraction on 4 integers simultaneously.
        let max_val = vmaxq_s32(data, norm_v);
        let result = vsubq_s32(max_val, norm_v);

        vst1q_s32(ptr, result);
    }

    normalize_scalar(suffix, norm_offset);
}

/// Normalization implementation using AVX2 for 256-bit SIMD processing.
#[cfg(all(feature = "std", feature = "optimization", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn normalize_avx2(positions: &mut [i32], norm_offset: i32) {
    use core::arch::x86_64::*;

    // Create a 256-bit vector with the normalization offset broadcast to all 8 lanes.
    let norm_v = _mm256_set1_epi32(norm_offset);

    // Split the slice into a 32-byte aligned middle part and unaligned ends.
    let (prefix, chunks, suffix) = positions.align_to_mut::<__m256i>();

    normalize_scalar(prefix, norm_offset);

    for chunk in chunks {
        // Use ALIGNED load. This is safe because `align_to_mut`
        // guarantees that `chunk` is aligned to 32 bytes.
        let data = _mm256_load_si256(chunk as *mut _);

        // Perform saturated subtraction on 8 integers simultaneously.
        let max_val = _mm256_max_epi32(data, norm_v);
        let result = _mm256_sub_epi32(max_val, norm_v);

        // Use ALIGNED store to write the results back.
        _mm256_store_si256(chunk as *mut _, result);
    }

    normalize_scalar(suffix, norm_offset);
}

/// Normalization implementation using SSE4.1 for 128-bit SIMD processing.
#[cfg(all(feature = "std", feature = "optimization", target_arch = "x86_64"))]
#[target_feature(enable = "sse4.1")]
unsafe fn normalize_sse41(positions: &mut [i32], norm_offset: i32) {
    use core::arch::x86_64::*;

    // Create a 128-bit vector with the offset broadcast to all 4 lanes.
    let norm_v = _mm_set1_epi32(norm_offset);

    // Split the slice into a 16-byte aligned middle part and unaligned ends.
    let (prefix, chunks, suffix) = positions.align_to_mut::<__m128i>();

    normalize_scalar(prefix, norm_offset);

    // Process the aligned middle part in 128-bit (4 x i32) chunks.
    for chunk in chunks {
        // Use ALIGNED 128-bit load.
        let data = _mm_load_si128(chunk as *mut _);

        let max_val = _mm_max_epi32(data, norm_v);
        let result = _mm_sub_epi32(max_val, norm_v);

        // Use ALIGNED 128-bit store.
        _mm_store_si128(chunk as *mut _, result);
    }

    normalize_scalar(suffix, norm_offset);
}