haagenti_zstd/compress/block.rs
1//! Block-level encoding for Zstd compression.
2//!
3//! This module handles encoding of literals and sequences sections.
4//!
5//! ## Parallel Encoding
6//!
7//! When the `parallel` feature is enabled, 4-stream Huffman encoding
8//! uses rayon for parallel compression of the 4 literal segments,
9//! providing up to 2-3x speedup for large literals.
10
11use super::sequences::{analyze_for_rle, encode_sequences_fse_with_encoded};
12use super::Match;
13use crate::block::Sequence;
14use crate::huffman::HuffmanEncoder;
15use haagenti_core::Result;
16
17#[cfg(feature = "parallel")]
18use rayon::prelude::*;
19
/// Block encoder for creating compressed blocks.
///
/// Accumulates literal bytes and match sequences so they can later be
/// written out as a literals section plus a sequences section.
#[derive(Debug)]
pub struct BlockEncoder {
    /// Accumulated literals.
    literals: Vec<u8>,
    /// Accumulated sequences.
    sequences: Vec<Sequence>,
}
28
29impl BlockEncoder {
30 /// Create a new block encoder.
31 pub fn new() -> Self {
32 Self {
33 literals: Vec::new(),
34 sequences: Vec::new(),
35 }
36 }
37
38 /// Add a literal byte.
39 pub fn add_literal(&mut self, byte: u8) {
40 self.literals.push(byte);
41 }
42
43 /// Add multiple literal bytes.
44 pub fn add_literals(&mut self, bytes: &[u8]) {
45 self.literals.extend_from_slice(bytes);
46 }
47
48 /// Add a match as a sequence.
49 pub fn add_match(&mut self, literal_length: u32, offset: u32, match_length: u32) {
50 self.sequences
51 .push(Sequence::new(literal_length, offset, match_length));
52 }
53
54 /// Get the accumulated literals.
55 pub fn literals(&self) -> &[u8] {
56 &self.literals
57 }
58
59 /// Get the accumulated sequences.
60 pub fn sequences(&self) -> &[Sequence] {
61 &self.sequences
62 }
63
64 /// Clear the encoder for reuse.
65 pub fn clear(&mut self) {
66 self.literals.clear();
67 self.sequences.clear();
68 }
69}
70
71impl Default for BlockEncoder {
72 fn default() -> Self {
73 Self::new()
74 }
75}
76
/// Maximum match length that can be encoded in a single sequence.
///
/// Using zstd's predefined ML table values:
/// - Code 52: baseline 65539, 16 extra bits → max = 65539 + 65535 = 131074
///
/// Note: zstd's predefined values differ from RFC 8878 for codes 43+.
/// This allows encoding longer matches without splitting.
const MAX_MATCH_LENGTH_PER_SEQUENCE: usize = 131074;

/// Minimum match length for Zstd (must be at least 3).
/// Also used when splitting long matches so no remainder falls below this.
const MIN_MATCH_LENGTH: usize = 3;
88
/// Tracks repeat offsets during encoding.
///
/// Per RFC 8878 Section 3.1.1.5, offset values 1-3 reference recent offsets.
/// Initial values are [1, 4, 8].
struct RepeatOffsetsEncoder {
    /// Most-recent-first offset history; index 0 is repeat_offset_1.
    offsets: [u32; 3],
}
96
impl RepeatOffsetsEncoder {
    /// Create with initial values per RFC 8878.
    fn new() -> Self {
        Self { offsets: [1, 4, 8] }
    }

    /// Convert an actual offset to an offset_value for encoding.
    ///
    /// Returns the offset_value to encode:
    /// - 1, 2, or 3 if the offset matches a repeat offset
    /// - actual_offset + 3 for new offsets
    ///
    /// Also updates the repeat offset state to match the decoder's state machine.
    /// This is critical: the encoder and decoder must stay in sync, so every
    /// branch below mirrors the corresponding decoder-side state mutation.
    ///
    /// Per RFC 8878 Section 3.1.1.5:
    /// - When LL > 0: offset_value 1/2/3 map directly to repeat_offset 1/2/3
    /// - When LL = 0: special handling (shifted mapping)
    fn encode(&mut self, actual_offset: u32, literal_length: u32) -> u32 {
        if literal_length > 0 {
            // Normal case: check if offset matches any repeat offset
            if actual_offset == self.offsets[0] {
                // Matches repeat_offset_1 - no state change needed
                return 1;
            } else if actual_offset == self.offsets[1] {
                // Matches repeat_offset_2 - rotate [1] to front
                // Decoder does: temp = [idx], shift down, [0] = temp
                self.offsets.swap(1, 0);
                return 2;
            } else if actual_offset == self.offsets[2] {
                // Matches repeat_offset_3 - rotate [2] to front
                let temp = self.offsets[2];
                self.offsets[2] = self.offsets[1];
                self.offsets[1] = self.offsets[0];
                self.offsets[0] = temp;
                return 3;
            }
        } else {
            // LL = 0: special case encoding
            // offset_value 1 -> decoded as offsets[1], then swap(0,1)
            // offset_value 2 -> decoded as offsets[2], then rotate
            // offset_value 3 -> decoded as offsets[0]-1, push to front
            //
            // Note: offsets[0] itself is NOT representable as a repeat code
            // when LL = 0; such an offset falls through to the new-offset path.
            if actual_offset == self.offsets[1] {
                // Decoder swaps [0] and [1]
                self.offsets.swap(0, 1);
                return 1;
            } else if actual_offset == self.offsets[2] {
                // Decoder rotates: [2]->[0], [0]->[1], [1]->[2]
                let temp = self.offsets[2];
                self.offsets[2] = self.offsets[1];
                self.offsets[1] = self.offsets[0];
                self.offsets[0] = temp;
                return 2;
            } else if actual_offset == self.offsets[0].saturating_sub(1).max(1) {
                // Decoder uses offsets[0]-1, pushes to front
                // (clamped to 1 so a history value of 1 never yields offset 0)
                let new_offset = self.offsets[0].saturating_sub(1).max(1);
                self.offsets[2] = self.offsets[1];
                self.offsets[1] = self.offsets[0];
                self.offsets[0] = new_offset;
                return 3;
            }
        }

        // New offset: push to front, encode as actual_offset + 3
        self.offsets[2] = self.offsets[1];
        self.offsets[1] = self.offsets[0];
        self.offsets[0] = actual_offset;
        actual_offset + 3
    }
}
167
/// Convert matches to literals and sequences.
///
/// Long matches are split only when they exceed the maximum encodable length.
/// This minimizes sequence count for better compression with FSE encoding.
///
/// Uses repeat offset tracking to efficiently encode offsets that match
/// recently used offsets (per RFC 8878 Section 3.1.1.5).
///
/// NOTE(review): assumes `matches` are sorted by position and non-overlapping
/// (each `m.position >= pos`); otherwise `m.position - pos` underflows —
/// confirm with the match finder that produces them.
pub fn matches_to_sequences(input: &[u8], matches: &[Match]) -> (Vec<u8>, Vec<Sequence>) {
    // Pre-allocate based on expected output sizes
    // Literals: estimate ~50% of input for text, sequences ~1 per match
    let mut literals = Vec::with_capacity(input.len() / 2);
    let mut sequences = Vec::with_capacity(matches.len());
    let mut repeat_offsets = RepeatOffsetsEncoder::new();
    let mut pos = 0;

    for m in matches {
        // Add literals before the match
        let literal_length = m.position - pos;
        if literal_length > 0 {
            literals.extend_from_slice(&input[pos..m.position]);
        }

        let actual_offset = m.offset as u32;

        // Use maximum chunk size - only split when necessary
        let chunk_size = MAX_MATCH_LENGTH_PER_SEQUENCE;

        let mut remaining_match = m.length;
        let mut first_sequence = true;

        while remaining_match > 0 {
            // Determine match length for this sequence
            let match_len = remaining_match.min(chunk_size);

            // Ensure we don't leave a too-short remainder: if the tail after
            // this chunk would be 1..MIN_MATCH_LENGTH bytes, shorten this
            // chunk so the final remainder is exactly MIN_MATCH_LENGTH.
            let after_this = remaining_match - match_len;
            let match_len = if after_this > 0 && after_this < MIN_MATCH_LENGTH {
                remaining_match - MIN_MATCH_LENGTH
            } else {
                match_len
            };

            // First sequence gets the literal length, subsequent ones get 0
            let ll = if first_sequence {
                literal_length as u32
            } else {
                0
            };
            first_sequence = false;

            // Convert actual offset to offset_value using repeat offset tracking
            let offset_value = repeat_offsets.encode(actual_offset, ll);

            sequences.push(Sequence::new(ll, offset_value, match_len as u32));
            remaining_match -= match_len;
        }

        pos = m.position + m.length;
    }

    // Add trailing literals
    if pos < input.len() {
        literals.extend_from_slice(&input[pos..]);
    }

    (literals, sequences)
}
235
236/// Encode the literals section.
237///
238/// Attempts Huffman compression for better ratios, falls back to Raw/RLE.
239pub fn encode_literals(literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
240 let size = literals.len();
241
242 if size == 0 {
243 // Empty literals: just the header byte
244 output.push(0x00);
245 return Ok(());
246 }
247
248 // Check for RLE pattern first (most efficient)
249 if literals.iter().all(|&b| b == literals[0]) {
250 return encode_rle_literals(literals[0], size, output);
251 }
252
253 // Try Huffman compression for literals of suitable size
254 // Min 64 bytes for meaningful compression
255 // Max 128KB (block size limit) - encode_huffman_literals handles header format limits
256 if size >= 64 {
257 if let Some(encoder) = HuffmanEncoder::build(literals) {
258 let estimated = encoder.estimate_size(literals);
259 if estimated + 20 < size {
260 return encode_huffman_literals(literals, &encoder, output);
261 }
262 }
263 }
264
265 // Fall back to raw literals
266 encode_raw_literals(literals, output)
267}
268
269/// Encode literals section using a pre-built Huffman encoder.
270///
271/// This function uses a custom Huffman encoder instead of building one from the data.
272/// Useful for dictionary compression or when using pre-trained encoders.
273///
274/// Falls back to raw encoding if the encoder can't compress the data efficiently.
275pub fn encode_literals_with_encoder(
276 literals: &[u8],
277 encoder: &HuffmanEncoder,
278 output: &mut Vec<u8>,
279) -> Result<()> {
280 let size = literals.len();
281
282 if size == 0 {
283 // Empty literals: just the header byte
284 output.push(0x00);
285 return Ok(());
286 }
287
288 // Check for RLE pattern first (most efficient, regardless of encoder)
289 if literals.iter().all(|&b| b == literals[0]) {
290 return encode_rle_literals(literals[0], size, output);
291 }
292
293 // Use the provided encoder for Huffman compression
294 // Min 64 bytes for meaningful compression
295 if size >= 64 {
296 let estimated = encoder.estimate_size(literals);
297 if estimated + 20 < size {
298 return encode_huffman_literals(literals, encoder, output);
299 }
300 }
301
302 // Fall back to raw literals
303 encode_raw_literals(literals, output)
304}
305
/// Encode literals using Huffman compression.
///
/// Writes a Compressed_Literals_Block (type = 2) with a freshly serialized
/// Huffman table: single-stream for small inputs, 4-stream for larger ones,
/// falling back to raw literals when format limits are exceeded.
fn encode_huffman_literals(
    literals: &[u8],
    encoder: &HuffmanEncoder,
    output: &mut Vec<u8>,
) -> Result<()> {
    let regenerated_size = literals.len();
    let weights = encoder.serialize_weights();

    // If weights is empty, there are too many symbols for direct encoding (>128).
    // Fall back to raw literals since we don't implement FSE-compressed weights yet.
    if weights.is_empty() {
        return encode_raw_literals(literals, output);
    }

    // Header format for compressed literals (RFC 8878 Section 3.1.1.3.2):
    // Literals_Block_Type = 2 (Compressed) with fresh Huffman table
    //
    // Size_Format determines header layout:
    // - 0: 4 streams, 10-bit sizes, 3-byte header
    // - 1: 4 streams, 14-bit sizes, 4-byte header
    // - 2: 4 streams, 18-bit sizes, 5-byte header
    // - 3: 1 stream, 10-bit sizes, 3-byte header

    if regenerated_size <= 1023 {
        // Single stream format (Size_Format = 3)
        let compressed = encoder.encode(literals);
        // Compressed_Size covers the weights table plus the stream itself.
        let compressed_size = weights.len() + compressed.len();

        if compressed_size <= 1023 {
            // Use Size_Format = 3 (single stream, 10-bit sizes, 3-byte header)
            // Byte 0: regen[3:0] << 4 | Size_Format << 2 | Block_Type
            //       = regen[3:0] << 4 | 3 << 2 | 2 = regen[3:0] << 4 | 0x0E
            // Byte 1: comp[1:0] << 6 | regen[9:4]
            // Byte 2: comp[9:2]
            let byte0 = (((regenerated_size & 0x0F) << 4) | 0x0E) as u8;
            let byte1 = (((compressed_size & 0x03) << 6) | ((regenerated_size >> 4) & 0x3F)) as u8;
            let byte2 = ((compressed_size >> 2) & 0xFF) as u8;

            output.push(byte0);
            output.push(byte1);
            output.push(byte2);

            // Write Huffman weights table
            output.extend_from_slice(&weights);
            // Write compressed literals
            output.extend_from_slice(&compressed);

            return Ok(());
        }
    }

    // For larger sizes, use 4-stream format (supports up to 262143 bytes)
    if regenerated_size <= 262143 {
        return encode_huffman_4stream(literals, encoder, &weights, output);
    }

    // Fall back to raw literals for very large sizes (> 256KB)
    encode_raw_literals(literals, output)
}
366
/// Encode literals using 4-stream Huffman compression.
///
/// 4-stream format splits literals into 4 segments, compresses each,
/// and writes segment sizes for the first 3 streams (last stream size is inferred).
///
/// When the `parallel` feature is enabled, segments are compressed in parallel
/// using rayon, providing up to 2-3x speedup for large literals.
fn encode_huffman_4stream(
    literals: &[u8],
    encoder: &HuffmanEncoder,
    weights: &[u8],
    output: &mut Vec<u8>,
) -> Result<()> {
    let regenerated_size = literals.len();

    // Split literals into 4 equal segments (with rounding for last segment)
    let segment_size = regenerated_size.div_ceil(4);
    let mut segments: [&[u8]; 4] = [&[], &[], &[], &[]];

    for (i, segment) in segments.iter_mut().enumerate() {
        let start = i * segment_size;
        if start >= regenerated_size {
            // Tiny inputs can leave trailing segments empty.
            *segment = &[];
        } else {
            let end = ((i + 1) * segment_size).min(regenerated_size);
            *segment = &literals[start..end];
        }
    }

    // Compress each segment - parallel when feature enabled
    #[cfg(feature = "parallel")]
    let (compressed_0, compressed_1, compressed_2, compressed_3) = {
        // Use rayon to compress all 4 segments in parallel
        let compressed: Vec<Vec<u8>> = segments.par_iter().map(|seg| encoder.encode(seg)).collect();

        let mut iter = compressed.into_iter();
        (
            iter.next().unwrap_or_default(),
            iter.next().unwrap_or_default(),
            iter.next().unwrap_or_default(),
            iter.next().unwrap_or_default(),
        )
    };

    #[cfg(not(feature = "parallel"))]
    let (compressed_0, compressed_1, compressed_2, compressed_3) = (
        encoder.encode(segments[0]),
        encoder.encode(segments[1]),
        encoder.encode(segments[2]),
        encoder.encode(segments[3]),
    );

    // Calculate total compressed size
    let stream_sizes = [
        compressed_0.len(),
        compressed_1.len(),
        compressed_2.len(),
        compressed_3.len(),
    ];

    // Total compressed = weights + 6 bytes jump table + all streams
    let total_compressed = weights.len() + 6 + stream_sizes.iter().sum::<usize>();

    // Check for individual stream overflow (jump-table entries are u16)
    if stream_sizes.iter().any(|&s| s > 65535) {
        return encode_raw_literals(literals, output);
    }

    // Choose header format based on sizes - must match decoder's expected format!
    // Size_Format=0: 10-bit sizes, 3-byte header (max 1023 each)
    // Size_Format=2: 16/14-bit sizes, 5-byte header (larger sizes)
    if regenerated_size <= 1023 && total_compressed <= 1023 {
        // Use Size_Format = 0 (4 streams, 10-bit sizes, 3-byte header)
        // Decoder expects:
        //   regen = (byte0 >> 4) | ((byte1 & 0x3F) << 4)
        //   comp = (byte1 >> 6) | (byte2 << 2)
        // So encoder must:
        //   byte0 = regen[3:0] << 4 | Size_Format << 2 | Block_Type = regen[3:0] << 4 | 0x02
        //   byte1 = comp[1:0] << 6 | regen[9:4]
        //   byte2 = comp[9:2]
        let byte0 = (((regenerated_size & 0x0F) << 4) | 0x02) as u8;
        let byte1 = (((total_compressed & 0x03) << 6) | ((regenerated_size >> 4) & 0x3F)) as u8;
        let byte2 = ((total_compressed >> 2) & 0xFF) as u8;

        output.push(byte0);
        output.push(byte1);
        output.push(byte2);
    } else if regenerated_size <= 65535 && total_compressed <= 16383 {
        // Use Size_Format = 2 (4 streams, 5-byte header)
        // Decoder expects:
        //   regen = ((byte0 >> 4) & 0x3F) | (byte1 << 4) | ((byte2 & 0x0F) << 12)
        //   comp = (byte2 >> 4) | (byte3 << 4) | ((byte4 & 0x03) << 12)
        // So encoder must:
        //   byte0 = regen[3:0] << 4 | Size_Format << 2 | Block_Type = regen[3:0] << 4 | 0x0A
        //   byte1 = regen[11:4]
        //   byte2 = comp[3:0] << 4 | regen[15:12]
        //   byte3 = comp[11:4]
        //   byte4 = comp[13:12]
        let byte0 = (((regenerated_size & 0x0F) << 4) | 0x0A) as u8;
        let byte1 = ((regenerated_size >> 4) & 0xFF) as u8;
        let byte2 = (((total_compressed & 0x0F) << 4) | ((regenerated_size >> 12) & 0x0F)) as u8;
        let byte3 = ((total_compressed >> 4) & 0xFF) as u8;
        let byte4 = ((total_compressed >> 12) & 0x03) as u8;

        output.push(byte0);
        output.push(byte1);
        output.push(byte2);
        output.push(byte3);
        output.push(byte4);
    } else {
        // Too large for Huffman compression
        return encode_raw_literals(literals, output);
    }

    // Write Huffman weights table
    output.extend_from_slice(weights);

    // Write jump table: 3 x 16-bit cumulative offsets (little-endian)
    // RFC 8878: Jump values are cumulative offsets from position 6 (after jump table)
    // - Stream 0 starts at position 6
    // - Stream 1 starts at position 6 + jump1
    // - Stream 2 starts at position 6 + jump2
    // - Stream 3 starts at position 6 + jump3
    // Cumulative values fit in u16 because both accepted formats cap
    // total_compressed (<= 1023 or <= 16383), which bounds every jump.
    let jump1 = stream_sizes[0];
    let jump2 = jump1 + stream_sizes[1];
    let jump3 = jump2 + stream_sizes[2];

    output.extend_from_slice(&(jump1 as u16).to_le_bytes());
    output.extend_from_slice(&(jump2 as u16).to_le_bytes());
    output.extend_from_slice(&(jump3 as u16).to_le_bytes());

    // Write all 4 compressed streams
    output.extend_from_slice(&compressed_0);
    output.extend_from_slice(&compressed_1);
    output.extend_from_slice(&compressed_2);
    output.extend_from_slice(&compressed_3);

    Ok(())
}
506
507/// Encode raw literals.
508///
509/// Uses RFC 8878 Section 3.1.1.3.1.1 header format:
510/// - Size_Format 0b00/0b01: 1-byte header, 5-bit size (0-31)
511/// - Size_Format 0b10: 2-byte header, 12-bit size (0-4095)
512/// - Size_Format 0b11: 3-byte header, 20-bit size (0-131071)
513///
514/// Note: The 1-byte format has a bit layout issue where `size << 3` puts
515/// size bit 0 into the Size_Format field (bits 3:2), causing odd sizes to
516/// produce Size_Format = 2. To avoid this, we use 2-byte format for all
517/// sizes > 0 up to 4095. This costs 1 extra byte but guarantees correctness.
518fn encode_raw_literals(literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
519 let size = literals.len();
520
521 // Header format per RFC 8878 Section 3.1.1.1:
522 // Literals_Block_Type[1:0] = 0 (Raw)
523 // Size_Format[3:2]:
524 // 00 or 10: 5-bit size in 1-byte header
525 // 01: 12-bit size in 2-byte header
526 // 11: 20-bit size in 3-byte header
527 if size == 0 {
528 // Empty: 1-byte header with all zeros
529 output.push(0x00);
530 } else if size <= 31 {
531 // 5-bit size: 1-byte header with Size_Format = 0
532 // byte0 = size[4:0] << 3 | Size_Format << 2 | Block_Type
533 // = size << 3 | 0 << 2 | 0
534 // = size << 3
535 let byte0 = (size << 3) as u8;
536 output.push(byte0);
537 } else if size <= 4095 {
538 // 12-bit size: 2-byte header with Size_Format = 1
539 // byte0 = size[3:0] << 4 | Size_Format << 2 | Block_Type
540 // = size[3:0] << 4 | 1 << 2 | 0
541 // = size[3:0] << 4 | 0x04
542 // byte1 = size[11:4]
543 let byte0 = ((size & 0x0F) << 4) | 0x04;
544 let byte1 = (size >> 4) & 0xFF;
545 output.push(byte0 as u8);
546 output.push(byte1 as u8);
547 } else {
548 // 20-bit size: 3-byte header with Size_Format = 3
549 // byte0 = size[3:0] << 4 | 3 << 2 | 0 = size[3:0] << 4 | 0x0C
550 // byte1 = size[11:4]
551 // byte2 = size[19:12]
552 let byte0 = ((size & 0x0F) << 4) | 0x0C;
553 let byte1 = (size >> 4) & 0xFF;
554 let byte2 = (size >> 12) & 0xFF;
555 output.push(byte0 as u8);
556 output.push(byte1 as u8);
557 output.push(byte2 as u8);
558 }
559
560 // Literal data
561 output.extend_from_slice(literals);
562
563 Ok(())
564}
565
566/// Encode RLE literals.
567///
568/// Uses RFC 8878 Section 3.1.1.3.1.2 header format:
569/// - Size_Format 0b10: 2-byte header, 12-bit size (0-4095)
570/// - Size_Format 0b11: 3-byte header, 20-bit size (0-131071)
571///
572/// Same as raw literals, we use 2-byte format to avoid bit layout issues.
573fn encode_rle_literals(byte: u8, count: usize, output: &mut Vec<u8>) -> Result<()> {
574 // Header format per RFC 8878 Section 3.1.1.1:
575 // Literals_Block_Type[1:0] = 1 (RLE)
576 // Size_Format[3:2]:
577 // 00 or 10: 5-bit size in 1-byte header
578 // 01: 12-bit size in 2-byte header
579 // 11: 20-bit size in 3-byte header
580 if count == 0 {
581 // Empty RLE (shouldn't happen, but handle gracefully)
582 output.push(0x01); // Type=1 (RLE), Size_Format=0, Size=0
583 } else if count <= 31 {
584 // 5-bit size: 1-byte header with Size_Format = 0
585 // byte0 = count[4:0] << 3 | Size_Format << 2 | Block_Type
586 // = count << 3 | 0 << 2 | 1
587 // = count << 3 | 1
588 let byte0 = ((count << 3) | 1) as u8;
589 output.push(byte0);
590 } else if count <= 4095 {
591 // 12-bit size: 2-byte header with Size_Format = 1
592 // byte0 = count[3:0] << 4 | Size_Format << 2 | Block_Type
593 // = count[3:0] << 4 | 1 << 2 | 1
594 // = count[3:0] << 4 | 0x05
595 // byte1 = count[11:4]
596 let byte0 = ((count & 0x0F) << 4) | 0x05;
597 let byte1 = (count >> 4) & 0xFF;
598 output.push(byte0 as u8);
599 output.push(byte1 as u8);
600 } else {
601 // 20-bit size: 3-byte header with Size_Format = 3
602 // byte0 = count[3:0] << 4 | 3 << 2 | 1 = count[3:0] << 4 | 0x0D
603 // byte1 = count[11:4]
604 // byte2 = count[19:12]
605 let byte0 = ((count & 0x0F) << 4) | 0x0D;
606 let byte1 = (count >> 4) & 0xFF;
607 let byte2 = (count >> 12) & 0xFF;
608 output.push(byte0 as u8);
609 output.push(byte1 as u8);
610 output.push(byte2 as u8);
611 }
612
613 // Single byte value
614 output.push(byte);
615
616 Ok(())
617}
618
619/// Encode the sequences section.
620///
621/// Uses RLE mode when all sequences have the same codes (uniform),
622/// otherwise falls back to FSE encoding with predefined tables.
623///
624/// # Performance
625///
626/// Sequences are encoded once during RLE analysis and reused for FSE
627/// encoding if needed, avoiding redundant encoding work.
628pub fn encode_sequences(sequences: &[Sequence], output: &mut Vec<u8>) -> Result<()> {
629 if sequences.is_empty() {
630 output.push(0);
631 return Ok(());
632 }
633
634 // Analyze sequences to pre-encode into codes+extra bits.
635 let suitability = analyze_for_rle(sequences);
636
637 // Always use FSE/Predefined mode for better cross-decoder compatibility.
638 // RLE mode has subtle implementation differences that cause issues
639 // with some reference decoders. Predefined FSE mode (mode=0x00) is
640 // universally supported and produces identical output to reference zstd.
641 encode_sequences_fse_with_encoded(&suitability.encoded, output)
642}
643
644/// Encode a complete block.
645pub fn encode_block(input: &[u8], matches: &[Match], output: &mut Vec<u8>) -> Result<()> {
646 let (literals, sequences) = matches_to_sequences(input, matches);
647 encode_literals(&literals, output)?;
648 encode_sequences(&sequences, output)?;
649 Ok(())
650}
651
652// =============================================================================
653// Tests
654// =============================================================================
655
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_block_encoder_creation() {
        let enc = BlockEncoder::new();
        assert!(enc.literals().is_empty());
        assert!(enc.sequences().is_empty());
    }

    #[test]
    fn test_add_literals() {
        let mut enc = BlockEncoder::new();
        enc.add_literal(b'A');
        enc.add_literals(b"BC");

        assert_eq!(enc.literals(), b"ABC");
    }

    #[test]
    fn test_add_match() {
        let mut enc = BlockEncoder::new();
        enc.add_match(5, 10, 4);

        let seqs = enc.sequences();
        assert_eq!(seqs.len(), 1);
        assert_eq!(seqs[0].literal_length, 5);
        assert_eq!(seqs[0].offset, 10);
        assert_eq!(seqs[0].match_length, 4);
    }

    #[test]
    fn test_matches_to_sequences_empty() {
        let (literals, sequences) = matches_to_sequences(b"hello", &[]);

        assert_eq!(literals, b"hello");
        assert!(sequences.is_empty());
    }

    #[test]
    fn test_matches_to_sequences_with_match() {
        // "abcd" repeated: one match of length 4 at position 4, offset 4.
        let ms = [Match::new(4, 4, 4)];
        let (literals, sequences) = matches_to_sequences(b"abcdabcd", &ms);

        assert_eq!(literals, b"abcd"); // literals preceding the match
        assert_eq!(sequences.len(), 1);
        assert_eq!(sequences[0].literal_length, 4);
        // Offset 4 hits repeat_offset_2 of the initial [1, 4, 8], encoded as 2.
        assert_eq!(sequences[0].offset, 2);
        assert_eq!(sequences[0].match_length, 4);
    }

    #[test]
    fn test_matches_to_sequences_new_offset() {
        // Copy "abcdef" from position 0: match at position 9, offset 9, length 6.
        let ms = [Match::new(9, 9, 6)];
        let (literals, sequences) = matches_to_sequences(b"abcdefXXXabcdef", &ms);

        assert_eq!(literals, b"abcdefXXX"); // literals preceding the match
        assert_eq!(sequences.len(), 1);
        assert_eq!(sequences[0].literal_length, 9);
        // A fresh offset of 9 is encoded as 9 + 3 = 12.
        assert_eq!(sequences[0].offset, 12);
        assert_eq!(sequences[0].match_length, 6);
    }

    #[test]
    fn test_repeat_offset_encoder() {
        // Repeat offsets start as [1, 4, 8] per RFC 8878.
        let mut enc = RepeatOffsetsEncoder::new();

        // 4 hits repeat_offset_2 -> code 2, state becomes [4, 1, 8].
        assert_eq!(enc.encode(4, 5), 2);
        // 4 is now repeat_offset_1 -> code 1, state unchanged [4, 1, 8].
        assert_eq!(enc.encode(4, 5), 1);
        // 1 hits repeat_offset_2 -> code 2, state becomes [1, 4, 8].
        assert_eq!(enc.encode(1, 5), 2);
        // 100 is new -> encoded as 100 + 3 = 103, state becomes [100, 1, 4].
        assert_eq!(enc.encode(100, 5), 103);
    }

    #[test]
    fn test_encode_raw_literals_small() {
        let mut out = Vec::new();
        encode_raw_literals(b"Hello", &mut out).unwrap();

        // 1-byte header, Size_Format = 0: (5 << 3) | 0 = 0x28.
        assert_eq!(out[0], 0x28);
        assert_eq!(&out[1..], b"Hello");
    }

    #[test]
    fn test_encode_raw_literals_medium() {
        let data: Vec<u8> = (0..100).collect();
        let mut out = Vec::new();
        encode_raw_literals(&data, &mut out).unwrap();

        // 12-bit size format: 2-byte header plus the payload.
        assert_eq!(out.len(), 2 + 100);
    }

    #[test]
    fn test_encode_rle_literals() {
        let mut out = Vec::new();
        encode_rle_literals(b'X', 10, &mut out).unwrap();

        // 1-byte header, Size_Format = 0: (10 << 3) | 1 = 0x51.
        assert_eq!(out[0], 0x51);
        assert_eq!(out[1], b'X');
    }

    #[test]
    fn test_encode_sequences_empty() {
        let mut out = Vec::new();
        encode_sequences(&[], &mut out).unwrap();

        assert_eq!(out, vec![0]);
    }

    #[test]
    fn test_encode_sequences_count_small() {
        let seqs = [Sequence::new(0, 4, 3)];
        let mut out = Vec::new();
        encode_sequences(&seqs, &mut out).unwrap();

        // A single sequence encodes its count as one byte: 1.
        assert_eq!(out[0], 1);
    }

    #[test]
    fn test_encode_literals_detects_rle() {
        let run = vec![b'A'; 50];
        let mut out = Vec::new();
        encode_literals(&run, &mut out).unwrap();

        // RLE encoding must be far smaller than the 50-byte input.
        assert!(out.len() < 50);
    }

    #[test]
    fn test_encode_block() {
        let ms = [Match::new(4, 4, 4)];
        let mut out = Vec::new();

        encode_block(b"abcdabcd", &ms, &mut out).unwrap();

        // Output contains a literals section and a sequences section.
        assert!(!out.is_empty());
    }
}