ziftsieve 0.1.0

Search compressed data without full decompression. Literal extraction and indexing for LZ4, Snappy, Gzip, and Zstd.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
//! LZ4 literal extraction without full decompression.
//!
//! This parser walks LZ4 blocks and frame members, extracts literal runs, and
//! skips match resolution. The resulting literals can be indexed directly or
//! attached to [`CompressedBlock`] values for later verification.
//!
//! LZ4 format:
//! ```text
//! Token: [literal_length:4][match_length:4]
//! If literal_length == 15: additional bytes follow (sum until < 255)
//! Literal bytes: [literal_length bytes]
//! If match_length > 0:
//!   Match offset: [offset: u16 LE]
//!   If match_length == 15: additional bytes follow
//! ```
//!
//! This module extracts only the literal bytes, skipping match resolution.

use crate::{CompressedBlock, ZiftError};

/// Maximum LZ4 block size (4MB as per spec).
const MAX_BLOCK_SIZE: usize = 4 * 1024 * 1024;
const LZ4_FRAME_MAGIC: [u8; 4] = [0x04, 0x22, 0x4D, 0x18];

/// Extract literals from an LZ4 block without decompressing matches.
///
/// This is the core optimization: we parse the LZ4 token stream but skip
/// the expensive part (resolving back-references). Only literal bytes are
/// extracted and returned.
///
/// # Performance
///
/// - Full decompression: `O(uncompressed_size)`
/// - Literal extraction: `O(compressed_size)` + `O(total_literal_bytes)`
///
/// For typical compression ratios (2:1 to 4:1), this is 2-4× faster.
/// For high-compressibility data (logs with repeated patterns), can be
/// 10-100× faster since we skip resolving long match chains.
///
/// # Errors
///
/// Returns `ZiftError` if the literal exceeds block bounds or if the
/// match offset is truncated.
///
/// # Parameters
///
/// - `compressed`: Encoded LZ4 block payload.
/// - `max_output`: Maximum number of literal bytes to collect before stopping.
///
/// # Returns
///
/// The literal bytes encountered while scanning the block.
/// Maximum number of sequences per block to prevent `DoS`.
const MAX_SEQUENCES_PER_BLOCK: usize = 100_000;

/// Extract literals from an LZ4 block without decompressing matches.
///
/// This function now properly respects `max_output` by truncating literals
/// at the limit rather than allowing over-allocation. It also includes
/// sequence counting to prevent `DoS` from streams with too many sequences.
///
/// # Performance
///
/// Uses chunked reservation to minimize reallocations while preventing
/// quadratic behavior on malicious input.
/// # Errors
///
/// Returns `ZiftError::InvalidData` if the block is malformed or exceeds maximum sequences limit.
pub fn extract_literals(compressed: &[u8], max_output: usize) -> Result<Vec<u8>, ZiftError> {
    // Pre-allocate with a reasonable estimate: compressed size × 2 or max_output, whichever is smaller.
    // This reduces reallocations while preventing over-allocation on small inputs.
    let initial_cap = (compressed.len().saturating_mul(2))
        .min(max_output)
        .min(MAX_BLOCK_SIZE);
    let mut literals = Vec::with_capacity(initial_cap);
    let mut pos = 0usize;
    let mut sequence_count = 0usize;

    while pos < compressed.len() && literals.len() < max_output {
        // Prevent DoS from too many sequences
        sequence_count += 1;
        if sequence_count >= MAX_SEQUENCES_PER_BLOCK {
            return Err(ZiftError::InvalidData {
                offset: pos,
                reason: format!("too many LZ4 sequences (max {MAX_SEQUENCES_PER_BLOCK})"),
            });
        }

        // Read token
        if pos >= compressed.len() {
            break;
        }
        let token = compressed[pos];
        pos += 1;

        let literal_len = (token >> 4) as usize;
        let match_len = (token & 0x0F) as usize;

        // Decode variable-length literal length
        let literal_len = if literal_len == 15 {
            decode_length(compressed, &mut pos, literal_len)?
        } else {
            literal_len
        };

        // Validate literal length before allocation
        if literal_len > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size: literal_len,
                max: MAX_BLOCK_SIZE,
            });
        }

        // Check if adding this literal would exceed max_output
        let remaining_output = max_output.saturating_sub(literals.len());
        let to_copy = literal_len.min(remaining_output);

        // Copy literal bytes
        if to_copy > 0 {
            if pos + to_copy > compressed.len() {
                return Err(ZiftError::InvalidData {
                    offset: pos,
                    reason: "literal exceeds block bounds".to_string(),
                });
            }

            // Reserve space in chunks to reduce reallocations
            if to_copy > 1024 && literals.capacity() - literals.len() < to_copy {
                let reserve_amount = (MAX_BLOCK_SIZE / 4)
                    .min(remaining_output.saturating_sub(literals.capacity() - literals.len()));
                if reserve_amount > 0 {
                    literals.reserve(reserve_amount);
                }
            }

            literals.extend_from_slice(&compressed[pos..pos + to_copy]);
        }

        // Advance position by full literal_len even if we truncated
        pos = pos.saturating_add(literal_len);

        // Skip match data (we don't resolve back-references).
        // In LZ4, every sequence except the LAST has a match section.
        // The last sequence is detected by being at end-of-block after literals.
        // Match offset (2 bytes) is ALWAYS present when there's a match section,
        // even when token low nibble is 0 (which means min match length of 4).
        if pos < compressed.len() {
            // Read match offset (2 bytes, always present for non-final sequences)
            if pos + 2 > compressed.len() {
                return Err(ZiftError::InvalidData {
                    offset: pos,
                    reason: "truncated match offset".to_string(),
                });
            }
            pos += 2; // Skip offset

            // Decode variable-length match length
            if match_len == 15 {
                let _ = decode_length(compressed, &mut pos, match_len)?;
            }
        }
    }

    Ok(literals)
}

/// Decode a variable-length length field.
///
/// LZ4 uses additive encoding: if initial value is 15, sum subsequent
/// bytes until one < 255 is found.
fn decode_length(data: &[u8], pos: &mut usize, initial: usize) -> Result<usize, ZiftError> {
    let mut len = initial;

    loop {
        if *pos >= data.len() {
            return Err(ZiftError::InvalidData {
                offset: *pos,
                reason: "truncated length encoding".to_string(),
            });
        }
        let byte = data[*pos];
        *pos += 1;
        len = len
            .checked_add(byte as usize)
            .ok_or(ZiftError::InvalidData {
                offset: *pos,
                reason: "length overflow in variable-length encoding".to_string(),
            })?;

        if byte < 255 {
            break;
        }

        // Safety: cap iterations to prevent infinite loop on malicious input
        // (all-255 bytes). Max realistic LZ4 block is 4MB.
        if len > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size: len,
                max: MAX_BLOCK_SIZE,
            });
        }
    }

    Ok(len)
}

/// Maximum number of blocks per stream to prevent `DoS`.
const MAX_BLOCKS_PER_STREAM: usize = 10_000;

/// Maximum total literals per stream to prevent OOM.
const MAX_TOTAL_LITERALS: usize = 256 * 1024 * 1024; // 256 MB

/// Maximum decompression ratio to prevent zip bombs.
const MAX_DECOMPRESSION_RATIO: usize = 250;

/// Parse multiple LZ4 blocks and extract literals from each.
///
/// # Errors
///
/// Returns `ZiftError` if the data is truncated, if block size exceeds
/// maximum limits, or if literal extraction fails.
///
/// # Parameters
///
/// - `data`: LZ4 frame or block bytes.
///
/// # Returns
///
/// A vector of [`CompressedBlock`] values in stream order.
pub fn parse_lz4_blocks(data: &[u8]) -> Result<Vec<CompressedBlock>, ZiftError> {
    let mut blocks = Vec::new();
    let mut offset = parse_frame_header(data)? as u64;

    let mut total_literals = 0usize;

    while offset < data.len() as u64 {
        // Prevent DoS from too many blocks
        if blocks.len() >= MAX_BLOCKS_PER_STREAM {
            return Err(ZiftError::InvalidData {
                offset: usize::try_from(offset).unwrap_or(0),
                reason: format!("too many LZ4 blocks (max {MAX_BLOCKS_PER_STREAM})"),
            });
        }

        if offset + 4 > data.len() as u64 {
            break; // Incomplete block header
        }

        // Read block header (compressed size)
        let block_size = u32::from_le_bytes([
            data[usize::try_from(offset).unwrap_or(0)],
            data[usize::try_from(offset).unwrap_or(0) + 1],
            data[usize::try_from(offset).unwrap_or(0) + 2],
            data[usize::try_from(offset).unwrap_or(0) + 3],
        ]) as usize;

        // High bit indicates uncompressed block
        let is_uncompressed = (block_size & 0x8000_0000) != 0;
        let size = block_size & 0x7FFF_FFFF;

        if size == 0 {
            // End marker
            break;
        }

        if size > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size,
                max: MAX_BLOCK_SIZE,
            });
        }

        let header_size = 4usize;
        let data_start = usize::try_from(offset).unwrap_or(0) + header_size;
        let data_end = data_start + size;

        if data_end > data.len() {
            return Err(ZiftError::InvalidData {
                offset: usize::try_from(offset).unwrap_or(0),
                reason: "truncated block".to_string(),
            });
        }

        let block_data = &data[data_start..data_end];

        let mut block = CompressedBlock::new(
            offset,
            u32::try_from(size).map_err(|_| ZiftError::BlockTooLarge {
                size,
                max: MAX_BLOCK_SIZE,
            })?,
        );

        if is_uncompressed {
            // Uncompressed block - all bytes are literals
            block.literals = block_data.to_vec();
        } else {
            // Compressed block - extract literals
            block.literals = extract_literals(block_data, MAX_BLOCK_SIZE)?;
        }

        total_literals = total_literals.saturating_add(block.literals.len());
        if total_literals > MAX_TOTAL_LITERALS {
            return Err(ZiftError::BlockTooLarge {
                size: total_literals,
                max: MAX_TOTAL_LITERALS,
            });
        }

        let max_allowed_literals = data
            .len()
            .saturating_mul(MAX_DECOMPRESSION_RATIO)
            .max(1024 * 1024);
        if total_literals > max_allowed_literals {
            return Err(ZiftError::InvalidData {
                offset: usize::try_from(offset).unwrap_or(0),
                reason: format!("decompression ratio exceeded limit of {MAX_DECOMPRESSION_RATIO}"),
            });
        }

        blocks.push(block);
        offset = u64::try_from(data_end).map_err(|_| ZiftError::InvalidData {
            offset: usize::try_from(offset).unwrap_or(0),
            reason: "offset overflow".to_string(),
        })?;
    }

    Ok(blocks)
}

fn parse_frame_header(data: &[u8]) -> Result<usize, ZiftError> {
    if data.is_empty() {
        return Err(ZiftError::InvalidData {
            offset: 0,
            reason: "empty input is not valid LZ4 data".to_string(),
        });
    }
    if data.len() < 4 || data[..4] != LZ4_FRAME_MAGIC {
        // Not a framed LZ4 stream — treat as raw block data (legacy format).
        // Start parsing from offset 0.
        return Ok(0);
    }
    if data.len() < 7 {
        return Err(ZiftError::InvalidData {
            offset: data.len(),
            reason: "truncated lz4 frame header".to_string(),
        });
    }

    let flg = data[4];

    // Version is in bits 6-7; only version 1 (0b01) is supported.
    if (flg & 0xC0) != 0x40 {
        return Err(ZiftError::InvalidData {
            offset: 4,
            reason: "unsupported LZ4 frame version".to_string(),
        });
    }

    let mut pos = 6usize;

    if (flg & 0x08) != 0 {
        pos += 8;
    }
    if (flg & 0x01) != 0 {
        pos += 1;
    }
    if pos >= data.len() {
        return Err(ZiftError::InvalidData {
            offset: pos,
            reason: "truncated lz4 frame descriptor".to_string(),
        });
    }

    pos += 1; // header checksum
    Ok(pos)
}

#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    #[test]
    fn test_decode_length() {
        // Length with extension (15 + 100 = 115)
        let data = [100u8];
        let mut pos = 0;
        assert_eq!(decode_length(&data, &mut pos, 15).unwrap(), 115);
        assert_eq!(pos, 1); // Consumed 1 byte

        // Length with multiple extensions (15 + 255 + 50 = 320)
        let data = [255u8, 50];
        let mut pos = 0;
        assert_eq!(decode_length(&data, &mut pos, 15).unwrap(), 320);
        assert_eq!(pos, 2); // Consumed 2 bytes
    }

    #[test]
    fn test_extract_literals_with_match() {
        // Token: 0x11 = literal_len=1, match_len=1
        // Literal: 'A'
        // Match offset: 0x0001 (1 byte back)
        // We should skip the match, only get 'A'
        let data = [0x11, b'A', 0x01, 0x00];
        let literals = extract_literals(&data, 1024).unwrap();
        assert_eq!(literals, b"A");
    }

    #[test]
    fn test_extract_literals_extended() {
        // Token: 0xF0 = literal_len=15 (needs extension), match_len=0
        // Extension: 10 (total = 25)
        // 25 literal bytes
        let mut data = vec![0xF0, 10]; // 15 + 10 = 25 literal bytes
        data.extend_from_slice(&[b'X'; 25]);

        let literals = extract_literals(&data, 1024).unwrap();
        assert_eq!(literals.len(), 25);
        assert!(literals.iter().all(|&b| b == b'X'));
    }

    #[test]
    fn test_extract_literals_truncated() {
        // Claimed literal length exceeds data
        let data = [0x20, b'A']; // Claims 2 literals, only 1 present
        let result = extract_literals(&data, 1024);
        assert!(result.is_err());
    }

    #[test]
    fn test_parse_lz4_blocks_empty() {
        let data = []; // Empty
        let result = parse_lz4_blocks(&data);
        assert!(
            result.is_err(),
            "empty input must be rejected as invalid LZ4"
        );
    }

    #[test]
    fn test_parse_lz4_blocks_non_framed() {
        // Data without LZ4 frame magic is treated as raw block data (legacy format).
        // With invalid block headers, it should return empty blocks (no panic).
        let data = [0x00, 0x00, 0x00, 0x00]; // End-of-frame marker (size=0)
        let blocks = parse_lz4_blocks(&data).unwrap();
        assert!(
            blocks.is_empty(),
            "end-of-frame marker should produce empty blocks"
        );
    }

    #[test]
    fn test_parse_lz4_frame_header_then_blocks() {
        let mut data = vec![0x04, 0x22, 0x4D, 0x18, 0x60, 0x40, 0x00];
        data.extend_from_slice(&0x8000_0001_u32.to_le_bytes());
        data.push(b'A');
        data.extend_from_slice(&0_u32.to_le_bytes());

        let blocks = parse_lz4_blocks(&data).unwrap();
        assert_eq!(blocks.len(), 1);
        assert_eq!(blocks[0].literals(), b"A");
    }

    #[test]
    fn test_extract_literals_simple() {
        // Token: 0x10 = literal_len=1, match_len=0
        // Literal: 'A'
        let data = [0x10, b'A'];
        let literals = extract_literals(&data, 1024).unwrap();
        assert_eq!(literals, b"A");
    }
}