use crate::{CompressedBlock, ZiftError};
/// Size ceiling of 4 MiB, in bytes. It bounds a single block's declared size, a single
/// decoded literal length, and the initial capacity reserved for extracted literals.
const MAX_BLOCK_SIZE: usize = 4 * 1024 * 1024;
/// The four bytes `parse_frame_header` compares against the start of the input to decide
/// whether an LZ4 frame header is present.
const LZ4_FRAME_MAGIC: [u8; 4] = [0x04, 0x22, 0x4D, 0x18];
/// Limit on how many token sequences `extract_literals` walks in one block before it
/// gives up with an error.
const MAX_SEQUENCES_PER_BLOCK: usize = 100_000;
/// Extracts only the literal bytes from a raw LZ4 block, skipping every match.
///
/// The block is walked sequence by sequence. The token's high nibble is the literal
/// length and the low nibble the match length; a nibble of 15 is extended through
/// `decode_length`. Literal bytes are appended to the output, while the 2-byte match
/// offset and any match-length extension are stepped over without copying anything.
///
/// At most `max_output` bytes are returned. Once that cap is reached the walk stops,
/// so the rest of the block is not validated.
///
/// # Errors
///
/// * [`ZiftError::InvalidData`] if the block holds more than `MAX_SEQUENCES_PER_BLOCK`
///   sequences, if the literal bytes to copy run past the end of `compressed`, or if a
///   match offset is truncated.
/// * [`ZiftError::BlockTooLarge`] if a literal length exceeds `MAX_BLOCK_SIZE`.
/// * Any error reported by `decode_length` for a malformed length extension.
pub fn extract_literals(compressed: &[u8], max_output: usize) -> Result<Vec<u8>, ZiftError> {
    let initial_cap = compressed
        .len()
        .saturating_mul(2)
        .min(max_output)
        .min(MAX_BLOCK_SIZE);
    let mut literals = Vec::with_capacity(initial_cap);
    let mut pos = 0usize;
    let mut sequence_count = 0usize;

    while pos < compressed.len() && literals.len() < max_output {
        // Fail only once the limit is actually exceeded, so a block with exactly
        // MAX_SEQUENCES_PER_BLOCK sequences is accepted, as the message advertises.
        sequence_count += 1;
        if sequence_count > MAX_SEQUENCES_PER_BLOCK {
            return Err(ZiftError::InvalidData {
                offset: pos,
                reason: format!("too many LZ4 sequences (max {MAX_SEQUENCES_PER_BLOCK})"),
            });
        }

        // The loop condition guarantees `pos` is in bounds here.
        let token = compressed[pos];
        pos += 1;
        let match_len = (token & 0x0F) as usize;
        let literal_len = match (token >> 4) as usize {
            15 => decode_length(compressed, &mut pos, 15)?,
            short => short,
        };
        if literal_len > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size: literal_len,
                max: MAX_BLOCK_SIZE,
            });
        }

        // Copy no more than the caller's output budget still allows.
        let remaining_output = max_output.saturating_sub(literals.len());
        let to_copy = literal_len.min(remaining_output);
        if to_copy > 0 {
            // `to_copy` is at most MAX_BLOCK_SIZE, so this addition cannot overflow
            // for any `pos` inside the slice.
            if pos + to_copy > compressed.len() {
                return Err(ZiftError::InvalidData {
                    offset: pos,
                    reason: "literal exceeds block bounds".to_string(),
                });
            }
            let spare = literals.capacity() - literals.len();
            if to_copy > 1024 && spare < to_copy {
                // Grow ahead of a large copy, bounded by 1 MiB and by what the output
                // budget can still use beyond the spare capacity.
                let reserve_amount =
                    (MAX_BLOCK_SIZE / 4).min(remaining_output.saturating_sub(spare));
                if reserve_amount > 0 {
                    literals.reserve(reserve_amount);
                }
            }
            literals.extend_from_slice(&compressed[pos..pos + to_copy]);
        }

        // Advance by the full literal length even if the copy was capped.
        pos = pos.saturating_add(literal_len);

        // A sequence that ends exactly at the end of the block has no match part.
        if pos < compressed.len() {
            if pos + 2 > compressed.len() {
                return Err(ZiftError::InvalidData {
                    offset: pos,
                    reason: "truncated match offset".to_string(),
                });
            }
            pos += 2;
            if match_len == 15 {
                // Only the cursor movement matters; the match length itself is unused.
                let _ = decode_length(compressed, &mut pos, match_len)?;
            }
        }
    }
    Ok(literals)
}
/// Decodes the extension bytes of an LZ4 variable-length field.
///
/// Starting from `initial` (the nibble value the caller already read from the token),
/// each byte at `*pos` is added to the running total until a byte below 255 ends the
/// encoding. On success `*pos` points just past the last byte consumed.
///
/// The `MAX_BLOCK_SIZE` guard only runs when another byte is still expected, so the
/// returned value can exceed `MAX_BLOCK_SIZE` by up to 254. Callers that need a hard
/// bound must check the result themselves.
///
/// # Errors
///
/// * [`ZiftError::InvalidData`] if `data` ends before a terminating byte is found, or
///   if the running total overflows `usize`.
/// * [`ZiftError::BlockTooLarge`] if the running total exceeds `MAX_BLOCK_SIZE` while
///   more bytes are still expected.
fn decode_length(data: &[u8], pos: &mut usize, initial: usize) -> Result<usize, ZiftError> {
    let mut len = initial;
    loop {
        if *pos >= data.len() {
            return Err(ZiftError::InvalidData {
                offset: *pos,
                reason: "truncated length encoding".to_string(),
            });
        }
        let byte = data[*pos];
        *pos += 1;

        // Build the error lazily: the eager `ok_or` form allocated the message string
        // for every length byte even though the overflow branch is almost never taken.
        len = len
            .checked_add(usize::from(byte))
            .ok_or_else(|| ZiftError::InvalidData {
                offset: *pos,
                reason: "length overflow in variable-length encoding".to_string(),
            })?;

        // Any byte below 255 terminates the encoding.
        if byte < 255 {
            break;
        }
        // More bytes follow; stop early once the total is already out of range.
        if len > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size: len,
                max: MAX_BLOCK_SIZE,
            });
        }
    }
    Ok(len)
}
/// Number of blocks `parse_lz4_blocks` will collect; finding further input after this
/// many blocks is reported as invalid data.
const MAX_BLOCKS_PER_STREAM: usize = 10_000;
/// Cap of 256 MiB, in bytes, on the combined literal bytes gathered across all blocks
/// of one input.
const MAX_TOTAL_LITERALS: usize = 256 * 1024 * 1024;
/// Multiplier applied to the input length to derive a second cap on the combined literal
/// bytes; `parse_lz4_blocks` never lets that cap drop below 1 MiB.
const MAX_DECOMPRESSION_RATIO: usize = 250;
/// Splits `data` into its LZ4 blocks and records the literal bytes of each one.
///
/// Parsing starts after whatever header `parse_frame_header` reports (nothing when the
/// input does not begin with the frame magic). Each block is a 4-byte little-endian
/// word followed by its payload: the top bit of the word marks a stored (uncompressed)
/// payload and the low 31 bits give the payload size. A size of zero ends parsing, as
/// does running out of input or having fewer than four bytes left for another word.
///
/// Stored payloads are copied verbatim as literals; compressed payloads go through
/// `extract_literals`.
///
/// # Errors
///
/// * [`ZiftError::InvalidData`] when the block-count limit is reached with input left,
///   a payload extends past the end of `data`, or the accumulated literals exceed the
///   ratio-derived budget.
/// * [`ZiftError::BlockTooLarge`] when a block's size exceeds `MAX_BLOCK_SIZE` or the
///   accumulated literals exceed `MAX_TOTAL_LITERALS`.
/// * Any error from `parse_frame_header` or `extract_literals`.
pub fn parse_lz4_blocks(data: &[u8]) -> Result<Vec<CompressedBlock>, ZiftError> {
    const BLOCK_HEADER_LEN: usize = 4;
    const STORED_FLAG: usize = 0x8000_0000;
    const SIZE_MASK: usize = 0x7FFF_FFFF;

    let input_len = data.len() as u64;
    // Depends only on the input length, so it is computed once up front.
    let literal_budget = data
        .len()
        .saturating_mul(MAX_DECOMPRESSION_RATIO)
        .max(1024 * 1024);

    let mut parsed = Vec::new();
    let mut literal_total = 0usize;
    let mut offset = parse_frame_header(data)? as u64;

    while offset < input_len {
        // Same position as `offset`, in the type needed for slicing and error reports.
        let cursor = usize::try_from(offset).unwrap_or(0);

        if parsed.len() >= MAX_BLOCKS_PER_STREAM {
            return Err(ZiftError::InvalidData {
                offset: cursor,
                reason: format!("too many LZ4 blocks (max {MAX_BLOCKS_PER_STREAM})"),
            });
        }
        // Not enough bytes left for another block word: stop quietly.
        if offset + 4 > input_len {
            break;
        }

        let mut word = [0u8; BLOCK_HEADER_LEN];
        word.copy_from_slice(&data[cursor..cursor + BLOCK_HEADER_LEN]);
        let raw = u32::from_le_bytes(word) as usize;
        let stored = raw & STORED_FLAG != 0;
        let size = raw & SIZE_MASK;

        // A zero-sized block ends the block list.
        if size == 0 {
            break;
        }
        if size > MAX_BLOCK_SIZE {
            return Err(ZiftError::BlockTooLarge {
                size,
                max: MAX_BLOCK_SIZE,
            });
        }

        let payload_start = cursor + BLOCK_HEADER_LEN;
        let payload_end = payload_start + size;
        if payload_end > data.len() {
            return Err(ZiftError::InvalidData {
                offset: cursor,
                reason: "truncated block".to_string(),
            });
        }
        let payload = &data[payload_start..payload_end];

        let declared_size = u32::try_from(size).map_err(|_| ZiftError::BlockTooLarge {
            size,
            max: MAX_BLOCK_SIZE,
        })?;
        let mut block = CompressedBlock::new(offset, declared_size);
        block.literals = if stored {
            payload.to_vec()
        } else {
            extract_literals(payload, MAX_BLOCK_SIZE)?
        };

        literal_total = literal_total.saturating_add(block.literals.len());
        if literal_total > MAX_TOTAL_LITERALS {
            return Err(ZiftError::BlockTooLarge {
                size: literal_total,
                max: MAX_TOTAL_LITERALS,
            });
        }
        if literal_total > literal_budget {
            return Err(ZiftError::InvalidData {
                offset: cursor,
                reason: format!("decompression ratio exceeded limit of {MAX_DECOMPRESSION_RATIO}"),
            });
        }

        parsed.push(block);
        offset = u64::try_from(payload_end).map_err(|_| ZiftError::InvalidData {
            offset: cursor,
            reason: "offset overflow".to_string(),
        })?;
    }

    Ok(parsed)
}
/// Returns the length in bytes of the LZ4 frame header at the start of `data`.
///
/// Input that does not begin with `LZ4_FRAME_MAGIC` (including input shorter than four
/// bytes) is treated as headerless and yields `Ok(0)`. Otherwise the header consists of
/// the 4-byte magic, the FLG and BD bytes, an optional 8-byte content size, an optional
/// 4-byte dictionary ID, and a final header-checksum byte. The checksum byte is skipped,
/// not verified.
///
/// # Errors
///
/// Returns [`ZiftError::InvalidData`] when `data` is empty, when the frame header is
/// truncated, or when the FLG version bits are not `01`.
fn parse_frame_header(data: &[u8]) -> Result<usize, ZiftError> {
    // FLG byte layout used below: bits 7-6 version, bit 3 content size present,
    // bit 0 dictionary ID present.
    const FLG_VERSION_MASK: u8 = 0xC0;
    const FLG_VERSION_01: u8 = 0x40;
    const FLG_CONTENT_SIZE: u8 = 0x08;
    const FLG_DICT_ID: u8 = 0x01;
    const CONTENT_SIZE_LEN: usize = 8;
    // The dictionary ID field is 4 bytes wide; skipping only 1 byte left the returned
    // header length 3 bytes short for frames that carry a dictionary ID.
    const DICT_ID_LEN: usize = 4;

    if data.is_empty() {
        return Err(ZiftError::InvalidData {
            offset: 0,
            reason: "empty input is not valid LZ4 data".to_string(),
        });
    }
    // No frame magic: the caller parses from offset 0.
    if data.len() < 4 || data[..4] != LZ4_FRAME_MAGIC {
        return Ok(0);
    }
    // Magic + FLG + BD + header checksum is the shortest possible header.
    if data.len() < 7 {
        return Err(ZiftError::InvalidData {
            offset: data.len(),
            reason: "truncated lz4 frame header".to_string(),
        });
    }

    let flg = data[4];
    if (flg & FLG_VERSION_MASK) != FLG_VERSION_01 {
        return Err(ZiftError::InvalidData {
            offset: 4,
            reason: "unsupported LZ4 frame version".to_string(),
        });
    }

    // Position just past magic, FLG and BD.
    let mut pos = 6usize;
    if (flg & FLG_CONTENT_SIZE) != 0 {
        pos += CONTENT_SIZE_LEN;
    }
    if (flg & FLG_DICT_ID) != 0 {
        pos += DICT_ID_LEN;
    }

    // `pos` now indexes the header-checksum byte, which must be present.
    if pos >= data.len() {
        return Err(ZiftError::InvalidData {
            offset: pos,
            reason: "truncated lz4 frame descriptor".to_string(),
        });
    }
    Ok(pos + 1)
}
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
    use super::*;

    /// Length extensions are summed onto the initial nibble value.
    #[test]
    fn test_decode_length() {
        // A single byte below 255 ends the encoding: 15 + 100.
        let single = [100u8];
        let mut cursor = 0;
        let decoded = decode_length(&single, &mut cursor, 15).unwrap();
        assert_eq!(decoded, 115);
        assert_eq!(cursor, 1);

        // A 255 byte keeps the encoding going: 15 + 255 + 50.
        let chained = [255u8, 50];
        let mut cursor = 0;
        let decoded = decode_length(&chained, &mut cursor, 15).unwrap();
        assert_eq!(decoded, 320);
        assert_eq!(cursor, 2);
    }

    /// One literal followed by a match offset: only the literal is returned.
    #[test]
    fn test_extract_literals_with_match() {
        let block = [0x11, b'A', 0x01, 0x00];
        let extracted = extract_literals(&block, 1024).unwrap();
        assert_eq!(extracted, b"A");
    }

    /// Literal-length nibble 15 plus one extension byte (10) gives 25 literals.
    #[test]
    fn test_extract_literals_extended() {
        let mut block = vec![0xF0, 10];
        block.extend_from_slice(&[b'X'; 25]);

        let extracted = extract_literals(&block, 1024).unwrap();
        assert_eq!(extracted.len(), 25);
        assert!(extracted.iter().all(|&b| b == b'X'));
    }

    /// The token announces two literals but only one byte follows.
    #[test]
    fn test_extract_literals_truncated() {
        let block = [0x20, b'A'];
        let outcome = extract_literals(&block, 1024);
        assert!(outcome.is_err());
    }

    /// Empty input is an error rather than an empty block list.
    #[test]
    fn test_parse_lz4_blocks_empty() {
        let empty: [u8; 0] = [];
        let outcome = parse_lz4_blocks(&empty);
        assert!(
            outcome.is_err(),
            "empty input must be rejected as invalid LZ4"
        );
    }

    /// Headerless input consisting of just a zero-sized block word.
    #[test]
    fn test_parse_lz4_blocks_non_framed() {
        let end_mark = [0x00, 0x00, 0x00, 0x00];
        let parsed = parse_lz4_blocks(&end_mark).unwrap();
        assert!(
            parsed.is_empty(),
            "end-of-frame marker should produce empty blocks"
        );
    }

    /// A minimal frame header, one stored one-byte block, then a zero-sized block word.
    #[test]
    fn test_parse_lz4_frame_header_then_blocks() {
        // magic, FLG, BD, header checksum
        let mut frame = vec![0x04, 0x22, 0x4D, 0x18, 0x60, 0x40, 0x00];
        // Block word: top bit set (stored payload), size 1.
        frame.extend_from_slice(&0x8000_0001_u32.to_le_bytes());
        frame.push(b'A');
        frame.extend_from_slice(&0_u32.to_le_bytes());

        let parsed = parse_lz4_blocks(&frame).unwrap();
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].literals(), b"A");
    }

    /// A lone literal-only sequence.
    #[test]
    fn test_extract_literals_simple() {
        let block = [0x10, b'A'];
        let extracted = extract_literals(&block, 1024).unwrap();
        assert_eq!(extracted, b"A");
    }
}