use crate::block::END_OFFSET;
use crate::block::LZ4_MIN_LENGTH;
use crate::block::MAX_DISTANCE;
use crate::block::MFLIMIT;
use crate::block::MINMATCH;
#[cfg(feature = "safe-encode")]
use std::convert::TryInto;
const INCREASE_STEPSIZE_BITSHIFT: usize = 4;
pub fn hash(sequence: u32) -> u32 {
(sequence.wrapping_mul(2654435761_u32)) >> 16
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn get_batch(input: &[u8], n: usize) -> u32 {
unsafe { read_u32_ptr(input.as_ptr().add(n)) }
}
#[inline]
#[cfg(feature = "safe-encode")]
fn get_batch(input: &[u8], n: usize) -> u32 {
let arr: &[u8; 4] = input[n..n + 4].try_into().unwrap();
as_u32_le(arr)
}
#[inline]
fn get_hash_at(input: &[u8], pos: usize, dict_bitshift: u8) -> usize {
hash(get_batch(input, pos)) as usize >> dict_bitshift
}
#[inline]
fn token_from_literal(lit_len: usize) -> u8 {
if lit_len < 0xF {
(lit_len as u8) << 4
} else {
0xF0
}
}
#[inline]
#[cfg(feature = "safe-encode")]
fn count_same_bytes(first: &[u8], second: &[u8], cur: &mut usize) -> usize {
let cur_slice = &first[*cur..first.len() - END_OFFSET];
let mut num = 0;
for (block1, block2) in cur_slice.chunks_exact(8).zip(second.chunks_exact(8)) {
let input_block = usize::from_le(as_usize_le(block1));
let match_block = usize::from_le(as_usize_le(block2));
if input_block == match_block {
num += 8;
} else {
let diff = input_block ^ match_block;
num += get_common_bytes(diff) as usize;
break;
}
}
*cur += num;
return num;
}
#[inline]
#[cfg(feature = "safe-encode")]
fn as_usize_le(array: &[u8]) -> usize {
((array[0] as usize) << 0)
| ((array[1] as usize) << 8)
| ((array[2] as usize) << 16)
| ((array[3] as usize) << 24)
| ((array[4] as usize) << 32)
| ((array[5] as usize) << 40)
| ((array[6] as usize) << 48)
| ((array[7] as usize) << 56)
}
#[inline]
#[cfg(feature = "safe-encode")]
fn as_u32_le(array: &[u8; 4]) -> u32 {
((array[0] as u32) << 0)
| ((array[1] as u32) << 8)
| ((array[2] as u32) << 16)
| ((array[3] as u32) << 24)
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn count_same_bytes(first: &[u8], mut second: &[u8], cur: &mut usize) -> usize {
let start = *cur;
const STEP_SIZE: usize = std::mem::size_of::<usize>();
while *cur + STEP_SIZE + END_OFFSET < first.len() {
let diff =
read_usize_ptr(unsafe { first.as_ptr().add(*cur) }) ^ read_usize_ptr(second.as_ptr());
if diff == 0 {
*cur += STEP_SIZE;
second = &second[STEP_SIZE..];
continue;
} else {
*cur += get_common_bytes(diff) as usize;
return *cur - start;
}
}
#[cfg(target_pointer_width = "64")]
{
if *cur + 4 + END_OFFSET < first.len() {
let diff =
read_u32_ptr(unsafe { first.as_ptr().add(*cur) }) ^ read_u32_ptr(second.as_ptr());
if diff == 0 {
*cur += 4;
return *cur - start;
} else {
*cur += (diff.trailing_zeros() >> 3) as usize;
return *cur - start;
}
}
}
if *cur + 2 + END_OFFSET < first.len() {
let diff =
read_u16_ptr(unsafe { first.as_ptr().add(*cur) }) ^ read_u16_ptr(second.as_ptr());
if diff == 0 {
*cur += 2;
return *cur - start;
} else {
*cur += (diff.trailing_zeros() >> 3) as usize;
return *cur - start;
}
}
if *cur + 1 + END_OFFSET < first.len()
&& unsafe { first.as_ptr().add(*cur).read() } == unsafe { second.as_ptr().read() }
{
*cur += 1;
}
*cur - start
}
#[inline]
fn write_integer(output: &mut Vec<u8>, mut n: usize) -> std::io::Result<()> {
while n >= 0xFF {
n -= 0xFF;
push_byte(output, 0xFF);
}
push_byte(output, n as u8);
Ok(())
}
#[inline]
fn handle_last_literals(
output: &mut Vec<u8>,
input: &[u8],
input_size: usize,
start: usize,
) -> std::io::Result<usize> {
let lit_len = input_size - start;
let token = token_from_literal(lit_len);
push_byte(output, token);
if lit_len >= 0xF {
write_integer(output, lit_len - 0xF)?;
}
#[cfg(feature = "safe-encode")]
{
copy_literals(output, &input[start..]);
}
#[cfg(not(feature = "safe-encode"))]
{
unsafe{copy_literals(output, &input[start..]);}
}
Ok(output.len())
}
#[inline]
pub fn compress_into(input: &[u8], output: &mut Vec<u8>) -> std::io::Result<usize> {
let (dict_size, dict_bitshift) = match input.len() {
0..=500 => (128, 9),
501..=1_000 => (256, 8),
1_001..=4_000 => (512, 7),
4_001..=8_000 => (1024, 6),
8_001..=16_000 => (2048, 5),
16_001..=30_000 => (8192, 3),
_ => (16384, 2),
};
let mut dict = vec![0; dict_size];
let input_size = input.len();
let dict_bitshift = dict_bitshift;
if input_size < LZ4_MIN_LENGTH as usize {
let lit_len = input_size;
let token = token_from_literal(lit_len);
push_byte(output, token);
if lit_len >= 0xF {
write_integer(output, lit_len - 0xF)?;
}
#[cfg(feature = "safe-encode")]
{
copy_literals(output, &input);
}
#[cfg(not(feature = "safe-encode"))]
{
unsafe{copy_literals(output, &input);}
}
return Ok(output.len());
}
let hash = get_hash_at(input, 0, dict_bitshift);
dict[hash] = 0;
assert!(LZ4_MIN_LENGTH as usize > END_OFFSET);
let end_pos_check = input_size - MFLIMIT as usize;
let mut candidate;
let mut cur = 0;
let mut start = cur;
cur += 1;
loop {
let mut step_size;
let mut non_match_count = 1 << INCREASE_STEPSIZE_BITSHIFT;
let mut next_cur = cur;
while {
non_match_count += 1;
step_size = non_match_count >> INCREASE_STEPSIZE_BITSHIFT;
cur = next_cur;
next_cur += step_size;
if cur > end_pos_check {
return handle_last_literals(output, input, input_size, start);
}
let hash = get_hash_at(input, cur, dict_bitshift);
candidate = dict[hash];
dict[hash] = cur;
(candidate + MAX_DISTANCE) < cur || get_batch(input, candidate) != get_batch(input, cur)
} {}
let lit_len = cur - start;
let mut token = token_from_literal(lit_len);
let offset = (cur - candidate) as u16;
cur += MINMATCH;
let duplicate_length = count_same_bytes(input, &input[candidate + MINMATCH..], &mut cur);
let hash = get_hash_at(input, cur - 2, dict_bitshift);
dict[hash] = cur - 2;
token |= if duplicate_length < 0xF {
duplicate_length as u8
} else {
0xF
};
push_byte(output, token);
if lit_len >= 0xF {
write_integer(output, lit_len - 0xF)?;
}
#[cfg(feature = "safe-encode")]
{
copy_literals(output, &input[start..start + lit_len]);
}
#[cfg(not(feature = "safe-encode"))]
{
unsafe{copy_literals(output, &input[start..start + lit_len]);}
}
push_byte(output, offset as u8);
push_byte(output, (offset >> 8) as u8);
if duplicate_length >= 0xF {
write_integer(output, duplicate_length - 0xF)?;
}
start = cur;
}
}
#[inline]
#[cfg(feature = "safe-encode")]
fn push_byte(output: &mut Vec<u8>, el: u8) {
output.push(el);
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn push_byte(output: &mut Vec<u8>, el: u8) {
unsafe {
std::ptr::write(output.as_mut_ptr().add(output.len()), el);
output.set_len(output.len() + 1);
}
}
#[inline]
#[cfg(feature = "safe-encode")]
fn copy_literals(output: &mut Vec<u8>, input: &[u8]) {
output.extend_from_slice(input);
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
unsafe fn copy_literals(output: &mut Vec<u8>, input: &[u8]) {
std::ptr::copy_nonoverlapping(
input.as_ptr(),
output.as_mut_ptr().add(output.len()),
input.len(),
);
output.set_len(output.len() + input.len());
}
#[inline]
pub fn compress_prepend_size(input: &[u8]) -> Vec<u8> {
let mut compressed = Vec::with_capacity(16 + 4 + (input.len() as f64 * 1.1) as usize);
unsafe { compressed.set_len(4) };
compress_into(input, &mut compressed).unwrap();
let size = input.len() as u32;
compressed[0] = size as u8;
compressed[1] = (size >> 8) as u8;
compressed[2] = (size >> 16) as u8;
compressed[3] = (size >> 24) as u8;
compressed
}
#[inline]
pub fn compress(input: &[u8]) -> Vec<u8> {
let mut compressed = Vec::with_capacity(16 + (input.len() as f64 * 1.1) as usize);
compress_into(input, &mut compressed).unwrap();
compressed
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn read_u32_ptr(input: *const u8) -> u32 {
let mut num: u32 = 0;
unsafe {
std::ptr::copy_nonoverlapping(input, &mut num as *mut u32 as *mut u8, 4);
}
num
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn read_usize_ptr(input: *const u8) -> usize {
let mut num: usize = 0;
unsafe {
std::ptr::copy_nonoverlapping(
input,
&mut num as *mut usize as *mut u8,
std::mem::size_of::<usize>(),
);
}
num
}
#[inline]
#[cfg(not(feature = "safe-encode"))]
fn read_u16_ptr(input: *const u8) -> u16 {
let mut num: u16 = 0;
unsafe {
std::ptr::copy_nonoverlapping(input, &mut num as *mut u16 as *mut u8, 2);
}
num
}
#[inline]
fn get_common_bytes(diff: usize) -> u32 {
let tr_zeroes = diff.trailing_zeros();
tr_zeroes >> 3
}
#[test]
#[cfg(target_pointer_width = "64")]
#[cfg(not(feature = "safe-encode"))]
fn test_get_common_bytes() {
let num1 = read_usize_ptr([0, 0, 0, 0, 0, 0, 0, 1].as_ptr());
let num2 = read_usize_ptr([0, 0, 0, 0, 0, 0, 0, 2].as_ptr());
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff), 7);
let num1 = read_usize_ptr([0, 0, 0, 0, 0, 0, 1, 1].as_ptr());
let num2 = read_usize_ptr([0, 0, 0, 0, 0, 0, 0, 2].as_ptr());
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff), 6);
let num1 = read_usize_ptr([1, 0, 0, 0, 0, 0, 1, 1].as_ptr());
let num2 = read_usize_ptr([0, 0, 0, 0, 0, 0, 0, 2].as_ptr());
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff), 0);
}
#[test]
#[cfg(target_pointer_width = "32")]
fn test_get_common_bytes() {
let num1 = read_u32(&[0, 0, 0, 1]);
let num2 = read_u32(&[0, 0, 0, 2]);
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff as usize), 3);
let num1 = read_u32(&[0, 0, 1, 1]);
let num2 = read_u32(&[0, 0, 0, 2]);
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff as usize), 2);
let num1 = read_u32(&[1, 0, 1, 1]);
let num2 = read_u32(&[0, 0, 0, 2]);
let diff = num1 ^ num2;
assert_eq!(get_common_bytes(diff as usize), 0);
}
#[test]
fn test_bug() {
let input: &[u8] = &[
10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18, 10, 12, 14, 16, 18,
];
let _out = compress(&input);
}
#[test]
fn test_compare() {
let mut input: &[u8] = &[10, 12, 14, 16];
let mut cache = vec![];
let mut encoder = lz4::EncoderBuilder::new()
.level(2)
.build(&mut cache)
.unwrap();
std::io::copy(&mut input, &mut encoder).unwrap();
let (comp_lz4, _result) = encoder.finish();
println!("{:?}", comp_lz4);
let input: &[u8] = &[10, 12, 14, 16];
let out = compress(&input);
dbg!(&out);
}