use crate::trie::{Trie, NIL_ID};
pub const FACTOR_OFFSET: usize = 256;
#[derive(Clone, Copy, Debug)]
struct Factor {
id: usize,
len: usize,
}
#[derive(Clone, Copy, Debug)]
struct Node {
id: usize,
len: usize,
}
#[derive(Debug)]
pub struct Compressor<'text> {
trie: Trie<'text>,
traced: Vec<Node>,
num_factors: usize,
}
impl<'text> Compressor<'text> {
pub fn run<F>(text: &'text [u8], output: F) -> usize
where
F: FnMut(usize),
{
let mut worker = Compressor {
trie: Trie::new(),
traced: Vec::new(),
num_factors: FACTOR_OFFSET,
};
worker.do_work(text, output)
}
fn do_work<F>(&mut self, text: &'text [u8], mut output: F) -> usize
where
F: FnMut(usize),
{
let mut text_pos = 0;
while text_pos < text.len() {
let text_beg = text_pos;
let factor1 = self.find_longest_match(&text[text_pos..], true);
text_pos += factor1.len;
debug_print!(factor1);
output(factor1.id);
if text.len() <= text_pos {
break;
}
let factor2 = self.find_longest_match(&text[text_pos..], false);
text_pos += factor2.len;
debug_print!(factor2);
output(factor2.id);
if self.traced.is_empty() {
self.insert_new_factor(0, &text[text_beg..text_pos]);
} else {
let mut i = 0;
while i < self.traced.len() {
if text_pos < text_beg + self.traced[i].len {
break;
}
i += 1;
}
let node = self.traced[i - 1];
self.insert_new_factor(node.id, &text[text_beg + node.len..text_pos]);
}
self.num_factors += 1;
debug_print!(&text[text_beg..text_pos]);
debug_print!(&self);
}
debug_print!(&self);
self.num_factors
}
fn find_longest_match(&mut self, text: &'text [u8], trace: bool) -> Factor {
debug_assert!(!text.is_empty());
if trace {
self.traced.clear();
}
let mut node_id: usize = 0;
let mut text_pos: usize = 0;
let mut factor = Factor { id: NIL_ID, len: 0 };
while text_pos < text.len() {
node_id = self.trie.find_child(node_id, &text[text_pos..]);
if node_id == NIL_ID {
break;
}
text_pos += self.trie.get_edge_len(node_id);
let factor_id = self.trie.get_factor_id(node_id);
if factor_id != NIL_ID {
factor = Factor {
id: factor_id,
len: text_pos,
};
if trace {
self.traced.clear();
}
}
if trace {
self.traced.push(Node {
id: node_id,
len: text_pos,
});
}
}
if factor.id == NIL_ID {
factor = Factor {
id: text[0] as usize,
len: 1,
}
}
factor
}
fn insert_new_factor(&mut self, node_id: usize, text: &'text [u8]) {
if !text.is_empty() {
self.trie.add_child(node_id, self.num_factors, text);
} else {
self.trie.set_factor_id(node_id, self.num_factors);
}
}
}
#[cfg(test)]
mod tests {
use crate::compressor::Compressor;
use crate::decompressor::Decompressor;
use std::collections::HashSet;
#[derive(PartialEq, Eq, Hash, Debug)]
struct DoubleFactor(usize, usize);
fn check_duplication(ids: &[usize]) {
let mut checker: HashSet<DoubleFactor> = HashSet::new();
for i in (1..ids.len()).step_by(2) {
let df = DoubleFactor(ids[i - 1], ids[i]);
assert_eq!(checker.get(&df), None);
checker.insert(df);
}
}
fn check_decompress(ids: &[usize], text: &[u8]) {
let mut decoded = Vec::new();
Decompressor::run(ids, |c| decoded.push(c));
assert_eq!(text.len(), decoded.len());
for i in 0..text.len() {
assert_eq!(text[i], decoded[i]);
}
}
#[test]
fn tiny() {
let text = "abaaabababaabbabab".as_bytes();
let mut ids: Vec<usize> = Vec::new();
Compressor::run(text, |id: usize| {
ids.push(id);
});
check_duplication(&ids);
check_decompress(&ids, text);
}
#[test]
fn short() {
let text = "ddadabaddaccdababddccccaccdcaddccacaaabcdbbddacbcbbbdcaadbbcbaddcbcdbaacbbddddbcdbdabdbcbbbcdcdcbada".as_bytes();
let mut ids: Vec<usize> = Vec::new();
Compressor::run(text, |id: usize| {
ids.push(id);
});
check_duplication(&ids);
check_decompress(&ids, text);
}
}