use std::collections::VecDeque;
use crate::file_format::OriginPointer;
#[derive(Debug, Default, Clone)]
pub(crate) struct BwtEncoded {
pub(crate) data: Vec<u8>,
pub(crate) origin_pointer: OriginPointer,
}
impl BwtEncoded {
pub(crate) fn new(data: Vec<u8>, origin_pointer: OriginPointer) -> Self {
BwtEncoded {
data,
origin_pointer,
}
}
}
pub(super) fn encode(data: &[u8]) -> BwtEncoded {
assert!(data.len() < 900_000);
if data.is_empty() {
return BwtEncoded::default();
}
let mut all_rotations = all_rotations(data);
all_rotations.sort_unstable();
let origin_pointer: OriginPointer = all_rotations
.binary_search_by_key(&data, |v| &v[..])
.expect("`data` must be in `all_rotations`")
.try_into()
.expect("`origin_pointer` must fit into 24 bits");
let data: Vec<u8> = all_rotations
.into_iter()
.map(|mut v| {
v.pop()
.expect("presence of outer `Vec` means inner `Vec` was not empty")
})
.collect();
BwtEncoded {
data,
origin_pointer,
}
}
#[derive(Debug, thiserror::Error)]
pub enum DecodeError {
#[error("origin pointer out of range")]
InvalidOriginPointer,
}
pub(super) fn decode(
BwtEncoded {
data,
origin_pointer,
}: &BwtEncoded,
) -> Result<Vec<u8>, DecodeError> {
if data.is_empty() {
return Ok(Vec::new());
}
let origin_pointer: usize = origin_pointer.try_into().unwrap();
if origin_pointer > data.len() - 1 {
return Err(DecodeError::InvalidOriginPointer);
}
let mut frequency_histogram: [u64; 256] = [0; _];
let mut n = 0;
for value in data.iter() {
frequency_histogram[*value as usize] += 1;
}
for value in frequency_histogram.iter_mut() {
let copy = *value;
*value = n;
n += copy;
}
let mut permutation = vec![0; data.len()];
for (index, value) in data.iter().enumerate() {
permutation[frequency_histogram[*value as usize] as usize] = index;
frequency_histogram[*value as usize] += 1;
}
let mut input_index = permutation[origin_pointer];
let mut output = vec![0; data.len()];
for output in output.iter_mut() {
*output = data[input_index];
input_index = permutation[input_index];
}
Ok(output)
}
fn all_rotations(data: &[u8]) -> Vec<Vec<u8>> {
let mut deque: VecDeque<u8> = data.iter().copied().collect();
let mut output = Vec::with_capacity(deque.len());
for _ in 0..deque.len() {
output.push(deque.iter().copied().collect());
deque.rotate_left(1);
}
output
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn roundtrip() {
let input = b"adlfjasldjfaslkfdsjaklsd";
let result = decode(&encode(input)).unwrap();
assert_eq!(input.as_slice(), result);
}
mod decode {
use super::*;
#[test]
fn banana() {
let input = BwtEncoded {
data: b"BNN^AAsA".to_vec(),
origin_pointer: 6.into(),
};
let decoded = decode(&input).unwrap();
assert_eq!(decoded, b"^BANANAs");
}
#[test]
fn small() {
let input = BwtEncoded {
data: b"dabc".to_vec(),
origin_pointer: 2.into(),
};
let decoded = decode(&input).unwrap();
assert_eq!(decoded, b"cdab");
}
#[test]
fn empty() {
let decoded = decode(&BwtEncoded::default()).unwrap();
assert_eq!(decoded, &[]);
}
}
mod encode {
use super::*;
#[test]
fn small() {
let input = b"cdab";
let encoded = encode(input);
assert_eq!(encoded.data, b"dabc");
let origin_pointer: usize = encoded.origin_pointer.try_into().unwrap();
assert_eq!(origin_pointer, 2);
}
#[test]
fn empty() {
let encoded = encode(&[]);
assert_eq!(encoded.data, BwtEncoded::default().data);
}
}
mod all_rotations {
use super::*;
#[test]
fn small() {
let input = b"abcd";
let rotations = all_rotations(input);
assert!(rotations.iter().any(|v| v == b"abcd"));
assert!(rotations.iter().any(|v| v == b"bcda"));
assert!(rotations.iter().any(|v| v == b"cdab"));
assert!(rotations.iter().any(|v| v == b"dabc"));
}
#[test]
fn empty() {
let rotations = all_rotations(&[]);
assert!(rotations.is_empty());
}
}
}