mod code;
mod encoding;
mod model;
mod tokenization_scheme;
mod tokens;
mod util;
pub use crate::encoding::EncodingScheme;
pub use crate::tokenization_scheme::TokenizationScheme;
use anyhow::Result;
use std::path::Path;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// Top-level operation selector: what `run` should do with the input file.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Command {
    /// Compress the input using the schemes carried in [`CompressArgs`].
    Compress(CompressArgs),
    /// Decompress a previously compressed input.
    Decompress(DecompressArgs),
}
/// Options controlling how compression is performed.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CompressArgs {
    /// How the input stream is split into tokens (bytes, graphemes, or words).
    pub tokenization_scheme: TokenizationScheme,
    /// How the token model is turned into output codes.
    pub encoding_scheme: EncodingScheme,
}
/// Options for decompression. Currently empty: the tokenization scheme is
/// read back from the compressed file's own header, so no choices are needed.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DecompressArgs {}
/// A fully-specified invocation: the operation to run plus the files it
/// reads and writes. Borrows its paths, so it is cheap to construct and copy.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq)]
pub struct Args<'a> {
    /// The operation to perform, with its per-command options.
    pub command: Command,
    /// Path of the file to read.
    pub input_file: &'a Path,
    /// Path of the file to create and write.
    pub output_file: &'a Path,
}
pub fn run(args: Args) -> Result<()> {
match args.command {
Command::Compress(command_args) => internal::compress(
args.input_file,
args.output_file,
command_args.encoding_scheme,
command_args.tokenization_scheme,
),
Command::Decompress(_) => internal::decompress(args.input_file, args.output_file),
}
}
/// Implementation of the compress/decompress pipelines behind [`run`].
mod internal {
    use crate::code::Letter;
    use crate::encoding::{new_encoder, Encoding};
    use crate::model;
    use crate::tokenization_scheme::{pack_tokenization_scheme, unpack_tokenization_scheme};
    use crate::tokens::bytes::Byte;
    use crate::tokens::graphemes::Grapheme;
    use crate::tokens::words::Word;
    use crate::tokens::{Token, TokenPacker, Tokenizer};
    use crate::{EncodingScheme, TokenizationScheme};
    use anyhow::{anyhow, Result};
    use log::info;
    use std::collections::HashMap;
    use std::fs::File;
    use std::io::{BufReader, BufWriter};
    use std::path::Path;

    /// Compresses `input_file` into `output_file`.
    ///
    /// Writes a small header recording the tokenization scheme, then
    /// dispatches to the token type matching that scheme.
    ///
    /// # Errors
    ///
    /// Fails on I/O errors and on any error from the encoding pipeline.
    pub fn compress(
        input_file: &Path,
        output_file: &Path,
        encoding_scheme: EncodingScheme,
        tokenization_scheme: TokenizationScheme,
    ) -> Result<()> {
        info!("Compressing...");
        let mut w = BufWriter::new(File::create(output_file)?);
        // Persist the scheme first so decompress() can recover the token
        // type from the file itself.
        pack_tokenization_scheme(tokenization_scheme, &mut w)?;
        match tokenization_scheme {
            TokenizationScheme::Byte => {
                compress_with_token::<Byte, _>(input_file, w, encoding_scheme)
            }
            TokenizationScheme::Grapheme => {
                compress_with_token::<Grapheme, _>(input_file, w, encoding_scheme)
            }
            TokenizationScheme::Word => {
                compress_with_token::<Word, _>(input_file, w, encoding_scheme)
            }
        }
    }

    /// Decompresses `input_file` into `output_file`.
    ///
    /// # Errors
    ///
    /// Fails on I/O errors, on an unreadable header, and on any error from
    /// the decoding pipeline.
    pub fn decompress(input_file: &Path, output_file: &Path) -> Result<()> {
        info!("Decompressing...");
        let w = BufWriter::new(File::create(output_file)?);
        let mut r = BufReader::new(File::open(input_file)?);
        // The header written by compress() selects the token type.
        match unpack_tokenization_scheme(&mut r)? {
            TokenizationScheme::Byte => decompress_with_token::<Byte, _, _>(r, w),
            TokenizationScheme::Grapheme => decompress_with_token::<Grapheme, _, _>(r, w),
            TokenizationScheme::Word => decompress_with_token::<Word, _, _>(r, w),
        }
    }

    /// Builds an encoding from a first tokenization pass over the input,
    /// then re-reads the input and writes the encoded form to `w`.
    fn compress_with_token<T: Token, W: std::io::Write>(
        input_file: &Path,
        mut w: W,
        encoding_scheme: EncodingScheme,
    ) -> Result<()> {
        // First pass: tokenize the input to build the frequency model.
        // Was `.unwrap()`; propagate the error since we return Result.
        let r = BufReader::new(File::open(input_file)?);
        let tokens = T::Tokenizer::tokenize(r)?.map(|r| r.unwrap());
        // Was `&&encoding_scheme` — a needless double reference that only
        // compiled via deref coercion.
        let encoding = new_encoder(&encoding_scheme, model::from(tokens))?;
        // Second pass: re-read the input and encode it with the chosen map.
        // NOTE(review): per-item `.unwrap()` panics on a malformed token
        // mid-stream because downstream adapters expect plain values — TODO
        // confirm whether these should surface as Err instead.
        let r = BufReader::new(File::open(input_file)?);
        let tokens = T::Tokenizer::tokenize(r)?.map(|r| r.unwrap());
        let code_text = encode(encoding.map(), tokens).map(|r| r.unwrap());
        encoding.pack(&mut w)?;
        crate::code::pack(code_text, &mut w)?;
        // BufWriter's Drop flushes but silently swallows errors; flush
        // explicitly so write failures are reported.
        w.flush()?;
        Ok(())
    }

    /// Reads the encoding table and coded text from `r`, decodes the text,
    /// and packs the recovered tokens into `w`.
    fn decompress_with_token<T: Token, R: std::io::Read, W: std::io::Write>(
        mut r: R,
        mut w: W,
    ) -> Result<()> {
        // Was `.unwrap()`; propagate the error since we return Result.
        let encoding: Encoding<T> = Encoding::unpack(&mut r)?;
        let map = encoding.reverse_map();
        // NOTE(review): per-item `.unwrap()` panics on corrupt input — TODO
        // confirm whether these should surface as Err instead.
        let coded_text = crate::code::parse(&encoding.alphabet(), r)?.map(|r| r.unwrap());
        let decoded_text = decode(&map, coded_text).map(|r| r.unwrap());
        T::Packer::pack(decoded_text, &mut w)?;
        // Surface flush errors instead of relying on Drop.
        w.flush()?;
        Ok(())
    }

    /// Lazily maps each token to its `Letter`, yielding `Err` for tokens
    /// absent from the encoding map.
    fn encode<'a, T, TS>(
        encoding: &'a HashMap<T, Letter>,
        input: TS,
    ) -> impl Iterator<Item = Result<&'a Letter>>
    where
        T: Token,
        TS: std::iter::Iterator<Item = T>,
    {
        input.map(move |t| {
            encoding
                .get(&t)
                .ok_or_else(|| anyhow!("Unknown token {}", t.to_string()))
        })
    }

    /// Lazily maps each `Letter` back to an owned token, yielding `Err` for
    /// letters absent from the reverse map.
    fn decode<'a, T, CS: 'a>(
        encoding: &'a HashMap<&'a Letter, &'a T>,
        input: CS,
    ) -> impl Iterator<Item = Result<T>> + 'a
    where
        T: Token,
        CS: std::iter::Iterator<Item = &'a Letter>,
    {
        input.map(move |l| {
            encoding
                .get(l)
                .map(|t| (*t).clone())
                .ok_or_else(|| anyhow!("no encoding for letter {}", l))
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time checks: these helpers only build if the bound holds, so
    // the tests fail to compile (rather than at runtime) if `Args` ever
    // stops being thread-safe.
    fn require_send<T: Send>() {}
    fn require_sync<T: Sync>() {}

    #[test]
    fn test_send() {
        require_send::<Args>();
    }

    #[test]
    fn test_sync() {
        require_sync::<Args>();
    }
}