compression_text_classification 0.2.0

text classification using a compression algorithm
Documentation
use crate::{compressed_size, TrainItem};
use std::cmp::{max, min};

/// A labelled training sample whose text is held as a [`CachedItem`].
///
/// Within this file it is built from a `TrainItem` through the `From` impl,
/// which keeps the class label and converts the text into a `CachedItem`.
#[derive(Debug, Clone)]
pub struct CachedTrainItem {
    /// Class label of the sample; exposed read-only through `class()`.
    class: String,
    /// The sample's text together with its cached compressed size.
    item: CachedItem,
}

impl CachedTrainItem {
    /// Returns the class label stored with this training item.
    pub fn class(&self) -> &str {
        self.class.as_str()
    }
}

/// Converts a raw `TrainItem` into its cached form.
///
/// The class label is moved over unchanged; the text goes through `Into`,
/// producing the `item` field.
impl From<TrainItem> for CachedTrainItem {
    fn from(value: TrainItem) -> Self {
        // Move the label out first, then convert the text, mirroring the
        // field order of the struct.
        let label = value.class;
        let cached = value.text.into();
        Self {
            class: label,
            item: cached,
        }
    }
}

/// A piece of text paired with a compressed size computed for it.
///
/// The `From<S: Into<String>>` impl in this file fills `compressed_size` by
/// calling `compressed_size` on the text once, at construction time, so that
/// `normalized_compression_distance` can read it without recompressing.
#[derive(Clone, Debug)]
pub struct CachedItem {
    /// The original, uncompressed text.
    text: String,
    /// Result of `compressed_size(&text)` as recorded when the item was built.
    compressed_size: usize,
}
impl<'a> From<&'a CachedTrainItem> for &'a CachedItem {
    fn from(value: &'a CachedTrainItem) -> Self {
        &value.item
    }
}
/// Builds a [`CachedItem`] from anything convertible into a `String`,
/// computing and storing the text's compressed size once up front.
impl<S> From<S> for CachedItem
where
    S: Into<String>,
{
    fn from(value: S) -> Self {
        let text: String = value.into();
        // Measure before moving `text` into the struct.
        let size = compressed_size(&text);
        Self {
            text,
            compressed_size: size,
        }
    }
}

/// Computes the normalized compression distance (NCD) between two cached items.
///
/// The result is `(C(xy) - min(C(x), C(y))) / max(C(x), C(y))`, where `C(x)`
/// and `C(y)` are the sizes cached in `x` and `y`, and `C(xy)` is
/// `compressed_size` applied to `x`'s text immediately followed by `y`'s text.
///
/// The arithmetic is carried out in `f64`. If the concatenation compresses to
/// a smaller size than the smaller of the two inputs, the result is a negative
/// number instead of an unsigned-integer underflow (which would panic in debug
/// builds and wrap to a huge value in release builds).
///
/// When both cached sizes are zero the denominator is `0.0`, so the result is
/// NaN or infinite; callers that sort by distance should account for that.
pub fn normalized_compression_distance(x: &CachedItem, y: &CachedItem) -> f64 {
    let c_x = x.compressed_size;
    let c_y = y.compressed_size;
    // Build the concatenation in one allocation instead of clone-then-append.
    let joined = [x.text.as_str(), y.text.as_str()].concat();
    let c_xy = compressed_size(joined.as_str());
    // Convert before subtracting so the difference cannot underflow `usize`.
    (c_xy as f64 - min(c_x, c_y) as f64) / max(c_x, c_y) as f64
}