fsst-rust 0.1.1

FSST-Rust is a pure rust implementation of the Fast Static Symbol Table
Documentation
use std::cmp::{min, Ordering};
use std::fmt::{Display, Formatter};
use std::hash::{Hash, Hasher};
use std::ops::Add;

use crate::core::{CODE_MASK, fsst_hash, U64_SIZE, U64Bytes};

#[derive(Clone, Copy)]
pub struct Symbol {
    num: u64,
    icl: u64,
}

impl Symbol {
    pub(crate) const MAX_LEN: usize = U64_SIZE;
    const FREE_ICL: u64 = ((15 << 28) | ((CODE_MASK as u32) << 16)) as u64;

    pub fn from_str(str: &str) -> Symbol {
        Self::from_str_bytes(str.as_bytes())
    }

    pub fn from_str_bytes(bytes: &[u8]) -> Symbol {
        let len = min(bytes.len(), Self::MAX_LEN);
        let mut str_bytes = [0u8; Self::MAX_LEN];
        str_bytes[..len].copy_from_slice(&bytes[..len]);
        Symbol {
            num: Self::bytes_to_u64(str_bytes),
            icl: Self::compute_icl(CODE_MASK as u32, len as u32),
        }
    }

    pub fn from_byte_code(b: u8, code: u16) -> Symbol {
        Symbol {
            num: b as u64,
            icl: Self::compute_icl(code as u32, 1u32),
        }
    }

    pub fn from_str_num(num: u64, len: usize) -> Symbol {
        Symbol {
            num,
            icl: Self::compute_icl(CODE_MASK as u32, len as u32),
        }
    }

    pub fn free() -> Symbol {
        Symbol {
            num: 0,
            icl: Self::FREE_ICL,
        }
    }

    pub fn length(&self) -> usize {
        (self.icl >> 28) as usize
    }

    pub fn code(&self) -> u16 {
        ((self.icl >> 16) & CODE_MASK as u64) as u16
    }

    pub fn compute_icl(code: u32, len: u32) -> u64 {
        ((len << 28) | (code << 16) | ((8 - len) * 8)) as u64
    }

    pub fn set_code_len(&mut self, code: u16, len: usize) {
        self.icl = Self::compute_icl(code as u32, len as u32);
    }

    pub fn update_to(&mut self, target: &Symbol) {
        self.num = target.num;
        self.icl = target.icl;
    }

    pub fn reset(&mut self) {
        self.num = 0;
        self.icl = Self::FREE_ICL;
    }

    pub fn taken(&self) -> bool {
        self.icl < Self::FREE_ICL
    }

    pub fn prefix_match(&self, rhs: &Symbol) -> bool {
        self.icl >= rhs.icl && rhs.num == (self.num & (0xffffffffffffffff >> (rhs.icl as u8)))
    }

    pub fn word_match(&self, word: u64) -> bool {
        self.taken() && self.num == (word & (0xffffffffffffffff >> (self.icl as u8)))
    }

    pub fn as_u64(&self) -> u64 {
        self.num
    }

    pub fn first(&self) -> usize {
        (self.num & 0xff) as usize
    }

    pub fn first2(&self) -> usize {
        (self.num & 0xffff) as usize
    }

    pub fn hash(&self) -> usize {
        fsst_hash((self.num & 0xffffff) as usize)
    }

    fn bytes_to_u64(str_bytes: U64Bytes) -> u64 {
        unsafe {
            std::mem::transmute::<U64Bytes, u64>(str_bytes)
        }
    }

    fn u64_to_bytes(value: u64) -> U64Bytes {
        unsafe {
            std::mem::transmute::<u64, U64Bytes>(value)
        }
    }
}

impl Display for Symbol {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let s = Self::u64_to_bytes(self.num.clone());
        write!(f, "{}", String::from_utf8_lossy(&s[0..self.length()]))
    }
}

impl Hash for Symbol {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.num.hash(state);
        self.length().hash(state);
    }
}

impl Add for Symbol {
    type Output = Symbol;

    fn add(self, rhs: Self) -> Self::Output {
        let this_len = self.length();
        let concat_len = min(this_len + rhs.length(), Symbol::MAX_LEN);
        Symbol {
            num: (&rhs.num << (8 * this_len)) | &self.num,
            icl: Self::compute_icl(CODE_MASK as u32, concat_len as u32),
        }
    }
}

impl PartialEq<Self> for Symbol {
    fn eq(&self, other: &Self) -> bool {
        self.num == other.num && self.length() == other.length()
    }
}

impl Eq for Symbol {}

impl PartialOrd<Self> for Symbol {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.num.partial_cmp(&other.num)
    }
}

impl Ord for Symbol {
    fn cmp(&self, other: &Self) -> Ordering {
        self.num.cmp(&other.num)
    }
}

#[cfg(test)]
mod test {
    use crate::core::symbol::Symbol;

    #[test]
    pub fn test_symbol_add() {
        let s1 = Symbol::from_str("1234");
        assert_eq!("1234", s1.clone().to_string());
        let s2 = Symbol::from_str("567");
        assert_eq!("1234567", (s1.clone() + s2).to_string());
        let s3 = Symbol::from_str("56789");
        assert_ne!("123456789", (s1.clone() + s3.clone()).to_string());
        assert_eq!("12345678", (s1 + s3).to_string());
    }
}