fsst_rust/core/
codec.rs

1use crate::core::{bulk_load_u32, CODE_ESCAPE, U64_SIZE};
2use crate::core::symbol::Symbol;
3use crate::core::symbol_table::SymbolTable;
4use crate::util::endian::Endian;
5
6pub struct Encoder<'a> {
7    symbol_table: &'a Box<dyn SymbolTable>,
8}
9
10impl Encoder<'_> {
11    pub fn from_table(table: &Box<dyn SymbolTable>) -> Encoder {
12        Encoder { symbol_table: table }
13    }
14
15    pub fn encode_str(&self, str: &str) -> Vec<u8> {
16        let mut buf = vec![0; str.len() << 1];
17        let (mut pos_in, mut pos_out) = (0, 0);
18        while pos_in < str.len() {
19            let target = Symbol::from_str(&str[pos_in..]);
20            buf[pos_out + 1] = target.first() as u8;
21            let (code, s_len, out_len) = self.symbol_table.encode_for(&target);
22            buf[pos_out] = code;
23            pos_out += out_len;
24            pos_in += s_len;
25        }
26        buf.truncate(pos_out);
27        buf
28    }
29
30    pub fn encode(&self, str: &str, include_table: bool) -> Vec<u8> {
31        let mut buf = self.encode_str(str);
32        if include_table {
33            let mut table_buf = self.symbol_table.dump();
34            let mut buf_with_tab = Vec::with_capacity(table_buf.len() + table_buf.len());
35            buf_with_tab.append(&mut table_buf);
36            buf_with_tab.append(&mut buf);
37            buf_with_tab
38        } else {
39            buf
40        }
41    }
42}
43
44pub struct Decoder {
45    symbols: [u64; CODE_ESCAPE as usize],
46    lens: [u8; CODE_ESCAPE as usize],
47}
48
49impl Decoder {
50    pub fn from_table(table: &Box<dyn SymbolTable>) -> Decoder {
51        let mut symbols = [0u64; CODE_ESCAPE as usize];
52        let mut lens = [0u8; CODE_ESCAPE as usize];
53        for i in 0..table.len() {
54            let s = table.get_symbol(i as u16);
55            symbols[i] = s.as_u64();
56            lens[i] = s.length() as u8;
57        }
58        Decoder { symbols, lens }
59    }
60
61    pub fn from_table_bytes(buf: &Vec<u8>) -> (usize, Decoder) {
62        let mut symbols = [0u64; CODE_ESCAPE as usize];
63        let mut lens = [0u8; CODE_ESCAPE as usize];
64        let encode_endian = Endian::from_u8(*buf.get(0).unwrap());
65        let len_histo = &buf[1..9];
66        let (mut pos, mut code) = (9, 0usize);
67        for len in 1..=Symbol::MAX_LEN {
68            for _ in 0..len_histo[len - 1] {
69                let mut num = 0u64;
70                if Endian::get_native_endian() != encode_endian {
71                    num |= *buf.get(pos).unwrap() as u64;
72                    for i in 1..len {
73                        num <<= 8;
74                        num |= *buf.get(pos + i).unwrap() as u64;
75                    }
76                } else {
77                    num |= *buf.get(pos + len - 1).unwrap() as u64;
78                    for i in (0..len - 1).rev() {
79                        num <<= 8;
80                        num |= *buf.get(pos + i).unwrap() as u64;
81                    }
82                }
83                symbols[code] = num;
84                lens[code] = len as u8;
85                code += 1;
86                pos += len;
87            }
88        }
89        (pos, Decoder { symbols, lens })
90    }
91
92    /// safe decode method
93    pub fn decode_with_tab(table: &Box<dyn SymbolTable>, buf: &Vec<u8>) -> String {
94        let mut str = String::with_capacity(buf.len() * 4);
95        let mut pos = 0;
96        while pos < buf.len() {
97            let b = buf.get(pos).unwrap();
98            pos += 1;
99            if *b == 255 {
100                str.push(*buf.get(pos).unwrap() as char);
101                pos += 1;
102            } else {
103                str.push_str(&table.get_symbol(*b as u16).to_string());
104            }
105        }
106        str
107    }
108
109    /// decode method that uses the unsafe method
110    pub fn decode(&self, str_buf: &Vec<u8>) -> String {
111        let (mut pos_in, mut pos_out) = (0, 0);
112        let mut decode_buf = vec![0u8; str_buf.len() * Symbol::MAX_LEN];
113        unsafe {
114            let out = decode_buf.as_mut_ptr();
115            while pos_in + 4 < str_buf.len() {
116                let next_block = bulk_load_u32(&str_buf[pos_in..pos_in + 4]);
117                let escape_mask = (next_block & 0x80808080) & ((((!next_block) & 0x7F7F7F7F) + 0x7F7F7F7F) ^ 0x80808080);
118                if escape_mask == 0 {
119                    self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
120                    self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
121                    self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
122                    self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
123                } else {
124                    let mut first_escape_pos = escape_mask.trailing_zeros() >> 3;
125                    while first_escape_pos > 0 {
126                        self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
127                        first_escape_pos -= 1;
128                    }
129                    decode_buf[pos_out] = str_buf[pos_in + 1];
130                    pos_in += 2;
131                    pos_out += 1;
132                }
133            }
134            while pos_in < str_buf.len() {
135                if str_buf[pos_in] != CODE_ESCAPE {
136                    self.unaligned_store(&mut pos_in, &mut pos_out, &str_buf, out);
137                } else {
138                    decode_buf[pos_out] = str_buf[pos_in + 1];
139                    pos_in += 2;
140                    pos_out += 1;
141                }
142            }
143            decode_buf.truncate(pos_out);
144            String::from_utf8_unchecked(decode_buf)
145        }
146    }
147
148    #[inline(always)]
149    unsafe fn unaligned_store(&self, pos_in: &mut usize, pos_out: &mut usize, str_in: &Vec<u8>, out: *mut u8) {
150        let code = str_in[*pos_in] as usize;
151        std::ptr::copy_nonoverlapping(self.symbols[code].to_ne_bytes().as_ptr(), out.add(*pos_out), U64_SIZE);
152        *pos_in += 1;
153        *pos_out += self.lens[code] as usize;
154    }
155}
156
#[cfg(test)]
mod test {
    use crate::core::codec::{Decoder, Encoder};
    use crate::core::symbol_table::SymbolTableBuilder;

    /// Round trip: a decoder rebuilt from the dumped table must reproduce the
    /// text encoded with the original table.
    #[test]
    pub fn test_decode_with_dump_table() {
        let input = "paqvawflxucgajxfzxwooypirnzkahobfvxzhrerdwzkerwwolqfbafwslwhsvuitbtgkvnjrdr";
        let table = SymbolTableBuilder::build_from(input);
        let encoder = Encoder::from_table(&table);

        // Parsing the dump must consume it completely.
        let dumped = table.dump();
        let (consumed, decoder) = Decoder::from_table_bytes(&dumped);
        assert_eq!(dumped.len(), consumed);

        // Encode without the table prefix, then decode with the rebuilt decoder.
        let encoded = encoder.encode(input, false);
        let decoded = decoder.decode(&encoded);
        assert_eq!(input, decoded);
    }
}