fsst_rust/
lib.rs

1use std::fs::File;
2use std::io;
3use std::io::BufRead;
4use std::path::Path;
5
6use crate::core::codec::{Decoder, Encoder};
7use crate::core::symbol_table::{SymbolTable, SymbolTableBuilder};
8use crate::core::take_sample;
9
10pub mod core;
11mod util;
12
13/// build symbol table by sampling the given strings
14/// symbol table can be used to build `Encoder` and `Decoder`
15/// # Example
16///
17/// ```
18/// use fsst_rust::build_table_by_sampling;
19/// use fsst_rust::core::codec::{Decoder, Encoder};
20/// let strings = vec!["abcd".to_string(), "efgh".to_string()];
21/// let symbol_table = build_table_by_sampling(&strings);
22/// let encoder = Encoder::from_table(&symbol_table);
23/// let str = "abc";
24/// let encoding = encoder.encode(&str, false);
25/// let decoder = Decoder::from_table(&symbol_table);
26/// let decode_str = decoder.decode(&encoding);
27/// assert_eq!(str, decode_str);
28/// ```
29pub fn build_table_by_sampling(strings: &Vec<String>) -> Box<dyn SymbolTable> {
30    let sample = take_sample(&strings);
31    SymbolTableBuilder::build_from_samples(&sample)
32}
33
34/// encode all given strings
35/// it will sample the given strings and build a symbol table which will be returned in a tuple
36pub fn encode_all_strings(strings: &Vec<String>) -> (Box<dyn SymbolTable>, Vec<Vec<u8>>) {
37    let symbol_table = build_table_by_sampling(strings);
38    let encoder = Encoder::from_table(&symbol_table);
39    let mut encodings = Vec::with_capacity(strings.len());
40    for str in strings {
41        encodings.push(encoder.encode_str(str));
42    }
43    (symbol_table, encodings)
44}
45
46/// encode a single string
47/// if including_table is true, it will encode the symbol table to bytes
48/// and add it the encoding bytes header, i.e., | symbol table bytes | string encoding bytes |
49/// # Example
50///
51/// ```
52/// use fsst_rust::core::codec::Decoder;
53/// use fsst_rust::encode_string;
54/// let str = "hello world".to_string();
55/// let (_, encoding) = encode_string(&str, true);
56/// let (table_end_pos, decoder) = Decoder::from_table_bytes(&encoding);
57/// let decode_str = decoder.decode(&encoding[table_end_pos..].to_vec());
58/// assert_eq!(str, decode_str);
59/// ```
60pub fn encode_string(str: &str, including_table: bool) -> (Box<dyn SymbolTable>, Vec<u8>) {
61    let symbol_table = SymbolTableBuilder::build_from(str);
62    let encoder = Encoder::from_table(&symbol_table);
63    let encoding = encoder.encode(str, including_table);
64    (symbol_table, encoding)
65}
66
67/// decode bytes to string according to the give symbol table
68pub fn decode_string(table: &Box<dyn SymbolTable>, encoding: &Vec<u8>) -> String {
69    Decoder::from_table(table).decode(encoding)
70}
71
72/// decode all string encodings by the given symbol table
73pub fn decode_all_strings(table: &Box<dyn SymbolTable>, encodings: &Vec<Vec<u8>>) -> Vec<String> {
74    let mut strings = Vec::with_capacity(encodings.len());
75    let decoder = Decoder::from_table(table);
76    for encoding in encodings {
77        strings.push(decoder.decode(encoding))
78    }
79    strings
80}
81
82pub fn encode_all_strings_from_file<P: AsRef<Path>>(filename: P) -> io::Result<(Box<dyn SymbolTable>, Vec<Vec<u8>>)> {
83    let strings = read_string_lines(filename)?;
84    Ok(encode_all_strings(&strings))
85}
86
/// Reads a text file and returns its lines as owned strings.
///
/// Lines are split by `BufRead::lines`, so the line terminators are not part
/// of the returned strings.
///
/// # Errors
///
/// Returns the underlying `io::Error` if the file cannot be opened or if any
/// line cannot be read (for example, when it is not valid UTF-8). Read
/// failures are reported to the caller instead of panicking.
pub fn read_string_lines<P>(filename: P) -> io::Result<Vec<String>>
where
    P: AsRef<Path>,
{
    let file = File::open(filename)?;
    // Collecting into `io::Result` stops at the first failing line and returns its error.
    io::BufReader::new(file).lines().collect()
}
98
#[cfg(test)]
mod test {
    use crate::{decode_all_strings, encode_all_strings, read_string_lines};

    /// Round-trip check: the first 1000 lines of the test data file are
    /// encoded and decoded, and every decoded string must equal its original.
    #[test]
    pub fn test_codec() {
        let group_test_data_path = "assets/test_data/c_name";
        let mut originals = read_string_lines(group_test_data_path).unwrap();
        originals.truncate(1000);
        let (table, encodings) = encode_all_strings(&originals);
        let decoded = decode_all_strings(&table, &encodings);
        // Index into `decoded` so a missing entry still fails the test.
        for (idx, original) in originals.iter().enumerate() {
            assert_eq!(*original, decoded[idx]);
        }
    }
}
115