fsst_rust/core/
mod.rs

1use std::cmp::{max, min};
2
3mod symbol;
4mod counter;
5pub mod symbol_table;
6pub mod codec;
7
/// Size of a `u64` in bytes; `bulk_load` uses it as the length of its scratch buffer.
const U64_SIZE: usize = size_of::<u64>();
/// 2^9 = 512. NOTE(review): presumably the size of the code space (9-bit codes) — confirm
/// against `symbol_table`.
const CODE_MAX: u16 = 1 << 9;
/// `CODE_MAX - 1` = 511, i.e. a mask with the low 9 bits set.
const CODE_MASK: u16 = CODE_MAX - 1;
/// Boundary used by `is_escape_code`: every code below this value counts as an escape code.
const CODE_BASE: u16 = 256;
/// NOTE(review): not referenced in this part of the file; presumably the marker byte written
/// in front of an escaped literal — confirm against `codec`.
const CODE_ESCAPE: u8 = 255;
/// NOTE(review): not referenced in this part of the file; presumably the bit offset at which a
/// symbol length is packed next to its code — confirm against `symbol`.
const LEN_BITS: u16 = 12;
/// Right shift `fsst_hash` applies to the multiplied value before xor-ing it back in.
const HASH_SHIFT: usize = 15;
/// Multiplier used by `fsst_hash`.
const HASH_PRIME: usize = 2971215073;
/// Sample budget in bytes (2^16): `take_sample` stops once it has collected this much, and
/// `take_sample_from_bytes` never sizes its sample above it.
const SAMPLE_TARGET: usize = 1 << 16;
/// Inputs shorter than this many bytes (2^14) are returned whole by `take_sample_from_bytes`.
const SMALL_STR_THRESHOLD: usize = 1 << 14;

/// A byte array exactly as large as a `u64` (`U64_SIZE` bytes).
type U64Bytes = [u8; U64_SIZE];
20
21pub fn is_escape_code(code: u16) -> bool {
22    code < CODE_BASE
23}
24
25fn fsst_hash(v: usize) -> usize {
26    let prime = v * HASH_PRIME;
27    prime ^ (prime >> HASH_SHIFT)
28}
29
30fn bulk_load(s: &[u8]) -> u64 {
31    let mut v = [0u8; U64_SIZE];
32    v[..s.len()].copy_from_slice(s);
33    unsafe {
34        std::mem::transmute::<U64Bytes, u64>(v)
35    }
36}
37
/// Packs up to four bytes of `s` into a `u32` using the platform's native byte order.
///
/// The bytes of `s` occupy the start of a zero-filled 4-byte buffer, so a shorter slice is
/// padded with zero bytes at the end before conversion.
///
/// # Panics
///
/// Panics if `s` is longer than four bytes.
fn bulk_load_u32(s: &[u8]) -> u32 {
    let mut buf = [0u8; 4];
    buf[..s.len()].copy_from_slice(s);
    // Safe equivalent of transmuting the byte array: same bits, no `unsafe` needed.
    u32::from_ne_bytes(buf)
}
45
46pub fn take_sample(sample_space: &Vec<String>) -> Vec<&String> {
47    let total_size = sample_space.iter().map(|s| s.len()).sum::<usize>();
48    let (mut sample_size, mut sample_prob, mut sample_target) = (0usize, 256usize, SAMPLE_TARGET);
49    if total_size > sample_target {
50        sample_prob = max(4, 256 * sample_target / total_size);
51    } else {
52        sample_target = total_size;
53    }
54    let mut sample = Vec::with_capacity(sample_space.len() * (sample_target / total_size));
55
56    let sample_rand = 1;
57    while sample_size < sample_target {
58        for str in sample_space {
59            if (sample_rand & 255) < sample_prob {
60                sample.push(str);
61                sample_size += str.len();
62                if sample_size >= sample_target {
63                    break;
64                }
65            }
66        }
67        sample_prob <<= 2;
68    }
69
70    sample
71}
72
73pub fn take_sample_from_bytes(sample_space: &[u8]) -> Vec<u8> {
74    if sample_space.len() < SMALL_STR_THRESHOLD {
75        return Vec::from(sample_space);
76    }
77
78    let sample_size = min(sample_space.len() >> 3, SAMPLE_TARGET);
79    let sample_seg_size = sample_size / 10;
80    let mut sample = vec![0; sample_size];
81    let gap = (sample_space.len() - sample.len()) / 10;
82    let (mut pos_in, mut pos_out) = (0, 0);
83    loop {
84        if pos_in + sample_seg_size >= sample_space.len()
85            || pos_out + sample_seg_size >= sample_size {
86            break;
87        }
88        sample[pos_out..pos_out + sample_seg_size]
89            .copy_from_slice(&sample_space[pos_in..pos_in + sample_seg_size]);
90        pos_out += sample_seg_size;
91        pos_in += sample_seg_size + gap;
92    }
93    sample.truncate(pos_out);
94    sample
95}