1use std::cmp::{max, min};
2
3mod symbol;
4mod counter;
5pub mod symbol_table;
6pub mod codec;
7
/// Number of bytes in a `u64`.
///
/// `size_of` is spelled with its full path so the file also builds on toolchains whose prelude
/// does not export it.
const U64_SIZE: usize = std::mem::size_of::<u64>();
/// `1 << 9` (512). NOTE(review): presumably the exclusive upper bound of the code space —
/// confirm against the symbol table module.
const CODE_MAX: u16 = 1 << 9;
/// Bit mask selecting every value below `CODE_MAX`.
const CODE_MASK: u16 = CODE_MAX - 1;
/// Codes strictly below this value are reported as escape codes by `is_escape_code`.
const CODE_BASE: u16 = 256;
/// NOTE(review): not used in this part of the file; presumably the byte that introduces an
/// escaped literal in encoded output — confirm in the codec module.
const CODE_ESCAPE: u8 = 255;
/// NOTE(review): not used in this part of the file; presumably the bit position of a length
/// field packed next to a code — confirm against the symbol module.
const LEN_BITS: u16 = 12;
/// Right shift applied by `fsst_hash` when folding the high bits of the product back in.
const HASH_SHIFT: usize = 15;
/// Multiplier used by `fsst_hash`.
const HASH_PRIME: usize = 2971215073;
/// Upper bound, in bytes, on the amount of data the sampling functions aim to collect.
const SAMPLE_TARGET: usize = 1 << 16;
/// Byte inputs shorter than this are returned whole by `take_sample_from_bytes`.
const SMALL_STR_THRESHOLD: usize = 1 << 14;

/// Byte array with exactly the size of a `u64`.
type U64Bytes = [u8; U64_SIZE];
20
21pub fn is_escape_code(code: u16) -> bool {
22 code < CODE_BASE
23}
24
25fn fsst_hash(v: usize) -> usize {
26 let prime = v * HASH_PRIME;
27 prime ^ (prime >> HASH_SHIFT)
28}
29
30fn bulk_load(s: &[u8]) -> u64 {
31 let mut v = [0u8; U64_SIZE];
32 v[..s.len()].copy_from_slice(s);
33 unsafe {
34 std::mem::transmute::<U64Bytes, u64>(v)
35 }
36}
37
/// Packs up to four bytes of `s` into a `u32`.
///
/// The bytes are copied to the front of a zero-filled buffer, which is then read in the
/// platform's native byte order, so missing trailing bytes count as zero.
///
/// # Panics
///
/// Panics if `s` is longer than four bytes.
fn bulk_load_u32(s: &[u8]) -> u32 {
    let mut buf = [0u8; 4];
    buf[..s.len()].copy_from_slice(s);
    // Safe equivalent of transmuting the byte array: same native-endian interpretation.
    u32::from_ne_bytes(buf)
}
45
46pub fn take_sample(sample_space: &Vec<String>) -> Vec<&String> {
47 let total_size = sample_space.iter().map(|s| s.len()).sum::<usize>();
48 let (mut sample_size, mut sample_prob, mut sample_target) = (0usize, 256usize, SAMPLE_TARGET);
49 if total_size > sample_target {
50 sample_prob = max(4, 256 * sample_target / total_size);
51 } else {
52 sample_target = total_size;
53 }
54 let mut sample = Vec::with_capacity(sample_space.len() * (sample_target / total_size));
55
56 let sample_rand = 1;
57 while sample_size < sample_target {
58 for str in sample_space {
59 if (sample_rand & 255) < sample_prob {
60 sample.push(str);
61 sample_size += str.len();
62 if sample_size >= sample_target {
63 break;
64 }
65 }
66 }
67 sample_prob <<= 2;
68 }
69
70 sample
71}
72
73pub fn take_sample_from_bytes(sample_space: &[u8]) -> Vec<u8> {
74 if sample_space.len() < SMALL_STR_THRESHOLD {
75 return Vec::from(sample_space);
76 }
77
78 let sample_size = min(sample_space.len() >> 3, SAMPLE_TARGET);
79 let sample_seg_size = sample_size / 10;
80 let mut sample = vec![0; sample_size];
81 let gap = (sample_space.len() - sample.len()) / 10;
82 let (mut pos_in, mut pos_out) = (0, 0);
83 loop {
84 if pos_in + sample_seg_size >= sample_space.len()
85 || pos_out + sample_seg_size >= sample_size {
86 break;
87 }
88 sample[pos_out..pos_out + sample_seg_size]
89 .copy_from_slice(&sample_space[pos_in..pos_in + sample_seg_size]);
90 pos_out += sample_seg_size;
91 pos_in += sample_seg_size + gap;
92 }
93 sample.truncate(pos_out);
94 sample
95}