1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use std::cmp::{max, min};

mod symbol;
mod counter;
pub mod symbol_table;
pub mod codec;

const U64_SIZE: usize = size_of::<u64>();
const CODE_MAX: u16 = 1 << 9;
const CODE_MASK: u16 = CODE_MAX - 1;
const CODE_BASE: u16 = 256;
const CODE_ESCAPE: u8 = 255;
const LEN_BITS: u16 = 12;
const HASH_SHIFT: usize = 15;
const HASH_PRIME: usize = 2971215073;
const SAMPLE_TARGET: usize = 1 << 16;
const SMALL_STR_THRESHOLD: usize = 1 << 14;

type U64Bytes = [u8; U64_SIZE];

pub fn is_escape_code(code: u16) -> bool {
    code < CODE_BASE
}

fn fsst_hash(v: usize) -> usize {
    let prime = v * HASH_PRIME;
    prime ^ (prime >> HASH_SHIFT)
}

fn bulk_load(s: &[u8]) -> u64 {
    let mut v = [0u8; U64_SIZE];
    v[..s.len()].copy_from_slice(s);
    unsafe {
        std::mem::transmute::<U64Bytes, u64>(v)
    }
}

fn bulk_load_u32(s: &[u8]) -> u32 {
    let mut v = [0u8; 4];
    v[..s.len()].copy_from_slice(s);
    unsafe {
        std::mem::transmute::<[u8; 4], u32>(v)
    }
}

pub fn take_sample(sample_space: &Vec<String>) -> Vec<&String> {
    let total_size = sample_space.iter().map(|s| s.len()).sum::<usize>();
    let (mut sample_size, mut sample_prob, mut sample_target) = (0usize, 256usize, SAMPLE_TARGET);
    if total_size > sample_target {
        sample_prob = max(4, 256 * sample_target / total_size);
    } else {
        sample_target = total_size;
    }
    let mut sample = Vec::with_capacity(sample_space.len() * (sample_target / total_size));

    let sample_rand = 1;
    while sample_size < sample_target {
        for str in sample_space {
            if (sample_rand & 255) < sample_prob {
                sample.push(str);
                sample_size += str.len();
                if sample_size >= sample_target {
                    break;
                }
            }
        }
        sample_prob <<= 2;
    }

    sample
}

pub fn take_sample_from_bytes(sample_space: &[u8]) -> Vec<u8> {
    if sample_space.len() < SMALL_STR_THRESHOLD {
        return Vec::from(sample_space);
    }

    let sample_size = min(sample_space.len() >> 3, SAMPLE_TARGET);
    let sample_seg_size = sample_size / 10;
    let mut sample = vec![0; sample_size];
    let gap = (sample_space.len() - sample.len()) / 10;
    let (mut pos_in, mut pos_out) = (0, 0);
    loop {
        if pos_in + sample_seg_size >= sample_space.len()
            || pos_out + sample_seg_size >= sample_size {
            break;
        }
        sample[pos_out..pos_out + sample_seg_size]
            .copy_from_slice(&sample_space[pos_in..pos_in + sample_seg_size]);
        pos_out += sample_seg_size;
        pos_in += sample_seg_size + gap;
    }
    sample.truncate(pos_out);
    sample
}