kaiser/
stats.rs

1use crate::{Buffer, Char, CharStream};
2use itertools::Itertools;
3use lazy_static::lazy_static;
4
5lazy_static! {
6    static ref QUADGRAMS: &'static [f32] = {
7        let buf = include_bytes!("../data/quadgram_scores.raw");
8
9        unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const f32, 26 * 26 * 26 * 26) }
10    };
11}
12
13pub fn letter_frequencies<'a, T: CharStream<'a>>(buf: &'a T) -> [u32; Char::MAX as usize] {
14    let mut out = [0; Char::MAX as usize];
15
16    for &b in buf.iter() {
17        out[u8::from(b) as usize] += 1;
18    }
19
20    out
21}
22
23pub fn chi_squared<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
24    let english_freqs = [
25        0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, 0.06094, 0.06966, 0.00153,
26        0.00772, 0.04025, 0.02406, 0.06749, 0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056,
27        0.02758, 0.00978, 0.02360, 0.00150, 0.01974, 0.00074,
28    ];
29
30    let freqs = letter_frequencies(buf);
31    let len_f = buf.len() as f64;
32
33    freqs
34        .iter()
35        .enumerate()
36        .map(|(i, &f)| {
37            let e_count = len_f * english_freqs[i];
38            let diff = f as f64 - e_count;
39            (diff * diff) / e_count
40        })
41        .sum()
42}
43
44pub fn index_of_coincidence<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
45    let freqs = letter_frequencies(buf);
46
47    let total = freqs
48        .iter()
49        .filter(|&f| *f > 0)
50        .map(|&f| f * (f - 1))
51        .sum::<u32>() as f64;
52
53    let len = buf.len();
54    let denominator = (len * (len - 1)) as f64 / Char::MAX as f64;
55
56    total / denominator
57}
58
59pub fn quadgram_score(buf: &Buffer) -> f64 {
60    let mut score = 0.0_f64;
61
62    for (c1, c2, c3, c4) in buf.into_iter().tuple_windows() {
63        let hash = (u8::from(*c1) as usize * 26_usize.pow(3))
64            + (u8::from(*c2) as usize * 26_usize.pow(2))
65            + (u8::from(*c3) as usize * 26_usize.pow(1))
66            + (u8::from(*c4) as usize * 26_usize.pow(0));
67        score += QUADGRAMS[hash] as f64; // TODO: Remove bounds checks
68    }
69
70    score / (buf.len() as f64) // Normalise based on text length
71}
72
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample sentence shared by all tests in this module.
    const SAMPLE: &str = "Rust is the best programming language";

    /// Pins the heuristic values computed for the sample sentence.
    #[test]
    fn test_heuristics() {
        let sample = Buffer::from(SAMPLE);

        let ioc = index_of_coincidence(&sample);
        let chi = chi_squared(&sample);

        assert_eq!(1.310483870967742, ioc);
        assert_eq!(29.514280393617323, chi);
        // TODO: Quadgram score check?
    }

    /// Checks the per-letter counts for the sample sentence.
    #[test]
    fn test_freqs() {
        let sample = Buffer::from(SAMPLE);

        let expected: [u32; 26] = [
            3, 1, 0, 0, 3, 0, 4, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 3, 3, 3, 2, 0, 0, 0, 0, 0,
        ];
        let actual = letter_frequencies(&sample);

        assert_eq!(expected, actual);
    }
}