1use crate::{Buffer, Char, CharStream};
2use itertools::Itertools;
3use lazy_static::lazy_static;
4
5lazy_static! {
6 static ref QUADGRAMS: &'static [f32] = {
7 let buf = include_bytes!("../data/quadgram_scores.raw");
8
9 unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const f32, 26 * 26 * 26 * 26) }
10 };
11}
12
13pub fn letter_frequencies<'a, T: CharStream<'a>>(buf: &'a T) -> [u32; Char::MAX as usize] {
14 let mut out = [0; Char::MAX as usize];
15
16 for &b in buf.iter() {
17 out[u8::from(b) as usize] += 1;
18 }
19
20 out
21}
22
23pub fn chi_squared<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
24 let english_freqs = [
25 0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, 0.06094, 0.06966, 0.00153,
26 0.00772, 0.04025, 0.02406, 0.06749, 0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056,
27 0.02758, 0.00978, 0.02360, 0.00150, 0.01974, 0.00074,
28 ];
29
30 let freqs = letter_frequencies(buf);
31 let len_f = buf.len() as f64;
32
33 freqs
34 .iter()
35 .enumerate()
36 .map(|(i, &f)| {
37 let e_count = len_f * english_freqs[i];
38 let diff = f as f64 - e_count;
39 (diff * diff) / e_count
40 })
41 .sum()
42}
43
44pub fn index_of_coincidence<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
45 let freqs = letter_frequencies(buf);
46
47 let total = freqs
48 .iter()
49 .filter(|&f| *f > 0)
50 .map(|&f| f * (f - 1))
51 .sum::<u32>() as f64;
52
53 let len = buf.len();
54 let denominator = (len * (len - 1)) as f64 / Char::MAX as f64;
55
56 total / denominator
57}
58
59pub fn quadgram_score(buf: &Buffer) -> f64 {
60 let mut score = 0.0_f64;
61
62 for (c1, c2, c3, c4) in buf.into_iter().tuple_windows() {
63 let hash = (u8::from(*c1) as usize * 26_usize.pow(3))
64 + (u8::from(*c2) as usize * 26_usize.pow(2))
65 + (u8::from(*c3) as usize * 26_usize.pow(1))
66 + (u8::from(*c4) as usize * 26_usize.pow(0));
67 score += QUADGRAMS[hash] as f64; }
69
70 score / (buf.len() as f64) }
72
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample sentence shared by every test in this module.
    const SAMPLE: &str = "Rust is the best programming language";

    /// Pins the exact index-of-coincidence and chi-squared values for the
    /// sample sentence.
    #[test]
    fn test_heuristics() {
        let text = Buffer::from(SAMPLE);

        let ioc = index_of_coincidence(&text);
        let chi = chi_squared(&text);

        assert_eq!(1.310483870967742, ioc);
        assert_eq!(29.514280393617323, chi);
    }

    /// Pins the per-letter counts for the sample sentence.
    #[test]
    fn test_freqs() {
        let text = Buffer::from(SAMPLE);
        let counts = letter_frequencies(&text);

        let expected = [
            3, 1, 0, 0, 3, 0, 4, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 3, 3, 3, 2, 0, 0, 0, 0, 0,
        ];
        assert_eq!(expected, counts);
    }
}