1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
use crate::{Buffer, Char, CharStream};
use itertools::Itertools;
use lazy_static::lazy_static;
lazy_static! {
static ref QUADGRAMS: &'static [f32] = {
let buf = include_bytes!("../data/quadgram_scores.raw");
unsafe { std::slice::from_raw_parts(buf.as_ptr() as *const f32, 26 * 26 * 26 * 26) }
};
}
pub fn letter_frequencies<'a, T: CharStream<'a>>(buf: &'a T) -> [u32; Char::MAX as usize] {
let mut out = [0; Char::MAX as usize];
for &b in buf.iter() {
out[u8::from(b) as usize] += 1;
}
out
}
pub fn chi_squared<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
let english_freqs = [
0.08167, 0.01492, 0.02782, 0.04253, 0.12702, 0.02228, 0.02015, 0.06094, 0.06966, 0.00153,
0.00772, 0.04025, 0.02406, 0.06749, 0.07507, 0.01929, 0.00095, 0.05987, 0.06327, 0.09056,
0.02758, 0.00978, 0.02360, 0.00150, 0.01974, 0.00074,
];
let freqs = letter_frequencies(buf);
let len_f = buf.len() as f64;
freqs
.iter()
.enumerate()
.map(|(i, &f)| {
let e_count = len_f * english_freqs[i];
let diff = f as f64 - e_count;
(diff * diff) / e_count
})
.sum()
}
pub fn index_of_coincidence<'a, T: CharStream<'a>>(buf: &'a T) -> f64 {
let freqs = letter_frequencies(buf);
let total = freqs
.iter()
.filter(|&f| *f > 0)
.map(|&f| f * (f - 1))
.sum::<u32>() as f64;
let len = buf.len();
let denominator = (len * (len - 1)) as f64 / Char::MAX as f64;
total / denominator
}
pub fn quadgram_score(buf: &Buffer) -> f64 {
let mut score = 0.0_f64;
for (c1, c2, c3, c4) in buf.into_iter().tuple_windows() {
let hash = (u8::from(*c1) as usize * 26_usize.pow(3))
+ (u8::from(*c2) as usize * 26_usize.pow(2))
+ (u8::from(*c3) as usize * 26_usize.pow(1))
+ (u8::from(*c4) as usize * 26_usize.pow(0));
score += QUADGRAMS[hash] as f64;
}
score / (buf.len() as f64)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_heuristics() {
let buf = Buffer::from("Rust is the best programming language");
assert_eq!(1.310483870967742, index_of_coincidence(&buf));
assert_eq!(29.514280393617323, chi_squared(&buf));
}
#[test]
fn test_freqs() {
let buf = Buffer::from("Rust is the best programming language");
let expected = [
3, 1, 0, 0, 3, 0, 4, 1, 2, 0, 0, 1, 2, 2, 1, 1, 0, 3, 3, 3, 2, 0, 0, 0, 0, 0,
];
assert_eq!(expected, letter_frequencies(&buf));
}
}