use super::CharIndex;
lazy_static! {
static ref BIGRAMS: Vec<f64> = {
let f = include_str!("data/bigrams.txt");
let mut bigrams = vec![0.0; 676];
for line in f.lines() {
let line: Vec<_> = line.split(' ').collect();
let bigram: &str = line[0];
let count: f64 = line[1].parse().unwrap();
let index = bigram.chars().fold(0, |acc, c| 26 * acc + c.index());
bigrams[index] = count.ln();
}
bigrams
};
static ref TRIGRAMS: Vec<f64> = {
let f = include_str!("data/trigrams.txt");
let mut trigrams = vec![0.0; 17_576];
for line in f.lines() {
let line: Vec<_> = line.split(' ').collect();
let trigram: &str = line[0];
let count: f64 = line[1].parse().unwrap();
let index = trigram.chars().fold(0, |acc, c| 26 * acc + c.index());
trigrams[index] = count.ln();
}
trigrams
};
static ref QUADGRAMS: Vec<f64> = {
let f = include_str!("data/quadgrams.txt");
let mut quadgrams = vec![0.0; 456_976];
for line in f.lines() {
let line: Vec<_> = line.split(' ').collect();
let quadgram: &str = line[0];
let count: f64 = line[1].parse().unwrap();
let index = quadgram.chars().fold(0, |acc, c| 26 * acc + c.index());
quadgrams[index] = count.ln();
}
quadgrams
};
}
fn ngram_score(n: usize, ngrams: &Vec<f64>, msg: &str) -> f64 {
let char_indices: Vec<usize> = msg.to_uppercase().chars()
.filter(|&c| c.is_alphabetic())
.map(|c| c.index())
.collect();
if char_indices.len() < n {
panic!("Message has fewer than n alphabetic characters.");
}
char_indices.windows(n)
.map(|w| w.iter().fold(0, |acc, x| 26 * acc + x))
.map(|i| ngrams[i])
.sum()
}
pub trait FitnessFn {
fn score(msg: &str) -> f64;
}
pub struct Bigram;
impl FitnessFn for Bigram {
fn score(msg: &str) -> f64 {
ngram_score(2, &BIGRAMS, msg)
}
}
pub struct Trigram;
impl FitnessFn for Trigram {
fn score(msg: &str) -> f64 {
ngram_score(3, &TRIGRAMS, msg)
}
}
pub struct Quadgram;
impl FitnessFn for Quadgram {
fn score(msg: &str) -> f64 {
ngram_score(4, &QUADGRAMS, msg)
}
}
pub struct IoC;
impl FitnessFn for IoC {
fn score(msg: &str) -> f64 {
let char_indices: Vec<usize> = msg.chars()
.filter(|&c| c.is_alphabetic())
.map(|c| c.index())
.collect();
let mut buckets = [0; 26];
for c in &char_indices {
buckets[*c] += 1;
}
let tot: isize = buckets
.iter()
.map(|n| n * (n - 1))
.sum();
let n = char_indices.len();
return tot as f64 / (n * (n - 1) / 26) as f64;
}
}
#[cfg(test)]
mod tests {
use super::*;
macro_rules! assert_approx_eq {
($a:expr, $b:expr) => {
let (a, b) = (&$a, &$b);
assert!((*a - *b).abs() < 1.0e-6, "{} is not approximately equal to {}", *a, *b);
}
}
#[test]
fn quadgram_estimates() {
assert_approx_eq!(QUADGRAMS[0], 8.81060879);
assert_approx_eq!(Quadgram::score("THE QUICK BROWN FOX"), 149.80102862);
}
#[test]
fn sensible_ngram_scores() {
assert!(Bigram::score("AN ENGLISH PHRASE") > Bigram::score("ESARHP HSILGNE NA"));
assert!(Trigram::score("AN ENGLISH PHRASE") > Trigram::score("ESARHP HSILGNE NA"));
assert!(Quadgram::score("AN ENGLISH PHRASE") > Quadgram::score("ESARHP HSILGNE NA"));
}
#[test]
#[should_panic]
fn invalid_qgram_check() {
Quadgram::score("ABC");
}
#[test]
fn sensible_ioc_scores() {
assert_approx_eq!(IoC::score("THE INDEX OF COINCIDENCE PROVIDES A MEASURE OF HOW LIKELY IT IS TO DRAW TWO MATCHING LETTERS BY RANDOMLY SELECTING TWO LETTERS FROM A GIVEN TEXT"), 1.55925925);
assert_approx_eq!(IoC::score(&"ABCDEFGHIJKLMNOPQRSTUVWXYZ".repeat(100)), 0.99038091);
}
}