tongrams 0.1.4

Tons of N-grams
Documentation
use std::fs::File;
use std::io::{prelude::*, BufReader};
use std::path::PathBuf;
use std::str::FromStr;

use tongrams::loader::{GramsGzFileLoader, GramsLoader};
use tongrams::EliasFanoTrieCountLm;

const TEST_FILENAMES: [&str; 5] = [
    "../test_data/1-grams.sorted.gz",
    "../test_data/2-grams.sorted.gz",
    "../test_data/3-grams.sorted.gz",
    "../test_data/4-grams.sorted.gz",
    "../test_data/5-grams.sorted.gz",
];

const NOEXIST_FILENAME: &str = "../test_data/queries.noexist.5K.txt";

const NUM_GRAMS: [usize; 5] = [8761, 38900, 61516, 70186, 73187];

fn load_noexist_queries() -> Vec<String> {
    let file = File::open(NOEXIST_FILENAME).expect("No such file");
    let buf = BufReader::new(file);
    buf.lines()
        .map(|l| l.expect("Could not parse line"))
        .collect()
}

#[test]
fn test_parser() {
    for (&filename, &num_grams) in TEST_FILENAMES.iter().zip(NUM_GRAMS.iter()) {
        let loader = GramsGzFileLoader::new(PathBuf::from_str(filename).unwrap());
        let parser = loader.parser().unwrap();
        assert_eq!(parser.num_grams(), num_grams);
    }
}

#[test]
fn test_lookup() {
    let lm = EliasFanoTrieCountLm::from_gz_files(&TEST_FILENAMES).unwrap();
    assert_eq!(lm.num_orders(), 5);

    let mut lookuper = lm.lookuper();
    for filename in TEST_FILENAMES {
        let loader = GramsGzFileLoader::new(filename);
        let parser = loader.parser().unwrap();
        for rec in parser {
            let rec = rec.unwrap();
            assert_eq!(lookuper.with_gram(rec.gram()), Some(rec.count()));
        }
    }
}

#[test]
fn test_noexist_lookup() {
    let lm = EliasFanoTrieCountLm::from_gz_files(&TEST_FILENAMES).unwrap();
    assert_eq!(lm.num_orders(), 5);

    let mut lookuper = lm.lookuper();
    let queries = load_noexist_queries();
    for query in &queries {
        assert_eq!(lookuper.with_str(query), None);
    }
}

#[test]
fn test_serialization() {
    let lm = EliasFanoTrieCountLm::from_gz_files(&TEST_FILENAMES).unwrap();

    let mut data = vec![];
    lm.serialize_into(&mut data).unwrap();

    let other = EliasFanoTrieCountLm::deserialize_from(&data[..]).unwrap();
    assert_eq!(lm.num_orders(), other.num_orders());
    assert_eq!(lm.num_grams(), other.num_grams());
}