// ferret 1.1.1
//
// A trigram-based tool for detecting similarity in groups of text documents or program code.
// Documentation
//! Splits a given text file into a sequence of trigrams.

use std::path::PathBuf;
use std::fs;

use super::chardrip::CharDrip;
use super::tokenisers;

/// Sliding three-token window over the output of a tokeniser.
///
/// Every token read is appended to `tokens` (and its prestring to
/// `prestrings`), displacing the oldest entry, so both vectors always
/// describe the most recently read trigram.
pub struct TrigramReader {
    /// Source the tokens are pulled from, one at a time.
    token_reader: Box<dyn tokenisers::TokenReader>,
    /// Prestrings of the last three tokens read, oldest first.
    pub prestrings: Vec<String>,
    /// The last three tokens read, oldest first.
    pub tokens: Vec<String>,
}

impl TrigramReader {
    /// Creates a new TrigramReader over the contents of the given file.
    ///
    /// The file's extension is handed to `tokenisers::make_token_reader`
    /// together with the file's characters. A path with no extension is
    /// passed on as an empty extension instead of aborting.
    ///
    /// # Panics
    ///
    /// Panics if the file cannot be read into a string (for example it does
    /// not exist, or is not valid UTF-8). The message names the file and the
    /// underlying error.
    pub fn new (file : &PathBuf) -> TrigramReader {
        let contents = match fs::read_to_string (file.as_path ()) {
            Ok(contents) => contents,
            // Keep the original message prefix, but say which file and why.
            Err(err) => panic! ("Could not open file {}: {}", file.display (), err),
        };
        let dripper = CharDrip::new (contents.chars().collect ());
        // `extension()` is `None` for names such as `README`; use an empty
        // extension rather than panicking.
        // NOTE(review): assumes make_token_reader falls back to a default
        // tokeniser for an unrecognised extension — confirm.
        let extn = file.extension().unwrap_or_default ();
        TrigramReader {
            token_reader: tokenisers::make_token_reader (extn, dripper),
            prestrings: vec![String::from(""), String::from(""), String::from("")],
            tokens: vec![String::from(""), String::from(""), String::from("")]
        }
    }

    /// Returns the last trigram read: the three stored tokens, oldest first,
    /// separated by single spaces.
    pub fn last_trigram (&self) -> String {
        format!("{} {} {}", &self.tokens[0], &self.tokens[1], &self.tokens[2])
    }

    /// Attempts to read the next word to form the next trigram.
    ///
    /// Tokens are pulled from the token reader and shifted into `prestrings`
    /// and `tokens` until the window holds a complete trigram.
    /// Returns `true` if it succeeds, or `false` once the token reader has no
    /// more tokens to give.
    pub fn read_trigram (&mut self) -> bool {
        while let Some(result) = &self.token_reader.read_token () {
            push_to_triple (&mut self.prestrings, result.prestring.clone ());
            push_to_triple (&mut self.tokens, result.token.clone ());

            // The window starts as three empty strings, so the oldest slot
            // stays empty until three tokens have been read; keep reading
            // until then.
            if !self.tokens[0].is_empty () {
                return true;
            }
        }
        false
    }
}

/// Shifts `item` into the window held in `items`: the oldest (first) entry is
/// discarded and `item` becomes the newest (last) entry.
///
/// `items` is expected to hold exactly three entries; this is checked in
/// debug builds only.
fn push_to_triple (items : &mut Vec<String>, item : String) {
    debug_assert!(items.len() == 3, "Prestrings/Tokens must be length 3");
    // Move the oldest entry to the back, then overwrite it with the newcomer.
    items.rotate_left (1);
    if let Some(newest) = items.last_mut () {
        *newest = item;
    }
}


#[cfg(test)]
mod tests {
    use std::ffi::OsStr;
    use super::*;

    // Each case pairs an input text with the trigrams expected from it, in
    // the order they should be produced.

    #[test]
    fn test_java_trigram_reader () {
        let cases = vec![
            ("int x+=3;", vec!["int x +=", "x += 3", "+= 3 ;"]),
        ];
        test_trigram_reader (&cases, "java");
    }

    #[test]
    fn test_lisp_trigram_reader () {
        let cases = vec![
            ("(a)", vec!["( a )"]),
            ("( a )", vec!["( a )"]),
            ("( a b )", vec!["( a b", "a b )"]),
            ("(abc b)", vec!["( abc b", "abc b )"]),
        ];
        test_trigram_reader (&cases, "ss");
    }

    #[test]
    fn test_text_trigram_reader () {
        let cases = vec![
            ("", vec![]),
            ("a", vec![]),
            ("a b", vec![]),
            ("a b c", vec!["a b c"]),
            ("some. words, with, punctuation 123 Numbers",
             vec!["some words with", "words with punctuation", "with punctuation numbers"]),
        ];
        test_trigram_reader (&cases, "txt");
    }

    /// Runs every case through a reader built for the extension `extn` and
    /// checks that the trigrams produced match the expected ones exactly.
    fn test_trigram_reader (tests : &[(&str, Vec<&str>)], extn : &str) {
        for (text, targets) in tests {
            let dripper = CharDrip::new (text.chars().collect ());
            let mut reader = TrigramReader {
                token_reader: tokenisers::make_token_reader (OsStr::new (extn), dripper),
                prestrings: vec![String::new (); 3],
                tokens: vec![String::new (); 3],
            };

            // Drain the reader, recording each trigram as it is completed.
            let mut results = Vec::new ();
            while reader.read_trigram () {
                results.push (reader.last_trigram ());
            }

            assert_eq! (results.len (), targets.len ());
            for (result, target) in results.iter ().zip (targets.iter ()) {
                assert_eq! (result, target);
            }
        }
    }
}