ferret/
trigram_reader.rs

1//! Splits a given text file into a sequence of trigrams.
2
3use std::path::PathBuf;
4use std::fs;
5
6use super::chardrip::CharDrip;
7use super::tokenisers;
8
9/// A trigram reader gradually reads tokens through a given tokeniser, and stores 
10/// the prestrings and tokens of the last trigram read.
11pub struct TrigramReader {
12    token_reader : Box<dyn tokenisers::TokenReader>,
13    pub prestrings : Vec<String>,
14    pub tokens : Vec<String>,
15}
16
17impl TrigramReader {
18    /// Creates a new TrigramReader based on a given file.
19    pub fn new (file : &PathBuf) -> TrigramReader {
20        if let Ok(contents) = fs::read_to_string (file.as_path ()) {
21            let dripper = CharDrip::new (contents.chars().collect ());
22            let extn = file.extension().unwrap ();
23            TrigramReader {
24                token_reader: tokenisers::make_token_reader (extn, dripper),
25                prestrings: vec![String::from(""), String::from(""), String::from("")],
26                tokens: vec![String::from(""), String::from(""), String::from("")]
27            }
28        } else {
29            panic! ("Could not open file");
30        }
31    }
32
33    /// Returns the last trigram read.
34    pub fn last_trigram (&self) -> String {
35        format!("{} {} {}", &self.tokens[0], &self.tokens[1], &self.tokens[2])
36    }
37
38    /// Attempts to read the next word to form the next trigram. 
39    /// Returns `true` if it succeeds, or `false` when at the end of a file.
40    pub fn read_trigram (&mut self) -> bool {
41        loop {
42            match &self.token_reader.read_token () {
43                Some(result) => {
44                    push_to_triple (&mut self.prestrings, result.prestring.clone ());
45                    push_to_triple (&mut self.tokens, result.token.clone ());
46
47                    if &self.tokens[0] != "" { // needed for first two iterations
48                        return true;
49                    }
50                },
51                None => return false,
52            }
53        }
54    }
55}
56
// Slides the window forward by one entry: the oldest element is discarded
// and `item` becomes the newest. Prestrings and tokens must always hold
// exactly 3 entries, which is checked in debug builds.
fn push_to_triple(items: &mut Vec<String>, item: String) {
    debug_assert!(items.len() == 3, "Prestrings/Tokens must be length 3");
    // Rotate the oldest entry into the final slot, then overwrite it with
    // the new item.
    items.rotate_left(1);
    if let Some(newest) = items.last_mut() {
        *newest = item;
    }
}
64
65
#[cfg(test)]
mod tests {
    use std::ffi::OsStr;
    use super::*;

    /// Java source: operators such as `+=` are kept together as one token.
    #[test]
    fn test_java_trigram_reader() {
        let tests = vec![
            ("int x+=3;", vec!["int x +=", "x += 3", "+= 3 ;"]),
        ];
        test_trigram_reader(&tests, "java");
    }

    /// Lisp-style source (`.ss`): parentheses are tokens of their own.
    #[test]
    fn test_lisp_trigram_reader() {
        let tests = vec![
            ("(a)", vec!["( a )"]),
            ("( a )", vec!["( a )"]),
            ("( a b )", vec!["( a b", "a b )"]),
            ("(abc b)", vec!["( abc b", "abc b )"]),
        ];
        test_trigram_reader(&tests, "ss");
    }

    /// Plain text: inputs with fewer than three words yield no trigram.
    #[test]
    fn test_text_trigram_reader() {
        let tests = vec![
            ("", vec![]),
            ("a", vec![]),
            ("a b", vec![]),
            ("a b c", vec!["a b c"]),
            ("some. words, with, punctuation 123 Numbers",
             vec!["some words with", "words with punctuation", "with punctuation numbers"]),
        ];
        test_trigram_reader(&tests, "txt");
    }

    /// Runs every `(text, expected trigrams)` pair through a reader built for
    /// the file extension `extn` and checks the full trigram sequence.
    fn test_trigram_reader(tests: &[(&str, Vec<&str>)], extn: &str) {
        for (text, targets) in tests {
            // Built by hand rather than via `TrigramReader::new` so that no
            // file on disk is needed.
            let mut reader = TrigramReader {
                token_reader: tokenisers::make_token_reader(
                    OsStr::new(extn),
                    CharDrip::new(text.chars().collect()),
                ),
                prestrings: vec![String::new(); 3],
                tokens: vec![String::new(); 3],
            };

            let mut results = Vec::new();
            while reader.read_trigram() {
                results.push(reader.last_trigram());
            }

            // Compare whole sequences so that a failure shows both lists and
            // the input that produced them.
            assert_eq!(results, *targets, "unexpected trigrams for input {:?}", text);
        }
    }
}