ferret 1.1.1

A trigram-based tool for detecting similarity in groups of text documents or program code.
Documentation
//! Defines token readers for different kinds of text or code.

use std::ffi::OsStr;
use super::chardrip::*;

// Multi-character symbols recognised for each language, keyed by file extension.
// Each entry is (extensions, symbols): a file whose extension appears in the first
// slice is tokenised by `CodeReader` using the symbols in the second slice.
//
// NOTE(review): `CodeReader::read_token` grows a symbol one character at a time and
// stops as soon as the extended text is not itself listed here, so a symbol of three
// or more characters is only recognised when its shorter prefixes (of two or more
// characters) are listed too. Entries such as "<!--", "-->", ">@>" and "=:=" are
// therefore not currently reachable, and neither is any symbol starting with '.',
// because a leading '.' is consumed by the number branch of the reader.
const DEFINITIONS : &[(&[&str], &[&str])] = &[
    (&["as", "actionscript"],
     &["||=", "&&=", "||", "&&", "===", "!==", ">=",
     "<=", "!=", "==", "/*", "*/", "//", "&=", "|=", "<<=", ">>=", "^=", "%=",
     ">>>", ">>>=", "<<", ">>", "+=", "-=", "*=", "/=", "++", "--"]
    ),
    (&["c", "h", "cpp"],
     &["!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=",
     "*=", "/=", "%=", "&=", "|=", "^=", "::", "->", "//", "<<",
     ">>", "##", "/*", "*/", ".*", "->*", "<<=", ">>="]
    ),
    (&["cs"],
     &["++", "--", "->", "<<", ">>", ">=", "<=", "==", "!=", "||",
     "&&", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=",
     ">>=", "??", "///", "/*", "*/", "//"]
    ),
    // "<=" was missing from this list although ">=" was present.
    (&["go"],
     &["+=", "&=", "&&", "==", "!=", "-=", "|=", "||",
     "*=", "^=", "<-", ">=", "<=", "<<", "/=", "<<=", "++", ":=", ">>", "%=",
     ">>=", "--", "...", "&^", "&^=", "//", "/*", "*/"]
    ),
    (&["groovy"],
     &["!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=",
     "*=", "/=", "%=", "&=", "|=", "^=", "//", "<<", ">>", "##",
     "/*", "*/", "/**", "<<=", ">>=", ">>>", ">>>=", "*.@", "<=>", "=~",
     "==~", "*.", ".@", "?:", "?."]
    ),
    (&["hs", "lhs"],
     &["--", "{-", "-}", "^^", "**", "&&", "||", "<=", "==", "/=",
     ">=", "++", "..", "::", "!!", "\\\\", "->", "<-", "=>", ">>",
     ">>=", ">@>"]
    ),
    (&["java"],
     &["!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=",
     "*=", "/=", "%=", "&=", "|=", "^=", "//", "<<", ">>", "/*",
     "*/", "/**", "<<=", ">>=", ">>>", ">>>="]
    ),
    (&["lua"],
     &["<=", ">=", "==", "~="]
    ),
    (&["php"],
     &["+=", "-=", "*=", "/=", "%=", ".=", "++", "--", "!=", "==",
     "===", "<>", "!==", ">=", "<=", "||", "&&"]
    ),
    (&["pl"],
     &["=<", ">=", "==", "=:=", ":-", "?-"]
    ),
    // "!=" used to be listed twice here; the duplicate has been removed.
    (&["py"],
     &["**", "//", ">=", "<=", "==", "!=", "<>", "+=", "-=",
     "*=", "/=", "%=", "**=", "//=", "<<", ">>"]
    ),
    (&["rb"],
     &["**", ">=", "<=", "<<", ">>", "<=>", "=~", "==", "===", "!=",
     "!~", "||", "&&", "..", "...", "+=", "-=", "*=", "/=", "%=",
     "&=", "||=", "&&=", "<<=", ">>=", "**="]
    ),
    (&["rs"],
     &["!=", "%=", "&=", "&&", "*=", "+=", "-=", "->", "..", "..=", "...",
     "/=", "<<", "<<=", "<=", "==", "=>", ">=", ">>", ">>=", "^=", "|=",
     "||", "::", "//", "//!", "///", "/*", "*/", "/*!", "/**"]
    ),
    (&["vb"],
     &[">=", "<=", "<>", "==", "+=", "-=", "*=", "/=", "\\=", "&=",
     "^=", "<<", ">>"]
    ),
    (&["html", "xml"],
     &["<?", "?>", "</", "/>", "<!--", "-->"]
    ),
    ];

// Characters that start a symbol token in every language handled by `CodeReader`.
// Kept as one-character strings because the reader looks them up with the same
// `&str` comparison it uses for the per-language multi-character symbols.
// ("^" used to be listed twice; the redundant entry has been removed.)
const SINGLE_CHAR_SYMBOLS : &[&str] = &[
    "!", "%", "/", "*", "+", "-", "=", "|", ",",
    "?", ".", "&", "(", ")", "{", "}", "<", ">", ":", ";", "^", "[", "]",
    "\"", "#", "~", "@"
];

// File extensions tokenised with `LispReader` (there are lots of lisp/scheme extensions!).
// NOTE(review): "sld" used to appear twice; the duplicate has been dropped. The second
// entry may have been intended as another extension (possibly "sls") -- confirm.
const LISP_EXTENSIONS : &[&str] = &[
    "clj", "lisp", "lsp", "rkt", "scm", "ss", "sld", "sps"
];

// File extensions treated as plain prose and tokenised with `TextReader`.
const TEXT_EXTENSIONS : &[&str] = &[
    "adoc",
    "md",
    "txt",
];

/// Returns true if the given extension is one that the program knows about.
///
/// An extension is known when it appears in one of the code definitions, the
/// lisp extensions or the text extensions. The comparison is exact (case-sensitive).
///
/// An extension that is not valid Unicode cannot equal any of the known
/// extensions, so it is reported as unknown instead of causing a panic.
pub fn is_known_extension (extn : &OsStr) -> bool {
    match extn.to_str () {
        Some(extn_str) => {
            DEFINITIONS.iter ().any (|(extns, _)| extns.contains (&extn_str))
                || LISP_EXTENSIONS.contains (&extn_str)
                || TEXT_EXTENSIONS.contains (&extn_str)
        }
        None => false,
    }
}

/// Builds the tokeniser that matches a file extension.
///
/// Lisp extensions get a `LispReader`, text extensions a `TextReader`, and
/// everything else a `CodeReader` configured with the symbols listed for that
/// extension in `DEFINITIONS` (or with no extra symbols when the extension is
/// not listed there).
///
/// # Panics
///
/// Panics if `extn` cannot be represented as a `&str`.
pub fn make_token_reader (extn : &OsStr, reader : CharDrip) -> Box<dyn TokenReader> {
    let name = extn.to_str().expect ("Fatal: cannot represent file extension as string");

    if LISP_EXTENSIONS.contains (&name) {
        return Box::new(LispReader::new (reader));
    }
    if TEXT_EXTENSIONS.contains (&name) {
        return Box::new(TextReader::new (reader));
    }

    // Should an extension ever be listed more than once, the last entry wins.
    let symbols : &'static [&'static str] = DEFINITIONS
        .iter ()
        .filter (|(extns, _)| extns.contains (&name))
        .map (|(_, defns)| *defns)
        .last ()
        .unwrap_or (&[]);

    Box::new(CodeReader::new (reader, symbols))
}

/// Holds a (prestring, token) pair, as produced by one call to a token reader.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so that results can be printed,
/// copied and compared directly (for example with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenResult {
    /// the text between the end of the last token and the start of this token
    pub prestring : String,
    /// the text of the current token
    pub token : String,
}

/// The interface shared by every tokeniser in this module, allowing callers
/// to pull tokens one at a time without knowing which kind of reader they hold.
pub trait TokenReader {
    /// Reads the next token, returning `None` when no further token is available.
    fn read_token(&mut self) -> Option<TokenResult>;
}

// -- code reader

/// General-purpose tokeniser for program source code in a variety of languages.
pub struct CodeReader {
    // where the characters come from
    input : CharDrip,
    // the multi-character symbols of the language being read
    symbols : &'static [&'static str],
}

impl CodeReader {
    /// Builds a code reader over `reader`.
    /// The `symbols` slice tailors it to the language being tokenised.
    pub fn new (reader : CharDrip, symbols : &'static [&'static str]) -> Self {
        Self { input : reader, symbols }
    }
}

impl TokenReader for CodeReader {
    // When reading code, there are three broad categories: numbers, symbols and identifiers
    // -- numbers are assumed to be made from numbers and .
    // -- symbols are contained in the symbols set, as defined on setup from the file extension
    // -- identifiers are the rest
    // This produces a useful set of tokens, especially for syntactically correct input code.
    // Comments are processed, but also treated as code.
    // Function returns Some(TokenResult), with prestring of blanks and the token, or 
    // None, when it reaches the EOF
    fn read_token (&mut self) -> Option<TokenResult> {
        let prestring = skip_blanks (&mut self.input);
        
        if let Some(c) = &self.input.read () {
            let mut result = String::from("");
            result.push (*c);

            if c.is_ascii_digit () || *c == '.' { // **** number
                loop {
                    if let Some(c) = &self.input.read () {
                        if !c.is_ascii_digit() && *c != '.' {
                            let _ = &self.input.unread ();
                            break; 
                        }
                        result.push (*c);
                    } else {
                        break;
                    }
                }

            } else if SINGLE_CHAR_SYMBOLS.contains (&c.to_string().as_str ()) ||
                self.symbols.contains (&c.to_string().as_str ()) { // **** symbol
                    let mut possible_symbol = result.clone ();

                    loop {
                        if let Some(c) = &self.input.read () {
                            possible_symbol.push (*c);
                            if self.symbols.contains (&possible_symbol.as_str ()) {
                                result.push (*c);
                            } else {
                                let _ = &self.input.unread ();
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                } else { // **** identifier
                    loop {
                        if let Some(c) = &self.input.read () {
                            if !c.is_ascii_alphanumeric() && *c != '_' {
                                let _ = &self.input.unread ();
                                break;
                            }
                            result.push (*c);
                        } else {
                            break;
                        }
                    }
                }
            Some(TokenResult { prestring: prestring, token: result })
        } else {
            None
        }
    }
}
// -- lisp reader

/// Tokeniser for lisp-family languages: the input is read as S-expressions,
/// i.e. parentheses and the atoms between them.
pub struct LispReader {
    // where the characters come from
    input : CharDrip,
}

impl LispReader {
    /// Builds a lisp reader over `reader`.
    pub fn new (reader : CharDrip) -> Self {
        Self { input : reader }
    }
}

impl TokenReader for LispReader {
    /// Reads the next lisp token.
    ///
    /// A token is either a single parenthesis, or an atom: a run of characters that
    /// ends at ASCII whitespace, a parenthesis, or the end of the input.
    ///
    /// Returns `Some(TokenResult)` holding the skipped blanks and the token,
    /// or `None` once the input is exhausted.
    fn read_token (&mut self) -> Option<TokenResult> {
        let prestring = skip_blanks (&mut self.input);
        let first = self.input.read ()?;
        let mut token = String::new ();
        token.push (first);

        if first != '(' && first != ')' {
            // Must be an atom: read until EOF, space, ( or ).
            // Reaching EOF ends the atom. The previous code kept looping when the
            // read returned `None`, so an atom at the very end of the input never
            // terminated.
            while let Some(next) = self.input.read () {
                if next.is_ascii_whitespace () || next == '(' || next == ')' {
                    // leave the delimiter for the next call
                    let _ = self.input.unread ();
                    break;
                }
                token.push (next);
            }
        }

        Some(TokenResult { prestring, token })
    }
}
// -- text reader

/// Tokeniser for plain text. Words are runs of alphabetic characters, returned in
/// lower case; everything between the words ends up in the prestring.
pub struct TextReader {
    // where the characters come from
    input : CharDrip,
}

impl TextReader {
    /// Builds a text reader over `reader`.
    pub fn new (reader : CharDrip) -> Self {
        Self { input : reader }
    }
}

impl TokenReader for TextReader {
    /// Reads the next word.
    ///
    /// Whatever `skip_non_alphabetic` consumes becomes the prestring. The word is
    /// the next character plus every following alphabetic character, and is
    /// returned in lower case.
    ///
    /// Returns `None` when nothing is left to read after the skipped text.
    fn read_token (&mut self) -> Option<TokenResult> {
        let prestring = skip_non_alphabetic (&mut self.input);
        let first = self.input.read ()?;

        let mut word = String::new ();
        word.push (first);

        // TODO: Add test for Han unicode range
        while let Some(next) = self.input.read () {
            let ends_word = next.is_ascii_whitespace () || !next.is_alphabetic ();
            if ends_word {
                // leave the delimiter for the next call
                let _ = self.input.unread ();
                break;
            }
            word.push (next);
        }

        Some(TokenResult { prestring, token: word.to_lowercase () })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Tokenises `text` with the reader selected for `extn`, collecting every token.
    fn tokenise (text : &str, extn : &str) -> Vec<TokenResult> {
        let mut reader = make_token_reader (OsStr::new (extn), CharDrip::new (text.chars().collect ()));
        let mut tokens = Vec::new ();
        while let Some(token) = reader.read_token () {
            tokens.push (token);
        }
        tokens
    }

    // Checks that each input text yields exactly the expected (prestring, token) pairs.
    fn test_tokeniser (cases : &[(&str, Vec<(&str, &str)>)], extn : &str) {
        for (text, expected) in cases {
            let tokens = tokenise (text, extn);

            assert_eq! (tokens.len (), expected.len ());
            for (found, (pre, tok)) in tokens.iter ().zip (expected) {
                assert_eq!(found.prestring, *pre);
                assert_eq!(found.token, *tok);
            }
        }
    }

    #[test]
    fn test_skip_blanks () {
        let cases = [
            ("abc", ""),
            ("  abc", "  "),
            ("  ", "  "),
            (" \t\na", " \t\n"),
        ];
        for &(text, skipped) in &cases {
            let mut drip = CharDrip::new (text.chars().collect ());
            let found = skip_blanks (&mut drip);
            assert_eq!(skipped.to_string(), found);
        }
    }

    #[test]
    fn test_java_tokeniser () {
        let cases = vec![
            ("(", vec![("", "(")]),
            ("int x+=3;", vec![("", "int"), (" ", "x"), ("", "+="), ("", "3"), ("", ";")]),
        ];
        test_tokeniser (&cases, "java");
    }

    #[test]
    fn test_lisp_tokeniser () {
        let cases = vec![
            ("(", vec![("", "(")]),
            ("(define)", vec![("", "("), ("", "define"), ("", ")")]),
            ("(define )", vec![("", "("), ("", "define"), (" ", ")")]),
        ];
        test_tokeniser (&cases, "ss");
    }

    #[test]
    fn test_text_tokeniser () {
        let cases = vec![
            ("abc", vec![("", "abc")]),
            ("abc. DE3fg", vec![("", "abc"), (". ", "de"), ("3", "fg")]),
            // including special case for Han characters - TODO
            // ("象形字", vec![("", "象"), ("", "形"), ("", "字")]),
        ];
        test_tokeniser (&cases, "txt");
    }
}