use std::ffi::OsStr;
use super::chardrip::*;
/// Multi-character symbols recognised per language.
///
/// Each entry pairs a list of file extensions (without the leading dot) with
/// the multi-character operators and comment markers that should be kept
/// together as a single token for files with one of those extensions.
/// Every extension appears in at most one entry and no symbol is repeated
/// within an entry.
const DEFINITIONS: &[(&[&str], &[&str])] = &[
    // ActionScript
    (
        &["as", "actionscript"],
        &[
            "||=", "&&=", "||", "&&", "===", "!==", ">=", "<=", "!=", "==", "/*", "*/", "//",
            "&=", "|=", "<<=", ">>=", "^=", "%=", ">>>", ">>>=", "<<", ">>", "+=", "-=", "*=",
            "/=", "++", "--",
        ],
    ),
    // C / C++
    (
        &["c", "h", "cpp"],
        &[
            "!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=", "*=", "/=", "%=", "&=",
            "|=", "^=", "::", "->", "//", "<<", ">>", "##", "/*", "*/", ".*", "->*", "<<=",
            ">>=",
        ],
    ),
    // C#
    (
        &["cs"],
        &[
            "++", "--", "->", "<<", ">>", ">=", "<=", "==", "!=", "||", "&&", "+=", "-=", "*=",
            "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "??", "///", "/*", "*/", "//",
        ],
    ),
    // Go ("<=" is listed next to ">=" so both comparisons stay single tokens)
    (
        &["go"],
        &[
            "+=", "&=", "&&", "==", "!=", "-=", "|=", "||", "*=", "^=", "<-", ">=", "<=", "<<",
            "/=", "<<=", "++", ":=", ">>", "%=", ">>=", "--", "...", "&^", "&^=", "//", "/*",
            "*/",
        ],
    ),
    // Groovy
    (
        &["groovy"],
        &[
            "!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=", "*=", "/=", "%=", "&=",
            "|=", "^=", "//", "<<", ">>", "##", "/*", "*/", "/**", "<<=", ">>=", ">>>", ">>>=",
            "*.@", "<=>", "=~", "==~", "*.", ".@", "?:", "?.",
        ],
    ),
    // Haskell
    (
        &["hs", "lhs"],
        &[
            "--", "{-", "-}", "^^", "**", "&&", "||", "<=", "==", "/=", ">=", "++", "..", "::",
            "!!", "\\\\", "->", "<-", "=>", ">>", ">>=", ">@>",
        ],
    ),
    // Java
    (
        &["java"],
        &[
            "!=", "++", "--", "==", ">=", "<=", "||", "&&", "+=", "-=", "*=", "/=", "%=", "&=",
            "|=", "^=", "//", "<<", ">>", "/*", "*/", "/**", "<<=", ">>=", ">>>", ">>>=",
        ],
    ),
    // Lua
    (&["lua"], &["<=", ">=", "==", "~="]),
    // PHP
    (
        &["php"],
        &[
            "+=", "-=", "*=", "/=", "%=", ".=", "++", "--", "!=", "==", "===", "<>", "!==", ">=",
            "<=", "||", "&&",
        ],
    ),
    // "pl" files
    (&["pl"], &["=<", ">=", "==", "=:=", ":-", "?-"]),
    // Python
    (
        &["py"],
        &[
            "**", "//", ">=", "<=", "==", "!=", "<>", "+=", "-=", "*=", "/=", "%=", "**=", "//=",
            "<<", ">>",
        ],
    ),
    // Ruby
    (
        &["rb"],
        &[
            "**", ">=", "<=", "<<", ">>", "<=>", "=~", "==", "===", "!=", "!~", "||", "&&", "..",
            "...", "+=", "-=", "*=", "/=", "%=", "&=", "||=", "&&=", "<<=", ">>=", "**=",
        ],
    ),
    // Rust
    (
        &["rs"],
        &[
            "!=", "%=", "&=", "&&", "*=", "+=", "-=", "->", "..", "..=", "...", "/=", "<<", "<<=",
            "<=", "==", "=>", ">=", ">>", ">>=", "^=", "|=", "||", "::", "//", "//!", "///", "/*",
            "*/", "/*!", "/**",
        ],
    ),
    // Visual Basic
    (
        &["vb"],
        &[
            ">=", "<=", "<>", "==", "+=", "-=", "*=", "/=", "\\=", "&=", "^=", "<<", ">>",
        ],
    ),
    // HTML / XML
    (&["html", "xml"], &["<?", "?>", "</", "/>", "<!--", "-->"]),
];
/// Characters that `CodeReader` treats as the start of a symbol token in
/// every language, regardless of the per-language symbol list.
///
/// Each entry is a one-character string and appears exactly once.
const SINGLE_CHAR_SYMBOLS: &[&str] = &[
    "!", "%", "/", "*", "+", "-", "=", "|", ",", "?", ".", "&", "(", ")", "{", "}", "<", ">",
    ":", ";", "^", "[", "]", "\"", "#", "~", "@",
];
/// File extensions (without the leading dot) that are tokenised with
/// `LispReader`. Each extension is listed once.
// NOTE(review): ".sls" is not listed; confirm whether it should be.
const LISP_EXTENSIONS: &[&str] = &["clj", "lisp", "lsp", "rkt", "scm", "ss", "sld", "sps"];
/// File extensions (without the leading dot) that are tokenised as plain
/// prose with `TextReader`.
const TEXT_EXTENSIONS: &[&str] = &["adoc", "md", "txt"];
/// Reports whether files with extension `extn` can be tokenised by one of
/// the dedicated readers.
///
/// `extn` is the extension without the leading dot. It is known when it
/// appears in `DEFINITIONS`, `LISP_EXTENSIONS` or `TEXT_EXTENSIONS`.
///
/// An extension that is not valid UTF-8 cannot match any listed extension,
/// so it is reported as unknown rather than causing a panic.
pub fn is_known_extension(extn: &OsStr) -> bool {
    let extn_str = match extn.to_str() {
        Some(s) => s,
        // Every listed extension is plain ASCII, so this can never match.
        None => return false,
    };
    DEFINITIONS
        .iter()
        .any(|(extns, _)| extns.contains(&extn_str))
        || LISP_EXTENSIONS.contains(&extn_str)
        || TEXT_EXTENSIONS.contains(&extn_str)
}
/// Builds the token reader appropriate for files with extension `extn`,
/// reading characters from `reader`.
///
/// * Extensions in `LISP_EXTENSIONS` get a `LispReader`.
/// * Extensions in `TEXT_EXTENSIONS` get a `TextReader`.
/// * Anything else gets a `CodeReader`, configured with the symbol list of
///   the matching `DEFINITIONS` entry, or with no multi-character symbols
///   when the extension is not listed.
///
/// # Panics
///
/// Panics if `extn` is not valid UTF-8.
pub fn make_token_reader(extn: &OsStr, reader: CharDrip) -> Box<dyn TokenReader> {
    let extn_str = extn
        .to_str()
        .expect("Fatal: cannot represent file extension as string");

    if LISP_EXTENSIONS.contains(&extn_str) {
        return Box::new(LispReader::new(reader));
    }
    if TEXT_EXTENSIONS.contains(&extn_str) {
        return Box::new(TextReader::new(reader));
    }

    // Search from the back so that, should an extension ever be listed in
    // more than one entry, the last entry is the one that applies.
    let symbols: &'static [&'static str] = DEFINITIONS
        .iter()
        .rev()
        .find(|(extns, _)| extns.contains(&extn_str))
        .map(|(_, defns)| *defns)
        .unwrap_or(&[]);
    Box::new(CodeReader::new(reader, symbols))
}
/// One token produced by a [`TokenReader`], together with the text that was
/// skipped immediately before it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenResult {
    /// The characters the reader skipped over before the token started.
    pub prestring: String,
    /// The text of the token itself.
    pub token: String,
}
/// A source of tokens that can be pulled one at a time.
pub trait TokenReader {
    /// Returns the next token along with the text skipped before it, or
    /// `None` when no further token is available.
    fn read_token(&mut self) -> Option<TokenResult>;
}
/// Token reader for source code: splits input into numbers, symbols and
/// identifier-like words.
pub struct CodeReader {
    /// Character source the tokens are read from.
    input: CharDrip,
    /// Multi-character symbols to keep together as single tokens.
    symbols: &'static [&'static str],
}
impl CodeReader {
pub fn new (reader : CharDrip, symbols : &'static [&'static str]) -> CodeReader {
CodeReader { input : reader, symbols : symbols}
}
}
impl TokenReader for CodeReader {
    /// Reads the next code token.
    ///
    /// Leading blanks are skipped and returned as the token's `prestring`.
    /// The first character then decides the kind of token:
    ///
    /// * an ASCII digit or `.` starts a number, which extends over every
    ///   following ASCII digit or `.`;
    /// * a character found in `SINGLE_CHAR_SYMBOLS` or in the reader's
    ///   symbol list starts a symbol, which grows one character at a time
    ///   for as long as the grown text is itself a listed symbol;
    /// * anything else starts a word, which extends over every following
    ///   ASCII letter, ASCII digit or `_`.
    ///
    /// Because `.` always starts a number, listed symbols beginning with `.`
    /// are never produced by the symbol branch. A symbol of three or more
    /// characters is only reachable when its shorter prefixes (from length
    /// two) are listed too.
    ///
    /// Returns `None` once the input is exhausted.
    fn read_token(&mut self) -> Option<TokenResult> {
        let prestring = skip_blanks(&mut self.input);
        let first = self.input.read()?;
        let mut token = first.to_string();

        if first.is_ascii_digit() || first == '.' {
            // Number: digits and dots.
            while let Some(next) = self.input.read() {
                if next.is_ascii_digit() || next == '.' {
                    token.push(next);
                } else {
                    let _ = self.input.unread();
                    break;
                }
            }
        } else if SINGLE_CHAR_SYMBOLS.contains(&token.as_str())
            || self.symbols.contains(&token.as_str())
        {
            // Symbol: keep extending while the longer text is still listed.
            while let Some(next) = self.input.read() {
                let mut candidate = token.clone();
                candidate.push(next);
                if !self.symbols.contains(&candidate.as_str()) {
                    let _ = self.input.unread();
                    break;
                }
                token = candidate;
            }
        } else {
            // Word: ASCII alphanumerics and underscores.
            while let Some(next) = self.input.read() {
                if next.is_ascii_alphanumeric() || next == '_' {
                    token.push(next);
                } else {
                    let _ = self.input.unread();
                    break;
                }
            }
        }

        Some(TokenResult { prestring, token })
    }
}
/// Token reader for Lisp-family sources: parentheses are tokens of their
/// own, everything else is split on whitespace and parentheses.
pub struct LispReader {
    /// Character source the tokens are read from.
    input: CharDrip,
}
impl LispReader {
pub fn new (reader : CharDrip) -> LispReader {
LispReader { input : reader }
}
}
impl TokenReader for LispReader {
    /// Reads the next Lisp token.
    ///
    /// Leading blanks are skipped and returned as the token's `prestring`.
    /// A `(` or `)` is a token on its own. Any other character starts an
    /// atom, which extends up to (but not including) the next ASCII
    /// whitespace character or parenthesis, or to the end of the input.
    ///
    /// Returns `None` once the input is exhausted.
    fn read_token(&mut self) -> Option<TokenResult> {
        let prestring = skip_blanks(&mut self.input);
        let first = self.input.read()?;
        let mut token = first.to_string();

        if first != '(' && first != ')' {
            // The loop also stops when the input runs out, so an atom at the
            // very end of the input is returned instead of spinning forever.
            while let Some(next) = self.input.read() {
                if next.is_ascii_whitespace() || next == '(' || next == ')' {
                    // The delimiter belongs to the next token.
                    let _ = self.input.unread();
                    break;
                }
                token.push(next);
            }
        }

        Some(TokenResult { prestring, token })
    }
}
/// Token reader for plain prose: yields lower-cased runs of alphabetic
/// characters.
pub struct TextReader {
    /// Character source the tokens are read from.
    input: CharDrip,
}
impl TextReader {
pub fn new (reader : CharDrip) -> TextReader {
TextReader { input : reader }
}
}
impl TokenReader for TextReader {
    /// Reads the next word of prose.
    ///
    /// Whatever `skip_non_alphabetic` consumes is returned as the token's
    /// `prestring`. The character after that starts the word, which extends
    /// over every following alphabetic character; the word is returned in
    /// lower case.
    ///
    /// Returns `None` once the input is exhausted.
    fn read_token(&mut self) -> Option<TokenResult> {
        let prestring = skip_non_alphabetic(&mut self.input);
        let mut word = self.input.read()?.to_string();

        while let Some(next) = self.input.read() {
            // Whitespace is never alphabetic, so one test covers both
            // whitespace and other separators.
            if !next.is_alphabetic() {
                let _ = self.input.unread();
                break;
            }
            word.push(next);
        }

        Some(TokenResult {
            prestring,
            token: word.to_lowercase(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `skip_blanks` returns exactly the leading blanks of the input.
    #[test]
    fn test_skip_blanks() {
        let cases = [
            ("abc", ""),
            (" abc", " "),
            (" ", " "),
            (" \t\na", " \t\n"),
        ];
        for &(text, expected) in &cases {
            let mut drip = CharDrip::new(text.chars().collect());
            let skipped = skip_blanks(&mut drip);
            assert_eq!(expected.to_string(), skipped);
        }
    }

    /// Java sources keep multi-character operators such as `+=` together.
    #[test]
    fn test_java_tokeniser() {
        let cases = vec![
            ("(", vec![("", "(")]),
            (
                "int x+=3;",
                vec![("", "int"), (" ", "x"), ("", "+="), ("", "3"), ("", ";")],
            ),
        ];
        assert_tokens("java", &cases);
    }

    /// Lisp sources split on parentheses and whitespace.
    #[test]
    fn test_lisp_tokeniser() {
        let cases = vec![
            ("(", vec![("", "(")]),
            ("(define)", vec![("", "("), ("", "define"), ("", ")")]),
            ("(define )", vec![("", "("), ("", "define"), (" ", ")")]),
        ];
        assert_tokens("ss", &cases);
    }

    /// Prose is split into lower-cased alphabetic words.
    #[test]
    fn test_text_tokeniser() {
        let cases = vec![
            ("abc", vec![("", "abc")]),
            ("abc. DE3fg", vec![("", "abc"), (". ", "de"), ("3", "fg")]),
        ];
        assert_tokens("txt", &cases);
    }

    /// Tokenises each input text with the reader chosen for `extn` and checks
    /// the produced `(prestring, token)` pairs against the expected ones.
    fn assert_tokens(extn: &str, cases: &[(&str, Vec<(&str, &str)>)]) {
        for (text, expected) in cases {
            let mut reader =
                make_token_reader(OsStr::new(extn), CharDrip::new(text.chars().collect()));
            let mut produced = Vec::new();
            while let Some(token) = reader.read_token() {
                produced.push(token);
            }
            assert_eq!(produced.len(), expected.len());
            for (got, (want_pre, want_tok)) in produced.iter().zip(expected) {
                assert_eq!(got.prestring, *want_pre);
                assert_eq!(got.token, *want_tok);
            }
        }
    }
}