use std::{ffi::OsString, path::Path};
use thiserror::Error;
use super::loader;
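
/// Errors that can occur while tokenizing a2l text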
#[derive(Debug, Error)]
#[non_exhaustive]
pub enum TokenizerError {
#[error("{filename}:{line}: Failed to load included file {incname}")]
IncludeFileError {
filename: String,
line: u32,
incname: String,
},
#[error("{filename}:{line}: Include directive was not follwed by a filename")]
IncompleteIncludeError { filename: String, line: u32 },
#[error("{filename}:{line}: Input text \"{tokentext}...\" was not recognized as an a2l token")]
InvalidA2lToken {
filename: String,
line: u32,
tokentext: String,
},
#[error("{filename}:{line}: Invalid numerical constant \"{tokentext}\"")]
InvalidNumericalConstant {
filename: String,
line: u32,
tokentext: String,
},
#[error("{filename}:{line}: Block comment was not closed before the end of input was reached")]
UnclosedComment { filename: String, line: u32 },
#[error("{filename}:{line}: String was not closed before the end of input was reached")]
UnclosedString { filename: String, line: u32 },
#[error("{filename}:{line}: There is no whitespace separating the input tokens")]
MissingWhitespace { filename: String, line: u32 },
}
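
/// The kind of an [`A2lToken`]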
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum A2lTokenType {
    /// an identifier or keyword
    Identifier,
    /// the block-open marker "/begin"
    Begin,
    /// the block-close marker "/end"
    End,
    /// the "/include" directive
    Include,
    /// a quoted string; also used for the raw content of an A2ML block
    String,
    /// a numerical constant, e.g. 1.5 or 0xDEADBEEF
    Number,
}
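
/// A single a2l token.
///
/// The token does not store its own text; instead, startpos and endpos refer
/// to a span of the text of the file identified by fileid.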
#[derive(Debug, Clone)]
pub struct A2lToken {
    pub ttype: A2lTokenType,
    /// byte offset of the start of the token text in the file data
    pub startpos: usize,
    /// byte offset just past the end of the token text
    pub endpos: usize,
    /// id of the file containing the token; indexes the filedata and filenames of the result
    pub fileid: usize,
    pub line: u32,
}
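
/// The output of the tokenizer: the token stream, plus the text and name of
/// every file that was read. The `filedata` and `filenames` vectors are
/// indexed by the fileid stored in each token.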
#[derive(Debug)]
pub(crate) struct TokenResult {
pub(crate) tokens: Vec<A2lToken>,
pub(crate) filedata: Vec<String>,
pub(crate) filenames: Vec<String>,
}
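
/// Tokenize the text of an a2l file.
///
/// Any /include directives are resolved immediately: each referenced file is
/// loaded and tokenized, and its tokens are spliced into the output in place
/// of the directive.
///
/// A minimal usage sketch (illustrative only, not a doctest, since this
/// function is crate-internal):
///
/// ```ignore
/// let text = r#"/begin PROJECT prj """#;
/// let result = tokenize("example.a2l".to_string(), 0, text)?;
/// assert_eq!(result.tokens[0].ttype, A2lTokenType::Begin);
/// ```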
pub(crate) fn tokenize(
filename: String,
fileid: usize,
filetext: &str,
) -> Result<TokenResult, TokenizerError> {
let mut filenames: Vec<String> = vec![filename.clone()];
let mut filedatas: Vec<String> = vec![filetext.to_owned()];
let filebytes = filetext.as_bytes();
let mut next_fileid = fileid + 1;
let input_tokens = tokenize_core(filename.clone(), fileid, filetext)?;
let mut include_directives: Vec<usize> = input_tokens
.iter()
.enumerate()
.filter_map(|(pos, A2lToken { ttype, .. })| {
if *ttype == A2lTokenType::Include {
Some(pos)
} else {
None
}
})
.collect();
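    // if any /include directives were found, splice the tokens of each included
    // file into the token stream in place of the directive and its filename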
let tokens = if include_directives.is_empty() {
input_tokens
} else {
let mut tokens = input_tokens[0..include_directives[0]].to_vec();
include_directives.push(input_tokens.len());
for idx in 1..include_directives.len() {
let token_subseq =
&input_tokens[include_directives[idx - 1] + 1..include_directives[idx]];
if !token_subseq.is_empty()
&& (token_subseq[0].ttype == A2lTokenType::String
|| token_subseq[0].ttype == A2lTokenType::Identifier)
{
let mut filename_start = token_subseq[0].startpos;
let mut filename_end = token_subseq[0].endpos;
if filebytes[filename_start] == b'"' && filebytes[filename_end - 1] == b'"' {
filename_start += 1;
filename_end -= 1;
}
let incname = &filetext[filename_start..filename_end];
let incfilename = make_include_filename(incname, &filenames[0]);
let incpathref = Path::new(&incfilename);
let loadresult = loader::load(incpathref);
if let Ok(incfiledata) = loadresult {
let mut tokresult = tokenize(incname.to_owned(), next_fileid, &incfiledata)?;
next_fileid += tokresult.filenames.len();
tokens.append(&mut tokresult.tokens);
filenames.append(&mut tokresult.filenames);
filedatas.append(&mut tokresult.filedata);
} else {
return Err(TokenizerError::IncludeFileError {
filename,
line: token_subseq[0].line,
incname: incname.to_owned(),
});
}
tokens.extend_from_slice(&token_subseq[1..]);
} else {
let line = input_tokens[include_directives[idx - 1]].line;
return Err(TokenizerError::IncompleteIncludeError { filename, line });
}
}
tokens
};
Ok(TokenResult {
tokens,
filenames,
filedata: filedatas,
})
}
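
/// Tokenize a single block of text, without resolving /include directives.
///
/// /include appears as a regular token in the output; `tokenize` is
/// responsible for splicing in the content of the included files.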
fn tokenize_core(
filename: String,
fileid: usize,
filetext: &str,
) -> Result<Vec<A2lToken>, TokenizerError> {
let filebytes = filetext.as_bytes();
let datalen = filebytes.len();
let mut tokens: Vec<A2lToken> = Vec::with_capacity(datalen / 20);
let mut bytepos = 0;
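    // `separated` tracks whether whitespace or a comment has been seen since the
    // previous token; two adjacent tokens without separation are an error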
let mut separated = true;
let mut line = 1;
while bytepos < datalen {
let startpos = bytepos;
if filebytes[bytepos].is_ascii_whitespace() {
separated = true;
while bytepos < datalen && filebytes[bytepos].is_ascii_whitespace() {
bytepos += 1;
}
line += count_newlines(&filebytes[startpos..bytepos]);
continue;
} else if filebytes[bytepos] == b'/' && bytepos + 1 < datalen {
bytepos += 1;
if filebytes[bytepos] == b'*' {
separated = true;
bytepos = skip_block_comment(filebytes, bytepos + 1).map_err(|()| {
TokenizerError::UnclosedComment {
filename: filename.clone(),
line,
}
})?;
line += count_newlines(&filebytes[startpos..bytepos]);
} else if filebytes[bytepos] == b'/' {
separated = true;
while bytepos < datalen && filebytes[bytepos] != b'\n' {
bytepos += 1;
}
} else if filebytes[bytepos..].starts_with(b"begin") {
separator_check(separated, &filename, line)?;
bytepos += 5;
tokens.push(A2lToken {
ttype: A2lTokenType::Begin,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
} else if filebytes[bytepos..].starts_with(b"end") {
separator_check(separated, &filename, line)?;
bytepos += 3;
tokens.push(A2lToken {
ttype: A2lTokenType::End,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
} else if filebytes[bytepos..].starts_with(b"include") {
separator_check(separated, &filename, line)?;
bytepos += 7;
tokens.push(A2lToken {
ttype: A2lTokenType::Include,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
} else {
let endpos = if startpos + 10 < datalen {
startpos + 10
} else {
datalen
};
return Err(TokenizerError::InvalidA2lToken {
filename,
line,
tokentext: String::from_utf8_lossy(&filebytes[startpos..endpos]).into(),
});
}
} else if filebytes[bytepos] == b'"' {
separator_check(separated, &filename, line)?;
bytepos = find_string_end(filebytes, bytepos + 1).map_err(|()| {
TokenizerError::UnclosedString {
filename: filename.clone(),
line,
}
})?;
line += count_newlines(&filebytes[startpos..bytepos]);
tokens.push(A2lToken {
ttype: A2lTokenType::String,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
} else if !tokens.is_empty()
&& tokens.last().unwrap().ttype == A2lTokenType::Include
&& !(filebytes[bytepos]).is_ascii_digit()
&& is_identchar(filebytes[bytepos])
{
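            // the token following /include may be an unquoted filename, which may
            // contain path separators in addition to the identifier characters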
separator_check(separated, &filename, line)?;
while bytepos < datalen && is_pathchar(filebytes[bytepos]) {
bytepos += 1;
}
tokens.push(A2lToken {
ttype: A2lTokenType::Identifier,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
} else if !(filebytes[bytepos]).is_ascii_digit() && is_identchar(filebytes[bytepos]) {
separator_check(separated, &filename, line)?;
while bytepos < datalen && is_identchar(filebytes[bytepos]) {
bytepos += 1;
}
tokens.push(A2lToken {
ttype: A2lTokenType::Identifier,
startpos,
endpos: bytepos,
fileid,
line,
});
separated = false;
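            // if the identifier that was just pushed is the tag of "/begin A2ML",
            // then the A2ML block content must be captured as a single String token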
let (new_bytepos, new_line) = handle_a2ml(filetext, bytepos, line, fileid, &mut tokens);
if bytepos != new_bytepos {
separated = true;
}
bytepos = new_bytepos;
line = new_line;
} else if filebytes[bytepos] == b'-' || is_numchar(filebytes[bytepos]) {
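            // a token starting with a digit or a sign is a number; if identifier
            // characters follow the digits (e.g. "0ident"), it becomes an identifier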
separator_check(separated, &filename, line)?;
bytepos += 1;
while bytepos < datalen && is_numchar(filebytes[bytepos]) {
bytepos += 1;
}
if bytepos == datalen || !is_identchar(filebytes[bytepos]) {
let number = &filebytes[startpos..bytepos];
if number == b"-" {
return Err(TokenizerError::InvalidNumericalConstant {
filename,
line,
tokentext: "-".to_owned(),
});
} else if number == b"0x" {
return Err(TokenizerError::InvalidNumericalConstant {
filename,
line,
tokentext: "0x".to_owned(),
});
}
tokens.push(A2lToken {
ttype: A2lTokenType::Number,
startpos,
endpos: bytepos,
fileid,
line,
});
} else if bytepos < datalen && is_identchar(filebytes[bytepos]) {
while bytepos < datalen && is_identchar(filebytes[bytepos]) {
bytepos += 1;
}
tokens.push(A2lToken {
ttype: A2lTokenType::Identifier,
startpos,
endpos: bytepos,
fileid,
line,
});
}
separated = false;
} else {
let endpos = if startpos + 10 < datalen {
startpos + 10
} else {
datalen
};
return Err(TokenizerError::InvalidA2lToken {
filename,
line,
tokentext: String::from_utf8_lossy(&filebytes[startpos..endpos]).into(),
});
}
}
Ok(tokens)
}
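
/// Skip over a block comment, starting just after the opening "/*".
/// Returns the position just past the closing "*/", or Err(()) if the comment
/// is not closed before the end of the input.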
fn skip_block_comment(filebytes: &[u8], mut bytepos: usize) -> Result<usize, ()> {
let datalen = filebytes.len();
bytepos += 1;
while bytepos < datalen && !(filebytes[bytepos - 1] == b'*' && filebytes[bytepos] == b'/') {
bytepos += 1;
}
if bytepos >= datalen {
return Err(());
}
bytepos += 1;
Ok(bytepos)
}
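
/// Find the end of a string literal, starting just after the opening quote.
///
/// Both "" and \" count as escaped quotes inside the string. Returns the
/// position just past the closing quote, or Err(()) if the string is unclosed.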
fn find_string_end(filebytes: &[u8], mut bytepos: usize) -> Result<usize, ()> {
let datalen = filebytes.len();
let mut end_found = false;
let mut prev_quote = false;
let mut prev_bkslash = false;
while bytepos < datalen && !end_found {
if filebytes[bytepos] == b'"' {
prev_quote = !(prev_quote || prev_bkslash);
prev_bkslash = false;
} else {
if prev_quote {
end_found = true;
} else if filebytes[bytepos] == b'\\' {
if prev_bkslash {
prev_bkslash = false;
} else {
prev_bkslash = true;
}
} else {
prev_bkslash = false;
}
prev_quote = false;
}
bytepos += 1;
}
if bytepos == datalen && !end_found {
if prev_quote {
bytepos += 1;
} else {
return Err(());
}
}
bytepos -= 1;
Ok(bytepos)
}
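
/// Handle the content of a "/begin A2ML" block.
///
/// A2ML has its own syntax and cannot be tokenized as a2l. If the identifier
/// that was just pushed is the A2ML tag, everything up to the matching "/end"
/// is captured as a single String token instead.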
fn handle_a2ml(
filedata: &str,
mut bytepos: usize,
mut line: u32,
fileid: usize,
tokens: &mut Vec<A2lToken>,
) -> (usize, u32) {
let tokcount = tokens.len();
if tokcount >= 2 && tokens[tokcount - 2].ttype == A2lTokenType::Begin {
let startpos = bytepos;
let filebytes = filedata.as_bytes();
let datalen = filedata.len();
let tag = &filedata[tokens[tokcount - 1].startpos..tokens[tokcount - 1].endpos];
if tag == "A2ML" {
let mut done = false;
while !done && bytepos < datalen {
while bytepos < datalen && filebytes[bytepos] != b'/' {
bytepos += 1;
}
if filebytes[bytepos..].starts_with(b"//") {
bytepos += 2;
while bytepos < datalen && filebytes[bytepos] != b'\n' {
bytepos += 1;
}
} else if filebytes[bytepos..].starts_with(b"/*") {
bytepos += 2;
while bytepos < (datalen - 1)
&& !(filebytes[bytepos] == b'*' && filebytes[bytepos + 1] == b'/')
{
bytepos += 1;
}
bytepos += 2;
if bytepos > datalen {
bytepos = datalen;
}
} else if filebytes[bytepos..].starts_with(b"/end") {
done = true;
} else {
bytepos += 1;
}
}
while filebytes[bytepos - 1].is_ascii_whitespace()
&& filebytes[bytepos - 1] != b'\r'
&& filebytes[bytepos - 1] != b'\n'
{
bytepos -= 1;
}
if filebytes[bytepos - 1] == b'\r' && filebytes[bytepos - 1] == b'\n' {
bytepos -= 2;
} else if filebytes[bytepos - 1] == b'\n' {
bytepos -= 1;
}
}
if bytepos > startpos {
tokens.push(A2lToken {
ttype: A2lTokenType::String,
startpos,
endpos: bytepos,
fileid,
line,
});
line += count_newlines(&filebytes[startpos..bytepos]);
}
}
(bytepos, line)
}
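
/// Verify that the current token was separated from the previous one by
/// whitespace or a comment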
fn separator_check(separated: bool, filename: &str, line: u32) -> Result<(), TokenizerError> {
if !separated {
return Err(TokenizerError::MissingWhitespace {
filename: filename.to_owned(),
line,
});
}
Ok(())
}
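
/// Count the newlines in a byte slice in order to track line numbers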
fn count_newlines(text: &[u8]) -> u32 {
text.iter().map(|c| u32::from(*c == b'\n')).sum()
}
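
/// Check if a byte may appear in an unquoted include path; both path
/// separators '/' and '\' are allowed in addition to the identifier characters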
fn is_pathchar(c: u8) -> bool {
is_identchar(c) || c == b'\\' || c == b'/'
}
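
/// Check if a byte may appear in an identifier. Besides ASCII alphanumeric
/// characters, a2l identifiers may contain '.', '_' and the index brackets [ ]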
fn is_identchar(c: u8) -> bool {
c.is_ascii_alphanumeric() || c == b'.' || c == b'[' || c == b']' || c == b'_'
}
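
/// Check if a byte may appear in a numerical constant: hex digits (which also
/// cover the exponent characters e/E), x/X for the 0x prefix, the decimal
/// point, and signs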
fn is_numchar(c: u8) -> bool {
c.is_ascii_hexdigit() || c == b'x' || c == b'X' || c == b'.' || c == b'+' || c == b'-'
}
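
/// Construct the path of an include file: if the name can be found relative to
/// the directory of the including file, that path is used; otherwise the name
/// is returned unchanged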
fn make_include_filename(incname: &str, base_filename: &str) -> OsString {
let base = std::path::Path::new(base_filename);
if let Some(basedir) = base.parent() {
let joined = basedir.join(incname);
if joined.exists() {
return OsString::from(joined);
}
}
OsString::from(incname)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tokenize_a2l_comment() {
let data = String::from("/**/");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("/*/*/");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("/***********/");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("/***********/ abcdef");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
let data = String::from("//");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("// abcdef");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("// abcdef\nabcde");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
}
#[test]
fn tokenize_a2l_command() {
let data = String::from("/begin");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::Begin);
let data = String::from("/end");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::End);
let data = String::from("/include");
let tokresult = tokenize_core("test".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.len(), 1);
assert_eq!(tokresult[0].ttype, A2lTokenType::Include);
}
#[test]
fn tokenize_a2l_string() {
let data = String::from(r#" "" "#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from(r#" """" "#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from(r#"" ""x"" ""#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from(r#" "\"" "#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from(r#"" \"x\" ""#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from("\"sdf sdf sdf\"");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
let data = String::from("\"\u{1234}\u{2345}\"");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::String);
}
#[test]
fn tokenize_a2l_item() {
let data = String::from("foo_bar");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::Identifier);
}
#[test]
fn tokenize_a2l_number() {
let data = String::from("0xabc1234");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::Number);
let data = String::from("0ident");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 1);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::Identifier);
}
#[test]
fn tokenize_a2l_skip_whitespace() {
let data = String::from("");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from(" ");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
let data = String::from("\n\n ");
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 0);
}
#[test]
fn tokenize_string_with_backslash() {
let data = String::from(r#" ident "\\" 0 "#);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.tokens.len(), 3);
}
#[test]
fn tokenize_skip_a2ml() {
let data = String::from(
r##"
ASAP2_VERSION 1 60
/begin PROJECT Test "test test"
/begin MODULE MODULE_NAME ""
/begin A2ML
struct Foo {
uint;
uint;
}; /* trap: /end A2ML */
/ / //
/end A2ML
/end MODULE
/end PROJECT
"##,
);
let tokresult = tokenize("testcase".to_string(), 0, &data).expect("Error");
println!("token count: {}", tokresult.tokens.len());
assert_eq!(tokresult.tokens.len(), 20);
assert_eq!(tokresult.tokens[0].ttype, A2lTokenType::Identifier);
        assert_eq!(tokresult.tokens[13].ttype, A2lTokenType::String);
    }
#[test]
fn tokenize_include() {
let data = String::from(
r##"
/include ./tests/test.a2l
/include .\tests\test.a2l
/include ".\tests\test.a2l"
/include "./tests/test.a2l"
"##,
);
let tokresult = tokenize_core("test".to_string(), 0, &data).expect("Error");
assert_eq!(tokresult.len(), 8);
println!("{:?}", tokresult);
assert_eq!(tokresult[0].ttype, A2lTokenType::Include);
assert_eq!(tokresult[1].ttype, A2lTokenType::Identifier);
assert_eq!(tokresult[2].ttype, A2lTokenType::Include);
assert_eq!(tokresult[3].ttype, A2lTokenType::Identifier);
assert_eq!(tokresult[4].ttype, A2lTokenType::Include);
assert_eq!(tokresult[5].ttype, A2lTokenType::String);
assert_eq!(tokresult[6].ttype, A2lTokenType::Include);
assert_eq!(tokresult[7].ttype, A2lTokenType::String);
}
}