use {
crate::{
error::{listing, throw, Error, SourceRange},
format::CodeStr,
token::{
Token, Variant, ASYMMETRIC_KEYWORD, AS_KEYWORD, BOOL_KEYWORD, BYTES_KEYWORD,
CHOICE_KEYWORD, DELETED_KEYWORD, F64_KEYWORD, IMPORT_KEYWORD, OPTIONAL_KEYWORD,
S64_KEYWORD, STRING_KEYWORD, STRUCT_KEYWORD, U64_KEYWORD, UNIT_KEYWORD,
},
},
std::path::Path,
unicode_segmentation::GraphemeCursor,
};
// Prefixing a word with this sigil (e.g. `$struct`) forces it to be lexed as an
// identifier rather than a keyword; the sigil itself is stripped from the
// identifier's name (see the identifier arm of `tokenize`).
const RAW_IDENTIFIER_SIGIL: char = '$';
#[allow(clippy::cognitive_complexity)]
#[allow(clippy::too_many_lines)]
pub fn tokenize(schema_path: &Path, schema_contents: &str) -> Result<Vec<Token>, Vec<Error>> {
let mut tokens = vec![];
let mut errors = vec![];
let mut iter = schema_contents.char_indices().peekable();
while let Some((i, c)) = iter.next() {
match c {
':' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::Colon,
});
}
'.' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::Dot,
});
}
'=' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::Equals,
});
}
'{' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::LeftCurly,
});
}
'[' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::LeftSquare,
});
}
'}' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::RightCurly,
});
}
']' => {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: i + 1,
},
variant: Variant::RightSquare,
});
}
_ if c.is_alphabetic() || c == '_' || c == RAW_IDENTIFIER_SIGIL => {
let mut end = schema_contents.len();
while let Some((j, d)) = iter.peek() {
if d.is_alphanumeric() || *d == '_' {
iter.next();
} else {
end = *j;
break;
}
}
if &schema_contents[i..end] == AS_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::As,
});
} else if &schema_contents[i..end] == ASYMMETRIC_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Asymmetric,
});
} else if &schema_contents[i..end] == BOOL_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Bool,
});
} else if &schema_contents[i..end] == BYTES_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Bytes,
});
} else if &schema_contents[i..end] == CHOICE_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Choice,
});
} else if &schema_contents[i..end] == DELETED_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Deleted,
});
} else if &schema_contents[i..end] == F64_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::F64,
});
} else if &schema_contents[i..end] == IMPORT_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Import,
});
} else if &schema_contents[i..end] == OPTIONAL_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Optional,
});
} else if &schema_contents[i..end] == S64_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::S64,
});
} else if &schema_contents[i..end] == STRING_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::String,
});
} else if &schema_contents[i..end] == STRUCT_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Struct,
});
} else if &schema_contents[i..end] == U64_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::U64,
});
} else if &schema_contents[i..end] == UNIT_KEYWORD {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Unit,
});
} else {
let start = if c == RAW_IDENTIFIER_SIGIL { i + 1 } else { i };
if start == end {
errors.push(throw::<Error>(
"Identifiers cannot be empty.",
Some(schema_path),
Some(&listing(schema_contents, SourceRange { start: i, end })),
None,
));
}
if schema_contents[start..end].starts_with('_') {
errors.push(throw::<Error>(
"Identifiers cannot begin with `_`.",
Some(schema_path),
Some(&listing(schema_contents, SourceRange { start: i, end })),
None,
));
}
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Identifier(schema_contents[start..end].into()),
});
}
}
'0'..='9' => {
let mut end = schema_contents.len();
while let Some((j, d)) = iter.peek() {
if ('0'..='9').contains(d) {
iter.next();
} else {
end = *j;
break;
}
}
match schema_contents[i..end].parse::<usize>() {
Ok(integer) => {
tokens.push(Token {
source_range: SourceRange { start: i, end },
variant: Variant::Integer(integer),
});
}
Err(_) => {
errors.push(throw::<Error>(
&format!(
"Integer {} must be less than 2^64.",
&schema_contents[i..end].code_str(),
),
Some(schema_path),
Some(&listing(schema_contents, SourceRange { start: i, end })),
None,
));
}
}
}
'\'' => {
let mut end = i;
for (j, d) in &mut iter {
if d == '\'' {
end = j;
break;
}
}
if end == i {
errors.push(throw::<Error>(
&format!(
"Path starting here must be terminated by a {}.",
"'".code_str(),
),
Some(schema_path),
Some(&listing(
schema_contents,
SourceRange {
start: i,
end: i + 1,
},
)),
None,
));
} else {
tokens.push(Token {
source_range: SourceRange {
start: i,
end: end + 1,
},
variant: Variant::Path(Path::new(&schema_contents[i + 1..end]).to_owned()),
});
}
}
'#' => {
let mut line_start = i + 1;
let mut line_end = schema_contents.len();
let mut lines = vec![];
loop {
for (j, d) in &mut iter {
if d == '\n' {
line_end = j;
break;
}
}
lines.push(schema_contents[line_start..line_end].trim().to_owned());
while let Some((_, d)) = iter.peek() {
if d.is_whitespace() && *d != '\n' {
iter.next();
} else {
break;
}
}
if let Some((j, '#')) = iter.peek() {
line_start = j + 1;
line_end = schema_contents.len();
} else {
break;
}
}
let mut paragraphs = vec![];
let mut paragraph = "".to_owned();
for line in lines {
if line.is_empty() && !paragraph.is_empty() {
paragraphs.push(paragraph.clone());
paragraph.clear();
} else if !line.is_empty() {
if !paragraph.is_empty() {
paragraph.push(' ');
}
paragraph.push_str(&line);
}
}
if !paragraph.is_empty() {
paragraphs.push(paragraph.clone());
}
tokens.push(Token {
source_range: SourceRange {
start: i,
end: line_end,
},
variant: Variant::Comment(paragraphs),
});
}
_ if c.is_whitespace() => continue,
_ => {
let mut cursor = GraphemeCursor::new(i, schema_contents.len(), true);
let end = cursor.next_boundary(schema_contents, 0).unwrap().unwrap();
errors.push(throw::<Error>(
&format!("Unexpected symbol {}.", &schema_contents[i..end].code_str()),
Some(schema_path),
Some(&listing(schema_contents, SourceRange { start: i, end: i })),
None,
));
}
}
}
if !errors.is_empty() {
return Err(errors);
}
Ok(tokens)
}
#[cfg(test)]
mod tests {
    use {
        crate::{
            assert_fails, assert_same,
            error::SourceRange,
            token::{
                Token, Variant, ASYMMETRIC_KEYWORD, AS_KEYWORD, BOOL_KEYWORD, BYTES_KEYWORD,
                CHOICE_KEYWORD, DELETED_KEYWORD, F64_KEYWORD, IMPORT_KEYWORD, OPTIONAL_KEYWORD,
                S64_KEYWORD, STRING_KEYWORD, STRUCT_KEYWORD, U64_KEYWORD, UNIT_KEYWORD,
            },
            tokenizer::{tokenize, RAW_IDENTIFIER_SIGIL},
        },
        std::path::Path,
    };

    // Assert that `source` tokenizes into exactly one token with the given
    // variant, spanning the entire input. This is the shape shared by most of
    // the tests below.
    fn assert_single_token(source: &str, variant: Variant) {
        assert_same!(
            tokenize(Path::new("foo.t"), source).unwrap(),
            vec![Token {
                source_range: SourceRange {
                    start: 0,
                    end: source.len(),
                },
                variant,
            }],
        );
    }

    #[test]
    fn tokenize_example() {
        let source = "
            # This is a struct.
            struct Foo {
            }

            # This is a choice.
            choice Bar {
            }
        ";

        assert_same!(
            tokenize(Path::new("foo.t"), source).unwrap(),
            vec![
                Token {
                    source_range: SourceRange { start: 13, end: 32 },
                    variant: Variant::Comment(vec!["This is a struct.".to_owned()]),
                },
                Token {
                    source_range: SourceRange { start: 45, end: 51 },
                    variant: Variant::Struct,
                },
                Token {
                    source_range: SourceRange { start: 52, end: 55 },
                    variant: Variant::Identifier("Foo".into()),
                },
                Token {
                    source_range: SourceRange { start: 56, end: 57 },
                    variant: Variant::LeftCurly,
                },
                Token {
                    source_range: SourceRange { start: 70, end: 71 },
                    variant: Variant::RightCurly,
                },
                Token {
                    source_range: SourceRange {
                        start: 85,
                        end: 104,
                    },
                    variant: Variant::Comment(vec!["This is a choice.".to_owned()]),
                },
                Token {
                    source_range: SourceRange {
                        start: 117,
                        end: 123,
                    },
                    variant: Variant::Choice,
                },
                Token {
                    source_range: SourceRange {
                        start: 124,
                        end: 127,
                    },
                    variant: Variant::Identifier("Bar".into()),
                },
                Token {
                    source_range: SourceRange {
                        start: 128,
                        end: 129,
                    },
                    variant: Variant::LeftCurly,
                },
                Token {
                    source_range: SourceRange {
                        start: 142,
                        end: 143,
                    },
                    variant: Variant::RightCurly,
                },
            ],
        );
    }

    #[test]
    fn tokenize_empty() {
        assert_same!(tokenize(Path::new("foo.t"), "").unwrap(), vec![]);
    }

    #[test]
    fn tokenize_whitespace() {
        assert_same!(tokenize(Path::new("foo.t"), " \t\n").unwrap(), vec![]);
    }

    #[test]
    fn tokenize_as() {
        assert_single_token(AS_KEYWORD, Variant::As);
    }

    #[test]
    fn tokenize_asymmetric() {
        assert_single_token(ASYMMETRIC_KEYWORD, Variant::Asymmetric);
    }

    #[test]
    fn tokenize_bool() {
        assert_single_token(BOOL_KEYWORD, Variant::Bool);
    }

    #[test]
    fn tokenize_bytes() {
        assert_single_token(BYTES_KEYWORD, Variant::Bytes);
    }

    #[test]
    fn tokenize_choice() {
        assert_single_token(CHOICE_KEYWORD, Variant::Choice);
    }

    #[test]
    fn tokenize_colon() {
        assert_single_token(":", Variant::Colon);
    }

    #[test]
    fn tokenize_comment_simple() {
        assert_single_token(
            "# Hello, World!",
            Variant::Comment(vec!["Hello, World!".to_owned()]),
        );
    }

    #[test]
    fn tokenize_comment_complex() {
        assert_same!(
            tokenize(
                Path::new("foo.t"),
                " # \n # Hello, \n # World! \n # \n # Hello, \n # Earth! \n # ",
            )
            .unwrap(),
            vec![Token {
                source_range: SourceRange { start: 1, end: 55 },
                variant: Variant::Comment(vec![
                    "Hello, World!".to_owned(),
                    "Hello, Earth!".to_owned(),
                ]),
            }],
        );
    }

    #[test]
    fn tokenize_deleted() {
        assert_single_token(DELETED_KEYWORD, Variant::Deleted);
    }

    #[test]
    fn tokenize_dot() {
        assert_single_token(".", Variant::Dot);
    }

    #[test]
    fn tokenize_equals() {
        assert_single_token("=", Variant::Equals);
    }

    #[test]
    fn tokenize_f64() {
        assert_single_token(F64_KEYWORD, Variant::F64);
    }

    #[test]
    fn tokenize_bare_identifier() {
        assert_single_token(
            "\u{5e78}\u{798f}",
            Variant::Identifier("\u{5e78}\u{798f}".into()),
        );
    }

    #[test]
    fn tokenize_raw_identifier() {
        // The sigil makes a keyword usable as an identifier; the sigil itself
        // is not part of the identifier's name.
        assert_single_token(
            &format!("{}{}", RAW_IDENTIFIER_SIGIL, STRUCT_KEYWORD),
            Variant::Identifier(STRUCT_KEYWORD.into()),
        );
    }

    #[test]
    fn tokenize_bare_identifier_underscore_prefix() {
        assert_fails!(
            tokenize(Path::new("foo.t"), "_foo"),
            "Identifiers cannot begin with `_`.",
        );
    }

    #[test]
    fn tokenize_raw_identifier_underscore_prefix() {
        assert_fails!(
            tokenize(
                Path::new("foo.t"),
                &format!("{}{}", RAW_IDENTIFIER_SIGIL, "_foo"),
            ),
            "Identifiers cannot begin with `_`.",
        );
    }

    #[test]
    fn tokenize_import() {
        assert_single_token(IMPORT_KEYWORD, Variant::Import);
    }

    #[test]
    fn tokenize_integer_literal_valid() {
        assert_single_token("42", Variant::Integer(42));
    }

    #[test]
    fn tokenize_integer_literal_out_of_range() {
        assert_fails!(
            tokenize(Path::new("foo.t"), "18446744073709551616"),
            "Integer `18446744073709551616` must be less than 2^64.",
        );
    }

    #[test]
    fn tokenize_left_curly() {
        assert_single_token("{", Variant::LeftCurly);
    }

    #[test]
    fn tokenize_left_square() {
        assert_single_token("[", Variant::LeftSquare);
    }

    #[test]
    fn tokenize_optional() {
        assert_single_token(OPTIONAL_KEYWORD, Variant::Optional);
    }

    #[test]
    fn tokenize_path_non_empty() {
        assert_single_token("'bar.t'", Variant::Path(Path::new("bar.t").to_owned()));
    }

    #[test]
    fn tokenize_path_empty() {
        assert_single_token("''", Variant::Path(Path::new("").to_owned()));
    }

    #[test]
    fn tokenize_path_non_terminated() {
        assert_fails!(
            tokenize(Path::new("foo.t"), "'bar.t"),
            "Path starting here must be terminated by a `\'`.",
        );
    }

    #[test]
    fn tokenize_right_curly() {
        assert_single_token("}", Variant::RightCurly);
    }

    #[test]
    fn tokenize_right_square() {
        assert_single_token("]", Variant::RightSquare);
    }

    #[test]
    fn tokenize_s64() {
        assert_single_token(S64_KEYWORD, Variant::S64);
    }

    #[test]
    fn tokenize_string() {
        assert_single_token(STRING_KEYWORD, Variant::String);
    }

    #[test]
    fn tokenize_struct() {
        assert_single_token(STRUCT_KEYWORD, Variant::Struct);
    }

    #[test]
    fn tokenize_u64() {
        assert_single_token(U64_KEYWORD, Variant::U64);
    }

    #[test]
    fn tokenize_unit() {
        assert_single_token(UNIT_KEYWORD, Variant::Unit);
    }

    #[test]
    fn tokenize_unexpected_symbol() {
        // The reported symbol is the full grapheme cluster (emoji plus
        // variation selector), not just the first code point.
        assert_fails!(
            tokenize(Path::new("foo.t"), "\u{1f610}\u{fe0f}"),
            "Unexpected symbol `\u{1f610}\u{fe0f}`.",
        );
    }
}