use crate::common::keywords::key_symbols_sorted_by_len_desc;
use crate::parse::TokenBlock;
use crate::prelude::*;
use std::collections::HashMap;
use std::rc::Rc;
pub struct Tokenizer {
pub macros_in_block_scopes: Vec<HashMap<String, Vec<String>>>,
}
impl Default for Tokenizer {
fn default() -> Self {
Self::new()
}
}
impl Tokenizer {
pub fn new() -> Self {
Self {
macros_in_block_scopes: vec![],
}
}
pub fn parse_blocks(
&mut self,
s: &str,
current_file_path: Rc<str>,
) -> Result<Vec<TokenBlock>, RuntimeError> {
let stripped_source_code = self.strip_triple_quote_comment_blocks(s);
let lines: Vec<_> = stripped_source_code.lines().collect();
let mut i = 0;
self.parse_level(&lines, &mut i, 0, current_file_path)
}
pub fn tokenize_line(
&self,
line: &str,
line_file: LineFile,
) -> Result<Vec<String>, RuntimeError> {
let raw_tokens = self.raw_tokenize_line(line);
self.expand_macro_tokens(raw_tokens, line_file)
}
fn raw_tokenize_line(&self, line: &str) -> Vec<String> {
let line = line.trim_end();
let symbols = key_symbols_sorted_by_len_desc();
let mut tokens = Vec::with_capacity(line.len());
let mut i = 0;
let bytes = line.as_bytes();
while i < bytes.len() {
if !line.is_char_boundary(i) {
let mut char_start = i;
while char_start > 0 && !line.is_char_boundary(char_start) {
char_start -= 1;
}
i = char_start;
continue;
}
if bytes[i] == b'#' {
break;
}
let ws_ch = line[i..].chars().next().unwrap_or('\0');
if ws_ch.is_whitespace() {
i += ws_ch.len_utf8();
continue;
}
let mut matched = false;
for &sym in &symbols {
let sym_length_bytes = sym.len();
if i + sym_length_bytes <= line.len()
&& line.is_char_boundary(i)
&& line.is_char_boundary(i + sym_length_bytes)
&& &line[i..i + sym_length_bytes] == sym
{
tokens.push(sym.to_string());
i += sym_length_bytes;
matched = true;
break;
}
}
if matched {
continue;
}
if bytes[i].is_ascii_alphabetic() || bytes[i] == b'_' {
let start = i;
i += 1;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
tokens.push(line[start..i].to_string());
continue;
}
if bytes[i] == b'@' {
let start = i;
i += 1;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'_') {
i += 1;
}
tokens.push(line[start..i].to_string());
continue;
}
if bytes[i].is_ascii_digit() {
let start = i;
i += 1;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i + 1 < bytes.len() && bytes[i] == b'.' && bytes[i + 1].is_ascii_digit() {
i += 1;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
}
tokens.push(line[start..i].to_string());
continue;
}
let ch = line[i..].chars().next().unwrap_or('\0');
tokens.push(ch.to_string());
i += ch.len_utf8();
}
tokens
}
fn expand_macro_tokens(
&self,
raw_tokens: Vec<String>,
line_file: LineFile,
) -> Result<Vec<String>, RuntimeError> {
let mut expanded = Vec::with_capacity(raw_tokens.len());
for token in raw_tokens {
if Self::is_macro_use_token(&token) {
let macro_name = &token[1..];
let macro_tokens = self.find_macro(macro_name).ok_or_else(|| {
Self::parse_error(format!("Unknown macro: {}", token), line_file.clone())
})?;
expanded.extend(macro_tokens.iter().cloned());
} else {
expanded.push(token);
}
}
Ok(expanded)
}
fn strip_triple_quote_comment_blocks(&self, source_code: &str) -> String {
let mut in_comment = false;
let line_count_upper_bound = source_code.lines().count();
let mut out_lines: Vec<String> = Vec::with_capacity(line_count_upper_bound);
for line in source_code.lines() {
let trimmed = line.trim();
let only_quote_chars = !trimmed.is_empty() && trimmed.chars().all(|c| c == '"');
if only_quote_chars {
in_comment = !in_comment;
out_lines.push(String::new());
continue;
}
if in_comment {
out_lines.push(String::new());
} else {
out_lines.push(line.to_string());
}
}
out_lines.join("\n")
}
fn parse_level(
&mut self,
lines: &[&str],
i: &mut usize,
base_indent: usize,
current_file_path: Rc<str>,
) -> Result<Vec<TokenBlock>, RuntimeError> {
self.macros_in_block_scopes.push(HashMap::new());
let result = self.parse_level_in_current_scope(lines, i, base_indent, current_file_path);
self.macros_in_block_scopes.pop();
result
}
fn parse_level_in_current_scope(
&mut self,
lines: &[&str],
i: &mut usize,
base_indent: usize,
current_file_path: Rc<str>,
) -> Result<Vec<TokenBlock>, RuntimeError> {
let remaining_line_count_upper_bound = lines.len().saturating_sub(*i);
let mut items = Vec::with_capacity(remaining_line_count_upper_bound);
let mut body_indent = None;
while *i < lines.len() {
let raw = lines[*i];
let line_no = *i + 1;
let line_file = (line_no, current_file_path.clone());
let indent = Self::indent_level(raw);
let content = raw.trim();
if content.is_empty() {
*i += 1;
continue;
}
if indent < base_indent {
break;
}
if indent > base_indent {
let trimmed_start = raw.trim_start();
if trimmed_start.is_empty() || trimmed_start.starts_with('#') {
*i += 1;
continue;
}
return Err(RuntimeError::from(ParseRuntimeError(
RuntimeErrorStruct::new_with_msg_and_line_file(
format!(
"unexpected indent at line {} in {}",
line_file.0,
line_file.1.as_ref()
),
line_file,
),
)));
}
*i += 1;
if self.register_macro_definition(content, line_file.clone())? {
continue;
}
let raw_header_tokens = self.raw_tokenize_line(content);
let header_tokens = self.expand_macro_tokens(raw_header_tokens, line_file.clone())?;
if header_tokens.is_empty() {
continue;
}
if Self::ends_with_colon(content) {
if *i >= lines.len() {
let line_file = (line_no, current_file_path.clone());
return Err(RuntimeError::from(ParseRuntimeError(
RuntimeErrorStruct::new_with_msg_and_line_file(
format!(
"block header missing body at line {} in {}",
line_file.0,
line_file.1.as_ref()
),
line_file,
),
)));
}
let next_indent = Self::indent_level(lines[*i]);
if next_indent <= indent {
let line_file = (*i + 1, current_file_path.clone());
return Err(RuntimeError::from(ParseRuntimeError(
RuntimeErrorStruct::new_with_msg_and_line_file(
format!(
"expected indent at line {} in {}",
line_file.0,
line_file.1.as_ref()
),
line_file,
),
)));
}
let body = self.parse_level(lines, i, next_indent, current_file_path.clone())?;
items.push(TokenBlock::new(
header_tokens,
body,
(line_no, current_file_path.clone()),
));
} else {
items.push(TokenBlock::new(
header_tokens,
vec![],
(line_no, current_file_path.clone()),
));
}
if let Some(expected) = body_indent {
if indent != expected {
let line_file = (line_no, current_file_path.clone());
return Err(RuntimeError::from(ParseRuntimeError(
RuntimeErrorStruct::new_with_msg_and_line_file(
format!(
"inconsistent indent at line {} in {}",
line_file.0,
line_file.1.as_ref()
),
line_file,
),
)));
}
} else {
body_indent = Some(indent);
}
}
Ok(items)
}
fn indent_level(line: &str) -> usize {
let mut n = 0;
for c in line.chars() {
match c {
' ' => n += 1,
'\t' => n += 4,
_ => break,
}
}
n
}
fn ends_with_colon(s: &str) -> bool {
let trimmed = s.trim_end();
trimmed.ends_with(COLON)
}
fn register_macro_definition(
&mut self,
line: &str,
line_file: LineFile,
) -> Result<bool, RuntimeError> {
let Some(mut rest) = line.strip_prefix("macro") else {
return Ok(false);
};
if !rest
.chars()
.next()
.map(|ch| ch.is_whitespace())
.unwrap_or(false)
{
return Ok(false);
}
if Self::ends_with_colon(line) {
return Err(Self::parse_error(
"macro definition cannot have a block body".to_string(),
line_file,
));
}
rest = rest.trim_start();
let name_end = rest
.char_indices()
.find(|(_, ch)| ch.is_whitespace())
.map(|(index, _)| index)
.unwrap_or(rest.len());
let name = &rest[..name_end];
if !Self::is_macro_name(name) {
return Err(Self::parse_error(
format!("Invalid macro name: {}", name),
line_file,
));
}
rest = rest[name_end..].trim_start();
let Some(replacement_and_tail) = rest.strip_prefix('"') else {
return Err(Self::parse_error(
"macro definition must be: macro name \"replacement\"".to_string(),
line_file,
));
};
let mut replacement_end = None;
let mut escaped = false;
for (index, ch) in replacement_and_tail.char_indices() {
if escaped {
escaped = false;
continue;
}
if ch == '\\' {
escaped = true;
continue;
}
if ch == '"' {
replacement_end = Some(index);
break;
}
}
let Some(replacement_end) = replacement_end else {
return Err(Self::parse_error(
"macro replacement must end with a double quote".to_string(),
line_file,
));
};
let replacement_source = &replacement_and_tail[..replacement_end];
let tail = replacement_and_tail[replacement_end + 1..].trim_start();
if !tail.is_empty() && !tail.starts_with('#') {
return Err(Self::parse_error(
"macro definition must be: macro name \"replacement\"".to_string(),
line_file,
));
}
let replacement_tokens = self.raw_tokenize_line(replacement_source);
if replacement_tokens
.iter()
.any(|token| Self::is_macro_use_token(token) || token == "macro")
{
return Err(Self::parse_error(
"macro replacement cannot contain macro use or macro definition".to_string(),
line_file,
));
}
let current_scope = self.macros_in_block_scopes.last_mut().ok_or_else(|| {
Self::parse_error(
"Internal tokenizer macro scope missing".to_string(),
line_file,
)
})?;
current_scope.insert(name.to_string(), replacement_tokens);
Ok(true)
}
fn find_macro(&self, name: &str) -> Option<&Vec<String>> {
self.macros_in_block_scopes
.iter()
.rev()
.find_map(|scope| scope.get(name))
}
fn is_macro_use_token(token: &str) -> bool {
let Some(name) = token.strip_prefix('@') else {
return false;
};
Self::is_macro_name(name)
}
fn is_macro_name(name: &str) -> bool {
let mut chars = name.chars();
match chars.next() {
Some(ch) if ch.is_ascii_alphabetic() || ch == '_' => {}
_ => return false,
}
chars.all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
}
fn parse_error(message: String, line_file: LineFile) -> RuntimeError {
RuntimeError::from(ParseRuntimeError(
RuntimeErrorStruct::new_with_msg_and_line_file(message, line_file),
))
}
}
#[cfg(test)]
mod tests {
use super::Tokenizer;
use crate::prelude::*;
use std::rc::Rc;
fn test_line_file() -> LineFile {
(1, Rc::from("test.lit"))
}
#[test]
fn exist_bang_splits_into_exist_and_bang() {
let tokenizer = Tokenizer::new();
assert_eq!(
tokenizer
.tokenize_line("exist! a R st {}", test_line_file())
.unwrap(),
vec!["exist", "!", "a", "R", "st", "{", "}"]
);
}
#[test]
fn forall_bang_splits_like_exist_bang() {
let tokenizer = Tokenizer::new();
assert_eq!(
tokenizer
.tokenize_line("forall! x R: x > 0", test_line_file())
.unwrap(),
vec!["forall", "!", "x", "R", ":", "x", ">", "0"]
);
}
#[test]
fn exist_bang_with_whitespace() {
let tokenizer = Tokenizer::new();
assert_eq!(
tokenizer
.tokenize_line("exist ! a R st {}", test_line_file())
.unwrap(),
vec!["exist", "!", "a", "R", "st", "{", "}"]
);
}
#[test]
fn macro_definition_expands_later_line_in_same_block() {
let mut tokenizer = Tokenizer::new();
let blocks = tokenizer
.parse_blocks("macro eq \"a = b\"\nhave @eq", Rc::from("test.lit"))
.unwrap();
assert_eq!(blocks.len(), 1);
assert_eq!(blocks[0].header, vec!["have", "a", "=", "b"]);
}
#[test]
fn macro_scope_is_visible_in_child_block() {
let mut tokenizer = Tokenizer::new();
let blocks = tokenizer
.parse_blocks(
"macro eq \"a = b\"\nprove claim:\n have @eq",
Rc::from("test.lit"),
)
.unwrap();
assert_eq!(blocks[0].body[0].header, vec!["have", "a", "=", "b"]);
}
#[test]
fn macro_scope_expires_after_block() {
let mut tokenizer = Tokenizer::new();
let error = tokenizer
.parse_blocks(
"prove claim:\n macro eq \"a = b\"\n have @eq\nhave @eq",
Rc::from("test.lit"),
)
.unwrap_err();
assert!(format!("{:?}", error).contains("Unknown macro: @eq"));
}
#[test]
fn inner_macro_overrides_outer_macro_only_inside_child_block() {
let mut tokenizer = Tokenizer::new();
let blocks = tokenizer
.parse_blocks(
"macro x \"outer\"\nprove claim:\n macro x \"inner\"\n have @x\nhave @x",
Rc::from("test.lit"),
)
.unwrap();
assert_eq!(blocks[0].body[0].header, vec!["have", "inner"]);
assert_eq!(blocks[1].header, vec!["have", "outer"]);
}
#[test]
fn macro_replacement_cannot_use_macro() {
let mut tokenizer = Tokenizer::new();
let error = tokenizer
.parse_blocks("macro x \"@other\"", Rc::from("test.lit"))
.unwrap_err();
assert!(format!("{:?}", error).contains("macro replacement cannot contain"));
}
}