use crate::defs::*;
use crate::errors::*;
use normalized_line_endings::{Annotated, AnnotatedChar, LineEnding, LF};
/// Tokenizes `input` with a freshly constructed [`Tokenizer`].
///
/// # Errors
///
/// Propagates whatever error [`Tokenizer::tokenize`] reports.
pub fn tokenize(input: &str) -> Result<Vec<Token>> {
    let tokenizer = Tokenizer::new(input);
    tokenizer.tokenize()
}
/// A token produced by the [`Tokenizer`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A node name together with the delimiter character recorded by the tokenizer.
    NodeName(String, char),
    /// Text accumulated after a node name, emitted when the next node name
    /// starts or when the input ends.
    NodeContent(String),
    /// Indentation emitted before a node name: the length of the indentation
    /// (as reported by `String::len`) and the indentation character (`WS` or `TAB`).
    /// `(0, NULL)` is emitted when the node name starts at the beginning of a line.
    Indentation(usize, char),
}
/// States of the tokenizer's state machine.
#[derive(Copy, Clone)]
enum TokenizerState {
    /// Nothing has been consumed yet; the first character is taken as the delimiter.
    Start,
    /// Positioned at the first character after a line ending.
    NewLine,
    /// Inside a run of leading `WS`/`TAB` characters on a line.
    Indentation,
    /// Collecting the characters of a node name.
    NodeName,
    /// Collecting node content up to the next line ending.
    NodeContent,
}
/// State-machine tokenizer over a borrowed input string.
pub struct Tokenizer<'a> {
    /// Row of the current character, as reported by the annotated character iterator.
    row: usize,
    /// Column of the current character, as reported by the annotated character iterator.
    column: usize,
    /// Current state of the state machine.
    state: TokenizerState,
    /// The text being tokenized.
    input: &'a str,
    /// Character currently being processed; `LF` stands in for any line ending
    /// and `NULL` marks the end of the input.
    current_char: char,
    /// The line ending reported by the iterator when `current_char` is `LF`, otherwise `None`.
    line_ending: Option<LineEnding>,
    /// Leading `WS`/`TAB` characters collected on the current line.
    indentation: String,
    /// First character of the input; a line starting with it (optionally indented)
    /// begins a node name.
    delimiter: char,
    /// Node name collected so far.
    node_name: String,
    /// Node content collected so far.
    node_content: String,
    /// Tokens produced so far.
    tokens: Vec<Token>,
}
impl<'a> Tokenizer<'a> {
pub fn new(input: &'a str) -> Self {
Self {
row: 1,
column: 0,
state: TokenizerState::Start,
input,
current_char: NULL,
line_ending: None,
indentation: "".to_string(),
delimiter: NULL,
node_name: "".to_string(),
node_content: "".to_string(),
tokens: vec![],
}
}
pub fn tokenize(mut self) -> Result<Vec<Token>> {
let mut chars = self.input.chars().annotated();
loop {
(self.current_char, self.line_ending) = if let Some(annotated_char) = chars.next() {
match annotated_char {
AnnotatedChar::Character(ch, row, column) => {
self.row = row;
self.column = column;
(ch, None)
}
AnnotatedChar::LineEnding(line_ending, row, column) => {
self.row = row;
self.column = column;
(LF, Some(line_ending))
}
}
} else {
(NULL, None)
};
match self.state {
TokenizerState::Start => {
match self.current_char {
NULL => return Err(err_empty_input()),
ch if self.is_allowed_char(ch) => {
self.delimiter = ch;
self.tokens.push(Token::Indentation(0, NULL));
self.state = TokenizerState::NodeName;
}
other => {
let ch = if other == LF { self.line_ending.unwrap_or(LineEnding::Lf).first() } else { other };
return Err(err_unexpected_character(ch, self.row, self.column));
}
}
}
TokenizerState::NewLine => {
match self.current_char {
NULL => {
self.consume_node_content();
break;
}
ch if self.is_delimiter(ch) => {
self.consume_node_content();
self.tokens.push(Token::Indentation(0, NULL));
self.state = TokenizerState::NodeName;
}
WS => {
self.indentation.push(WS);
self.state = TokenizerState::Indentation;
}
TAB => {
self.indentation.push(TAB);
self.state = TokenizerState::Indentation;
}
LF => {
self.next_row();
self.node_content.push_str(self.line_ending.unwrap_or(LineEnding::Lf).as_ref());
}
other => {
self.node_content.push(other);
self.state = TokenizerState::NodeContent;
}
}
}
TokenizerState::NodeName => {
match self.current_char {
NULL => {
return Err(err_unexpected_end());
}
WS => {
self.consume_node_name();
self.node_content.push(WS);
self.state = TokenizerState::NodeContent;
}
TAB => {
self.consume_node_name();
self.node_content.push(TAB);
self.state = TokenizerState::NodeContent;
}
LF => {
self.next_row();
self.consume_node_name();
self.node_content.push_str(self.line_ending.unwrap_or(LineEnding::Lf).as_ref());
self.state = TokenizerState::NewLine;
}
ch if self.is_allowed_char(ch) => {
self.node_name.push(self.current_char);
}
other => {
return Err(err_unexpected_character(other, self.row, self.column));
}
}
}
TokenizerState::Indentation => {
match self.current_char {
NULL => return Err(err_unexpected_end()),
ch if self.is_delimiter(ch) => {
self.consume_node_content();
self.consume_indentation()?;
self.state = TokenizerState::NodeName
}
WS => self.indentation.push(WS),
TAB => self.indentation.push(TAB),
ch => {
self.node_content.push_str(&self.indentation);
self.node_content.push(ch);
self.indentation.clear();
self.state = TokenizerState::NodeContent
}
}
}
TokenizerState::NodeContent => {
match self.current_char {
NULL => return Err(err_unexpected_end()),
LF => {
self.next_row();
self.node_content.push_str(self.line_ending.unwrap_or(LineEnding::Lf).as_ref());
self.state = TokenizerState::NewLine
}
other => self.node_content.push(other),
}
}
}
}
Ok(self.tokens.clone())
}
fn consume_indentation(&mut self) -> Result<()> {
if self.indentation.chars().all(|ch| ch == WS) {
self.tokens.push(Token::Indentation(self.indentation.len(), WS));
self.indentation.clear();
Ok(())
} else if self.indentation.chars().all(|ch| ch == TAB) {
self.tokens.push(Token::Indentation(self.indentation.len(), TAB));
self.indentation.clear();
Ok(())
} else {
Err(err_inconsistent_indentation())
}
}
fn consume_node_name(&mut self) {
self.tokens.push(Token::NodeName(self.node_name.clone(), self.delimiter));
self.node_name.clear();
}
fn consume_node_content(&mut self) {
self.tokens.push(Token::NodeContent(self.node_content.clone()));
self.node_content.clear();
}
fn is_allowed_char(&self, ch: char) -> bool {
matches!(ch, '\u{0021}'..='\u{10FFFF}')
}
fn is_delimiter(&self, ch: char) -> bool {
ch == self.delimiter
}
fn next_row(&mut self) {
self.row += 1;
self.column = 0;
}
}