use logos::Logos;
use std::fmt;
/// Lexical tokens for the indentation-aware lexer.
///
/// Most variants are fixed single punctuation characters spanning several
/// scripts (Latin, CJK, Arabic, Indic, Ethiopic, Armenian, Tibetan, Thai,
/// Myanmar). `Indent` and `Dedent` carry no lexer rule and are presumably
/// synthesized by a post-lex indentation pass — TODO confirm against the
/// `lexing` module.
#[derive(Logos, Debug, PartialEq, Eq, Hash, Clone, serde::Serialize, serde::Deserialize)]
pub enum Token {
/// The `::` lex marker.
#[token("::")]
LexMarker,
/// One indentation unit: exactly four spaces or a single tab.
/// Priority 3 ensures a 4-space run lexes as `Indentation`, not as the
/// priority-1 `Whitespace` rule below.
#[regex(r" {4}|\t", priority = 3)] Indentation,
/// Synthesized indent level; payload is the spanned tokens of the level.
/// NOTE(review): no `#[token]`/`#[regex]` attribute — never produced by
/// the Logos lexer itself; confirm where it is emitted.
Indent(Vec<(Token, std::ops::Range<usize>)>),
/// Synthesized dedent level; see `Indent`.
Dedent(Vec<(Token, std::ops::Range<usize>)>),
/// A newline; the callback stores the matched `"\n"` text.
#[regex(r"\n", |lex| Some(lex.slice().to_owned()))]
BlankLine(Option<String>),
/// A run of one to three spaces; payload is the number of spaces.
#[regex(r" {1,3}", |lex| Some(lex.slice().len()), priority = 1)]
Whitespace(usize),
#[token("-")]
Dash,
#[token(".")]
Period,
#[token("(")]
OpenParen,
#[token(")")]
CloseParen,
#[token(":")]
Colon,
#[token("!")]
ExclamationMark,
#[token("?")]
QuestionMark,
#[token(";")]
Semicolon,
#[token("¡")]
InvertedExclamationMark,
#[token("¿")]
InvertedQuestionMark,
#[token("…")]
Ellipsis,
/// CJK sentence terminators.
#[token("。")]
IdeographicFullStop,
#[token("!")]
FullwidthExclamationMark,
#[token("?")]
FullwidthQuestionMark,
#[token("⁉")]
ExclamationQuestionMark,
#[token("⁈")]
QuestionExclamationMark,
/// Arabic-script punctuation.
#[token("؟")]
ArabicQuestionMark,
#[token("۔")]
ArabicFullStop,
/// NOTE(review): "؍" is U+060D ARABIC DATE SEPARATOR; the triple-dot
/// punctuation mark is U+061E "؞" — confirm the intended character.
#[token("؍")]
ArabicTripleDot,
#[token("،")]
ArabicComma,
/// Indic sentence terminators.
#[token("।")]
Danda,
#[token("॥")]
DoubleDanda,
#[token("৷")]
BengaliCurrencyNumeratorFour,
#[token("።")]
EthiopianFullStop,
#[token("։")]
ArmenianFullStop,
#[token("།")]
TibetanShad,
#[token("๏")]
ThaiFongman,
#[token("၊")]
MyanmarComma,
#[token("။")]
MyanmarFullStop,
#[token(",")]
Comma,
#[token("\"")]
Quote,
#[token("=")]
Equals,
/// A digit run, kept in string form (no numeric parsing, so no overflow).
/// Priority 2 beats the `Text` rule for pure-digit runs.
#[regex(r"[0-9]+", |lex| lex.slice().to_owned(), priority = 2)]
Number(String),
/// Any run of characters not claimed by the rules above: the class
/// excludes whitespace, digits, and every punctuation character that has
/// its own token. (`\n` and `\t` are redundant inside the class — `\s`
/// already covers them.)
#[regex(r#"[^\s\n\t\-\.\(\):0-9,="!?;¡¿…。!?⁉⁈؟۔؍،।॥৷።։།๏၊။]+"#, |lex| lex.slice().to_owned())]
Text(String),
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let name = match self {
Token::LexMarker => "lex-marker",
Token::Indentation => "indentation",
Token::Indent(_) => "indent",
Token::Dedent(_) => "dedent",
Token::BlankLine(_) => "blank-line",
Token::Whitespace(_) => "whitespace",
Token::Dash => "dash",
Token::Period => "period",
Token::OpenParen => "open-paren",
Token::CloseParen => "close-paren",
Token::Colon => "colon",
Token::ExclamationMark => "exclamation-mark",
Token::QuestionMark => "question-mark",
Token::Semicolon => "semicolon",
Token::InvertedExclamationMark => "inverted-exclamation-mark",
Token::InvertedQuestionMark => "inverted-question-mark",
Token::Ellipsis => "ellipsis",
Token::IdeographicFullStop => "ideographic-full-stop",
Token::FullwidthExclamationMark => "fullwidth-exclamation-mark",
Token::FullwidthQuestionMark => "fullwidth-question-mark",
Token::ExclamationQuestionMark => "exclamation-question-mark",
Token::QuestionExclamationMark => "question-exclamation-mark",
Token::ArabicQuestionMark => "arabic-question-mark",
Token::ArabicFullStop => "arabic-full-stop",
Token::ArabicTripleDot => "arabic-triple-dot",
Token::ArabicComma => "arabic-comma",
Token::Danda => "danda",
Token::DoubleDanda => "double-danda",
Token::BengaliCurrencyNumeratorFour => "bengali-currency-numerator-four",
Token::EthiopianFullStop => "ethiopian-full-stop",
Token::ArmenianFullStop => "armenian-full-stop",
Token::TibetanShad => "tibetan-shad",
Token::ThaiFongman => "thai-fongman",
Token::MyanmarComma => "myanmar-comma",
Token::MyanmarFullStop => "myanmar-full-stop",
Token::Comma => "comma",
Token::Quote => "quote",
Token::Equals => "equals",
Token::Number(s) => return write!(f, "<number:{s}>"),
Token::Text(s) => return write!(f, "<text:{s}>"),
};
write!(f, "<{name}>")
}
}
impl Token {
pub fn simple_name(&self) -> &'static str {
match self {
Token::LexMarker => "LEX_MARKER",
Token::Indentation => "INDENTATION",
Token::Indent(_) => "INDENT",
Token::Dedent(_) => "DEDENT",
Token::BlankLine(_) => "BLANK_LINE",
Token::Whitespace(_) => "WHITESPACE",
Token::Dash => "DASH",
Token::Period => "PERIOD",
Token::OpenParen => "OPEN_PAREN",
Token::CloseParen => "CLOSE_PAREN",
Token::Colon => "COLON",
Token::ExclamationMark => "EXCLAMATION_MARK",
Token::QuestionMark => "QUESTION_MARK",
Token::Semicolon => "SEMICOLON",
Token::InvertedExclamationMark => "INVERTED_EXCLAMATION_MARK",
Token::InvertedQuestionMark => "INVERTED_QUESTION_MARK",
Token::Ellipsis => "ELLIPSIS",
Token::IdeographicFullStop => "IDEOGRAPHIC_FULL_STOP",
Token::FullwidthExclamationMark => "FULLWIDTH_EXCLAMATION_MARK",
Token::FullwidthQuestionMark => "FULLWIDTH_QUESTION_MARK",
Token::ExclamationQuestionMark => "EXCLAMATION_QUESTION_MARK",
Token::QuestionExclamationMark => "QUESTION_EXCLAMATION_MARK",
Token::ArabicQuestionMark => "ARABIC_QUESTION_MARK",
Token::ArabicFullStop => "ARABIC_FULL_STOP",
Token::ArabicTripleDot => "ARABIC_TRIPLE_DOT",
Token::ArabicComma => "ARABIC_COMMA",
Token::Danda => "DANDA",
Token::DoubleDanda => "DOUBLE_DANDA",
Token::BengaliCurrencyNumeratorFour => "BENGALI_CURRENCY_NUMERATOR_FOUR",
Token::EthiopianFullStop => "ETHIOPIAN_FULL_STOP",
Token::ArmenianFullStop => "ARMENIAN_FULL_STOP",
Token::TibetanShad => "TIBETAN_SHAD",
Token::ThaiFongman => "THAI_FONGMAN",
Token::MyanmarComma => "MYANMAR_COMMA",
Token::MyanmarFullStop => "MYANMAR_FULL_STOP",
Token::Comma => "COMMA",
Token::Quote => "QUOTE",
Token::Equals => "EQUALS",
Token::Number(_) => "NUMBER",
Token::Text(_) => "TEXT",
}
}
pub fn is_indent(&self) -> bool {
matches!(self, Token::Indentation)
}
pub fn is_indent_level(&self) -> bool {
matches!(self, Token::Indent(_))
}
pub fn is_dedent_level(&self) -> bool {
matches!(self, Token::Dedent(_))
}
pub fn is_whitespace(&self) -> bool {
matches!(
self,
Token::Indentation
| Token::Indent(_)
| Token::Dedent(_)
| Token::BlankLine(_)
| Token::Whitespace(_)
)
}
pub fn is_sequence_marker(&self) -> bool {
matches!(
self,
Token::Dash | Token::Period | Token::OpenParen | Token::CloseParen
)
}
pub fn is_number(&self) -> bool {
matches!(self, Token::Number(_))
}
pub fn is_text(&self) -> bool {
matches!(self, Token::Text(_))
}
pub fn is_end_punctuation(&self) -> bool {
matches!(
self,
Token::Period
| Token::ExclamationMark
| Token::QuestionMark
| Token::Semicolon
| Token::Comma
| Token::InvertedExclamationMark
| Token::InvertedQuestionMark
| Token::Ellipsis
| Token::IdeographicFullStop
| Token::FullwidthExclamationMark
| Token::FullwidthQuestionMark
| Token::ExclamationQuestionMark
| Token::QuestionExclamationMark
| Token::ArabicQuestionMark
| Token::ArabicFullStop
| Token::ArabicTripleDot
| Token::ArabicComma
| Token::Danda
| Token::DoubleDanda
| Token::BengaliCurrencyNumeratorFour
| Token::EthiopianFullStop
| Token::ArmenianFullStop
| Token::TibetanShad
| Token::ThaiFongman
| Token::MyanmarComma
| Token::MyanmarFullStop
)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lex::lexing::tokenize;

    /// Lexes `input` and strips the span from every `(Token, Range)` pair.
    fn toks(input: &str) -> Vec<Token> {
        tokenize(input).into_iter().map(|(t, _)| t).collect()
    }

    #[test]
    fn test_lex_marker() {
        assert_eq!(toks("::"), vec![Token::LexMarker]);
    }

    #[test]
    fn test_indentation_tokens() {
        // Four spaces and a tab each lex to a single indentation unit.
        assert_eq!(toks("    "), vec![Token::Indentation]);
        assert_eq!(toks("\t"), vec![Token::Indentation]);
        // Eight spaces lex to two consecutive units.
        assert_eq!(
            toks("        "),
            vec![Token::Indentation, Token::Indentation]
        );
    }

    #[test]
    fn test_sequence_markers() {
        assert_eq!(
            toks("- . ( ) :"),
            vec![
                Token::Dash,
                Token::Whitespace(1),
                Token::Period,
                Token::Whitespace(1),
                Token::OpenParen,
                Token::Whitespace(1),
                Token::CloseParen,
                Token::Whitespace(1),
                Token::Colon
            ]
        );
    }

    #[test]
    fn test_text_tokens() {
        assert_eq!(
            toks("hello world"),
            vec![
                Token::Text("hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string())
            ]
        );
    }

    #[test]
    fn test_mixed_content() {
        assert_eq!(
            toks("1. Hello world\n    - Item 1"),
            vec![
                Token::Number("1".to_string()),
                Token::Period,
                Token::Whitespace(1),
                Token::Text("Hello".to_string()),
                Token::Whitespace(1),
                Token::Text("world".to_string()),
                Token::BlankLine(Some("\n".to_string())),
                Token::Indentation,
                Token::Dash,
                Token::Whitespace(1),
                Token::Text("Item".to_string()),
                Token::Whitespace(1),
                Token::Number("1".to_string()),
            ]
        );
    }

    #[test]
    fn test_number_tokens() {
        assert_eq!(
            toks("123 456"),
            vec![
                Token::Number("123".to_string()),
                Token::Whitespace(1),
                Token::Number("456".to_string())
            ]
        );
    }

    #[test]
    fn test_token_predicates() {
        // Indentation family.
        assert!(Token::Indentation.is_indent());
        assert!(Token::Indent(vec![]).is_indent_level());
        assert!(Token::Dedent(vec![]).is_dedent_level());
        assert!(!Token::Text("".to_string()).is_indent());
        // Whitespace covers all layout tokens.
        assert!(Token::Indentation.is_whitespace());
        assert!(Token::Indent(vec![]).is_whitespace());
        assert!(Token::Dedent(vec![]).is_whitespace());
        assert!(Token::BlankLine(Some("".to_string())).is_whitespace());
        assert!(Token::Whitespace(1).is_whitespace());
        assert!(!Token::Text("".to_string()).is_whitespace());
        // Sequence markers.
        assert!(Token::Dash.is_sequence_marker());
        assert!(Token::Period.is_sequence_marker());
        assert!(!Token::Text("".to_string()).is_sequence_marker());
        assert!(!Token::Number("".to_string()).is_sequence_marker());
        // Text vs. number.
        assert!(Token::Text("".to_string()).is_text());
        assert!(!Token::Dash.is_text());
        assert!(!Token::Number("".to_string()).is_text());
        assert!(Token::Number("".to_string()).is_number());
        assert!(!Token::Text("".to_string()).is_number());
        assert!(!Token::Dash.is_number());
    }
}