#![allow(missing_docs)]
use std::fmt;
pub mod token;
pub mod token_struct;
pub mod numeric;
pub mod string_utils;
pub mod optimized;
pub mod lexer;
pub mod internal_lexer;
pub use token::*;
pub use token_struct::*;
pub use numeric::*;
pub use string_utils::*;
pub use optimized::*;
pub use lexer::*;
pub use internal_lexer::*;
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
LeftParen,
RightParen,
LeftBracket,
RightBracket,
LeftBrace,
RightBrace,
Quote,
Quasiquote,
UnquoteSplicing,
Unquote,
Dot,
TypeAnnotation,
ComplexNumber,
RationalNumber,
RealNumber,
IntegerNumber,
Keyword,
String,
Character,
Boolean,
BlockComment,
LineComment,
Identifier,
Eof,
Error,
}
fn _escape_for_display(text: &str) -> String {
text.chars()
.map(|c| match c {
'\n' => "\\n".to_string(),
'\r' => "\\r".to_string(),
'\t' => "\\t".to_string(),
'\\' => "\\\\".to_string(),
'"' => "\\\"".to_string(),
c if c.is_control() => format!("\\x{:02x}", c as u8),
c => c.to_string(),
})
.collect()
}
impl fmt::Display for TokenKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let name = match self {
TokenKind::LeftParen => "(",
TokenKind::RightParen => ")",
TokenKind::LeftBracket => "[",
TokenKind::RightBracket => "]",
TokenKind::LeftBrace => "{",
TokenKind::RightBrace => "}",
TokenKind::Quote => "'",
TokenKind::Quasiquote => "`",
TokenKind::Unquote => ",",
TokenKind::UnquoteSplicing => ",@",
TokenKind::Dot => ".",
TokenKind::TypeAnnotation => "::",
TokenKind::Keyword => "keyword",
TokenKind::Identifier => "identifier",
TokenKind::IntegerNumber | TokenKind::RealNumber | TokenKind::RationalNumber | TokenKind::ComplexNumber => "number",
TokenKind::String => "string",
TokenKind::Character => "character",
TokenKind::Boolean => "boolean",
TokenKind::LineComment | TokenKind::BlockComment => "comment",
TokenKind::Eof => "end of file",
TokenKind::Error => "error",
};
write!(f, "{name}")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_tokenization() {
let source = "(+ 1 2)";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.len(), 6); assert_eq!(tokens[0].kind, TokenKind::LeftParen);
assert_eq!(tokens[1].kind, TokenKind::Identifier);
assert_eq!(tokens[1].text, "+");
assert_eq!(tokens[2].kind, TokenKind::IntegerNumber);
assert_eq!(tokens[2].text, "1");
assert_eq!(tokens[3].kind, TokenKind::IntegerNumber);
assert_eq!(tokens[3].text, "2");
assert_eq!(tokens[4].kind, TokenKind::RightParen);
assert_eq!(tokens[5].kind, TokenKind::Eof);
}
#[test]
fn test_numeric_tokenization() {
let source = "42 3.14 22/7 3+4i -5.2e-10";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 5);
assert_eq!(tokens[0].kind, TokenKind::IntegerNumber);
assert_eq!(tokens[0].text, "42");
assert_eq!(tokens[1].kind, TokenKind::RealNumber);
assert_eq!(tokens[1].text, "3.14");
assert_eq!(tokens[2].kind, TokenKind::RationalNumber);
assert_eq!(tokens[2].text, "22/7");
assert_eq!(tokens[3].kind, TokenKind::ComplexNumber);
assert_eq!(tokens[3].text, "3+4i");
assert_eq!(tokens[4].kind, TokenKind::RealNumber);
assert_eq!(tokens[4].text, "-5.2e-10");
}
#[test]
fn test_keyword_tokenization() {
let source = "#:key #:type #:inline";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].kind, TokenKind::Keyword);
assert_eq!(tokens[0].text, "#:key");
assert_eq!(tokens[1].kind, TokenKind::Keyword);
assert_eq!(tokens[1].text, "#:type");
assert_eq!(tokens[2].kind, TokenKind::Keyword);
assert_eq!(tokens[2].text, "#:inline");
}
#[test]
fn test_string_tokenization() {
let source = r#""hello world" "with\nescapes" "unicode: \x41;""#;
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].kind, TokenKind::String);
assert_eq!(tokens[0].text, r#""hello world""#);
assert_eq!(tokens[1].kind, TokenKind::String);
assert_eq!(tokens[1].text, r#""with\nescapes""#);
assert_eq!(tokens[2].kind, TokenKind::String);
assert_eq!(tokens[2].text, r#""unicode: \x41;""#);
}
#[test]
fn test_character_tokenization() {
let source = r"#\a #\space #\newline #\tab #\x41";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 5);
assert!(tokens.iter().all(|t| t.kind == TokenKind::Character));
assert_eq!(tokens[0].text, r"#\a");
assert_eq!(tokens[1].text, r"#\space");
assert_eq!(tokens[2].text, r"#\newline");
assert_eq!(tokens[3].text, r"#\tab");
assert_eq!(tokens[4].text, r"#\x41");
}
#[test]
fn test_boolean_tokenization() {
let source = "#t #f #true #false";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 4);
assert!(tokens.iter().all(|t| t.kind == TokenKind::Boolean));
assert_eq!(tokens[0].text, "#t");
assert_eq!(tokens[1].text, "#f");
assert_eq!(tokens[2].text, "#true");
assert_eq!(tokens[3].text, "#false");
}
#[test]
fn test_comment_skipping() {
let source = "; This is a comment\n(+ 1 2)";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens[0].kind, TokenKind::LeftParen);
assert_eq!(tokens[1].kind, TokenKind::Identifier);
assert_eq!(tokens[1].text, "+");
}
#[test]
fn test_block_comment_tokenization() {
let source = "#| simple comment |# (+ 1 #| nested #| comment |# here |# 2)";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter()
.filter(|t| !matches!(t.kind, TokenKind::Eof | TokenKind::BlockComment | TokenKind::Error))
.collect();
assert!(tokens.len() >= 5); assert_eq!(tokens[0].kind, TokenKind::LeftParen);
assert_eq!(tokens[1].kind, TokenKind::Identifier);
assert_eq!(tokens[1].text, "+");
assert_eq!(tokens[2].kind, TokenKind::IntegerNumber);
assert_eq!(tokens[2].text, "1");
let two_pos = tokens.iter().position(|t| t.text == "2" && t.kind == TokenKind::IntegerNumber);
let rparen_pos = tokens.iter().position(|t| t.kind == TokenKind::RightParen);
assert!(two_pos.is_some(), "Should find number '2' token");
assert!(rparen_pos.is_some(), "Should find right paren token");
}
#[test]
fn test_quote_forms() {
let source = "'x `(,a ,@b)";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 9); assert_eq!(tokens[0].kind, TokenKind::Quote);
assert_eq!(tokens[1].kind, TokenKind::Identifier);
assert_eq!(tokens[1].text, "x");
assert_eq!(tokens[2].kind, TokenKind::Quasiquote);
assert_eq!(tokens[3].kind, TokenKind::LeftParen);
assert_eq!(tokens[4].kind, TokenKind::Unquote);
assert_eq!(tokens[5].kind, TokenKind::Identifier);
assert_eq!(tokens[5].text, "a");
assert_eq!(tokens[6].kind, TokenKind::UnquoteSplicing);
assert_eq!(tokens[7].kind, TokenKind::Identifier);
assert_eq!(tokens[7].text, "b");
assert_eq!(tokens[8].kind, TokenKind::RightParen);
}
#[test]
fn test_type_annotation() {
let source = "(:: expr Type)";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens[1].kind, TokenKind::TypeAnnotation);
assert_eq!(tokens[1].text, "::");
}
#[test]
fn test_delimiters() {
let source = "()[]{}.";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 7);
assert_eq!(tokens[0].kind, TokenKind::LeftParen);
assert_eq!(tokens[1].kind, TokenKind::RightParen);
assert_eq!(tokens[2].kind, TokenKind::LeftBracket);
assert_eq!(tokens[3].kind, TokenKind::RightBracket);
assert_eq!(tokens[4].kind, TokenKind::LeftBrace);
assert_eq!(tokens[5].kind, TokenKind::RightBrace);
assert_eq!(tokens[6].kind, TokenKind::Dot);
}
#[test]
fn test_whitespace_and_newlines() {
let source = "a\n b\r\nc\td";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0].kind, TokenKind::Identifier);
assert_eq!(tokens[0].text, "a");
assert_eq!(tokens[1].kind, TokenKind::Identifier);
assert_eq!(tokens[1].text, "b");
assert_eq!(tokens[2].kind, TokenKind::Identifier);
assert_eq!(tokens[2].text, "c");
assert_eq!(tokens[3].kind, TokenKind::Identifier);
assert_eq!(tokens[3].text, "d");
}
#[test]
fn test_complex_expression() {
let source = r#"(define (factorial n)
#:type (-> Integer Integer)
(if (<= n 1)
1
(* n (factorial (- n 1)))))"#;
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert!(!tokens.is_empty());
assert_eq!(tokens.last().unwrap().kind, TokenKind::Eof);
let define_token = tokens.iter().find(|t| t.text == "define");
assert!(define_token.is_some());
assert_eq!(define_token.unwrap().kind, TokenKind::Identifier);
let type_keyword = tokens.iter().find(|t| t.text == "#:type");
assert!(type_keyword.is_some());
assert_eq!(type_keyword.unwrap().kind, TokenKind::Keyword);
}
#[test]
fn test_invalid_character() {
let source = "@";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.len(), 2); assert_eq!(tokens[0].kind, TokenKind::Error);
assert_eq!(tokens[0].text, "@");
}
#[test]
fn test_edge_cases() {
let source = "";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].kind, TokenKind::Eof);
let source = " \t ";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].kind, TokenKind::Eof);
let source = "; just a comment";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0].kind, TokenKind::Eof);
}
#[test]
fn test_number_edge_cases() {
let source = "+1 -2 +3.14 -4.56 1e10 -2E-5 +i -i 0+0i";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
assert!(tokens.len() >= 8);
for token in &tokens {
assert!(matches!(token.kind,
TokenKind::IntegerNumber |
TokenKind::RealNumber |
TokenKind::ComplexNumber
));
}
}
#[test]
fn test_identifier_edge_cases() {
let source = "+ - * / < <= > >= = eq? list->vector string-length";
let mut lexer = Lexer::new(source, Some("test"));
let tokens = lexer.tokenize().unwrap();
let tokens: Vec<_> = tokens.into_iter().filter(|t| t.kind != TokenKind::Eof).collect();
for token in &tokens {
assert_eq!(token.kind, TokenKind::Identifier);
}
}
}