// lex_core/lex/lexing/base_tokenization.rs
1//! Base tokenization implementation for the lex lexer
2//!
3//! This module provides the raw tokenization using the logos lexer library.
4//! This is the entry point where source strings become token streams.
5//!
6//! We leverage the logos lexer to tokenize the source text into core tokens. This is done
7//! declaratively with no custom logic, and could not be simpler. The logos lexer produces
8//! tokens based on the grammar specification defined in the Token enum.
9//!
10//! This is NOT a transformation - transformations operate on token streams. This is the
11//! source that creates the initial token stream from a string.
12//!
13//! The tokens produced by this stage carry byte ranges into the source text. These byte
14//! ranges are preserved through all transformations and are used at the AST building stage
15//! for location tracking. It is critical that these ranges are not modified by any
16//! transformation step.
17
18use crate::lex::token::Token;
19use logos::Logos;
20
21/// Tokenize source code with location information
22///
23/// This function performs raw tokenization using the logos lexer, returning tokens
24/// paired with their source locations. This is the base tokenization step that
25/// converts source strings into token streams.
26///
27/// Pipelines and transformations should operate on the token stream produced by this function,
28/// not call it directly. The caller (e.g., LexerRegistry implementations) should call this
29/// and pass the result to pipelines.
30pub fn tokenize(source: &str) -> Vec<(Token, logos::Span)> {
31 let mut lexer = Token::lexer(source);
32 let mut tokens = Vec::new();
33
34 while let Some(result) = lexer.next() {
35 if let Ok(token) = result {
36 tokens.push((token, lexer.span()));
37 }
38 }
39
40 tokens
41}
42
#[cfg(test)]
mod tests {
    use super::*;

    /// Two words separated by a single space lex as text / whitespace / text.
    #[test]
    fn test_tokenizes() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens.len(), 3);

        assert_eq!(tokens[0].0, Token::Text("hello".to_string()));
        assert_eq!(tokens[1].0, Token::Whitespace(1));
        assert_eq!(tokens[2].0, Token::Text("world".to_string()));
    }

    /// An empty source string produces no tokens at all.
    #[test]
    fn test_empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    /// A numbered heading followed by two indented list items lexes into the
    /// expected structural token sequence.
    #[test]
    fn test_complex_tokenization() {
        let input = "1. Session Title\n  - Item 1\n  - Item 2";
        let tokens = tokenize(input);

        let expected = [
            // "1. Session Title\n"
            Token::Number("1".to_string()),
            Token::Period,
            Token::Whitespace(1),
            Token::Text("Session".to_string()),
            Token::Whitespace(1),
            Token::Text("Title".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            // "  - Item 1\n"
            Token::Indentation,
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("1".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            // "  - Item 2"
            Token::Indentation,
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("2".to_string()),
        ];

        for (i, want) in expected.iter().enumerate() {
            assert_eq!(&tokens[i].0, want, "token {i} mismatch");
        }
    }

    /// Runs of spaces become Whitespace(n); a tab becomes Indentation.
    #[test]
    fn test_whitespace_only() {
        // 3 spaces, one tab, 2 spaces.
        let tokens = tokenize("   \t  ");
        assert_eq!(tokens.len(), 3);

        assert_eq!(tokens[0].0, Token::Whitespace(3));
        assert_eq!(tokens[1].0, Token::Indentation);
        assert_eq!(tokens[2].0, Token::Whitespace(2));
    }
}
106}