// lex_core/lex/lexing/base_tokenization.rs
1//! Base tokenization implementation for the lex lexer
2//!
3//! This module provides the raw tokenization using the logos lexer library.
4//! This is the entry point where source strings become token streams.
5//!
6//! We leverage the logos lexer to tokenize the source text into core tokens. This is done
7//! declaratively with no custom logic, and could not be simpler. The logos lexer produces
8//! tokens based on the grammar specification defined in the Token enum.
9//!
10//! This is NOT a transformation - transformations operate on token streams. This is the
11//! source that creates the initial token stream from a string.
12//!
13//! The tokens produced by this stage carry byte ranges into the source text. These byte
14//! ranges are preserved through all transformations and are used at the AST building stage
15//! for location tracking. It is critical that these ranges are not modified by any
16//! transformation step.
17
18use crate::lex::token::Token;
19use logos::Logos;
20
21/// Tokenize source code with location information
22///
23/// This function performs raw tokenization using the logos lexer, returning tokens
24/// paired with their source locations. This is the base tokenization step that
25/// converts source strings into token streams.
26///
27/// Pipelines and transformations should operate on the token stream produced by this function,
28/// not call it directly. The caller (e.g., LexerRegistry implementations) should call this
29/// and pass the result to pipelines.
30pub fn tokenize(source: &str) -> Vec<(Token, logos::Span)> {
31 let mut lexer = Token::lexer(source);
32 let mut tokens = Vec::new();
33
34 while let Some(result) = lexer.next() {
35 if let Ok(token) = result {
36 tokens.push((token, lexer.span()));
37 }
38 }
39
40 tokens
41}
42
#[cfg(test)]
mod tests {
    use super::*;

    /// Two words separated by a single space lex as text / whitespace / text.
    #[test]
    fn test_tokenizes() {
        let tokens = tokenize("hello world");
        assert_eq!(tokens.len(), 3);

        assert_eq!(tokens[0].0, Token::Text("hello".to_string()));
        assert_eq!(tokens[1].0, Token::Whitespace(1));
        assert_eq!(tokens[2].0, Token::Text("world".to_string()));
    }

    /// An empty source string produces no tokens at all.
    #[test]
    fn test_empty_input() {
        let tokens = tokenize("");
        assert!(tokens.is_empty());
    }

    /// A numbered heading followed by two indented list items lexes into the
    /// expected structural token sequence.
    #[test]
    fn test_complex_tokenization() {
        let input = "1. Session Title\n  - Item 1\n  - Item 2";
        let tokens = tokenize(input);

        let expected = [
            // "1. Session Title\n"
            Token::Number("1".to_string()),
            Token::Period,
            Token::Whitespace(1),
            Token::Text("Session".to_string()),
            Token::Whitespace(1),
            Token::Text("Title".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            // "  - Item 1\n"
            Token::Indentation,
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("1".to_string()),
            Token::BlankLine(Some("\n".to_string())),
            // "  - Item 2"
            Token::Indentation,
            Token::Dash,
            Token::Whitespace(1),
            Token::Text("Item".to_string()),
            Token::Whitespace(1),
            Token::Number("2".to_string()),
        ];

        for (i, want) in expected.iter().enumerate() {
            assert_eq!(&tokens[i].0, want, "token {i} mismatch");
        }
    }

    /// Runs of spaces become Whitespace(n); a tab becomes Indentation.
    #[test]
    fn test_whitespace_only() {
        // 3 spaces, one tab, 2 spaces.
        let tokens = tokenize("   \t  ");
        assert_eq!(tokens.len(), 3);

        assert_eq!(tokens[0].0, Token::Whitespace(3));
        assert_eq!(tokens[1].0, Token::Indentation);
        assert_eq!(tokens[2].0, Token::Whitespace(2));
    }
}
106}