lex_core/lex/token/
core.rs

1//! Token definitions for the lex format
2//!
3//!     This module defines all the core tokens that can be produced by the lex lexer.
4//!     The tokens are defined using the logos derive macro for efficient tokenization.
5//!
6//!     These are character/word level tokens produced directly by the logos lexer. They represent
7//!     the fundamental building blocks of lex source code: text, whitespace, markers, punctuation,
8//!     and indentation.
9//!
10//!     The logos lexer produces these tokens declaratively with no custom logic. This is a pure
11//!     tokenization step that converts source text into a stream of typed tokens with byte ranges.
12//!
13//!     For the complete grammar specification, see specs/v1/grammar-core.lex.
14//!
15//!     Note: These core tokens are transformed into semantic tokens (like Indent/Dedent) in later
16//!     stages of the lexing pipeline. See the [token](super) module for the complete token system.
17use logos::Logos;
18use std::fmt;
19
/// All possible tokens in the lex format.
///
/// Most variants are produced directly by the logos lexer via the `#[token]` /
/// `#[regex]` attributes below; `Indent` and `Dedent` are synthesized by a
/// later transformation stage and never come out of the raw lexer.
#[derive(Logos, Debug, PartialEq, Eq, Hash, Clone, serde::Serialize, serde::Deserialize)]
pub enum Token {
    // Special markers
    #[token("::")]
    LexMarker,

    // Indentation (simplified - one token per 4 spaces or tab).
    // priority = 3 beats the 1-3 space Whitespace rule below, so a run of four
    // spaces always lexes as a single Indentation token.
    #[regex(r" {4}|\t", priority = 3)] // Either 4 spaces OR 1 tab - highest priority
    Indentation,

    // Semantic indentation tokens (generated by transformation).
    // They carry the original (token, byte-range) pairs they were created
    // from, so the raw token stream can be recovered.
    Indent(Vec<(Token, std::ops::Range<usize>)>),
    Dedent(Vec<(Token, std::ops::Range<usize>)>),

    // A blank line (whitespace only, followed by a newline).
    // NOTE(review): this regex matches *every* "\n", not only blank lines —
    // presumably the blank-line classification happens in a later pipeline
    // stage. The payload is the matched newline text itself.
    #[regex(r"\n", |lex| Some(lex.slice().to_owned()))]
    BlankLine(Option<String>),

    // Whitespace (excluding newlines and indentation).
    // Payload is the number of spaces matched.
    #[regex(r" {1,3}", |lex| Some(lex.slice().len()), priority = 1)]
    // 1-3 spaces only, lower priority than indentation
    Whitespace(usize),

    // Sequence markers
    #[token("-")]
    Dash,
    #[token(".")]
    Period,
    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,
    #[token(":")]
    Colon,

    // End Punctuation
    #[token("!")]
    ExclamationMark,
    #[token("?")]
    QuestionMark,
    #[token(";")]
    Semicolon,
    #[token("¡")]
    InvertedExclamationMark,
    #[token("¿")]
    InvertedQuestionMark,
    #[token("…")]
    Ellipsis,
    #[token("。")]
    IdeographicFullStop,
    #[token("!")]
    FullwidthExclamationMark,
    #[token("?")]
    FullwidthQuestionMark,
    #[token("⁉")]
    ExclamationQuestionMark,
    #[token("⁈")]
    QuestionExclamationMark,
    #[token("؟")]
    ArabicQuestionMark,
    #[token("۔")]
    ArabicFullStop,
    // NOTE(review): '؍' looks like the Arabic date-separator codepoint, not a
    // triple dot — confirm the intended character for this variant.
    #[token("؍")]
    ArabicTripleDot,
    #[token("،")]
    ArabicComma,
    #[token("।")]
    Danda,
    #[token("॥")]
    DoubleDanda,
    #[token("৷")]
    BengaliCurrencyNumeratorFour,
    #[token("።")]
    EthiopianFullStop,
    #[token("։")]
    ArmenianFullStop,
    #[token("།")]
    TibetanShad,
    #[token("๏")]
    ThaiFongman,
    #[token("၊")]
    MyanmarComma,
    #[token("။")]
    MyanmarFullStop,

    // Parameter markers (for annotations)
    #[token(",")]
    Comma,
    #[token("\"")]
    Quote,
    #[token("=")]
    Equals,

    // Numbers (for ordered lists and session titles).
    // priority = 2 keeps digits out of the Text catch-all; the payload keeps
    // the original digit string (leading zeros preserved), not a parsed value.
    #[regex(r"[0-9]+", |lex| lex.slice().to_owned(), priority = 2)]
    Number(String),

    // Text content (catch-all for non-special characters, excluding numbers and special chars)
    // The regex explicitly excludes all special characters that have dedicated tokens.
    // Character categories in the exclusion set:
    //   \s\n\t          - whitespace
    //   \-\.\(\):       - structural punctuation (sequence markers)
    //   0-9             - numbers
    //   ,="             - parameter markers (annotations)
    //   !?;             - basic latin punctuation
    //   ¡¿…⁉⁈           - extended latin punctuation
    //   。!?           - CJK punctuation
    //   ؟۔؍،            - arabic punctuation
    //   ।॥৷             - indic punctuation
    //   ።։།๏၊။          - other scripts (ethiopian, armenian, tibetan, thai, myanmar)
    #[regex(r#"[^\s\n\t\-\.\(\):0-9,="!?;¡¿…。!?⁉⁈؟۔؍،।॥৷።։།๏၊။]+"#, |lex| lex.slice().to_owned())]
    Text(String),
}
135
136impl fmt::Display for Token {
137    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138        let name = match self {
139            Token::LexMarker => "lex-marker",
140            Token::Indentation => "indentation",
141            Token::Indent(_) => "indent",
142            Token::Dedent(_) => "dedent",
143            Token::BlankLine(_) => "blank-line",
144            Token::Whitespace(_) => "whitespace",
145            Token::Dash => "dash",
146            Token::Period => "period",
147            Token::OpenParen => "open-paren",
148            Token::CloseParen => "close-paren",
149            Token::Colon => "colon",
150            Token::ExclamationMark => "exclamation-mark",
151            Token::QuestionMark => "question-mark",
152            Token::Semicolon => "semicolon",
153            Token::InvertedExclamationMark => "inverted-exclamation-mark",
154            Token::InvertedQuestionMark => "inverted-question-mark",
155            Token::Ellipsis => "ellipsis",
156            Token::IdeographicFullStop => "ideographic-full-stop",
157            Token::FullwidthExclamationMark => "fullwidth-exclamation-mark",
158            Token::FullwidthQuestionMark => "fullwidth-question-mark",
159            Token::ExclamationQuestionMark => "exclamation-question-mark",
160            Token::QuestionExclamationMark => "question-exclamation-mark",
161            Token::ArabicQuestionMark => "arabic-question-mark",
162            Token::ArabicFullStop => "arabic-full-stop",
163            Token::ArabicTripleDot => "arabic-triple-dot",
164            Token::ArabicComma => "arabic-comma",
165            Token::Danda => "danda",
166            Token::DoubleDanda => "double-danda",
167            Token::BengaliCurrencyNumeratorFour => "bengali-currency-numerator-four",
168            Token::EthiopianFullStop => "ethiopian-full-stop",
169            Token::ArmenianFullStop => "armenian-full-stop",
170            Token::TibetanShad => "tibetan-shad",
171            Token::ThaiFongman => "thai-fongman",
172            Token::MyanmarComma => "myanmar-comma",
173            Token::MyanmarFullStop => "myanmar-full-stop",
174            Token::Comma => "comma",
175            Token::Quote => "quote",
176            Token::Equals => "equals",
177            Token::Number(s) => return write!(f, "<number:{s}>"),
178            Token::Text(s) => return write!(f, "<text:{s}>"),
179        };
180        write!(f, "<{name}>")
181    }
182}
183
impl Token {
    /// Get an uppercase identifier describing this token variant. Used by CLI simple output.
    ///
    /// Returns the variant name in SCREAMING_SNAKE_CASE; payload-carrying
    /// variants (`Number`, `Text`, `Indent`, …) drop their payload.
    pub fn simple_name(&self) -> &'static str {
        match self {
            Token::LexMarker => "LEX_MARKER",
            Token::Indentation => "INDENTATION",
            Token::Indent(_) => "INDENT",
            Token::Dedent(_) => "DEDENT",
            Token::BlankLine(_) => "BLANK_LINE",
            Token::Whitespace(_) => "WHITESPACE",
            Token::Dash => "DASH",
            Token::Period => "PERIOD",
            Token::OpenParen => "OPEN_PAREN",
            Token::CloseParen => "CLOSE_PAREN",
            Token::Colon => "COLON",
            Token::ExclamationMark => "EXCLAMATION_MARK",
            Token::QuestionMark => "QUESTION_MARK",
            Token::Semicolon => "SEMICOLON",
            Token::InvertedExclamationMark => "INVERTED_EXCLAMATION_MARK",
            Token::InvertedQuestionMark => "INVERTED_QUESTION_MARK",
            Token::Ellipsis => "ELLIPSIS",
            Token::IdeographicFullStop => "IDEOGRAPHIC_FULL_STOP",
            Token::FullwidthExclamationMark => "FULLWIDTH_EXCLAMATION_MARK",
            Token::FullwidthQuestionMark => "FULLWIDTH_QUESTION_MARK",
            Token::ExclamationQuestionMark => "EXCLAMATION_QUESTION_MARK",
            Token::QuestionExclamationMark => "QUESTION_EXCLAMATION_MARK",
            Token::ArabicQuestionMark => "ARABIC_QUESTION_MARK",
            Token::ArabicFullStop => "ARABIC_FULL_STOP",
            Token::ArabicTripleDot => "ARABIC_TRIPLE_DOT",
            Token::ArabicComma => "ARABIC_COMMA",
            Token::Danda => "DANDA",
            Token::DoubleDanda => "DOUBLE_DANDA",
            Token::BengaliCurrencyNumeratorFour => "BENGALI_CURRENCY_NUMERATOR_FOUR",
            Token::EthiopianFullStop => "ETHIOPIAN_FULL_STOP",
            Token::ArmenianFullStop => "ARMENIAN_FULL_STOP",
            Token::TibetanShad => "TIBETAN_SHAD",
            Token::ThaiFongman => "THAI_FONGMAN",
            Token::MyanmarComma => "MYANMAR_COMMA",
            Token::MyanmarFullStop => "MYANMAR_FULL_STOP",
            Token::Comma => "COMMA",
            Token::Quote => "QUOTE",
            Token::Equals => "EQUALS",
            Token::Number(_) => "NUMBER",
            Token::Text(_) => "TEXT",
        }
    }

    /// Check if this token represents raw lexer indentation
    /// (only `Indentation`; the semantic `Indent`/`Dedent` variants are not included).
    pub fn is_indent(&self) -> bool {
        matches!(self, Token::Indentation)
    }

    /// Check if this token represents semantic indentation level
    /// (an `Indent` produced by the transformation stage)
    pub fn is_indent_level(&self) -> bool {
        matches!(self, Token::Indent(_))
    }

    /// Check if this token represents semantic dedentation level
    /// (a `Dedent` produced by the transformation stage)
    pub fn is_dedent_level(&self) -> bool {
        matches!(self, Token::Dedent(_))
    }

    /// Check if this token is whitespace (including raw indentation, the
    /// semantic `Indent`/`Dedent` tokens, and blank lines)
    pub fn is_whitespace(&self) -> bool {
        matches!(
            self,
            Token::Indentation
                | Token::Indent(_)
                | Token::Dedent(_)
                | Token::BlankLine(_)
                | Token::Whitespace(_)
        )
    }

    /// Check if this token is a sequence marker
    /// (`Colon` is intentionally not included here)
    pub fn is_sequence_marker(&self) -> bool {
        matches!(
            self,
            Token::Dash | Token::Period | Token::OpenParen | Token::CloseParen
        )
    }

    /// Check if this token is a number
    pub fn is_number(&self) -> bool {
        matches!(self, Token::Number(_))
    }

    /// Check if this token is text content
    pub fn is_text(&self) -> bool {
        matches!(self, Token::Text(_))
    }

    /// Check if this token is end punctuation across the supported scripts
    /// (latin, CJK, arabic, indic, and others).
    ///
    /// Note that `Period` and `Comma` are included here even though they also
    /// serve as sequence/parameter markers in other contexts.
    pub fn is_end_punctuation(&self) -> bool {
        matches!(
            self,
            Token::Period
                | Token::ExclamationMark
                | Token::QuestionMark
                | Token::Semicolon
                | Token::Comma
                | Token::InvertedExclamationMark
                | Token::InvertedQuestionMark
                | Token::Ellipsis
                | Token::IdeographicFullStop
                | Token::FullwidthExclamationMark
                | Token::FullwidthQuestionMark
                | Token::ExclamationQuestionMark
                | Token::QuestionExclamationMark
                | Token::ArabicQuestionMark
                | Token::ArabicFullStop
                | Token::ArabicTripleDot
                | Token::ArabicComma
                | Token::Danda
                | Token::DoubleDanda
                | Token::BengaliCurrencyNumeratorFour
                | Token::EthiopianFullStop
                | Token::ArmenianFullStop
                | Token::TibetanShad
                | Token::ThaiFongman
                | Token::MyanmarComma
                | Token::MyanmarFullStop
        )
    }
}
308
309#[cfg(test)]
310mod tests {
311    use super::*;
312    use crate::lex::lexing::tokenize;
313
314    #[test]
315    fn test_lex_marker() {
316        let tokens: Vec<_> = tokenize("::").into_iter().map(|(t, _)| t).collect();
317        assert_eq!(tokens, vec![Token::LexMarker]);
318    }
319
320    #[test]
321    fn test_indentation_tokens() {
322        // Test 4 spaces
323        let tokens: Vec<_> = tokenize("    ").into_iter().map(|(t, _)| t).collect();
324        assert_eq!(tokens, vec![Token::Indentation]);
325
326        // Test tab
327        let tokens: Vec<_> = tokenize("\t").into_iter().map(|(t, _)| t).collect();
328        assert_eq!(tokens, vec![Token::Indentation]);
329
330        // Test multiple indent levels
331        let tokens: Vec<_> = tokenize("        ").into_iter().map(|(t, _)| t).collect(); // 8 spaces = 2 indent levels
332        assert_eq!(tokens, vec![Token::Indentation, Token::Indentation]);
333    }
334
335    #[test]
336    fn test_sequence_markers() {
337        let tokens: Vec<_> = tokenize("- . ( ) :").into_iter().map(|(t, _)| t).collect();
338        assert_eq!(
339            tokens,
340            vec![
341                Token::Dash,
342                Token::Whitespace(1),
343                Token::Period,
344                Token::Whitespace(1),
345                Token::OpenParen,
346                Token::Whitespace(1),
347                Token::CloseParen,
348                Token::Whitespace(1),
349                Token::Colon
350            ]
351        );
352    }
353
354    #[test]
355    fn test_text_tokens() {
356        let tokens: Vec<_> = tokenize("hello world")
357            .into_iter()
358            .map(|(t, _)| t)
359            .collect();
360        assert_eq!(
361            tokens,
362            vec![
363                Token::Text("hello".to_string()),
364                Token::Whitespace(1),
365                Token::Text("world".to_string())
366            ]
367        );
368    }
369
370    #[test]
371    fn test_mixed_content() {
372        let tokens: Vec<_> = tokenize("1. Hello world\n    - Item 1")
373            .into_iter()
374            .map(|(t, _)| t)
375            .collect();
376        assert_eq!(
377            tokens,
378            vec![
379                Token::Number("1".to_string()),
380                Token::Period,
381                Token::Whitespace(1),
382                Token::Text("Hello".to_string()),
383                Token::Whitespace(1),
384                Token::Text("world".to_string()),
385                Token::BlankLine(Some("\n".to_string())),
386                Token::Indentation,
387                Token::Dash,
388                Token::Whitespace(1),
389                Token::Text("Item".to_string()),
390                Token::Whitespace(1),
391                Token::Number("1".to_string()),
392            ]
393        );
394    }
395
396    #[test]
397    fn test_number_tokens() {
398        let tokens: Vec<_> = tokenize("123 456").into_iter().map(|(t, _)| t).collect();
399        assert_eq!(
400            tokens,
401            vec![
402                Token::Number("123".to_string()),
403                Token::Whitespace(1),
404                Token::Number("456".to_string())
405            ]
406        );
407    }
408
409    #[test]
410    fn test_token_predicates() {
411        assert!(Token::Indentation.is_indent());
412        assert!(Token::Indent(vec![]).is_indent_level());
413        assert!(Token::Dedent(vec![]).is_dedent_level());
414        assert!(!Token::Text("".to_string()).is_indent());
415
416        assert!(Token::Indentation.is_whitespace());
417        assert!(Token::Indent(vec![]).is_whitespace());
418        assert!(Token::Dedent(vec![]).is_whitespace());
419        assert!(Token::BlankLine(Some("".to_string())).is_whitespace());
420        assert!(Token::Whitespace(1).is_whitespace());
421        assert!(!Token::Text("".to_string()).is_whitespace());
422
423        assert!(Token::Dash.is_sequence_marker());
424        assert!(Token::Period.is_sequence_marker());
425        assert!(!Token::Text("".to_string()).is_sequence_marker());
426        assert!(!Token::Number("".to_string()).is_sequence_marker());
427
428        assert!(Token::Text("".to_string()).is_text());
429        assert!(!Token::Dash.is_text());
430        assert!(!Token::Number("".to_string()).is_text());
431
432        assert!(Token::Number("".to_string()).is_number());
433        assert!(!Token::Text("".to_string()).is_number());
434        assert!(!Token::Dash.is_number());
435    }
436}