// ucl_parser/lexer.rs

//! Lexer for UCL using Logos.

use logos::Logos;

/// Token kinds
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(skip r"[ \t\r]+")] // skip spaces, tabs, and carriage returns so CRLF input lexes cleanly
pub enum TokenKind {
    // Section headers (case-insensitive)
    #[regex("(?i)STRUCTURE")]
    Structure,
    #[regex("(?i)BLOCKS")]
    Blocks,
    #[regex("(?i)COMMANDS")]
    Commands,

    // Commands (case-insensitive)
    #[regex("(?i)EDIT")]
    Edit,
    #[regex("(?i)SET")]
    Set,
    #[regex("(?i)MOVE")]
    Move,
    #[regex("(?i)TO")]
    To,
    #[regex("(?i)AT")]
    At,
    #[regex("(?i)BEFORE")]
    Before,
    #[regex("(?i)AFTER")]
    After,
    #[regex("(?i)SWAP")]
    Swap,
    #[regex("(?i)APPEND")]
    Append,
    #[regex("(?i)WITH")]
    With,
    #[regex("(?i)DELETE")]
    Delete,
    #[regex("(?i)CASCADE")]
    Cascade,
    #[regex("(?i)PRESERVE_CHILDREN")]
    PreserveChildren,
    #[regex("(?i)PRUNE")]
    Prune,
    #[regex("(?i)UNREACHABLE")]
    Unreachable,
    #[regex("(?i)WHERE")]
    Where,
    #[regex("(?i)DRY_RUN")]
    DryRun,
    #[regex("(?i)FOLD")]
    Fold,
    #[regex("(?i)DEPTH")]
    Depth,
    #[regex("(?i)MAX_TOKENS")]
    MaxTokens,
    #[regex("(?i)PRESERVE_TAGS")]
    PreserveTags,
    #[regex("(?i)LINK")]
    Link,
    #[regex("(?i)UNLINK")]
    Unlink,
    #[regex("(?i)SNAPSHOT")]
    Snapshot,
    #[regex("(?i)CREATE")]
    Create,
    #[regex("(?i)RESTORE")]
    Restore,
    #[regex("(?i)LIST")]
    List,
    #[regex("(?i)DIFF")]
    Diff,
    #[regex("(?i)BEGIN")]
    Begin,
    #[regex("(?i)TRANSACTION")]
    Transaction,
    #[regex("(?i)COMMIT")]
    Commit,
    #[regex("(?i)ROLLBACK")]
    Rollback,
    #[regex("(?i)ATOMIC")]
    Atomic,
    #[regex("(?i)VIEW")]
    View,
    #[regex("(?i)FOLDED")]
    Folded,
    #[regex("(?i)FROM")]
    From,
    #[regex("(?i)TEMPLATE")]
    Template,
    #[regex("(?i)FIRST")]
    First,
    #[regex("(?i)LAST")]
    Last,
    #[regex("(?i)WRITE_SECTION")]
    WriteSection,
    #[regex("(?i)BASE_LEVEL")]
    BaseLevel,

    // Agent traversal commands (case-insensitive)
    #[regex("(?i)GOTO")]
    Goto,
    #[regex("(?i)BACK")]
    Back,
    #[regex("(?i)EXPAND")]
    Expand,
    #[regex("(?i)FOLLOW")]
    Follow,
    #[regex("(?i)PATH")]
    Path,
    #[regex("(?i)SEARCH")]
    Search,
    #[regex("(?i)FIND")]
    Find,
    #[regex("(?i)CTX")]
    Ctx,

    // Traversal directions (case-insensitive)
    #[regex("(?i)DOWN")]
    Down,
    #[regex("(?i)UP")]
    Up,
    #[regex("(?i)SEMANTIC")]
    Semantic,

    // Traversal options (case-insensitive)
    #[regex("(?i)MODE")]
    Mode,
    #[regex("(?i)LIMIT")]
    Limit,
    #[regex("(?i)MIN_SIMILARITY")]
    MinSimilarity,
    #[regex("(?i)ROLES")]
    Roles,
    #[regex("(?i)TAGS")]
    Tags,
    #[regex("(?i)ROLE")]
    Role,
    #[regex("(?i)TAG")]
    Tag,
    #[regex("(?i)LABEL")]
    Label,
    #[regex("(?i)PATTERN")]
    Pattern,
    #[regex("(?i)MAX")]
    Max,
    #[regex("(?i)NEIGHBORHOOD")]
    Neighborhood,

    // Context commands (case-insensitive)
    #[regex("(?i)ADD")]
    Add,
    #[regex("(?i)REMOVE")]
    Remove,
    #[regex("(?i)CLEAR")]
    Clear,
    #[regex("(?i)COMPRESS")]
    Compress,
    #[regex("(?i)RENDER")]
    Render,
    #[regex("(?i)STATS")]
    Stats,
    #[regex("(?i)FOCUS")]
    Focus,
    #[regex("(?i)RESULTS")]
    Results,
    #[regex("(?i)CHILDREN")]
    Children,
    #[regex("(?i)AUTO")]
    Auto,
    #[regex("(?i)TOKENS")]
    Tokens,
    #[regex("(?i)MAX_AGE")]
    MaxAge,
    #[regex("(?i)RELEVANCE")]
    Relevance,
    #[regex("(?i)REASON")]
    Reason,
    #[regex("(?i)METHOD")]
    Method,
    #[regex("(?i)FORMAT")]
    Format,
    #[regex("(?i)TRUNCATE")]
    Truncate,
    #[regex("(?i)SUMMARIZE")]
    Summarize,
    #[regex("(?i)STRUCTURE_ONLY")]
    StructureOnly,
    #[regex("(?i)SHORT_IDS")]
    ShortIds,
    #[regex("(?i)MARKDOWN")]
    Markdown,
    #[regex("(?i)FULL")]
    Full,
    #[regex("(?i)PREVIEW")]
    Preview,
    #[regex("(?i)METADATA")]
    MetadataToken,
    #[regex("(?i)IDS")]
    Ids,
    #[regex("(?i)BOTH")]
    Both,

    // Operators
    #[token("=")]
    Eq,
    #[token("!=")]
    Ne,
    #[token(">")]
    Gt,
    #[token(">=")]
    Ge,
    #[token("<")]
    Lt,
    #[token("<=")]
    Le,
    #[token("+=")]
    PlusEq,
    #[token("-=")]
    MinusEq,
    #[token("++")]
    PlusPlus,
    #[token("--")]
    MinusMinus,

    // Logic (case-insensitive)
    #[regex("(?i)AND")]
    And,
    #[regex("(?i)OR")]
    Or,
    #[regex("(?i)NOT")]
    Not,
    #[regex("(?i)CONTAINS")]
    Contains,
    #[regex("(?i)STARTS_WITH")]
    StartsWith,
    #[regex("(?i)ENDS_WITH")]
    EndsWith,
    #[regex("(?i)MATCHES")]
    Matches,
    #[regex("(?i)EXISTS")]
    Exists,
    #[regex("(?i)IS_NULL")]
    IsNull,
    #[regex("(?i)IS_NOT_NULL")]
    IsNotNull,
    #[regex("(?i)IS_EMPTY")]
    IsEmpty,
    #[regex("(?i)LENGTH")]
    Length,

    // Punctuation
    #[token("::")]
    DoubleColon,
    #[token(":")]
    Colon,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("#")]
    Hash,
    #[token("@")]
    At_,
    #[token("$")]
    Dollar,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,

    // Content types
    #[token("text")]
    TextType,
    #[token("table")]
    TableType,
    #[token("code")]
    CodeType,
    #[token("math")]
    MathType,
    #[token("media")]
    MediaType,
    #[token("json")]
    JsonType,
    #[token("binary")]
    BinaryType,
    #[token("composite")]
    CompositeType,

    // Literals
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("null")]
    Null,

    // Identifiers (block IDs, property names). The fixed `blk_` prefix gives
    // BlockId higher default priority than the generic Identifier rule, so
    // block IDs never lex as plain identifiers.
    #[regex(r"blk_[a-fA-F0-9]+")]
    BlockId,

    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
    Identifier,

    // Numbers. Logos prefers the longest match, so `12.5` lexes as a single
    // Float rather than Integer + Dot + Integer.
    #[regex(r"-?[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
    Float(f64),

    #[regex(r"-?[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
    Integer(i64),

    // Strings. The callbacks strip the surrounding quotes but keep escape
    // sequences verbatim; unescaping is left to the parser.
    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        Some(s[1..s.len()-1].to_string())
    })]
    DoubleString(String),

    #[regex(r#"'([^'\\]|\\.)*'"#, |lex| {
        let s = lex.slice();
        Some(s[1..s.len()-1].to_string())
    })]
    SingleString(String),

    // Triple-quoted strings are handled via a callback in the parser; this
    // variant is never produced by the lexer itself.
    TripleString(String),

    // Code blocks are likewise handled via a callback in the parser.
    CodeBlock(String),

    // Table literal: two or more consecutive `|...|` rows. The regex consumes
    // the embedded newlines, so no Newline tokens are emitted inside a table.
    #[regex(r"\|[^\n]+\|(\n\|[^\n]+\|)+", |lex| {
        Some(lex.slice().to_string())
    })]
    TableLiteral(String),

    // Newline - returned as a token for line-aware parsing
    #[regex(r"\n")]
    Newline,

    // Comment - use // style to avoid conflict with the # delimiter in block
    // definitions
    #[regex(r"//[^\n]*")]
    Comment,
}

/// Token with position information
#[derive(Debug, Clone)]
pub struct Token {
    pub kind: TokenKind,
    /// Byte range of the token in the source text
    pub span: std::ops::Range<usize>,
    /// 1-based line number
    pub line: usize,
    /// 1-based column number
    pub column: usize,
}

/// Lexer wrapper that tracks line/column position
pub struct Lexer<'a> {
    inner: logos::Lexer<'a, TokenKind>,
    line: usize,
    column: usize,
    /// Byte offset up to which `line`/`column` already account for the input
    last_newline_pos: usize,
}

impl<'a> Lexer<'a> {
    pub fn new(input: &'a str) -> Self {
        Self {
            inner: TokenKind::lexer(input),
            line: 1,
            column: 1,
            last_newline_pos: 0,
        }
    }

    pub fn source(&self) -> &'a str {
        self.inner.source()
    }
}

impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token, ()>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let kind = self.inner.next()?;
            let span = self.inner.span();

            // Advance line/column over everything between the last counted
            // position and the start of this token (e.g. skipped whitespace).
            let source = self.inner.source();
            for c in source[self.last_newline_pos..span.start].chars() {
                if c == '\n' {
                    self.line += 1;
                    self.column = 1;
                } else {
                    self.column += 1;
                }
            }
            // Mark these characters as counted so the next token does not
            // count them again and inflate the column.
            self.last_newline_pos = span.start;

            match kind {
                Ok(TokenKind::Comment) => continue, // Skip comments
                Ok(TokenKind::Newline) => {
                    // Report the newline at the position where it occurs,
                    // then advance to the start of the next line. Newline
                    // tokens are returned for line-aware parsing.
                    let token = Token {
                        kind: TokenKind::Newline,
                        span: span.clone(),
                        line: self.line,
                        column: self.column,
                    };
                    self.line += 1;
                    self.column = 1;
                    self.last_newline_pos = span.end;
                    return Some(Ok(token));
                }
                Ok(kind) => {
                    return Some(Ok(Token {
                        kind,
                        span,
                        line: self.line,
                        column: self.column,
                    }));
                }
                Err(_) => return Some(Err(())),
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lex_structure() {
        let input = "STRUCTURE\nblk_abc123def456: [blk_111222333444]";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Structure));
        assert!(matches!(tokens[2].kind, TokenKind::BlockId));
    }
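
    // The (?i) flags on the keyword regexes are meant to make keywords
    // case-insensitive; a small sketch exercising that assumption.
    #[test]
    fn test_lex_keywords_case_insensitive() {
        let input = "structure Edit sEt";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Structure));
        assert!(matches!(tokens[1].kind, TokenKind::Edit));
        assert!(matches!(tokens[2].kind, TokenKind::Set));
    }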

    #[test]
    fn test_lex_edit_command() {
        let input = r#"EDIT blk_abc123def456 SET content.text = "hello""#;
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Edit));
        assert!(matches!(tokens[1].kind, TokenKind::BlockId));
        assert!(matches!(tokens[2].kind, TokenKind::Set));
    }
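
    // A sketch checking the numeric literal callbacks: floats need digits on
    // both sides of the dot, and a leading minus is part of the literal.
    #[test]
    fn test_lex_numbers() {
        let input = "42 -7 3.14 -0.5";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Integer(42)));
        assert!(matches!(tokens[1].kind, TokenKind::Integer(-7)));
        assert!(matches!(tokens[2].kind, TokenKind::Float(f) if (f - 3.14).abs() < 1e-9));
        assert!(matches!(tokens[3].kind, TokenKind::Float(f) if (f + 0.5).abs() < 1e-9));
    }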

    #[test]
    fn test_lex_string_types() {
        let input = r#""double" 'single'"#;
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::DoubleString(_)));
        assert!(matches!(tokens[1].kind, TokenKind::SingleString(_)));
    }
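
    // Comments are consumed by the iterator (never returned), while newlines
    // come through as tokens; a sketch pinning down both behaviors.
    #[test]
    fn test_lex_comments_skipped() {
        let input = "EDIT // trailing comment\nSET";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Edit));
        assert!(matches!(tokens[1].kind, TokenKind::Newline));
        assert!(matches!(tokens[2].kind, TokenKind::Set));
        assert_eq!(tokens.len(), 3);
    }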

    #[test]
    fn test_lex_operators() {
        let input = "= += -= != >= <=";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert!(matches!(tokens[0].kind, TokenKind::Eq));
        assert!(matches!(tokens[1].kind, TokenKind::PlusEq));
        assert!(matches!(tokens[2].kind, TokenKind::MinusEq));
        assert!(matches!(tokens[3].kind, TokenKind::Ne));
        assert!(matches!(tokens[4].kind, TokenKind::Ge));
        assert!(matches!(tokens[5].kind, TokenKind::Le));
    }
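
    // A sketch of the position tracking: several tokens on one line should
    // advance the column, and a newline should reset it and bump the line.
    #[test]
    fn test_token_positions() {
        let input = "EDIT blk_ab12\nSET";
        let lexer = Lexer::new(input);
        let tokens: Vec<_> = lexer.filter_map(|r| r.ok()).collect();

        assert_eq!((tokens[0].line, tokens[0].column), (1, 1)); // EDIT
        assert_eq!((tokens[1].line, tokens[1].column), (1, 6)); // blk_ab12
        assert_eq!((tokens[3].line, tokens[3].column), (2, 1)); // SET after newline
    }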
}