plotnik_compiler/parser/
lexer.rs

//! Lexer for the query language.
//!
//! Produces span-based tokens without storing text - text is sliced from source only when needed.
//!
//! ## Error handling
//!
//! The lexer coalesces consecutive error characters into single `Garbage` tokens rather
//! than producing one error per character. This keeps the token stream manageable for malformed input.
9
10use logos::Logos;
11use rowan::TextRange;
12use std::ops::Range;
13
14use super::cst::SyntaxKind;
15
16/// Zero-copy token: kind + span, text retrieved via [`token_text`] when needed.
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
18pub struct Token {
19    pub kind: SyntaxKind,
20    pub span: TextRange,
21}
22
23impl Token {
24    #[inline]
25    pub fn new(kind: SyntaxKind, span: TextRange) -> Self {
26        Self { kind, span }
27    }
28}
29
30fn range_to_text_range(range: Range<usize>) -> TextRange {
31    TextRange::new((range.start as u32).into(), (range.end as u32).into())
32}
33
34/// Tokenizes source into a vector of span-based tokens.
35///
36/// Post-processes the Logos output:
37/// - Coalesces consecutive lexer errors into single `Garbage` tokens
38/// - Splits `StringLiteral` tokens into quote + content + quote
39/// - Splits `RegexPredicateMatch`/`RegexPredicateNoMatch` into operator + whitespace + regex
40pub fn lex(source: &str) -> Vec<Token> {
41    let mut tokens = Vec::new();
42    let mut lexer = SyntaxKind::lexer(source);
43    let mut error_start: Option<usize> = None;
44
45    loop {
46        match lexer.next() {
47            Some(Ok(kind)) => {
48                if let Some(start) = error_start.take() {
49                    let end = lexer.span().start;
50                    tokens.push(Token::new(
51                        SyntaxKind::Garbage,
52                        range_to_text_range(start..end),
53                    ));
54                }
55
56                let span = lexer.span();
57                match kind {
58                    SyntaxKind::StringLiteral => {
59                        split_string_literal(source, span, &mut tokens);
60                    }
61                    SyntaxKind::RegexPredicateMatch => {
62                        split_regex_predicate(source, span, SyntaxKind::OpRegexMatch, &mut tokens);
63                    }
64                    SyntaxKind::RegexPredicateNoMatch => {
65                        split_regex_predicate(
66                            source,
67                            span,
68                            SyntaxKind::OpRegexNoMatch,
69                            &mut tokens,
70                        );
71                    }
72                    _ => {
73                        tokens.push(Token::new(kind, range_to_text_range(span)));
74                    }
75                }
76            }
77            Some(Err(())) => {
78                if error_start.is_none() {
79                    error_start = Some(lexer.span().start);
80                }
81            }
82            None => {
83                if let Some(start) = error_start.take() {
84                    tokens.push(Token::new(
85                        SyntaxKind::Garbage,
86                        range_to_text_range(start..source.len()),
87                    ));
88                }
89                break;
90            }
91        }
92    }
93
94    tokens
95}
96
97/// Splits a string literal token into: quote + content + quote
98fn split_string_literal(source: &str, span: Range<usize>, tokens: &mut Vec<Token>) {
99    let text = &source[span.clone()];
100    let quote_char = text.chars().next().unwrap();
101    let quote_kind = if quote_char == '"' {
102        SyntaxKind::DoubleQuote
103    } else {
104        SyntaxKind::SingleQuote
105    };
106
107    let start = span.start;
108    let end = span.end;
109
110    tokens.push(Token::new(
111        quote_kind,
112        range_to_text_range(start..start + 1),
113    ));
114
115    if end - start > 2 {
116        tokens.push(Token::new(
117            SyntaxKind::StrVal,
118            range_to_text_range(start + 1..end - 1),
119        ));
120    }
121
122    tokens.push(Token::new(quote_kind, range_to_text_range(end - 1..end)));
123}
124
125/// Splits a regex predicate token into: operator + whitespace (if any) + regex literal
126///
127/// Input: `=~ /pattern/` or `!~ /pattern/`
128/// Output: `OpRegexMatch`/`OpRegexNoMatch` + `Whitespace`? + `RegexLiteral`
129fn split_regex_predicate(
130    source: &str,
131    span: Range<usize>,
132    op_kind: SyntaxKind,
133    tokens: &mut Vec<Token>,
134) {
135    let text = &source[span.clone()];
136    let start = span.start;
137
138    // Operator is always 2 chars (=~ or !~)
139    tokens.push(Token::new(op_kind, range_to_text_range(start..start + 2)));
140
141    // Find where whitespace ends (where `/` starts)
142    let regex_start_in_text = text[2..].find('/').unwrap() + 2;
143
144    // Emit whitespace if present
145    if regex_start_in_text > 2 {
146        tokens.push(Token::new(
147            SyntaxKind::Whitespace,
148            range_to_text_range(start + 2..start + regex_start_in_text),
149        ));
150    }
151
152    // Emit regex literal (includes delimiters)
153    tokens.push(Token::new(
154        SyntaxKind::RegexLiteral,
155        range_to_text_range(start + regex_start_in_text..span.end),
156    ));
157}
158
159/// Retrieves the text slice for a token. O(1) slice into source.
160#[inline]
161pub fn token_text<'q>(source: &'q str, token: &Token) -> &'q str {
162    &source[std::ops::Range::<usize>::from(token.span)]
163}