plotnik_compiler/parser/
lexer.rs

1//! Lexer for the query language.
2//!
3//! Produces span-based tokens without storing text - text is sliced from source only when needed.
4//!
5//! ## Error handling
6//!
7//! The lexer coalesces consecutive error characters into single `Garbage` tokens rather
8//! than producing one error per character. This keeps the token stream manageable for malformed input.
9
10use logos::Logos;
11use rowan::TextRange;
12use std::ops::Range;
13
14use super::cst::SyntaxKind;
15
16/// Zero-copy token: kind + span, text retrieved via [`token_text`] when needed.
17#[derive(Debug, Clone, Copy, PartialEq, Eq)]
18pub struct Token {
19    pub kind: SyntaxKind,
20    pub span: TextRange,
21}
22
23impl Token {
24    #[inline]
25    pub fn new(kind: SyntaxKind, span: TextRange) -> Self {
26        Self { kind, span }
27    }
28}
29
30fn range_to_text_range(range: Range<usize>) -> TextRange {
31    TextRange::new((range.start as u32).into(), (range.end as u32).into())
32}
33
34/// Tokenizes source into a vector of span-based tokens.
35///
36/// Post-processes the Logos output:
37/// - Coalesces consecutive lexer errors into single `Garbage` tokens
38/// - Splits `StringLiteral` tokens into quote + content + quote
39/// - Splits `RegexPredicateMatch`/`RegexPredicateNoMatch` into operator + whitespace + regex
40pub fn lex(source: &str) -> Vec<Token> {
41    let mut tokens = Vec::new();
42    let mut lexer = SyntaxKind::lexer(source);
43    let mut error_start: Option<usize> = None;
44
45    loop {
46        match lexer.next() {
47            Some(Ok(kind)) => {
48                if let Some(start) = error_start.take() {
49                    let end = lexer.span().start;
50                    tokens.push(Token::new(
51                        SyntaxKind::Garbage,
52                        range_to_text_range(start..end),
53                    ));
54                }
55
56                let span = lexer.span();
57                match kind {
58                    SyntaxKind::StringLiteral => {
59                        split_string_literal(source, span, &mut tokens);
60                    }
61                    SyntaxKind::RegexPredicateMatch => {
62                        split_regex_predicate(source, span, SyntaxKind::OpRegexMatch, &mut tokens);
63                    }
64                    SyntaxKind::RegexPredicateNoMatch => {
65                        split_regex_predicate(source, span, SyntaxKind::OpRegexNoMatch, &mut tokens);
66                    }
67                    _ => {
68                        tokens.push(Token::new(kind, range_to_text_range(span)));
69                    }
70                }
71            }
72            Some(Err(())) => {
73                if error_start.is_none() {
74                    error_start = Some(lexer.span().start);
75                }
76            }
77            None => {
78                if let Some(start) = error_start.take() {
79                    tokens.push(Token::new(
80                        SyntaxKind::Garbage,
81                        range_to_text_range(start..source.len()),
82                    ));
83                }
84                break;
85            }
86        }
87    }
88
89    tokens
90}
91
92/// Splits a string literal token into: quote + content + quote
93fn split_string_literal(source: &str, span: Range<usize>, tokens: &mut Vec<Token>) {
94    let text = &source[span.clone()];
95    let quote_char = text.chars().next().unwrap();
96    let quote_kind = if quote_char == '"' {
97        SyntaxKind::DoubleQuote
98    } else {
99        SyntaxKind::SingleQuote
100    };
101
102    let start = span.start;
103    let end = span.end;
104
105    tokens.push(Token::new(
106        quote_kind,
107        range_to_text_range(start..start + 1),
108    ));
109
110    if end - start > 2 {
111        tokens.push(Token::new(
112            SyntaxKind::StrVal,
113            range_to_text_range(start + 1..end - 1),
114        ));
115    }
116
117    tokens.push(Token::new(quote_kind, range_to_text_range(end - 1..end)));
118}
119
120/// Splits a regex predicate token into: operator + whitespace (if any) + regex literal
121///
122/// Input: `=~ /pattern/` or `!~ /pattern/`
123/// Output: `OpRegexMatch`/`OpRegexNoMatch` + `Whitespace`? + `RegexLiteral`
124fn split_regex_predicate(
125    source: &str,
126    span: Range<usize>,
127    op_kind: SyntaxKind,
128    tokens: &mut Vec<Token>,
129) {
130    let text = &source[span.clone()];
131    let start = span.start;
132
133    // Operator is always 2 chars (=~ or !~)
134    tokens.push(Token::new(op_kind, range_to_text_range(start..start + 2)));
135
136    // Find where whitespace ends (where `/` starts)
137    let regex_start_in_text = text[2..].find('/').unwrap() + 2;
138
139    // Emit whitespace if present
140    if regex_start_in_text > 2 {
141        tokens.push(Token::new(
142            SyntaxKind::Whitespace,
143            range_to_text_range(start + 2..start + regex_start_in_text),
144        ));
145    }
146
147    // Emit regex literal (includes delimiters)
148    tokens.push(Token::new(
149        SyntaxKind::RegexLiteral,
150        range_to_text_range(start + regex_start_in_text..span.end),
151    ));
152}
153
154/// Retrieves the text slice for a token. O(1) slice into source.
155#[inline]
156pub fn token_text<'q>(source: &'q str, token: &Token) -> &'q str {
157    &source[std::ops::Range::<usize>::from(token.span)]
158}