Skip to main content

kcl_syntax/
lexer.rs

1//! Logos-based lexer for KCL source.
2
3use std::ops::Range;
4
5use logos::Logos;
6
7use crate::syntax_kind::SyntaxKind;
8
9/// Lossless tokenization of a KCL source string.
10///
11/// `LexedSource` keeps the original source and the full token sequence,
12/// including whitespace, comments, and recovery tokens. Lexical errors are
13/// represented as token kinds such as [`SyntaxKind::Unknown`],
14/// [`SyntaxKind::UnterminatedString`], and
15/// [`SyntaxKind::UnterminatedBlockComment`].
16#[derive(Clone, Debug, Eq, PartialEq)]
17pub struct LexedSource<'a> {
18    source: &'a str,
19    tokens: Vec<Token<'a>>,
20}
21
22impl<'a> LexedSource<'a> {
23    /// Returns the original source string.
24    pub fn as_str(&self) -> &'a str {
25        self.source
26    }
27
28    /// Returns all tokens, including trivia and recovery tokens.
29    pub fn tokens(&self) -> &[Token<'a>] {
30        &self.tokens
31    }
32
33    /// Iterates over all tokens by reference.
34    pub fn iter(&self) -> impl Iterator<Item = &Token<'a>> {
35        self.tokens.iter()
36    }
37
38    /// Consumes the lexed source and returns its token vector.
39    pub fn into_tokens(self) -> Vec<Token<'a>> {
40        self.tokens
41    }
42
43    /// Returns the number of tokens.
44    pub fn len(&self) -> usize {
45        self.tokens.len()
46    }
47
48    /// Returns true when the source produced no tokens.
49    pub fn is_empty(&self) -> bool {
50        self.tokens.is_empty()
51    }
52}
53
54impl<'a> IntoIterator for LexedSource<'a> {
55    type IntoIter = std::vec::IntoIter<Token<'a>>;
56    type Item = Token<'a>;
57
58    fn into_iter(self) -> Self::IntoIter {
59        self.tokens.into_iter()
60    }
61}
62
63impl<'a, 's> IntoIterator for &'s LexedSource<'a> {
64    type IntoIter = std::slice::Iter<'s, Token<'a>>;
65    type Item = &'s Token<'a>;
66
67    fn into_iter(self) -> Self::IntoIter {
68        self.tokens.iter()
69    }
70}
71
72/// A single KCL token.
73///
74/// Token text is borrowed from the original source. Ranges are byte offsets into
75/// that source string.
76#[derive(Clone, Debug, Eq, PartialEq)]
77pub struct Token<'a> {
78    kind: SyntaxKind,
79    text: &'a str,
80    range: Range<usize>,
81}
82
83impl<'a> Token<'a> {
84    /// Returns this token's syntax kind.
85    pub fn kind(&self) -> SyntaxKind {
86        self.kind
87    }
88
89    /// Returns the exact source text for this token.
90    pub fn text(&self) -> &'a str {
91        self.text
92    }
93
94    /// Returns this token's byte range in the original source string.
95    pub fn range(&self) -> Range<usize> {
96        self.range.clone()
97    }
98}
99
/// Token shapes recognised directly by the Logos-generated lexer.
///
/// This enum is private to the lexer: `lex` translates each variant into the
/// corresponding `SyntaxKind`. Keywords are not distinguished here; they lex
/// as `Word` and are classified afterwards by `keyword_or_word`.
#[derive(Clone, Copy, Debug, Logos, PartialEq)]
enum RawTokenKind {
    /// A run of spaces, tabs, line feeds, and carriage returns.
    #[regex(r"[ \t\n\r]+")]
    Whitespace,
    /// A double- or single-quoted string closed on the same line. A backslash
    /// escapes any following character except a line break.
    #[regex(r#""([^"\\\n\r]|\\[^\n\r])*""#)]
    #[regex(r#"'([^'\\\n\r]|\\[^\n\r])*'"#)]
    String,
    /// The same shapes as `String` but without the closing quote. The
    /// `unterminated_string` callback then extends the token to the end of
    /// the current line.
    #[regex(r#""([^"\\\n\r]|\\[^\n\r])*"#, unterminated_string)]
    #[regex(r#"'([^'\\\n\r]|\\[^\n\r])*"#, unterminated_string)]
    UnterminatedString,
    /// `//` up to, but not including, the next line break.
    // NOTE(review): `allow_greedy = true` presumably opts in to the unbounded
    // `[^\n\r]*` repetition; confirm against the Logos version in use.
    #[regex(r"//[^\n\r]*", allow_greedy = true)]
    LineComment,
    /// A block comment. Only the `/*` opener is matched by the pattern; the
    /// `block_comment` callback consumes the body through the closing `*/`,
    /// or to the end of input when there is none.
    #[regex(r"/\*", block_comment)]
    BlockComment,
    /// A numeric literal: digits with an optional fractional part, or a
    /// fraction with a leading `.`, followed by an optional `_` and an
    /// optional suffix (`mm`, `cm`, `m`, `inch`, `in`, `ft`, `yd`, `deg`,
    /// `rad`, or `?`).
    #[regex(r"[0-9]+(\.[0-9]+)?_?(mm|cm|m|inch|in|ft|yd|deg|rad|\?)?")]
    #[regex(r"\.[0-9]+_?(mm|cm|m|inch|in|ft|yd|deg|rad|\?)?")]
    Number,
    // Multi-character range and path punctuation.
    #[token("..<")]
    DoublePeriodLessThan,
    #[token("..")]
    DoublePeriod,
    #[token("::")]
    DoubleColon,
    /// An identifier-shaped lexeme: starts with an alphabetic character or
    /// `_`, continues with alphabetic characters, ASCII digits, or `_`.
    #[regex(r"[\p{Alphabetic}_][\p{Alphabetic}0-9_]*")]
    Word,
    // Two-character operators.
    #[token(">=")]
    GtEq,
    #[token("<=")]
    LtEq,
    #[token("==")]
    EqEq,
    #[token("=>")]
    FatArrow,
    #[token("!=")]
    BangEq,
    #[token("|>")]
    PipeGt,
    // Single-character operators.
    #[token("*")]
    Star,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("\\")]
    Backslash,
    #[token("^")]
    Caret,
    // Logical and bitwise-style operators, doubled and single forms.
    #[token("||")]
    PipePipe,
    #[token("&&")]
    AmpAmp,
    #[token("|")]
    Pipe,
    #[token("&")]
    Amp,
    // Delimiters.
    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,
    #[token("{")]
    OpenBrace,
    #[token("}")]
    CloseBrace,
    #[token("[")]
    OpenBracket,
    #[token("]")]
    CloseBracket,
    // Remaining single-character punctuation.
    #[token("#")]
    Hash,
    #[token("!")]
    Bang,
    #[token("$")]
    Dollar,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Period,
    #[token("?")]
    QuestionMark,
    #[token("@")]
    At,
    #[token(";")]
    SemiColon,
}
196
197/// Lexes KCL source into a lossless token sequence.
198///
199/// This function does not return lexical errors separately. Invalid or
200/// recoverable input is preserved in the token stream with recovery token kinds.
201pub fn lex(source: &str) -> LexedSource<'_> {
202    let mut lexer = RawTokenKind::lexer(source);
203    let mut tokens = Vec::new();
204
205    while let Some(raw_kind) = lexer.next() {
206        let range = lexer.span();
207        let text = &source[range.clone()];
208        let kind = match raw_kind {
209            Ok(RawTokenKind::Whitespace) => SyntaxKind::Whitespace,
210            Ok(RawTokenKind::String) => SyntaxKind::String,
211            Ok(RawTokenKind::UnterminatedString) => SyntaxKind::UnterminatedString,
212            Ok(RawTokenKind::LineComment) => SyntaxKind::LineComment,
213            Ok(RawTokenKind::BlockComment) if text.ends_with("*/") => SyntaxKind::BlockComment,
214            Ok(RawTokenKind::BlockComment) => SyntaxKind::UnterminatedBlockComment,
215            Ok(RawTokenKind::Number) => SyntaxKind::Number,
216            Ok(RawTokenKind::DoublePeriodLessThan) => SyntaxKind::DoublePeriodLessThan,
217            Ok(RawTokenKind::DoublePeriod) => SyntaxKind::DoublePeriod,
218            Ok(RawTokenKind::DoubleColon) => SyntaxKind::DoubleColon,
219            Ok(RawTokenKind::Word) => keyword_or_word(text),
220            Ok(RawTokenKind::GtEq) => SyntaxKind::GtEq,
221            Ok(RawTokenKind::LtEq) => SyntaxKind::LtEq,
222            Ok(RawTokenKind::EqEq) => SyntaxKind::EqEq,
223            Ok(RawTokenKind::FatArrow) => SyntaxKind::FatArrow,
224            Ok(RawTokenKind::BangEq) => SyntaxKind::BangEq,
225            Ok(RawTokenKind::PipeGt) => SyntaxKind::PipeGt,
226            Ok(RawTokenKind::Star) => SyntaxKind::Star,
227            Ok(RawTokenKind::Plus) => SyntaxKind::Plus,
228            Ok(RawTokenKind::Minus) => SyntaxKind::Minus,
229            Ok(RawTokenKind::Slash) => SyntaxKind::Slash,
230            Ok(RawTokenKind::Percent) => SyntaxKind::Percent,
231            Ok(RawTokenKind::Eq) => SyntaxKind::Eq,
232            Ok(RawTokenKind::Lt) => SyntaxKind::Lt,
233            Ok(RawTokenKind::Gt) => SyntaxKind::Gt,
234            Ok(RawTokenKind::Backslash) => SyntaxKind::Backslash,
235            Ok(RawTokenKind::Caret) => SyntaxKind::Caret,
236            Ok(RawTokenKind::PipePipe) => SyntaxKind::PipePipe,
237            Ok(RawTokenKind::AmpAmp) => SyntaxKind::AmpAmp,
238            Ok(RawTokenKind::Pipe) => SyntaxKind::Pipe,
239            Ok(RawTokenKind::Amp) => SyntaxKind::Amp,
240            Ok(RawTokenKind::OpenParen) => SyntaxKind::OpenParen,
241            Ok(RawTokenKind::CloseParen) => SyntaxKind::CloseParen,
242            Ok(RawTokenKind::OpenBrace) => SyntaxKind::OpenBrace,
243            Ok(RawTokenKind::CloseBrace) => SyntaxKind::CloseBrace,
244            Ok(RawTokenKind::OpenBracket) => SyntaxKind::OpenBracket,
245            Ok(RawTokenKind::CloseBracket) => SyntaxKind::CloseBracket,
246            Ok(RawTokenKind::Hash) => SyntaxKind::Hash,
247            Ok(RawTokenKind::Bang) => SyntaxKind::Bang,
248            Ok(RawTokenKind::Dollar) => SyntaxKind::Dollar,
249            Ok(RawTokenKind::Comma) => SyntaxKind::Comma,
250            Ok(RawTokenKind::Colon) => SyntaxKind::Colon,
251            Ok(RawTokenKind::Period) => SyntaxKind::Period,
252            Ok(RawTokenKind::QuestionMark) => SyntaxKind::QuestionMark,
253            Ok(RawTokenKind::At) => SyntaxKind::At,
254            Ok(RawTokenKind::SemiColon) => SyntaxKind::SemiColon,
255            Err(()) => SyntaxKind::Unknown,
256        };
257        tokens.push(Token { kind, text, range });
258    }
259
260    LexedSource { source, tokens }
261}
262
263fn keyword_or_word(text: &str) -> SyntaxKind {
264    match text {
265        "if" => SyntaxKind::IfKw,
266        "else" => SyntaxKind::ElseKw,
267        "for" => SyntaxKind::ForKw,
268        "while" => SyntaxKind::WhileKw,
269        "return" => SyntaxKind::ReturnKw,
270        "break" => SyntaxKind::BreakKw,
271        "continue" => SyntaxKind::ContinueKw,
272        "fn" => SyntaxKind::FnKw,
273        "let" => SyntaxKind::LetKw,
274        "mut" => SyntaxKind::MutKw,
275        "as" => SyntaxKind::AsKw,
276        "loop" => SyntaxKind::LoopKw,
277        "true" => SyntaxKind::TrueKw,
278        "false" => SyntaxKind::FalseKw,
279        "nil" => SyntaxKind::NilKw,
280        "and" => SyntaxKind::AndKw,
281        "or" => SyntaxKind::OrKw,
282        "not" => SyntaxKind::NotKw,
283        "var" => SyntaxKind::VarKw,
284        "const" => SyntaxKind::ConstKw,
285        "import" => SyntaxKind::ImportKw,
286        "export" => SyntaxKind::ExportKw,
287        "type" => SyntaxKind::TypeKw,
288        "interface" => SyntaxKind::InterfaceKw,
289        "new" => SyntaxKind::NewKw,
290        "self" => SyntaxKind::SelfKw,
291        "record" => SyntaxKind::RecordKw,
292        "struct" => SyntaxKind::StructKw,
293        "object" => SyntaxKind::ObjectKw,
294        _ => SyntaxKind::Word,
295    }
296}
297
298fn block_comment(lexer: &mut logos::Lexer<'_, RawTokenKind>) {
299    if let Some(end) = lexer.remainder().find("*/") {
300        lexer.bump(end + 2);
301    } else {
302        lexer.bump(lexer.remainder().len());
303    }
304}
305
306fn unterminated_string(lexer: &mut logos::Lexer<'_, RawTokenKind>) {
307    let until_line_end = lexer
308        .remainder()
309        .find(['\n', '\r'])
310        .unwrap_or_else(|| lexer.remainder().len());
311    lexer.bump(until_line_end);
312}