Skip to main content

drawlang_syntax/
lexer.rs

1//! Lexer. Newlines are significant (they terminate statements and
2//! properties), comments are kept as tokens so the formatter can preserve
3//! them, and `2x4` lexes as a single grid-dimension token.
4
5use crate::diag::Diagnostic;
6use crate::span::Span;
7
/// Every kind of token the lexer can produce.
///
/// Literal-like variants carry their decoded payload; punctuation variants are
/// unit-like and named after the symbol they stand for.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // ---- Tokens that carry a payload ----
    /// Identifier made of ASCII letters, digits and `_` (never starting with a digit).
    Ident(String),
    /// Integer literal (a run of ASCII digits).
    Int(i64),
    /// Floating-point literal of the form `digits.digits`.
    Float(f64),
    /// String literal: unescaped content, `{expr}` interpolation left intact.
    Str(String),
    /// `@accent` — theme token reference. The stored name excludes the `@`.
    AtIdent(String),
    /// `#1a2b3c` / `#abc` — color literal. The stored digits exclude the `#`.
    HexColor(String),
    /// `2x4` — grid dimensions, in the order they were written.
    Dimension(u32, u32),
    /// `// ...` line comment; the stored text is trimmed and excludes the slashes.
    Comment(String),

    // ---- Brackets ----
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,

    // ---- Punctuation and operators ----
    /// `:`
    Colon,
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `..`
    DotDot,
    /// `*`
    Star,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `/`
    Slash,
    /// `%`
    Percent,
    /// `=`
    Eq,

    // ---- Edge arrows ----
    /// `->`
    Arrow,
    /// `<->`
    BidiArrow,
    /// `<-`
    BackArrow,

    // ---- Structure ----
    /// A single `\n`; the lexer emits one token per newline character.
    Newline,
    /// End of input; always the final token of a stream.
    Eof,
}
46
47impl TokenKind {
48    /// Human name used in "expected X, found Y" messages.
49    pub fn describe(&self) -> String {
50        match self {
51            TokenKind::Ident(name) => format!("identifier `{name}`"),
52            TokenKind::Int(v) => format!("number `{v}`"),
53            TokenKind::Float(v) => format!("number `{v}`"),
54            TokenKind::Str(_) => "string literal".into(),
55            TokenKind::AtIdent(name) => format!("theme token `@{name}`"),
56            TokenKind::HexColor(c) => format!("color `#{c}`"),
57            TokenKind::Dimension(a, b) => format!("grid size `{a}x{b}`"),
58            TokenKind::Comment(_) => "comment".into(),
59            TokenKind::LBrace => "`{`".into(),
60            TokenKind::RBrace => "`}`".into(),
61            TokenKind::LParen => "`(`".into(),
62            TokenKind::RParen => "`)`".into(),
63            TokenKind::LBracket => "`[`".into(),
64            TokenKind::RBracket => "`]`".into(),
65            TokenKind::Colon => "`:`".into(),
66            TokenKind::Semi => "`;`".into(),
67            TokenKind::Comma => "`,`".into(),
68            TokenKind::Dot => "`.`".into(),
69            TokenKind::DotDot => "`..`".into(),
70            TokenKind::Star => "`*`".into(),
71            TokenKind::Plus => "`+`".into(),
72            TokenKind::Minus => "`-`".into(),
73            TokenKind::Slash => "`/`".into(),
74            TokenKind::Percent => "`%`".into(),
75            TokenKind::Eq => "`=`".into(),
76            TokenKind::Arrow => "`->`".into(),
77            TokenKind::BidiArrow => "`<->`".into(),
78            TokenKind::BackArrow => "`<-`".into(),
79            TokenKind::Newline => "end of line".into(),
80            TokenKind::Eof => "end of file".into(),
81        }
82    }
83}
84
85#[derive(Debug, Clone)]
86pub struct Token {
87    pub kind: TokenKind,
88    pub span: Span,
89}
90
91pub struct LexOutput {
92    pub tokens: Vec<Token>,
93    pub diagnostics: Vec<Diagnostic>,
94}
95
96pub fn lex(text: &str) -> LexOutput {
97    Lexer {
98        text,
99        bytes: text.as_bytes(),
100        pos: 0,
101        tokens: Vec::new(),
102        diags: Vec::new(),
103    }
104    .run()
105}
106
107struct Lexer<'a> {
108    text: &'a str,
109    bytes: &'a [u8],
110    pos: usize,
111    tokens: Vec<Token>,
112    diags: Vec<Diagnostic>,
113}
114
115impl<'a> Lexer<'a> {
116    fn run(mut self) -> LexOutput {
117        while self.pos < self.bytes.len() {
118            let start = self.pos;
119            let b = self.bytes[self.pos];
120            match b {
121                b' ' | b'\t' | b'\r' => self.pos += 1,
122                b'\n' => {
123                    // One token per newline: the parser uses runs to detect
124                    // blank lines, which the formatter preserves.
125                    self.pos += 1;
126                    self.push(TokenKind::Newline, start);
127                }
128                b'/' if self.peek(1) == Some(b'/') => {
129                    let mut end = self.pos;
130                    while end < self.bytes.len() && self.bytes[end] != b'\n' {
131                        end += 1;
132                    }
133                    let content = self.text[self.pos + 2..end].trim().to_string();
134                    self.pos = end;
135                    self.push(TokenKind::Comment(content), start);
136                }
137                b'"' => self.lex_string(start),
138                b'0'..=b'9' => self.lex_number(start),
139                b'A'..=b'Z' | b'a'..=b'z' | b'_' => self.lex_ident(start),
140                b'@' => {
141                    self.pos += 1;
142                    if self.cur_is_ident_start() {
143                        let name = self.take_ident_text();
144                        self.push(TokenKind::AtIdent(name), start);
145                    } else {
146                        self.error_char(
147                            start,
148                            "`@` must be followed by a theme token name, like `@accent`",
149                        );
150                    }
151                }
152                b'#' => {
153                    self.pos += 1;
154                    let hex_start = self.pos;
155                    while self.pos < self.bytes.len() && self.bytes[self.pos].is_ascii_hexdigit() {
156                        self.pos += 1;
157                    }
158                    let hex = &self.text[hex_start..self.pos];
159                    if hex.len() == 3 || hex.len() == 6 || hex.len() == 8 {
160                        self.push(TokenKind::HexColor(hex.to_string()), start);
161                    } else {
162                        self.diags.push(
163                            Diagnostic::error("E0105", format!("invalid color literal `#{hex}`"))
164                                .with_label(
165                                    Span::new(start, self.pos),
166                                    "expected 3, 6, or 8 hex digits",
167                                )
168                                .with_help("write colors as `#rgb`, `#rrggbb`, or `#rrggbbaa`"),
169                        );
170                    }
171                }
172                b'{' => self.single(TokenKind::LBrace),
173                b'}' => self.single(TokenKind::RBrace),
174                b'(' => self.single(TokenKind::LParen),
175                b')' => self.single(TokenKind::RParen),
176                b'[' => self.single(TokenKind::LBracket),
177                b']' => self.single(TokenKind::RBracket),
178                b':' => self.single(TokenKind::Colon),
179                b';' => self.single(TokenKind::Semi),
180                b',' => self.single(TokenKind::Comma),
181                b'*' => self.single(TokenKind::Star),
182                b'+' => self.single(TokenKind::Plus),
183                b'%' => self.single(TokenKind::Percent),
184                b'=' => self.single(TokenKind::Eq),
185                b'/' => self.single(TokenKind::Slash),
186                b'.' => {
187                    if self.peek(1) == Some(b'.') {
188                        self.pos += 2;
189                        self.push(TokenKind::DotDot, start);
190                    } else {
191                        self.single(TokenKind::Dot);
192                    }
193                }
194                b'-' => {
195                    if self.peek(1) == Some(b'>') {
196                        self.pos += 2;
197                        self.push(TokenKind::Arrow, start);
198                    } else {
199                        self.single(TokenKind::Minus);
200                    }
201                }
202                b'<' => {
203                    if self.peek(1) == Some(b'-') && self.peek(2) == Some(b'>') {
204                        self.pos += 3;
205                        self.push(TokenKind::BidiArrow, start);
206                    } else if self.peek(1) == Some(b'-') {
207                        self.pos += 2;
208                        self.push(TokenKind::BackArrow, start);
209                    } else {
210                        self.error_char(
211                            start,
212                            "`<` is only used in the `<->` and `<-` edge arrows",
213                        );
214                    }
215                }
216                _ => {
217                    let ch_len = self.text[self.pos..]
218                        .chars()
219                        .next()
220                        .map(|c| c.len_utf8())
221                        .unwrap_or(1);
222                    self.pos += ch_len;
223                    let ch = &self.text[start..self.pos];
224                    self.diags.push(
225                        Diagnostic::error("E0101", format!("unexpected character `{ch}`"))
226                            .with_label(
227                                Span::new(start, self.pos),
228                                "not valid drawlang syntax here",
229                            ),
230                    );
231                }
232            }
233        }
234        let end = self.bytes.len();
235        if !matches!(
236            self.tokens.last().map(|t| &t.kind),
237            Some(TokenKind::Newline) | None
238        ) {
239            self.tokens.push(Token {
240                kind: TokenKind::Newline,
241                span: Span::new(end, end),
242            });
243        }
244        self.tokens.push(Token {
245            kind: TokenKind::Eof,
246            span: Span::new(end, end),
247        });
248        LexOutput {
249            tokens: self.tokens,
250            diagnostics: self.diags,
251        }
252    }
253
254    fn peek(&self, ahead: usize) -> Option<u8> {
255        self.bytes.get(self.pos + ahead).copied()
256    }
257
258    fn push(&mut self, kind: TokenKind, start: usize) {
259        self.tokens.push(Token {
260            kind,
261            span: Span::new(start, self.pos),
262        });
263    }
264
265    fn single(&mut self, kind: TokenKind) {
266        let start = self.pos;
267        self.pos += 1;
268        self.push(kind, start);
269    }
270
271    fn error_char(&mut self, start: usize, help: &str) {
272        let ch = &self.text[start..self.pos.max(start + 1).min(self.text.len())];
273        self.diags.push(
274            Diagnostic::error("E0101", format!("unexpected character `{ch}`"))
275                .with_label(Span::new(start, self.pos.max(start + 1)), "not valid here")
276                .with_help(help),
277        );
278    }
279
280    fn cur_is_ident_start(&self) -> bool {
281        matches!(
282            self.bytes.get(self.pos),
283            Some(b'A'..=b'Z' | b'a'..=b'z' | b'_')
284        )
285    }
286
287    fn take_ident_text(&mut self) -> String {
288        let start = self.pos;
289        while matches!(
290            self.bytes.get(self.pos),
291            Some(b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
292        ) {
293            self.pos += 1;
294        }
295        self.text[start..self.pos].to_string()
296    }
297
298    fn lex_ident(&mut self, start: usize) {
299        let name = self.take_ident_text();
300        self.push(TokenKind::Ident(name), start);
301    }
302
303    fn lex_number(&mut self, start: usize) {
304        while matches!(self.bytes.get(self.pos), Some(b'0'..=b'9')) {
305            self.pos += 1;
306        }
307        // `2x4` → grid dimension token (digits, `x`, digits, then non-ident).
308        if self.bytes.get(self.pos) == Some(&b'x')
309            && matches!(self.bytes.get(self.pos + 1), Some(b'0'..=b'9'))
310        {
311            let cols_str = &self.text[start..self.pos];
312            self.pos += 1; // consume `x`
313            let rows_start = self.pos;
314            while matches!(self.bytes.get(self.pos), Some(b'0'..=b'9')) {
315                self.pos += 1;
316            }
317            let rows_str = &self.text[rows_start..self.pos];
318            // Reject `2x4foo`.
319            if self.cur_is_ident_start() {
320                self.take_ident_text();
321                self.diags.push(
322                    Diagnostic::error("E0105", "malformed grid size")
323                        .with_label(Span::new(start, self.pos), "expected something like `2x4`"),
324                );
325                return;
326            }
327            let a: u32 = cols_str.parse().unwrap_or(0);
328            let b: u32 = rows_str.parse().unwrap_or(0);
329            self.push(TokenKind::Dimension(a, b), start);
330            return;
331        }
332        // Float? Only if `.` is followed by a digit (so `0..4` stays Int DotDot Int).
333        if self.bytes.get(self.pos) == Some(&b'.')
334            && matches!(self.bytes.get(self.pos + 1), Some(b'0'..=b'9'))
335        {
336            self.pos += 1;
337            while matches!(self.bytes.get(self.pos), Some(b'0'..=b'9')) {
338                self.pos += 1;
339            }
340            let v: f64 = self.text[start..self.pos].parse().unwrap();
341            self.push(TokenKind::Float(v), start);
342            return;
343        }
344        // Trailing ident chars (`16px`, `2foo`) are an error worth catching early.
345        if self.cur_is_ident_start() {
346            let unit_start = self.pos;
347            let unit = self.take_ident_text();
348            self.diags.push(
349                Diagnostic::error(
350                    "E0105",
351                    format!("numbers take no unit suffix; found `{unit}`"),
352                )
353                .with_label(Span::new(unit_start, self.pos), "remove this suffix")
354                .with_help("drawlang lengths are always in pixels; write the bare number"),
355            );
356            return;
357        }
358        let v: i64 = self.text[start..self.pos].parse().unwrap_or(0);
359        self.push(TokenKind::Int(v), start);
360    }
361
362    fn lex_string(&mut self, start: usize) {
363        self.pos += 1; // opening quote
364        let mut value = String::new();
365        let mut brace_depth = 0usize;
366        loop {
367            match self.bytes.get(self.pos) {
368                None | Some(b'\n') => {
369                    self.diags.push(
370                        Diagnostic::error("E0102", "unterminated string literal")
371                            .with_label(
372                                Span::new(start, self.pos),
373                                "string starts here and never closes",
374                            )
375                            .with_help("add a closing `\"` before the end of the line"),
376                    );
377                    self.push(TokenKind::Str(value), start);
378                    return;
379                }
380                Some(b'"') if brace_depth == 0 => {
381                    self.pos += 1;
382                    break;
383                }
384                Some(b'\\') => {
385                    self.pos += 1;
386                    match self.bytes.get(self.pos) {
387                        Some(b'n') => value.push('\n'),
388                        Some(b't') => value.push('\t'),
389                        Some(b'"') => value.push('"'),
390                        Some(b'\\') => value.push('\\'),
391                        // Escaped braces become literal braces, marked so the
392                        // interpolation pass leaves them alone.
393                        Some(b'{') => value.push('\u{1}'),
394                        Some(b'}') => value.push('\u{2}'),
395                        other => {
396                            let ch = other.map(|&b| b as char).unwrap_or('?');
397                            self.diags.push(
398                                Diagnostic::error("E0102", format!("unknown escape `\\{ch}`"))
399                                    .with_label(
400                                        Span::new(self.pos - 1, self.pos + 1),
401                                        "not a valid escape sequence",
402                                    )
403                                    .with_help(r#"valid escapes are \" \\ \n \t \{ \}"#),
404                            );
405                        }
406                    }
407                    self.pos += 1;
408                }
409                Some(&b'{') => {
410                    brace_depth += 1;
411                    value.push('{');
412                    self.pos += 1;
413                }
414                Some(&b'}') => {
415                    brace_depth = brace_depth.saturating_sub(1);
416                    value.push('}');
417                    self.pos += 1;
418                }
419                Some(_) => {
420                    let ch = self.text[self.pos..].chars().next().unwrap();
421                    value.push(ch);
422                    self.pos += ch.len_utf8();
423                }
424            }
425        }
426        if brace_depth > 0 {
427            self.diags.push(
428                Diagnostic::error("E0104", "unbalanced `{` in string interpolation")
429                    .with_label(
430                        Span::new(start, self.pos),
431                        "this string has an unclosed `{`",
432                    )
433                    .with_help(
434                        r#"close the interpolation (`"GPU {i}"`) or escape the brace as `\{`"#,
435                    ),
436            );
437        }
438        self.push(TokenKind::Str(value), start);
439    }
440}