cabalist_parser/
lexer.rs

1//! Hand-written lexer for `.cabal` files.
2//!
3//! The lexer operates line-by-line, classifying each line and producing tokens
4//! that capture every byte of the input (via spans and trivia). The parser
5//! consumes these tokens to build the CST.
6//!
7//! Key properties:
8//! - Zero-copy: tokens are [`Span`] references into the source string.
9//! - Full coverage: every byte is accounted for by a token span or trivia piece.
10//! - Indentation tracking: each token records its column position (tabs = 8).
11
12use crate::span::Span;
13
14// ---------------------------------------------------------------------------
15// Token types
16// ---------------------------------------------------------------------------
17
/// The kind of a token.
///
/// Structural kinds (`FieldName`, `Colon`, `SectionHeader`, `SectionArg`,
/// `If`, `Else`, `Elif`) are produced once per line by the line tokenizers;
/// the punctuation / operator kinds are produced only while tokenizing the
/// condition of an `if` / `elif` line.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// An identifier that appears before a `:`: a field name.
    FieldName,
    /// The `:` separator after a field name.
    Colon,
    /// A section keyword: `library`, `executable`, `test-suite`, `benchmark`,
    /// `flag`, `source-repository`, `common`, `custom-setup`,
    /// `foreign-library` (matched ASCII-case-insensitively).
    SectionHeader,
    /// The argument after a section header (e.g. the name in `executable foo`).
    /// Spans everything after the keyword up to the last non-whitespace byte
    /// of the line.
    SectionArg,
    /// The `if` keyword in a conditional.
    If,
    /// The `else` keyword.
    Else,
    /// The `elif` keyword (rare, but in the spec).
    Elif,
    /// Raw value text (the part after a colon on the same line, or a
    /// continuation line that is part of a field value). Also used for
    /// identifiers / version numbers inside a condition, for any text that
    /// follows `else` on the same line, and for a stray single `&` or `|`.
    Value,
    /// A comma `,`.
    Comma,
    /// `(`.
    LParen,
    /// `)`.
    RParen,
    /// `!` (negation in conditions).
    Not,
    /// `&&`.
    And,
    /// `||`.
    Or,
    /// A comparison operator: `==`, `>=`, `<=`, `>`, `<`. A lone `=` inside
    /// a condition is also emitted with this kind.
    CompOp,
    /// A line comment (starts with `--`). A comment that is the only content
    /// on a line is emitted as a standalone token of this kind; a `--`
    /// comment trailing a condition expression is instead recorded as
    /// [`TriviaKind::Comment`] trivia on the next token.
    Comment,
    /// End of file. Always the last token; carries any trailing trivia.
    Eof,
}
60
/// The kind of a trivia piece.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TriviaKind {
    /// Horizontal whitespace (spaces / tabs) within a line.
    Whitespace,
    /// A line-feed (`\n`) or carriage-return + line-feed (`\r\n`). The line
    /// splitter also accepts a lone `\r` as a line terminator.
    Newline,
    /// A `--` comment (including the `--` prefix and everything to EOL,
    /// but *not* the newline itself).
    Comment,
}
72
/// A piece of trivia attached to a token.
///
/// Trivia is text that carries no syntactic meaning (whitespace, newlines,
/// inline comments) but must be kept so every source byte stays accounted for.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TriviaPiece {
    /// What sort of trivia this is.
    pub kind: TriviaKind,
    /// Byte range of the trivia in the source string.
    pub span: Span,
}
79
/// A single token produced by the lexer.
///
/// A token does not own any text: `span` and the trivia spans are byte
/// ranges into the source string that was passed to [`tokenize`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Classification of the token.
    pub kind: TokenKind,
    /// Byte span of the meaningful content.
    pub span: Span,
    /// Column (0-based) of the first byte, with tabs expanded to multiples
    /// of 8. Every non-tab byte counts as one column. Always 0 for `Eof`.
    pub indent: usize,
    /// Trivia that precedes this token (whitespace, newlines, comments),
    /// in source order.
    pub leading_trivia: Vec<TriviaPiece>,
}
92
93// ---------------------------------------------------------------------------
94// Line classification (first pass)
95// ---------------------------------------------------------------------------
96
/// High-level classification of a source line. The lexer first splits the
/// input into lines, classifies each, and then produces tokens.
///
/// Classification looks only at the start of the line; the one exception is
/// the braced-block post-pass, which overrides the kind of lines inside a
/// `field: { ... }` block.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LineKind {
    /// A blank line (only whitespace).
    Blank,
    /// A comment line (leading whitespace + `--`).
    Comment,
    /// A section header keyword (e.g. `library`, `executable foo`).
    SectionHeader,
    /// A conditional keyword (`if`, `else`, `elif`).
    Conditional,
    /// A field: `name: value`.
    Field,
    /// A continuation / value line that doesn't match any of the above.
    /// Non-blank lines inside a braced freeform block are also forced to
    /// this kind, including the closing `}` line.
    Value,
}
114
/// Internal representation of one source line before tokenization.
///
/// All offsets are byte offsets into the full source string, ordered as
/// `start <= content_start <= end == newline_start <= line_end_with_newline`.
#[derive(Debug, Clone)]
struct RawLine {
    /// Byte offset of the first character of the line in the source.
    start: usize,
    /// Byte offset one past the last character (before the newline, if any).
    end: usize,
    /// Byte offset of the newline(s) at the end (`\n`, `\r\n`, or a lone
    /// `\r`). Equal to `end` if no newline (last line of file without
    /// trailing newline).
    newline_start: usize,
    /// Byte offset one past the newline (i.e. start of next line, or source
    /// len).
    line_end_with_newline: usize,
    /// Column of first non-whitespace character (tabs expanded to multiples
    /// of 8). `None` if the line is blank.
    indent: Option<usize>,
    /// Byte offset of the first non-whitespace character. Equal to `end`
    /// when the line is blank.
    content_start: usize,
    /// Classification.
    kind: LineKind,
}
136
137// ---------------------------------------------------------------------------
138// Section header keywords
139// ---------------------------------------------------------------------------
140
/// Words that open a section when they are the first word on a line.
/// Stored in lowercase; `is_section_keyword` compares against them
/// ASCII-case-insensitively.
const SECTION_KEYWORDS: &[&str] = &[
    "library",
    "executable",
    "test-suite",
    "benchmark",
    "flag",
    "source-repository",
    "common",
    "custom-setup",
    "foreign-library",
];

/// Words that introduce a conditional line. Stored in lowercase;
/// `is_conditional_keyword` compares against them ASCII-case-insensitively.
const CONDITIONAL_KEYWORDS: &[&str] = &["if", "else", "elif"];
154
155/// Check whether `word` (already lowercased) is a section keyword.
156fn is_section_keyword(word: &str) -> bool {
157    SECTION_KEYWORDS
158        .iter()
159        .any(|kw| kw.eq_ignore_ascii_case(word))
160}
161
162fn is_conditional_keyword(word: &str) -> bool {
163    CONDITIONAL_KEYWORDS
164        .iter()
165        .any(|kw| kw.eq_ignore_ascii_case(word))
166}
167
168// ---------------------------------------------------------------------------
169// Helpers
170// ---------------------------------------------------------------------------
171
/// Visual column reached after the bytes `source[start..end]`, assuming
/// `start` sits at column 0.
///
/// A tab jumps to the next multiple of 8; every other byte (including each
/// byte of a multi-byte UTF-8 sequence) advances the column by one.
fn visual_column(source: &[u8], start: usize, end: usize) -> usize {
    source[start..end].iter().fold(0usize, |column, &byte| {
        if byte == b'\t' {
            // Round down to the current tab stop, then move to the next one.
            column - column % 8 + 8
        } else {
            column + 1
        }
    })
}
185
/// Locate the "word" (ASCII letters, digits, hyphens, underscores) that
/// begins at `pos` in `source`.
///
/// Returns `(start, end)` byte offsets, where `start == pos` and `end` is one
/// past the last word byte. The pair is empty (`start == end`) when the byte
/// at `pos` is not a word byte or `pos` is at / beyond the end of `source`.
fn scan_word(source: &[u8], pos: usize) -> (usize, usize) {
    let word_len = source.get(pos..).map_or(0, |rest| {
        rest.iter()
            .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
            .count()
    });
    (pos, pos + word_len)
}
198
/// Advance past any run of spaces and tabs that starts at `pos`.
///
/// Returns the offset of the first byte that is neither a space nor a tab,
/// or `pos` unchanged when there is nothing to skip (including when `pos`
/// is at / beyond the end of `source`).
fn skip_hspace(source: &[u8], pos: usize) -> usize {
    let skipped = source.get(pos..).map_or(0, |rest| {
        rest.iter()
            .take_while(|&&b| matches!(b, b' ' | b'\t'))
            .count()
    });
    pos + skipped
}
207
208// ---------------------------------------------------------------------------
209// Line splitting
210// ---------------------------------------------------------------------------
211
212/// Split `source` into `RawLine`s and classify each one.
213fn split_lines(source: &str) -> Vec<RawLine> {
214    let bytes = source.as_bytes();
215    let len = bytes.len();
216    let mut lines = Vec::new();
217    let mut pos = 0;
218
219    while pos <= len {
220        let line_start = pos;
221
222        // Find end of line content (before newline).
223        let mut end = pos;
224        while end < len && bytes[end] != b'\n' && bytes[end] != b'\r' {
225            end += 1;
226        }
227        let content_end = end;
228
229        // Consume newline.
230        let newline_start = end;
231        if end < len && bytes[end] == b'\r' {
232            end += 1;
233        }
234        if end < len && bytes[end] == b'\n' {
235            end += 1;
236        }
237        let line_end = end;
238
239        // Find first non-whitespace.
240        let mut first_non_ws = line_start;
241        while first_non_ws < content_end
242            && (bytes[first_non_ws] == b' ' || bytes[first_non_ws] == b'\t')
243        {
244            first_non_ws += 1;
245        }
246
247        let indent = if first_non_ws == content_end {
248            None // blank line
249        } else {
250            Some(visual_column(bytes, line_start, first_non_ws))
251        };
252
253        let kind = classify_line(source, first_non_ws, content_end, indent.is_none());
254
255        lines.push(RawLine {
256            start: line_start,
257            end: content_end,
258            newline_start,
259            line_end_with_newline: line_end,
260            indent,
261            content_start: first_non_ws,
262            kind,
263        });
264
265        // Guard against infinite loop on last line without newline.
266        if line_end == pos {
267            break;
268        }
269        pos = line_end;
270    }
271
272    // Post-process: handle braced freeform text blocks.
273    // When a Field line's value ends with `{`, all subsequent lines up to and
274    // including a line that is just `}` are reclassified as Value lines so
275    // the parser treats them as continuation values.
276    reclassify_braced_freeform_blocks(&mut lines, source);
277
278    lines
279}
280
281/// Detect braced freeform text blocks (e.g. `Description: { ... }`) and
282/// reclassify contained lines as `Value` so the parser treats them as
283/// field continuation lines.
284fn reclassify_braced_freeform_blocks(lines: &mut [RawLine], source: &str) {
285    let bytes = source.as_bytes();
286    let mut i = 0;
287    while i < lines.len() {
288        // Look for a Field line whose value part ends with `{`.
289        if lines[i].kind == LineKind::Field {
290            let line = &lines[i];
291            // Check if the content (before newline) ends with `{` (possibly
292            // with trailing whitespace).
293            let mut check = line.end;
294            while check > line.content_start
295                && (bytes[check - 1] == b' ' || bytes[check - 1] == b'\t')
296            {
297                check -= 1;
298            }
299            if check > line.content_start && bytes[check - 1] == b'{' {
300                // This is a braced freeform text block. Reclassify all
301                // following lines as Value until we find a line that is
302                // just `}` (possibly with surrounding whitespace).
303                i += 1;
304                while i < lines.len() {
305                    let inner = &lines[i];
306                    // Check if this line is just `}` (with optional whitespace).
307                    let trimmed_start = inner.content_start;
308                    let trimmed_end = inner.end;
309                    if trimmed_start < trimmed_end
310                        && bytes[trimmed_start] == b'}'
311                        && is_only_closing_brace(bytes, trimmed_start, trimmed_end)
312                    {
313                        // The `}` line itself: reclassify as Value and stop.
314                        lines[i].kind = LineKind::Value;
315                        i += 1;
316                        break;
317                    }
318                    // Reclassify as Value (unless it's a blank line, which we keep).
319                    if inner.kind != LineKind::Blank {
320                        lines[i].kind = LineKind::Value;
321                    }
322                    i += 1;
323                }
324                continue;
325            }
326        }
327        i += 1;
328    }
329}
330
/// Report whether `bytes[start..end]` is a single `}` followed by nothing
/// but spaces and tabs. An empty range is never a closing brace.
fn is_only_closing_brace(bytes: &[u8], start: usize, end: usize) -> bool {
    start < end
        && bytes[start] == b'}'
        && bytes[start + 1..end]
            .iter()
            .all(|&b| matches!(b, b' ' | b'\t'))
}
344
345/// Classify a single line based on its content.
346fn classify_line(
347    source: &str,
348    content_start: usize,
349    content_end: usize,
350    is_blank: bool,
351) -> LineKind {
352    if is_blank {
353        return LineKind::Blank;
354    }
355
356    let bytes = source.as_bytes();
357
358    // Comment?
359    if content_start + 1 < content_end
360        && bytes[content_start] == b'-'
361        && bytes[content_start + 1] == b'-'
362    {
363        // Make sure it's `--` not `---foo` which could be a field name with
364        // lots of hyphens. In practice `--` is always a comment start.
365        return LineKind::Comment;
366    }
367
368    // Grab the first word.
369    let (word_start, word_end) = scan_word(bytes, content_start);
370    if word_start == word_end {
371        // No word found: treat as value.
372        return LineKind::Value;
373    }
374    let word = &source[word_start..word_end];
375
376    // Section header?
377    if is_section_keyword(word) {
378        // A section keyword must be followed by EOL, whitespace + section arg,
379        // or `{`. It must NOT be followed by punctuation like `.`: that
380        // indicates a continuation/description line (e.g., "  library. The...").
381        if word_end >= content_end {
382            // Keyword at EOL (e.g., `library\n`).
383            return LineKind::SectionHeader;
384        }
385        let ch = bytes[word_end];
386        if ch == b' ' || ch == b'\t' || ch == b'{' {
387            return LineKind::SectionHeader;
388        }
389    }
390
391    // Conditional?
392    if is_conditional_keyword(word) {
393        let after_word = skip_hspace(bytes, word_end);
394        if after_word >= content_end || bytes[after_word] != b':' {
395            return LineKind::Conditional;
396        }
397    }
398
399    // Field? Look for `:` after the first word (possibly with spaces).
400    // Field names can contain letters, digits, hyphens; e.g. `build-depends:`
401    let after_word = skip_hspace(bytes, word_end);
402    if after_word < content_end && bytes[after_word] == b':' {
403        return LineKind::Field;
404    }
405
406    // Otherwise, it's a value / continuation line.
407    LineKind::Value
408}
409
410// ---------------------------------------------------------------------------
411// Tokenizer
412// ---------------------------------------------------------------------------
413
414/// Tokenize a `.cabal` source string into a flat list of [`Token`]s.
415///
416/// The returned token list always ends with a [`TokenKind::Eof`] token.
417/// Every byte of the input is covered by either a token span or a trivia
418/// piece on one of the tokens.
419pub fn tokenize(source: &str) -> Vec<Token> {
420    let lines = split_lines(source);
421    let mut tokens = Vec::new();
422    let mut pending_trivia: Vec<TriviaPiece> = Vec::new();
423
424    for line in &lines {
425        match line.kind {
426            LineKind::Blank => {
427                // The whole line (including newline) is trivia.
428                if line.start < line.end {
429                    pending_trivia.push(TriviaPiece {
430                        kind: TriviaKind::Whitespace,
431                        span: Span::new(line.start, line.end),
432                    });
433                }
434                if line.newline_start < line.line_end_with_newline {
435                    pending_trivia.push(TriviaPiece {
436                        kind: TriviaKind::Newline,
437                        span: Span::new(line.newline_start, line.line_end_with_newline),
438                    });
439                }
440            }
441
442            LineKind::Comment => {
443                // Leading whitespace as trivia.
444                if line.start < line.content_start {
445                    pending_trivia.push(TriviaPiece {
446                        kind: TriviaKind::Whitespace,
447                        span: Span::new(line.start, line.content_start),
448                    });
449                }
450                // The comment text itself.
451                let comment_span = Span::new(line.content_start, line.end);
452                // Emit comment as a standalone token so the parser can place
453                // it in the CST. Attach accumulated trivia to it.
454                let trivia = std::mem::take(&mut pending_trivia);
455                // We don't add the comment to trivia: we emit it as a token.
456                // But we need to handle the newline.
457                tokens.push(Token {
458                    kind: TokenKind::Comment,
459                    span: comment_span,
460                    indent: line.indent.unwrap_or(0),
461                    leading_trivia: trivia,
462                });
463                // Newline after comment is trivia for the next token.
464                if line.newline_start < line.line_end_with_newline {
465                    pending_trivia.push(TriviaPiece {
466                        kind: TriviaKind::Newline,
467                        span: Span::new(line.newline_start, line.line_end_with_newline),
468                    });
469                }
470            }
471
472            LineKind::SectionHeader => {
473                tokenize_section_header(source, line, &mut tokens, &mut pending_trivia);
474            }
475
476            LineKind::Conditional => {
477                tokenize_conditional(source, line, &mut tokens, &mut pending_trivia);
478            }
479
480            LineKind::Field => {
481                tokenize_field(source, line, &mut tokens, &mut pending_trivia);
482            }
483
484            LineKind::Value => {
485                tokenize_value_line(source, line, &mut tokens, &mut pending_trivia);
486            }
487        }
488    }
489
490    // EOF token gets any remaining trivia.
491    let eof_offset = source.len();
492    tokens.push(Token {
493        kind: TokenKind::Eof,
494        span: Span::empty(eof_offset),
495        indent: 0,
496        leading_trivia: std::mem::take(&mut pending_trivia),
497    });
498
499    tokens
500}
501
502/// Tokenize a section header line like `executable my-exe` or `library`.
503fn tokenize_section_header(
504    source: &str,
505    line: &RawLine,
506    tokens: &mut Vec<Token>,
507    pending_trivia: &mut Vec<TriviaPiece>,
508) {
509    let bytes = source.as_bytes();
510
511    // Leading whitespace.
512    if line.start < line.content_start {
513        pending_trivia.push(TriviaPiece {
514            kind: TriviaKind::Whitespace,
515            span: Span::new(line.start, line.content_start),
516        });
517    }
518
519    // The section keyword.
520    let (kw_start, kw_end) = scan_word(bytes, line.content_start);
521    tokens.push(Token {
522        kind: TokenKind::SectionHeader,
523        span: Span::new(kw_start, kw_end),
524        indent: line.indent.unwrap_or(0),
525        leading_trivia: std::mem::take(pending_trivia),
526    });
527
528    // After the keyword: optional whitespace + section argument(s).
529    let mut pos = kw_end;
530    // Whitespace between keyword and arg.
531    let ws_start = pos;
532    pos = skip_hspace(bytes, pos);
533    if ws_start < pos {
534        pending_trivia.push(TriviaPiece {
535            kind: TriviaKind::Whitespace,
536            span: Span::new(ws_start, pos),
537        });
538    }
539
540    // Section argument: everything remaining on the line (trimmed).
541    if pos < line.end {
542        // Trim trailing whitespace from the arg.
543        let mut arg_end = line.end;
544        while arg_end > pos && (bytes[arg_end - 1] == b' ' || bytes[arg_end - 1] == b'\t') {
545            arg_end -= 1;
546        }
547        if pos < arg_end {
548            tokens.push(Token {
549                kind: TokenKind::SectionArg,
550                span: Span::new(pos, arg_end),
551                indent: visual_column(bytes, line.start, pos),
552                leading_trivia: std::mem::take(pending_trivia),
553            });
554            // Trailing whitespace as trivia.
555            if arg_end < line.end {
556                pending_trivia.push(TriviaPiece {
557                    kind: TriviaKind::Whitespace,
558                    span: Span::new(arg_end, line.end),
559                });
560            }
561        }
562    }
563
564    // Newline.
565    if line.newline_start < line.line_end_with_newline {
566        pending_trivia.push(TriviaPiece {
567            kind: TriviaKind::Newline,
568            span: Span::new(line.newline_start, line.line_end_with_newline),
569        });
570    }
571}
572
573/// Tokenize a conditional line like `if flag(dev)` or `else`.
574fn tokenize_conditional(
575    source: &str,
576    line: &RawLine,
577    tokens: &mut Vec<Token>,
578    pending_trivia: &mut Vec<TriviaPiece>,
579) {
580    let bytes = source.as_bytes();
581
582    // Leading whitespace.
583    if line.start < line.content_start {
584        pending_trivia.push(TriviaPiece {
585            kind: TriviaKind::Whitespace,
586            span: Span::new(line.start, line.content_start),
587        });
588    }
589
590    // The keyword (if / else / elif).
591    let (kw_start, kw_end) = scan_word(bytes, line.content_start);
592    let kw_str = &source[kw_start..kw_end];
593    let kind = if kw_str.eq_ignore_ascii_case("if") {
594        TokenKind::If
595    } else if kw_str.eq_ignore_ascii_case("else") {
596        TokenKind::Else
597    } else {
598        TokenKind::Elif
599    };
600
601    tokens.push(Token {
602        kind,
603        span: Span::new(kw_start, kw_end),
604        indent: line.indent.unwrap_or(0),
605        leading_trivia: std::mem::take(pending_trivia),
606    });
607
608    // For `if`/`elif`, tokenize the condition expression.
609    // For `else`, check if there's remaining content (e.g. `else {`).
610    if kind == TokenKind::If || kind == TokenKind::Elif {
611        tokenize_condition_expr(source, bytes, kw_end, line, tokens, pending_trivia);
612    } else if kind == TokenKind::Else {
613        // Capture any remaining content after `else` (e.g. `{` for braced blocks).
614        let after_kw = skip_hspace(bytes, kw_end);
615        if after_kw < line.end {
616            // There's content after `else`: emit whitespace + value.
617            if kw_end < after_kw {
618                pending_trivia.push(TriviaPiece {
619                    kind: TriviaKind::Whitespace,
620                    span: Span::new(kw_end, after_kw),
621                });
622            }
623            tokens.push(Token {
624                kind: TokenKind::Value,
625                span: Span::new(after_kw, line.end),
626                indent: visual_column(bytes, line.start, after_kw),
627                leading_trivia: std::mem::take(pending_trivia),
628            });
629        }
630    }
631
632    // Newline.
633    if line.newline_start < line.line_end_with_newline {
634        pending_trivia.push(TriviaPiece {
635            kind: TriviaKind::Newline,
636            span: Span::new(line.newline_start, line.line_end_with_newline),
637        });
638    }
639}
640
641/// Tokenize the condition expression portion of an `if`/`elif` line.
642///
643/// E.g. for `if flag(dev) && !os(windows)`, this tokenizes everything
644/// after `if`.
645fn tokenize_condition_expr(
646    _source: &str,
647    bytes: &[u8],
648    start: usize,
649    line: &RawLine,
650    tokens: &mut Vec<Token>,
651    pending_trivia: &mut Vec<TriviaPiece>,
652) {
653    let end = line.end;
654    let mut pos = start;
655
656    while pos < end {
657        let b = bytes[pos];
658        match b {
659            b' ' | b'\t' => {
660                let ws_start = pos;
661                pos = skip_hspace(bytes, pos);
662                pending_trivia.push(TriviaPiece {
663                    kind: TriviaKind::Whitespace,
664                    span: Span::new(ws_start, pos),
665                });
666            }
667            b'(' => {
668                tokens.push(Token {
669                    kind: TokenKind::LParen,
670                    span: Span::new(pos, pos + 1),
671                    indent: visual_column(bytes, line.start, pos),
672                    leading_trivia: std::mem::take(pending_trivia),
673                });
674                pos += 1;
675            }
676            b')' => {
677                tokens.push(Token {
678                    kind: TokenKind::RParen,
679                    span: Span::new(pos, pos + 1),
680                    indent: visual_column(bytes, line.start, pos),
681                    leading_trivia: std::mem::take(pending_trivia),
682                });
683                pos += 1;
684            }
685            b'!' => {
686                tokens.push(Token {
687                    kind: TokenKind::Not,
688                    span: Span::new(pos, pos + 1),
689                    indent: visual_column(bytes, line.start, pos),
690                    leading_trivia: std::mem::take(pending_trivia),
691                });
692                pos += 1;
693            }
694            b'&' => {
695                if pos + 1 < end && bytes[pos + 1] == b'&' {
696                    tokens.push(Token {
697                        kind: TokenKind::And,
698                        span: Span::new(pos, pos + 2),
699                        indent: visual_column(bytes, line.start, pos),
700                        leading_trivia: std::mem::take(pending_trivia),
701                    });
702                    pos += 2;
703                } else {
704                    // Stray `&` (not `&&`): emit as single-char Value for error recovery.
705                    tokens.push(Token {
706                        kind: TokenKind::Value,
707                        span: Span::new(pos, pos + 1),
708                        indent: visual_column(bytes, line.start, pos),
709                        leading_trivia: std::mem::take(pending_trivia),
710                    });
711                    pos += 1;
712                }
713            }
714            b'|' => {
715                if pos + 1 < end && bytes[pos + 1] == b'|' {
716                    tokens.push(Token {
717                        kind: TokenKind::Or,
718                        span: Span::new(pos, pos + 2),
719                        indent: visual_column(bytes, line.start, pos),
720                        leading_trivia: std::mem::take(pending_trivia),
721                    });
722                    pos += 2;
723                } else {
724                    tokens.push(Token {
725                        kind: TokenKind::Value,
726                        span: Span::new(pos, pos + 1),
727                        indent: visual_column(bytes, line.start, pos),
728                        leading_trivia: std::mem::take(pending_trivia),
729                    });
730                    pos += 1;
731                }
732            }
733            b'>' if pos + 1 < end && bytes[pos + 1] == b'=' => {
734                tokens.push(Token {
735                    kind: TokenKind::CompOp,
736                    span: Span::new(pos, pos + 2),
737                    indent: visual_column(bytes, line.start, pos),
738                    leading_trivia: std::mem::take(pending_trivia),
739                });
740                pos += 2;
741            }
742            b'<' if pos + 1 < end && bytes[pos + 1] == b'=' => {
743                tokens.push(Token {
744                    kind: TokenKind::CompOp,
745                    span: Span::new(pos, pos + 2),
746                    indent: visual_column(bytes, line.start, pos),
747                    leading_trivia: std::mem::take(pending_trivia),
748                });
749                pos += 2;
750            }
751            b'=' => {
752                let len = if pos + 1 < end && bytes[pos + 1] == b'=' {
753                    2
754                } else {
755                    1
756                };
757                tokens.push(Token {
758                    kind: TokenKind::CompOp,
759                    span: Span::new(pos, pos + len),
760                    indent: visual_column(bytes, line.start, pos),
761                    leading_trivia: std::mem::take(pending_trivia),
762                });
763                pos += len;
764            }
765            b'>' => {
766                tokens.push(Token {
767                    kind: TokenKind::CompOp,
768                    span: Span::new(pos, pos + 1),
769                    indent: visual_column(bytes, line.start, pos),
770                    leading_trivia: std::mem::take(pending_trivia),
771                });
772                pos += 1;
773            }
774            b'<' => {
775                tokens.push(Token {
776                    kind: TokenKind::CompOp,
777                    span: Span::new(pos, pos + 1),
778                    indent: visual_column(bytes, line.start, pos),
779                    leading_trivia: std::mem::take(pending_trivia),
780                });
781                pos += 1;
782            }
783            b',' => {
784                tokens.push(Token {
785                    kind: TokenKind::Comma,
786                    span: Span::new(pos, pos + 1),
787                    indent: visual_column(bytes, line.start, pos),
788                    leading_trivia: std::mem::take(pending_trivia),
789                });
790                pos += 1;
791            }
792            b'-' if pos + 1 < end && bytes[pos + 1] == b'-' => {
793                // Inline comment: rest of line.
794                pending_trivia.push(TriviaPiece {
795                    kind: TriviaKind::Comment,
796                    span: Span::new(pos, end),
797                });
798                pos = end;
799            }
800            _ => {
801                // An identifier or version number: emit as Value. Always
802                // consume at least one byte to guarantee forward progress
803                // on inputs containing stray operator chars.
804                let val_start = pos;
805                pos += 1;
806                while pos < end
807                    && !matches!(
808                        bytes[pos],
809                        b' ' | b'\t' | b'(' | b')' | b'!' | b',' | b'&' | b'|' | b'>' | b'<' | b'='
810                    )
811                {
812                    pos += 1;
813                }
814                tokens.push(Token {
815                    kind: TokenKind::Value,
816                    span: Span::new(val_start, pos),
817                    indent: visual_column(bytes, line.start, val_start),
818                    leading_trivia: std::mem::take(pending_trivia),
819                });
820            }
821        }
822    }
823}
824
825/// Tokenize a field line like `build-depends: base >=4.14`.
826fn tokenize_field(
827    source: &str,
828    line: &RawLine,
829    tokens: &mut Vec<Token>,
830    pending_trivia: &mut Vec<TriviaPiece>,
831) {
832    let bytes = source.as_bytes();
833
834    // Leading whitespace.
835    if line.start < line.content_start {
836        pending_trivia.push(TriviaPiece {
837            kind: TriviaKind::Whitespace,
838            span: Span::new(line.start, line.content_start),
839        });
840    }
841
842    // Field name.
843    let (name_start, name_end) = scan_word(bytes, line.content_start);
844    tokens.push(Token {
845        kind: TokenKind::FieldName,
846        span: Span::new(name_start, name_end),
847        indent: line.indent.unwrap_or(0),
848        leading_trivia: std::mem::take(pending_trivia),
849    });
850
851    // Optional whitespace between name and colon.
852    let mut pos = name_end;
853    let ws_start = pos;
854    pos = skip_hspace(bytes, pos);
855    if ws_start < pos {
856        pending_trivia.push(TriviaPiece {
857            kind: TriviaKind::Whitespace,
858            span: Span::new(ws_start, pos),
859        });
860    }
861
862    // Colon.
863    if pos < line.end && bytes[pos] == b':' {
864        tokens.push(Token {
865            kind: TokenKind::Colon,
866            span: Span::new(pos, pos + 1),
867            indent: visual_column(bytes, line.start, pos),
868            leading_trivia: std::mem::take(pending_trivia),
869        });
870        pos += 1;
871    }
872
873    // Optional whitespace after colon.
874    let ws_start2 = pos;
875    pos = skip_hspace(bytes, pos);
876    if ws_start2 < pos {
877        pending_trivia.push(TriviaPiece {
878            kind: TriviaKind::Whitespace,
879            span: Span::new(ws_start2, pos),
880        });
881    }
882
883    // Rest of line is the value (if non-empty).
884    if pos < line.end {
885        // Check for inline comment at the end.
886        let val_end = line.end;
887        tokens.push(Token {
888            kind: TokenKind::Value,
889            span: Span::new(pos, val_end),
890            indent: visual_column(bytes, line.start, pos),
891            leading_trivia: std::mem::take(pending_trivia),
892        });
893    }
894
895    // Newline.
896    if line.newline_start < line.line_end_with_newline {
897        pending_trivia.push(TriviaPiece {
898            kind: TriviaKind::Newline,
899            span: Span::new(line.newline_start, line.line_end_with_newline),
900        });
901    }
902}
903
904/// Tokenize a continuation / value line (no field name, no section header).
905fn tokenize_value_line(
906    source: &str,
907    line: &RawLine,
908    tokens: &mut Vec<Token>,
909    pending_trivia: &mut Vec<TriviaPiece>,
910) {
911    let _ = source;
912
913    // Leading whitespace.
914    if line.start < line.content_start {
915        pending_trivia.push(TriviaPiece {
916            kind: TriviaKind::Whitespace,
917            span: Span::new(line.start, line.content_start),
918        });
919    }
920
921    if line.content_start < line.end {
922        tokens.push(Token {
923            kind: TokenKind::Value,
924            span: Span::new(line.content_start, line.end),
925            indent: line.indent.unwrap_or(0),
926            leading_trivia: std::mem::take(pending_trivia),
927        });
928    }
929
930    // Newline.
931    if line.newline_start < line.line_end_with_newline {
932        pending_trivia.push(TriviaPiece {
933            kind: TriviaKind::Newline,
934            span: Span::new(line.newline_start, line.line_end_with_newline),
935        });
936    }
937}
938
939// ---------------------------------------------------------------------------
940// Tests
941// ---------------------------------------------------------------------------
942
#[cfg(test)]
mod tests {
    use super::*;

    /// Helper: collect just the (kind, text) pairs from tokenization.
    fn tok_pairs(source: &str) -> Vec<(TokenKind, &str)> {
        tokenize(source)
            .iter()
            .map(|t| (t.kind, t.span.slice(source)))
            .collect()
    }

    /// Helper: mark bytes `start..end` as covered, failing if any of them was
    /// already claimed. `what` and `kind` only feed the failure message.
    fn mark_covered(covered: &mut [bool], start: usize, end: usize, what: &str, kind: TokenKind) {
        for (i, slot) in covered.iter_mut().enumerate().take(end).skip(start) {
            assert!(!*slot, "byte {i} covered twice ({what} {kind:?})");
            *slot = true;
        }
    }

    /// Helper: tokenize `src` and assert that every byte is claimed exactly
    /// once, either by a token span or by a leading-trivia span.
    fn assert_full_coverage(src: &str) {
        let tokens = tokenize(src);
        let mut covered = vec![false; src.len()];
        for tok in &tokens {
            for tp in &tok.leading_trivia {
                mark_covered(
                    &mut covered,
                    tp.span.start,
                    tp.span.end,
                    "trivia on",
                    tok.kind,
                );
            }
            mark_covered(
                &mut covered,
                tok.span.start,
                tok.span.end,
                "token",
                tok.kind,
            );
        }
        for (i, &c) in covered.iter().enumerate() {
            assert!(c, "byte {i} ({:?}) not covered", src.as_bytes()[i] as char);
        }
    }

    #[test]
    fn lex_simple_field() {
        let src = "name: foo\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "name"),
                (TokenKind::Colon, ":"),
                (TokenKind::Value, "foo"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_field_with_spaces() {
        let src = "build-depends:    base >=4.14\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "build-depends"),
                (TokenKind::Colon, ":"),
                (TokenKind::Value, "base >=4.14"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_section_header_no_arg() {
        let src = "library\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![(TokenKind::SectionHeader, "library"), (TokenKind::Eof, ""),]
        );
    }

    #[test]
    fn lex_section_header_with_arg() {
        let src = "executable my-exe\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::SectionHeader, "executable"),
                (TokenKind::SectionArg, "my-exe"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_conditional_if() {
        let src = "  if flag(dev)\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::If, "if"),
                (TokenKind::Value, "flag"),
                (TokenKind::LParen, "("),
                (TokenKind::Value, "dev"),
                (TokenKind::RParen, ")"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_conditional_complex() {
        let src = "  if flag(dev) && !os(windows)\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::If, "if"),
                (TokenKind::Value, "flag"),
                (TokenKind::LParen, "("),
                (TokenKind::Value, "dev"),
                (TokenKind::RParen, ")"),
                (TokenKind::And, "&&"),
                (TokenKind::Not, "!"),
                (TokenKind::Value, "os"),
                (TokenKind::LParen, "("),
                (TokenKind::Value, "windows"),
                (TokenKind::RParen, ")"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_else() {
        let src = "  else\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![(TokenKind::Else, "else"), (TokenKind::Eof, ""),]
        );
    }

    #[test]
    fn lex_comment_line() {
        let src = "-- this is a comment\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::Comment, "-- this is a comment"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_blank_lines() {
        let src = "name: foo\n\nversion: 0.1\n";
        let tokens = tokenize(src);
        // The blank line should be trivia on the `version` field name token.
        let version_tok = tokens
            .iter()
            .find(|t| t.kind == TokenKind::FieldName && t.span.slice(src) == "version")
            .expect("a FieldName token for `version` should be produced");
        // Should include newline(s) from the blank line.
        assert!(
            version_tok
                .leading_trivia
                .iter()
                .any(|t| t.kind == TriviaKind::Newline),
            "`version` token should carry newline trivia from the blank line"
        );
    }

    #[test]
    fn lex_indented_field() {
        let src = "  exposed-modules: Foo\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "exposed-modules"),
                (TokenKind::Colon, ":"),
                (TokenKind::Value, "Foo"),
                (TokenKind::Eof, ""),
            ]
        );
        // Check indent.
        let tokens = tokenize(src);
        assert_eq!(tokens[0].indent, 2);
    }

    #[test]
    fn lex_continuation_value() {
        let src = "    base >=4.14\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![(TokenKind::Value, "base >=4.14"), (TokenKind::Eof, ""),]
        );
        let tokens = tokenize(src);
        assert_eq!(tokens[0].indent, 4);
    }

    #[test]
    fn lex_full_span_coverage() {
        assert_full_coverage("name: foo\nversion: 0.1\n");
    }

    #[test]
    fn lex_impl_condition() {
        let src = "  if impl(ghc >= 9.6)\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::If, "if"),
                (TokenKind::Value, "impl"),
                (TokenKind::LParen, "("),
                (TokenKind::Value, "ghc"),
                (TokenKind::CompOp, ">="),
                (TokenKind::Value, "9.6"),
                (TokenKind::RParen, ")"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_field_no_value() {
        let src = "build-depends:\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "build-depends"),
                (TokenKind::Colon, ":"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_import_as_field() {
        // `import:` should lex as a regular field name.
        let src = "  import: warnings\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "import"),
                (TokenKind::Colon, ":"),
                (TokenKind::Value, "warnings"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_tab_indent() {
        let src = "\texposed-modules: Foo\n";
        let tokens = tokenize(src);
        // Tab should expand to column 8.
        assert_eq!(tokens[0].indent, 8);
    }

    #[test]
    fn lex_no_trailing_newline() {
        let src = "name: foo";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::FieldName, "name"),
                (TokenKind::Colon, ":"),
                (TokenKind::Value, "foo"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn lex_common_stanza() {
        let src = "common warnings\n";
        let pairs = tok_pairs(src);
        assert_eq!(
            pairs,
            vec![
                (TokenKind::SectionHeader, "common"),
                (TokenKind::SectionArg, "warnings"),
                (TokenKind::Eof, ""),
            ]
        );
    }

    #[test]
    fn full_span_coverage_multiline() {
        assert_full_coverage(
            "cabal-version: 3.0\nname: foo\n\n-- A comment\n\nlibrary\n  exposed-modules: Foo\n  build-depends:\n    base >=4.14\n",
        );
    }
}
cabalist_parser/lexer.rs

cabalist_parser/
lexer.rs