bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
14/// Token kinds. Discriminants without payload data; the lexeme is recovered
15/// from the source string via the token's [`Span`].
16///
17/// Note: `--` line comments and `---` doc block markers are handled outside
18/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
19/// containing only the marker and may span multiple source lines.
20#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
21#[logos(skip r"[ \t\r\n]+")]
22pub enum TokenKind {
23    // Keywords
24    #[token("commons")]
25    Commons,
26    #[token("type")]
27    Type,
28    #[token("fn")]
29    Fn,
30    #[token("where")]
31    Where,
32    #[token("and")]
33    And,
34    #[token("true")]
35    True,
36    #[token("false")]
37    False,
38    #[token("Int")]
39    Int,
40    #[token("String")]
41    String,
42    #[token("Bool")]
43    Bool,
44    // v0.21 keyword
45    #[token("Float")]
46    Float,
47    // v0.1 keywords
48    #[token("let")]
49    Let,
50    #[token("if")]
51    If,
52    #[token("else")]
53    Else,
54    #[token("Ok")]
55    Ok,
56    #[token("Err")]
57    Err,
58    #[token("Result")]
59    Result,
60    #[token("ValidationError")]
61    ValidationError,
62    // v0.22b keyword
63    #[token("JsonError")]
64    JsonError,
65    // v0.2 keywords
66    #[token("enum")]
67    Enum,
68    #[token("match")]
69    Match,
70    #[token("Option")]
71    Option,
72    #[token("record")]
73    Record,
74    #[token("self")]
75    Self_,
76    #[token("Some")]
77    Some,
78    #[token("None")]
79    None,
80    #[token("is")]
81    Is,
82    // v0.3 keywords
83    #[token("opaque")]
84    Opaque,
85    #[token("uses")]
86    Uses,
87    // v0.4 keywords
88    #[token("context")]
89    Context,
90    #[token("consumes")]
91    Consumes,
92    #[token("exports")]
93    Exports,
94    #[token("transparent")]
95    Transparent,
96    // v0.6 keywords
97    #[token("as")]
98    As,
99    // v0.7 keywords
100    #[token("assert")]
101    Assert,
102    #[token("expect")]
103    Expect,
104    #[token("mocks")]
105    Mocks,
106    #[token("test")]
107    Test,
108    // v0.16 keyword
109    #[token("wires")]
110    Wires,
111    // v0.17 keywords
112    #[token("adapter")]
113    Adapter,
114    #[token("binding")]
115    Binding,
116    // v0.5 keywords
117    #[token("agent")]
118    Agent,
119    #[token("capability")]
120    Capability,
121    #[token("commit")]
122    Commit,
123    #[token("Effect")]
124    Effect,
125    #[token("given")]
126    Given,
127    #[token("on")]
128    On,
129    // v0.9 keyword
130    #[token("http")]
131    Http,
132    // v0.10a keyword
133    #[token("cron")]
134    Cron,
135    // v0.10b keyword
136    #[token("queue")]
137    Queue,
138    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
139    // reserved (protocols are a closed, compiler-known set — no declaration kind).
140    #[token("from")]
141    From,
142    #[token("protocol")]
143    Protocol,
144    #[token("provides")]
145    Provides,
146    #[token("service")]
147    Service,
148    #[token("state")]
149    State,
150    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
151    // heads a handler's actor clause.
152    #[token("actor")]
153    Actor,
154    #[token("by")]
155    By,
156    /// `...` — used in record-spread expressions (v0.5).
157    #[token("...")]
158    DotDotDot,
159    /// `<-` — Effect bind operator (v0.5).
160    #[token("<-")]
161    LArrow,
162    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
163    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
164    /// call site shows whether the caller waits.
165    #[token("~>")]
166    TildeArrow,
167
168    /// A documentation block: `---` line ... `---` line. The token's span
169    /// covers the full block including both `---` markers. The body content
170    /// is recovered from the source via the span (see [`doc_block_content`]).
171    /// Inserted by [`tokenize`]; not lexed by logos directly.
172    DocBlock,
173
174    /// A line comment: `-- ...` running to end of line. The span starts at
175    /// the `--` marker and runs through the last character before the
176    /// terminating newline (exclusive). The trivia body (the text after the
177    /// `--` marker) is recovered from the source via the span. Inserted by
178    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
179    /// for an `--` operator sequence.
180    Comment,
181
182    // Identifier
183    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
184    Ident,
185
186    // Literals
187    #[regex(r"[0-9]+")]
188    IntLit,
189    // A float literal: fraction with a digit on both sides of the `.`, an
190    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
191    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
192    // as method calls on numeric literals.
193    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
194    FloatLit,
195    // A double-quoted string with simple escapes. The body excludes the closing
196    // quote; we accept any non-quote/non-backslash/non-newline char, or a
197    // backslash followed by one of the four allowed escapes.
198    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
199    StrLit,
200    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
201    // `tokenize` (logos cannot balance the holes' parens), never produced by
202    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
203    // The span covers the whole `"…"`; the parser splits chunks from holes.
204    InterpStr,
205
206    // Multi-char operators
207    #[token("->")]
208    Arrow,
209    #[token("==")]
210    EqEq,
211    #[token("!=")]
212    BangEq,
213    #[token("<=")]
214    LtEq,
215    #[token(">=")]
216    GtEq,
217    #[token("&&")]
218    AmpAmp,
219    #[token("||")]
220    PipePipe,
221
222    // Single-char operators
223    #[token("+")]
224    Plus,
225    #[token("-")]
226    Minus,
227    #[token("*")]
228    Star,
229    #[token("/")]
230    Slash,
231    #[token("!")]
232    Bang,
233    #[token("=")]
234    Eq,
235    #[token("<")]
236    Lt,
237    #[token(">")]
238    Gt,
239    // v0.1 postfix operator
240    #[token("?")]
241    Question,
242    // v0.2 match-arm arrow
243    #[token("=>")]
244    FatArrow,
245    // v0.2 wildcard pattern (also valid as identifier start; the lexer
246    // prefers identifier for any longer match, so `_foo` is still Ident).
247    #[token("_")]
248    Underscore,
249    // v0.2 sum-type variant separator (also used as future bitwise OR);
250    // single `|` distinct from `||`.
251    #[token("|")]
252    Pipe,
253
254    // Punctuation
255    #[token("(")]
256    LParen,
257    #[token(")")]
258    RParen,
259    #[token("{")]
260    LBrace,
261    #[token("}")]
262    RBrace,
263    #[token("[")]
264    LBracket,
265    #[token("]")]
266    RBracket,
267    #[token(",")]
268    Comma,
269    #[token(":")]
270    Colon,
271    #[token(".")]
272    Dot,
273}
274
275impl TokenKind {
276    /// Human-readable display name for diagnostics.
277    pub fn describe(self) -> &'static str {
278        use TokenKind::*;
279        match self {
280            Commons => "`commons`",
281            Type => "`type`",
282            Fn => "`fn`",
283            Where => "`where`",
284            And => "`and`",
285            True => "`true`",
286            False => "`false`",
287            Int => "`Int`",
288            String => "`String`",
289            Bool => "`Bool`",
290            Float => "`Float`",
291            Let => "`let`",
292            If => "`if`",
293            Else => "`else`",
294            Ok => "`Ok`",
295            Err => "`Err`",
296            Result => "`Result`",
297            ValidationError => "`ValidationError`",
298            JsonError => "`JsonError`",
299            Enum => "`enum`",
300            Match => "`match`",
301            Option => "`Option`",
302            Record => "`record`",
303            Self_ => "`self`",
304            Some => "`Some`",
305            None => "`None`",
306            Is => "`is`",
307            Opaque => "`opaque`",
308            Uses => "`uses`",
309            Context => "`context`",
310            Consumes => "`consumes`",
311            Exports => "`exports`",
312            Transparent => "`transparent`",
313            As => "`as`",
314            Assert => "`assert`",
315            Expect => "`expect`",
316            Mocks => "`mocks`",
317            Test => "`test`",
318            Wires => "`wires`",
319            Adapter => "`adapter`",
320            Binding => "`binding`",
321            Agent => "`agent`",
322            Capability => "`capability`",
323            Commit => "`commit`",
324            Effect => "`Effect`",
325            Given => "`given`",
326            On => "`on`",
327            Http => "`http`",
328            Cron => "`cron`",
329            Queue => "`queue`",
330            From => "`from`",
331            Protocol => "`protocol`",
332            Provides => "`provides`",
333            Service => "`service`",
334            State => "`state`",
335            Actor => "`actor`",
336            By => "`by`",
337            DotDotDot => "`...`",
338            LArrow => "`<-`",
339            TildeArrow => "`~>`",
340            DocBlock => "documentation block",
341            Comment => "line comment",
342            Ident => "identifier",
343            IntLit => "integer literal",
344            FloatLit => "float literal",
345            StrLit => "string literal",
346            InterpStr => "interpolated string",
347            Arrow => "`->`",
348            EqEq => "`==`",
349            BangEq => "`!=`",
350            LtEq => "`<=`",
351            GtEq => "`>=`",
352            AmpAmp => "`&&`",
353            PipePipe => "`||`",
354            Plus => "`+`",
355            Minus => "`-`",
356            Star => "`*`",
357            Slash => "`/`",
358            Bang => "`!`",
359            Eq => "`=`",
360            Lt => "`<`",
361            Gt => "`>`",
362            Question => "`?`",
363            FatArrow => "`=>`",
364            Underscore => "`_`",
365            Pipe => "`|`",
366            LParen => "`(`",
367            RParen => "`)`",
368            LBrace => "`{`",
369            RBrace => "`}`",
370            LBracket => "`[`",
371            RBracket => "`]`",
372            Comma => "`,`",
373            Colon => "`:`",
374            Dot => "`.`",
375        }
376    }
377}
378
/// A token plus its source span.
///
/// The token carries no text of its own: the lexeme is recovered by slicing
/// the source string with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// What kind of token this is.
    pub kind: TokenKind,
    /// Byte range of the token within the source string passed to
    /// [`tokenize`].
    pub span: Span,
}
385
386/// Tokenise a source string. Returns the full token vector or the first
387/// lexical error.
388///
389/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
390/// outside the logos-generated lexer: we scan the source one segment at a
391/// time, dispatching to logos for ordinary tokens between non-token spans.
392pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
393    let mut tokens = Vec::new();
394    let bytes = source.as_bytes();
395    let mut pos = 0;
396    while pos < bytes.len() {
397        // Detect a `---` doc-block marker at the start of a line (the line may
398        // begin with leading whitespace; the marker itself must be alone on
399        // its line).
400        if let Some(open_end) = doc_block_open_at(source, pos) {
401            // Find the matching closing `---` line.
402            match doc_block_close(source, open_end) {
403                Some((close_start, close_end)) => {
404                    let span = Span::new(pos, close_end);
405                    tokens.push(Token {
406                        kind: TokenKind::DocBlock,
407                        span,
408                    });
409                    let _ = close_start;
410                    pos = close_end;
411                    continue;
412                }
413                None => {
414                    return Err(CompileError::new(
415                        "bynk.lex.unclosed_doc_block",
416                        Span::new(pos, open_end),
417                        "documentation block opened but never closed",
418                    )
419                    .with_note(
420                        "a doc block must be terminated by another `---` on a line by itself",
421                    ));
422                }
423            }
424        }
425        // A `--` line comment: emit a `Comment` token covering everything
426        // up to (but not including) the terminating newline. Doc-block
427        // detection above already ruled out a `---` marker at line start
428        // — and once we've consumed past the leading `--`, any further
429        // dashes are part of the comment body. Preserving comments as
430        // trivia tokens lets the parser attach them to declarations so
431        // the formatter can emit them in place (v1.1 LSP spec §3.5).
432        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
433            let start = pos;
434            while pos < bytes.len() && bytes[pos] != b'\n' {
435                pos += 1;
436            }
437            tokens.push(Token {
438                kind: TokenKind::Comment,
439                span: Span::new(start, pos),
440            });
441            continue;
442        }
443        // Skip ordinary whitespace inline (logos handles it too, but we may
444        // be in the middle of the source between specials).
445        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
446            pos += 1;
447            continue;
448        }
449        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
450        // actually contain a `\(` hole are hand-scanned here; plain strings
451        // fall through to the logos `StrLit` path unchanged. `\(` is an
452        // invalid escape in the logos grammar, so this never re-routes a
453        // currently-valid literal.
454        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
455            let end = scan_str(bytes, source, pos)?;
456            tokens.push(Token {
457                kind: TokenKind::InterpStr,
458                span: Span::new(pos, end),
459            });
460            pos = end;
461            continue;
462        }
463        // Otherwise dispatch a single logos token starting at `pos`.
464        let mut lex = TokenKind::lexer(&source[pos..]);
465        let Some(result) = lex.next() else {
466            // No token at this position; treat as unexpected character so
467            // the user sees something useful.
468            let ch = source[pos..].chars().next().unwrap_or('\0');
469            let span = Span::new(pos, pos + ch.len_utf8());
470            return Err(CompileError::new(
471                "bynk.lex.unexpected_character",
472                span,
473                format!("unexpected character `{ch}`"),
474            ));
475        };
476        let local = lex.span();
477        let span: Span = Span::new(pos + local.start, pos + local.end);
478        match result {
479            Ok(kind) => {
480                if kind == TokenKind::IntLit {
481                    let slice = &source[span.range()];
482                    if slice.parse::<i64>().is_err() {
483                        return Err(CompileError::new(
484                            "bynk.lex.integer_overflow",
485                            span,
486                            format!(
487                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
488                            ),
489                        )
490                        .with_note("the range is -2^63 to 2^63 - 1"));
491                    }
492                }
493                if kind == TokenKind::FloatLit {
494                    let slice = &source[span.range()];
495                    match slice.parse::<f64>() {
496                        Ok(v) if v.is_finite() => {}
497                        _ => {
498                            return Err(CompileError::new(
499                                "bynk.lex.float_literal_overflow",
500                                span,
501                                format!(
502                                    "float literal `{slice}` is out of range for a 64-bit float"
503                                ),
504                            )
505                            .with_note(
506                                "the literal does not fit a finite IEEE 754 double; \
507                                 the largest finite value is ~1.8e308",
508                            ));
509                        }
510                    }
511                }
512                tokens.push(Token { kind, span });
513                pos = span.end;
514            }
515            Err(()) => {
516                let slice = &source[span.range()];
517                let ch = slice.chars().next().unwrap_or('\0');
518                let err = if ch == '"' {
519                    CompileError::new(
520                        "bynk.lex.unterminated_string",
521                        span,
522                        "unterminated string literal",
523                    )
524                    .with_note(
525                        "string literals must close with `\"` on the same line; \
526                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
527                    )
528                } else {
529                    CompileError::new(
530                        "bynk.lex.unexpected_character",
531                        span,
532                        format!("unexpected character `{ch}`"),
533                    )
534                };
535                return Err(err);
536            }
537        }
538    }
539    Ok(tokens)
540}
541
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes (or the line ends)? Decides whether
/// `tokenize` hand-scans the string as an `InterpStr` or defers to logos for a
/// plain `StrLit`. Deliberately tolerant — a malformed string with a hole is
/// routed here so the hole-aware scanner produces the precise error.
///
/// `start` is the offset of the opening `"`. Any escape other than `\(` is
/// skipped as a two-byte unit so that an escaped quote (`\"`) does not end the
/// scan. The scan never leaves the current line: a backslash that is the last
/// byte of the line (or of the input) ends it with `false`, so a `\(` on a
/// following line cannot route this string to the hole-aware scanner.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            // End of the literal or of the line: no hole was found.
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // Dangling backslash: do not step over the newline onto the
                // next line.
                Some(b'\n') | None => return false,
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
563
564/// Scan a double-quoted string starting at `start` (the opening `"`), returning
565/// the byte offset just past the closing `"`. Recognises the four simple
566/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
567/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
568fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
569    debug_assert_eq!(bytes[start], b'"');
570    let mut i = start + 1;
571    loop {
572        if i >= bytes.len() || bytes[i] == b'\n' {
573            return Err(CompileError::new(
574                "bynk.lex.unterminated_string",
575                Span::new(start, i.min(bytes.len())),
576                "unterminated string literal",
577            )
578            .with_note(
579                "string literals must close with `\"` on the same line; \
580                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
581            ));
582        }
583        match bytes[i] {
584            b'"' => return Ok(i + 1),
585            b'\\' => match bytes.get(i + 1) {
586                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
587                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
588                other => {
589                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
590                    return Err(CompileError::new(
591                        "bynk.lex.bad_escape",
592                        Span::new(i, (i + 2).min(bytes.len())),
593                        format!("invalid escape sequence `\\{shown}` in string literal"),
594                    )
595                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
596                }
597            },
598            // Any other byte advances one position. UTF-8 continuation bytes
599            // are all >= 0x80, so they never collide with the ASCII specials.
600            _ => i += 1,
601        }
602    }
603}
604
605/// Scan an interpolation hole body. `start` points just past the `\(`; returns
606/// the offset just past the matching `)`. Tracks paren depth and skips nested
607/// strings (whose own parens must not close the hole), recursing through
608/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
609fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
610    let mut i = start;
611    let mut depth = 1usize;
612    loop {
613        if i >= bytes.len() || bytes[i] == b'\n' {
614            return Err(CompileError::new(
615                "bynk.lex.unterminated_interpolation",
616                Span::new(start.saturating_sub(2), i.min(bytes.len())),
617                "unterminated interpolation hole",
618            )
619            .with_note(
620                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
621            ));
622        }
623        match bytes[i] {
624            b'(' => {
625                depth += 1;
626                i += 1;
627            }
628            b')' => {
629                depth -= 1;
630                i += 1;
631                if depth == 0 {
632                    return Ok(i);
633                }
634            }
635            b'"' => i = scan_str(bytes, source, i)?,
636            _ => i += 1,
637        }
638    }
639}
640
641/// One segment of a split interpolated string (v0.43): literal text (escapes
642/// resolved) or the absolute source span of a hole's expression (the bytes
643/// between `\(` and its matching `)`). The parser turns the latter into a real
644/// `Expr`; the lexer owns only the scanning.
645pub(crate) enum InterpSegment {
646    Chunk(String),
647    Hole(Span),
648}
649
650/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
651/// and hole spans. Escapes in the chunks are resolved here (mirroring
652/// [`parse_string_literal`]); holes are returned as spans for the parser to
653/// re-lex and parse as expressions. (v0.43.)
654pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
655    let bytes = source.as_bytes();
656    let inner_end = span.end - 1; // the closing `"`
657    let mut segments = Vec::new();
658    let mut chunk = String::new();
659    let mut i = span.start + 1; // past the opening `"`
660    while i < inner_end {
661        match bytes[i] {
662            b'\\' => match bytes[i + 1] {
663                b'n' => {
664                    chunk.push('\n');
665                    i += 2;
666                }
667                b't' => {
668                    chunk.push('\t');
669                    i += 2;
670                }
671                b'"' => {
672                    chunk.push('"');
673                    i += 2;
674                }
675                b'\\' => {
676                    chunk.push('\\');
677                    i += 2;
678                }
679                b'(' => {
680                    if !chunk.is_empty() {
681                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
682                    }
683                    let hole_start = i + 2;
684                    let after = scan_hole(bytes, source, hole_start)?;
685                    // `after` is one past the matching `)`; the hole body is
686                    // everything up to that `)`.
687                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
688                    i = after;
689                }
690                // The lexer already validated every escape, so nothing else
691                // can appear here.
692                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
693            },
694            _ => {
695                let ch = source[i..].chars().next().unwrap();
696                chunk.push(ch);
697                i += ch.len_utf8();
698            }
699        }
700    }
701    if !chunk.is_empty() {
702        segments.push(InterpSegment::Chunk(chunk));
703    }
704    Ok(segments)
705}
706
707/// If a `---` doc-block marker line starts at or shortly after `pos` (which
708/// must be at a line boundary), return the byte offset just past the marker
709/// line (after the terminating newline, or at EOF). The doc-block grammar
710/// requires the marker to be alone on its line; leading horizontal whitespace
711/// is allowed and ignored.
712fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
713    let bytes = source.as_bytes();
714    if !at_line_start(source, pos) {
715        return None;
716    }
717    // Skip leading horizontal whitespace.
718    let mut i = pos;
719    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
720        i += 1;
721    }
722    if i + 3 > bytes.len() {
723        return None;
724    }
725    if &bytes[i..i + 3] != b"---" {
726        return None;
727    }
728    i += 3;
729    // The marker may have additional trailing dashes (per spec "three or more
730    // consecutive hyphens"). Consume them.
731    while i < bytes.len() && bytes[i] == b'-' {
732        i += 1;
733    }
734    // After the dashes, allow only horizontal whitespace then newline/EOF.
735    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
736        i += 1;
737    }
738    if i == bytes.len() {
739        return Some(i);
740    }
741    if bytes[i] == b'\n' {
742        return Some(i + 1);
743    }
744    None
745}
746
747/// Find the next closing `---` line at or after `pos`. Returns
748/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
749/// terminating newline, or at EOF).
750fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
751    let bytes = source.as_bytes();
752    while pos < bytes.len() {
753        // Advance pos to the start of a line.
754        let line_start = pos;
755        // Find the end of this line.
756        let mut line_end = line_start;
757        while line_end < bytes.len() && bytes[line_end] != b'\n' {
758            line_end += 1;
759        }
760        // Check this line.
761        if let Some(end) = doc_block_open_at(source, line_start) {
762            return Some((line_start, end));
763        }
764        // Move to the next line.
765        pos = if line_end < bytes.len() {
766            line_end + 1
767        } else {
768            line_end
769        };
770    }
771    None
772}
773
/// Reports whether byte offset `pos` is the first column of a line: either
/// the very start of `source`, or the position right after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        None => true,
        Some(prev) => source.as_bytes()[prev] == b'\n',
    }
}
782
783/// Extract the body content of a doc-block token from its source span.
784/// Strips the leading and trailing `---` marker lines and returns the body
785/// verbatim. If every non-empty content line begins with the same horizontal
786/// whitespace prefix (e.g., because the doc block sits inside a brace-form
787/// commons body), that common prefix is removed so the body reads naturally
788/// when emitted as JSDoc.
789pub fn doc_block_content(source: &str, span: Span) -> String {
790    let slice = &source[span.range()];
791    // Drop the first line (opening marker).
792    let after_open = match slice.find('\n') {
793        Some(i) => &slice[i + 1..],
794        None => return String::new(),
795    };
796    let bytes = after_open.as_bytes();
797    // Trim the trailing closing-marker line.
798    let mut i = bytes.len();
799    if i > 0 && bytes[i - 1] == b'\n' {
800        i -= 1;
801    }
802    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
803        i -= 1;
804    }
805    while i > 0 && bytes[i - 1] == b'-' {
806        i -= 1;
807    }
808    if i > 0 && bytes[i - 1] == b'\n' {
809        i -= 1;
810    }
811    let body = &after_open[..i];
812
813    // Compute the common leading-whitespace prefix across all non-empty lines
814    // and strip it. This lets writers indent the doc block alongside the
815    // declaration it documents without bleeding the indent into the JSDoc.
816    let common: Option<usize> = body
817        .lines()
818        .filter(|l| !l.trim().is_empty())
819        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
820        .min();
821    let strip = common.unwrap_or(0);
822    if strip == 0 {
823        return body.to_string();
824    }
825    let mut out = String::with_capacity(body.len());
826    let mut first = true;
827    for line in body.lines() {
828        if !first {
829            out.push('\n');
830        }
831        first = false;
832        if line.trim().is_empty() {
833            // Preserve blank lines.
834            continue;
835        }
836        let leading: usize = line
837            .bytes()
838            .take_while(|&b| b == b' ' || b == b'\t')
839            .count();
840        let drop = strip.min(leading);
841        out.push_str(&line[drop..]);
842    }
843    out
844}
845
846/// Extract the body of a `Comment` trivia token: everything after the
847/// leading `--` marker, preserving its inline whitespace verbatim. Used by
848/// the parser when attaching comments to declarations.
849pub fn comment_body(source: &str, span: Span) -> &str {
850    let slice = &source[span.range()];
851    // Strip leading "--" if present (defensive — the lexer always emits
852    // Comment tokens whose span begins with `--`).
853    slice.strip_prefix("--").unwrap_or(slice)
854}
855
/// Reports whether a newline is reached from `from` before any
/// non-whitespace byte and before `to` (exclusive). Used by the parser to
/// detect orphan doc blocks.
///
/// Spaces, tabs and carriage returns are skipped. The first other byte
/// decides the answer: `true` if it is a `\n`, `false` for anything else.
/// An empty or inverted range yields `false`.
///
/// Why a newline means a blank line: a doc-block token's span ends just past
/// the closing marker's newline, so when the declaration starts on the very
/// next line the gap holds only indentation. A newline inside the gap can
/// only come from a whitespace-only line in between.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let first_significant = (from..to)
        .map(|idx| bytes[idx])
        .find(|b| !matches!(*b, b' ' | b'\t' | b'\r'));
    first_significant == Some(b'\n')
}
882
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `source` and return just the token kinds, panicking on a lex error.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.into_iter().map(|tok| tok.kind).collect()
    }

    /// Lex `source`, which must be rejected, and check the error's category.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &str) {
        let err = tokenize(source).unwrap_err();
        assert_eq!(err.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        let lexed = kinds("commons type fn where and true false Int String Bool foo bar");
        assert_eq!(
            lexed,
            [Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::{IntLit, StrLit};
        let lexed = kinds(r#"0 42 "hello" "with\nescape""#);
        assert_eq!(lexed, [IntLit, IntLit, StrLit, StrLit]);
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        let lexed = kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : .");
        let expected = [
            Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
            Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
        ];
        assert_eq!(lexed, expected);
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: comments survive lexing as Comment tokens, which lets the
        // formatter attach them to nodes and print them back out.
        use TokenKind::*;
        let src = "-- a comment\ntype X = Int -- trailing\n";
        assert_eq!(kinds(src), [Comment, Type, Ident, Eq, Int, Comment]);
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        let only = &toks[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Back-to-back comment lines must lex as separate tokens: the
        // newline between them belongs to neither comment's span.
        let toks = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(toks.len(), 2);
        for tok in &toks {
            assert_eq!(tok.kind, TokenKind::Comment);
        }
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        let lexed = kinds("let if else Ok Err Result ValidationError");
        assert_eq!(lexed, [Let, If, Else, Ok, Err, Result, ValidationError]);
    }

    #[test]
    fn question_token() {
        use TokenKind::{Ident, Question};
        assert_eq!(kinds("x?"), [Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        let lexed = kinds("enum match Option record self Some None is");
        assert_eq!(lexed, [Enum, Match, Option, Record, Self_, Some, None, Is]);
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::{Pipe, PipePipe};
        assert_eq!(kinds("| || |"), [Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::{Assert, Expect, Mocks, Test};
        assert_eq!(kinds("assert expect mocks test"), [Assert, Expect, Mocks, Test]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::{FatArrow, Underscore};
        assert_eq!(kinds("_ =>"), [Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        let with_hole = kinds(r#""Hello, \(name)!""#);
        assert_eq!(with_hole, [TokenKind::InterpStr]);
        // Without a hole the string is an ordinary `StrLit` from the logos path.
        let plain = kinds(r#""Hello, world""#);
        assert_eq!(plain, [TokenKind::StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        let cases = [
            // The `)` of `f(x)` must not end the hole prematurely.
            r#""= \(f(x))""#,
            // Nor may a `)` sitting inside a string nested in the hole.
            r#""= \(label(")"))""#,
            // An interpolated string nested inside a hole.
            r#""out \("in \(x)")""#,
        ];
        for src in &cases {
            assert_eq!(kinds(src), [TokenKind::InterpStr]);
        }
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        // `\\(` reads as an escaped backslash and then a `(`, which opens no
        // hole, so logos produces a plain `StrLit`.
        let lexed = kinds(r#""a \\(b) c""#);
        assert_eq!(lexed, [TokenKind::StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The line ends before the hole's closing `)` appears.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed, but a newline arrives before the closing `"`.
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs