bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
///
/// Three variants carry no logos attribute and are therefore only ever
/// produced by the hand-written scanning in [`tokenize`]:
/// [`TokenKind::DocBlock`], [`TokenKind::Comment`] and
/// [`TokenKind::InterpStr`].
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.110 keyword (ADR 0142): the `Bytes` base type.
    #[token("Bytes")]
    Bytes,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`;
    // v0.118: `mocks` retired — test doubles are `provides` at a seam)
    #[token("expect")]
    Expect,
    #[token("suite")]
    Suite,
    #[token("case")]
    Case,
    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
    // and must stay a usable identifier. The `for all` binder is parsed
    // contextually (two identifiers) inside a `property` body instead.
    #[token("property")]
    Property,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    // v0.115 keywords — function contracts (testing track slice 3). `requires`
    // and `ensures` head a contract clause on a `fn` signature (between the
    // return type and the body). `result` is deliberately *not* a keyword: it is
    // the ordinary value name outside a contract, so it stays a usable
    // identifier; inside an `ensures` predicate it is bound contextually as the
    // function's return value (parsed by scope, like `for`/`all` in slice 2).
    // Distinct from ADR 0127's capability `@requires` annotation.
    #[token("requires")]
    Requires,
    #[token("ensures")]
    Ensures,
    // v0.116 keyword — step invariants (testing track slice 4). `transition` heads
    // an agent step-invariant declaration (beside `invariant`), a predicate over
    // the pre- and post-commit state pair. `old` and `new` are deliberately *not*
    // keywords: they stay ordinary value names outside a `transition`, and inside a
    // `transition` predicate they are bound contextually to the old/new state
    // records (parsed by scope, like `result` in an `ensures`).
    #[token("transition")]
    Transition,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits or underscores.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // An integer literal is a run of ASCII digits. [`tokenize`] additionally
    // rejects literals that do not parse as `i64`.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals. [`tokenize`] additionally rejects
    // literals whose `f64` value is not finite.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes (`n`, `t`, `"`,
    // `\`).
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators (plus `=>`, which is grouped here by version
    // rather than by length)
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): an earlier note here said `_foo` lexes as a single Ident,
    // but the `Ident` regex above requires a leading ASCII letter, so `_foo`
    // appears to lex as `Underscore` followed by `Ident` — confirm which
    // behaviour is intended.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
317
318impl TokenKind {
319    /// Human-readable display name for diagnostics.
320    pub fn describe(self) -> &'static str {
321        use TokenKind::*;
322        match self {
323            Commons => "`commons`",
324            Type => "`type`",
325            Fn => "`fn`",
326            Where => "`where`",
327            And => "`and`",
328            True => "`true`",
329            False => "`false`",
330            Int => "`Int`",
331            String => "`String`",
332            Bool => "`Bool`",
333            Float => "`Float`",
334            Duration => "`Duration`",
335            Instant => "`Instant`",
336            Bytes => "`Bytes`",
337            Let => "`let`",
338            If => "`if`",
339            Else => "`else`",
340            Ok => "`Ok`",
341            Err => "`Err`",
342            Result => "`Result`",
343            ValidationError => "`ValidationError`",
344            JsonError => "`JsonError`",
345            Enum => "`enum`",
346            Match => "`match`",
347            Option => "`Option`",
348            Record => "`record`",
349            Self_ => "`self`",
350            Some => "`Some`",
351            None => "`None`",
352            Is => "`is`",
353            Opaque => "`opaque`",
354            Uses => "`uses`",
355            Context => "`context`",
356            Consumes => "`consumes`",
357            Exports => "`exports`",
358            Transparent => "`transparent`",
359            As => "`as`",
360            Expect => "`expect`",
361            Suite => "`suite`",
362            Case => "`case`",
363            Property => "`property`",
364            Adapter => "`adapter`",
365            Binding => "`binding`",
366            Agent => "`agent`",
367            Capability => "`capability`",
368            Effect => "`Effect`",
369            Given => "`given`",
370            On => "`on`",
371            Http => "`http`",
372            Cron => "`cron`",
373            Queue => "`queue`",
374            From => "`from`",
375            Protocol => "`protocol`",
376            Provides => "`provides`",
377            Service => "`service`",
378            Actor => "`actor`",
379            By => "`by`",
380            Invariant => "`invariant`",
381            Implies => "`implies`",
382            Requires => "`requires`",
383            Ensures => "`ensures`",
384            Transition => "`transition`",
385            ColonEq => "`:=`",
386            DotDotDot => "`...`",
387            LArrow => "`<-`",
388            TildeArrow => "`~>`",
389            DocBlock => "documentation block",
390            Comment => "line comment",
391            Ident => "identifier",
392            IntLit => "integer literal",
393            FloatLit => "float literal",
394            StrLit => "string literal",
395            InterpStr => "interpolated string",
396            Arrow => "`->`",
397            EqEq => "`==`",
398            BangEq => "`!=`",
399            LtEq => "`<=`",
400            GtEq => "`>=`",
401            AmpAmp => "`&&`",
402            PipePipe => "`||`",
403            Plus => "`+`",
404            Minus => "`-`",
405            Star => "`*`",
406            Slash => "`/`",
407            Bang => "`!`",
408            Eq => "`=`",
409            Lt => "`<`",
410            Gt => "`>`",
411            Question => "`?`",
412            FatArrow => "`=>`",
413            Underscore => "`_`",
414            Pipe => "`|`",
415            At => "`@`",
416            LParen => "`(`",
417            RParen => "`)`",
418            LBrace => "`{`",
419            RBrace => "`}`",
420            LBracket => "`[`",
421            RBracket => "`]`",
422            Comma => "`,`",
423            Colon => "`:`",
424            Dot => "`.`",
425        }
426    }
427}
428
/// A token plus its source span.
///
/// The token stores no text of its own: the lexeme is recovered by slicing
/// the source string that was passed to [`tokenize`] with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// What kind of token (or trivia) this is.
    pub kind: TokenKind,
    /// Byte offsets of the token within the tokenised source string.
    pub span: Span,
}
435
436/// Tokenise a source string. Returns the full token vector or the first
437/// lexical error.
438///
439/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
440/// outside the logos-generated lexer: we scan the source one segment at a
441/// time, dispatching to logos for ordinary tokens between non-token spans.
442pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
443    let mut tokens = Vec::new();
444    let bytes = source.as_bytes();
445    let mut pos = 0;
446    while pos < bytes.len() {
447        // Detect a `---` doc-block marker at the start of a line (the line may
448        // begin with leading whitespace; the marker itself must be alone on
449        // its line).
450        if let Some(open_end) = doc_block_open_at(source, pos) {
451            // Find the matching closing `---` line.
452            match doc_block_close(source, open_end) {
453                Some((close_start, close_end)) => {
454                    let span = Span::new(pos, close_end);
455                    tokens.push(Token {
456                        kind: TokenKind::DocBlock,
457                        span,
458                    });
459                    let _ = close_start;
460                    pos = close_end;
461                    continue;
462                }
463                None => {
464                    return Err(CompileError::new(
465                        "bynk.lex.unclosed_doc_block",
466                        Span::new(pos, open_end),
467                        "documentation block opened but never closed",
468                    )
469                    .with_note(
470                        "a doc block must be terminated by another `---` on a line by itself",
471                    ));
472                }
473            }
474        }
475        // A `--` line comment: emit a `Comment` token covering everything
476        // up to (but not including) the terminating newline. Doc-block
477        // detection above already ruled out a `---` marker at line start
478        // — and once we've consumed past the leading `--`, any further
479        // dashes are part of the comment body. Preserving comments as
480        // trivia tokens lets the parser attach them to declarations so
481        // the formatter can emit them in place (v1.1 LSP spec §3.5).
482        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
483            let start = pos;
484            while pos < bytes.len() && bytes[pos] != b'\n' {
485                pos += 1;
486            }
487            tokens.push(Token {
488                kind: TokenKind::Comment,
489                span: Span::new(start, pos),
490            });
491            continue;
492        }
493        // Skip ordinary whitespace inline (logos handles it too, but we may
494        // be in the middle of the source between specials).
495        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
496            pos += 1;
497            continue;
498        }
499        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
500        // actually contain a `\(` hole are hand-scanned here; plain strings
501        // fall through to the logos `StrLit` path unchanged. `\(` is an
502        // invalid escape in the logos grammar, so this never re-routes a
503        // currently-valid literal.
504        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
505            let end = scan_str(bytes, source, pos)?;
506            tokens.push(Token {
507                kind: TokenKind::InterpStr,
508                span: Span::new(pos, end),
509            });
510            pos = end;
511            continue;
512        }
513        // Otherwise dispatch a single logos token starting at `pos`.
514        let mut lex = TokenKind::lexer(&source[pos..]);
515        let Some(result) = lex.next() else {
516            // No token at this position; treat as unexpected character so
517            // the user sees something useful.
518            let ch = source[pos..].chars().next().unwrap_or('\0');
519            let span = Span::new(pos, pos + ch.len_utf8());
520            return Err(CompileError::new(
521                "bynk.lex.unexpected_character",
522                span,
523                format!("unexpected character `{ch}`"),
524            ));
525        };
526        let local = lex.span();
527        let span: Span = Span::new(pos + local.start, pos + local.end);
528        match result {
529            Ok(kind) => {
530                if kind == TokenKind::IntLit {
531                    let slice = &source[span.range()];
532                    if slice.parse::<i64>().is_err() {
533                        return Err(CompileError::new(
534                            "bynk.lex.integer_overflow",
535                            span,
536                            format!(
537                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
538                            ),
539                        )
540                        .with_note("the range is -2^63 to 2^63 - 1"));
541                    }
542                }
543                if kind == TokenKind::FloatLit {
544                    let slice = &source[span.range()];
545                    match slice.parse::<f64>() {
546                        Ok(v) if v.is_finite() => {}
547                        _ => {
548                            return Err(CompileError::new(
549                                "bynk.lex.float_literal_overflow",
550                                span,
551                                format!(
552                                    "float literal `{slice}` is out of range for a 64-bit float"
553                                ),
554                            )
555                            .with_note(
556                                "the literal does not fit a finite IEEE 754 double; \
557                                 the largest finite value is ~1.8e308",
558                            ));
559                        }
560                    }
561                }
562                tokens.push(Token { kind, span });
563                pos = span.end;
564            }
565            Err(()) => {
566                let slice = &source[span.range()];
567                let ch = slice.chars().next().unwrap_or('\0');
568                let err = if ch == '"' {
569                    CompileError::new(
570                        "bynk.lex.unterminated_string",
571                        span,
572                        "unterminated string literal",
573                    )
574                    .with_note(
575                        "string literals must close with `\"` on the same line; \
576                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
577                    )
578                } else {
579                    CompileError::new(
580                        "bynk.lex.unexpected_character",
581                        span,
582                        format!("unexpected character `{ch}`"),
583                    )
584                };
585                return Err(err);
586            }
587        }
588    }
589    Ok(tokens)
590}
591
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes (or the line ends)? Decides whether
/// `tokenize` hand-scans the string as an `InterpStr` or defers to logos for a
/// plain `StrLit`. Deliberately tolerant — a malformed string with a hole is
/// routed here so the hole-aware scanner produces the precise error.
///
/// `start` is the offset of the opening `"`. Any escaped byte other than `(`
/// is skipped without validation, so `\"` does not end the scan and `\\(` is
/// not mistaken for a hole. The scan never crosses a line break: a backslash
/// that is the last byte of the line (or of the input) ends it with `false`.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // Skipping two bytes here would step over the newline and
                // continue on the following line, which is outside this
                // string; stop instead.
                Some(b'\n') | None => return false,
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
613
614/// Scan a double-quoted string starting at `start` (the opening `"`), returning
615/// the byte offset just past the closing `"`. Recognises the four simple
616/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
617/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
618fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
619    debug_assert_eq!(bytes[start], b'"');
620    let mut i = start + 1;
621    loop {
622        if i >= bytes.len() || bytes[i] == b'\n' {
623            return Err(CompileError::new(
624                "bynk.lex.unterminated_string",
625                Span::new(start, i.min(bytes.len())),
626                "unterminated string literal",
627            )
628            .with_note(
629                "string literals must close with `\"` on the same line; \
630                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
631            ));
632        }
633        match bytes[i] {
634            b'"' => return Ok(i + 1),
635            b'\\' => match bytes.get(i + 1) {
636                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
637                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
638                other => {
639                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
640                    return Err(CompileError::new(
641                        "bynk.lex.bad_escape",
642                        Span::new(i, (i + 2).min(bytes.len())),
643                        format!("invalid escape sequence `\\{shown}` in string literal"),
644                    )
645                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
646                }
647            },
648            // Any other byte advances one position. UTF-8 continuation bytes
649            // are all >= 0x80, so they never collide with the ASCII specials.
650            _ => i += 1,
651        }
652    }
653}
654
655/// Scan an interpolation hole body. `start` points just past the `\(`; returns
656/// the offset just past the matching `)`. Tracks paren depth and skips nested
657/// strings (whose own parens must not close the hole), recursing through
658/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
659fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
660    let mut i = start;
661    let mut depth = 1usize;
662    loop {
663        if i >= bytes.len() || bytes[i] == b'\n' {
664            return Err(CompileError::new(
665                "bynk.lex.unterminated_interpolation",
666                Span::new(start.saturating_sub(2), i.min(bytes.len())),
667                "unterminated interpolation hole",
668            )
669            .with_note(
670                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
671            ));
672        }
673        match bytes[i] {
674            b'(' => {
675                depth += 1;
676                i += 1;
677            }
678            b')' => {
679                depth -= 1;
680                i += 1;
681                if depth == 0 {
682                    return Ok(i);
683                }
684            }
685            b'"' => i = scan_str(bytes, source, i)?,
686            _ => i += 1,
687        }
688    }
689}
690
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
pub(crate) enum InterpSegment {
    /// Literal text between holes, with the `\n`, `\t`, `\"` and `\\` escapes
    /// already replaced by the characters they stand for.
    Chunk(String),
    /// Span of a hole's expression text, in offsets of the full source and
    /// excluding the surrounding `\(` and `)`.
    Hole(Span),
}
699
700/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
701/// and hole spans. Escapes in the chunks are resolved here (mirroring
702/// [`parse_string_literal`]); holes are returned as spans for the parser to
703/// re-lex and parse as expressions. (v0.43.)
704pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
705    let bytes = source.as_bytes();
706    let inner_end = span.end - 1; // the closing `"`
707    let mut segments = Vec::new();
708    let mut chunk = String::new();
709    let mut i = span.start + 1; // past the opening `"`
710    while i < inner_end {
711        match bytes[i] {
712            b'\\' => match bytes[i + 1] {
713                b'n' => {
714                    chunk.push('\n');
715                    i += 2;
716                }
717                b't' => {
718                    chunk.push('\t');
719                    i += 2;
720                }
721                b'"' => {
722                    chunk.push('"');
723                    i += 2;
724                }
725                b'\\' => {
726                    chunk.push('\\');
727                    i += 2;
728                }
729                b'(' => {
730                    if !chunk.is_empty() {
731                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
732                    }
733                    let hole_start = i + 2;
734                    let after = scan_hole(bytes, source, hole_start)?;
735                    // `after` is one past the matching `)`; the hole body is
736                    // everything up to that `)`.
737                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
738                    i = after;
739                }
740                // The lexer already validated every escape, so nothing else
741                // can appear here.
742                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
743            },
744            _ => {
745                let ch = source[i..].chars().next().unwrap();
746                chunk.push(ch);
747                i += ch.len_utf8();
748            }
749        }
750    }
751    if !chunk.is_empty() {
752        segments.push(InterpSegment::Chunk(chunk));
753    }
754    Ok(segments)
755}
756
757/// If a `---` doc-block marker line starts at or shortly after `pos` (which
758/// must be at a line boundary), return the byte offset just past the marker
759/// line (after the terminating newline, or at EOF). The doc-block grammar
760/// requires the marker to be alone on its line; leading horizontal whitespace
761/// is allowed and ignored.
762fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
763    let bytes = source.as_bytes();
764    if !at_line_start(source, pos) {
765        return None;
766    }
767    // Skip leading horizontal whitespace.
768    let mut i = pos;
769    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
770        i += 1;
771    }
772    if i + 3 > bytes.len() {
773        return None;
774    }
775    if &bytes[i..i + 3] != b"---" {
776        return None;
777    }
778    i += 3;
779    // The marker may have additional trailing dashes (per spec "three or more
780    // consecutive hyphens"). Consume them.
781    while i < bytes.len() && bytes[i] == b'-' {
782        i += 1;
783    }
784    // After the dashes, allow only horizontal whitespace then newline/EOF.
785    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
786        i += 1;
787    }
788    if i == bytes.len() {
789        return Some(i);
790    }
791    if bytes[i] == b'\n' {
792        return Some(i + 1);
793    }
794    None
795}
796
797/// Find the next closing `---` line at or after `pos`. Returns
798/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
799/// terminating newline, or at EOF).
800fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
801    let bytes = source.as_bytes();
802    while pos < bytes.len() {
803        // Advance pos to the start of a line.
804        let line_start = pos;
805        // Find the end of this line.
806        let mut line_end = line_start;
807        while line_end < bytes.len() && bytes[line_end] != b'\n' {
808            line_end += 1;
809        }
810        // Check this line.
811        if let Some(end) = doc_block_open_at(source, line_start) {
812            return Some((line_start, end));
813        }
814        // Move to the next line.
815        pos = if line_end < bytes.len() {
816            line_end + 1
817        } else {
818            line_end
819        };
820    }
821    None
822}
823
/// Reports whether byte offset `pos` is the first column of a line: either
/// the very start of `source` or the position right after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        None => true,
        Some(prev) => source.as_bytes()[prev] == b'\n',
    }
}
832
833/// Extract the body content of a doc-block token from its source span.
834/// Strips the leading and trailing `---` marker lines and returns the body
835/// verbatim. If every non-empty content line begins with the same horizontal
836/// whitespace prefix (e.g., because the doc block sits inside a brace-form
837/// commons body), that common prefix is removed so the body reads naturally
838/// when emitted as JSDoc.
839pub fn doc_block_content(source: &str, span: Span) -> String {
840    let slice = &source[span.range()];
841    // Drop the first line (opening marker).
842    let after_open = match slice.find('\n') {
843        Some(i) => &slice[i + 1..],
844        None => return String::new(),
845    };
846    let bytes = after_open.as_bytes();
847    // Trim the trailing closing-marker line.
848    let mut i = bytes.len();
849    if i > 0 && bytes[i - 1] == b'\n' {
850        i -= 1;
851    }
852    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
853        i -= 1;
854    }
855    while i > 0 && bytes[i - 1] == b'-' {
856        i -= 1;
857    }
858    if i > 0 && bytes[i - 1] == b'\n' {
859        i -= 1;
860    }
861    let body = &after_open[..i];
862
863    // Compute the common leading-whitespace prefix across all non-empty lines
864    // and strip it. This lets writers indent the doc block alongside the
865    // declaration it documents without bleeding the indent into the JSDoc.
866    let common: Option<usize> = body
867        .lines()
868        .filter(|l| !l.trim().is_empty())
869        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
870        .min();
871    let strip = common.unwrap_or(0);
872    if strip == 0 {
873        return body.to_string();
874    }
875    let mut out = String::with_capacity(body.len());
876    let mut first = true;
877    for line in body.lines() {
878        if !first {
879            out.push('\n');
880        }
881        first = false;
882        if line.trim().is_empty() {
883            // Preserve blank lines.
884            continue;
885        }
886        let leading: usize = line
887            .bytes()
888            .take_while(|&b| b == b' ' || b == b'\t')
889            .count();
890        let drop = strip.min(leading);
891        out.push_str(&line[drop..]);
892    }
893    out
894}
895
896/// Extract the body of a `Comment` trivia token: everything after the
897/// leading `--` marker, preserving its inline whitespace verbatim. Used by
898/// the parser when attaching comments to declarations.
899pub fn comment_body(source: &str, span: Span) -> &str {
900    let slice = &source[span.range()];
901    // Strip leading "--" if present (defensive — the lexer always emits
902    // Comment tokens whose span begins with `--`).
903    slice.strip_prefix("--").unwrap_or(slice)
904}
905
/// Returns true if, scanning `source` from byte offset `from` (inclusive) up
/// to `to` (exclusive), a newline is reached before any byte other than a
/// space, tab, or carriage return. Returns false if a different byte comes
/// first, if the range holds no newline, or if the range is empty
/// (`to <= from`). Used by the parser to detect orphan doc blocks.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. When the next declaration starts on the very next
/// line, the gap between the two holds at most indentation and no newline, so
/// a newline found in the gap means at least one entirely blank line separates
/// the doc block from the declaration.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    // An empty or inverted range makes this loop a no-op.
    for offset in from..to {
        match bytes[offset] {
            b'\n' => return true,
            // Horizontal whitespace and CR do not decide anything; keep going.
            b' ' | b'\t' | b'\r' => {}
            // Real content before any newline: the gap is not blank.
            _ => return false,
        }
    }
    false
}
932
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source`, panicking on a lex error, and keeps only the token kinds.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.into_iter().map(|token| token.kind).collect()
    }

    /// Asserts that `source` lexes to exactly the `expected` kinds, in order.
    #[track_caller]
    fn assert_kinds(source: &str, expected: &[TokenKind]) {
        assert_eq!(kinds(source), expected);
    }

    // Asserts that lexing `$source` fails and that the error's `category`
    // equals `$category`.
    macro_rules! assert_lex_error {
        ($source:expr, $category:expr) => {
            assert_eq!(tokenize($source).unwrap_err().category, $category)
        };
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_kinds(
            "commons type fn where and true false Int String Bool foo bar",
            &[
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident,
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_kinds(
            r#"0 42 "hello" "with\nescape""#,
            &[IntLit, IntLit, StrLit, StrLit],
        );
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        assert_kinds(
            "-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @",
            &[
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
                At,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: comments survive lexing as `Comment` tokens so the formatter
        // can attach them and write them back out.
        use TokenKind::*;
        assert_kinds(
            "-- a comment\ntype X = Int -- trailing\n",
            &[Comment, Type, Ident, Eq, Int, Comment],
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let source = "-- hello world\n";
        let tokens = tokenize(source).unwrap();
        assert_eq!(tokens.len(), 1);
        let only = &tokens[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(source, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Back-to-back comment lines stay separate tokens: the newline
        // between them belongs to neither comment's span.
        assert_kinds(
            "-- one\n-- two\n",
            &[TokenKind::Comment, TokenKind::Comment],
        );
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error!("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error!("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error!("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_kinds(
            "let if else Ok Err Result ValidationError",
            &[Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_kinds("x?", &[Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_kinds(
            "enum match Option record self Some None is",
            &[Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_kinds("| || |", &[Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_kinds("expect suite case", &[Expect, Suite, Case]);
        // v0.118 retired `mocks` and `wires`; they lex as plain identifiers.
        assert_kinds("mocks wires", &[Ident, Ident]);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_kinds("_ =>", &[Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_kinds(r#""Hello, \(name)!""#, &[InterpStr]);
        // Without a hole the string remains a `StrLit` from the logos path.
        assert_kinds(r#""Hello, world""#, &[StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The `)` that closes `f(x)` must not end the hole early.
        assert_kinds(r#""= \(f(x))""#, &[InterpStr]);
        // Nor may a `)` sitting inside a string nested in the hole.
        assert_kinds(r#""= \(label(")"))""#, &[InterpStr]);
        // An interpolated string nested inside a hole.
        assert_kinds(r#""out \("in \(x)")""#, &[InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` is an escaped backslash followed by `(`, so there is no hole
        // and the string lexes as a plain `StrLit` on the logos path.
        assert_kinds(r#""a \\(b) c""#, &[StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole reaches the end of the line without a closing `)`.
        assert_lex_error!(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation"
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole closes, but a newline arrives before the closing `"`.
        assert_lex_error!("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error!(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs