bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
///
/// Three variants carry no logos attribute at all — `DocBlock`, `Comment` and
/// `InterpStr` — and are only ever produced by the hand-written scanning in
/// [`tokenize`].
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords. Exact keyword text lexes as the keyword variant rather than
    // as `Ident`.
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.110 keyword (ADR 0142): the `Bytes` base type.
    #[token("Bytes")]
    Bytes,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    // Named `Self_` because `Self` is a reserved word in Rust.
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords
    #[token("assert")]
    Assert,
    #[token("expect")]
    Expect,
    #[token("mocks")]
    Mocks,
    #[token("test")]
    Test,
    // v0.16 keyword
    #[token("wires")]
    Wires,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits or underscores. A leading underscore is not accepted.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // Digits only; the 64-bit range check happens in `tokenize`, not here.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern. Note: the `Ident` regex requires a leading ASCII
    // letter, so `_foo` lexes as `Underscore` followed by `Ident`, not as a
    // single identifier.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
296
297impl TokenKind {
298    /// Human-readable display name for diagnostics.
299    pub fn describe(self) -> &'static str {
300        use TokenKind::*;
301        match self {
302            Commons => "`commons`",
303            Type => "`type`",
304            Fn => "`fn`",
305            Where => "`where`",
306            And => "`and`",
307            True => "`true`",
308            False => "`false`",
309            Int => "`Int`",
310            String => "`String`",
311            Bool => "`Bool`",
312            Float => "`Float`",
313            Duration => "`Duration`",
314            Instant => "`Instant`",
315            Bytes => "`Bytes`",
316            Let => "`let`",
317            If => "`if`",
318            Else => "`else`",
319            Ok => "`Ok`",
320            Err => "`Err`",
321            Result => "`Result`",
322            ValidationError => "`ValidationError`",
323            JsonError => "`JsonError`",
324            Enum => "`enum`",
325            Match => "`match`",
326            Option => "`Option`",
327            Record => "`record`",
328            Self_ => "`self`",
329            Some => "`Some`",
330            None => "`None`",
331            Is => "`is`",
332            Opaque => "`opaque`",
333            Uses => "`uses`",
334            Context => "`context`",
335            Consumes => "`consumes`",
336            Exports => "`exports`",
337            Transparent => "`transparent`",
338            As => "`as`",
339            Assert => "`assert`",
340            Expect => "`expect`",
341            Mocks => "`mocks`",
342            Test => "`test`",
343            Wires => "`wires`",
344            Adapter => "`adapter`",
345            Binding => "`binding`",
346            Agent => "`agent`",
347            Capability => "`capability`",
348            Effect => "`Effect`",
349            Given => "`given`",
350            On => "`on`",
351            Http => "`http`",
352            Cron => "`cron`",
353            Queue => "`queue`",
354            From => "`from`",
355            Protocol => "`protocol`",
356            Provides => "`provides`",
357            Service => "`service`",
358            Actor => "`actor`",
359            By => "`by`",
360            Invariant => "`invariant`",
361            Implies => "`implies`",
362            ColonEq => "`:=`",
363            DotDotDot => "`...`",
364            LArrow => "`<-`",
365            TildeArrow => "`~>`",
366            DocBlock => "documentation block",
367            Comment => "line comment",
368            Ident => "identifier",
369            IntLit => "integer literal",
370            FloatLit => "float literal",
371            StrLit => "string literal",
372            InterpStr => "interpolated string",
373            Arrow => "`->`",
374            EqEq => "`==`",
375            BangEq => "`!=`",
376            LtEq => "`<=`",
377            GtEq => "`>=`",
378            AmpAmp => "`&&`",
379            PipePipe => "`||`",
380            Plus => "`+`",
381            Minus => "`-`",
382            Star => "`*`",
383            Slash => "`/`",
384            Bang => "`!`",
385            Eq => "`=`",
386            Lt => "`<`",
387            Gt => "`>`",
388            Question => "`?`",
389            FatArrow => "`=>`",
390            Underscore => "`_`",
391            Pipe => "`|`",
392            At => "`@`",
393            LParen => "`(`",
394            RParen => "`)`",
395            LBrace => "`{`",
396            RBrace => "`}`",
397            LBracket => "`[`",
398            RBracket => "`]`",
399            Comma => "`,`",
400            Colon => "`:`",
401            Dot => "`.`",
402        }
403    }
404}
405
/// A token plus its source span.
///
/// The token stores no text of its own; the lexeme is whatever the source
/// string holds at `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which terminal (or trivia kind) this token is.
    pub kind: TokenKind,
    /// Byte offsets of the token within the string given to [`tokenize`].
    pub span: Span,
}
412
413/// Tokenise a source string. Returns the full token vector or the first
414/// lexical error.
415///
416/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
417/// outside the logos-generated lexer: we scan the source one segment at a
418/// time, dispatching to logos for ordinary tokens between non-token spans.
419pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
420    let mut tokens = Vec::new();
421    let bytes = source.as_bytes();
422    let mut pos = 0;
423    while pos < bytes.len() {
424        // Detect a `---` doc-block marker at the start of a line (the line may
425        // begin with leading whitespace; the marker itself must be alone on
426        // its line).
427        if let Some(open_end) = doc_block_open_at(source, pos) {
428            // Find the matching closing `---` line.
429            match doc_block_close(source, open_end) {
430                Some((close_start, close_end)) => {
431                    let span = Span::new(pos, close_end);
432                    tokens.push(Token {
433                        kind: TokenKind::DocBlock,
434                        span,
435                    });
436                    let _ = close_start;
437                    pos = close_end;
438                    continue;
439                }
440                None => {
441                    return Err(CompileError::new(
442                        "bynk.lex.unclosed_doc_block",
443                        Span::new(pos, open_end),
444                        "documentation block opened but never closed",
445                    )
446                    .with_note(
447                        "a doc block must be terminated by another `---` on a line by itself",
448                    ));
449                }
450            }
451        }
452        // A `--` line comment: emit a `Comment` token covering everything
453        // up to (but not including) the terminating newline. Doc-block
454        // detection above already ruled out a `---` marker at line start
455        // — and once we've consumed past the leading `--`, any further
456        // dashes are part of the comment body. Preserving comments as
457        // trivia tokens lets the parser attach them to declarations so
458        // the formatter can emit them in place (v1.1 LSP spec §3.5).
459        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
460            let start = pos;
461            while pos < bytes.len() && bytes[pos] != b'\n' {
462                pos += 1;
463            }
464            tokens.push(Token {
465                kind: TokenKind::Comment,
466                span: Span::new(start, pos),
467            });
468            continue;
469        }
470        // Skip ordinary whitespace inline (logos handles it too, but we may
471        // be in the middle of the source between specials).
472        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
473            pos += 1;
474            continue;
475        }
476        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
477        // actually contain a `\(` hole are hand-scanned here; plain strings
478        // fall through to the logos `StrLit` path unchanged. `\(` is an
479        // invalid escape in the logos grammar, so this never re-routes a
480        // currently-valid literal.
481        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
482            let end = scan_str(bytes, source, pos)?;
483            tokens.push(Token {
484                kind: TokenKind::InterpStr,
485                span: Span::new(pos, end),
486            });
487            pos = end;
488            continue;
489        }
490        // Otherwise dispatch a single logos token starting at `pos`.
491        let mut lex = TokenKind::lexer(&source[pos..]);
492        let Some(result) = lex.next() else {
493            // No token at this position; treat as unexpected character so
494            // the user sees something useful.
495            let ch = source[pos..].chars().next().unwrap_or('\0');
496            let span = Span::new(pos, pos + ch.len_utf8());
497            return Err(CompileError::new(
498                "bynk.lex.unexpected_character",
499                span,
500                format!("unexpected character `{ch}`"),
501            ));
502        };
503        let local = lex.span();
504        let span: Span = Span::new(pos + local.start, pos + local.end);
505        match result {
506            Ok(kind) => {
507                if kind == TokenKind::IntLit {
508                    let slice = &source[span.range()];
509                    if slice.parse::<i64>().is_err() {
510                        return Err(CompileError::new(
511                            "bynk.lex.integer_overflow",
512                            span,
513                            format!(
514                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
515                            ),
516                        )
517                        .with_note("the range is -2^63 to 2^63 - 1"));
518                    }
519                }
520                if kind == TokenKind::FloatLit {
521                    let slice = &source[span.range()];
522                    match slice.parse::<f64>() {
523                        Ok(v) if v.is_finite() => {}
524                        _ => {
525                            return Err(CompileError::new(
526                                "bynk.lex.float_literal_overflow",
527                                span,
528                                format!(
529                                    "float literal `{slice}` is out of range for a 64-bit float"
530                                ),
531                            )
532                            .with_note(
533                                "the literal does not fit a finite IEEE 754 double; \
534                                 the largest finite value is ~1.8e308",
535                            ));
536                        }
537                    }
538                }
539                tokens.push(Token { kind, span });
540                pos = span.end;
541            }
542            Err(()) => {
543                let slice = &source[span.range()];
544                let ch = slice.chars().next().unwrap_or('\0');
545                let err = if ch == '"' {
546                    CompileError::new(
547                        "bynk.lex.unterminated_string",
548                        span,
549                        "unterminated string literal",
550                    )
551                    .with_note(
552                        "string literals must close with `\"` on the same line; \
553                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
554                    )
555                } else {
556                    CompileError::new(
557                        "bynk.lex.unexpected_character",
558                        span,
559                        format!("unexpected character `{ch}`"),
560                    )
561                };
562                return Err(err);
563            }
564        }
565    }
566    Ok(tokens)
567}
568
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes or the line ends?
///
/// Decides whether `tokenize` hand-scans the string as an `InterpStr` or
/// defers to logos for a plain `StrLit`. Deliberately tolerant: any other
/// escape (valid or not) is skipped as a two-byte unit, so a malformed string
/// that does contain a hole is still routed to the hole-aware scanner, which
/// then reports the precise error.
///
/// `start` is the index of the opening `"`. The scan never looks past the end
/// of the current line: a backslash that is the last byte before a newline (or
/// before end of input) ends the search with `false`, so the answer cannot
/// depend on what the following source line contains.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            // Closing quote or end of line: no hole in this string.
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // The line (or the input) ends right after the backslash.
                // Skipping two bytes here would step over the newline and
                // keep scanning the next line.
                Some(b'\n') | None => return false,
                // Some other escape: skip the backslash and the escaped byte
                // so an escaped `"` does not look like the closing quote.
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
590
591/// Scan a double-quoted string starting at `start` (the opening `"`), returning
592/// the byte offset just past the closing `"`. Recognises the four simple
593/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
594/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
595fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
596    debug_assert_eq!(bytes[start], b'"');
597    let mut i = start + 1;
598    loop {
599        if i >= bytes.len() || bytes[i] == b'\n' {
600            return Err(CompileError::new(
601                "bynk.lex.unterminated_string",
602                Span::new(start, i.min(bytes.len())),
603                "unterminated string literal",
604            )
605            .with_note(
606                "string literals must close with `\"` on the same line; \
607                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
608            ));
609        }
610        match bytes[i] {
611            b'"' => return Ok(i + 1),
612            b'\\' => match bytes.get(i + 1) {
613                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
614                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
615                other => {
616                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
617                    return Err(CompileError::new(
618                        "bynk.lex.bad_escape",
619                        Span::new(i, (i + 2).min(bytes.len())),
620                        format!("invalid escape sequence `\\{shown}` in string literal"),
621                    )
622                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
623                }
624            },
625            // Any other byte advances one position. UTF-8 continuation bytes
626            // are all >= 0x80, so they never collide with the ASCII specials.
627            _ => i += 1,
628        }
629    }
630}
631
632/// Scan an interpolation hole body. `start` points just past the `\(`; returns
633/// the offset just past the matching `)`. Tracks paren depth and skips nested
634/// strings (whose own parens must not close the hole), recursing through
635/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
636fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
637    let mut i = start;
638    let mut depth = 1usize;
639    loop {
640        if i >= bytes.len() || bytes[i] == b'\n' {
641            return Err(CompileError::new(
642                "bynk.lex.unterminated_interpolation",
643                Span::new(start.saturating_sub(2), i.min(bytes.len())),
644                "unterminated interpolation hole",
645            )
646            .with_note(
647                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
648            ));
649        }
650        match bytes[i] {
651            b'(' => {
652                depth += 1;
653                i += 1;
654            }
655            b')' => {
656                depth -= 1;
657                i += 1;
658                if depth == 0 {
659                    return Ok(i);
660                }
661            }
662            b'"' => i = scan_str(bytes, source, i)?,
663            _ => i += 1,
664        }
665    }
666}
667
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
pub(crate) enum InterpSegment {
    /// Literal text between holes, with `\n`, `\t`, `\"` and `\\` already
    /// replaced by the characters they stand for. Never empty as produced by
    /// [`split_interp`].
    Chunk(String),
    /// Byte span, in the original source, of the expression text inside a
    /// hole — excluding the opening `\(` and the closing `)`.
    Hole(Span),
}
676
677/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
678/// and hole spans. Escapes in the chunks are resolved here (mirroring
679/// [`parse_string_literal`]); holes are returned as spans for the parser to
680/// re-lex and parse as expressions. (v0.43.)
681pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
682    let bytes = source.as_bytes();
683    let inner_end = span.end - 1; // the closing `"`
684    let mut segments = Vec::new();
685    let mut chunk = String::new();
686    let mut i = span.start + 1; // past the opening `"`
687    while i < inner_end {
688        match bytes[i] {
689            b'\\' => match bytes[i + 1] {
690                b'n' => {
691                    chunk.push('\n');
692                    i += 2;
693                }
694                b't' => {
695                    chunk.push('\t');
696                    i += 2;
697                }
698                b'"' => {
699                    chunk.push('"');
700                    i += 2;
701                }
702                b'\\' => {
703                    chunk.push('\\');
704                    i += 2;
705                }
706                b'(' => {
707                    if !chunk.is_empty() {
708                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
709                    }
710                    let hole_start = i + 2;
711                    let after = scan_hole(bytes, source, hole_start)?;
712                    // `after` is one past the matching `)`; the hole body is
713                    // everything up to that `)`.
714                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
715                    i = after;
716                }
717                // The lexer already validated every escape, so nothing else
718                // can appear here.
719                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
720            },
721            _ => {
722                let ch = source[i..].chars().next().unwrap();
723                chunk.push(ch);
724                i += ch.len_utf8();
725            }
726        }
727    }
728    if !chunk.is_empty() {
729        segments.push(InterpSegment::Chunk(chunk));
730    }
731    Ok(segments)
732}
733
734/// If a `---` doc-block marker line starts at or shortly after `pos` (which
735/// must be at a line boundary), return the byte offset just past the marker
736/// line (after the terminating newline, or at EOF). The doc-block grammar
737/// requires the marker to be alone on its line; leading horizontal whitespace
738/// is allowed and ignored.
739fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
740    let bytes = source.as_bytes();
741    if !at_line_start(source, pos) {
742        return None;
743    }
744    // Skip leading horizontal whitespace.
745    let mut i = pos;
746    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
747        i += 1;
748    }
749    if i + 3 > bytes.len() {
750        return None;
751    }
752    if &bytes[i..i + 3] != b"---" {
753        return None;
754    }
755    i += 3;
756    // The marker may have additional trailing dashes (per spec "three or more
757    // consecutive hyphens"). Consume them.
758    while i < bytes.len() && bytes[i] == b'-' {
759        i += 1;
760    }
761    // After the dashes, allow only horizontal whitespace then newline/EOF.
762    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
763        i += 1;
764    }
765    if i == bytes.len() {
766        return Some(i);
767    }
768    if bytes[i] == b'\n' {
769        return Some(i + 1);
770    }
771    None
772}
773
774/// Find the next closing `---` line at or after `pos`. Returns
775/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
776/// terminating newline, or at EOF).
777fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
778    let bytes = source.as_bytes();
779    while pos < bytes.len() {
780        // Advance pos to the start of a line.
781        let line_start = pos;
782        // Find the end of this line.
783        let mut line_end = line_start;
784        while line_end < bytes.len() && bytes[line_end] != b'\n' {
785            line_end += 1;
786        }
787        // Check this line.
788        if let Some(end) = doc_block_open_at(source, line_start) {
789            return Some((line_start, end));
790        }
791        // Move to the next line.
792        pos = if line_end < bytes.len() {
793            line_end + 1
794        } else {
795            line_end
796        };
797    }
798    None
799}
800
/// Reports whether byte offset `pos` sits at column 0: either the very start
/// of `source`, or immediately after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Offset 0 has no preceding byte and always starts a line.
        None => true,
        Some(prev) => source.as_bytes()[prev] == b'\n',
    }
}
809
810/// Extract the body content of a doc-block token from its source span.
811/// Strips the leading and trailing `---` marker lines and returns the body
812/// verbatim. If every non-empty content line begins with the same horizontal
813/// whitespace prefix (e.g., because the doc block sits inside a brace-form
814/// commons body), that common prefix is removed so the body reads naturally
815/// when emitted as JSDoc.
816pub fn doc_block_content(source: &str, span: Span) -> String {
817    let slice = &source[span.range()];
818    // Drop the first line (opening marker).
819    let after_open = match slice.find('\n') {
820        Some(i) => &slice[i + 1..],
821        None => return String::new(),
822    };
823    let bytes = after_open.as_bytes();
824    // Trim the trailing closing-marker line.
825    let mut i = bytes.len();
826    if i > 0 && bytes[i - 1] == b'\n' {
827        i -= 1;
828    }
829    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
830        i -= 1;
831    }
832    while i > 0 && bytes[i - 1] == b'-' {
833        i -= 1;
834    }
835    if i > 0 && bytes[i - 1] == b'\n' {
836        i -= 1;
837    }
838    let body = &after_open[..i];
839
840    // Compute the common leading-whitespace prefix across all non-empty lines
841    // and strip it. This lets writers indent the doc block alongside the
842    // declaration it documents without bleeding the indent into the JSDoc.
843    let common: Option<usize> = body
844        .lines()
845        .filter(|l| !l.trim().is_empty())
846        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
847        .min();
848    let strip = common.unwrap_or(0);
849    if strip == 0 {
850        return body.to_string();
851    }
852    let mut out = String::with_capacity(body.len());
853    let mut first = true;
854    for line in body.lines() {
855        if !first {
856            out.push('\n');
857        }
858        first = false;
859        if line.trim().is_empty() {
860            // Preserve blank lines.
861            continue;
862        }
863        let leading: usize = line
864            .bytes()
865            .take_while(|&b| b == b' ' || b == b'\t')
866            .count();
867        let drop = strip.min(leading);
868        out.push_str(&line[drop..]);
869    }
870    out
871}
872
873/// Extract the body of a `Comment` trivia token: everything after the
874/// leading `--` marker, preserving its inline whitespace verbatim. Used by
875/// the parser when attaching comments to declarations.
876pub fn comment_body(source: &str, span: Span) -> &str {
877    let slice = &source[span.range()];
878    // Strip leading "--" if present (defensive — the lexer always emits
879    // Comment tokens whose span begins with `--`).
880    slice.strip_prefix("--").unwrap_or(slice)
881}
882
/// Returns true if the gap `source[from..to]` (byte offsets; `from`
/// inclusive, `to` exclusive) reaches a newline before any non-whitespace
/// byte. Used by the parser to detect orphan doc blocks.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). Any newline in the substring therefore implies at least one
/// entirely-blank line separating the doc from the declaration.
///
/// The scan skips spaces, tabs and carriage returns and stops at the first
/// other byte: a `\n` yields `true`, anything else yields `false`. An empty
/// or whitespace-only gap yields `false`.
///
/// Out-of-range offsets never panic: `to` is clamped to `source.len()`, and
/// the result is `false` when `from` lies at or beyond the (clamped) end.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let end = to.min(bytes.len());
    // `get` returns `None` when `from > end`, which covers both an inverted
    // range and a `from` past the end of the source.
    let gap = match bytes.get(from..end) {
        Some(gap) => gap,
        None => return false,
    };
    let first_significant = gap
        .iter()
        .find(|&&b| !matches!(b, b' ' | b'\t' | b'\r'));
    first_significant == Some(&b'\n')
}
909
#[cfg(test)]
mod tests {
    use super::*;
    // Short alias so expected token sequences can be spelled with explicit
    // paths rather than a per-test glob import.
    use super::TokenKind as Tk;

    /// Lex `source` (panicking on any lex error) and keep only each token's
    /// kind, in order.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let mut out = Vec::new();
        for token in tokenize(source).unwrap() {
            out.push(token.kind);
        }
        out
    }

    /// Lex `source`, which is expected to fail, and check the category of
    /// the reported error.
    #[track_caller]
    fn assert_lex_error(source: &str, category: &str) {
        let err = tokenize(source).unwrap_err();
        assert_eq!(err.category, category);
    }

    #[test]
    fn keywords_and_idents() {
        let expected = vec![
            Tk::Commons,
            Tk::Type,
            Tk::Fn,
            Tk::Where,
            Tk::And,
            Tk::True,
            Tk::False,
            Tk::Int,
            Tk::String,
            Tk::Bool,
            Tk::Ident,
            Tk::Ident,
        ];
        assert_eq!(
            kinds("commons type fn where and true false Int String Bool foo bar"),
            expected,
        );
    }

    #[test]
    fn integer_and_string_literals() {
        let expected = vec![Tk::IntLit, Tk::IntLit, Tk::StrLit, Tk::StrLit];
        assert_eq!(kinds(r#"0 42 "hello" "with\nescape""#), expected);
    }

    #[test]
    fn operators() {
        let expected = vec![
            Tk::Arrow,
            Tk::EqEq,
            Tk::BangEq,
            Tk::LtEq,
            Tk::GtEq,
            Tk::AmpAmp,
            Tk::PipePipe,
            Tk::Plus,
            Tk::Minus,
            Tk::Star,
            Tk::Slash,
            Tk::Bang,
            Tk::Eq,
            Tk::Lt,
            Tk::Gt,
            Tk::LParen,
            Tk::RParen,
            Tk::LBrace,
            Tk::RBrace,
            Tk::LBracket,
            Tk::RBracket,
            Tk::Comma,
            Tk::Colon,
            Tk::Dot,
            Tk::At,
        ];
        assert_eq!(
            kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @"),
            expected,
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: comments survive lexing as Comment tokens, which is what
        // lets the formatter attach them and emit them again.
        let src = "-- a comment\ntype X = Int -- trailing\n";
        let expected = vec![
            Tk::Comment,
            Tk::Type,
            Tk::Ident,
            Tk::Eq,
            Tk::Int,
            Tk::Comment,
        ];
        assert_eq!(kinds(src), expected);
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        let only = &toks[0];
        assert_eq!(only.kind, TokenKind::Comment);
        assert_eq!(comment_body(src, only.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Back-to-back comment lines must lex as two separate tokens: the
        // newline separating them belongs to neither comment's span.
        assert_eq!(kinds("-- one\n-- two\n"), vec![Tk::Comment, Tk::Comment]);
    }

    #[test]
    fn unterminated_string_is_error() {
        assert_lex_error("\"oops\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        assert_lex_error("99999999999999999999", "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        assert_lex_error("type X = Int $", "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        let expected = vec![
            Tk::Let,
            Tk::If,
            Tk::Else,
            Tk::Ok,
            Tk::Err,
            Tk::Result,
            Tk::ValidationError,
        ];
        assert_eq!(kinds("let if else Ok Err Result ValidationError"), expected);
    }

    #[test]
    fn question_token() {
        assert_eq!(kinds("x?"), vec![Tk::Ident, Tk::Question]);
    }

    #[test]
    fn v0_2_keywords() {
        let expected = vec![
            Tk::Enum,
            Tk::Match,
            Tk::Option,
            Tk::Record,
            Tk::Self_,
            Tk::Some,
            Tk::None,
            Tk::Is,
        ];
        assert_eq!(kinds("enum match Option record self Some None is"), expected);
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        assert_eq!(kinds("| || |"), vec![Tk::Pipe, Tk::PipePipe, Tk::Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        let expected = vec![Tk::Assert, Tk::Expect, Tk::Mocks, Tk::Test];
        assert_eq!(kinds("assert expect mocks test"), expected);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        assert_eq!(kinds("_ =>"), vec![Tk::Underscore, Tk::FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        assert_eq!(kinds(r#""Hello, \(name)!""#), vec![Tk::InterpStr]);
        // Without a hole the string remains a plain `StrLit` (logos path).
        assert_eq!(kinds(r#""Hello, world""#), vec![Tk::StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        // The `)` that closes `f(x)` must not end the hole prematurely.
        assert_eq!(kinds(r#""= \(f(x))""#), vec![Tk::InterpStr]);
        // Likewise a `)` sitting inside a string nested in the hole.
        assert_eq!(kinds(r#""= \(label(")"))""#), vec![Tk::InterpStr]);
        // An interpolated string nested inside a hole.
        assert_eq!(kinds(r#""out \("in \(x)")""#), vec![Tk::InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        // `\\(` is an escaped backslash followed by a literal `(`, so there
        // is no hole and the string lexes as a plain `StrLit` via logos.
        assert_eq!(kinds(r#""a \\(b) c""#), vec![Tk::StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole reaches the end of the line with no closing `)`.
        assert_lex_error(
            "\"value \\(x + 1\n\"",
            "bynk.lex.unterminated_interpolation",
        );
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole is closed, but a newline arrives before the closing `"`.
        assert_lex_error("\"value \\(x) more\n", "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        assert_lex_error(r#""a \q \(x)""#, "bynk.lex.bad_escape");
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs