bynk_syntax/
lexer.rs

1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Variants carrying a `#[token]` / `#[regex]` attribute are produced by the
/// logos-generated lexer. `DocBlock`, `Comment` and `InterpStr` carry no
/// attribute: they are pushed by [`tokenize`] itself.
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords
    #[token("assert")]
    Assert,
    #[token("expect")]
    Expect,
    #[token("mocks")]
    Mocks,
    #[token("test")]
    Test,
    // v0.16 keyword
    #[token("wires")]
    Wires,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("commit")]
    Commit,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    #[token("state")]
    State,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by ASCII letters, digits or `_`.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // An unsigned run of decimal digits. [`tokenize`] additionally rejects
    // values that do not parse as an `i64`.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals. [`tokenize`] additionally rejects
    // values that do not parse to a finite `f64`.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern.
    // NOTE(review): the `Ident` regex above requires a leading ASCII letter,
    // so `_foo` lexes as `Underscore` followed by `Ident`, not as a single
    // identifier. An earlier version of this comment claimed the opposite —
    // confirm which behaviour the grammar intends.
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
269
270impl TokenKind {
271    /// Human-readable display name for diagnostics.
272    pub fn describe(self) -> &'static str {
273        use TokenKind::*;
274        match self {
275            Commons => "`commons`",
276            Type => "`type`",
277            Fn => "`fn`",
278            Where => "`where`",
279            And => "`and`",
280            True => "`true`",
281            False => "`false`",
282            Int => "`Int`",
283            String => "`String`",
284            Bool => "`Bool`",
285            Float => "`Float`",
286            Let => "`let`",
287            If => "`if`",
288            Else => "`else`",
289            Ok => "`Ok`",
290            Err => "`Err`",
291            Result => "`Result`",
292            ValidationError => "`ValidationError`",
293            JsonError => "`JsonError`",
294            Enum => "`enum`",
295            Match => "`match`",
296            Option => "`Option`",
297            Record => "`record`",
298            Self_ => "`self`",
299            Some => "`Some`",
300            None => "`None`",
301            Is => "`is`",
302            Opaque => "`opaque`",
303            Uses => "`uses`",
304            Context => "`context`",
305            Consumes => "`consumes`",
306            Exports => "`exports`",
307            Transparent => "`transparent`",
308            As => "`as`",
309            Assert => "`assert`",
310            Expect => "`expect`",
311            Mocks => "`mocks`",
312            Test => "`test`",
313            Wires => "`wires`",
314            Adapter => "`adapter`",
315            Binding => "`binding`",
316            Agent => "`agent`",
317            Capability => "`capability`",
318            Commit => "`commit`",
319            Effect => "`Effect`",
320            Given => "`given`",
321            On => "`on`",
322            Http => "`http`",
323            Cron => "`cron`",
324            Queue => "`queue`",
325            From => "`from`",
326            Protocol => "`protocol`",
327            Provides => "`provides`",
328            Service => "`service`",
329            State => "`state`",
330            Actor => "`actor`",
331            By => "`by`",
332            DotDotDot => "`...`",
333            LArrow => "`<-`",
334            DocBlock => "documentation block",
335            Comment => "line comment",
336            Ident => "identifier",
337            IntLit => "integer literal",
338            FloatLit => "float literal",
339            StrLit => "string literal",
340            InterpStr => "interpolated string",
341            Arrow => "`->`",
342            EqEq => "`==`",
343            BangEq => "`!=`",
344            LtEq => "`<=`",
345            GtEq => "`>=`",
346            AmpAmp => "`&&`",
347            PipePipe => "`||`",
348            Plus => "`+`",
349            Minus => "`-`",
350            Star => "`*`",
351            Slash => "`/`",
352            Bang => "`!`",
353            Eq => "`=`",
354            Lt => "`<`",
355            Gt => "`>`",
356            Question => "`?`",
357            FatArrow => "`=>`",
358            Underscore => "`_`",
359            Pipe => "`|`",
360            LParen => "`(`",
361            RParen => "`)`",
362            LBrace => "`{`",
363            RBrace => "`}`",
364            LBracket => "`[`",
365            RBracket => "`]`",
366            Comma => "`,`",
367            Colon => "`:`",
368            Dot => "`.`",
369        }
370    }
371}
372
/// A token plus its source span.
///
/// The token stores no lexeme text of its own; the text is recovered by
/// slicing the source string with `span`.
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which terminal (or trivia kind) this token is.
    pub kind: TokenKind,
    /// Byte range of the token within the source passed to [`tokenize`].
    pub span: Span,
}
379
380/// Tokenise a source string. Returns the full token vector or the first
381/// lexical error.
382///
383/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
384/// outside the logos-generated lexer: we scan the source one segment at a
385/// time, dispatching to logos for ordinary tokens between non-token spans.
386pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
387    let mut tokens = Vec::new();
388    let bytes = source.as_bytes();
389    let mut pos = 0;
390    while pos < bytes.len() {
391        // Detect a `---` doc-block marker at the start of a line (the line may
392        // begin with leading whitespace; the marker itself must be alone on
393        // its line).
394        if let Some(open_end) = doc_block_open_at(source, pos) {
395            // Find the matching closing `---` line.
396            match doc_block_close(source, open_end) {
397                Some((close_start, close_end)) => {
398                    let span = Span::new(pos, close_end);
399                    tokens.push(Token {
400                        kind: TokenKind::DocBlock,
401                        span,
402                    });
403                    let _ = close_start;
404                    pos = close_end;
405                    continue;
406                }
407                None => {
408                    return Err(CompileError::new(
409                        "bynk.lex.unclosed_doc_block",
410                        Span::new(pos, open_end),
411                        "documentation block opened but never closed",
412                    )
413                    .with_note(
414                        "a doc block must be terminated by another `---` on a line by itself",
415                    ));
416                }
417            }
418        }
419        // A `--` line comment: emit a `Comment` token covering everything
420        // up to (but not including) the terminating newline. Doc-block
421        // detection above already ruled out a `---` marker at line start
422        // — and once we've consumed past the leading `--`, any further
423        // dashes are part of the comment body. Preserving comments as
424        // trivia tokens lets the parser attach them to declarations so
425        // the formatter can emit them in place (v1.1 LSP spec §3.5).
426        if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
427            let start = pos;
428            while pos < bytes.len() && bytes[pos] != b'\n' {
429                pos += 1;
430            }
431            tokens.push(Token {
432                kind: TokenKind::Comment,
433                span: Span::new(start, pos),
434            });
435            continue;
436        }
437        // Skip ordinary whitespace inline (logos handles it too, but we may
438        // be in the middle of the source between specials).
439        if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
440            pos += 1;
441            continue;
442        }
443        // An interpolated string `"… \(expr) …"` (v0.43): only strings that
444        // actually contain a `\(` hole are hand-scanned here; plain strings
445        // fall through to the logos `StrLit` path unchanged. `\(` is an
446        // invalid escape in the logos grammar, so this never re-routes a
447        // currently-valid literal.
448        if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
449            let end = scan_str(bytes, source, pos)?;
450            tokens.push(Token {
451                kind: TokenKind::InterpStr,
452                span: Span::new(pos, end),
453            });
454            pos = end;
455            continue;
456        }
457        // Otherwise dispatch a single logos token starting at `pos`.
458        let mut lex = TokenKind::lexer(&source[pos..]);
459        let Some(result) = lex.next() else {
460            // No token at this position; treat as unexpected character so
461            // the user sees something useful.
462            let ch = source[pos..].chars().next().unwrap_or('\0');
463            let span = Span::new(pos, pos + ch.len_utf8());
464            return Err(CompileError::new(
465                "bynk.lex.unexpected_character",
466                span,
467                format!("unexpected character `{ch}`"),
468            ));
469        };
470        let local = lex.span();
471        let span: Span = Span::new(pos + local.start, pos + local.end);
472        match result {
473            Ok(kind) => {
474                if kind == TokenKind::IntLit {
475                    let slice = &source[span.range()];
476                    if slice.parse::<i64>().is_err() {
477                        return Err(CompileError::new(
478                            "bynk.lex.integer_overflow",
479                            span,
480                            format!(
481                                "integer literal `{slice}` is out of range for a 64-bit signed integer"
482                            ),
483                        )
484                        .with_note("the range is -2^63 to 2^63 - 1"));
485                    }
486                }
487                if kind == TokenKind::FloatLit {
488                    let slice = &source[span.range()];
489                    match slice.parse::<f64>() {
490                        Ok(v) if v.is_finite() => {}
491                        _ => {
492                            return Err(CompileError::new(
493                                "bynk.lex.float_literal_overflow",
494                                span,
495                                format!(
496                                    "float literal `{slice}` is out of range for a 64-bit float"
497                                ),
498                            )
499                            .with_note(
500                                "the literal does not fit a finite IEEE 754 double; \
501                                 the largest finite value is ~1.8e308",
502                            ));
503                        }
504                    }
505                }
506                tokens.push(Token { kind, span });
507                pos = span.end;
508            }
509            Err(()) => {
510                let slice = &source[span.range()];
511                let ch = slice.chars().next().unwrap_or('\0');
512                let err = if ch == '"' {
513                    CompileError::new(
514                        "bynk.lex.unterminated_string",
515                        span,
516                        "unterminated string literal",
517                    )
518                    .with_note(
519                        "string literals must close with `\"` on the same line; \
520                         supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
521                    )
522                } else {
523                    CompileError::new(
524                        "bynk.lex.unexpected_character",
525                        span,
526                        format!("unexpected character `{ch}`"),
527                    )
528                };
529                return Err(err);
530            }
531        }
532    }
533    Ok(tokens)
534}
535
/// Routing pre-scan (v0.43): report whether the string whose opening quote
/// sits at `start` contains a `\(` interpolation hole before its closing
/// quote, the end of the line, or the end of input.
///
/// `tokenize` uses the answer to choose between hand-scanning an `InterpStr`
/// and deferring to logos for a plain `StrLit`. The scan is deliberately
/// lenient: any backslash that is not followed by `(` is skipped together
/// with the byte after it, valid escape or not, so a malformed string that
/// does contain a hole still reaches the hole-aware scanner and gets its
/// more precise diagnostic.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut cursor = start + 1; // first byte after the opening quote
    while let Some(&byte) = bytes.get(cursor) {
        if byte == b'\n' || byte == b'"' {
            // The string (or its line) ended without a hole.
            return false;
        }
        if byte != b'\\' {
            cursor += 1;
            continue;
        }
        if bytes.get(cursor + 1) == Some(&b'(') {
            return true;
        }
        // Step over the backslash and the byte it escapes, so an escaped
        // quote or backslash is never mistaken for a terminator or a hole.
        cursor += 2;
    }
    false
}
557
558/// Scan a double-quoted string starting at `start` (the opening `"`), returning
559/// the byte offset just past the closing `"`. Recognises the four simple
560/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
561/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
562fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
563    debug_assert_eq!(bytes[start], b'"');
564    let mut i = start + 1;
565    loop {
566        if i >= bytes.len() || bytes[i] == b'\n' {
567            return Err(CompileError::new(
568                "bynk.lex.unterminated_string",
569                Span::new(start, i.min(bytes.len())),
570                "unterminated string literal",
571            )
572            .with_note(
573                "string literals must close with `\"` on the same line; \
574                 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
575            ));
576        }
577        match bytes[i] {
578            b'"' => return Ok(i + 1),
579            b'\\' => match bytes.get(i + 1) {
580                Some(b'n' | b't' | b'"' | b'\\') => i += 2,
581                Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
582                other => {
583                    let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
584                    return Err(CompileError::new(
585                        "bynk.lex.bad_escape",
586                        Span::new(i, (i + 2).min(bytes.len())),
587                        format!("invalid escape sequence `\\{shown}` in string literal"),
588                    )
589                    .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
590                }
591            },
592            // Any other byte advances one position. UTF-8 continuation bytes
593            // are all >= 0x80, so they never collide with the ASCII specials.
594            _ => i += 1,
595        }
596    }
597}
598
599/// Scan an interpolation hole body. `start` points just past the `\(`; returns
600/// the offset just past the matching `)`. Tracks paren depth and skips nested
601/// strings (whose own parens must not close the hole), recursing through
602/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
603fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
604    let mut i = start;
605    let mut depth = 1usize;
606    loop {
607        if i >= bytes.len() || bytes[i] == b'\n' {
608            return Err(CompileError::new(
609                "bynk.lex.unterminated_interpolation",
610                Span::new(start.saturating_sub(2), i.min(bytes.len())),
611                "unterminated interpolation hole",
612            )
613            .with_note(
614                "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
615            ));
616        }
617        match bytes[i] {
618            b'(' => {
619                depth += 1;
620                i += 1;
621            }
622            b')' => {
623                depth -= 1;
624                i += 1;
625                if depth == 0 {
626                    return Ok(i);
627                }
628            }
629            b'"' => i = scan_str(bytes, source, i)?,
630            _ => i += 1,
631        }
632    }
633}
634
635/// One segment of a split interpolated string (v0.43): literal text (escapes
636/// resolved) or the absolute source span of a hole's expression (the bytes
637/// between `\(` and its matching `)`). The parser turns the latter into a real
638/// `Expr`; the lexer owns only the scanning.
639pub(crate) enum InterpSegment {
640    Chunk(String),
641    Hole(Span),
642}
643
644/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
645/// and hole spans. Escapes in the chunks are resolved here (mirroring
646/// [`parse_string_literal`]); holes are returned as spans for the parser to
647/// re-lex and parse as expressions. (v0.43.)
648pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
649    let bytes = source.as_bytes();
650    let inner_end = span.end - 1; // the closing `"`
651    let mut segments = Vec::new();
652    let mut chunk = String::new();
653    let mut i = span.start + 1; // past the opening `"`
654    while i < inner_end {
655        match bytes[i] {
656            b'\\' => match bytes[i + 1] {
657                b'n' => {
658                    chunk.push('\n');
659                    i += 2;
660                }
661                b't' => {
662                    chunk.push('\t');
663                    i += 2;
664                }
665                b'"' => {
666                    chunk.push('"');
667                    i += 2;
668                }
669                b'\\' => {
670                    chunk.push('\\');
671                    i += 2;
672                }
673                b'(' => {
674                    if !chunk.is_empty() {
675                        segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
676                    }
677                    let hole_start = i + 2;
678                    let after = scan_hole(bytes, source, hole_start)?;
679                    // `after` is one past the matching `)`; the hole body is
680                    // everything up to that `)`.
681                    segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
682                    i = after;
683                }
684                // The lexer already validated every escape, so nothing else
685                // can appear here.
686                other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
687            },
688            _ => {
689                let ch = source[i..].chars().next().unwrap();
690                chunk.push(ch);
691                i += ch.len_utf8();
692            }
693        }
694    }
695    if !chunk.is_empty() {
696        segments.push(InterpSegment::Chunk(chunk));
697    }
698    Ok(segments)
699}
700
701/// If a `---` doc-block marker line starts at or shortly after `pos` (which
702/// must be at a line boundary), return the byte offset just past the marker
703/// line (after the terminating newline, or at EOF). The doc-block grammar
704/// requires the marker to be alone on its line; leading horizontal whitespace
705/// is allowed and ignored.
706fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
707    let bytes = source.as_bytes();
708    if !at_line_start(source, pos) {
709        return None;
710    }
711    // Skip leading horizontal whitespace.
712    let mut i = pos;
713    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
714        i += 1;
715    }
716    if i + 3 > bytes.len() {
717        return None;
718    }
719    if &bytes[i..i + 3] != b"---" {
720        return None;
721    }
722    i += 3;
723    // The marker may have additional trailing dashes (per spec "three or more
724    // consecutive hyphens"). Consume them.
725    while i < bytes.len() && bytes[i] == b'-' {
726        i += 1;
727    }
728    // After the dashes, allow only horizontal whitespace then newline/EOF.
729    while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
730        i += 1;
731    }
732    if i == bytes.len() {
733        return Some(i);
734    }
735    if bytes[i] == b'\n' {
736        return Some(i + 1);
737    }
738    None
739}
740
741/// Find the next closing `---` line at or after `pos`. Returns
742/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
743/// terminating newline, or at EOF).
744fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
745    let bytes = source.as_bytes();
746    while pos < bytes.len() {
747        // Advance pos to the start of a line.
748        let line_start = pos;
749        // Find the end of this line.
750        let mut line_end = line_start;
751        while line_end < bytes.len() && bytes[line_end] != b'\n' {
752            line_end += 1;
753        }
754        // Check this line.
755        if let Some(end) = doc_block_open_at(source, line_start) {
756            return Some((line_start, end));
757        }
758        // Move to the next line.
759        pos = if line_end < bytes.len() {
760            line_end + 1
761        } else {
762            line_end
763        };
764    }
765    None
766}
767
/// Whether byte offset `pos` is the first column of a line: either the very
/// start of `source`, or the position immediately after a `\n`.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
776
777/// Extract the body content of a doc-block token from its source span.
778/// Strips the leading and trailing `---` marker lines and returns the body
779/// verbatim. If every non-empty content line begins with the same horizontal
780/// whitespace prefix (e.g., because the doc block sits inside a brace-form
781/// commons body), that common prefix is removed so the body reads naturally
782/// when emitted as JSDoc.
783pub fn doc_block_content(source: &str, span: Span) -> String {
784    let slice = &source[span.range()];
785    // Drop the first line (opening marker).
786    let after_open = match slice.find('\n') {
787        Some(i) => &slice[i + 1..],
788        None => return String::new(),
789    };
790    let bytes = after_open.as_bytes();
791    // Trim the trailing closing-marker line.
792    let mut i = bytes.len();
793    if i > 0 && bytes[i - 1] == b'\n' {
794        i -= 1;
795    }
796    while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
797        i -= 1;
798    }
799    while i > 0 && bytes[i - 1] == b'-' {
800        i -= 1;
801    }
802    if i > 0 && bytes[i - 1] == b'\n' {
803        i -= 1;
804    }
805    let body = &after_open[..i];
806
807    // Compute the common leading-whitespace prefix across all non-empty lines
808    // and strip it. This lets writers indent the doc block alongside the
809    // declaration it documents without bleeding the indent into the JSDoc.
810    let common: Option<usize> = body
811        .lines()
812        .filter(|l| !l.trim().is_empty())
813        .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
814        .min();
815    let strip = common.unwrap_or(0);
816    if strip == 0 {
817        return body.to_string();
818    }
819    let mut out = String::with_capacity(body.len());
820    let mut first = true;
821    for line in body.lines() {
822        if !first {
823            out.push('\n');
824        }
825        first = false;
826        if line.trim().is_empty() {
827            // Preserve blank lines.
828            continue;
829        }
830        let leading: usize = line
831            .bytes()
832            .take_while(|&b| b == b' ' || b == b'\t')
833            .count();
834        let drop = strip.min(leading);
835        out.push_str(&line[drop..]);
836    }
837    out
838}
839
840/// Extract the body of a `Comment` trivia token: everything after the
841/// leading `--` marker, preserving its inline whitespace verbatim. Used by
842/// the parser when attaching comments to declarations.
843pub fn comment_body(source: &str, span: Span) -> &str {
844    let slice = &source[span.range()];
845    // Strip leading "--" if present (defensive — the lexer always emits
846    // Comment tokens whose span begins with `--`).
847    slice.strip_prefix("--").unwrap_or(slice)
848}
849
/// Returns true if a newline is reached in `source[from..to]` before any
/// character other than a space, tab or carriage return. Used by the parser
/// to detect orphan doc blocks.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). A newline reached through whitespace alone therefore implies
/// at least one entirely-blank line separating the doc from the declaration.
///
/// `from` is inclusive and `to` exclusive. `to` is clamped to the length of
/// `source`, and an empty or inverted range yields `false`, so out-of-range
/// offsets never panic.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let end = to.min(bytes.len());
    if from >= end {
        return false;
    }
    // The first byte that is not horizontal whitespace decides the answer.
    bytes[from..end]
        .iter()
        .find(|&&b| !matches!(b, b' ' | b'\t' | b'\r'))
        == Some(&b'\n')
}
876
#[cfg(test)]
mod tests {
    //! Unit tests for the lexer: token kinds for keywords, literals and
    //! operators, comment trivia, lex-error categories, string
    //! interpolation, and the blank-line helper used for orphan doc blocks.

    use super::*;

    /// Lex `source` and return only the token kinds. Panics if lexing fails,
    /// so it is only suitable for inputs that are expected to lex cleanly.
    fn kinds(source: &str) -> Vec<TokenKind> {
        tokenize(source)
            .unwrap()
            .into_iter()
            .map(|t| t.kind)
            .collect()
    }

    #[test]
    fn keywords_and_idents() {
        use TokenKind::*;
        assert_eq!(
            kinds("commons type fn where and true false Int String Bool foo bar"),
            vec![
                Commons, Type, Fn, Where, And, True, False, Int, String, Bool, Ident, Ident
            ],
        );
    }

    #[test]
    fn integer_and_string_literals() {
        use TokenKind::*;
        assert_eq!(
            kinds(r#"0 42 "hello" "with\nescape""#),
            vec![IntLit, IntLit, StrLit, StrLit]
        );
    }

    #[test]
    fn operators() {
        use TokenKind::*;
        assert_eq!(
            kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : ."),
            vec![
                Arrow, EqEq, BangEq, LtEq, GtEq, AmpAmp, PipePipe, Plus, Minus, Star, Slash, Bang,
                Eq, Lt, Gt, LParen, RParen, LBrace, RBrace, LBracket, RBracket, Comma, Colon, Dot,
            ],
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: line comments are preserved as Comment tokens so the
        // formatter can attach and re-emit them.
        use TokenKind::*;
        let src = "-- a comment\ntype X = Int -- trailing\n";
        assert_eq!(kinds(src), vec![Comment, Type, Ident, Eq, Int, Comment]);
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        // Bind the source once so the span is resolved against the exact
        // string that was lexed.
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);
        assert_eq!(toks[0].kind, TokenKind::Comment);
        assert_eq!(comment_body(src, toks[0].span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two adjacent comment lines should produce two distinct tokens
        // — the newline between them is not part of either comment's span.
        let src = "-- one\n-- two\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 2);
        assert!(toks.iter().all(|t| t.kind == TokenKind::Comment));
        // Check the spans themselves: each body must stop before its line's
        // terminating newline.
        assert_eq!(comment_body(src, toks[0].span), " one");
        assert_eq!(comment_body(src, toks[1].span), " two");
    }

    #[test]
    fn unterminated_string_is_error() {
        let err = tokenize("\"oops\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        let err = tokenize("99999999999999999999").unwrap_err();
        assert_eq!(err.category, "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        let err = tokenize("type X = Int $").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("let if else Ok Err Result ValidationError"),
            vec![Let, If, Else, Ok, Err, Result, ValidationError],
        );
    }

    #[test]
    fn question_token() {
        use TokenKind::*;
        assert_eq!(kinds("x?"), vec![Ident, Question]);
    }

    #[test]
    fn v0_2_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("enum match Option record self Some None is"),
            vec![Enum, Match, Option, Record, Self_, Some, None, Is],
        );
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        use TokenKind::*;
        assert_eq!(kinds("| || |"), vec![Pipe, PipePipe, Pipe]);
    }

    #[test]
    fn v0_7_keywords() {
        use TokenKind::*;
        assert_eq!(
            kinds("assert expect mocks test"),
            vec![Assert, Expect, Mocks, Test],
        );
    }

    #[test]
    fn fat_arrow_and_underscore() {
        use TokenKind::*;
        assert_eq!(kinds("_ =>"), vec![Underscore, FatArrow]);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        use TokenKind::*;
        assert_eq!(kinds(r#""Hello, \(name)!""#), vec![InterpStr]);
        // A plain string (no hole) stays a `StrLit`, via the logos path.
        assert_eq!(kinds(r#""Hello, world""#), vec![StrLit]);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        use TokenKind::*;
        // The `)` inside `f(x)` must not close the hole early.
        assert_eq!(kinds(r#""= \(f(x))""#), vec![InterpStr]);
        // A `)` inside a nested string inside the hole is also ignored.
        assert_eq!(kinds(r#""= \(label(")"))""#), vec![InterpStr]);
        // A nested interpolated string inside a hole.
        assert_eq!(kinds(r#""out \("in \(x)")""#), vec![InterpStr]);
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        use TokenKind::*;
        // `\\(` is a literal backslash followed by `(` — no hole, so the
        // string lexes as a plain `StrLit` on the logos path.
        assert_eq!(kinds(r#""a \\(b) c""#), vec![StrLit]);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole runs to end of line without its closing `)`.
        let err = tokenize("\"value \\(x + 1\n\"").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_interpolation");
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // A hole closes but the string never does (newline before the `"`).
        let err = tokenize("\"value \\(x) more\n").unwrap_err();
        assert_eq!(err.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        let err = tokenize(r#""a \q \(x)""#).unwrap_err();
        assert_eq!(err.category, "bynk.lex.bad_escape");
    }

    // -- blank-line detection between a doc block and its declaration --

    #[test]
    fn blank_line_between_detects_newline_in_gap() {
        // Offset 4 is just past the closing marker's newline; the gap up to
        // the declaration at offset 5 is a single empty line.
        assert!(has_blank_line_between("---\n\nfn f", 4, 5));
        // A line holding only spaces, tabs or `\r` still counts as blank.
        assert!(has_blank_line_between(" \t\r\nfn", 0, 4));
    }

    #[test]
    fn blank_line_between_ignores_indentation_only_gap() {
        // The declaration sits on the very next line, merely indented.
        assert!(!has_blank_line_between("---\n  fn f", 4, 6));
    }

    #[test]
    fn blank_line_between_stops_at_first_non_whitespace() {
        // A newline that follows real content does not make the gap blank.
        assert!(!has_blank_line_between(" x\n", 0, 3));
    }

    #[test]
    fn blank_line_between_empty_or_inverted_range_is_false() {
        assert!(!has_blank_line_between("\n\n", 1, 1));
        assert!(!has_blank_line_between("\n\n", 2, 0));
    }
}
bynk_syntax/lexer.rs

bynk_syntax/
lexer.rs