bynk_syntax/lexer.rs
1//! Lexer for Bynk v0.
2//!
3//! Token kinds correspond to the terminals defined in the grammar (spec §3
4//! and §4). Whitespace is skipped; line comments are emitted as `Comment`
5//! tokens so the formatter can preserve them through round-trips (v1.1 LSP
6//! spec §3.5). Doc blocks (`---`) are emitted as `DocBlock` tokens, lexed
7//! outside of logos (see [`tokenize`]).
8
9use logos::Logos;
10
11use crate::error::CompileError;
12use crate::span::Span;
13
/// Token kinds. Discriminants without payload data; the lexeme is recovered
/// from the source string via the token's [`Span`].
///
/// Most variants are recognised by the logos-generated lexer from the
/// `#[token]` / `#[regex]` attributes below. Three variants carry no logos
/// attribute and are only ever produced by [`tokenize`]'s hand-written
/// scanning: [`TokenKind::DocBlock`], [`TokenKind::Comment`] and
/// [`TokenKind::InterpStr`].
///
/// Note: `--` line comments and `---` doc block markers are handled outside
/// logos (see [`tokenize`]), because doc blocks are delimited by `---` lines
/// containing only the marker and may span multiple source lines.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq)]
#[logos(skip r"[ \t\r\n]+")]
pub enum TokenKind {
    // Keywords
    #[token("commons")]
    Commons,
    #[token("type")]
    Type,
    #[token("fn")]
    Fn,
    #[token("where")]
    Where,
    #[token("and")]
    And,
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("Int")]
    Int,
    #[token("String")]
    String,
    #[token("Bool")]
    Bool,
    // v0.21 keyword
    #[token("Float")]
    Float,
    // v0.86 keyword (ADR 0112): the `Duration` base type.
    #[token("Duration")]
    Duration,
    // v0.90 keyword (ADR 0114): the `Instant` base type.
    #[token("Instant")]
    Instant,
    // v0.110 keyword (ADR 0142): the `Bytes` base type.
    #[token("Bytes")]
    Bytes,
    // v0.1 keywords
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("else")]
    Else,
    #[token("Ok")]
    Ok,
    #[token("Err")]
    Err,
    #[token("Result")]
    Result,
    #[token("ValidationError")]
    ValidationError,
    // v0.22b keyword
    #[token("JsonError")]
    JsonError,
    // v0.2 keywords
    #[token("enum")]
    Enum,
    #[token("match")]
    Match,
    #[token("Option")]
    Option,
    #[token("record")]
    Record,
    #[token("self")]
    Self_,
    #[token("Some")]
    Some,
    #[token("None")]
    None,
    #[token("is")]
    Is,
    // v0.3 keywords
    #[token("opaque")]
    Opaque,
    #[token("uses")]
    Uses,
    // v0.4 keywords
    #[token("context")]
    Context,
    #[token("consumes")]
    Consumes,
    #[token("exports")]
    Exports,
    #[token("transparent")]
    Transparent,
    // v0.6 keywords
    #[token("as")]
    As,
    // v0.7 keywords (v0.112: `assert`→`expect`, `test`→`suite`/`case`;
    // v0.118: `mocks` retired — test doubles are `provides` at a seam)
    #[token("expect")]
    Expect,
    #[token("suite")]
    Suite,
    #[token("case")]
    Case,
    // v0.114 keyword — generative tests (testing track slice 2). `for` and `all`
    // are deliberately *not* keywords: `all` is a list combinator (`all(xs, p)`)
    // and must stay a usable identifier. The `for all` binder is parsed
    // contextually (two identifiers) inside a `property` body instead.
    #[token("property")]
    Property,
    // v0.17 keywords
    #[token("adapter")]
    Adapter,
    #[token("binding")]
    Binding,
    // v0.5 keywords
    #[token("agent")]
    Agent,
    #[token("capability")]
    Capability,
    #[token("Effect")]
    Effect,
    #[token("given")]
    Given,
    #[token("on")]
    On,
    // v0.9 keyword
    #[token("http")]
    Http,
    // v0.10a keyword
    #[token("cron")]
    Cron,
    // v0.10b keyword
    #[token("queue")]
    Queue,
    // v0.44 keywords: `from` heads a service's protocol clause; `protocol` is
    // reserved (protocols are a closed, compiler-known set — no declaration kind).
    #[token("from")]
    From,
    #[token("protocol")]
    Protocol,
    #[token("provides")]
    Provides,
    #[token("service")]
    Service,
    // v0.45 keywords: `actor` heads a boundary-contract declaration; `by`
    // heads a handler's actor clause.
    #[token("actor")]
    Actor,
    #[token("by")]
    By,
    // v0.80 keywords: `invariant` heads an agent invariant declaration; `implies`
    // is the directional logical-implication operator (`P implies Q` ≡ `!P || Q`).
    #[token("invariant")]
    Invariant,
    #[token("implies")]
    Implies,
    // v0.115 keywords — function contracts (testing track slice 3). `requires`
    // and `ensures` head a contract clause on a `fn` signature (between the
    // return type and the body). `result` is deliberately *not* a keyword: it is
    // the ordinary value name outside a contract, so it stays a usable
    // identifier; inside an `ensures` predicate it is bound contextually as the
    // function's return value (parsed by scope, like `for`/`all` in slice 2).
    // Distinct from ADR 0127's capability `@requires` annotation.
    #[token("requires")]
    Requires,
    #[token("ensures")]
    Ensures,
    // v0.116 keyword — step invariants (testing track slice 4). `transition` heads
    // an agent step-invariant declaration (beside `invariant`), a predicate over
    // the pre- and post-commit state pair. `old` and `new` are deliberately *not*
    // keywords: they stay ordinary value names outside a `transition`, and inside a
    // `transition` predicate they are bound contextually to the old/new state
    // records (parsed by scope, like `result` in an `ensures`).
    #[token("transition")]
    Transition,
    /// `...` — used in record-spread expressions (v0.5).
    #[token("...")]
    DotDotDot,
    /// `<-` — Effect bind operator (v0.5).
    #[token("<-")]
    LArrow,
    /// `~>` — asynchronous fire-and-forget send marker (v0.79). A leading
    /// statement marker, never on the RHS of a `let`; distinct from `<-` so the
    /// call site shows whether the caller waits.
    #[token("~>")]
    TildeArrow,
    /// `:=` — Cell write (v0.81, storage track). A handler statement
    /// `cell := expr`; distinct from `=` (binding) and `:` (annotation). Longer
    /// than `:`/`=` so logos matches it as one token.
    #[token(":=")]
    ColonEq,

    /// A documentation block: `---` line ... `---` line. The token's span
    /// covers the full block including both `---` markers. The body content
    /// is recovered from the source via the span (see [`doc_block_content`]).
    /// Inserted by [`tokenize`]; not lexed by logos directly.
    DocBlock,

    /// A line comment: `-- ...` running to end of line. The span starts at
    /// the `--` marker and runs through the last character before the
    /// terminating newline (exclusive). The trivia body (the text after the
    /// `--` marker) is recovered from the source via the span. Inserted by
    /// [`tokenize`]; not lexed by logos directly so it cannot be mistaken
    /// for an `--` operator sequence.
    Comment,

    // Identifier: an ASCII letter followed by any number of ASCII letters,
    // digits, or underscores. Keyword `#[token]`s above take precedence when
    // the whole lexeme is a keyword.
    #[regex(r"[A-Za-z][A-Za-z0-9_]*")]
    Ident,

    // Literals
    // An integer literal: one or more ASCII digits. Range checking against
    // `i64` happens in `tokenize`, not here.
    #[regex(r"[0-9]+")]
    IntLit,
    // A float literal: fraction with a digit on both sides of the `.`, an
    // exponent, or both (v0.21 §3). `1.` and `.5` are NOT float literals —
    // the digit-both-sides rule keeps `2.5.round()` / `1.toFloat()` lexing
    // as method calls on numeric literals.
    #[regex(r"[0-9]+\.[0-9]+([eE][+-]?[0-9]+)?|[0-9]+[eE][+-]?[0-9]+")]
    FloatLit,
    // A double-quoted string with simple escapes. The body excludes the closing
    // quote; we accept any non-quote/non-backslash/non-newline char, or a
    // backslash followed by one of the four allowed escapes.
    #[regex(r#""([^"\\\n]|\\[nt"\\])*""#)]
    StrLit,
    // An interpolated string `"… \(expr) …"` (v0.43). Hand-scanned in
    // `tokenize` (logos cannot balance the holes' parens), never produced by
    // the logos lexer — like [`TokenKind::DocBlock`]/[`TokenKind::Comment`].
    // The span covers the whole `"…"`; the parser splits chunks from holes.
    InterpStr,

    // Multi-char operators
    #[token("->")]
    Arrow,
    #[token("==")]
    EqEq,
    #[token("!=")]
    BangEq,
    #[token("<=")]
    LtEq,
    #[token(">=")]
    GtEq,
    #[token("&&")]
    AmpAmp,
    #[token("||")]
    PipePipe,

    // Single-char operators
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("!")]
    Bang,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    // v0.1 postfix operator
    #[token("?")]
    Question,
    // v0.2 match-arm arrow
    #[token("=>")]
    FatArrow,
    // v0.2 wildcard pattern. The `Ident` regex above requires a leading ASCII
    // letter, so `_` never starts an identifier: `_foo` lexes as `Underscore`
    // followed by `Ident`. (`foo_bar` is still a single `Ident`, because `_`
    // is accepted after the first character.)
    #[token("_")]
    Underscore,
    // v0.2 sum-type variant separator (also used as future bitwise OR);
    // single `|` distinct from `||`.
    #[token("|")]
    Pipe,
    /// `@` — storage-annotation marker (v0.85, storage track; ADR 0111). Leads a
    /// `@name(args)` annotation on a `store` field (`@ttl(…)`/`@indexed(…)`); it
    /// appears only in store-field-declaration position, never as an expression
    /// operator.
    #[token("@")]
    At,

    // Punctuation
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Dot,
}
317
318impl TokenKind {
319 /// Human-readable display name for diagnostics.
320 pub fn describe(self) -> &'static str {
321 use TokenKind::*;
322 match self {
323 Commons => "`commons`",
324 Type => "`type`",
325 Fn => "`fn`",
326 Where => "`where`",
327 And => "`and`",
328 True => "`true`",
329 False => "`false`",
330 Int => "`Int`",
331 String => "`String`",
332 Bool => "`Bool`",
333 Float => "`Float`",
334 Duration => "`Duration`",
335 Instant => "`Instant`",
336 Bytes => "`Bytes`",
337 Let => "`let`",
338 If => "`if`",
339 Else => "`else`",
340 Ok => "`Ok`",
341 Err => "`Err`",
342 Result => "`Result`",
343 ValidationError => "`ValidationError`",
344 JsonError => "`JsonError`",
345 Enum => "`enum`",
346 Match => "`match`",
347 Option => "`Option`",
348 Record => "`record`",
349 Self_ => "`self`",
350 Some => "`Some`",
351 None => "`None`",
352 Is => "`is`",
353 Opaque => "`opaque`",
354 Uses => "`uses`",
355 Context => "`context`",
356 Consumes => "`consumes`",
357 Exports => "`exports`",
358 Transparent => "`transparent`",
359 As => "`as`",
360 Expect => "`expect`",
361 Suite => "`suite`",
362 Case => "`case`",
363 Property => "`property`",
364 Adapter => "`adapter`",
365 Binding => "`binding`",
366 Agent => "`agent`",
367 Capability => "`capability`",
368 Effect => "`Effect`",
369 Given => "`given`",
370 On => "`on`",
371 Http => "`http`",
372 Cron => "`cron`",
373 Queue => "`queue`",
374 From => "`from`",
375 Protocol => "`protocol`",
376 Provides => "`provides`",
377 Service => "`service`",
378 Actor => "`actor`",
379 By => "`by`",
380 Invariant => "`invariant`",
381 Implies => "`implies`",
382 Requires => "`requires`",
383 Ensures => "`ensures`",
384 Transition => "`transition`",
385 ColonEq => "`:=`",
386 DotDotDot => "`...`",
387 LArrow => "`<-`",
388 TildeArrow => "`~>`",
389 DocBlock => "documentation block",
390 Comment => "line comment",
391 Ident => "identifier",
392 IntLit => "integer literal",
393 FloatLit => "float literal",
394 StrLit => "string literal",
395 InterpStr => "interpolated string",
396 Arrow => "`->`",
397 EqEq => "`==`",
398 BangEq => "`!=`",
399 LtEq => "`<=`",
400 GtEq => "`>=`",
401 AmpAmp => "`&&`",
402 PipePipe => "`||`",
403 Plus => "`+`",
404 Minus => "`-`",
405 Star => "`*`",
406 Slash => "`/`",
407 Bang => "`!`",
408 Eq => "`=`",
409 Lt => "`<`",
410 Gt => "`>`",
411 Question => "`?`",
412 FatArrow => "`=>`",
413 Underscore => "`_`",
414 Pipe => "`|`",
415 At => "`@`",
416 LParen => "`(`",
417 RParen => "`)`",
418 LBrace => "`{`",
419 RBrace => "`}`",
420 LBracket => "`[`",
421 RBracket => "`]`",
422 Comma => "`,`",
423 Colon => "`:`",
424 Dot => "`.`",
425 }
426 }
427}
428
/// A token plus its source span.
///
/// Tokens carry no text of their own: the lexeme is recovered by slicing the
/// source string with [`Token::span`].
#[derive(Debug, Clone, Copy)]
pub struct Token {
    /// Which terminal (or trivia class) this token is.
    pub kind: TokenKind,
    /// Byte range of the token within the source string it was lexed from.
    pub span: Span,
}
435
436/// Tokenise a source string. Returns the full token vector or the first
437/// lexical error.
438///
439/// Doc blocks (`---` ... `---`) and line comments (`-- ...`) are recognised
440/// outside the logos-generated lexer: we scan the source one segment at a
441/// time, dispatching to logos for ordinary tokens between non-token spans.
442pub fn tokenize(source: &str) -> Result<Vec<Token>, CompileError> {
443 let mut tokens = Vec::new();
444 let bytes = source.as_bytes();
445 let mut pos = 0;
446 while pos < bytes.len() {
447 // Detect a `---` doc-block marker at the start of a line (the line may
448 // begin with leading whitespace; the marker itself must be alone on
449 // its line).
450 if let Some(open_end) = doc_block_open_at(source, pos) {
451 // Find the matching closing `---` line.
452 match doc_block_close(source, open_end) {
453 Some((close_start, close_end)) => {
454 let span = Span::new(pos, close_end);
455 tokens.push(Token {
456 kind: TokenKind::DocBlock,
457 span,
458 });
459 let _ = close_start;
460 pos = close_end;
461 continue;
462 }
463 None => {
464 return Err(CompileError::new(
465 "bynk.lex.unclosed_doc_block",
466 Span::new(pos, open_end),
467 "documentation block opened but never closed",
468 )
469 .with_note(
470 "a doc block must be terminated by another `---` on a line by itself",
471 ));
472 }
473 }
474 }
475 // A `--` line comment: emit a `Comment` token covering everything
476 // up to (but not including) the terminating newline. Doc-block
477 // detection above already ruled out a `---` marker at line start
478 // — and once we've consumed past the leading `--`, any further
479 // dashes are part of the comment body. Preserving comments as
480 // trivia tokens lets the parser attach them to declarations so
481 // the formatter can emit them in place (v1.1 LSP spec §3.5).
482 if pos + 1 < bytes.len() && bytes[pos] == b'-' && bytes[pos + 1] == b'-' {
483 let start = pos;
484 while pos < bytes.len() && bytes[pos] != b'\n' {
485 pos += 1;
486 }
487 tokens.push(Token {
488 kind: TokenKind::Comment,
489 span: Span::new(start, pos),
490 });
491 continue;
492 }
493 // Skip ordinary whitespace inline (logos handles it too, but we may
494 // be in the middle of the source between specials).
495 if matches!(bytes[pos], b' ' | b'\t' | b'\r' | b'\n') {
496 pos += 1;
497 continue;
498 }
499 // An interpolated string `"… \(expr) …"` (v0.43): only strings that
500 // actually contain a `\(` hole are hand-scanned here; plain strings
501 // fall through to the logos `StrLit` path unchanged. `\(` is an
502 // invalid escape in the logos grammar, so this never re-routes a
503 // currently-valid literal.
504 if bytes[pos] == b'"' && has_interp_hole(bytes, pos) {
505 let end = scan_str(bytes, source, pos)?;
506 tokens.push(Token {
507 kind: TokenKind::InterpStr,
508 span: Span::new(pos, end),
509 });
510 pos = end;
511 continue;
512 }
513 // Otherwise dispatch a single logos token starting at `pos`.
514 let mut lex = TokenKind::lexer(&source[pos..]);
515 let Some(result) = lex.next() else {
516 // No token at this position; treat as unexpected character so
517 // the user sees something useful.
518 let ch = source[pos..].chars().next().unwrap_or('\0');
519 let span = Span::new(pos, pos + ch.len_utf8());
520 return Err(CompileError::new(
521 "bynk.lex.unexpected_character",
522 span,
523 format!("unexpected character `{ch}`"),
524 ));
525 };
526 let local = lex.span();
527 let span: Span = Span::new(pos + local.start, pos + local.end);
528 match result {
529 Ok(kind) => {
530 if kind == TokenKind::IntLit {
531 let slice = &source[span.range()];
532 if slice.parse::<i64>().is_err() {
533 return Err(CompileError::new(
534 "bynk.lex.integer_overflow",
535 span,
536 format!(
537 "integer literal `{slice}` is out of range for a 64-bit signed integer"
538 ),
539 )
540 .with_note("the range is -2^63 to 2^63 - 1"));
541 }
542 }
543 if kind == TokenKind::FloatLit {
544 let slice = &source[span.range()];
545 match slice.parse::<f64>() {
546 Ok(v) if v.is_finite() => {}
547 _ => {
548 return Err(CompileError::new(
549 "bynk.lex.float_literal_overflow",
550 span,
551 format!(
552 "float literal `{slice}` is out of range for a 64-bit float"
553 ),
554 )
555 .with_note(
556 "the literal does not fit a finite IEEE 754 double; \
557 the largest finite value is ~1.8e308",
558 ));
559 }
560 }
561 }
562 tokens.push(Token { kind, span });
563 pos = span.end;
564 }
565 Err(()) => {
566 let slice = &source[span.range()];
567 let ch = slice.chars().next().unwrap_or('\0');
568 let err = if ch == '"' {
569 CompileError::new(
570 "bynk.lex.unterminated_string",
571 span,
572 "unterminated string literal",
573 )
574 .with_note(
575 "string literals must close with `\"` on the same line; \
576 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`",
577 )
578 } else {
579 CompileError::new(
580 "bynk.lex.unexpected_character",
581 span,
582 format!("unexpected character `{ch}`"),
583 )
584 };
585 return Err(err);
586 }
587 }
588 }
589 Ok(tokens)
590}
591
592/// Like [`tokenize`], but with every interpolated-string token replaced by the
593/// tokens of its holes — each hole's bytes re-lexed and its token spans rebased
594/// to absolute source positions (the same rebase [`crate::parser`] applies when
595/// parsing a hole), recursing through nested interpolation. Chunk (literal) text
596/// between holes yields no tokens.
597///
598/// An interpolated string lexes to a single opaque `InterpStr` token, so the
599/// LSP's token-based cursor resolution (hover, go-to-definition, references,
600/// semantic tokens) is otherwise blind to identifiers inside `"… \(name) …"`.
601/// Expanding the holes makes those identifiers visible as ordinary `Ident`
602/// tokens with their real spans. (Issue #473.)
603///
604/// On a malformed interpolation (an `InterpStr` whose holes don't split, or a
605/// hole whose bytes don't re-lex) the offending token is kept opaque rather than
606/// dropped, so resolution degrades to the pre-fix behaviour instead of losing
607/// tokens.
608pub fn tokenize_expanding_holes(source: &str) -> Result<Vec<Token>, CompileError> {
609 let mut out = Vec::new();
610 for tok in tokenize(source)? {
611 expand_hole_token(source, tok, &mut out);
612 }
613 Ok(out)
614}
615
616/// Push `tok` onto `out`, expanding it into its holes' tokens if it is an
617/// `InterpStr` (see [`tokenize_expanding_holes`]); otherwise push it as-is.
618fn expand_hole_token(source: &str, tok: Token, out: &mut Vec<Token>) {
619 if tok.kind != TokenKind::InterpStr {
620 out.push(tok);
621 return;
622 }
623 let Ok(segments) = split_interp(source, tok.span) else {
624 out.push(tok); // malformed interpolation — keep the opaque token
625 return;
626 };
627 for segment in segments {
628 let InterpSegment::Hole(hole) = segment else {
629 continue; // chunk text carries no tokens
630 };
631 let Ok(hole_tokens) = tokenize(&source[hole.range()]) else {
632 continue;
633 };
634 for mut t in hole_tokens {
635 // Rebase the hole's local spans to absolute source positions.
636 t.span = Span::new(t.span.start + hole.start, t.span.end + hole.start);
637 expand_hole_token(source, t, out); // recurse for nested interpolation
638 }
639 }
640}
641
/// Cheap routing pre-scan (v0.43): does the string opening at `start` contain a
/// `\(` interpolation hole before it closes (or the line ends)? Decides whether
/// `tokenize` hand-scans the string as an `InterpStr` or defers to logos for a
/// plain `StrLit`. Deliberately tolerant — a malformed string with a hole is
/// routed here so the hole-aware scanner produces the precise error.
///
/// `start` is the offset of the opening `"`; scanning begins at the byte after
/// it. A backslash escapes the following byte, so `\"` does not close the
/// string and `\\(` is an escaped backslash followed by a plain `(`, not a
/// hole. A backslash directly before a newline (or at end of input) ends the
/// scan: an escape never carries the search onto the next line.
fn has_interp_hole(bytes: &[u8], start: usize) -> bool {
    let mut i = start + 1;
    while let Some(&byte) = bytes.get(i) {
        match byte {
            // End of line or closing quote reached without seeing a hole.
            b'\n' | b'"' => return false,
            b'\\' => match bytes.get(i + 1) {
                Some(b'(') => return true,
                // The line (or the input) ends right after the backslash:
                // stop here rather than skipping over the newline.
                Some(b'\n') | None => return false,
                // Any other escaped byte is skipped together with its
                // backslash so that `\"` and `\\` are not misread.
                Some(_) => i += 2,
            },
            _ => i += 1,
        }
    }
    false
}
663
664/// Scan a double-quoted string starting at `start` (the opening `"`), returning
665/// the byte offset just past the closing `"`. Recognises the four simple
666/// escapes plus `\(…)` interpolation holes, whose parens are balanced (and
667/// whose nested strings are skipped) by [`scan_hole`]. (v0.43.)
668fn scan_str(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
669 debug_assert_eq!(bytes[start], b'"');
670 let mut i = start + 1;
671 loop {
672 if i >= bytes.len() || bytes[i] == b'\n' {
673 return Err(CompileError::new(
674 "bynk.lex.unterminated_string",
675 Span::new(start, i.min(bytes.len())),
676 "unterminated string literal",
677 )
678 .with_note(
679 "string literals must close with `\"` on the same line; \
680 supported escapes are `\\n`, `\\t`, `\\\"`, `\\\\`, and `\\(…)` interpolation",
681 ));
682 }
683 match bytes[i] {
684 b'"' => return Ok(i + 1),
685 b'\\' => match bytes.get(i + 1) {
686 Some(b'n' | b't' | b'"' | b'\\') => i += 2,
687 Some(b'(') => i = scan_hole(bytes, source, i + 2)?,
688 other => {
689 let shown = other.map(|b| (*b as char).to_string()).unwrap_or_default();
690 return Err(CompileError::new(
691 "bynk.lex.bad_escape",
692 Span::new(i, (i + 2).min(bytes.len())),
693 format!("invalid escape sequence `\\{shown}` in string literal"),
694 )
695 .with_note("supported escapes: \\n \\t \\\" \\\\ \\(…)"));
696 }
697 },
698 // Any other byte advances one position. UTF-8 continuation bytes
699 // are all >= 0x80, so they never collide with the ASCII specials.
700 _ => i += 1,
701 }
702 }
703}
704
705/// Scan an interpolation hole body. `start` points just past the `\(`; returns
706/// the offset just past the matching `)`. Tracks paren depth and skips nested
707/// strings (whose own parens must not close the hole), recursing through
708/// [`scan_str`] so nested interpolation nests correctly. (v0.43.)
709fn scan_hole(bytes: &[u8], source: &str, start: usize) -> Result<usize, CompileError> {
710 let mut i = start;
711 let mut depth = 1usize;
712 loop {
713 if i >= bytes.len() || bytes[i] == b'\n' {
714 return Err(CompileError::new(
715 "bynk.lex.unterminated_interpolation",
716 Span::new(start.saturating_sub(2), i.min(bytes.len())),
717 "unterminated interpolation hole",
718 )
719 .with_note(
720 "an interpolation hole `\\(…)` must close with a matching `)` on the same line",
721 ));
722 }
723 match bytes[i] {
724 b'(' => {
725 depth += 1;
726 i += 1;
727 }
728 b')' => {
729 depth -= 1;
730 i += 1;
731 if depth == 0 {
732 return Ok(i);
733 }
734 }
735 b'"' => i = scan_str(bytes, source, i)?,
736 _ => i += 1,
737 }
738 }
739}
740
/// One segment of a split interpolated string (v0.43): literal text (escapes
/// resolved) or the absolute source span of a hole's expression (the bytes
/// between `\(` and its matching `)`). The parser turns the latter into a real
/// `Expr`; the lexer owns only the scanning.
///
/// Produced, in source order, by [`split_interp`].
pub(crate) enum InterpSegment {
    /// Literal text between holes, with `\n`, `\t`, `\"` and `\\` already
    /// replaced by the characters they denote.
    Chunk(String),
    /// Absolute byte span of a hole's body, excluding the `\(` and the
    /// closing `)`.
    Hole(Span),
}
749
750/// Split an `InterpStr` token (its `span` covers the whole `"…"`) into chunks
751/// and hole spans. Escapes in the chunks are resolved here (mirroring
752/// [`parse_string_literal`]); holes are returned as spans for the parser to
753/// re-lex and parse as expressions. (v0.43.)
754pub(crate) fn split_interp(source: &str, span: Span) -> Result<Vec<InterpSegment>, CompileError> {
755 let bytes = source.as_bytes();
756 let inner_end = span.end - 1; // the closing `"`
757 let mut segments = Vec::new();
758 let mut chunk = String::new();
759 let mut i = span.start + 1; // past the opening `"`
760 while i < inner_end {
761 match bytes[i] {
762 b'\\' => match bytes[i + 1] {
763 b'n' => {
764 chunk.push('\n');
765 i += 2;
766 }
767 b't' => {
768 chunk.push('\t');
769 i += 2;
770 }
771 b'"' => {
772 chunk.push('"');
773 i += 2;
774 }
775 b'\\' => {
776 chunk.push('\\');
777 i += 2;
778 }
779 b'(' => {
780 if !chunk.is_empty() {
781 segments.push(InterpSegment::Chunk(std::mem::take(&mut chunk)));
782 }
783 let hole_start = i + 2;
784 let after = scan_hole(bytes, source, hole_start)?;
785 // `after` is one past the matching `)`; the hole body is
786 // everything up to that `)`.
787 segments.push(InterpSegment::Hole(Span::new(hole_start, after - 1)));
788 i = after;
789 }
790 // The lexer already validated every escape, so nothing else
791 // can appear here.
792 other => unreachable!("unvalidated escape `\\{}` in InterpStr", other as char),
793 },
794 _ => {
795 let ch = source[i..].chars().next().unwrap();
796 chunk.push(ch);
797 i += ch.len_utf8();
798 }
799 }
800 }
801 if !chunk.is_empty() {
802 segments.push(InterpSegment::Chunk(chunk));
803 }
804 Ok(segments)
805}
806
807/// If a `---` doc-block marker line starts at or shortly after `pos` (which
808/// must be at a line boundary), return the byte offset just past the marker
809/// line (after the terminating newline, or at EOF). The doc-block grammar
810/// requires the marker to be alone on its line; leading horizontal whitespace
811/// is allowed and ignored.
812fn doc_block_open_at(source: &str, pos: usize) -> Option<usize> {
813 let bytes = source.as_bytes();
814 if !at_line_start(source, pos) {
815 return None;
816 }
817 // Skip leading horizontal whitespace.
818 let mut i = pos;
819 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
820 i += 1;
821 }
822 if i + 3 > bytes.len() {
823 return None;
824 }
825 if &bytes[i..i + 3] != b"---" {
826 return None;
827 }
828 i += 3;
829 // The marker may have additional trailing dashes (per spec "three or more
830 // consecutive hyphens"). Consume them.
831 while i < bytes.len() && bytes[i] == b'-' {
832 i += 1;
833 }
834 // After the dashes, allow only horizontal whitespace then newline/EOF.
835 while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t' || bytes[i] == b'\r') {
836 i += 1;
837 }
838 if i == bytes.len() {
839 return Some(i);
840 }
841 if bytes[i] == b'\n' {
842 return Some(i + 1);
843 }
844 None
845}
846
847/// Find the next closing `---` line at or after `pos`. Returns
848/// `(start_of_line, end_of_line)` (`end_of_line` is just past the
849/// terminating newline, or at EOF).
850fn doc_block_close(source: &str, mut pos: usize) -> Option<(usize, usize)> {
851 let bytes = source.as_bytes();
852 while pos < bytes.len() {
853 // Advance pos to the start of a line.
854 let line_start = pos;
855 // Find the end of this line.
856 let mut line_end = line_start;
857 while line_end < bytes.len() && bytes[line_end] != b'\n' {
858 line_end += 1;
859 }
860 // Check this line.
861 if let Some(end) = doc_block_open_at(source, line_start) {
862 return Some((line_start, end));
863 }
864 // Move to the next line.
865 pos = if line_end < bytes.len() {
866 line_end + 1
867 } else {
868 line_end
869 };
870 }
871 None
872}
873
/// Is byte offset `pos` the first column of a line?
///
/// True at offset 0 and at any offset whose preceding byte is `\n`. `pos` may
/// equal `source.len()` (the position just after a trailing newline counts as
/// a line start) but must not exceed it.
fn at_line_start(source: &str, pos: usize) -> bool {
    pos == 0 || source.as_bytes()[pos - 1] == b'\n'
}
882
883/// Extract the body content of a doc-block token from its source span.
884/// Strips the leading and trailing `---` marker lines and returns the body
885/// verbatim. If every non-empty content line begins with the same horizontal
886/// whitespace prefix (e.g., because the doc block sits inside a brace-form
887/// commons body), that common prefix is removed so the body reads naturally
888/// when emitted as JSDoc.
889pub fn doc_block_content(source: &str, span: Span) -> String {
890 let slice = &source[span.range()];
891 // Drop the first line (opening marker).
892 let after_open = match slice.find('\n') {
893 Some(i) => &slice[i + 1..],
894 None => return String::new(),
895 };
896 let bytes = after_open.as_bytes();
897 // Trim the trailing closing-marker line.
898 let mut i = bytes.len();
899 if i > 0 && bytes[i - 1] == b'\n' {
900 i -= 1;
901 }
902 while i > 0 && matches!(bytes[i - 1], b' ' | b'\t' | b'\r') {
903 i -= 1;
904 }
905 while i > 0 && bytes[i - 1] == b'-' {
906 i -= 1;
907 }
908 if i > 0 && bytes[i - 1] == b'\n' {
909 i -= 1;
910 }
911 let body = &after_open[..i];
912
913 // Compute the common leading-whitespace prefix across all non-empty lines
914 // and strip it. This lets writers indent the doc block alongside the
915 // declaration it documents without bleeding the indent into the JSDoc.
916 let common: Option<usize> = body
917 .lines()
918 .filter(|l| !l.trim().is_empty())
919 .map(|l| l.bytes().take_while(|&b| b == b' ' || b == b'\t').count())
920 .min();
921 let strip = common.unwrap_or(0);
922 if strip == 0 {
923 return body.to_string();
924 }
925 let mut out = String::with_capacity(body.len());
926 let mut first = true;
927 for line in body.lines() {
928 if !first {
929 out.push('\n');
930 }
931 first = false;
932 if line.trim().is_empty() {
933 // Preserve blank lines.
934 continue;
935 }
936 let leading: usize = line
937 .bytes()
938 .take_while(|&b| b == b' ' || b == b'\t')
939 .count();
940 let drop = strip.min(leading);
941 out.push_str(&line[drop..]);
942 }
943 out
944}
945
946/// Extract the body of a `Comment` trivia token: everything after the
947/// leading `--` marker, preserving its inline whitespace verbatim. Used by
948/// the parser when attaching comments to declarations.
949pub fn comment_body(source: &str, span: Span) -> &str {
950 let slice = &source[span.range()];
951 // Strip leading "--" if present (defensive — the lexer always emits
952 // Comment tokens whose span begins with `--`).
953 slice.strip_prefix("--").unwrap_or(slice)
954}
955
/// Returns true if a blank line separates byte offset `from` from byte offset
/// `to` in `source`. Used by the parser to detect orphan doc blocks.
///
/// The range examined is `from..to` (`from` inclusive, `to` exclusive). The
/// result is true exactly when the first byte in that range that is not a
/// space, tab, or carriage return is a newline.
///
/// A doc-block token's span ends just past the closing-marker line's
/// terminating newline. So if the next declaration begins on the immediately
/// following line, the substring between contains no newline (only optional
/// indentation). Any newline reached before other content therefore implies at
/// least one entirely-blank line separating the doc from the declaration.
///
/// Never panics: an empty or inverted range yields `false`, and a `to` beyond
/// the end of `source` is clamped to the source length.
pub fn has_blank_line_between(source: &str, from: usize, to: usize) -> bool {
    let bytes = source.as_bytes();
    let to = to.min(bytes.len());
    if to <= from {
        return false;
    }
    let first_significant = bytes[from..to]
        .iter()
        .find(|&&b| !matches!(b, b' ' | b'\t' | b'\r'));
    first_significant == Some(&b'\n')
}
982
#[cfg(test)]
mod tests {
    // Token kinds are written as `K::…` rather than glob-imported, because
    // several variants (`Ok`, `Err`, `Some`, `None`, `Option`, `Result`,
    // `String`) share their names with prelude items.
    use super::TokenKind as K;
    use super::*;

    /// Lexes `source` and returns only the kinds of the resulting tokens.
    /// Panics if lexing fails.
    fn kinds(source: &str) -> Vec<TokenKind> {
        let tokens = tokenize(source).unwrap();
        tokens.iter().map(|token| token.kind).collect()
    }

    #[test]
    fn keywords_and_idents() {
        let expected = vec![
            K::Commons,
            K::Type,
            K::Fn,
            K::Where,
            K::And,
            K::True,
            K::False,
            K::Int,
            K::String,
            K::Bool,
            K::Ident,
            K::Ident,
        ];
        assert_eq!(
            kinds("commons type fn where and true false Int String Bool foo bar"),
            expected
        );
    }

    #[test]
    fn integer_and_string_literals() {
        let expected = vec![K::IntLit, K::IntLit, K::StrLit, K::StrLit];
        assert_eq!(kinds(r#"0 42 "hello" "with\nescape""#), expected);
    }

    #[test]
    fn operators() {
        let expected = vec![
            K::Arrow,
            K::EqEq,
            K::BangEq,
            K::LtEq,
            K::GtEq,
            K::AmpAmp,
            K::PipePipe,
            K::Plus,
            K::Minus,
            K::Star,
            K::Slash,
            K::Bang,
            K::Eq,
            K::Lt,
            K::Gt,
            K::LParen,
            K::RParen,
            K::LBrace,
            K::RBrace,
            K::LBracket,
            K::RBracket,
            K::Comma,
            K::Colon,
            K::Dot,
            K::At,
        ];
        assert_eq!(
            kinds("-> == != <= >= && || + - * / ! = < > ( ) { } [ ] , : . @"),
            expected
        );
    }

    #[test]
    fn line_comments_emitted_as_trivia() {
        // v1.1: line comments survive lexing as `Comment` tokens so that the
        // formatter can attach them and emit them again.
        let expected = vec![
            K::Comment,
            K::Type,
            K::Ident,
            K::Eq,
            K::Int,
            K::Comment,
        ];
        assert_eq!(
            kinds("-- a comment\ntype X = Int -- trailing\n"),
            expected
        );
    }

    #[test]
    fn comment_body_extracts_text_after_marker() {
        let src = "-- hello world\n";
        let toks = tokenize(src).unwrap();
        assert_eq!(toks.len(), 1);

        let comment = &toks[0];
        assert_eq!(comment.kind, K::Comment);
        assert_eq!(comment_body(src, comment.span), " hello world");
    }

    #[test]
    fn comment_does_not_consume_newline() {
        // Two comment lines in a row must come out as two separate tokens:
        // the newline between them belongs to neither comment's span.
        let toks = tokenize("-- one\n-- two\n").unwrap();
        assert_eq!(toks.len(), 2);
        assert!(toks.iter().all(|tok| matches!(tok.kind, K::Comment)));
    }

    #[test]
    fn unterminated_string_is_error() {
        let src = "\"oops\n";
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn integer_overflow_is_error() {
        let src = "99999999999999999999";
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.integer_overflow");
    }

    #[test]
    fn unexpected_character_is_error() {
        let src = "type X = Int $";
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.unexpected_character");
    }

    #[test]
    fn v0_1_keywords() {
        let expected = vec![
            K::Let,
            K::If,
            K::Else,
            K::Ok,
            K::Err,
            K::Result,
            K::ValidationError,
        ];
        assert_eq!(kinds("let if else Ok Err Result ValidationError"), expected);
    }

    #[test]
    fn question_token() {
        let expected = vec![K::Ident, K::Question];
        assert_eq!(kinds("x?"), expected);
    }

    #[test]
    fn v0_2_keywords() {
        let expected = vec![
            K::Enum,
            K::Match,
            K::Option,
            K::Record,
            K::Self_,
            K::Some,
            K::None,
            K::Is,
        ];
        assert_eq!(kinds("enum match Option record self Some None is"), expected);
    }

    #[test]
    fn pipe_and_pipe_pipe_disambiguated() {
        let expected = vec![K::Pipe, K::PipePipe, K::Pipe];
        assert_eq!(kinds("| || |"), expected);
    }

    #[test]
    fn v0_7_keywords() {
        let still_keywords = vec![K::Expect, K::Suite, K::Case];
        assert_eq!(kinds("expect suite case"), still_keywords);

        // v0.118: `mocks` and `wires` are retired, so they now lex as plain
        // identifiers.
        let retired = vec![K::Ident, K::Ident];
        assert_eq!(kinds("mocks wires"), retired);
    }

    #[test]
    fn fat_arrow_and_underscore() {
        let expected = vec![K::Underscore, K::FatArrow];
        assert_eq!(kinds("_ =>"), expected);
    }

    // -- v0.43 string interpolation --

    #[test]
    fn interp_string_is_one_token() {
        let with_hole = vec![K::InterpStr];
        assert_eq!(kinds(r#""Hello, \(name)!""#), with_hole);

        // Without a hole the string remains a `StrLit`, lexed on the logos
        // path.
        let without_hole = vec![K::StrLit];
        assert_eq!(kinds(r#""Hello, world""#), without_hole);
    }

    #[test]
    fn interp_balances_nested_parens_and_strings() {
        // The `)` that closes `f(x)` must not end the hole prematurely.
        assert_eq!(kinds(r#""= \(f(x))""#), vec![K::InterpStr]);
        // Nor must a `)` sitting inside a string nested within the hole.
        assert_eq!(kinds(r#""= \(label(")"))""#), vec![K::InterpStr]);
        // An interpolated string nested inside a hole.
        assert_eq!(kinds(r#""out \("in \(x)")""#), vec![K::InterpStr]);
    }

    // Issue #473: hole-expanding tokenisation makes identifiers inside `\(…)`
    // visible to the LSP's token-based cursor resolution.
    #[test]
    fn expanding_holes_exposes_hole_identifiers() {
        /// Kinds of the tokens produced by the hole-expanding tokenizer.
        fn expand(src: &str) -> Vec<TokenKind> {
            let tokens = tokenize_expanding_holes(src).unwrap();
            tokens.iter().map(|token| token.kind).collect()
        }

        // The opaque `InterpStr` gives way to the tokens of its hole; the
        // chunk text (`Hello, ` / `!`) contributes no tokens.
        assert_eq!(expand(r#""Hello, \(name)!""#), vec![K::Ident]);
        // A hole holding a call exposes each token of the call expression.
        assert_eq!(
            expand(r#""= \(f(x))""#),
            vec![K::Ident, K::LParen, K::Ident, K::RParen]
        );
        // Nested interpolation recurses down to the innermost identifier.
        assert_eq!(expand(r#""out \("in \(x)")""#), vec![K::Ident]);
        // A plain string with no hole is left alone.
        assert_eq!(expand(r#""Hello, world""#), vec![K::StrLit]);
    }

    #[test]
    fn expanding_holes_rebases_spans_to_absolute() {
        let src = r#""Hello, \(name)!""#;
        let toks = tokenize_expanding_holes(src).unwrap();
        let ident = toks
            .iter()
            .find(|tok| matches!(tok.kind, K::Ident))
            .expect("the hole identifier is exposed");

        // The span addresses `name` within the original source rather than
        // starting from a hole-local offset of 0.
        assert_eq!(&src[ident.span.range()], "name");
        assert_eq!(ident.span.start, src.find("name").unwrap());
    }

    #[test]
    fn escaped_open_paren_is_not_a_hole() {
        // `\\(` is a literal backslash followed by `(`: there is no hole, so
        // the string lexes as a plain `StrLit` on the logos path.
        let expected = vec![K::StrLit];
        assert_eq!(kinds(r#""a \\(b) c""#), expected);
    }

    #[test]
    fn unterminated_hole_is_an_error() {
        // The hole reaches the end of the line with no closing `)`.
        let src = "\"value \\(x + 1\n\"";
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.unterminated_interpolation");
    }

    #[test]
    fn unterminated_interp_string_is_an_error() {
        // The hole closes, but the string does not (a newline comes before
        // any closing `"`).
        let src = "\"value \\(x) more\n";
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.unterminated_string");
    }

    #[test]
    fn bad_escape_in_interp_string_is_an_error() {
        let src = r#""a \q \(x)""#;
        let failure = tokenize(src).unwrap_err();
        assert_eq!(failure.category, "bynk.lex.bad_escape");
    }
}