// lex-core 0.15.0 — Parser library for the lex format.

//! Inline-position visitor for [`TextContent`].
//!
//! Walks the inline tree of a parsed text run while tracking byte offsets
//! through the raw source, and fires visitor callbacks at each leaf and
//! container boundary. Each callback receives a precise [`Range`] (with both
//! byte span and `line:column` positions) so consumers can emit LSP semantic
//! tokens, document links, text objects, references for goto-def, etc.
//! without re-implementing cursor and escape arithmetic.
//!
//! ## Why this exists
//!
//! Two consumers in the workspace need the same cursor logic:
//!
//! - [`super::Document::find_all_links`] /
//!   [`super::Session::find_all_links`] — emit a `DocumentLink` per
//!   URL/File reference, with a range covering exactly the `[bracketed]`
//!   text.
//! - `lex-analysis::semantic_tokens` — emits per-marker semantic tokens
//!   (`*` open, content span, `*` close, …) for every inline element type.
//!
//! Before consolidation, each consumer carried its own walker with its own
//! copy of the escape-handling and marker-stepping logic. Any change to the
//! inline parser (e.g., new escape rule) had to land in two places and was
//! easy to miss. This module is the single source of truth.
//!
//! ## Inline-tree shape
//!
//! The inline parser produces nodes from these variants (see
//! [`InlineNode`]):
//!
//! | Variant | Source shape | Notes |
//! |---------|--------------|-------|
//! | `Plain` | `text` | Subject to escape rules; raw bytes can be longer than the unescaped char count. |
//! | `Strong` | `*content*` | `content` is recursive `[InlineNode]`; each marker is a single ASCII byte. |
//! | `Emphasis` | `_content_` | Same as Strong with `_` marker. |
//! | `Code` | `` `text` `` | `text` is literal — no escape processing inside. |
//! | `Math` | `#text#` | Same as Code with `#` marker. |
//! | `Reference` | `[content]` | `content` is literal; classified into `ReferenceType` already. |
//!
//! Containers (Strong/Emphasis) emit `enter_*` before recursing into
//! children and `leave_*` after — the `content` range passed to `leave_*`
//! covers the span between (but excluding) the markers. Literals
//! (Code/Math/Reference) get a single combined call with separate
//! `open_marker`, `content`, `close_marker` ranges so consumers can decorate
//! markers and content independently.

use super::range::{Position, Range};
use super::text_content::TextContent;
use crate::lex::inlines::{InlineNode, ReferenceInline};

/// Visitor for inline-tree walks performed by
/// [`walk_text_content_positions`]. Each method has a default no-op
/// implementation — callers override only what they care about.
///
/// Containers (`Strong`, `Emphasis`) fire `enter_*` before child recursion
/// and `leave_*` after, with the resolved content/close ranges passed at
/// `leave_*` time. Visitors that need to suppress something while inside a
/// container can track an `in_formatted` counter incremented on `enter_*`
/// and decremented on `leave_*`; the walker guarantees balanced nesting.
pub trait InlinePositionVisitor {
    /// Called for a `Plain` node that consumed at least one raw byte.
    ///
    /// `_range` covers the raw source bytes the node occupies (escape
    /// backslashes included); `_text` is the node's text as stored in the
    /// inline tree.
    fn visit_plain(&mut self, _range: &Range, _text: &str) {}
    /// Called on reaching a `Strong` node, before any of its children are
    /// visited. `_open_marker` covers the opening `*`.
    fn enter_strong(&mut self, _open_marker: &Range) {}
    /// Called after every child of a `Strong` node has been visited.
    ///
    /// `_content` spans from just after the opening marker to just before
    /// the closing one; `_close_marker` covers the closing `*`.
    fn leave_strong(&mut self, _content: &Range, _close_marker: &Range) {}
    /// Called on reaching an `Emphasis` node, before any of its children are
    /// visited. `_open_marker` covers the opening `_`.
    fn enter_emphasis(&mut self, _open_marker: &Range) {}
    /// Called after every child of an `Emphasis` node has been visited.
    ///
    /// `_content` spans from just after the opening marker to just before
    /// the closing one; `_close_marker` covers the closing `_`.
    fn leave_emphasis(&mut self, _content: &Range, _close_marker: &Range) {}
    /// Called once per `Code` node.
    ///
    /// `_open_marker` and `_close_marker` cover the surrounding backticks,
    /// `_content` covers the `_text.len()` bytes between them, and `_text`
    /// is the node's literal text.
    fn visit_code(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _text: &str,
    ) {
    }
    /// Called once per `Math` node.
    ///
    /// Same shape as [`Self::visit_code`], with `#` as the marker on both
    /// sides.
    fn visit_math(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _text: &str,
    ) {
    }
    /// Called once per `Reference` node.
    ///
    /// `_open_marker` and `_close_marker` cover the `[` and `]` brackets,
    /// `_content` covers the `_data.raw.len()` bytes between them, and
    /// `_data` is the reference payload stored on the node.
    fn visit_reference(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _data: &ReferenceInline,
    ) {
    }
}

/// Drive `visitor` over the inline tree of `text`, handing every callback a
/// [`Range`] anchored in the original source.
///
/// Nothing is visited when `text.location` is `None` (there is no source
/// range to anchor positions to) or when the raw text is empty.
///
/// The nodes are borrowed from [`TextContent::inlines`] when the text has
/// already been parsed — the usual situation once the standard
/// `parse_document` pipeline has run
/// [`crate::lex::transforms::stages::ParseInlines`] — so that path does not
/// allocate. For ASTs built programmatically and not yet parsed, the nodes
/// come from [`TextContent::inline_items`] instead.
///
/// Internally a byte cursor moves through `text.as_string()` in lockstep
/// with the inline parser's escape rules: a backslash followed by a
/// non-alphanumeric char is consumed together with that char and yields a
/// single unescaped char, while any other backslash is taken literally.
/// Marker characters (`*`, `_`, `` ` ``, `#`, `[`, `]`) advance the cursor by
/// their UTF-8 width.
pub fn walk_text_content_positions<V: InlinePositionVisitor>(text: &TextContent, visitor: &mut V) {
    // Without a location there is nothing to anchor ranges to.
    let base_range = match text.location.as_ref() {
        Some(range) => range,
        None => return,
    };

    let raw = text.as_string();
    if raw.is_empty() {
        return;
    }

    // Declared up front so a freshly parsed tree outlives the borrow taken
    // below; it is only ever initialised on the fallback path.
    let parsed_on_demand;
    let nodes: &[InlineNode] = if let Some(pre_parsed) = text.inlines() {
        pre_parsed
    } else {
        parsed_on_demand = text.inline_items();
        &parsed_on_demand
    };

    InlinePositionWalker {
        raw,
        base_range,
        cursor: 0,
    }
    .walk_nodes(nodes, visitor);
}

/// Cursor state for one walk over the inline nodes of a single text run.
struct InlinePositionWalker<'a> {
    /// Raw source text of the run; `cursor` and all local offsets index
    /// into this string by byte.
    raw: &'a str,
    /// Location of the run in the document. Emitted byte spans are offset
    /// by `base_range.span.start`, and line/column counting starts from
    /// `base_range.start`.
    base_range: &'a Range,
    /// Byte offset into `raw` of the next byte the walk has not consumed.
    cursor: usize,
}

impl<'a> InlinePositionWalker<'a> {
    fn walk_nodes<V: InlinePositionVisitor>(&mut self, nodes: &[InlineNode], v: &mut V) {
        for node in nodes {
            self.walk_node(node, v);
        }
    }

    fn walk_node<V: InlinePositionVisitor>(&mut self, node: &InlineNode, v: &mut V) {
        match node {
            InlineNode::Plain { text, .. } => {
                let start = self.cursor;
                self.advance_unescaped(text);
                let end = self.cursor;
                if start < end {
                    let range = self.make_range(start, end);
                    v.visit_plain(&range, text);
                }
            }
            InlineNode::Strong { content, .. } => self.walk_strong(content, v),
            InlineNode::Emphasis { content, .. } => self.walk_emphasis(content, v),
            InlineNode::Code { text, .. } => self.walk_literal(text, '`', v, EmitLiteral::Code),
            InlineNode::Math { text, .. } => self.walk_literal(text, '#', v, EmitLiteral::Math),
            InlineNode::Reference { data, .. } => self.walk_reference(data, v),
        }
    }

    fn walk_strong<V: InlinePositionVisitor>(&mut self, children: &[InlineNode], v: &mut V) {
        let m = '*'.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);
        v.enter_strong(&open);

        let content_start = self.cursor;
        self.walk_nodes(children, v);
        let content_end = self.cursor;

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);
        let content = self.make_range(content_start, content_end);
        v.leave_strong(&content, &close);
    }

    fn walk_emphasis<V: InlinePositionVisitor>(&mut self, children: &[InlineNode], v: &mut V) {
        let m = '_'.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);
        v.enter_emphasis(&open);

        let content_start = self.cursor;
        self.walk_nodes(children, v);
        let content_end = self.cursor;

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);
        let content = self.make_range(content_start, content_end);
        v.leave_emphasis(&content, &close);
    }

    fn walk_literal<V: InlinePositionVisitor>(
        &mut self,
        text: &str,
        marker: char,
        v: &mut V,
        kind: EmitLiteral,
    ) {
        let m = marker.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);

        let content_start = self.cursor;
        self.cursor += text.len();
        let content = self.make_range(content_start, self.cursor);

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);

        match kind {
            EmitLiteral::Code => v.visit_code(&open, &content, &close, text),
            EmitLiteral::Math => v.visit_math(&open, &content, &close, text),
        }
    }

    fn walk_reference<V: InlinePositionVisitor>(&mut self, data: &ReferenceInline, v: &mut V) {
        let open_start = self.cursor;
        self.cursor += 1;
        let open = self.make_range(open_start, self.cursor);

        let content_start = self.cursor;
        self.cursor += data.raw.len();
        let content = self.make_range(content_start, self.cursor);

        let close_start = self.cursor;
        self.cursor += 1;
        let close = self.make_range(close_start, self.cursor);

        v.visit_reference(&open, &content, &close, data);
    }

    /// Mirror the inline parser's escape handling so the cursor advances
    /// through raw bytes by the same amount the parser consumed when
    /// producing each unescaped char in the `Plain` node. `\X` with a
    /// non-alphanumeric `X` is consumed as 2 raw bytes for 1 unescaped char;
    /// any other backslash stays literal.
    fn advance_unescaped(&mut self, text: &str) {
        for _expected in text.chars() {
            if self.cursor >= self.raw.len() {
                break;
            }
            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
            if raw_ch == '\\' {
                if self.cursor + 1 >= self.raw.len() {
                    self.cursor += 1;
                } else {
                    let next_ch = self.raw[self.cursor + 1..].chars().next();
                    match next_ch {
                        Some(nc) if !nc.is_alphanumeric() => {
                            self.cursor += 1 + nc.len_utf8();
                        }
                        _ => {
                            self.cursor += 1;
                        }
                    }
                }
            } else {
                self.cursor += raw_ch.len_utf8();
            }
        }
    }

    fn make_range(&self, start: usize, end: usize) -> Range {
        let start_pos = self.position_at(start);
        let end_pos = self.position_at(end);
        Range::new(
            (self.base_range.span.start + start)..(self.base_range.span.start + end),
            start_pos,
            end_pos,
        )
    }

    fn position_at(&self, offset: usize) -> Position {
        // `column` units must match the LSP `positionEncoding` capability
        // negotiated with the client. We don't currently negotiate
        // `utf-8`/`utf-32`, so the spec-default `utf-16` applies — and that
        // matches what VSCode, Helix, and most other LSP clients use even
        // when they accept negotiation. So columns advance by each char's
        // UTF-16 code-unit width: 1 for BMP chars, 2 for supplementary
        // (e.g., emoji). Using `len_utf8` (the byte width) instead used to
        // shift every subsequent token right by `len_utf8 - len_utf16` for
        // each non-ASCII char on the line — visible in editors as semantic
        // tokens landing on the wrong character.
        let mut line = self.base_range.start.line;
        let mut column = self.base_range.start.column;
        for ch in self.raw[..offset].chars() {
            if ch == '\n' {
                line += 1;
                column = 0;
            } else {
                column += ch.len_utf16();
            }
        }
        Position::new(line, column)
    }
}

/// Selects which visitor callback `InlinePositionWalker::walk_literal` fires
/// once it has computed the marker and content ranges of a literal node.
enum EmitLiteral {
    /// Fire `InlinePositionVisitor::visit_code`.
    Code,
    /// Fire `InlinePositionVisitor::visit_math`.
    Math,
}

// Regression tests pinning that emitted columns are counted in UTF-16 code
// units while byte spans stay in UTF-8 bytes.
#[cfg(test)]
mod tests {
    use super::super::range::Position;
    use super::super::text_content::TextContent;
    use super::*;

    /// Visitor that records the three ranges handed to every `visit_code`
    /// call, in call order.
    #[derive(Default)]
    struct CodeCapture {
        opens: Vec<Range>,
        contents: Vec<Range>,
        closes: Vec<Range>,
    }

    impl InlinePositionVisitor for CodeCapture {
        fn visit_code(&mut self, open: &Range, content: &Range, close: &Range, _text: &str) {
            self.opens.push(open.clone());
            self.contents.push(content.clone());
            self.closes.push(close.clone());
        }
    }

    /// Visitor that records the open-marker range of every `enter_strong`
    /// call, in call order.
    #[derive(Default)]
    struct StrongCapture {
        opens: Vec<Range>,
    }

    impl InlinePositionVisitor for StrongCapture {
        fn enter_strong(&mut self, open: &Range) {
            self.opens.push(open.clone());
        }
    }

    /// Build a single-line `TextContent` for `raw` whose location starts at
    /// byte 0, line 0, column 0.
    fn make_text_content(raw: &str) -> TextContent {
        // Build a TextContent rooted at line 0, column 0, spanning the full
        // input. `end.column` is the UTF-16 code-unit width — irrelevant for
        // these tests since we only assert positions at offsets we compute.
        let location = Range::new(
            0..raw.len(),
            Position::new(0, 0),
            Position::new(0, raw.chars().map(char::len_utf16).sum::<usize>()),
        );
        TextContent::from_string(raw.to_string(), Some(location))
    }

    /// LSP's default `positionEncoding` is UTF-16 code units, but the cursor
    /// walker's `position_at` was accumulating `column += ch.len_utf8()` for
    /// each char — the byte width. After any non-ASCII char (e.g., `→` is 3
    /// UTF-8 bytes / 1 UTF-16 unit) every following column was offset by
    /// `len_utf8 - len_utf16`, so VSCode painted the open-backtick token on
    /// the *next* character. This test pins the `→` case.
    ///
    /// ```text
    ///   "Hello → `Setup`"
    ///   utf-16 col:  H=0 e=1 l=2 l=3 o=4 ' '=5 →=6 ' '=7 `=8 S=9 ...
    ///   utf-8  byte: H=0 e=1 l=2 l=3 o=4 ' '=5 →=6,7,8 ' '=9 `=10 S=11 ...
    /// ```
    #[test]
    fn code_marker_columns_are_utf16_code_units_after_arrow() {
        let raw = "Hello → `Setup`";
        let content = make_text_content(raw);

        let mut visitor = CodeCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        // Byte span is UTF-8 bytes — `Range::span` semantics — and that's right.
        assert_eq!(open.span, 10..11, "byte span of the open backtick");
        assert_eq!(
            open.start,
            Position::new(0, 8),
            "open-marker column must be UTF-16 unit (8) not UTF-8 byte (10) — \
             got {:?}",
            open.start
        );
        assert_eq!(open.end, Position::new(0, 9));

        let body = visitor.contents.first().expect("captured content");
        assert_eq!(body.span, 11..16, "byte span of `Setup` content");
        assert_eq!(body.start, Position::new(0, 9));
        assert_eq!(body.end, Position::new(0, 14));

        let close = visitor.closes.first().expect("captured close marker");
        assert_eq!(close.span, 16..17, "byte span of close backtick");
        assert_eq!(close.start, Position::new(0, 14));
        assert_eq!(close.end, Position::new(0, 15));
    }

    /// Same root cause covers `*strong*` — the bug isn't specific to
    /// backticks, every container/literal goes through the same `position_at`.
    #[test]
    fn strong_marker_columns_are_utf16_code_units_after_arrow() {
        let raw = "Hello → *bold*";
        let content = make_text_content(raw);

        let mut visitor = StrongCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        assert_eq!(open.span, 10..11, "byte span of `*`");
        assert_eq!(open.start, Position::new(0, 8));
        assert_eq!(open.end, Position::new(0, 9));
    }

    /// A character outside the BMP — `🦀` (U+1F980) — is 4 UTF-8 bytes and
    /// 2 UTF-16 code units. So `len_utf8 != 1` and `len_utf16 != 1`. Pin the
    /// math here too: column should advance by `len_utf16` (2), not by
    /// `len_utf8` (4) and not by `1` either.
    #[test]
    fn columns_advance_by_utf16_units_for_supplementary_chars() {
        let raw = "x🦀 `c`";
        // utf-16 cols: x=0 🦀=1,2 ' '=3 `=4 c=5 `=6
        // utf-8 bytes: x=0 🦀=1..5 ' '=5 `=6 c=7 `=8
        let content = make_text_content(raw);

        let mut visitor = CodeCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        assert_eq!(open.span, 6..7, "byte span of `\\``");
        assert_eq!(
            open.start,
            Position::new(0, 4),
            "🦀 contributes 2 UTF-16 units (got column {:?})",
            open.start.column
        );
    }
}