lex-core 0.15.0

//! Inline-position visitor for [`TextContent`].
//!
//! Walks the inline tree of a parsed text run while tracking byte offsets
//! through the raw source, and fires visitor callbacks at each leaf and
//! container boundary. Each callback receives a precise [`Range`] (with both
//! byte span and `line:column` positions) so consumers can emit LSP semantic
//! tokens, document links, text objects, references for goto-def, etc.
//! without re-implementing cursor and escape arithmetic.
//!
//! ## Why this exists
//!
//! Two consumers in the workspace need the same cursor logic:
//!
//! - [`super::Document::find_all_links`] /
//!   [`super::Session::find_all_links`] — emit a `DocumentLink` per
//!   URL/File reference, with a range covering exactly the `[bracketed]`
//!   text.
//! - `lex-analysis::semantic_tokens` — emits per-marker semantic tokens
//!   (`*` open, content span, `*` close, …) for every inline element type.
//!
//! Before consolidation, each consumer carried its own walker with its own
//! copy of the escape-handling and marker-stepping logic. Any change to the
//! inline parser (e.g., new escape rule) had to land in two places and was
//! easy to miss. This module is the single source of truth.
//!
//! ## Inline-tree shape
//!
//! The inline parser produces nodes from these variants (see
//! [`InlineNode`]):
//!
//! | Variant | Source shape | Notes |
//! |---------|--------------|-------|
//! | `Plain` | `text` | Subject to escape rules; raw bytes can be longer than the unescaped char count. |
//! | `Strong` | `*content*` | `content` is recursive `[InlineNode]`; markers are single ASCII byte. |
//! | `Emphasis` | `_content_` | Same as Strong with `_` marker. |
//! | `Code` | `` `text` `` | `text` is literal — no escape processing inside. |
//! | `Math` | `#text#` | Same as Code with `#` marker. |
//! | `Reference` | `[content]` | `content` is literal; classified into `ReferenceType` already. |
//!
//! Containers (Strong/Emphasis) emit `enter_*` before recursing into
//! children and `leave_*` after — the `content` range passed to `leave_*`
//! covers the span between (but excluding) the markers. Literals
//! (Code/Math/Reference) get a single combined call with separate
//! `open_marker`, `content`, `close_marker` ranges so consumers can decorate
//! markers and content independently.

use super::range::{Position, Range};
use super::text_content::TextContent;
use crate::lex::inlines::{InlineNode, ReferenceInline};

/// Visitor for inline-tree walks performed by
/// [`walk_text_content_positions`]. Each method has a default no-op
/// implementation — callers override only what they care about.
///
/// Containers (`Strong`, `Emphasis`) fire `enter_*` before child recursion
/// and `leave_*` after, with the resolved content/close ranges passed at
/// `leave_*` time. Visitors that need to suppress something while inside a
/// container can track an `in_formatted` counter incremented on `enter_*`
/// and decremented on `leave_*`; the walker guarantees balanced nesting.
pub trait InlinePositionVisitor {
    fn visit_plain(&mut self, _range: &Range, _text: &str) {}
    fn enter_strong(&mut self, _open_marker: &Range) {}
    fn leave_strong(&mut self, _content: &Range, _close_marker: &Range) {}
    fn enter_emphasis(&mut self, _open_marker: &Range) {}
    fn leave_emphasis(&mut self, _content: &Range, _close_marker: &Range) {}
    fn visit_code(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _text: &str,
    ) {
    }
    fn visit_math(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _text: &str,
    ) {
    }
    fn visit_reference(
        &mut self,
        _open_marker: &Range,
        _content: &Range,
        _close_marker: &Range,
        _data: &ReferenceInline,
    ) {
    }
}

/// Walk `text`'s parsed inline tree, firing visitor callbacks with precise
/// source-position ranges.
///
/// Returns immediately without invoking the visitor when `text.location` is
/// `None` (no source range to anchor positions) or the raw text is empty.
/// Inline nodes come from [`TextContent::inlines`] when already parsed
/// (zero-allocation borrow — the common case after the standard
/// `parse_document` pipeline ran [`crate::lex::transforms::stages::ParseInlines`]),
/// falling back to [`TextContent::inline_items`] for programmatically
/// constructed ASTs that haven't been parsed yet.
///
/// Concretely the cursor advances through `text.as_string()` byte-for-byte,
/// applying the inline parser's escape rules (`\X` where `X` is
/// non-alphanumeric → 2 raw bytes for 1 unescaped char, any other backslash
/// stays literal). Marker characters (`*`, `_`, `` ` ``, `#`, `[`, `]`) are
/// counted by their UTF-8 width.
pub fn walk_text_content_positions<V: InlinePositionVisitor>(text: &TextContent, visitor: &mut V) {
    let Some(base_range) = text.location.as_ref() else {
        return;
    };
    let raw = text.as_string();
    if raw.is_empty() {
        return;
    }
    // Borrow when inlines were pre-parsed; only allocate when we have to
    // parse fresh. The standard `parse_document` pipeline always pre-parses,
    // so production traffic hits the borrow path.
    let owned;
    let nodes: &[InlineNode] = match text.inlines() {
        Some(borrowed) => borrowed,
        None => {
            owned = text.inline_items();
            &owned
        }
    };
    let mut walker = InlinePositionWalker {
        raw,
        base_range,
        cursor: 0,
    };
    walker.walk_nodes(nodes, visitor);
}

struct InlinePositionWalker<'a> {
    raw: &'a str,
    base_range: &'a Range,
    cursor: usize,
}

impl<'a> InlinePositionWalker<'a> {
    fn walk_nodes<V: InlinePositionVisitor>(&mut self, nodes: &[InlineNode], v: &mut V) {
        for node in nodes {
            self.walk_node(node, v);
        }
    }

    fn walk_node<V: InlinePositionVisitor>(&mut self, node: &InlineNode, v: &mut V) {
        match node {
            InlineNode::Plain { text, .. } => {
                let start = self.cursor;
                self.advance_unescaped(text);
                let end = self.cursor;
                if start < end {
                    let range = self.make_range(start, end);
                    v.visit_plain(&range, text);
                }
            }
            InlineNode::Strong { content, .. } => self.walk_strong(content, v),
            InlineNode::Emphasis { content, .. } => self.walk_emphasis(content, v),
            InlineNode::Code { text, .. } => self.walk_literal(text, '`', v, EmitLiteral::Code),
            InlineNode::Math { text, .. } => self.walk_literal(text, '#', v, EmitLiteral::Math),
            InlineNode::Reference { data, .. } => self.walk_reference(data, v),
        }
    }

    fn walk_strong<V: InlinePositionVisitor>(&mut self, children: &[InlineNode], v: &mut V) {
        let m = '*'.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);
        v.enter_strong(&open);

        let content_start = self.cursor;
        self.walk_nodes(children, v);
        let content_end = self.cursor;

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);
        let content = self.make_range(content_start, content_end);
        v.leave_strong(&content, &close);
    }

    fn walk_emphasis<V: InlinePositionVisitor>(&mut self, children: &[InlineNode], v: &mut V) {
        let m = '_'.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);
        v.enter_emphasis(&open);

        let content_start = self.cursor;
        self.walk_nodes(children, v);
        let content_end = self.cursor;

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);
        let content = self.make_range(content_start, content_end);
        v.leave_emphasis(&content, &close);
    }

    fn walk_literal<V: InlinePositionVisitor>(
        &mut self,
        text: &str,
        marker: char,
        v: &mut V,
        kind: EmitLiteral,
    ) {
        let m = marker.len_utf8();
        let open_start = self.cursor;
        self.cursor += m;
        let open = self.make_range(open_start, self.cursor);

        let content_start = self.cursor;
        self.cursor += text.len();
        let content = self.make_range(content_start, self.cursor);

        let close_start = self.cursor;
        self.cursor += m;
        let close = self.make_range(close_start, self.cursor);

        match kind {
            EmitLiteral::Code => v.visit_code(&open, &content, &close, text),
            EmitLiteral::Math => v.visit_math(&open, &content, &close, text),
        }
    }

    fn walk_reference<V: InlinePositionVisitor>(&mut self, data: &ReferenceInline, v: &mut V) {
        let open_start = self.cursor;
        self.cursor += 1;
        let open = self.make_range(open_start, self.cursor);

        let content_start = self.cursor;
        self.cursor += data.raw.len();
        let content = self.make_range(content_start, self.cursor);

        let close_start = self.cursor;
        self.cursor += 1;
        let close = self.make_range(close_start, self.cursor);

        v.visit_reference(&open, &content, &close, data);
    }

    /// Mirror the inline parser's escape handling so the cursor advances
    /// through raw bytes by the same amount the parser consumed when
    /// producing each unescaped char in the `Plain` node. `\X` with a
    /// non-alphanumeric `X` is consumed as 2 raw bytes for 1 unescaped char;
    /// any other backslash stays literal.
    fn advance_unescaped(&mut self, text: &str) {
        for _expected in text.chars() {
            if self.cursor >= self.raw.len() {
                break;
            }
            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
            if raw_ch == '\\' {
                if self.cursor + 1 >= self.raw.len() {
                    self.cursor += 1;
                } else {
                    let next_ch = self.raw[self.cursor + 1..].chars().next();
                    match next_ch {
                        Some(nc) if !nc.is_alphanumeric() => {
                            self.cursor += 1 + nc.len_utf8();
                        }
                        _ => {
                            self.cursor += 1;
                        }
                    }
                }
            } else {
                self.cursor += raw_ch.len_utf8();
            }
        }
    }

    fn make_range(&self, start: usize, end: usize) -> Range {
        let start_pos = self.position_at(start);
        let end_pos = self.position_at(end);
        Range::new(
            (self.base_range.span.start + start)..(self.base_range.span.start + end),
            start_pos,
            end_pos,
        )
    }

    fn position_at(&self, offset: usize) -> Position {
        // `column` units must match the LSP `positionEncoding` capability
        // negotiated with the client. We don't currently negotiate
        // `utf-8`/`utf-32`, so the spec-default `utf-16` applies — and that
        // matches what VSCode, Helix, and most other LSP clients use even
        // when they accept negotiation. So columns advance by each char's
        // UTF-16 code-unit width: 1 for BMP chars, 2 for supplementary
        // (e.g., emoji). Using `len_utf8` (the byte width) instead used to
        // shift every subsequent token right by `len_utf8 - len_utf16` for
        // each non-ASCII char on the line — visible in editors as semantic
        // tokens landing on the wrong character.
        let mut line = self.base_range.start.line;
        let mut column = self.base_range.start.column;
        for ch in self.raw[..offset].chars() {
            if ch == '\n' {
                line += 1;
                column = 0;
            } else {
                column += ch.len_utf16();
            }
        }
        Position::new(line, column)
    }
}

enum EmitLiteral {
    Code,
    Math,
}

#[cfg(test)]
mod tests {
    use super::super::range::Position;
    use super::super::text_content::TextContent;
    use super::*;

    #[derive(Default)]
    struct CodeCapture {
        opens: Vec<Range>,
        contents: Vec<Range>,
        closes: Vec<Range>,
    }

    impl InlinePositionVisitor for CodeCapture {
        fn visit_code(&mut self, open: &Range, content: &Range, close: &Range, _text: &str) {
            self.opens.push(open.clone());
            self.contents.push(content.clone());
            self.closes.push(close.clone());
        }
    }

    #[derive(Default)]
    struct StrongCapture {
        opens: Vec<Range>,
    }

    impl InlinePositionVisitor for StrongCapture {
        fn enter_strong(&mut self, open: &Range) {
            self.opens.push(open.clone());
        }
    }

    fn make_text_content(raw: &str) -> TextContent {
        // Build a TextContent rooted at line 0, column 0, spanning the full
        // input. `end.column` is the UTF-16 code-unit width — irrelevant for
        // these tests since we only assert positions at offsets we compute.
        let location = Range::new(
            0..raw.len(),
            Position::new(0, 0),
            Position::new(0, raw.chars().map(char::len_utf16).sum::<usize>()),
        );
        TextContent::from_string(raw.to_string(), Some(location))
    }

    /// LSP's default `positionEncoding` is UTF-16 code units, but the cursor
    /// walker's `position_at` was accumulating `column += ch.len_utf8()` for
    /// each char — the byte width. After any non-ASCII char (e.g., `→` is 3
    /// UTF-8 bytes / 1 UTF-16 unit) every following column was offset by
    /// `len_utf8 - len_utf16`, so VSCode painted the open-backtick token on
    /// the *next* character. This test pins the `→` case.
    ///
    /// ```text
    ///   "Hello → `Setup`"
    ///   utf-16 col:  H=0 e=1 l=2 l=3 o=4 ' '=5 →=6 ' '=7 `=8 S=9 ...
    ///   utf-8  byte: H=0 e=1 l=2 l=3 o=4 ' '=5 →=6,7,8 ' '=9 `=10 S=11 ...
    /// ```
    #[test]
    fn code_marker_columns_are_utf16_code_units_after_arrow() {
        let raw = "Hello → `Setup`";
        let content = make_text_content(raw);

        let mut visitor = CodeCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        // Byte span is UTF-8 bytes — `Range::span` semantics — and that's right.
        assert_eq!(open.span, 10..11, "byte span of the open backtick");
        assert_eq!(
            open.start,
            Position::new(0, 8),
            "open-marker column must be UTF-16 unit (8) not UTF-8 byte (10) — \
             got {:?}",
            open.start
        );
        assert_eq!(open.end, Position::new(0, 9));

        let body = visitor.contents.first().expect("captured content");
        assert_eq!(body.span, 11..16, "byte span of `Setup` content");
        assert_eq!(body.start, Position::new(0, 9));
        assert_eq!(body.end, Position::new(0, 14));

        let close = visitor.closes.first().expect("captured close marker");
        assert_eq!(close.span, 16..17, "byte span of close backtick");
        assert_eq!(close.start, Position::new(0, 14));
        assert_eq!(close.end, Position::new(0, 15));
    }

    /// Same root cause covers `*strong*` — the bug isn't specific to
    /// backticks, every container/literal goes through the same `position_at`.
    #[test]
    fn strong_marker_columns_are_utf16_code_units_after_arrow() {
        let raw = "Hello → *bold*";
        let content = make_text_content(raw);

        let mut visitor = StrongCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        assert_eq!(open.span, 10..11, "byte span of `*`");
        assert_eq!(open.start, Position::new(0, 8));
        assert_eq!(open.end, Position::new(0, 9));
    }

    /// A character outside the BMP — `🦀` (U+1F980) — is 4 UTF-8 bytes and
    /// 2 UTF-16 code units. So `len_utf8 != 1` and `len_utf16 != 1`. Pin the
    /// math here too: column should advance by `len_utf16` (2), not by
    /// `len_utf8` (4) and not by `1` either.
    #[test]
    fn columns_advance_by_utf16_units_for_supplementary_chars() {
        let raw = "x🦀 `c`";
        // utf-16 cols: x=0 🦀=1,2 ' '=3 `=4 c=5 `=6
        // utf-8 bytes: x=0 🦀=1..5 ' '=5 `=6 c=7 `=8
        let content = make_text_content(raw);

        let mut visitor = CodeCapture::default();
        walk_text_content_positions(&content, &mut visitor);

        let open = visitor.opens.first().expect("captured open marker");
        assert_eq!(open.span, 6..7, "byte span of `\\``");
        assert_eq!(
            open.start,
            Position::new(0, 4),
            "🦀 contributes 2 UTF-16 units (got column {:?})",
            open.start.column
        );
    }
}