Skip to main content

lex_analysis/
inline.rs

1//! Inline-level analysis utilities.
2//!
3//! Extracts positioned references from text content by walking the AST's InlineNode
4//! tree and raw source text in parallel to compute correct byte positions.
5
6use lex_core::lex::ast::{Position, Range, TextContent};
7use lex_core::lex::inlines::{InlineNode, ReferenceInline, ReferenceType};
8
9/// A reference found in inline text, with its source position and classified type.
10#[derive(Debug, Clone, PartialEq)]
11pub struct PositionedReference {
12    pub range: Range,
13    pub reference_type: ReferenceType,
14    pub raw: String,
15}
16
17/// Extract all references from a text node with their source positions.
18///
19/// Walks the InlineNode tree (from `TextContent::inline_items()`) and the raw source
20/// text in parallel. Non-reference nodes (Plain, Strong, Emphasis, Code, Math) are
21/// skipped over — only Reference nodes produce output.
22pub fn extract_references(text: &TextContent) -> Vec<PositionedReference> {
23    let Some(base_range) = text.location.as_ref() else {
24        return Vec::new();
25    };
26    let raw = text.as_string();
27    if raw.is_empty() {
28        return Vec::new();
29    }
30    let nodes = text.inline_items();
31    let mut walker = ReferenceWalker {
32        raw,
33        base_range,
34        cursor: 0,
35        refs: Vec::new(),
36    };
37    walker.walk_nodes(&nodes);
38    walker.refs
39}
40
41struct ReferenceWalker<'a> {
42    raw: &'a str,
43    base_range: &'a Range,
44    cursor: usize,
45    refs: Vec<PositionedReference>,
46}
47
48impl<'a> ReferenceWalker<'a> {
49    fn walk_nodes(&mut self, nodes: &[InlineNode]) {
50        for node in nodes {
51            self.walk_node(node);
52        }
53    }
54
55    fn walk_node(&mut self, node: &InlineNode) {
56        match node {
57            InlineNode::Plain { text, .. } => self.skip_plain(text),
58            InlineNode::Strong { content, .. } => self.skip_container(content, '*'),
59            InlineNode::Emphasis { content, .. } => self.skip_container(content, '_'),
60            InlineNode::Code { text, .. } => self.skip_literal(text, '`'),
61            InlineNode::Math { text, .. } => self.skip_literal(text, '#'),
62            InlineNode::Reference { data, .. } => self.collect_reference(data),
63        }
64    }
65
66    fn skip_plain(&mut self, text: &str) {
67        self.advance_unescaped(text);
68    }
69
70    fn skip_container(&mut self, content: &[InlineNode], marker: char) {
71        self.cursor += marker.len_utf8(); // opening marker
72        self.walk_nodes(content);
73        self.cursor += marker.len_utf8(); // closing marker
74    }
75
76    fn skip_literal(&mut self, text: &str, marker: char) {
77        self.cursor += marker.len_utf8(); // opening marker
78        self.cursor += text.len(); // verbatim content
79        self.cursor += marker.len_utf8(); // closing marker
80    }
81
82    fn collect_reference(&mut self, data: &ReferenceInline) {
83        self.cursor += 1; // opening '['
84
85        let content_start = self.cursor;
86        self.cursor += data.raw.len();
87        let content_end = self.cursor;
88
89        self.cursor += 1; // closing ']'
90
91        if content_start < content_end {
92            self.refs.push(PositionedReference {
93                range: self.make_range(content_start, content_end),
94                reference_type: data.reference_type.clone(),
95                raw: data.raw.clone(),
96            });
97        }
98    }
99
100    /// Advance cursor through raw text matching unescaped plain text.
101    fn advance_unescaped(&mut self, text: &str) {
102        for _expected in text.chars() {
103            if self.cursor >= self.raw.len() {
104                break;
105            }
106            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
107            if raw_ch == '\\' {
108                if self.cursor + 1 >= self.raw.len() {
109                    // Trailing backslash: treat as literal to avoid out-of-bounds slicing.
110                    self.cursor += 1;
111                } else {
112                    let next_ch = self.raw[self.cursor + 1..].chars().next();
113                    match next_ch {
114                        Some(nc) if !nc.is_alphanumeric() => {
115                            // Escaped: raw `\X` → unescaped `X`
116                            self.cursor += 1 + nc.len_utf8();
117                        }
118                        _ => {
119                            // Literal backslash
120                            self.cursor += 1;
121                        }
122                    }
123                }
124            } else {
125                self.cursor += raw_ch.len_utf8();
126            }
127        }
128    }
129
130    fn make_range(&self, start: usize, end: usize) -> Range {
131        let start_pos = self.position_at(start);
132        let end_pos = self.position_at(end);
133        Range::new(
134            (self.base_range.span.start + start)..(self.base_range.span.start + end),
135            start_pos,
136            end_pos,
137        )
138    }
139
140    fn position_at(&self, offset: usize) -> Position {
141        // `column` units are UTF-16 code units to match LSP's default
142        // `positionEncoding`. See the matching `position_at` in
143        // `lex_core::lex::ast::inline_positions` for the full rationale.
144        // This duplicate walker should be folded into the shared
145        // `InlinePositionVisitor` machinery as a follow-up.
146        let mut line = self.base_range.start.line;
147        let mut column = self.base_range.start.column;
148        for ch in self.raw[..offset].chars() {
149            if ch == '\n' {
150                line += 1;
151                column = 0;
152            } else {
153                column += ch.len_utf16();
154            }
155        }
156        Position::new(line, column)
157    }
158}
159
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a single-line `TextContent` starting at (`line`, `column`) whose
    /// byte span is `0..content.len()`.
    ///
    /// The end column is measured in UTF-16 code units, matching the column
    /// convention used by `position_at`. `content` is expected to contain no
    /// newline, since the end position stays on `line`.
    fn text_with_range(content: &str, line: usize, column: usize) -> TextContent {
        let utf16_len: usize = content.chars().map(char::len_utf16).sum();
        let start = Position::new(line, column);
        let end = Position::new(line, column + utf16_len);
        let range = Range::new(0..content.len(), start, end);
        TextContent::from_string(content.to_string(), Some(range))
    }

    #[test]
    fn extracts_references_with_classification() {
        let text = text_with_range("See [::note] and [@spec2024] plus [42]", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 3);
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::AnnotationReference { .. })));
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::Citation(_))));
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::FootnoteNumber { .. })));
    }

    #[test]
    fn reference_ranges_are_correct() {
        let text = text_with_range("Hello [world] end", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].raw, "world");
        // "world" starts at byte 7 (after "Hello ["), ends at byte 12
        assert_eq!(refs[0].range.span, 7..12);
    }

    /// Positions are relative to the text node's own start line and column,
    /// not to the origin.
    #[test]
    fn reference_positions_are_offset_by_base_position() {
        let text = text_with_range("Hello [world] end", 3, 4);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        // Content starts 7 columns in and ends 12 columns in, shifted by the
        // base column 4; the line is the base line.
        assert_eq!(refs[0].range.start, Position::new(3, 11));
        assert_eq!(refs[0].range.end, Position::new(3, 16));
    }

    #[test]
    fn references_inside_formatting() {
        let text = text_with_range("*bold [ref]* end", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].raw, "ref");
    }

    #[test]
    fn escaped_brackets_not_references() {
        let text = text_with_range("\\[not a ref\\]", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    #[test]
    fn empty_text_returns_nothing() {
        let text = text_with_range("", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    /// Mirrors the UTF-16 invariant pinned in
    /// `lex_core::lex::ast::inline_positions` — a `→` (1 UTF-16 unit, 3
    /// UTF-8 bytes) before the reference must shift columns by 1, not by 3.
    /// Without this, `find_references` / `goto_definition` jump cursors
    /// would land on the wrong character whenever the line contained any
    /// non-ASCII text before a reference.
    #[test]
    fn reference_columns_are_utf16_units_after_arrow() {
        // "see → [ref] end"
        //  utf-16 cols: s=0 e=1 e=2 ' '=3 →=4 ' '=5 [=6 r=7 e=8 f=9 ]=10 ' '=11 e=12 ...
        //  utf-8 bytes: s=0 e=1 e=2 ' '=3 →=4,5,6 ' '=7 [=8 r=9 ...
        let text = text_with_range("see → [ref] end", 0, 0);

        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        let r = &refs[0];
        assert_eq!(r.raw, "ref");
        // Byte span for the *content* between brackets — `[` at byte 8, `r`
        // at byte 9, `]` at byte 12, content range is 9..12.
        assert_eq!(r.range.span, 9..12, "byte span (UTF-8 bytes)");
        // UTF-16: `r` at column 7, `]` at column 10, content range 7..10.
        assert_eq!(
            r.range.start,
            Position::new(0, 7),
            "content start column should be UTF-16 (got {:?})",
            r.range.start
        );
        assert_eq!(r.range.end, Position::new(0, 10));
    }

    #[test]
    fn no_location_returns_nothing() {
        let text = TextContent::from_string("Hello [world]".to_string(), None);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    #[test]
    fn trailing_backslash_does_not_panic() {
        // Double backslash in raw text: `Hello\\` — should not panic.
        let text = text_with_range("Hello\\\\", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());

        // Single trailing backslash in raw text: `Hello\` — the critical edge case.
        let text2 = text_with_range("Hello\\", 0, 0);
        let refs2 = extract_references(&text2);
        assert!(refs2.is_empty());
    }
}