Skip to main content

lex_analysis/
inline.rs

1//! Inline-level analysis utilities.
2//!
3//! Extracts positioned references from text content by walking the AST's InlineNode
4//! tree and raw source text in parallel to compute correct byte positions.
5
6use lex_core::lex::ast::{Position, Range, TextContent};
7use lex_core::lex::inlines::{InlineNode, ReferenceInline, ReferenceType};
8
9/// A reference found in inline text, with its source position and classified type.
10#[derive(Debug, Clone, PartialEq)]
11pub struct PositionedReference {
12    pub range: Range,
13    pub reference_type: ReferenceType,
14    pub raw: String,
15}
16
17/// Extract all references from a text node with their source positions.
18///
19/// Walks the InlineNode tree (from `TextContent::inline_items()`) and the raw source
20/// text in parallel. Non-reference nodes (Plain, Strong, Emphasis, Code, Math) are
21/// skipped over — only Reference nodes produce output.
22pub fn extract_references(text: &TextContent) -> Vec<PositionedReference> {
23    let Some(base_range) = text.location.as_ref() else {
24        return Vec::new();
25    };
26    let raw = text.as_string();
27    if raw.is_empty() {
28        return Vec::new();
29    }
30    let nodes = text.inline_items();
31    let mut walker = ReferenceWalker {
32        raw,
33        base_range,
34        cursor: 0,
35        refs: Vec::new(),
36    };
37    walker.walk_nodes(&nodes);
38    walker.refs
39}
40
41struct ReferenceWalker<'a> {
42    raw: &'a str,
43    base_range: &'a Range,
44    cursor: usize,
45    refs: Vec<PositionedReference>,
46}
47
48impl<'a> ReferenceWalker<'a> {
49    fn walk_nodes(&mut self, nodes: &[InlineNode]) {
50        for node in nodes {
51            self.walk_node(node);
52        }
53    }
54
55    fn walk_node(&mut self, node: &InlineNode) {
56        match node {
57            InlineNode::Plain { text, .. } => self.skip_plain(text),
58            InlineNode::Strong { content, .. } => self.skip_container(content, '*'),
59            InlineNode::Emphasis { content, .. } => self.skip_container(content, '_'),
60            InlineNode::Code { text, .. } => self.skip_literal(text, '`'),
61            InlineNode::Math { text, .. } => self.skip_literal(text, '#'),
62            InlineNode::Reference { data, .. } => self.collect_reference(data),
63        }
64    }
65
66    fn skip_plain(&mut self, text: &str) {
67        self.advance_unescaped(text);
68    }
69
70    fn skip_container(&mut self, content: &[InlineNode], marker: char) {
71        self.cursor += marker.len_utf8(); // opening marker
72        self.walk_nodes(content);
73        self.cursor += marker.len_utf8(); // closing marker
74    }
75
76    fn skip_literal(&mut self, text: &str, marker: char) {
77        self.cursor += marker.len_utf8(); // opening marker
78        self.cursor += text.len(); // verbatim content
79        self.cursor += marker.len_utf8(); // closing marker
80    }
81
82    fn collect_reference(&mut self, data: &ReferenceInline) {
83        self.cursor += 1; // opening '['
84
85        let content_start = self.cursor;
86        self.cursor += data.raw.len();
87        let content_end = self.cursor;
88
89        self.cursor += 1; // closing ']'
90
91        if content_start < content_end {
92            self.refs.push(PositionedReference {
93                range: self.make_range(content_start, content_end),
94                reference_type: data.reference_type.clone(),
95                raw: data.raw.clone(),
96            });
97        }
98    }
99
100    /// Advance cursor through raw text matching unescaped plain text.
101    fn advance_unescaped(&mut self, text: &str) {
102        for _expected in text.chars() {
103            if self.cursor >= self.raw.len() {
104                break;
105            }
106            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
107            if raw_ch == '\\' {
108                if self.cursor + 1 >= self.raw.len() {
109                    // Trailing backslash: treat as literal to avoid out-of-bounds slicing.
110                    self.cursor += 1;
111                } else {
112                    let next_ch = self.raw[self.cursor + 1..].chars().next();
113                    match next_ch {
114                        Some(nc) if !nc.is_alphanumeric() => {
115                            // Escaped: raw `\X` → unescaped `X`
116                            self.cursor += 1 + nc.len_utf8();
117                        }
118                        _ => {
119                            // Literal backslash
120                            self.cursor += 1;
121                        }
122                    }
123                }
124            } else {
125                self.cursor += raw_ch.len_utf8();
126            }
127        }
128    }
129
130    fn make_range(&self, start: usize, end: usize) -> Range {
131        let start_pos = self.position_at(start);
132        let end_pos = self.position_at(end);
133        let mut range = Range::new(
134            (self.base_range.span.start + start)..(self.base_range.span.start + end),
135            start_pos,
136            end_pos,
137        );
138        // Carry the text's origin so reference diagnostics built from
139        // this range are blamed on the file the text came from (e.g. an
140        // included fragment), not the entry document.
141        range.origin_path = self.base_range.origin_path.clone();
142        range
143    }
144
145    fn position_at(&self, offset: usize) -> Position {
146        // `column` units are UTF-16 code units to match LSP's default
147        // `positionEncoding`. See the matching `position_at` in
148        // `lex_core::lex::ast::inline_positions` for the full rationale.
149        // This duplicate walker should be folded into the shared
150        // `InlinePositionVisitor` machinery as a follow-up.
151        let mut line = self.base_range.start.line;
152        let mut column = self.base_range.start.column;
153        for ch in self.raw[..offset].chars() {
154            if ch == '\n' {
155                line += 1;
156                column = 0;
157            } else {
158                column += ch.len_utf16();
159            }
160        }
161        Position::new(line, column)
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a single-line `TextContent` whose range starts at `line`/`column`.
    ///
    /// The byte span is `0..content.len()`; the end column is measured in
    /// UTF-16 code units so the fixture follows the same column convention the
    /// walker produces, including for non-ASCII content.
    fn text_with_range(content: &str, line: usize, column: usize) -> TextContent {
        let utf16_len: usize = content.chars().map(char::len_utf16).sum();
        let start = Position::new(line, column);
        let end = Position::new(line, column + utf16_len);
        let range = Range::new(0..content.len(), start, end);
        TextContent::from_string(content.to_string(), Some(range))
    }

    #[test]
    fn extracts_references_with_classification() {
        let text = text_with_range("See [::note] and [@spec2024] plus [42]", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 3);
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::AnnotationReference { .. })));
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::Citation(_))));
        assert!(refs
            .iter()
            .any(|r| matches!(r.reference_type, ReferenceType::FootnoteNumber { .. })));
    }

    #[test]
    fn reference_ranges_are_correct() {
        let text = text_with_range("Hello [world] end", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].raw, "world");
        // "world" starts at byte 7 (after "Hello ["), ends at byte 12
        assert_eq!(refs[0].range.span, 7..12);
    }

    #[test]
    fn references_inside_formatting() {
        let text = text_with_range("*bold [ref]* end", 0, 0);
        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        assert_eq!(refs[0].raw, "ref");
    }

    #[test]
    fn escaped_brackets_not_references() {
        let text = text_with_range("\\[not a ref\\]", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    #[test]
    fn empty_text_returns_nothing() {
        let text = text_with_range("", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    /// Mirrors the UTF-16 invariant pinned in
    /// `lex_core::lex::ast::inline_positions` — a `→` (1 UTF-16 unit, 3
    /// UTF-8 bytes) before the reference must shift columns by 1, not by 3.
    /// Without this, `find_references` / `goto_definition` jump cursors
    /// would land on the wrong character whenever the line contained any
    /// non-ASCII text before a reference.
    #[test]
    fn reference_columns_are_utf16_units_after_arrow() {
        // "see → [ref] end"
        //  utf-16 cols: s=0 e=1 e=2 ' '=3 →=4 ' '=5 [=6 r=7 e=8 f=9 ]=10 ' '=11 e=12 ...
        //  utf-8 bytes: s=0 e=1 e=2 ' '=3 →=4,5,6 ' '=7 [=8 r=9 ...
        let text = text_with_range("see → [ref] end", 0, 0);

        let refs = extract_references(&text);
        assert_eq!(refs.len(), 1);
        let r = &refs[0];
        assert_eq!(r.raw, "ref");
        // Byte span for the *content* between brackets — `[` at byte 8, `r`
        // at byte 9, `]` at byte 12, content range is 9..12.
        assert_eq!(r.range.span, 9..12, "byte span (UTF-8 bytes)");
        // UTF-16: `r` at column 7, `]` at column 10, content range 7..10.
        assert_eq!(
            r.range.start,
            Position::new(0, 7),
            "content start column should be UTF-16 (got {:?})",
            r.range.start
        );
        assert_eq!(r.range.end, Position::new(0, 10));
    }

    #[test]
    fn no_location_returns_nothing() {
        let text = TextContent::from_string("Hello [world]".to_string(), None);
        let refs = extract_references(&text);
        assert!(refs.is_empty());
    }

    #[test]
    fn trailing_backslash_does_not_panic() {
        // Double backslash in raw text: `Hello\\` — should not panic.
        let text = text_with_range("Hello\\\\", 0, 0);
        let refs = extract_references(&text);
        assert!(refs.is_empty());

        // Single trailing backslash in raw text: `Hello\` — the critical edge case.
        let text2 = text_with_range("Hello\\", 0, 0);
        let refs2 = extract_references(&text2);
        assert!(refs2.is_empty());
    }
}