Skip to main content

lex_analysis/
inline.rs

1//! Inline-level analysis utilities.
2//!
3//! Extracts positioned references from text content by walking the AST's InlineNode
4//! tree and raw source text in parallel to compute correct byte positions.
5
6use lex_core::lex::ast::{Position, Range, TextContent};
7use lex_core::lex::inlines::{InlineNode, ReferenceInline, ReferenceType};
8
9/// A reference found in inline text, with its source position and classified type.
10#[derive(Debug, Clone, PartialEq)]
11pub struct PositionedReference {
12    pub range: Range,
13    pub reference_type: ReferenceType,
14    pub raw: String,
15}
16
17/// Extract all references from a text node with their source positions.
18///
19/// Walks the InlineNode tree (from `TextContent::inline_items()`) and the raw source
20/// text in parallel. Non-reference nodes (Plain, Strong, Emphasis, Code, Math) are
21/// skipped over — only Reference nodes produce output.
22pub fn extract_references(text: &TextContent) -> Vec<PositionedReference> {
23    let Some(base_range) = text.location.as_ref() else {
24        return Vec::new();
25    };
26    let raw = text.as_string();
27    if raw.is_empty() {
28        return Vec::new();
29    }
30    let nodes = text.inline_items();
31    let mut walker = ReferenceWalker {
32        raw,
33        base_range,
34        cursor: 0,
35        refs: Vec::new(),
36    };
37    walker.walk_nodes(&nodes);
38    walker.refs
39}
40
41struct ReferenceWalker<'a> {
42    raw: &'a str,
43    base_range: &'a Range,
44    cursor: usize,
45    refs: Vec<PositionedReference>,
46}
47
48impl<'a> ReferenceWalker<'a> {
49    fn walk_nodes(&mut self, nodes: &[InlineNode]) {
50        for node in nodes {
51            self.walk_node(node);
52        }
53    }
54
55    fn walk_node(&mut self, node: &InlineNode) {
56        match node {
57            InlineNode::Plain { text, .. } => self.skip_plain(text),
58            InlineNode::Strong { content, .. } => self.skip_container(content, '*'),
59            InlineNode::Emphasis { content, .. } => self.skip_container(content, '_'),
60            InlineNode::Code { text, .. } => self.skip_literal(text, '`'),
61            InlineNode::Math { text, .. } => self.skip_literal(text, '#'),
62            InlineNode::Reference { data, .. } => self.collect_reference(data),
63        }
64    }
65
66    fn skip_plain(&mut self, text: &str) {
67        self.advance_unescaped(text);
68    }
69
70    fn skip_container(&mut self, content: &[InlineNode], marker: char) {
71        self.cursor += marker.len_utf8(); // opening marker
72        self.walk_nodes(content);
73        self.cursor += marker.len_utf8(); // closing marker
74    }
75
76    fn skip_literal(&mut self, text: &str, marker: char) {
77        self.cursor += marker.len_utf8(); // opening marker
78        self.cursor += text.len(); // verbatim content
79        self.cursor += marker.len_utf8(); // closing marker
80    }
81
82    fn collect_reference(&mut self, data: &ReferenceInline) {
83        self.cursor += 1; // opening '['
84
85        let content_start = self.cursor;
86        self.cursor += data.raw.len();
87        let content_end = self.cursor;
88
89        self.cursor += 1; // closing ']'
90
91        if content_start < content_end {
92            self.refs.push(PositionedReference {
93                range: self.make_range(content_start, content_end),
94                reference_type: data.reference_type.clone(),
95                raw: data.raw.clone(),
96            });
97        }
98    }
99
100    /// Advance cursor through raw text matching unescaped plain text.
101    fn advance_unescaped(&mut self, text: &str) {
102        for _expected in text.chars() {
103            if self.cursor >= self.raw.len() {
104                break;
105            }
106            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
107            if raw_ch == '\\' {
108                if self.cursor + 1 >= self.raw.len() {
109                    // Trailing backslash: treat as literal to avoid out-of-bounds slicing.
110                    self.cursor += 1;
111                } else {
112                    let next_ch = self.raw[self.cursor + 1..].chars().next();
113                    match next_ch {
114                        Some(nc) if !nc.is_alphanumeric() => {
115                            // Escaped: raw `\X` → unescaped `X`
116                            self.cursor += 1 + nc.len_utf8();
117                        }
118                        _ => {
119                            // Literal backslash
120                            self.cursor += 1;
121                        }
122                    }
123                }
124            } else {
125                self.cursor += raw_ch.len_utf8();
126            }
127        }
128    }
129
130    fn make_range(&self, start: usize, end: usize) -> Range {
131        let start_pos = self.position_at(start);
132        let end_pos = self.position_at(end);
133        Range::new(
134            (self.base_range.span.start + start)..(self.base_range.span.start + end),
135            start_pos,
136            end_pos,
137        )
138    }
139
140    fn position_at(&self, offset: usize) -> Position {
141        let mut line = self.base_range.start.line;
142        let mut column = self.base_range.start.column;
143        for ch in self.raw[..offset].chars() {
144            if ch == '\n' {
145                line += 1;
146                column = 0;
147            } else {
148                column += ch.len_utf8();
149            }
150        }
151        Position::new(line, column)
152    }
153}
154
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `TextContent` for `content` whose location starts at
    /// `line`/`column` and whose byte span is `0..content.len()`.
    fn text_with_range(content: &str, line: usize, column: usize) -> TextContent {
        let byte_len = content.len();
        let location = Range::new(
            0..byte_len,
            Position::new(line, column),
            Position::new(line, column + byte_len),
        );
        TextContent::from_string(content.to_owned(), Some(location))
    }

    /// Extract references from `content` located at line 0, column 0.
    fn refs_at_origin(content: &str) -> Vec<PositionedReference> {
        extract_references(&text_with_range(content, 0, 0))
    }

    #[test]
    fn extracts_references_with_classification() {
        let found = refs_at_origin("See [^note] and [@spec2024] plus [42]");
        assert_eq!(found.len(), 3);

        let has_kind =
            |is_kind: fn(&ReferenceType) -> bool| found.iter().any(|r| is_kind(&r.reference_type));
        assert!(has_kind(|kind| matches!(
            kind,
            ReferenceType::FootnoteLabeled { .. }
        )));
        assert!(has_kind(|kind| matches!(kind, ReferenceType::Citation(_))));
        assert!(has_kind(|kind| matches!(
            kind,
            ReferenceType::FootnoteNumber { .. }
        )));
    }

    #[test]
    fn reference_ranges_are_correct() {
        let found = refs_at_origin("Hello [world] end");
        assert_eq!(found.len(), 1);

        let only = &found[0];
        assert_eq!(only.raw, "world");
        // "Hello [" occupies bytes 0..7, so "world" spans bytes 7..12.
        assert_eq!(only.range.span, 7..12);
    }

    #[test]
    fn references_inside_formatting() {
        let found = refs_at_origin("*bold [ref]* end");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].raw, "ref");
    }

    #[test]
    fn escaped_brackets_not_references() {
        assert!(refs_at_origin("\\[not a ref\\]").is_empty());
    }

    #[test]
    fn empty_text_returns_nothing() {
        assert!(refs_at_origin("").is_empty());
    }

    #[test]
    fn no_location_returns_nothing() {
        let unlocated = TextContent::from_string("Hello [world]".to_string(), None);
        assert!(extract_references(&unlocated).is_empty());
    }

    #[test]
    fn trailing_backslash_does_not_panic() {
        // First case: raw text ends in a double backslash (`Hello\\`).
        // Second case: raw text ends in a single backslash (`Hello\`), the
        // critical edge case for out-of-bounds slicing.
        for raw in ["Hello\\\\", "Hello\\"] {
            assert!(refs_at_origin(raw).is_empty());
        }
    }
}