Skip to main content

mdwright_latex/
inspect.rs

1//! Narrow inspection of math-body source for downstream linters.
2//!
3//! Linters that need to enumerate every TeX command, environment, and text-mode
4//! region inside a math body should use this surface rather than the parser:
5//! the parser rejects commands outside mdwright's Unicode subset, so its tree
6//! does not see them. The inspect walk operates on the token stream and yields
7//! every command sighting, whether or not mdwright can render it.
8
9use crate::SourceSpan;
10use crate::lexer::{Token, TokenKind, TokenStream};
11
/// One event from a left-to-right walk of math-body source.
///
/// Spans are byte ranges into the source slice passed to `inspect_math_body`.
/// `Command` and environment names are returned without the leading backslash
/// and borrow directly from that source slice.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CommandEvent<'src> {
    /// A `\name` command word. This covers every command except a `\begin` or
    /// `\end` that is followed by a readable `{name}` group: text-mode
    /// commands such as `\text` are reported here too (followed by a
    /// `TextModeEnter` when a brace group follows), and a `\begin`/`\end`
    /// without a usable brace group falls back to this variant. The span is
    /// the span of the command token itself.
    Command {
        /// Command name without the leading backslash.
        name: &'src str,
        /// Byte range covering the command token in the source.
        span: SourceSpan,
    },
    /// A `\begin{name}` opener. The span covers `\begin` and the brace group.
    EnvironmentEnter {
        /// Environment name as written inside the braces, with surrounding
        /// whitespace trimmed.
        name: &'src str,
        /// Byte range from `\begin` through the closing brace.
        span: SourceSpan,
    },
    /// An `\end{name}` closer. The span covers `\end` and the brace group.
    ///
    /// NOTE(review): the walk emits this for every `\end{name}` it can read;
    /// it does not verify that a matching `EnvironmentEnter` came before it,
    /// so consumers that need pairing must check names themselves.
    EnvironmentExit {
        /// Environment name as written inside the braces, with surrounding
        /// whitespace trimmed.
        name: &'src str,
        /// Byte range from `\end` through the closing brace.
        span: SourceSpan,
    },
    /// Entry into a text-mode region (`\text{...}` and friends). The span covers
    /// the opening brace only. Emitted as soon as the opening brace is seen, so
    /// an unclosed group has no corresponding `TextModeExit`.
    TextModeEnter {
        /// Byte range of the opening brace.
        span: SourceSpan,
    },
    /// Exit from a text-mode region. The span covers the closing brace only.
    TextModeExit {
        /// Byte range of the closing brace.
        span: SourceSpan,
    },
}
52
53/// Walk `source` as a TeX math body and return the command-usage event stream.
54///
55/// The walk is lexer-based: it does not run the parser, does not reject
56/// commands, and allocates only the result vector. Unbalanced groups inside
57/// `\begin{...}` or text-mode commands are tolerated by recovery — they may
58/// produce fewer paired `EnvironmentEnter`/`Exit` or `TextModeEnter`/`Exit`
59/// events but never dangling ones.
60#[must_use]
61pub fn inspect_math_body(source: &str) -> Vec<CommandEvent<'_>> {
62    let stream = TokenStream::new(source);
63    let tokens = stream.tokens();
64    let mut events = Vec::new();
65    let mut text_stack: Vec<usize> = Vec::new();
66    let mut env_stack: Vec<&str> = Vec::new();
67    let mut depth: usize = 0;
68
69    let mut index = 0;
70    while let Some(token) = tokens.get(index) {
71        match token.kind() {
72            TokenKind::CommandWord(raw) => {
73                let name = raw.strip_prefix('\\').unwrap_or(raw);
74                let next_index = index.saturating_add(1);
75                if name == "begin" {
76                    if let Some((env_name, group_end_index, end_span)) = read_braced_name(source, tokens, next_index) {
77                        let span = SourceSpan::new(token.span().start(), end_span.end());
78                        events.push(CommandEvent::EnvironmentEnter { name: env_name, span });
79                        env_stack.push(env_name);
80                        index = group_end_index.saturating_add(1);
81                        continue;
82                    }
83                    events.push(CommandEvent::Command {
84                        name,
85                        span: token.span(),
86                    });
87                } else if name == "end" {
88                    if let Some((env_name, group_end_index, end_span)) = read_braced_name(source, tokens, next_index) {
89                        let span = SourceSpan::new(token.span().start(), end_span.end());
90                        if env_stack.last() == Some(&env_name) {
91                            env_stack.pop();
92                        }
93                        events.push(CommandEvent::EnvironmentExit { name: env_name, span });
94                        index = group_end_index.saturating_add(1);
95                        continue;
96                    }
97                    events.push(CommandEvent::Command {
98                        name,
99                        span: token.span(),
100                    });
101                } else if is_text_mode_command(name) {
102                    events.push(CommandEvent::Command {
103                        name,
104                        span: token.span(),
105                    });
106                    if let Some(open_index) = skip_trivia(tokens, next_index)
107                        && let Some(open_token) = tokens.get(open_index)
108                        && matches!(open_token.kind(), TokenKind::LeftBrace)
109                    {
110                        events.push(CommandEvent::TextModeEnter {
111                            span: open_token.span(),
112                        });
113                        text_stack.push(depth.saturating_add(1));
114                    }
115                } else {
116                    events.push(CommandEvent::Command {
117                        name,
118                        span: token.span(),
119                    });
120                }
121            }
122            TokenKind::LeftBrace => {
123                depth = depth.saturating_add(1);
124            }
125            TokenKind::RightBrace => {
126                if text_stack.last() == Some(&depth) {
127                    text_stack.pop();
128                    events.push(CommandEvent::TextModeExit { span: token.span() });
129                }
130                depth = depth.saturating_sub(1);
131            }
132            TokenKind::ControlSymbol(_)
133            | TokenKind::LeftBracket
134            | TokenKind::RightBracket
135            | TokenKind::LeftParen
136            | TokenKind::RightParen
137            | TokenKind::Superscript
138            | TokenKind::Subscript
139            | TokenKind::Alignment
140            | TokenKind::RowSeparator
141            | TokenKind::Comment(_)
142            | TokenKind::Whitespace(_)
143            | TokenKind::Number(_)
144            | TokenKind::Identifier(_)
145            | TokenKind::Punctuation(_)
146            | TokenKind::UnicodeSymbol(_)
147            | TokenKind::Error
148            | TokenKind::Eof => {}
149        }
150        index = index.saturating_add(1);
151    }
152
153    events
154}
155
156fn skip_trivia(tokens: &[Token<'_>], start: usize) -> Option<usize> {
157    let mut index = start;
158    while let Some(token) = tokens.get(index) {
159        if matches!(token.kind(), TokenKind::Whitespace(_) | TokenKind::Comment(_)) {
160            index = index.saturating_add(1);
161            continue;
162        }
163        if matches!(token.kind(), TokenKind::Eof) {
164            return None;
165        }
166        return Some(index);
167    }
168    None
169}
170
171/// Read a `{ name }` group starting at `start`, mirroring the parser's
172/// `parse_raw_braced_text` behaviour: gather everything between the braces as
173/// raw source text. Returns the borrowed name slice, the index of the closing
174/// brace token, and that token's span. Returns `None` if the brace group is
175/// absent or unbalanced.
176fn read_braced_name<'src>(
177    source: &'src str,
178    tokens: &[Token<'src>],
179    start: usize,
180) -> Option<(&'src str, usize, SourceSpan)> {
181    let open_index = skip_trivia(tokens, start)?;
182    let open_token = tokens.get(open_index)?;
183    if !matches!(open_token.kind(), TokenKind::LeftBrace) {
184        return None;
185    }
186    let content_start = open_token.span().end();
187    let mut cursor = open_index.saturating_add(1);
188    while let Some(token) = tokens.get(cursor) {
189        if matches!(token.kind(), TokenKind::RightBrace) {
190            let close_span = token.span();
191            let content_end = close_span.start();
192            let raw = source.get(content_start..content_end)?;
193            let trimmed = raw.trim();
194            if trimmed.is_empty() {
195                return None;
196            }
197            let offset = raw.find(trimmed).unwrap_or(0);
198            let start_offset = content_start.saturating_add(offset);
199            let end_offset = start_offset.saturating_add(trimmed.len());
200            let borrowed = source.get(start_offset..end_offset)?;
201            return Some((borrowed, cursor, close_span));
202        }
203        if matches!(token.kind(), TokenKind::Eof) {
204            return None;
205        }
206        cursor = cursor.saturating_add(1);
207    }
208    None
209}
210
/// Report whether `name` (given without its leading backslash) is one of the
/// commands whose brace argument is treated as a text-mode region.
fn is_text_mode_command(name: &str) -> bool {
    const TEXT_MODE_COMMANDS: [&str; 9] = [
        "text",
        "textrm",
        "textbf",
        "textit",
        "textsf",
        "texttt",
        "textnormal",
        "mbox",
        "hbox",
    ];
    TEXT_MODE_COMMANDS.contains(&name)
}
217
#[cfg(test)]
mod tests {
    #![allow(
        clippy::indexing_slicing,
        clippy::panic,
        clippy::unwrap_used,
        reason = "tests assert event shape and span text against known inputs"
    )]

    use super::*;

    /// Render one event as a compact label so whole streams compare as strings.
    fn label(event: &CommandEvent<'_>) -> String {
        match event {
            CommandEvent::Command { name, .. } => format!("cmd:{name}"),
            CommandEvent::EnvironmentEnter { name, .. } => format!("env+:{name}"),
            CommandEvent::EnvironmentExit { name, .. } => format!("env-:{name}"),
            CommandEvent::TextModeEnter { .. } => "text+".to_owned(),
            CommandEvent::TextModeExit { .. } => "text-".to_owned(),
        }
    }

    /// Inspect `source` and return the label of every event, in order.
    fn labels_for(source: &str) -> Vec<String> {
        inspect_math_body(source).iter().map(label).collect()
    }

    #[test]
    fn enumerates_top_level_commands_with_spans() {
        let source = r"\alpha + \beta";
        let events = inspect_math_body(source);
        let labels: Vec<String> = events.iter().map(label).collect();
        assert_eq!(labels, ["cmd:alpha", "cmd:beta"]);

        // The first event's span must slice back to the command as written.
        let Some(CommandEvent::Command { span, .. }) = events.first().copied() else {
            panic!("expected command event");
        };
        assert_eq!(&source[span.as_range()], r"\alpha");
    }

    #[test]
    fn pairs_begin_and_end_for_environments() {
        assert_eq!(
            labels_for(r"\begin{matrix}a & b\end{matrix}"),
            ["env+:matrix", "env-:matrix"]
        );
    }

    #[test]
    fn captures_starred_environment_names() {
        assert_eq!(
            labels_for(r"\begin{align*}x\end{align*}"),
            ["env+:align*", "env-:align*"]
        );
    }

    #[test]
    fn enters_and_exits_text_mode_on_text_command() {
        assert_eq!(
            labels_for(r"\text{hello \alpha}"),
            ["cmd:text", "text+", "cmd:alpha", "text-"]
        );
    }

    #[test]
    fn pairs_nested_text_with_outer_brace_groups() {
        assert_eq!(
            labels_for(r"{x \text{y \alpha} z}"),
            ["cmd:text", "text+", "cmd:alpha", "text-"]
        );
    }

    #[test]
    fn surfaces_commands_the_parser_rejects() {
        assert_eq!(
            labels_for(r"\xrightarrow{f} \ce{H2O}"),
            ["cmd:xrightarrow", "cmd:ce"]
        );
    }

    #[test]
    fn falls_back_to_command_when_begin_has_no_argument() {
        assert_eq!(labels_for(r"\begin"), ["cmd:begin"]);
    }

    #[test]
    fn does_not_emit_text_events_inside_unrelated_groups() {
        assert!(inspect_math_body(r"{a + b}").is_empty());
    }
}