Skip to main content

panache_parser/parser/blocks/
raw_blocks.rs

1//! Raw TeX block parsing (LaTeX commands and non-math environments)
2//!
3//! This module handles block-level raw TeX content:
4//! 1. LaTeX commands: `\DeclareMathOperator`, `\newcommand`, etc.
5//! 2. Non-math environments: `\begin{tabular}`, `\begin{figure}`, etc.
6//!
7//! Math environments (equation, align, etc.) are handled as INLINE content
8//! in paragraphs, not as blocks. See INLINE_MATH_ENVIRONMENTS list below.
9//!
10//! Per Pandoc behavior:
11//! - Consecutive LaTeX command lines are grouped into a single TEX_BLOCK
12//! - Non-math environments become TEX_BLOCK
13//! - Math environments are parsed inline (in paragraphs)
14//! - Blank lines or non-LaTeX content terminate the block
15//! - Only enabled when `raw_tex` extension is active
16
17use crate::options::ParserOptions;
18use crate::parser::utils::helpers::trim_end_newlines;
19use crate::syntax::SyntaxKind;
20use rowan::GreenNodeBuilder;
21
/// Names of the LaTeX math environments that Pandoc treats as inline
/// content (a `RawInline` inside a `Para`). A `\begin{...}` naming one of
/// these must never open a block-level raw TeX node.
///
/// Source: pandoc/src/Text/Pandoc/Readers/LaTeX/Math.hs:L97-L123
const INLINE_MATH_ENVIRONMENTS: &[&str] = &[
    // Basic display environments.
    "displaymath",
    "math",
    "equation",
    "equation*",
    // Multi-line environments (starred and unstarred).
    "gather",
    "gather*",
    "multline",
    "multline*",
    "eqnarray",
    "eqnarray*",
    "align",
    "align*",
    "alignat",
    "alignat*",
    "flalign",
    "flalign*",
    // `d`-prefixed environments.
    "dmath",
    "dmath*",
    "dgroup",
    "dgroup*",
    "darray",
    "darray*",
    // Wrapper environment.
    "subequations",
];

/// Returns `true` when `name` exactly matches (case-sensitively) one of the
/// entries in [`INLINE_MATH_ENVIRONMENTS`].
pub fn is_inline_math_environment(name: &str) -> bool {
    INLINE_MATH_ENVIRONMENTS
        .iter()
        .any(|&candidate| candidate == name)
}
56
/// Borrow the environment name out of a line that opens with
/// `\begin{name}` (optionally indented with spaces or tabs).
///
/// Returns `None` when the line does not start with `\begin{`, when the
/// closing `}` is missing, or when the name between the braces is empty.
///
/// The result is a slice of `line`, so no allocation happens here; the
/// block dispatcher / paragraph scanner calls this for many lines and a
/// `String` per call shows up in profiles.
pub fn extract_environment_name(line: &str) -> Option<&str> {
    // Only ASCII space and tab count as indentation. Counting bytes keeps
    // the slice below on a char boundary because both are single-byte and
    // avoids the Unicode-aware scan done by `str::trim_start`.
    let indent = line
        .bytes()
        .take_while(|&b| b == b' ' || b == b'\t')
        .count();

    let (name, _rest) = line[indent..].strip_prefix("\\begin{")?.split_once('}')?;

    if name.is_empty() { None } else { Some(name) }
}
83
84/// Check if content could start a raw TeX block.
85///
86/// Requirements:
87/// - `raw_tex` extension must be enabled
88/// - Line must start with backslash followed by a letter
89/// - If it's a `\begin{env}`, the environment must NOT be an inline math env
90pub fn can_start_raw_block(content: &str, config: &ParserOptions) -> bool {
91    // Must have raw_tex extension enabled
92    if !config.extensions.raw_tex {
93        return false;
94    }
95
96    // Check if it's a \begin{env} line
97    if let Some(env_name) = extract_environment_name(content) {
98        // Skip inline math environments - they should be parsed inline in paragraphs
99        if is_inline_math_environment(env_name) {
100            return false;
101        }
102        // Non-math environment: parse as block
103        return true;
104    }
105
106    // Check if we're at the start of a line with a LaTeX command
107    is_latex_command_line(content)
108}
109
/// Report whether `line`, after leading whitespace, begins with a backslash
/// that is immediately followed by an ASCII letter.
///
/// Display-math delimiters (`\[`, `\]`), a lone backslash, and backslashes
/// followed by digits, spaces or punctuation are all rejected, because the
/// character after the backslash is not an ASCII letter.
fn is_latex_command_line(line: &str) -> bool {
    let first_after_backslash = line
        .trim_start()
        .strip_prefix('\\')
        .and_then(|rest| rest.chars().next());

    matches!(first_after_backslash, Some(c) if c.is_ascii_alphabetic())
}
132
133/// Parse a raw TeX block from lines array.
134///
135/// Collects one or more consecutive lines of LaTeX commands into a single
136/// TEX_BLOCK node, stopping at blank lines or non-LaTeX content.
137///
138/// Returns the number of lines consumed.
139pub fn parse_raw_tex_block(
140    builder: &mut GreenNodeBuilder<'static>,
141    lines: &[&str],
142    start_pos: usize,
143    blockquote_depth: usize,
144) -> usize {
145    log::trace!("Starting raw TeX block at line {}", start_pos);
146
147    builder.start_node(SyntaxKind::TEX_BLOCK.into());
148
149    let first_line = lines[start_pos];
150    let first_line_inner = crate::parser::blocks::blockquotes::strip_n_blockquote_markers(
151        first_line,
152        blockquote_depth,
153    );
154    if !is_latex_command_line(first_line_inner)
155        && extract_environment_name(first_line_inner).is_none()
156    {
157        builder.finish_node();
158        log::trace!("Finished raw TeX block, consumed 0 lines");
159        return 0;
160    }
161
162    // Check if this is an environment
163    let lines_consumed = if let Some(env_name) = extract_environment_name(first_line_inner) {
164        // Parse environment: \begin{env}...content...\end{env}
165        parse_tex_environment_lines(builder, lines, start_pos, env_name, blockquote_depth)
166    } else {
167        // Parse consecutive LaTeX command lines
168        parse_tex_command_lines(builder, lines, start_pos, blockquote_depth)
169    };
170
171    builder.finish_node(); // TEX_BLOCK
172
173    log::trace!("Finished raw TeX block, consumed {} lines", lines_consumed);
174    lines_consumed
175}
176
177/// Parse consecutive LaTeX command lines.
178fn parse_tex_command_lines(
179    builder: &mut GreenNodeBuilder<'static>,
180    lines: &[&str],
181    start_pos: usize,
182    blockquote_depth: usize,
183) -> usize {
184    let mut lines_consumed = 0;
185    let mut first_line = true;
186    let mut brace_depth: i32 = 0;
187    let mut started_braced_command = false;
188
189    for line in &lines[start_pos..] {
190        let inner =
191            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
192        if !first_line && brace_depth == 0 {
193            // Stop at blank lines
194            if inner.trim().is_empty() {
195                break;
196            }
197
198            // Stop if not a LaTeX command line
199            if !is_latex_command_line(inner) {
200                break;
201            }
202
203            // Inside blockquotes, consume one command line at a time so outer parsing
204            // can preserve each line's blockquote markers losslessly.
205            if blockquote_depth > 0 {
206                break;
207            }
208        }
209
210        log::trace!("  Raw block line: {:?}", inner);
211
212        if !first_line {
213            builder.token(SyntaxKind::NEWLINE.into(), "\n");
214        }
215        first_line = false;
216
217        // Emit the line content (strip newline)
218        let content = trim_end_newlines(inner);
219        builder.token(SyntaxKind::TEXT.into(), content);
220
221        lines_consumed += 1;
222        brace_depth += brace_delta(content);
223        if brace_depth < 0 {
224            brace_depth = 0;
225        }
226        if first_line && brace_depth > 0 {
227            started_braced_command = true;
228        }
229        if started_braced_command && brace_depth == 0 {
230            break;
231        }
232        first_line = false;
233    }
234
235    // Emit final newline if there were any lines
236    if lines_consumed > 0 && !lines[start_pos + lines_consumed - 1].trim_end().is_empty() {
237        builder.token(SyntaxKind::NEWLINE.into(), "\n");
238    }
239
240    lines_consumed
241}
242
/// Net brace balance of `text`: the number of unescaped `{` minus the
/// number of unescaped `}`.
///
/// A character is escaped when it is directly preceded by an odd-length run
/// of backslashes (`\{` is escaped; `\\{` is an escaped backslash followed
/// by a real brace).
fn brace_delta(text: &str) -> i32 {
    // `true` while an odd number of backslashes immediately precede the
    // current position.
    let mut escape_pending = false;
    let mut net = 0i32;

    // Backslash and both braces are ASCII, so scanning bytes gives the same
    // answer as scanning chars: bytes of multi-byte characters never match.
    for byte in text.bytes() {
        if byte == b'\\' {
            escape_pending = !escape_pending;
            continue;
        }

        // Any non-backslash byte ends the run; if the run was odd, this
        // byte is the escaped one and is ignored.
        if std::mem::take(&mut escape_pending) {
            continue;
        }

        match byte {
            b'{' => net += 1,
            b'}' => net -= 1,
            _ => {}
        }
    }

    net
}
269
270/// Parse a LaTeX environment from \begin{env} to \end{env}.
271fn parse_tex_environment_lines(
272    builder: &mut GreenNodeBuilder<'static>,
273    lines: &[&str],
274    start_pos: usize,
275    env_name: &str,
276    blockquote_depth: usize,
277) -> usize {
278    let mut lines_consumed = 0;
279    let mut first_line = true;
280    let end_marker = format!("\\end{{{}}}", env_name);
281
282    for line in &lines[start_pos..] {
283        let inner =
284            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
285        log::trace!("  Environment line: {:?}", inner);
286
287        if !first_line {
288            builder.token(SyntaxKind::NEWLINE.into(), "\n");
289        }
290        first_line = false;
291
292        // Emit the line content (strip newline)
293        let content = trim_end_newlines(inner);
294        builder.token(SyntaxKind::TEXT.into(), content);
295
296        lines_consumed += 1;
297
298        // Check if this line contains the end marker
299        if inner.trim_start().starts_with(&end_marker) {
300            break;
301        }
302    }
303
304    // Emit final newline
305    if lines_consumed > 0 {
306        builder.token(SyntaxKind::NEWLINE.into(), "\n");
307    }
308
309    lines_consumed
310}
311
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    /// Parse from line 0 with a fresh builder and report only how many
    /// lines were consumed. The builder is dropped without being finished.
    fn consumed_lines(lines: &[&str], blockquote_depth: usize) -> usize {
        let mut builder = GreenNodeBuilder::new();
        parse_raw_tex_block(&mut builder, lines, 0, blockquote_depth)
    }

    /// Parse from line 0 outside any blockquote and return the consumed
    /// line count together with the text of the resulting tree.
    fn parse_to_text(lines: &[&str]) -> (usize, String) {
        let mut builder = GreenNodeBuilder::new();
        let consumed = parse_raw_tex_block(&mut builder, lines, 0, 0);
        let root = SyntaxNode::new_root(builder.finish());
        (consumed, root.text().to_string())
    }

    #[test]
    fn test_is_latex_command_line() {
        let accepted = [
            "\\newcommand{foo}{bar}",
            "\\DeclareMathOperator{\\E}{E{}}",
            "  \\section{Title}",
            "\\usepackage{amsmath}",
        ];
        for line in accepted {
            assert!(is_latex_command_line(line));
        }

        let rejected = ["Regular text", "\\123 numbers", "\\  space", ""];
        for line in rejected {
            assert!(!is_latex_command_line(line));
        }
    }

    #[test]
    fn test_can_start_raw_block() {
        let enabled = ParserOptions::default();
        assert!(can_start_raw_block("\\newcommand{foo}{bar}", &enabled));
        assert!(!can_start_raw_block("Regular text", &enabled));

        let mut disabled = ParserOptions::default();
        disabled.extensions.raw_tex = false;
        assert!(!can_start_raw_block("\\newcommand{foo}{bar}", &disabled));
    }

    #[test]
    fn test_parse_single_command() {
        let (consumed, text) = parse_to_text(&["\\DeclareMathOperator{\\E}{E{}}\n"]);

        assert_eq!(consumed, 1);
        // The node's text should be the lossless input
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain command text: {}",
            text
        );
    }

    #[test]
    fn test_parse_multiple_commands() {
        let (consumed, text) = parse_to_text(&[
            "\\newcommand{\\foo}{bar}\n",
            "\\DeclareMathOperator{\\E}{E{}}\n",
        ]);

        assert_eq!(consumed, 2);
        assert!(
            text.contains("newcommand"),
            "Should contain newcommand: {}",
            text
        );
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain DeclareMathOperator: {}",
            text
        );
    }

    #[test]
    fn test_stops_at_blank_line() {
        let (consumed, text) =
            parse_to_text(&["\\newcommand{\\foo}{bar}\n", "\n", "Regular paragraph\n"]);

        assert_eq!(consumed, 1);
        assert!(text.contains("newcommand"));
        assert!(!text.contains("Regular paragraph"));
    }

    #[test]
    fn test_stops_at_non_latex() {
        let consumed = consumed_lines(&["\\newcommand{\\foo}{bar}\n", "Regular text\n"], 0);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_blockquote_line_does_not_loop() {
        // Depth 0 leaves the `>` marker in place, so the line is not TeX.
        let consumed = consumed_lines(&["> \\medskip\n"], 0);
        assert_eq!(consumed, 0);
    }

    #[test]
    fn test_blockquote_line_parses_tex_command() {
        let consumed = consumed_lines(&["> \\medskip\n"], 1);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_blockquote_multiple_tex_commands_consumes_one_line() {
        let consumed = consumed_lines(&["> \\medskip\n", "> \\hfill---Joe Armstrong\n"], 1);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_parse_braced_command_block_until_closing_brace() {
        let (consumed, text) = parse_to_text(&["\\pdfpcnote{\n", "  - blabla\n", "}\n"]);

        assert_eq!(consumed, 3);
        assert_eq!(text, "\\pdfpcnote{\n  - blabla\n}\n");
    }
}