Skip to main content

panache_parser/parser/blocks/
raw_blocks.rs

1//! Raw TeX block parsing (LaTeX commands and non-math environments)
2//!
3//! This module handles block-level raw TeX content:
4//! 1. LaTeX commands: `\DeclareMathOperator`, `\newcommand`, etc.
5//! 2. Non-math environments: `\begin{tabular}`, `\begin{figure}`, etc.
6//!
7//! Math environments (equation, align, etc.) are handled as INLINE content
8//! in paragraphs, not as blocks. See INLINE_MATH_ENVIRONMENTS list below.
9//!
10//! Per Pandoc behavior:
11//! - Consecutive LaTeX command lines are grouped into a single TEX_BLOCK
12//! - Non-math environments become TEX_BLOCK
13//! - Math environments are parsed inline (in paragraphs)
14//! - Blank lines or non-LaTeX content terminate the block
15//! - Only enabled when `raw_tex` extension is active
16
17use crate::config::Config;
18use crate::syntax::SyntaxKind;
19use rowan::GreenNodeBuilder;
20
/// Environment names that Pandoc's LaTeX reader handles as inline math
/// (they become RawInline content inside a Para). A `\begin{...}` naming
/// one of these must NOT open a block-level raw TeX node.
///
/// Source: pandoc/src/Text/Pandoc/Readers/LaTeX/Math.hs:L97-L123
const INLINE_MATH_ENVIRONMENTS: &[&str] = &[
    "displaymath",
    "math",
    "equation",
    "equation*",
    "gather",
    "gather*",
    "multline",
    "multline*",
    "eqnarray",
    "eqnarray*",
    "align",
    "align*",
    "alignat",
    "alignat*",
    "flalign",
    "flalign*",
    "dmath",
    "dmath*",
    "dgroup",
    "dgroup*",
    "darray",
    "darray*",
    "subequations",
];

/// Report whether `name` is one of the inline math environments.
///
/// The match is exact and case-sensitive: a starred variant is only
/// recognised when it is listed explicitly in the table above.
pub fn is_inline_math_environment(name: &str) -> bool {
    INLINE_MATH_ENVIRONMENTS
        .iter()
        .any(|candidate| *candidate == name)
}
55
/// Pull the environment name out of a line that opens with `\begin{name}`.
///
/// Leading whitespace is ignored. Yields `None` when the line does not
/// open with `\begin{`, when no closing `}` follows, or when the braces
/// are empty. Anything after the first `}` is disregarded.
pub fn extract_environment_name(line: &str) -> Option<String> {
    let after_begin = line.trim_start().strip_prefix("\\begin{")?;
    let (name, _rest) = after_begin.split_once('}')?;

    if name.is_empty() {
        None
    } else {
        Some(name.to_owned())
    }
}
75
76/// Check if content could start a raw TeX block.
77///
78/// Requirements:
79/// - `raw_tex` extension must be enabled
80/// - Line must start with backslash followed by a letter
81/// - If it's a `\begin{env}`, the environment must NOT be an inline math env
82pub fn can_start_raw_block(content: &str, config: &Config) -> bool {
83    // Must have raw_tex extension enabled
84    if !config.extensions.raw_tex {
85        return false;
86    }
87
88    // Check if it's a \begin{env} line
89    if let Some(env_name) = extract_environment_name(content) {
90        // Skip inline math environments - they should be parsed inline in paragraphs
91        if is_inline_math_environment(&env_name) {
92            return false;
93        }
94        // Non-math environment: parse as block
95        return true;
96    }
97
98    // Check if we're at the start of a line with a LaTeX command
99    is_latex_command_line(content)
100}
101
/// Report whether a line opens with a LaTeX command: optional leading
/// whitespace, a backslash, then an ASCII letter.
///
/// Because a letter is required, the display-math delimiters `\[` and
/// `\]`, a lone backslash, and `\` followed by a digit or space are all
/// rejected.
fn is_latex_command_line(line: &str) -> bool {
    // A backslash is a single byte and every ASCII letter is a single
    // byte, so inspecting the first two bytes is equivalent to inspecting
    // the first two characters here.
    matches!(
        line.trim_start().as_bytes(),
        [b'\\', next, ..] if next.is_ascii_alphabetic()
    )
}
124
125/// Parse a raw TeX block from lines array.
126///
127/// Collects one or more consecutive lines of LaTeX commands into a single
128/// TEX_BLOCK node, stopping at blank lines or non-LaTeX content.
129///
130/// Returns the number of lines consumed.
131pub fn parse_raw_tex_block(
132    builder: &mut GreenNodeBuilder<'static>,
133    lines: &[&str],
134    start_pos: usize,
135    blockquote_depth: usize,
136) -> usize {
137    log::debug!("Starting raw TeX block at line {}", start_pos);
138
139    builder.start_node(SyntaxKind::TEX_BLOCK.into());
140
141    let first_line = lines[start_pos];
142    let first_line_inner = crate::parser::blocks::blockquotes::strip_n_blockquote_markers(
143        first_line,
144        blockquote_depth,
145    );
146    if !is_latex_command_line(first_line_inner)
147        && extract_environment_name(first_line_inner).is_none()
148    {
149        builder.finish_node();
150        log::debug!("Finished raw TeX block, consumed 0 lines");
151        return 0;
152    }
153
154    // Check if this is an environment
155    let lines_consumed = if let Some(env_name) = extract_environment_name(first_line_inner) {
156        // Parse environment: \begin{env}...content...\end{env}
157        parse_tex_environment_lines(builder, lines, start_pos, &env_name, blockquote_depth)
158    } else {
159        // Parse consecutive LaTeX command lines
160        parse_tex_command_lines(builder, lines, start_pos, blockquote_depth)
161    };
162
163    builder.finish_node(); // TEX_BLOCK
164
165    log::debug!("Finished raw TeX block, consumed {} lines", lines_consumed);
166    lines_consumed
167}
168
169/// Parse consecutive LaTeX command lines.
170fn parse_tex_command_lines(
171    builder: &mut GreenNodeBuilder<'static>,
172    lines: &[&str],
173    start_pos: usize,
174    blockquote_depth: usize,
175) -> usize {
176    let mut lines_consumed = 0;
177    let mut first_line = true;
178    let mut brace_depth: i32 = 0;
179    let mut started_braced_command = false;
180
181    for line in &lines[start_pos..] {
182        let inner =
183            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
184        if !first_line && brace_depth == 0 {
185            // Stop at blank lines
186            if inner.trim().is_empty() {
187                break;
188            }
189
190            // Stop if not a LaTeX command line
191            if !is_latex_command_line(inner) {
192                break;
193            }
194
195            // Inside blockquotes, consume one command line at a time so outer parsing
196            // can preserve each line's blockquote markers losslessly.
197            if blockquote_depth > 0 {
198                break;
199            }
200        }
201
202        log::trace!("  Raw block line: {:?}", inner);
203
204        if !first_line {
205            builder.token(SyntaxKind::NEWLINE.into(), "\n");
206        }
207        first_line = false;
208
209        // Emit the line content (strip newline)
210        let content = inner.trim_end_matches(&['\r', '\n'][..]);
211        builder.token(SyntaxKind::TEXT.into(), content);
212
213        lines_consumed += 1;
214        brace_depth += brace_delta(content);
215        if brace_depth < 0 {
216            brace_depth = 0;
217        }
218        if first_line && brace_depth > 0 {
219            started_braced_command = true;
220        }
221        if started_braced_command && brace_depth == 0 {
222            break;
223        }
224        first_line = false;
225    }
226
227    // Emit final newline if there were any lines
228    if lines_consumed > 0 && !lines[start_pos + lines_consumed - 1].trim_end().is_empty() {
229        builder.token(SyntaxKind::NEWLINE.into(), "\n");
230    }
231
232    lines_consumed
233}
234
/// Net brace balance of `text`: +1 for every unescaped `{`, -1 for every
/// unescaped `}`.
///
/// A character is treated as escaped when it is directly preceded by an
/// odd number of backslashes, so `\{` contributes nothing while `\\{`
/// counts as an opening brace.
fn brace_delta(text: &str) -> i32 {
    let mut balance = 0i32;
    // True while the backslashes seen immediately before the current
    // character form an odd-length run.
    let mut odd_backslash_run = false;

    for ch in text.chars() {
        if ch == '\\' {
            odd_backslash_run = !odd_backslash_run;
            continue;
        }

        // Any non-backslash character ends the run; read the parity and
        // reset it in one step.
        let escaped = std::mem::replace(&mut odd_backslash_run, false);
        if escaped {
            continue;
        }

        balance += match ch {
            '{' => 1,
            '}' => -1,
            _ => 0,
        };
    }

    balance
}
261
262/// Parse a LaTeX environment from \begin{env} to \end{env}.
263fn parse_tex_environment_lines(
264    builder: &mut GreenNodeBuilder<'static>,
265    lines: &[&str],
266    start_pos: usize,
267    env_name: &str,
268    blockquote_depth: usize,
269) -> usize {
270    let mut lines_consumed = 0;
271    let mut first_line = true;
272    let end_marker = format!("\\end{{{}}}", env_name);
273
274    for line in &lines[start_pos..] {
275        let inner =
276            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
277        log::trace!("  Environment line: {:?}", inner);
278
279        if !first_line {
280            builder.token(SyntaxKind::NEWLINE.into(), "\n");
281        }
282        first_line = false;
283
284        // Emit the line content (strip newline)
285        let content = inner.trim_end_matches(&['\r', '\n'][..]);
286        builder.token(SyntaxKind::TEXT.into(), content);
287
288        lines_consumed += 1;
289
290        // Check if this line contains the end marker
291        if inner.trim_start().starts_with(&end_marker) {
292            break;
293        }
294    }
295
296    // Emit final newline
297    if lines_consumed > 0 {
298        builder.token(SyntaxKind::NEWLINE.into(), "\n");
299    }
300
301    lines_consumed
302}
303
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;
    use crate::syntax::SyntaxNode;

    /// Run `parse_raw_tex_block` from line 0 on a fresh builder and hand
    /// back both the consumed-line count and the builder.
    fn run(lines: &[&str], blockquote_depth: usize) -> (usize, GreenNodeBuilder<'static>) {
        let mut builder = GreenNodeBuilder::new();
        let consumed = parse_raw_tex_block(&mut builder, lines, 0, blockquote_depth);
        (consumed, builder)
    }

    /// Finish the builder and return the text of the resulting tree.
    fn tree_text(builder: GreenNodeBuilder<'static>) -> String {
        SyntaxNode::new_root(builder.finish()).text().to_string()
    }

    #[test]
    fn test_is_latex_command_line() {
        let accepted = [
            "\\newcommand{foo}{bar}",
            "\\DeclareMathOperator{\\E}{E{}}",
            "  \\section{Title}",
            "\\usepackage{amsmath}",
        ];
        for line in accepted {
            assert!(is_latex_command_line(line));
        }

        let rejected = ["Regular text", "\\123 numbers", "\\  space", ""];
        for line in rejected {
            assert!(!is_latex_command_line(line));
        }
    }

    #[test]
    fn test_can_start_raw_block() {
        let enabled = Config::default();
        assert!(can_start_raw_block("\\newcommand{foo}{bar}", &enabled));
        assert!(!can_start_raw_block("Regular text", &enabled));

        let mut disabled = Config::default();
        disabled.extensions.raw_tex = false;
        assert!(!can_start_raw_block("\\newcommand{foo}{bar}", &disabled));
    }

    #[test]
    fn test_parse_single_command() {
        let (consumed, builder) = run(&["\\DeclareMathOperator{\\E}{E{}}\n"], 0);
        assert_eq!(consumed, 1);

        // The node's text should be the lossless input
        let text = tree_text(builder);
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain command text: {}",
            text
        );
    }

    #[test]
    fn test_parse_multiple_commands() {
        let input = [
            "\\newcommand{\\foo}{bar}\n",
            "\\DeclareMathOperator{\\E}{E{}}\n",
        ];
        let (consumed, builder) = run(&input, 0);
        assert_eq!(consumed, 2);

        let text = tree_text(builder);
        assert!(
            text.contains("newcommand"),
            "Should contain newcommand: {}",
            text
        );
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain DeclareMathOperator: {}",
            text
        );
    }

    #[test]
    fn test_stops_at_blank_line() {
        let input = ["\\newcommand{\\foo}{bar}\n", "\n", "Regular paragraph\n"];
        let (consumed, builder) = run(&input, 0);
        assert_eq!(consumed, 1);

        let text = tree_text(builder);
        assert!(text.contains("newcommand"));
        assert!(!text.contains("Regular paragraph"));
    }

    #[test]
    fn test_stops_at_non_latex() {
        let (consumed, _builder) = run(&["\\newcommand{\\foo}{bar}\n", "Regular text\n"], 0);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_blockquote_line_does_not_loop() {
        // Depth 0 leaves the `> ` marker in place, so the line is not TeX.
        let (consumed, _builder) = run(&["> \\medskip\n"], 0);
        assert_eq!(consumed, 0);
    }

    #[test]
    fn test_blockquote_line_parses_tex_command() {
        let (consumed, _builder) = run(&["> \\medskip\n"], 1);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_blockquote_multiple_tex_commands_consumes_one_line() {
        let input = ["> \\medskip\n", "> \\hfill---Joe Armstrong\n"];
        let (consumed, _builder) = run(&input, 1);
        assert_eq!(consumed, 1);
    }

    #[test]
    fn test_parse_braced_command_block_until_closing_brace() {
        let (consumed, builder) = run(&["\\pdfpcnote{\n", "  - blabla\n", "}\n"], 0);
        assert_eq!(consumed, 3);

        assert_eq!(tree_text(builder), "\\pdfpcnote{\n  - blabla\n}\n");
    }
}