panache-parser 0.4.2

Lossless CST parser and syntax wrappers for Pandoc markdown, Quarto, and RMarkdown
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
//! Raw TeX block parsing (LaTeX commands and non-math environments)
//!
//! This module handles block-level raw TeX content:
//! 1. LaTeX commands: `\DeclareMathOperator`, `\newcommand`, etc.
//! 2. Non-math environments: `\begin{tabular}`, `\begin{figure}`, etc.
//!
//! Math environments (equation, align, etc.) are handled as INLINE content
//! in paragraphs, not as blocks. See INLINE_MATH_ENVIRONMENTS list below.
//!
//! Per Pandoc behavior:
//! - Consecutive LaTeX command lines are grouped into a single TEX_BLOCK
//! - Non-math environments become TEX_BLOCK
//! - Math environments are parsed inline (in paragraphs)
//! - Blank lines or non-LaTeX content terminate the block
//! - Only enabled when `raw_tex` extension is active

use crate::options::ParserOptions;
use crate::syntax::SyntaxKind;
use rowan::GreenNodeBuilder;

/// Inline math environments from Pandoc (parsed as RawInline in Para).
/// These should NOT be parsed as block-level environments.
///
/// Each entry is a complete environment name as it appears between the braces
/// of `\begin{...}`; starred variants (e.g. `align*`) are listed as their own
/// entries next to the unstarred form.
///
/// Source: pandoc/src/Text/Pandoc/Readers/LaTeX/Math.hs:L97-L123
const INLINE_MATH_ENVIRONMENTS: &[&str] = &[
    "displaymath",
    "math",
    "equation",
    "equation*",
    "gather",
    "gather*",
    "multline",
    "multline*",
    "eqnarray",
    "eqnarray*",
    "align",
    "align*",
    "alignat",
    "alignat*",
    "flalign",
    "flalign*",
    "dmath",
    "dmath*",
    "dgroup",
    "dgroup*",
    "darray",
    "darray*",
    "subequations",
];

/// Report whether `name` is one of the math environments that are parsed
/// inline (inside paragraphs) rather than as a raw TeX block.
///
/// The comparison is an exact, case-sensitive match against
/// `INLINE_MATH_ENVIRONMENTS`, so `align*` and `align` are distinct lookups.
pub fn is_inline_math_environment(name: &str) -> bool {
    INLINE_MATH_ENVIRONMENTS
        .iter()
        .any(|&candidate| candidate == name)
}

/// Pull the environment name out of a line that opens with `\begin{name}`.
///
/// Leading whitespace is ignored. The name is everything between `\begin{`
/// and the first `}` that follows it.
///
/// Returns `None` when the line does not start with `\begin{`, when no
/// closing brace is found, or when the name between the braces is empty.
pub fn extract_environment_name(line: &str) -> Option<String> {
    let after_prefix = line.trim_start().strip_prefix("\\begin{")?;
    let (name, _remainder) = after_prefix.split_once('}')?;

    if name.is_empty() {
        None
    } else {
        Some(name.to_owned())
    }
}

/// Decide whether `content` may open a raw TeX block.
///
/// The answer is `true` only when all of the following hold:
/// - the `raw_tex` extension is enabled in `config`;
/// - the line is either a `\begin{env}` opener whose environment is NOT an
///   inline math environment, or (ignoring leading whitespace) starts with a
///   backslash followed by an ASCII letter.
///
/// A `\begin{env}` line for an inline math environment always yields `false`,
/// because those environments are parsed inline within paragraphs.
pub fn can_start_raw_block(content: &str, config: &ParserOptions) -> bool {
    // Nothing qualifies unless the raw_tex extension is switched on.
    if !config.extensions.raw_tex {
        return false;
    }

    match extract_environment_name(content) {
        // Environment opener: block-level only for non-math environments.
        Some(env_name) => !is_inline_math_environment(&env_name),
        // Otherwise it has to look like a plain LaTeX command line.
        None => is_latex_command_line(content),
    }
}

/// Report whether a line begins with a LaTeX command.
///
/// After skipping leading whitespace, the line must start with a backslash
/// that is immediately followed by an ASCII letter. The display-math
/// delimiters `\[` and `\]` are rejected explicitly.
fn is_latex_command_line(line: &str) -> bool {
    let first_after_backslash = line
        .trim_start()
        .strip_prefix('\\')
        .and_then(|rest| rest.chars().next());

    match first_after_backslash {
        // `\[` / `\]` delimit display math; they are not commands.
        Some('[') | Some(']') => false,
        Some(ch) => ch.is_ascii_alphabetic(),
        // No backslash at all, or nothing after it.
        None => false,
    }
}

/// Parse a raw TeX block starting at `lines[start_pos]`.
///
/// A `TEX_BLOCK` node is opened on `builder`, then the first line (with
/// `blockquote_depth` blockquote markers stripped) decides how to proceed:
/// - a `\begin{env}` opener is parsed as an environment up to its end marker;
/// - any other LaTeX command line is parsed as a run of command lines;
/// - anything else closes the node immediately and consumes nothing.
///
/// Returns the number of lines consumed. A return value of `0` means the
/// first line was not raw TeX; note that an empty `TEX_BLOCK` node has still
/// been emitted on `builder` in that case.
///
/// # Panics
///
/// Panics if `start_pos` is out of bounds for `lines`.
pub fn parse_raw_tex_block(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    blockquote_depth: usize,
) -> usize {
    log::debug!("Starting raw TeX block at line {}", start_pos);

    builder.start_node(SyntaxKind::TEX_BLOCK.into());

    let opening = crate::parser::blocks::blockquotes::strip_n_blockquote_markers(
        lines[start_pos],
        blockquote_depth,
    );
    let environment = extract_environment_name(opening);

    // Neither an environment opener nor a command: bail out without consuming.
    if environment.is_none() && !is_latex_command_line(opening) {
        builder.finish_node();
        log::debug!("Finished raw TeX block, consumed 0 lines");
        return 0;
    }

    let lines_consumed = match environment {
        // \begin{env} ... \end{env}
        Some(env_name) => {
            parse_tex_environment_lines(builder, lines, start_pos, &env_name, blockquote_depth)
        }
        // Run of consecutive LaTeX command lines
        None => parse_tex_command_lines(builder, lines, start_pos, blockquote_depth),
    };

    builder.finish_node(); // TEX_BLOCK

    log::debug!("Finished raw TeX block, consumed {} lines", lines_consumed);
    lines_consumed
}

/// Parse consecutive LaTeX command lines.
///
/// Starting at `lines[start_pos]`, emits each consumed line as a `TEXT` token
/// (line ending stripped) separated by `NEWLINE` tokens. The first line is
/// always consumed; the caller is expected to have verified it.
///
/// While all braces are balanced, the run stops before:
/// - a blank line,
/// - a line that is not a LaTeX command line,
/// - any further line when `blockquote_depth > 0` (one command line at a
///   time, so the outer parser can keep each line's blockquote markers).
///
/// While a brace opened on an earlier line is still unclosed, following lines
/// are consumed unconditionally. If the *first* line leaves a brace open
/// (e.g. `\pdfpcnote{`), the block ends on the line that closes it.
///
/// Returns the number of lines consumed.
fn parse_tex_command_lines(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    blockquote_depth: usize,
) -> usize {
    let mut lines_consumed = 0;
    let mut first_line = true;
    let mut brace_depth: i32 = 0;
    let mut started_braced_command = false;

    for line in &lines[start_pos..] {
        let inner =
            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
        if !first_line && brace_depth == 0 {
            // Stop at blank lines
            if inner.trim().is_empty() {
                break;
            }

            // Stop if not a LaTeX command line
            if !is_latex_command_line(inner) {
                break;
            }

            // Inside blockquotes, consume one command line at a time so outer parsing
            // can preserve each line's blockquote markers losslessly.
            if blockquote_depth > 0 {
                break;
            }
        }

        log::trace!("  Raw block line: {:?}", inner);

        if !first_line {
            builder.token(SyntaxKind::NEWLINE.into(), "\n");
        }

        // Emit the line content (strip newline)
        let content = inner.trim_end_matches(&['\r', '\n'][..]);
        builder.token(SyntaxKind::TEXT.into(), content);

        lines_consumed += 1;

        // Track unclosed braces; stray closers never push the depth below zero.
        brace_depth = (brace_depth + brace_delta(content)).max(0);

        // `first_line` must still be true here for the opening line. Resetting
        // it earlier made this check unreachable, so a multi-line braced
        // command never terminated the block at its closing brace.
        if first_line && brace_depth > 0 {
            started_braced_command = true;
        }
        first_line = false;

        // A multi-line braced command ends the block where its brace closes.
        if started_braced_command && brace_depth == 0 {
            break;
        }
    }

    // Emit final newline if there were any lines
    if lines_consumed > 0 && !lines[start_pos + lines_consumed - 1].trim_end().is_empty() {
        builder.token(SyntaxKind::NEWLINE.into(), "\n");
    }

    lines_consumed
}

/// Compute the net change in brace nesting contributed by `text`.
///
/// Each unescaped `{` counts +1 and each unescaped `}` counts -1. A character
/// is treated as escaped when it is preceded by an odd number of consecutive
/// backslashes, so `\{` is ignored while `\\{` still counts.
fn brace_delta(text: &str) -> i32 {
    // True while an odd number of backslashes immediately precedes the cursor.
    let mut escape_pending = false;
    let mut balance = 0i32;

    for ch in text.chars() {
        match ch {
            '\\' => escape_pending = !escape_pending,
            // The escape consumes exactly one following character.
            _ if escape_pending => escape_pending = false,
            '{' => balance += 1,
            '}' => balance -= 1,
            _ => {}
        }
    }

    balance
}

/// Parse a LaTeX environment from \begin{env} to \end{env}.
///
/// Starting at `lines[start_pos]`, every line (with `blockquote_depth`
/// blockquote markers stripped) is emitted as a `TEXT` token with its line
/// ending removed, separated by `NEWLINE` tokens. Consumption stops after the
/// first line that contains `\end{env_name}` anywhere on it, so a one-line
/// environment such as `\begin{center}x\end{center}` ends on its own line.
/// If no end marker is found, all remaining lines are consumed.
///
/// Nested environments with the same name are not tracked: the first matching
/// end marker closes the block.
///
/// Returns the number of lines consumed.
fn parse_tex_environment_lines(
    builder: &mut GreenNodeBuilder<'static>,
    lines: &[&str],
    start_pos: usize,
    env_name: &str,
    blockquote_depth: usize,
) -> usize {
    let mut lines_consumed = 0;
    let mut first_line = true;
    let end_marker = format!("\\end{{{}}}", env_name);

    for line in &lines[start_pos..] {
        let inner =
            crate::parser::blocks::blockquotes::strip_n_blockquote_markers(line, blockquote_depth);
        log::trace!("  Environment line: {:?}", inner);

        if !first_line {
            builder.token(SyntaxKind::NEWLINE.into(), "\n");
        }
        first_line = false;

        // Emit the line content (strip newline)
        let content = inner.trim_end_matches(&['\r', '\n'][..]);
        builder.token(SyntaxKind::TEXT.into(), content);

        lines_consumed += 1;

        // Check if this line contains the end marker. A prefix-only match
        // would miss `\end{env}` that follows other content on the same line
        // (including the opening line itself) and run on to the end of input.
        if inner.contains(&end_marker) {
            break;
        }
    }

    // Emit final newline
    if lines_consumed > 0 {
        builder.token(SyntaxKind::NEWLINE.into(), "\n");
    }

    lines_consumed
}

// Unit tests for raw TeX block detection and parsing.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Command detection: backslash + ASCII letter (leading whitespace allowed);
    // digits, spaces, plain text and empty input are rejected.
    #[test]
    fn test_is_latex_command_line() {
        assert!(is_latex_command_line("\\newcommand{foo}{bar}"));
        assert!(is_latex_command_line("\\DeclareMathOperator{\\E}{E{}}"));
        assert!(is_latex_command_line("  \\section{Title}"));
        assert!(is_latex_command_line("\\usepackage{amsmath}"));

        assert!(!is_latex_command_line("Regular text"));
        assert!(!is_latex_command_line("\\123 numbers"));
        assert!(!is_latex_command_line("\\  space"));
        assert!(!is_latex_command_line(""));
    }

    // Block start detection under the default options, and with `raw_tex`
    // explicitly disabled.
    #[test]
    fn test_can_start_raw_block() {
        let config = ParserOptions::default();
        assert!(can_start_raw_block("\\newcommand{foo}{bar}", &config));
        assert!(!can_start_raw_block("Regular text", &config));

        let mut config_disabled = ParserOptions::default();
        config_disabled.extensions.raw_tex = false;
        assert!(!can_start_raw_block(
            "\\newcommand{foo}{bar}",
            &config_disabled
        ));
    }

    // A single command line is consumed and its text ends up in the node.
    #[test]
    fn test_parse_single_command() {
        let lines = vec!["\\DeclareMathOperator{\\E}{E{}}\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 1);

        let green = builder.finish();
        let node = SyntaxNode::new_root(green);
        // The node's text should be the lossless input
        let text = node.text().to_string();
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain command text: {}",
            text
        );
    }

    // Two consecutive command lines are grouped into one block.
    #[test]
    fn test_parse_multiple_commands() {
        let lines = vec![
            "\\newcommand{\\foo}{bar}\n",
            "\\DeclareMathOperator{\\E}{E{}}\n",
        ];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 2);

        let green = builder.finish();
        let node = SyntaxNode::new_root(green);
        let text = node.text().to_string();
        assert!(
            text.contains("newcommand"),
            "Should contain newcommand: {}",
            text
        );
        assert!(
            text.contains("DeclareMathOperator"),
            "Should contain DeclareMathOperator: {}",
            text
        );
    }

    // A blank line terminates the block; later content is not absorbed.
    #[test]
    fn test_stops_at_blank_line() {
        let lines = vec!["\\newcommand{\\foo}{bar}\n", "\n", "Regular paragraph\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 1);

        let green = builder.finish();
        let node = SyntaxNode::new_root(green);
        let text = node.text().to_string();
        assert!(text.contains("newcommand"));
        assert!(!text.contains("Regular paragraph"));
    }

    // A non-command line directly after a command terminates the block.
    #[test]
    fn test_stops_at_non_latex() {
        let lines = vec!["\\newcommand{\\foo}{bar}\n", "Regular text\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 1);
    }

    // With blockquote depth 0 the `> ` prefixed line is not recognised as a
    // TeX command, so zero lines are reported as consumed.
    #[test]
    fn test_blockquote_line_does_not_loop() {
        let lines = vec!["> \\medskip\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 0);
    }

    // With blockquote depth 1 the same line is parsed as a TeX command.
    #[test]
    fn test_blockquote_line_parses_tex_command() {
        let lines = vec!["> \\medskip\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 1);
        assert_eq!(consumed, 1);
    }

    // Inside a blockquote only one command line is consumed per call, even
    // when the next line is also a command.
    #[test]
    fn test_blockquote_multiple_tex_commands_consumes_one_line() {
        let lines = vec!["> \\medskip\n", "> \\hfill---Joe Armstrong\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 1);
        assert_eq!(consumed, 1);
    }

    // A command that opens a brace on its first line keeps consuming lines
    // (even non-command ones) until the brace is closed; the resulting node
    // text reproduces the input exactly.
    #[test]
    fn test_parse_braced_command_block_until_closing_brace() {
        let lines = vec!["\\pdfpcnote{\n", "  - blabla\n", "}\n"];
        let mut builder = GreenNodeBuilder::new();

        let consumed = parse_raw_tex_block(&mut builder, &lines, 0, 0);
        assert_eq!(consumed, 3);

        let green = builder.finish();
        let node = SyntaxNode::new_root(green);
        assert_eq!(node.text().to_string(), "\\pdfpcnote{\n  - blabla\n}\n");
    }
}