panache_parser/parser/inlines/
core.rs

1//! Recursive emphasis parsing using Pandoc's algorithm.
2//!
3//! This module implements emphasis/strong emphasis parsing using a recursive
4//! descent approach based on Pandoc's Haskell implementation in
5//! `Readers/Markdown.hs:L1662-L1722`.
6//!
7//! **Key algorithm**: Left-to-right, greedy, first-match wins
8//! 1. Parse text left-to-right
9//! 2. When we see delimiters, try to parse emphasis (look for matching closer)
10//! 3. If successful, emit emphasis node and continue from after closer
11//! 4. If failed (no closer found), emit delimiter as literal and continue
12//! 5. Nested emphasis is handled naturally by recursive parsing of content
13//!
14//! **Example**: `*foo **bar* baz**`
15//! - See `*`, try to parse EMPH
16//! - Parse content: see `**`, try to parse STRONG
17//! - STRONG finds closer `**` at end → succeeds, emits STRONG[bar* baz]
18//! - Outer `*` can't find closer (all delimiters consumed) → fails, emits `*foo` as literal
19//! - Result: `*foo` + STRONG[bar* baz]
20//!
21//! This matches Pandoc's behavior exactly.
22
23use crate::options::ParserOptions;
24use crate::syntax::SyntaxKind;
25use rowan::GreenNodeBuilder;
26
27// Import inline element parsers from sibling modules
28use super::bookdown::{
29    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
30};
31use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
32use super::citations::{
33    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
34    try_parse_bracketed_citation,
35};
36use super::code_spans::{emit_code_span, try_parse_code_span};
37use super::emoji::{emit_emoji, try_parse_emoji};
38use super::escapes::{EscapeType, emit_escape, try_parse_escape};
39use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
40use super::inline_footnotes::{
41    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
42    try_parse_inline_footnote,
43};
44use super::latex::{parse_latex_command, try_parse_latex_command};
45use super::links::{
46    emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link, emit_reference_image,
47    emit_reference_link, try_parse_autolink, try_parse_bare_uri, try_parse_inline_image,
48    try_parse_inline_link, try_parse_reference_image, try_parse_reference_link,
49};
50use super::math::{
51    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
52    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
53    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
54    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
55    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
56    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
57};
58use super::native_spans::{emit_native_span, try_parse_native_span};
59use super::raw_inline::is_raw_inline;
60use super::shortcodes::{emit_shortcode, try_parse_shortcode};
61use super::strikeout::{emit_strikeout, try_parse_strikeout};
62use super::subscript::{emit_subscript, try_parse_subscript};
63use super::superscript::{emit_superscript, try_parse_superscript};
64
65/// Parse inline text using the recursive emphasis algorithm.
66///
67/// This is the main entry point for parsing inline content with Pandoc-style
68/// recursive emphasis handling. It uses a greedy left-to-right, first-match-wins
69/// approach that matches Pandoc's behavior exactly.
70///
71/// **Algorithm**:
72/// 1. Parse text left-to-right trying each inline element type in precedence order
73/// 2. When we see `*` or `_`, try to parse emphasis recursively
74/// 3. Nested emphasis naturally consumes delimiters before outer matches
75/// 4. All inline elements (code, links, math, etc.) are parsed on-the-fly
76///
77/// # Arguments
78/// * `text` - The inline text to parse
79/// * `config` - Configuration for extensions and formatting
80/// * `builder` - The CST builder to emit nodes to
81pub fn parse_inline_text_recursive(
82    builder: &mut GreenNodeBuilder,
83    text: &str,
84    config: &ParserOptions,
85) {
86    log::debug!(
87        "Recursive inline parsing: {:?} ({} bytes)",
88        &text[..text.len().min(40)],
89        text.len()
90    );
91
92    parse_inline_range(text, 0, text.len(), config, builder);
93
94    log::debug!("Recursive inline parsing complete");
95}
96
97/// Parse inline elements from text content.
98/// This is a standalone function used for recursive inline parsing within blocks.
99///
100/// The `allow_reference_links` parameter is accepted for compatibility but not currently used.
101/// Set to `false` in nested contexts (inside link text, image alt, spans) to prevent recursive parsing.
102pub fn parse_inline_text(
103    builder: &mut GreenNodeBuilder,
104    text: &str,
105    config: &ParserOptions,
106    _allow_reference_links: bool,
107) {
108    log::trace!(
109        "Parsing inline text (recursive): {:?} ({} bytes)",
110        &text[..text.len().min(40)],
111        text.len()
112    );
113
114    // Use recursive parsing with Pandoc's algorithm for emphasis
115    parse_inline_text_recursive(builder, text, config);
116}
117
118/// Try to parse emphasis starting at the given position.
119///
120/// This is the entry point for recursive emphasis parsing, equivalent to
121/// Pandoc's `enclosure` function.
122///
123/// Returns Some((bytes_consumed, delim_count)) if emphasis was successfully parsed,
124/// or None if the delimiter should be treated as literal text.
125/// When returning None, the delim_count tells the caller how many delimiter
126/// characters to skip (to avoid re-parsing parts of a failed delimiter run).
127///
128/// # Arguments
129/// * `text` - The full text being parsed
130/// * `pos` - Current position in text (where the delimiter starts)
131/// * `end` - End boundary (don't search for closers beyond this)
132/// * `config` - Configuration
133/// * `builder` - CST builder
134///
135/// **Algorithm**:
136/// 1. Count opening delimiters
137/// 2. Check if followed by whitespace (if so, return None)
138/// 3. Dispatch to parse_one/two/three based on count
139/// 4. Those functions parse content and look for matching closer (within bounds)
140/// 5. If closer found, emit node and return bytes consumed
141/// 6. If not found, return None with delimiter count (caller skips entire run)
142pub fn try_parse_emphasis(
143    text: &str,
144    pos: usize,
145    end: usize,
146    config: &ParserOptions,
147    builder: &mut GreenNodeBuilder,
148) -> Option<(usize, usize)> {
149    let bytes = text.as_bytes();
150
151    if pos >= bytes.len() {
152        return None;
153    }
154
155    let delim_char = bytes[pos] as char;
156    if delim_char != '*' && delim_char != '_' {
157        return None;
158    }
159
160    // Count consecutive delimiters
161    let mut count = 0;
162    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
163        count += 1;
164    }
165
166    let after_pos = pos + count;
167
168    log::debug!(
169        "try_parse_emphasis: '{}' x {} at pos {}",
170        delim_char,
171        count,
172        pos
173    );
174
175    // Check if followed by whitespace (Pandoc rule: treat as literal)
176    if after_pos < text.len()
177        && let Some(next_char) = text[after_pos..].chars().next()
178        && next_char.is_whitespace()
179    {
180        log::trace!("Delimiter followed by whitespace, treating as literal");
181        return None;
182    }
183
184    // For underscores: check intraword_underscores extension (Pandoc lines 1668-1672)
185    // Can't open if preceded by alphanumeric (prevents foo_bar from parsing)
186    if delim_char == '_'
187        && pos > 0
188        && let Some(prev_char) = text[..pos].chars().last()
189        && prev_char.is_alphanumeric()
190    {
191        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
192        return None;
193    }
194
195    // Dispatch based on delimiter count
196    let result = match count {
197        1 => try_parse_one(text, pos, delim_char, end, config, builder),
198        2 => try_parse_two(text, pos, delim_char, end, config, builder),
199        3 => try_parse_three(text, pos, delim_char, end, config, builder),
200        _ => {
201            // 4+ delimiters: treat as literal (Pandoc behavior)
202            log::trace!("{} delimiters (4+), treating as literal", count);
203            None
204        }
205    };
206
207    // If parsing succeeded, return (bytes_consumed, delim_count)
208    // If failed, return None but the caller will know to skip `count` delimiters
209    result.map(|consumed| (consumed, count))
210}
211
212/// Try to parse emphasis in a nested context (bypassing opener validity checks).
213///
214/// This mirrors Pandoc's behavior where `one` can call `two c mempty` directly,
215/// bypassing the `enclosure` opener validity checks. This is needed because
216/// patterns like `***foo **bar** baz***` require `**` followed by space to be
217/// parsed as a nested strong opener.
218///
219/// Returns Some((bytes_consumed, delim_count)) if successful, None otherwise.
220fn try_parse_emphasis_nested(
221    text: &str,
222    pos: usize,
223    end: usize,
224    config: &ParserOptions,
225    builder: &mut GreenNodeBuilder,
226) -> Option<(usize, usize)> {
227    let bytes = text.as_bytes();
228
229    if pos >= bytes.len() {
230        return None;
231    }
232
233    let delim_char = bytes[pos] as char;
234    if delim_char != '*' && delim_char != '_' {
235        return None;
236    }
237
238    // Count consecutive delimiters
239    let mut count = 0;
240    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
241        count += 1;
242    }
243
244    log::debug!(
245        "try_parse_emphasis_nested: '{}' x {} at pos {}",
246        delim_char,
247        count,
248        pos
249    );
250
251    // For underscores: still check intraword_underscores (prevents foo_bar parsing)
252    // This check applies even in nested contexts
253    if delim_char == '_'
254        && pos > 0
255        && let Some(prev_char) = text[..pos].chars().last()
256        && prev_char.is_alphanumeric()
257    {
258        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
259        return None;
260    }
261
262    // NOTE: We intentionally skip the "delimiter followed by whitespace" check here.
263    // In nested contexts (inside `one` calling `two`), Pandoc allows openers
264    // followed by whitespace because the opener has already been matched.
265
266    // Dispatch based on delimiter count
267    let result = match count {
268        1 => try_parse_one(text, pos, delim_char, end, config, builder),
269        2 => try_parse_two(text, pos, delim_char, end, config, builder),
270        3 => try_parse_three(text, pos, delim_char, end, config, builder),
271        _ => {
272            // 4+ delimiters: treat as literal (Pandoc behavior)
273            log::trace!("{} delimiters (4+), treating as literal", count);
274            None
275        }
276    };
277
278    result.map(|consumed| (consumed, count))
279}
280
281/// Try to parse emphasis with *** opening delimiter.
282///
283/// Tries to match closers in order: *** → ** → *
284/// Returns Some(bytes_consumed) if successful, None otherwise.
285fn try_parse_three(
286    text: &str,
287    pos: usize,
288    delim_char: char,
289    end: usize,
290    config: &ParserOptions,
291    builder: &mut GreenNodeBuilder,
292) -> Option<usize> {
293    let content_start = pos + 3;
294    let one = delim_char.to_string();
295    let two = one.repeat(2);
296
297    log::debug!("try_parse_three: '{}' x 3 at pos {}", delim_char, pos);
298
299    // Pandoc algorithm (line 1695): Parse content UNTIL we see a VALID ender
300    // We loop through potential enders, checking if each is valid.
301    // Invalid enders (like `**` preceded by whitespace) are skipped.
302    let mut search_pos = content_start;
303
304    loop {
305        // Find next potential ender
306        let closer_start = match find_first_potential_ender(text, search_pos, delim_char, end) {
307            Some(p) => p,
308            None => {
309                log::trace!("No potential ender found for ***");
310                return None;
311            }
312        };
313
314        log::debug!("Potential ender at pos {}", closer_start);
315
316        // Count how many delimiters we have at closer_start
317        let bytes = text.as_bytes();
318        let mut closer_count = 0;
319        let mut check_pos = closer_start;
320        while check_pos < bytes.len() && bytes[check_pos] == delim_char as u8 {
321            closer_count += 1;
322            check_pos += 1;
323        }
324
325        log::debug!(
326            "Found {} x {} at pos {}",
327            delim_char,
328            closer_count,
329            closer_start
330        );
331
332        // Try to match closers in order: ***, **, * (Pandoc lines 1696-1698)
333
334        // Try *** (line 1696)
335        if closer_count >= 3 && is_valid_ender(text, closer_start, delim_char, 3) {
336            log::debug!("Matched *** closer, emitting Strong[Emph[content]]");
337
338            builder.start_node(SyntaxKind::STRONG.into());
339            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
340
341            builder.start_node(SyntaxKind::EMPHASIS.into());
342            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
343            parse_inline_range_nested(text, content_start, closer_start, config, builder);
344            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
345            builder.finish_node(); // EMPHASIS
346
347            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
348            builder.finish_node(); // STRONG
349
350            return Some(closer_start + 3 - pos);
351        }
352
353        // Try ** (line 1697)
354        if closer_count >= 2 && is_valid_ender(text, closer_start, delim_char, 2) {
355            log::debug!("Matched ** closer, wrapping as Strong and continuing with one");
356
357            let continue_pos = closer_start + 2;
358
359            if let Some(final_closer_pos) =
360                parse_until_closer_with_nested_two(text, continue_pos, delim_char, 1, end, config)
361            {
362                log::debug!(
363                    "Found * closer at pos {}, emitting Emph[Strong[...], ...]",
364                    final_closer_pos
365                );
366
367                builder.start_node(SyntaxKind::EMPHASIS.into());
368                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
369
370                builder.start_node(SyntaxKind::STRONG.into());
371                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
372                parse_inline_range_nested(text, content_start, closer_start, config, builder);
373                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
374                builder.finish_node(); // STRONG
375
376                // Parse additional content between ** and * (up to but not including the closer)
377                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
378
379                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
380                builder.finish_node(); // EMPHASIS
381
382                return Some(final_closer_pos + 1 - pos);
383            }
384
385            // Fallback: emit * + STRONG
386            log::debug!("No * closer found after **, emitting * + STRONG");
387            builder.token(SyntaxKind::TEXT.into(), &one);
388
389            builder.start_node(SyntaxKind::STRONG.into());
390            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
391            parse_inline_range_nested(text, content_start, closer_start, config, builder);
392            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
393            builder.finish_node(); // STRONG
394
395            return Some(closer_start + 2 - pos);
396        }
397
398        // Try * (line 1698)
399        if closer_count >= 1 && is_valid_ender(text, closer_start, delim_char, 1) {
400            log::debug!("Matched * closer, wrapping as Emph and continuing with two");
401
402            let continue_pos = closer_start + 1;
403
404            if let Some(final_closer_pos) =
405                parse_until_closer_with_nested_one(text, continue_pos, delim_char, 2, end, config)
406            {
407                log::debug!(
408                    "Found ** closer at pos {}, emitting Strong[Emph[...], ...]",
409                    final_closer_pos
410                );
411
412                builder.start_node(SyntaxKind::STRONG.into());
413                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
414
415                builder.start_node(SyntaxKind::EMPHASIS.into());
416                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
417                parse_inline_range_nested(text, content_start, closer_start, config, builder);
418                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
419                builder.finish_node(); // EMPHASIS
420
421                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
422
423                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
424                builder.finish_node(); // STRONG
425
426                return Some(final_closer_pos + 2 - pos);
427            }
428
429            // Fallback: emit ** + EMPH
430            log::debug!("No ** closer found after *, emitting ** + EMPH");
431            builder.token(SyntaxKind::TEXT.into(), &two);
432
433            builder.start_node(SyntaxKind::EMPHASIS.into());
434            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
435            parse_inline_range_nested(text, content_start, closer_start, config, builder);
436            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
437            builder.finish_node(); // EMPHASIS
438
439            return Some(closer_start + 1 - pos);
440        }
441
442        // No valid ender at this position - continue searching after this delimiter run
443        log::debug!(
444            "No valid ender at pos {}, continuing search from {}",
445            closer_start,
446            closer_start + closer_count
447        );
448        search_pos = closer_start + closer_count;
449    }
450}
451
/// Locate the first unescaped occurrence of `delim_char` in
/// `text[start..min(end, text.len())]`.
///
/// A delimiter counts as escaped when it is immediately preceded by an odd
/// number of backslashes; backslashes located before `start` are counted too.
/// Returns the byte offset of the match, or `None` when the range holds no
/// unescaped delimiter (including when the range is empty).
///
/// This corresponds to Pandoc's `many (notFollowedBy (ender c 1) >> inline)`:
/// content is consumed until a character that could begin an ender.
fn find_first_potential_ender(
    text: &str,
    start: usize,
    delim_char: char,
    end: usize,
) -> Option<usize> {
    let bytes = text.as_bytes();
    let limit = end.min(bytes.len());
    let target = delim_char as u8;

    (start..limit).find(|&idx| {
        if bytes[idx] != target {
            return false;
        }
        // Count the backslashes that sit directly in front of this byte.
        let backslashes = bytes[..idx]
            .iter()
            .rev()
            .take_while(|&&b| b == b'\\')
            .count();
        backslashes % 2 == 0
    })
}
489
/// Decide whether the `delim_count` bytes starting at `pos` form a valid
/// emphasis ender (the counterpart of Pandoc's `ender c n`).
///
/// The run is valid when all of the following hold:
/// * `text` contains `delim_count` bytes equal to `delim_char` at `pos`;
/// * the byte before `pos` and the byte after the run are not `delim_char`,
///   so the run is exactly `delim_count` long;
/// * for `_` only: the run is not preceded by whitespace and not followed by
///   an alphanumeric character. `*` has no such flanking requirement.
fn is_valid_ender(text: &str, pos: usize, delim_char: char, delim_count: usize) -> bool {
    let bytes = text.as_bytes();
    let delim = delim_char as u8;
    let run_end = pos + delim_count;

    // The run must fit inside the text and consist solely of the delimiter.
    let Some(run) = bytes.get(pos..run_end) else {
        return false;
    };
    if !run.iter().all(|&b| b == delim) {
        return false;
    }

    // A neighbouring delimiter on either side means the run is longer.
    let delim_before = pos > 0 && bytes[pos - 1] == delim;
    let delim_after = bytes.get(run_end) == Some(&delim);
    if delim_before || delim_after {
        return false;
    }

    // Only underscores carry flanking requirements.
    if delim_char != '_' {
        return true;
    }

    let after_whitespace = text[..pos]
        .chars()
        .next_back()
        .is_some_and(char::is_whitespace);
    if after_whitespace {
        return false;
    }

    let before_alphanumeric = text[run_end..]
        .chars()
        .next()
        .is_some_and(char::is_alphanumeric);
    !before_alphanumeric
}
538
539/// Try to parse emphasis with ** opening delimiter.
540///
541/// Tries to match ** closer only. No fallback.
542/// Returns Some(bytes_consumed) if successful, None otherwise.
543fn try_parse_two(
544    text: &str,
545    pos: usize,
546    delim_char: char,
547    end: usize,
548    config: &ParserOptions,
549    builder: &mut GreenNodeBuilder,
550) -> Option<usize> {
551    let content_start = pos + 2;
552
553    log::debug!("try_parse_two: '{}' x 2 at pos {}", delim_char, pos);
554
555    // Try to find ** closer, checking for nested * emphasis along the way
556    if let Some(closer_pos) =
557        parse_until_closer_with_nested_one(text, content_start, delim_char, 2, end, config)
558    {
559        log::debug!("Found ** closer at pos {}", closer_pos);
560
561        // Emit STRONG(content)
562        builder.start_node(SyntaxKind::STRONG.into());
563        builder.token(SyntaxKind::STRONG_MARKER.into(), &text[pos..pos + 2]);
564        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
565        builder.token(
566            SyntaxKind::STRONG_MARKER.into(),
567            &text[closer_pos..closer_pos + 2],
568        );
569        builder.finish_node(); // STRONG
570
571        return Some(closer_pos + 2 - pos);
572    }
573
574    // No closer found
575    log::trace!("No closer found for **");
576    None
577}
578
579/// Try to parse emphasis with * opening delimiter.
580///
581/// Tries to match * closer.
582/// Returns Some(bytes_consumed) if successful, None otherwise.
583///
584/// **Pandoc algorithm**: While parsing content, if we encounter **,
585/// try to parse it as `two` (strong) recursively. If `two` succeeds,
586/// it consumes the ** delimiters, potentially preventing us from finding
587/// a closer for the outer *. This creates priority where ** can "steal"
588/// matches from *.
589fn try_parse_one(
590    text: &str,
591    pos: usize,
592    delim_char: char,
593    end: usize,
594    config: &ParserOptions,
595    builder: &mut GreenNodeBuilder,
596) -> Option<usize> {
597    let content_start = pos + 1;
598
599    log::debug!("try_parse_one: '{}' x 1 at pos {}", delim_char, pos);
600
601    // Try to find * closer using Pandoc's algorithm with nested two attempts
602    if let Some(closer_pos) =
603        parse_until_closer_with_nested_two(text, content_start, delim_char, 1, end, config)
604    {
605        log::debug!("Found * closer at pos {}", closer_pos);
606
607        // Emit EMPH(content)
608        builder.start_node(SyntaxKind::EMPHASIS.into());
609        builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &text[pos..pos + 1]);
610        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
611        builder.token(
612            SyntaxKind::EMPHASIS_MARKER.into(),
613            &text[closer_pos..closer_pos + 1],
614        );
615        builder.finish_node(); // EMPHASIS
616
617        return Some(closer_pos + 1 - pos);
618    }
619
620    // No closer found
621    log::trace!("No closer found for *");
622    None
623}
624
625/// Parse inline content and look for a matching closer, with nested two attempts.
626///
627/// This implements Pandoc's algorithm from Markdown.hs lines 1712-1717:
628/// When parsing `*...*`, if we encounter `**` (and it's not followed by
629/// another `*` that would close the outer emphasis), try to parse it as
630/// `two c mempty` (strong). If `two` succeeds, those `**` delimiters are
631/// consumed, and we continue searching for the `*` closer.
632///
633/// This creates a priority system where `**` can "steal" matches from `*`.
634///
635/// Example: `*foo **bar* baz**`
636/// - When parsing the outer `*...*`, we encounter `**` at position 5
637/// - We try `two` which succeeds with `**bar* baz**`
638/// - Now there's no `*` closer for the outer `*`, so it fails
639/// - Result: literal `*foo ` + STRONG("bar* baz")
640///
641/// # Arguments
642/// * `end` - Don't search beyond this position (respects nesting boundaries)
643fn parse_until_closer_with_nested_two(
644    text: &str,
645    start: usize,
646    delim_char: char,
647    delim_count: usize,
648    end: usize,
649    config: &ParserOptions,
650) -> Option<usize> {
651    let bytes = text.as_bytes();
652    let mut pos = start;
653
654    while pos < end.min(text.len()) {
655        if bytes[pos] == b'`'
656            && let Some(m) = try_parse_inline_executable(
657                &text[pos..],
658                config.extensions.rmarkdown_inline_code,
659                config.extensions.quarto_inline_code,
660            )
661        {
662            log::trace!(
663                "Skipping inline executable span of {} bytes at pos {}",
664                m.total_len,
665                pos
666            );
667            pos += m.total_len;
668            continue;
669        }
670
671        // Skip over code spans - their content is protected from delimiter matching
672        if bytes[pos] == b'`'
673            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
674        {
675            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
676            pos += len;
677            continue;
678        }
679
680        // Skip over inline math - their content is protected from delimiter matching
681        if bytes[pos] == b'$'
682            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
683        {
684            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
685            pos += len;
686            continue;
687        }
688
689        // Skip over links - their content is protected from delimiter matching
690        if bytes[pos] == b'['
691            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
692        {
693            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
694            pos += len;
695            continue;
696        }
697
698        // Pandoc algorithm: If we're looking for a single delimiter (*) and
699        // encounter a double delimiter (**), try to parse it as `two` (strong).
700        // This happens BEFORE checking if pos is a closer for our current emphasis.
701        if delim_count == 1
702            && pos + 2 <= text.len()
703            && bytes[pos] == delim_char as u8
704            && bytes[pos + 1] == delim_char as u8
705        {
706            // First check if the first delimiter is escaped
707            let first_is_escaped = {
708                let mut backslash_count = 0;
709                let mut check_pos = pos;
710                while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
711                    backslash_count += 1;
712                    check_pos -= 1;
713                }
714                backslash_count % 2 == 1
715            };
716
717            if first_is_escaped {
718                // First * is escaped, skip it and continue
719                // The second * might be a closer or start of emphasis
720                log::trace!(
721                    "First * at pos {} is escaped, skipping to check second *",
722                    pos
723                );
724                pos += 1;
725                continue;
726            }
727
728            // Check that there's NOT a third delimiter (which would make this
729            // part of a longer run that we shouldn't treat as `two`)
730            let no_third_delim = pos + 2 >= bytes.len() || bytes[pos + 2] != delim_char as u8;
731
732            if no_third_delim {
733                log::trace!(
734                    "try_parse_one: found ** at pos {}, attempting nested two",
735                    pos
736                );
737
738                // Try to parse as `two` (strong emphasis)
739                // We create a temporary builder to test if `two` succeeds
740                let mut temp_builder = GreenNodeBuilder::new();
741                if let Some(two_consumed) =
742                    try_parse_two(text, pos, delim_char, end, config, &mut temp_builder)
743                {
744                    // `two` succeeded! Those ** delimiters are consumed.
745                    // We skip past the `two` and continue searching for our `*` closer.
746                    log::debug!(
747                        "Nested two succeeded, consumed {} bytes, continuing search",
748                        two_consumed
749                    );
750                    pos += two_consumed;
751                    continue;
752                }
753                // `two` failed - this means the entire `one` parse should fail!
754                // In Pandoc, the `try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty)`
755                // alternative fails, and the first alternative `notFollowedBy (ender c 1) >> inline`
756                // also fails because we ARE followed by an ender (the first * of **).
757                // So the entire content parsing fails, and `one` returns failure.
758                log::trace!("Nested two failed at pos {}, entire one() should fail", pos);
759                return None;
760            }
761        }
762
763        // Check if we have a potential closer here
764        if pos + delim_count <= text.len() {
765            let mut matches = true;
766            for i in 0..delim_count {
767                if bytes[pos + i] != delim_char as u8 {
768                    matches = false;
769                    break;
770                }
771            }
772
773            if matches {
774                // IMPORTANT: Check that there are EXACTLY delim_count delimiters,
775                // not more. E.g., when looking for `*`, we shouldn't match
776                // `*` that's part of a longer run.
777
778                // Check: not escaped (preceded by odd number of backslashes)
779                let is_escaped = {
780                    let mut backslash_count = 0;
781                    let mut check_pos = pos;
782                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
783                        backslash_count += 1;
784                        check_pos -= 1;
785                    }
786                    backslash_count % 2 == 1 // Odd number = escaped
787                };
788
789                // Allow matching at the start OR end of a delimiter run.
790                // This lets `**` close at the end of `***` (after a nested `*` closes),
791                // while still avoiding matches in the middle of longer runs.
792                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
793                let after_pos = pos + delim_count;
794                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
795
796                if (at_run_start || at_run_end) && !is_escaped {
797                    // Found a potential closer!
798                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
799                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
800                    if delim_char == '_'
801                        && pos > start
802                        && let Some(prev_char) = text[..pos].chars().last()
803                        && prev_char.is_whitespace()
804                    {
805                        log::trace!(
806                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
807                            pos
808                        );
809                        // Not a valid closer, continue searching
810                        pos += 1;
811                        continue;
812                    }
813
814                    log::trace!(
815                        "Found exact {} x {} closer at pos {}",
816                        delim_char,
817                        delim_count,
818                        pos
819                    );
820                    return Some(pos);
821                }
822            }
823        }
824
825        // Not a closer, move to next position
826        // TODO: Should skip entire characters (UTF-8), not just bytes
827        pos += 1;
828    }
829
830    None
831}
832
833/// Parse inline content and look for a matching closer, with nested one attempts.
834///
835/// This implements the symmetric case to `parse_until_closer_with_nested_two`:
836/// When parsing `**...**`, if we encounter `*` (and it's not followed by
837/// another `*` that would be part of our `**` closer), try to parse it as
838/// `one c mempty` (emphasis). If `one` succeeds, those `*` delimiters are
839/// consumed, and we continue searching for the `**` closer.
840///
841/// This ensures nested emphasis closes before the outer strong emphasis.
842///
843/// Example: `**bold with *italic***`
844/// - When parsing the outer `**...**, we scan for `**` closer
845/// - At position 12, we encounter a single `*` (start of `*italic`)
846/// - We try `one` which succeeds with `*italic*` (consuming the first `*` from `***`)
847/// - We continue scanning and find `**` at position 20 (the remaining `**` from `***`)
848/// - Result: STRONG["bold with " EMPHASIS["italic"]]
849///
850/// # Arguments
851/// * `end` - Don't search beyond this position (respects nesting boundaries)
852fn parse_until_closer_with_nested_one(
853    text: &str,
854    start: usize,
855    delim_char: char,
856    delim_count: usize,
857    end: usize,
858    config: &ParserOptions,
859) -> Option<usize> {
860    let bytes = text.as_bytes();
861    let mut pos = start;
862
863    while pos < end.min(text.len()) {
864        if bytes[pos] == b'`'
865            && let Some(m) = try_parse_inline_executable(
866                &text[pos..],
867                config.extensions.rmarkdown_inline_code,
868                config.extensions.quarto_inline_code,
869            )
870        {
871            log::trace!(
872                "Skipping inline executable span of {} bytes at pos {}",
873                m.total_len,
874                pos
875            );
876            pos += m.total_len;
877            continue;
878        }
879
880        // Skip over code spans - their content is protected from delimiter matching
881        if bytes[pos] == b'`'
882            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
883        {
884            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
885            pos += len;
886            continue;
887        }
888
889        // Skip over inline math - their content is protected from delimiter matching
890        if bytes[pos] == b'$'
891            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
892        {
893            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
894            pos += len;
895            continue;
896        }
897
898        // Skip over links - their content is protected from delimiter matching
899        if bytes[pos] == b'['
900            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
901        {
902            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
903            pos += len;
904            continue;
905        }
906
907        // Pandoc algorithm: If we're looking for a double delimiter (**) and
908        // encounter a single delimiter (*), check if it's a valid emphasis opener.
909        // If it is, try to parse it as `one` (emphasis). If `one` succeeds, skip
910        // over it. If `one` fails, the outer `two` also fails (delimiter poisoning).
911        // If the `*` is NOT a valid opener (e.g., followed by whitespace or escaped),
912        // skip it and continue looking for the `**` closer.
913        if delim_count == 2 && pos < text.len() && bytes[pos] == delim_char as u8 {
914            // Check that there's NOT a second delimiter immediately after
915            // (which would make this part of our `**` closer or another `**` opener)
916            let no_second_delim = pos + 1 >= bytes.len() || bytes[pos + 1] != delim_char as u8;
917
918            if no_second_delim {
919                // Check if this * is escaped (preceded by odd number of backslashes)
920                let is_escaped = {
921                    let mut backslash_count = 0;
922                    let mut check_pos = pos;
923                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
924                        backslash_count += 1;
925                        check_pos -= 1;
926                    }
927                    backslash_count % 2 == 1
928                };
929
930                if is_escaped {
931                    // Escaped delimiter - just literal text, skip it
932                    log::trace!("* at pos {} is escaped, skipping", pos);
933                    pos += 1;
934                    continue;
935                }
936
937                // Check if this * is a valid emphasis opener (Pandoc's enclosure rule).
938                // A delimiter followed by whitespace is NOT an opener - it's literal text.
939                let after_delim = pos + 1;
940                let followed_by_whitespace = after_delim < text.len()
941                    && text[after_delim..]
942                        .chars()
943                        .next()
944                        .is_some_and(|c| c.is_whitespace());
945
946                if followed_by_whitespace {
947                    // Not a valid opener - just literal text, skip it
948                    log::trace!(
949                        "* at pos {} followed by whitespace, not an opener, skipping",
950                        pos
951                    );
952                    pos += 1;
953                    continue;
954                }
955
956                log::trace!(
957                    "try_parse_two: found * at pos {}, attempting nested one",
958                    pos
959                );
960
961                // Try to parse as `one` (emphasis)
962                // We create a temporary builder to test if `one` succeeds
963                let mut temp_builder = GreenNodeBuilder::new();
964                if let Some(one_consumed) =
965                    try_parse_one(text, pos, delim_char, end, config, &mut temp_builder)
966                {
967                    // `one` succeeded! Those * delimiters are consumed.
968                    // We skip past the `one` and continue searching for our `**` closer.
969                    log::debug!(
970                        "Nested one succeeded, consumed {} bytes, continuing search",
971                        one_consumed
972                    );
973                    pos += one_consumed;
974                    continue;
975                }
976
977                // `one` failed to find a closer. According to Pandoc's algorithm,
978                // this means the outer `two` should also fail. An unmatched inner
979                // delimiter "poisons" the outer emphasis.
980                // Example: `**foo *bar**` - the `*` can't find a closer, so the
981                // outer `**` should fail and the whole thing becomes literal.
982                log::debug!(
983                    "Nested one failed at pos {}, poisoning outer two (no closer found)",
984                    pos
985                );
986                return None;
987            }
988        }
989
990        // Check if we have a potential closer here
991        if pos + delim_count <= text.len() {
992            let mut matches = true;
993            for i in 0..delim_count {
994                if bytes[pos + i] != delim_char as u8 {
995                    matches = false;
996                    break;
997                }
998            }
999
1000            if matches {
1001                // Check: not escaped (preceded by odd number of backslashes)
1002                let is_escaped = {
1003                    let mut backslash_count = 0;
1004                    let mut check_pos = pos;
1005                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
1006                        backslash_count += 1;
1007                        check_pos -= 1;
1008                    }
1009                    backslash_count % 2 == 1 // Odd number = escaped
1010                };
1011
1012                // Allow matching at the start OR end of a delimiter run.
1013                // This lets `**` close at the end of `***` (after a nested `*` closes),
1014                // while still avoiding matches in the middle of longer runs.
1015                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
1016                let after_pos = pos + delim_count;
1017                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
1018
1019                if (at_run_start || at_run_end) && !is_escaped {
1020                    // Found a potential closer!
1021                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
1022                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
1023                    if delim_char == '_'
1024                        && pos > start
1025                        && let Some(prev_char) = text[..pos].chars().last()
1026                        && prev_char.is_whitespace()
1027                    {
1028                        log::trace!(
1029                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
1030                            pos
1031                        );
1032                        // Not a valid closer, continue searching
1033                        pos += 1;
1034                        continue;
1035                    }
1036
1037                    log::trace!(
1038                        "Found exact {} x {} closer at pos {}",
1039                        delim_char,
1040                        delim_count,
1041                        pos
1042                    );
1043                    return Some(pos);
1044                }
1045            }
1046        }
1047
1048        // Not a closer, move to next position
1049        // TODO: Should skip entire characters (UTF-8), not just bytes
1050        pos += 1;
1051    }
1052
1053    None
1054}
1055
/// Parse the inline content of `text[start..end]` and emit it into `builder`.
///
/// This is the recursive inline parser that handles all inline elements:
/// - Text
/// - Backslash constructs, tried first: backslash math, bookdown references,
///   escapes, LaTeX commands
/// - Code spans (including inline executable code)
/// - Math (inline and display)
/// - Emphasis/strong (via try_parse_emphasis)
/// - Other inline elements (shortcodes, emoji, footnotes, superscript,
///   subscript, strikeout, autolinks, native spans, and more)
///
/// **Important**: This is where the greedy left-to-right parsing happens.
/// When we see `**`, we try to parse it as STRONG. If it succeeds, those
/// delimiters are consumed and won't be available for outer emphasis.
///
/// This entry point forwards to `parse_inline_range_impl` with
/// `nested_emphasis = false`, i.e. the emphasis opener validity checks are
/// not bypassed. Use `parse_inline_range_nested` from within emphasis parsing.
///
/// # Arguments
/// * `text` - Full input; `start` and `end` are byte offsets into it
/// * `start` - Offset of the first byte to parse
/// * `end` - Offset one past the last byte to parse
/// * `config` - Parser options whose extension flags enable or disable constructs
/// * `builder` - Green tree builder that receives the emitted tokens and nodes
fn parse_inline_range(
    text: &str,
    start: usize,
    end: usize,
    config: &ParserOptions,
    builder: &mut GreenNodeBuilder,
) {
    parse_inline_range_impl(text, start, end, config, builder, false)
}
1081
/// Same as `parse_inline_range` but bypasses opener validity checks for emphasis.
/// Used within emphasis parsing contexts (e.g., from try_parse_one/two/three).
///
/// Forwards every argument unchanged to `parse_inline_range_impl` and passes
/// `nested_emphasis = true`.
fn parse_inline_range_nested(
    text: &str,
    start: usize,
    end: usize,
    config: &ParserOptions,
    builder: &mut GreenNodeBuilder,
) {
    parse_inline_range_impl(text, start, end, config, builder, true)
}
1093
/// Reports whether byte offset `pos` of `text` is an acceptable place for an
/// emoji alias to begin.
///
/// Offset 0 is always accepted. Any other offset is rejected when the byte
/// directly before it is an ASCII letter, an ASCII digit, or `_`; every other
/// preceding byte (including non-ASCII bytes) is accepted.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Nothing precedes the start of the text.
        None => true,
        Some(prev_index) => {
            let prev_byte = text.as_bytes()[prev_index];
            !(prev_byte.is_ascii_alphanumeric() || prev_byte == b'_')
        }
    }
}
1103
1104fn parse_inline_range_impl(
1105    text: &str,
1106    start: usize,
1107    end: usize,
1108    config: &ParserOptions,
1109    builder: &mut GreenNodeBuilder,
1110    nested_emphasis: bool,
1111) {
1112    log::debug!(
1113        "parse_inline_range: start={}, end={}, text={:?}",
1114        start,
1115        end,
1116        &text[start..end]
1117    );
1118    let mut pos = start;
1119    let mut text_start = start;
1120
1121    while pos < end {
1122        let byte = text.as_bytes()[pos];
1123
1124        // Backslash math (highest priority if enabled)
1125        if byte == b'\\' {
1126            // Try double backslash display math first: \\[...\\]
1127            if config.extensions.tex_math_double_backslash {
1128                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
1129                {
1130                    if pos > text_start {
1131                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1132                    }
1133                    log::debug!("Matched double backslash display math at pos {}", pos);
1134                    emit_double_backslash_display_math(builder, content);
1135                    pos += len;
1136                    text_start = pos;
1137                    continue;
1138                }
1139
1140                // Try double backslash inline math: \\(...\\)
1141                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
1142                    if pos > text_start {
1143                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1144                    }
1145                    log::debug!("Matched double backslash inline math at pos {}", pos);
1146                    emit_double_backslash_inline_math(builder, content);
1147                    pos += len;
1148                    text_start = pos;
1149                    continue;
1150                }
1151            }
1152
1153            // Try single backslash display math: \[...\]
1154            if config.extensions.tex_math_single_backslash {
1155                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
1156                {
1157                    if pos > text_start {
1158                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1159                    }
1160                    log::debug!("Matched single backslash display math at pos {}", pos);
1161                    emit_single_backslash_display_math(builder, content);
1162                    pos += len;
1163                    text_start = pos;
1164                    continue;
1165                }
1166
1167                // Try single backslash inline math: \(...\)
1168                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
1169                    if pos > text_start {
1170                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1171                    }
1172                    log::debug!("Matched single backslash inline math at pos {}", pos);
1173                    emit_single_backslash_inline_math(builder, content);
1174                    pos += len;
1175                    text_start = pos;
1176                    continue;
1177                }
1178            }
1179
1180            // Try math environments \begin{equation}...\end{equation}
1181            if config.extensions.raw_tex
1182                && let Some((len, begin_marker, content, end_marker)) =
1183                    try_parse_math_environment(&text[pos..])
1184            {
1185                if pos > text_start {
1186                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1187                }
1188                log::debug!("Matched math environment at pos {}", pos);
1189                emit_display_math_environment(builder, begin_marker, content, end_marker);
1190                pos += len;
1191                text_start = pos;
1192                continue;
1193            }
1194
1195            // Try bookdown reference: \@ref(label)
1196            if config.extensions.bookdown_references
1197                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
1198            {
1199                if pos > text_start {
1200                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1201                }
1202                log::debug!("Matched bookdown reference at pos {}: {}", pos, label);
1203                super::citations::emit_bookdown_crossref(builder, label);
1204                pos += len;
1205                text_start = pos;
1206                continue;
1207            }
1208
1209            // Try escapes (after bookdown refs and backslash math)
1210            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
1211                let escape_enabled = match escape_type {
1212                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
1213                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
1214                    EscapeType::Literal => {
1215                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!";
1216                        BASE_ESCAPABLE.contains(ch) || config.extensions.all_symbols_escapable
1217                    }
1218                };
1219                if !escape_enabled {
1220                    // Don't treat as hard line break - skip the escape and continue
1221                    // The backslash will be included in the next TEXT token
1222                    pos += 1;
1223                    continue;
1224                }
1225
1226                // Emit accumulated text
1227                if pos > text_start {
1228                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1229                }
1230
1231                log::debug!("Matched escape at pos {}: \\{}", pos, ch);
1232                emit_escape(builder, ch, escape_type);
1233                pos += len;
1234                text_start = pos;
1235                continue;
1236            }
1237
1238            // Try LaTeX commands (after escapes, before shortcodes)
1239            if config.extensions.raw_tex
1240                && let Some(len) = try_parse_latex_command(&text[pos..])
1241            {
1242                if pos > text_start {
1243                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1244                }
1245                log::debug!("Matched LaTeX command at pos {}", pos);
1246                parse_latex_command(builder, &text[pos..], len);
1247                pos += len;
1248                text_start = pos;
1249                continue;
1250            }
1251        }
1252
1253        // Try Quarto shortcodes: {{< shortcode >}}
1254        if byte == b'{'
1255            && pos + 1 < text.len()
1256            && text.as_bytes()[pos + 1] == b'{'
1257            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
1258        {
1259            if pos > text_start {
1260                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1261            }
1262            log::debug!("Matched shortcode at pos {}: {}", pos, &name);
1263            emit_shortcode(builder, &name, attrs);
1264            pos += len;
1265            text_start = pos;
1266            continue;
1267        }
1268
1269        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
1270        if byte == b'`'
1271            && let Some(m) = try_parse_inline_executable(
1272                &text[pos..],
1273                config.extensions.rmarkdown_inline_code,
1274                config.extensions.quarto_inline_code,
1275            )
1276        {
1277            if pos > text_start {
1278                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1279            }
1280            log::debug!("Matched inline executable code at pos {}", pos);
1281            emit_inline_executable(builder, &m);
1282            pos += m.total_len;
1283            text_start = pos;
1284            continue;
1285        }
1286
1287        // Try code spans
1288        if byte == b'`'
1289            && let Some((len, content, backtick_count, attributes)) =
1290                try_parse_code_span(&text[pos..])
1291        {
1292            // Emit accumulated text
1293            if pos > text_start {
1294                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1295            }
1296
1297            log::debug!(
1298                "Matched code span at pos {}: {} backticks",
1299                pos,
1300                backtick_count
1301            );
1302
1303            // Check for raw inline
1304            if let Some(ref attrs) = attributes
1305                && config.extensions.raw_attribute
1306                && let Some(format) = is_raw_inline(attrs)
1307            {
1308                use super::raw_inline::emit_raw_inline;
1309                log::debug!("Matched raw inline span at pos {}: format={}", pos, format);
1310                emit_raw_inline(builder, content, backtick_count, format);
1311            } else if !config.extensions.inline_code_attributes && attributes.is_some() {
1312                let code_span_len = backtick_count * 2 + content.len();
1313                emit_code_span(builder, content, backtick_count, None);
1314                pos += code_span_len;
1315                text_start = pos;
1316                continue;
1317            } else {
1318                emit_code_span(builder, content, backtick_count, attributes);
1319            }
1320
1321            pos += len;
1322            text_start = pos;
1323            continue;
1324        }
1325
1326        // Try textual emoji aliases: :smile:
1327        if byte == b':'
1328            && config.extensions.emoji
1329            && is_emoji_boundary(text, pos)
1330            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
1331        {
1332            if pos > text_start {
1333                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1334            }
1335            log::debug!("Matched emoji at pos {}", pos);
1336            emit_emoji(builder, &text[pos..pos + len]);
1337            pos += len;
1338            text_start = pos;
1339            continue;
1340        }
1341
1342        // Try inline footnotes: ^[note]
1343        if byte == b'^'
1344            && pos + 1 < text.len()
1345            && text.as_bytes()[pos + 1] == b'['
1346            && config.extensions.inline_footnotes
1347            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
1348        {
1349            if pos > text_start {
1350                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1351            }
1352            log::debug!("Matched inline footnote at pos {}", pos);
1353            emit_inline_footnote(builder, content, config);
1354            pos += len;
1355            text_start = pos;
1356            continue;
1357        }
1358
1359        // Try superscript: ^text^
1360        if byte == b'^'
1361            && config.extensions.superscript
1362            && let Some((len, content)) = try_parse_superscript(&text[pos..])
1363        {
1364            if pos > text_start {
1365                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1366            }
1367            log::debug!("Matched superscript at pos {}", pos);
1368            emit_superscript(builder, content, config);
1369            pos += len;
1370            text_start = pos;
1371            continue;
1372        }
1373
1374        // Try bookdown definition: (\#label) or (ref:label)
1375        if byte == b'(' && config.extensions.bookdown_references {
1376            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
1377                if pos > text_start {
1378                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1379                }
1380                log::debug!("Matched bookdown definition at pos {}: {}", pos, label);
1381                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1382                pos += len;
1383                text_start = pos;
1384                continue;
1385            }
1386            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
1387                if pos > text_start {
1388                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1389                }
1390                log::debug!("Matched bookdown text reference at pos {}: {}", pos, label);
1391                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1392                pos += len;
1393                text_start = pos;
1394                continue;
1395            }
1396        }
1397
1398        // Try subscript: ~text~
1399        if byte == b'~'
1400            && config.extensions.subscript
1401            && let Some((len, content)) = try_parse_subscript(&text[pos..])
1402        {
1403            if pos > text_start {
1404                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1405            }
1406            log::debug!("Matched subscript at pos {}", pos);
1407            emit_subscript(builder, content, config);
1408            pos += len;
1409            text_start = pos;
1410            continue;
1411        }
1412
1413        // Try strikeout: ~~text~~
1414        if byte == b'~'
1415            && config.extensions.strikeout
1416            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
1417        {
1418            if pos > text_start {
1419                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1420            }
1421            log::debug!("Matched strikeout at pos {}", pos);
1422            emit_strikeout(builder, content, config);
1423            pos += len;
1424            text_start = pos;
1425            continue;
1426        }
1427
1428        // Try GFM inline math: $`...`$
1429        if byte == b'$'
1430            && config.extensions.tex_math_gfm
1431            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
1432        {
1433            if pos > text_start {
1434                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1435            }
1436            log::debug!("Matched GFM inline math at pos {}", pos);
1437            emit_gfm_inline_math(builder, content);
1438            pos += len;
1439            text_start = pos;
1440            continue;
1441        }
1442
1443        // Try math ($...$, $$...$$)
1444        if byte == b'$' && config.extensions.tex_math_dollars {
1445            // Try display math first ($$...$$)
1446            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
1447                // Emit accumulated text
1448                if pos > text_start {
1449                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1450                }
1451
1452                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
1453                log::debug!(
1454                    "Matched display math at pos {}: {} dollars",
1455                    pos,
1456                    dollar_count
1457                );
1458
1459                // Check for trailing attributes (Quarto cross-reference support)
1460                let after_math = &text[pos + len..];
1461                let attr_len = if config.extensions.quarto_crossrefs {
1462                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
1463                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
1464                        let trimmed_after = after_math.trim_start();
1465                        if let Some(open_brace_pos) = trimmed_after.find('{') {
1466                            let ws_before_brace = after_math.len() - trimmed_after.len();
1467                            let attr_text_len = trimmed_after[open_brace_pos..]
1468                                .find('}')
1469                                .map(|close| close + 1)
1470                                .unwrap_or(0);
1471                            ws_before_brace + open_brace_pos + attr_text_len
1472                        } else {
1473                            0
1474                        }
1475                    } else {
1476                        0
1477                    }
1478                } else {
1479                    0
1480                };
1481
1482                let total_len = len + attr_len;
1483                emit_display_math(builder, content, dollar_count);
1484
1485                // Emit attributes if present
1486                if attr_len > 0 {
1487                    use crate::parser::utils::attributes::{
1488                        emit_attributes, try_parse_trailing_attributes,
1489                    };
1490                    let attr_text = &text[pos + len..pos + total_len];
1491                    if let Some((attr_block, _text_before)) =
1492                        try_parse_trailing_attributes(attr_text)
1493                    {
1494                        let trimmed_after = attr_text.trim_start();
1495                        let ws_len = attr_text.len() - trimmed_after.len();
1496                        if ws_len > 0 {
1497                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1498                        }
1499                        emit_attributes(builder, &attr_block);
1500                    }
1501                }
1502
1503                pos += total_len;
1504                text_start = pos;
1505                continue;
1506            }
1507
1508            // Try inline math ($...$)
1509            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1510                // Emit accumulated text
1511                if pos > text_start {
1512                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1513                }
1514
1515                log::debug!("Matched inline math at pos {}", pos);
1516                emit_inline_math(builder, content);
1517                pos += len;
1518                text_start = pos;
1519                continue;
1520            }
1521
1522            // Neither display nor inline math matched - emit the $ as literal text
1523            // This ensures each $ gets its own TEXT token for CST compatibility
1524            if pos > text_start {
1525                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1526            }
1527            builder.token(SyntaxKind::TEXT.into(), "$");
1528            pos += 1;
1529            text_start = pos;
1530            continue;
1531        }
1532
1533        // Try autolinks: <url> or <email>
1534        if byte == b'<'
1535            && config.extensions.autolinks
1536            && let Some((len, url)) = try_parse_autolink(&text[pos..])
1537        {
1538            if pos > text_start {
1539                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1540            }
1541            log::debug!("Matched autolink at pos {}", pos);
1542            emit_autolink(builder, &text[pos..pos + len], url);
1543            pos += len;
1544            text_start = pos;
1545            continue;
1546        }
1547
1548        if config.extensions.autolink_bare_uris
1549            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1550        {
1551            if pos > text_start {
1552                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1553            }
1554            log::debug!("Matched bare URI at pos {}", pos);
1555            emit_bare_uri_link(builder, url, config);
1556            pos += len;
1557            text_start = pos;
1558            continue;
1559        }
1560
1561        // Try native spans: <span>text</span> (after autolink since both start with <)
1562        if byte == b'<'
1563            && config.extensions.native_spans
1564            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1565        {
1566            if pos > text_start {
1567                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1568            }
1569            log::debug!("Matched native span at pos {}", pos);
1570            emit_native_span(builder, content, &attributes, config);
1571            pos += len;
1572            text_start = pos;
1573            continue;
1574        }
1575
1576        // Images and links - process in order: inline image, reference image, footnote ref, inline link, reference link
1577        if byte == b'!' && pos + 1 < text.len() && text.as_bytes()[pos + 1] == b'[' {
1578            // Try inline image: ![alt](url)
1579            if let Some((len, alt_text, dest, attributes)) = try_parse_inline_image(&text[pos..]) {
1580                if pos > text_start {
1581                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1582                }
1583                log::debug!("Matched inline image at pos {}", pos);
1584                emit_inline_image(
1585                    builder,
1586                    &text[pos..pos + len],
1587                    alt_text,
1588                    dest,
1589                    attributes,
1590                    config,
1591                );
1592                pos += len;
1593                text_start = pos;
1594                continue;
1595            }
1596
1597            // Try reference image: ![alt][ref] or ![alt]
1598            if config.extensions.reference_links {
1599                let allow_shortcut = config.extensions.shortcut_reference_links;
1600                if let Some((len, alt_text, reference, is_implicit)) =
1601                    try_parse_reference_image(&text[pos..], allow_shortcut)
1602                {
1603                    if pos > text_start {
1604                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1605                    }
1606                    log::debug!("Matched reference image at pos {}", pos);
1607                    emit_reference_image(builder, alt_text, &reference, is_implicit, config);
1608                    pos += len;
1609                    text_start = pos;
1610                    continue;
1611                }
1612            }
1613        }
1614
1615        // Process bracket-starting elements
1616        if byte == b'[' {
1617            // Try footnote reference: [^id]
1618            if config.extensions.footnotes
1619                && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1620            {
1621                if pos > text_start {
1622                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1623                }
1624                log::debug!("Matched footnote reference at pos {}", pos);
1625                emit_footnote_reference(builder, &id);
1626                pos += len;
1627                text_start = pos;
1628                continue;
1629            }
1630
1631            // Try inline link: [text](url)
1632            if config.extensions.inline_links
1633                && let Some((len, link_text, dest, attributes)) =
1634                    try_parse_inline_link(&text[pos..])
1635            {
1636                if pos > text_start {
1637                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1638                }
1639                log::debug!("Matched inline link at pos {}", pos);
1640                emit_inline_link(
1641                    builder,
1642                    &text[pos..pos + len],
1643                    link_text,
1644                    dest,
1645                    attributes,
1646                    config,
1647                );
1648                pos += len;
1649                text_start = pos;
1650                continue;
1651            }
1652
1653            // Try reference link: [text][ref] or [text]
1654            if config.extensions.reference_links {
1655                let allow_shortcut = config.extensions.shortcut_reference_links;
1656                if let Some((len, link_text, reference, is_implicit)) =
1657                    try_parse_reference_link(&text[pos..], allow_shortcut)
1658                {
1659                    if pos > text_start {
1660                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1661                    }
1662                    log::debug!("Matched reference link at pos {}", pos);
1663                    emit_reference_link(builder, link_text, &reference, is_implicit, config);
1664                    pos += len;
1665                    text_start = pos;
1666                    continue;
1667                }
1668            }
1669
1670            // Try bracketed citation: [@cite]
1671            if config.extensions.citations
1672                && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1673            {
1674                if pos > text_start {
1675                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1676                }
1677                log::debug!("Matched bracketed citation at pos {}", pos);
1678                emit_bracketed_citation(builder, content);
1679                pos += len;
1680                text_start = pos;
1681                continue;
1682            }
1683        }
1684
1685        // Try bracketed spans: [text]{.class}
1686        // Must come after links/citations
1687        if byte == b'['
1688            && config.extensions.bracketed_spans
1689            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1690        {
1691            if pos > text_start {
1692                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1693            }
1694            log::debug!("Matched bracketed span at pos {}", pos);
1695            emit_bracketed_span(builder, &text_content, &attrs, config);
1696            pos += len;
1697            text_start = pos;
1698            continue;
1699        }
1700
1701        // Try bare citation: @cite (must come after bracketed elements)
1702        if byte == b'@'
1703            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1704            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1705        {
1706            let is_crossref =
1707                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1708            if is_crossref || config.extensions.citations {
1709                if pos > text_start {
1710                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1711                }
1712                if is_crossref {
1713                    log::debug!("Matched Quarto crossref at pos {}: {}", pos, &key);
1714                    super::citations::emit_crossref(builder, key, has_suppress);
1715                } else {
1716                    log::debug!("Matched bare citation at pos {}: {}", pos, &key);
1717                    emit_bare_citation(builder, key, has_suppress);
1718                }
1719                pos += len;
1720                text_start = pos;
1721                continue;
1722            }
1723        }
1724
1725        // Try suppress-author citation: -@cite
1726        if byte == b'-'
1727            && pos + 1 < text.len()
1728            && text.as_bytes()[pos + 1] == b'@'
1729            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1730            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1731        {
1732            let is_crossref =
1733                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1734            if is_crossref || config.extensions.citations {
1735                if pos > text_start {
1736                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1737                }
1738                if is_crossref {
1739                    log::debug!("Matched Quarto crossref at pos {}: {}", pos, &key);
1740                    super::citations::emit_crossref(builder, key, has_suppress);
1741                } else {
1742                    log::debug!("Matched suppress-author citation at pos {}: {}", pos, &key);
1743                    emit_bare_citation(builder, key, has_suppress);
1744                }
1745                pos += len;
1746                text_start = pos;
1747                continue;
1748            }
1749        }
1750
1751        // Try to parse emphasis at this position
1752        if byte == b'*' || byte == b'_' {
1753            // Count the delimiter run to avoid re-parsing
1754            let bytes = text.as_bytes();
1755            let mut delim_count = 0;
1756            while pos + delim_count < bytes.len() && bytes[pos + delim_count] == byte {
1757                delim_count += 1;
1758            }
1759
1760            // Emit any accumulated text before the delimiter
1761            if pos > text_start {
1762                log::debug!(
1763                    "Emitting TEXT before delimiter: {:?}",
1764                    &text[text_start..pos]
1765                );
1766                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1767                text_start = pos; // Update text_start after emission
1768            }
1769
1770            // Try to parse emphasis
1771            // Use nested variant (bypass opener validity) when in nested context
1772            let emphasis_result = if nested_emphasis {
1773                try_parse_emphasis_nested(text, pos, end, config, builder)
1774            } else {
1775                try_parse_emphasis(text, pos, end, config, builder)
1776            };
1777
1778            if let Some((consumed, _)) = emphasis_result {
1779                // Successfully parsed emphasis
1780                log::debug!(
1781                    "Parsed emphasis, consumed {} bytes from pos {}",
1782                    consumed,
1783                    pos
1784                );
1785                pos += consumed;
1786                text_start = pos;
1787            } else {
1788                // Failed to parse, delimiter run will be treated as regular text
1789                // Skip the ENTIRE delimiter run to avoid re-parsing parts of it
1790                log::debug!(
1791                    "Failed to parse emphasis at pos {}, skipping {} delimiters as literal",
1792                    pos,
1793                    delim_count
1794                );
1795                pos += delim_count;
1796                // DON'T update text_start - let the delimiters accumulate
1797            }
1798            continue;
1799        }
1800
1801        // Check for newlines - may need to emit as hard line break
1802        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1803            let text_before = &text[text_start..pos];
1804
1805            // Check for trailing spaces hard line break (always enabled in Pandoc)
1806            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1807            if trailing_spaces >= 2 {
1808                // Emit text before the trailing spaces
1809                let text_content = &text_before[..text_before.len() - trailing_spaces];
1810                if !text_content.is_empty() {
1811                    builder.token(SyntaxKind::TEXT.into(), text_content);
1812                }
1813                let spaces = " ".repeat(trailing_spaces);
1814                builder.token(
1815                    SyntaxKind::HARD_LINE_BREAK.into(),
1816                    &format!("{}\r\n", spaces),
1817                );
1818                pos += 2;
1819                text_start = pos;
1820                continue;
1821            }
1822
1823            // hard_line_breaks: treat all single newlines as hard line breaks
1824            if config.extensions.hard_line_breaks {
1825                if !text_before.is_empty() {
1826                    builder.token(SyntaxKind::TEXT.into(), text_before);
1827                }
1828                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1829                pos += 2;
1830                text_start = pos;
1831                continue;
1832            }
1833
1834            // Regular newline
1835            if !text_before.is_empty() {
1836                builder.token(SyntaxKind::TEXT.into(), text_before);
1837            }
1838            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1839            pos += 2;
1840            text_start = pos;
1841            continue;
1842        }
1843
1844        if byte == b'\n' {
1845            let text_before = &text[text_start..pos];
1846
1847            // Check for trailing spaces hard line break (always enabled in Pandoc)
1848            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1849            if trailing_spaces >= 2 {
1850                // Emit text before the trailing spaces
1851                let text_content = &text_before[..text_before.len() - trailing_spaces];
1852                if !text_content.is_empty() {
1853                    builder.token(SyntaxKind::TEXT.into(), text_content);
1854                }
1855                let spaces = " ".repeat(trailing_spaces);
1856                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1857                pos += 1;
1858                text_start = pos;
1859                continue;
1860            }
1861
1862            // hard_line_breaks: treat all single newlines as hard line breaks
1863            if config.extensions.hard_line_breaks {
1864                if !text_before.is_empty() {
1865                    builder.token(SyntaxKind::TEXT.into(), text_before);
1866                }
1867                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1868                pos += 1;
1869                text_start = pos;
1870                continue;
1871            }
1872
1873            // Regular newline
1874            if !text_before.is_empty() {
1875                builder.token(SyntaxKind::TEXT.into(), text_before);
1876            }
1877            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1878            pos += 1;
1879            text_start = pos;
1880            continue;
1881        }
1882
1883        // Regular character, keep accumulating
1884        pos += 1;
1885    }
1886
1887    // Emit any remaining text
1888    if pos > text_start && text_start < end {
1889        log::debug!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1890        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1891    }
1892
1893    log::debug!("parse_inline_range complete: start={}, end={}", start, end);
1894}
1895
#[cfg(test)]
mod tests {
    // Unit tests for the recursive inline parser: emphasis/strong nesting,
    // triple-delimiter handling, and display math followed by attributes.

    use super::*;
    use crate::syntax::{SyntaxKind, SyntaxNode};
    use rowan::GreenNode;

    /// Returns `true` when the tree rooted at `root` (the root included)
    /// contains at least one node of the given `kind`.
    fn has_node(root: &SyntaxNode, kind: SyntaxKind) -> bool {
        root.descendants().any(|n| n.kind() == kind)
    }

    /// Returns `true` when some node of kind `inner` has a strict ancestor of
    /// kind `outer`, i.e. `inner` is genuinely nested inside `outer` rather
    /// than merely appearing after it in traversal order.
    fn is_nested_in(root: &SyntaxNode, inner: SyntaxKind, outer: SyntaxKind) -> bool {
        root.descendants()
            .filter(|n| n.kind() == inner)
            // `ancestors()` starts with the node itself, so skip it.
            .any(|n| n.ancestors().skip(1).any(|a| a.kind() == outer))
    }

    /// `*test*` parses to an EMPHASIS node and round-trips losslessly.
    #[test]
    fn test_recursive_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        parse_inline_text_recursive(&mut builder, text, &config);

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS node
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
    }

    /// `*foo **bar** baz*` produces both EMPHASIS and STRONG nodes.
    #[test]
    fn test_recursive_nested() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Wrap in a PARAGRAPH node (inline content needs a parent)
        builder.start_node(SyntaxKind::PARAGRAPH.into());
        parse_inline_text_recursive(&mut builder, text, &config);
        builder.finish_node();

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPHASIS and STRONG
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test that we can parse a simple emphasis case
    #[test]
    fn test_parse_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Try to parse emphasis at position 0
        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);

        // Should successfully parse
        assert_eq!(result, Some((6, 1))); // Consumed all 6 bytes, delimiter count 1

        // Check the generated CST
        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // The root IS the EMPHASIS node
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);

        // Verify losslessness: CST text should match input
        assert_eq!(node.text().to_string(), text);
    }

    /// Test parsing nested emphasis/strong
    #[test]
    fn test_parse_nested_emphasis_strong() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Parse the whole range
        parse_inline_range(text, 0, text.len(), &config, &mut builder);

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS and STRONG nodes
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo* bar**
    /// Expected: Strong[Emph[foo], bar]
    /// The mis-parse to guard against is `*Strong[foo* bar]`.
    #[test]
    fn test_triple_emphasis_star_then_double_star() {
        let text = "***foo* bar**";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        builder.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_range(text, 0, text.len(), &config, &mut builder);
        builder.finish_node();

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug dump of the tree, only used in the failure message below
        let structure = format!("{:#?}", node);

        // Should have both STRONG and EMPH
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );

        // STRONG must be the outer node: require an actual STRONG ancestor of
        // the EMPHASIS node instead of relying on traversal order, which would
        // also accept two sibling nodes.
        assert!(
            is_nested_in(&node, SyntaxKind::EMPHASIS, SyntaxKind::STRONG),
            "EMPH should be inside STRONG, not before it. Current structure:\n{}",
            structure
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo** bar*
    /// Expected: Emph[Strong[foo], bar]
    #[test]
    fn test_triple_emphasis_double_star_then_star() {
        let text = "***foo** bar*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        builder.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_range(text, 0, text.len(), &config, &mut builder);
        builder.finish_node();

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug dump of the tree, only used in the failure message below
        let structure = format!("{:#?}", node);

        // Should have both EMPH and STRONG
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );

        // EMPH must be the outer node: require an actual EMPHASIS ancestor of
        // the STRONG node instead of relying on traversal order.
        assert!(
            is_nested_in(&node, SyntaxKind::STRONG, SyntaxKind::EMPHASIS),
            "STRONG should be inside EMPH. Current structure:\n{}",
            structure
        );
    }

    /// Test that display math with attributes parses correctly
    /// Regression test for equation_attributes_single_line golden test
    #[test]
    fn test_display_math_with_attributes() {
        let text = "$$ E = mc^2 $$ {#eq-einstein}";
        let mut config = ParserOptions::default();
        config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

        let mut builder = GreenNodeBuilder::new();
        builder.start_node(SyntaxKind::DOCUMENT.into()); // Need a root node

        // Parse the whole text
        parse_inline_text_recursive(&mut builder, text, &config);

        builder.finish_node(); // Finish DOCUMENT
        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have DISPLAY_MATH node
        assert!(
            has_node(&node, SyntaxKind::DISPLAY_MATH),
            "Should have DISPLAY_MATH node"
        );

        // Should have ATTRIBUTE node
        assert!(
            has_node(&node, SyntaxKind::ATTRIBUTE),
            "Should have ATTRIBUTE node for {{#eq-einstein}}"
        );

        // Attributes should not be TEXT.
        // TEXT is emitted as a token, and `next_sibling()` only yields nodes,
        // so the siblings have to be walked *with tokens* for this check to be
        // able to fail at all.
        let math_followed_by_text = node
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
            .flat_map(|math| math.siblings_with_tokens(rowan::Direction::Next))
            .filter_map(|element| element.into_token())
            .any(|token| {
                token.kind() == SyntaxKind::TEXT && token.text().contains("{#eq-einstein}")
            });
        assert!(
            !math_followed_by_text,
            "Attributes should not be parsed as TEXT"
        );
    }
}
2157
/// `**bold with *italic***` should parse as `Strong["bold with ", Emph["italic"]]`:
/// the closing `***` is read as `*` (closes Emph) followed by `**` (closes Strong).
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let input = "**bold with *italic***";
    let options = ParserOptions::default();

    // `parse_inline_range` emits inline content directly (no wrapper node),
    // so the finished tree's root is whatever the parser produced first.
    let mut sink = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut sink);
    let tree: GreenNode = sink.finish();
    let root = SyntaxNode::new_root(tree);

    // Round-trip: the CST must reproduce the input byte for byte.
    assert_eq!(root.text().to_string(), input, "Should be lossless");

    // The outermost construct is the strong emphasis.
    assert_eq!(
        root.kind(),
        SyntaxKind::STRONG,
        "Root should be STRONG, got: {:?}",
        root.kind()
    );

    // ...and the emphasis sits directly underneath it.
    let nested_emphasis_count = root
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert!(
        nested_emphasis_count > 0,
        "STRONG should contain EMPHASIS node"
    );
}
2193
/// `*foo *` is expected to parse as emphasis, mirroring Pandoc: for asterisks
/// the closer is not required to be right-flanking.
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let input = "*foo *";
    let options = ParserOptions::default();
    let mut sink = GreenNodeBuilder::new();

    // Attempt an emphasis parse starting at the very first byte.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut sink);

    // All 6 bytes consumed with a delimiter count of 1.
    assert_eq!(
        outcome,
        Some((6, 1)),
        "Should parse as emphasis, result: {:?}",
        outcome
    );

    // Inspect the CST that the successful parse left in the builder.
    let tree: GreenNode = sink.finish();
    let root = SyntaxNode::new_root(tree);

    // The emphasis node itself is the root of the tree.
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Round-trip: the CST text equals the input.
    assert_eq!(root.text().to_string(), input);
}
2228
/// `***foo** bar **baz***` should parse as `Emph[Strong[foo], " bar ", Strong[baz]]`,
/// which is the structure Pandoc reports for this input.
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let input = "***foo** bar **baz***";
    let options = ParserOptions::default();

    let mut sink = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut sink);
    let tree: GreenNode = sink.finish();
    let root = SyntaxNode::new_root(tree);

    // Exactly one EMPHASIS node is expected anywhere in the tree.
    let all_emphasis: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    let emphasis_count = all_emphasis.len();
    assert_eq!(
        emphasis_count, 1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphasis_count
    );

    // Its direct children must include exactly two STRONG nodes.
    let outer_emphasis = &all_emphasis[0];
    let strong_count = outer_emphasis
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_count, 2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_count
    );

    // Round-trip: the CST text equals the input.
    assert_eq!(root.text().to_string(), input);
}
2275
/// `***foo* bar *baz***` should parse as `Strong[Emph[foo], " bar ", Emph[baz]]`,
/// which is the structure Pandoc reports for this input.
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let input = "***foo* bar *baz***";
    let options = ParserOptions::default();

    let mut sink = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut sink);
    let tree: GreenNode = sink.finish();
    let root = SyntaxNode::new_root(tree);

    // Exactly one STRONG node is expected anywhere in the tree.
    let all_strong: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    let strong_count = all_strong.len();
    assert_eq!(
        strong_count, 1,
        "Should have exactly one STRONG node, found: {}",
        strong_count
    );

    // Its direct children must include exactly two EMPHASIS nodes.
    let outer_strong = &all_strong[0];
    let emphasis_count = outer_strong
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_count, 2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_count
    );

    // Round-trip: the CST text equals the input.
    assert_eq!(root.text().to_string(), input);
}
2322
2323// Multiline emphasis tests
#[test]
fn test_parse_emphasis_multiline() {
    // Per the original author's reading of the Pandoc spec, emphasis may span
    // a newline (soft break); this test pins that behaviour for `*...*`.
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "*text on\nline two*";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // The parser must report the whole input as handled; the second tuple
    // element is presumably the delimiter run length (1 here) - confirm
    // against `try_parse_emphasis`.
    let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
    assert_eq!(
        result,
        Some((text.len(), 1)),
        "Emphasis should parse multiline content"
    );

    // Build the CST and check that its root is the EMPHASIS node.
    let green: GreenNode = builder.finish();
    let root = SyntaxNode::new_root(green);
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Losslessness: the rendered tree equals the input, newline included.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
2358
#[test]
fn test_parse_strong_multiline() {
    // Per the original author's reading of the Pandoc spec, strong emphasis
    // may span a newline; this test pins that behaviour for `**...**`.
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "**strong on\nline two**";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // The parser must report the whole input as handled; the second tuple
    // element is presumably the delimiter run length (2 here) - confirm
    // against `try_parse_emphasis`.
    let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
    assert_eq!(
        result,
        Some((text.len(), 2)),
        "Strong emphasis should parse multiline content"
    );

    // Build the CST and check that its root is the STRONG node.
    let green: GreenNode = builder.finish();
    let root = SyntaxNode::new_root(green);
    assert_eq!(root.kind(), SyntaxKind::STRONG);

    // Losslessness: the rendered tree equals the input, newline included.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in strong content"
    );
}
2393
#[test]
fn test_parse_triple_emphasis_multiline() {
    // A `***...***` run whose content crosses a line break.
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "***both on\nline two***";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // The parser must report the whole input as handled; the second tuple
    // element is presumably the delimiter run length (3 here) - confirm
    // against `try_parse_emphasis`.
    let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
    assert_eq!(
        result,
        Some((text.len(), 3)),
        "Triple emphasis should parse multiline content"
    );

    // Build the CST from whatever the parser emitted.
    let green: GreenNode = builder.finish();
    let root = SyntaxNode::new_root(green);

    // The original author treats triple delimiters as strong + emph, so a
    // STRONG node has to appear somewhere in the tree.
    assert!(
        root.descendants()
            .any(|candidate| candidate.kind() == SyntaxKind::STRONG),
        "Should have STRONG node"
    );

    // Losslessness: the rendered tree equals the input, newline included.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}
panache_parser/parser/inlines/core.rs

panache_parser/parser/inlines/
core.rs