panache_parser/parser/inlines/
core.rs

1//! Recursive emphasis parsing using Pandoc's algorithm.
2//!
3//! This module implements emphasis/strong emphasis parsing using a recursive
4//! descent approach based on Pandoc's Haskell implementation in
5//! `Readers/Markdown.hs:L1662-L1722`.
6//!
7//! **Key algorithm**: Left-to-right, greedy, first-match wins
8//! 1. Parse text left-to-right
9//! 2. When we see delimiters, try to parse emphasis (look for matching closer)
10//! 3. If successful, emit emphasis node and continue from after closer
11//! 4. If failed (no closer found), emit delimiter as literal and continue
12//! 5. Nested emphasis is handled naturally by recursive parsing of content
13//!
14//! **Example**: `*foo **bar* baz**`
15//! - See `*`, try to parse EMPH
16//! - Parse content: see `**`, try to parse STRONG
17//! - STRONG finds closer `**` at end → succeeds, emits STRONG[bar* baz]
18//! - Outer `*` can't find closer (all delimiters consumed) → fails, emits `*foo` as literal
19//! - Result: `*foo` + STRONG[bar* baz]
20//!
21//! This matches Pandoc's behavior exactly.
22
23use crate::options::ParserOptions;
24use crate::syntax::SyntaxKind;
25use rowan::GreenNodeBuilder;
26
27// Import inline element parsers from sibling modules
28use super::bookdown::{
29    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
30};
31use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
32use super::citations::{
33    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
34    try_parse_bracketed_citation,
35};
36use super::code_spans::{emit_code_span, try_parse_code_span};
37use super::emoji::{emit_emoji, try_parse_emoji};
38use super::escapes::{EscapeType, emit_escape, try_parse_escape};
39use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
40use super::inline_footnotes::{
41    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
42    try_parse_inline_footnote,
43};
44use super::latex::{parse_latex_command, try_parse_latex_command};
45use super::links::{
46    emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link, emit_reference_image,
47    emit_reference_link, try_parse_autolink, try_parse_bare_uri, try_parse_inline_image,
48    try_parse_inline_link, try_parse_reference_image, try_parse_reference_link,
49};
50use super::mark::{emit_mark, try_parse_mark};
51use super::math::{
52    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
53    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
54    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
55    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
56    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
57    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
58};
59use super::native_spans::{emit_native_span, try_parse_native_span};
60use super::raw_inline::is_raw_inline;
61use super::shortcodes::{emit_shortcode, try_parse_shortcode};
62use super::strikeout::{emit_strikeout, try_parse_strikeout};
63use super::subscript::{emit_subscript, try_parse_subscript};
64use super::superscript::{emit_superscript, try_parse_superscript};
65
66/// Parse inline text using the recursive emphasis algorithm.
67///
68/// This is the main entry point for parsing inline content with Pandoc-style
69/// recursive emphasis handling. It uses a greedy left-to-right, first-match-wins
70/// approach that matches Pandoc's behavior exactly.
71///
72/// **Algorithm**:
73/// 1. Parse text left-to-right trying each inline element type in precedence order
74/// 2. When we see `*` or `_`, try to parse emphasis recursively
75/// 3. Nested emphasis naturally consumes delimiters before outer matches
76/// 4. All inline elements (code, links, math, etc.) are parsed on-the-fly
77///
78/// # Arguments
79/// * `text` - The inline text to parse
80/// * `config` - Configuration for extensions and formatting
81/// * `builder` - The CST builder to emit nodes to
82pub fn parse_inline_text_recursive(
83    builder: &mut GreenNodeBuilder,
84    text: &str,
85    config: &ParserOptions,
86) {
87    log::debug!(
88        "Recursive inline parsing: {:?} ({} bytes)",
89        &text[..text.len().min(40)],
90        text.len()
91    );
92
93    parse_inline_range(text, 0, text.len(), config, builder);
94
95    log::debug!("Recursive inline parsing complete");
96}
97
98/// Parse inline elements from text content.
99/// This is a standalone function used for recursive inline parsing within blocks.
100///
101/// The `allow_reference_links` parameter is accepted for compatibility but not currently used.
102/// Set to `false` in nested contexts (inside link text, image alt, spans) to prevent recursive parsing.
103pub fn parse_inline_text(
104    builder: &mut GreenNodeBuilder,
105    text: &str,
106    config: &ParserOptions,
107    _allow_reference_links: bool,
108) {
109    log::trace!(
110        "Parsing inline text (recursive): {:?} ({} bytes)",
111        &text[..text.len().min(40)],
112        text.len()
113    );
114
115    // Use recursive parsing with Pandoc's algorithm for emphasis
116    parse_inline_text_recursive(builder, text, config);
117}
118
119/// Try to parse emphasis starting at the given position.
120///
121/// This is the entry point for recursive emphasis parsing, equivalent to
122/// Pandoc's `enclosure` function.
123///
124/// Returns Some((bytes_consumed, delim_count)) if emphasis was successfully parsed,
125/// or None if the delimiter should be treated as literal text.
126/// When returning None, the delim_count tells the caller how many delimiter
127/// characters to skip (to avoid re-parsing parts of a failed delimiter run).
128///
129/// # Arguments
130/// * `text` - The full text being parsed
131/// * `pos` - Current position in text (where the delimiter starts)
132/// * `end` - End boundary (don't search for closers beyond this)
133/// * `config` - Configuration
134/// * `builder` - CST builder
135///
136/// **Algorithm**:
137/// 1. Count opening delimiters
138/// 2. Check if followed by whitespace (if so, return None)
139/// 3. Dispatch to parse_one/two/three based on count
140/// 4. Those functions parse content and look for matching closer (within bounds)
141/// 5. If closer found, emit node and return bytes consumed
142/// 6. If not found, return None with delimiter count (caller skips entire run)
143pub fn try_parse_emphasis(
144    text: &str,
145    pos: usize,
146    end: usize,
147    config: &ParserOptions,
148    builder: &mut GreenNodeBuilder,
149) -> Option<(usize, usize)> {
150    let bytes = text.as_bytes();
151
152    if pos >= bytes.len() {
153        return None;
154    }
155
156    let delim_char = bytes[pos] as char;
157    if delim_char != '*' && delim_char != '_' {
158        return None;
159    }
160
161    // Count consecutive delimiters
162    let mut count = 0;
163    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
164        count += 1;
165    }
166
167    let after_pos = pos + count;
168
169    log::debug!(
170        "try_parse_emphasis: '{}' x {} at pos {}",
171        delim_char,
172        count,
173        pos
174    );
175
176    // Check if followed by whitespace (Pandoc rule: treat as literal)
177    if after_pos < text.len()
178        && let Some(next_char) = text[after_pos..].chars().next()
179        && next_char.is_whitespace()
180    {
181        log::trace!("Delimiter followed by whitespace, treating as literal");
182        return None;
183    }
184
185    // For underscores: check intraword_underscores extension (Pandoc lines 1668-1672)
186    // Can't open if preceded by alphanumeric (prevents foo_bar from parsing)
187    if delim_char == '_'
188        && pos > 0
189        && let Some(prev_char) = text[..pos].chars().last()
190        && prev_char.is_alphanumeric()
191    {
192        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
193        return None;
194    }
195
196    // Dispatch based on delimiter count
197    let result = match count {
198        1 => try_parse_one(text, pos, delim_char, end, config, builder),
199        2 => try_parse_two(text, pos, delim_char, end, config, builder),
200        3 => try_parse_three(text, pos, delim_char, end, config, builder),
201        _ => {
202            // 4+ delimiters: treat as literal (Pandoc behavior)
203            log::trace!("{} delimiters (4+), treating as literal", count);
204            None
205        }
206    };
207
208    // If parsing succeeded, return (bytes_consumed, delim_count)
209    // If failed, return None but the caller will know to skip `count` delimiters
210    result.map(|consumed| (consumed, count))
211}
212
213/// Try to parse emphasis in a nested context (bypassing opener validity checks).
214///
215/// This mirrors Pandoc's behavior where `one` can call `two c mempty` directly,
216/// bypassing the `enclosure` opener validity checks. This is needed because
217/// patterns like `***foo **bar** baz***` require `**` followed by space to be
218/// parsed as a nested strong opener.
219///
220/// Returns Some((bytes_consumed, delim_count)) if successful, None otherwise.
221fn try_parse_emphasis_nested(
222    text: &str,
223    pos: usize,
224    end: usize,
225    config: &ParserOptions,
226    builder: &mut GreenNodeBuilder,
227) -> Option<(usize, usize)> {
228    let bytes = text.as_bytes();
229
230    if pos >= bytes.len() {
231        return None;
232    }
233
234    let delim_char = bytes[pos] as char;
235    if delim_char != '*' && delim_char != '_' {
236        return None;
237    }
238
239    // Count consecutive delimiters
240    let mut count = 0;
241    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
242        count += 1;
243    }
244
245    log::debug!(
246        "try_parse_emphasis_nested: '{}' x {} at pos {}",
247        delim_char,
248        count,
249        pos
250    );
251
252    // For underscores: still check intraword_underscores (prevents foo_bar parsing)
253    // This check applies even in nested contexts
254    if delim_char == '_'
255        && pos > 0
256        && let Some(prev_char) = text[..pos].chars().last()
257        && prev_char.is_alphanumeric()
258    {
259        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
260        return None;
261    }
262
263    // NOTE: We intentionally skip the "delimiter followed by whitespace" check here.
264    // In nested contexts (inside `one` calling `two`), Pandoc allows openers
265    // followed by whitespace because the opener has already been matched.
266
267    // Dispatch based on delimiter count
268    let result = match count {
269        1 => try_parse_one(text, pos, delim_char, end, config, builder),
270        2 => try_parse_two(text, pos, delim_char, end, config, builder),
271        3 => try_parse_three(text, pos, delim_char, end, config, builder),
272        _ => {
273            // 4+ delimiters: treat as literal (Pandoc behavior)
274            log::trace!("{} delimiters (4+), treating as literal", count);
275            None
276        }
277    };
278
279    result.map(|consumed| (consumed, count))
280}
281
282/// Try to parse emphasis with *** opening delimiter.
283///
284/// Tries to match closers in order: *** → ** → *
285/// Returns Some(bytes_consumed) if successful, None otherwise.
286fn try_parse_three(
287    text: &str,
288    pos: usize,
289    delim_char: char,
290    end: usize,
291    config: &ParserOptions,
292    builder: &mut GreenNodeBuilder,
293) -> Option<usize> {
294    let content_start = pos + 3;
295    let one = delim_char.to_string();
296    let two = one.repeat(2);
297
298    log::debug!("try_parse_three: '{}' x 3 at pos {}", delim_char, pos);
299
300    // Pandoc algorithm (line 1695): Parse content UNTIL we see a VALID ender
301    // We loop through potential enders, checking if each is valid.
302    // Invalid enders (like `**` preceded by whitespace) are skipped.
303    let mut search_pos = content_start;
304
305    loop {
306        // Find next potential ender
307        let closer_start = match find_first_potential_ender(text, search_pos, delim_char, end) {
308            Some(p) => p,
309            None => {
310                log::trace!("No potential ender found for ***");
311                return None;
312            }
313        };
314
315        log::debug!("Potential ender at pos {}", closer_start);
316
317        // Count how many delimiters we have at closer_start
318        let bytes = text.as_bytes();
319        let mut closer_count = 0;
320        let mut check_pos = closer_start;
321        while check_pos < bytes.len() && bytes[check_pos] == delim_char as u8 {
322            closer_count += 1;
323            check_pos += 1;
324        }
325
326        log::debug!(
327            "Found {} x {} at pos {}",
328            delim_char,
329            closer_count,
330            closer_start
331        );
332
333        // Try to match closers in order: ***, **, * (Pandoc lines 1696-1698)
334
335        // Try *** (line 1696)
336        if closer_count >= 3 && is_valid_ender(text, closer_start, delim_char, 3) {
337            log::debug!("Matched *** closer, emitting Strong[Emph[content]]");
338
339            builder.start_node(SyntaxKind::STRONG.into());
340            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
341
342            builder.start_node(SyntaxKind::EMPHASIS.into());
343            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
344            parse_inline_range_nested(text, content_start, closer_start, config, builder);
345            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
346            builder.finish_node(); // EMPHASIS
347
348            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
349            builder.finish_node(); // STRONG
350
351            return Some(closer_start + 3 - pos);
352        }
353
354        // Try ** (line 1697)
355        if closer_count >= 2 && is_valid_ender(text, closer_start, delim_char, 2) {
356            log::debug!("Matched ** closer, wrapping as Strong and continuing with one");
357
358            let continue_pos = closer_start + 2;
359
360            if let Some(final_closer_pos) =
361                parse_until_closer_with_nested_two(text, continue_pos, delim_char, 1, end, config)
362            {
363                log::debug!(
364                    "Found * closer at pos {}, emitting Emph[Strong[...], ...]",
365                    final_closer_pos
366                );
367
368                builder.start_node(SyntaxKind::EMPHASIS.into());
369                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
370
371                builder.start_node(SyntaxKind::STRONG.into());
372                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
373                parse_inline_range_nested(text, content_start, closer_start, config, builder);
374                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
375                builder.finish_node(); // STRONG
376
377                // Parse additional content between ** and * (up to but not including the closer)
378                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
379
380                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
381                builder.finish_node(); // EMPHASIS
382
383                return Some(final_closer_pos + 1 - pos);
384            }
385
386            // Fallback: emit * + STRONG
387            log::debug!("No * closer found after **, emitting * + STRONG");
388            builder.token(SyntaxKind::TEXT.into(), &one);
389
390            builder.start_node(SyntaxKind::STRONG.into());
391            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
392            parse_inline_range_nested(text, content_start, closer_start, config, builder);
393            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
394            builder.finish_node(); // STRONG
395
396            return Some(closer_start + 2 - pos);
397        }
398
399        // Try * (line 1698)
400        if closer_count >= 1 && is_valid_ender(text, closer_start, delim_char, 1) {
401            log::debug!("Matched * closer, wrapping as Emph and continuing with two");
402
403            let continue_pos = closer_start + 1;
404
405            if let Some(final_closer_pos) =
406                parse_until_closer_with_nested_one(text, continue_pos, delim_char, 2, end, config)
407            {
408                log::debug!(
409                    "Found ** closer at pos {}, emitting Strong[Emph[...], ...]",
410                    final_closer_pos
411                );
412
413                builder.start_node(SyntaxKind::STRONG.into());
414                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
415
416                builder.start_node(SyntaxKind::EMPHASIS.into());
417                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
418                parse_inline_range_nested(text, content_start, closer_start, config, builder);
419                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
420                builder.finish_node(); // EMPHASIS
421
422                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
423
424                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
425                builder.finish_node(); // STRONG
426
427                return Some(final_closer_pos + 2 - pos);
428            }
429
430            // Fallback: emit ** + EMPH
431            log::debug!("No ** closer found after *, emitting ** + EMPH");
432            builder.token(SyntaxKind::TEXT.into(), &two);
433
434            builder.start_node(SyntaxKind::EMPHASIS.into());
435            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
436            parse_inline_range_nested(text, content_start, closer_start, config, builder);
437            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
438            builder.finish_node(); // EMPHASIS
439
440            return Some(closer_start + 1 - pos);
441        }
442
443        // No valid ender at this position - continue searching after this delimiter run
444        log::debug!(
445            "No valid ender at pos {}, continuing search from {}",
446            closer_start,
447            closer_start + closer_count
448        );
449        search_pos = closer_start + closer_count;
450    }
451}
452
/// Locate the first unescaped occurrence of `delim_char` at or after `start`.
///
/// The scan covers byte offsets `start..min(end, text.len())`. A delimiter
/// counts as escaped when it is directly preceded by an odd number of
/// backslashes; escaped delimiters are skipped.
///
/// Returns the byte offset of the match, or `None` if the range contains no
/// unescaped delimiter. Whether the hit is actually a *valid* ender is decided
/// separately (see `is_valid_ender`).
fn find_first_potential_ender(
    text: &str,
    start: usize,
    delim_char: char,
    end: usize,
) -> Option<usize> {
    let bytes = text.as_bytes();
    let needle = delim_char as u8;
    let limit = end.min(bytes.len());

    (start..limit).find(|&idx| {
        if bytes[idx] != needle {
            return false;
        }
        // Count the backslashes sitting immediately in front of the delimiter;
        // an even count (including zero) means the delimiter itself is live.
        let backslashes = bytes[..idx]
            .iter()
            .rev()
            .take_while(|&&b| b == b'\\')
            .count();
        backslashes % 2 == 0
    })
}
490
/// Decide whether the `delim_count` delimiters starting at `pos` form a valid
/// emphasis ender (the counterpart of Pandoc's `ender c n`).
///
/// All of the following must hold:
/// * `pos..pos + delim_count` lies inside `text` and consists solely of
///   `delim_char`;
/// * the run is exact: no `delim_char` directly before or directly after it;
/// * for `_` only: the run is not directly preceded by whitespace and not
///   directly followed by an alphanumeric character.
///
/// Asterisks have no flanking requirement.
fn is_valid_ender(text: &str, pos: usize, delim_char: char, delim_count: usize) -> bool {
    let bytes = text.as_bytes();
    let marker = delim_char as u8;
    let after_pos = pos + delim_count;

    // The run has to fit inside the text ...
    let Some(run) = bytes.get(pos..after_pos) else {
        return false;
    };
    // ... and be made of the delimiter only.
    if run.iter().any(|&b| b != marker) {
        return false;
    }

    // Exactness: a neighbouring delimiter means we are inside a longer run.
    let delimiter_before = pos > 0 && bytes[pos - 1] == marker;
    let delimiter_after = bytes.get(after_pos) == Some(&marker);
    if delimiter_before || delimiter_after {
        return false;
    }

    // Only underscores carry the extra flanking rules.
    if delim_char != '_' {
        return true;
    }

    // Underscore closer must not come right after whitespace.
    let preceded_by_whitespace = text[..pos]
        .chars()
        .next_back()
        .is_some_and(char::is_whitespace);
    if preceded_by_whitespace {
        return false;
    }

    // Underscore closer must not be glued to a following word character.
    !text[after_pos..]
        .chars()
        .next()
        .is_some_and(char::is_alphanumeric)
}
539
540/// Try to parse emphasis with ** opening delimiter.
541///
542/// Tries to match ** closer only. No fallback.
543/// Returns Some(bytes_consumed) if successful, None otherwise.
544fn try_parse_two(
545    text: &str,
546    pos: usize,
547    delim_char: char,
548    end: usize,
549    config: &ParserOptions,
550    builder: &mut GreenNodeBuilder,
551) -> Option<usize> {
552    let content_start = pos + 2;
553
554    log::debug!("try_parse_two: '{}' x 2 at pos {}", delim_char, pos);
555
556    // Try to find ** closer, checking for nested * emphasis along the way
557    if let Some(closer_pos) =
558        parse_until_closer_with_nested_one(text, content_start, delim_char, 2, end, config)
559    {
560        log::debug!("Found ** closer at pos {}", closer_pos);
561
562        // Emit STRONG(content)
563        builder.start_node(SyntaxKind::STRONG.into());
564        builder.token(SyntaxKind::STRONG_MARKER.into(), &text[pos..pos + 2]);
565        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
566        builder.token(
567            SyntaxKind::STRONG_MARKER.into(),
568            &text[closer_pos..closer_pos + 2],
569        );
570        builder.finish_node(); // STRONG
571
572        return Some(closer_pos + 2 - pos);
573    }
574
575    // No closer found
576    log::trace!("No closer found for **");
577    None
578}
579
580/// Try to parse emphasis with * opening delimiter.
581///
582/// Tries to match * closer.
583/// Returns Some(bytes_consumed) if successful, None otherwise.
584///
585/// **Pandoc algorithm**: While parsing content, if we encounter **,
586/// try to parse it as `two` (strong) recursively. If `two` succeeds,
587/// it consumes the ** delimiters, potentially preventing us from finding
588/// a closer for the outer *. This creates priority where ** can "steal"
589/// matches from *.
590fn try_parse_one(
591    text: &str,
592    pos: usize,
593    delim_char: char,
594    end: usize,
595    config: &ParserOptions,
596    builder: &mut GreenNodeBuilder,
597) -> Option<usize> {
598    let content_start = pos + 1;
599
600    log::debug!("try_parse_one: '{}' x 1 at pos {}", delim_char, pos);
601
602    // Try to find * closer using Pandoc's algorithm with nested two attempts
603    if let Some(closer_pos) =
604        parse_until_closer_with_nested_two(text, content_start, delim_char, 1, end, config)
605    {
606        log::debug!("Found * closer at pos {}", closer_pos);
607
608        // Emit EMPH(content)
609        builder.start_node(SyntaxKind::EMPHASIS.into());
610        builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &text[pos..pos + 1]);
611        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
612        builder.token(
613            SyntaxKind::EMPHASIS_MARKER.into(),
614            &text[closer_pos..closer_pos + 1],
615        );
616        builder.finish_node(); // EMPHASIS
617
618        return Some(closer_pos + 1 - pos);
619    }
620
621    // No closer found
622    log::trace!("No closer found for *");
623    None
624}
625
626/// Parse inline content and look for a matching closer, with nested two attempts.
627///
628/// This implements Pandoc's algorithm from Markdown.hs lines 1712-1717:
629/// When parsing `*...*`, if we encounter `**` (and it's not followed by
630/// another `*` that would close the outer emphasis), try to parse it as
631/// `two c mempty` (strong). If `two` succeeds, those `**` delimiters are
632/// consumed, and we continue searching for the `*` closer.
633///
634/// This creates a priority system where `**` can "steal" matches from `*`.
635///
636/// Example: `*foo **bar* baz**`
637/// - When parsing the outer `*...*`, we encounter `**` at position 5
638/// - We try `two` which succeeds with `**bar* baz**`
639/// - Now there's no `*` closer for the outer `*`, so it fails
640/// - Result: literal `*foo ` + STRONG("bar* baz")
641///
642/// # Arguments
643/// * `end` - Don't search beyond this position (respects nesting boundaries)
644fn parse_until_closer_with_nested_two(
645    text: &str,
646    start: usize,
647    delim_char: char,
648    delim_count: usize,
649    end: usize,
650    config: &ParserOptions,
651) -> Option<usize> {
652    let bytes = text.as_bytes();
653    let mut pos = start;
654
655    while pos < end.min(text.len()) {
656        if bytes[pos] == b'`'
657            && let Some(m) = try_parse_inline_executable(
658                &text[pos..],
659                config.extensions.rmarkdown_inline_code,
660                config.extensions.quarto_inline_code,
661            )
662        {
663            log::trace!(
664                "Skipping inline executable span of {} bytes at pos {}",
665                m.total_len,
666                pos
667            );
668            pos += m.total_len;
669            continue;
670        }
671
672        // Skip over code spans - their content is protected from delimiter matching
673        if bytes[pos] == b'`'
674            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
675        {
676            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
677            pos += len;
678            continue;
679        }
680
681        // Skip over inline math - their content is protected from delimiter matching
682        if bytes[pos] == b'$'
683            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
684        {
685            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
686            pos += len;
687            continue;
688        }
689
690        // Skip over links - their content is protected from delimiter matching
691        if bytes[pos] == b'['
692            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
693        {
694            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
695            pos += len;
696            continue;
697        }
698
699        // Pandoc algorithm: If we're looking for a single delimiter (*) and
700        // encounter a double delimiter (**), try to parse it as `two` (strong).
701        // This happens BEFORE checking if pos is a closer for our current emphasis.
702        if delim_count == 1
703            && pos + 2 <= text.len()
704            && bytes[pos] == delim_char as u8
705            && bytes[pos + 1] == delim_char as u8
706        {
707            // First check if the first delimiter is escaped
708            let first_is_escaped = {
709                let mut backslash_count = 0;
710                let mut check_pos = pos;
711                while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
712                    backslash_count += 1;
713                    check_pos -= 1;
714                }
715                backslash_count % 2 == 1
716            };
717
718            if first_is_escaped {
719                // First * is escaped, skip it and continue
720                // The second * might be a closer or start of emphasis
721                log::trace!(
722                    "First * at pos {} is escaped, skipping to check second *",
723                    pos
724                );
725                pos += 1;
726                continue;
727            }
728
729            // Check that there's NOT a third delimiter (which would make this
730            // part of a longer run that we shouldn't treat as `two`)
731            let no_third_delim = pos + 2 >= bytes.len() || bytes[pos + 2] != delim_char as u8;
732
733            if no_third_delim {
734                log::trace!(
735                    "try_parse_one: found ** at pos {}, attempting nested two",
736                    pos
737                );
738
739                // Try to parse as `two` (strong emphasis)
740                // We create a temporary builder to test if `two` succeeds
741                let mut temp_builder = GreenNodeBuilder::new();
742                if let Some(two_consumed) =
743                    try_parse_two(text, pos, delim_char, end, config, &mut temp_builder)
744                {
745                    // `two` succeeded! Those ** delimiters are consumed.
746                    // We skip past the `two` and continue searching for our `*` closer.
747                    log::debug!(
748                        "Nested two succeeded, consumed {} bytes, continuing search",
749                        two_consumed
750                    );
751                    pos += two_consumed;
752                    continue;
753                }
754                // `two` failed - this means the entire `one` parse should fail!
755                // In Pandoc, the `try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty)`
756                // alternative fails, and the first alternative `notFollowedBy (ender c 1) >> inline`
757                // also fails because we ARE followed by an ender (the first * of **).
758                // So the entire content parsing fails, and `one` returns failure.
759                log::trace!("Nested two failed at pos {}, entire one() should fail", pos);
760                return None;
761            }
762        }
763
764        // Check if we have a potential closer here
765        if pos + delim_count <= text.len() {
766            let mut matches = true;
767            for i in 0..delim_count {
768                if bytes[pos + i] != delim_char as u8 {
769                    matches = false;
770                    break;
771                }
772            }
773
774            if matches {
775                // IMPORTANT: Check that there are EXACTLY delim_count delimiters,
776                // not more. E.g., when looking for `*`, we shouldn't match
777                // `*` that's part of a longer run.
778
779                // Check: not escaped (preceded by odd number of backslashes)
780                let is_escaped = {
781                    let mut backslash_count = 0;
782                    let mut check_pos = pos;
783                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
784                        backslash_count += 1;
785                        check_pos -= 1;
786                    }
787                    backslash_count % 2 == 1 // Odd number = escaped
788                };
789
790                // Allow matching at the start OR end of a delimiter run.
791                // This lets `**` close at the end of `***` (after a nested `*` closes),
792                // while still avoiding matches in the middle of longer runs.
793                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
794                let after_pos = pos + delim_count;
795                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
796
797                if (at_run_start || at_run_end) && !is_escaped {
798                    // Found a potential closer!
799                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
800                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
801                    if delim_char == '_'
802                        && pos > start
803                        && let Some(prev_char) = text[..pos].chars().last()
804                        && prev_char.is_whitespace()
805                    {
806                        log::trace!(
807                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
808                            pos
809                        );
810                        // Not a valid closer, continue searching
811                        pos += 1;
812                        continue;
813                    }
814
815                    log::trace!(
816                        "Found exact {} x {} closer at pos {}",
817                        delim_char,
818                        delim_count,
819                        pos
820                    );
821                    return Some(pos);
822                }
823            }
824        }
825
826        // Not a closer, move to next position
827        // TODO: Should skip entire characters (UTF-8), not just bytes
828        pos += 1;
829    }
830
831    None
832}
833
834/// Parse inline content and look for a matching closer, with nested one attempts.
835///
836/// This implements the symmetric case to `parse_until_closer_with_nested_two`:
837/// When parsing `**...**`, if we encounter `*` (and it's not followed by
838/// another `*` that would be part of our `**` closer), try to parse it as
839/// `one c mempty` (emphasis). If `one` succeeds, those `*` delimiters are
840/// consumed, and we continue searching for the `**` closer.
841///
842/// This ensures nested emphasis closes before the outer strong emphasis.
843///
844/// Example: `**bold with *italic***`
845/// - When parsing the outer `**...**, we scan for `**` closer
846/// - At position 12, we encounter a single `*` (start of `*italic`)
847/// - We try `one` which succeeds with `*italic*` (consuming the first `*` from `***`)
848/// - We continue scanning and find `**` at position 20 (the remaining `**` from `***`)
849/// - Result: STRONG["bold with " EMPHASIS["italic"]]
850///
851/// # Arguments
852/// * `end` - Don't search beyond this position (respects nesting boundaries)
853fn parse_until_closer_with_nested_one(
854    text: &str,
855    start: usize,
856    delim_char: char,
857    delim_count: usize,
858    end: usize,
859    config: &ParserOptions,
860) -> Option<usize> {
861    let bytes = text.as_bytes();
862    let mut pos = start;
863
864    while pos < end.min(text.len()) {
865        if bytes[pos] == b'`'
866            && let Some(m) = try_parse_inline_executable(
867                &text[pos..],
868                config.extensions.rmarkdown_inline_code,
869                config.extensions.quarto_inline_code,
870            )
871        {
872            log::trace!(
873                "Skipping inline executable span of {} bytes at pos {}",
874                m.total_len,
875                pos
876            );
877            pos += m.total_len;
878            continue;
879        }
880
881        // Skip over code spans - their content is protected from delimiter matching
882        if bytes[pos] == b'`'
883            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
884        {
885            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
886            pos += len;
887            continue;
888        }
889
890        // Skip over inline math - their content is protected from delimiter matching
891        if bytes[pos] == b'$'
892            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
893        {
894            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
895            pos += len;
896            continue;
897        }
898
899        // Skip over links - their content is protected from delimiter matching
900        if bytes[pos] == b'['
901            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
902        {
903            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
904            pos += len;
905            continue;
906        }
907
908        // Pandoc algorithm: If we're looking for a double delimiter (**) and
909        // encounter a single delimiter (*), check if it's a valid emphasis opener.
910        // If it is, try to parse it as `one` (emphasis). If `one` succeeds, skip
911        // over it. If `one` fails, the outer `two` also fails (delimiter poisoning).
912        // If the `*` is NOT a valid opener (e.g., followed by whitespace or escaped),
913        // skip it and continue looking for the `**` closer.
914        if delim_count == 2 && pos < text.len() && bytes[pos] == delim_char as u8 {
915            // Check that there's NOT a second delimiter immediately after
916            // (which would make this part of our `**` closer or another `**` opener)
917            let no_second_delim = pos + 1 >= bytes.len() || bytes[pos + 1] != delim_char as u8;
918
919            if no_second_delim {
920                // Check if this * is escaped (preceded by odd number of backslashes)
921                let is_escaped = {
922                    let mut backslash_count = 0;
923                    let mut check_pos = pos;
924                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
925                        backslash_count += 1;
926                        check_pos -= 1;
927                    }
928                    backslash_count % 2 == 1
929                };
930
931                if is_escaped {
932                    // Escaped delimiter - just literal text, skip it
933                    log::trace!("* at pos {} is escaped, skipping", pos);
934                    pos += 1;
935                    continue;
936                }
937
938                // Check if this * is a valid emphasis opener (Pandoc's enclosure rule).
939                // A delimiter followed by whitespace is NOT an opener - it's literal text.
940                let after_delim = pos + 1;
941                let followed_by_whitespace = after_delim < text.len()
942                    && text[after_delim..]
943                        .chars()
944                        .next()
945                        .is_some_and(|c| c.is_whitespace());
946
947                if followed_by_whitespace {
948                    // Not a valid opener - just literal text, skip it
949                    log::trace!(
950                        "* at pos {} followed by whitespace, not an opener, skipping",
951                        pos
952                    );
953                    pos += 1;
954                    continue;
955                }
956
957                log::trace!(
958                    "try_parse_two: found * at pos {}, attempting nested one",
959                    pos
960                );
961
962                // Try to parse as `one` (emphasis)
963                // We create a temporary builder to test if `one` succeeds
964                let mut temp_builder = GreenNodeBuilder::new();
965                if let Some(one_consumed) =
966                    try_parse_one(text, pos, delim_char, end, config, &mut temp_builder)
967                {
968                    // `one` succeeded! Those * delimiters are consumed.
969                    // We skip past the `one` and continue searching for our `**` closer.
970                    log::debug!(
971                        "Nested one succeeded, consumed {} bytes, continuing search",
972                        one_consumed
973                    );
974                    pos += one_consumed;
975                    continue;
976                }
977
978                // `one` failed to find a closer. According to Pandoc's algorithm,
979                // this means the outer `two` should also fail. An unmatched inner
980                // delimiter "poisons" the outer emphasis.
981                // Example: `**foo *bar**` - the `*` can't find a closer, so the
982                // outer `**` should fail and the whole thing becomes literal.
983                log::debug!(
984                    "Nested one failed at pos {}, poisoning outer two (no closer found)",
985                    pos
986                );
987                return None;
988            }
989        }
990
991        // Check if we have a potential closer here
992        if pos + delim_count <= text.len() {
993            let mut matches = true;
994            for i in 0..delim_count {
995                if bytes[pos + i] != delim_char as u8 {
996                    matches = false;
997                    break;
998                }
999            }
1000
1001            if matches {
1002                // Check: not escaped (preceded by odd number of backslashes)
1003                let is_escaped = {
1004                    let mut backslash_count = 0;
1005                    let mut check_pos = pos;
1006                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
1007                        backslash_count += 1;
1008                        check_pos -= 1;
1009                    }
1010                    backslash_count % 2 == 1 // Odd number = escaped
1011                };
1012
1013                // Allow matching at the start OR end of a delimiter run.
1014                // This lets `**` close at the end of `***` (after a nested `*` closes),
1015                // while still avoiding matches in the middle of longer runs.
1016                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
1017                let after_pos = pos + delim_count;
1018                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
1019
1020                if (at_run_start || at_run_end) && !is_escaped {
1021                    // Found a potential closer!
1022                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
1023                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
1024                    if delim_char == '_'
1025                        && pos > start
1026                        && let Some(prev_char) = text[..pos].chars().last()
1027                        && prev_char.is_whitespace()
1028                    {
1029                        log::trace!(
1030                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
1031                            pos
1032                        );
1033                        // Not a valid closer, continue searching
1034                        pos += 1;
1035                        continue;
1036                    }
1037
1038                    log::trace!(
1039                        "Found exact {} x {} closer at pos {}",
1040                        delim_char,
1041                        delim_count,
1042                        pos
1043                    );
1044                    return Some(pos);
1045                }
1046            }
1047        }
1048
1049        // Not a closer, move to next position
1050        // TODO: Should skip entire characters (UTF-8), not just bytes
1051        pos += 1;
1052    }
1053
1054    None
1055}
1056
1057///
1058/// This is the recursive inline parser that handles all inline elements:
1059/// - Text
1060/// - Escapes (highest priority)
1061/// - Code spans
1062/// - Math (inline and display)
1063/// - Emphasis/strong (via try_parse_emphasis)
1064/// - Other inline elements
1065///
1066/// **Important**: This is where the greedy left-to-right parsing happens.
1067/// When we see `**`, we try to parse it as STRONG. If it succeeds, those
1068/// delimiters are consumed and won't be available for outer emphasis.
1069///
1070/// # Arguments
1071/// * `nested_emphasis` - If true, bypass opener validity checks for emphasis.
1072///   Set to true when called from within emphasis parsing (e.g., from try_parse_one/two/three).
1073fn parse_inline_range(
1074    text: &str,
1075    start: usize,
1076    end: usize,
1077    config: &ParserOptions,
1078    builder: &mut GreenNodeBuilder,
1079) {
1080    parse_inline_range_impl(text, start, end, config, builder, false)
1081}
1082
1083/// Same as `parse_inline_range` but bypasses opener validity checks for emphasis.
1084/// Used within emphasis parsing contexts (e.g., from try_parse_one/two/three).
1085fn parse_inline_range_nested(
1086    text: &str,
1087    start: usize,
1088    end: usize,
1089    config: &ParserOptions,
1090    builder: &mut GreenNodeBuilder,
1091) {
1092    parse_inline_range_impl(text, start, end, config, builder, true)
1093}
1094
/// Report whether an emoji alias may start at byte offset `pos` of `text`.
///
/// The start of the text always qualifies. Anywhere else, the byte directly
/// before `pos` must be neither an ASCII letter or digit nor an underscore.
/// Only that single byte is inspected, so a preceding non-ASCII character
/// never blocks a boundary.
///
/// # Panics
/// Panics if `pos` is greater than `text.len()`.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Nothing precedes the very first byte.
        None => true,
        Some(prev_idx) => {
            let prev = text.as_bytes()[prev_idx];
            !(prev.is_ascii_alphanumeric() || prev == b'_')
        }
    }
}
1104
1105fn parse_inline_range_impl(
1106    text: &str,
1107    start: usize,
1108    end: usize,
1109    config: &ParserOptions,
1110    builder: &mut GreenNodeBuilder,
1111    nested_emphasis: bool,
1112) {
1113    log::debug!(
1114        "parse_inline_range: start={}, end={}, text={:?}",
1115        start,
1116        end,
1117        &text[start..end]
1118    );
1119    let mut pos = start;
1120    let mut text_start = start;
1121
1122    while pos < end {
1123        let byte = text.as_bytes()[pos];
1124
1125        // Backslash math (highest priority if enabled)
1126        if byte == b'\\' {
1127            // Try double backslash display math first: \\[...\\]
1128            if config.extensions.tex_math_double_backslash {
1129                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
1130                {
1131                    if pos > text_start {
1132                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1133                    }
1134                    log::debug!("Matched double backslash display math at pos {}", pos);
1135                    emit_double_backslash_display_math(builder, content);
1136                    pos += len;
1137                    text_start = pos;
1138                    continue;
1139                }
1140
1141                // Try double backslash inline math: \\(...\\)
1142                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
1143                    if pos > text_start {
1144                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1145                    }
1146                    log::debug!("Matched double backslash inline math at pos {}", pos);
1147                    emit_double_backslash_inline_math(builder, content);
1148                    pos += len;
1149                    text_start = pos;
1150                    continue;
1151                }
1152            }
1153
1154            // Try single backslash display math: \[...\]
1155            if config.extensions.tex_math_single_backslash {
1156                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
1157                {
1158                    if pos > text_start {
1159                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1160                    }
1161                    log::debug!("Matched single backslash display math at pos {}", pos);
1162                    emit_single_backslash_display_math(builder, content);
1163                    pos += len;
1164                    text_start = pos;
1165                    continue;
1166                }
1167
1168                // Try single backslash inline math: \(...\)
1169                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
1170                    if pos > text_start {
1171                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1172                    }
1173                    log::debug!("Matched single backslash inline math at pos {}", pos);
1174                    emit_single_backslash_inline_math(builder, content);
1175                    pos += len;
1176                    text_start = pos;
1177                    continue;
1178                }
1179            }
1180
1181            // Try math environments \begin{equation}...\end{equation}
1182            if config.extensions.raw_tex
1183                && let Some((len, begin_marker, content, end_marker)) =
1184                    try_parse_math_environment(&text[pos..])
1185            {
1186                if pos > text_start {
1187                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1188                }
1189                log::debug!("Matched math environment at pos {}", pos);
1190                emit_display_math_environment(builder, begin_marker, content, end_marker);
1191                pos += len;
1192                text_start = pos;
1193                continue;
1194            }
1195
1196            // Try bookdown reference: \@ref(label)
1197            if config.extensions.bookdown_references
1198                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
1199            {
1200                if pos > text_start {
1201                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1202                }
1203                log::debug!("Matched bookdown reference at pos {}: {}", pos, label);
1204                super::citations::emit_bookdown_crossref(builder, label);
1205                pos += len;
1206                text_start = pos;
1207                continue;
1208            }
1209
1210            // Try escapes (after bookdown refs and backslash math)
1211            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
1212                let escape_enabled = match escape_type {
1213                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
1214                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
1215                    EscapeType::Literal => {
1216                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!";
1217                        BASE_ESCAPABLE.contains(ch) || config.extensions.all_symbols_escapable
1218                    }
1219                };
1220                if !escape_enabled {
1221                    // Don't treat as hard line break - skip the escape and continue
1222                    // The backslash will be included in the next TEXT token
1223                    pos += 1;
1224                    continue;
1225                }
1226
1227                // Emit accumulated text
1228                if pos > text_start {
1229                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1230                }
1231
1232                log::debug!("Matched escape at pos {}: \\{}", pos, ch);
1233                emit_escape(builder, ch, escape_type);
1234                pos += len;
1235                text_start = pos;
1236                continue;
1237            }
1238
1239            // Try LaTeX commands (after escapes, before shortcodes)
1240            if config.extensions.raw_tex
1241                && let Some(len) = try_parse_latex_command(&text[pos..])
1242            {
1243                if pos > text_start {
1244                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1245                }
1246                log::debug!("Matched LaTeX command at pos {}", pos);
1247                parse_latex_command(builder, &text[pos..], len);
1248                pos += len;
1249                text_start = pos;
1250                continue;
1251            }
1252        }
1253
1254        // Try Quarto shortcodes: {{< shortcode >}}
1255        if byte == b'{'
1256            && pos + 1 < text.len()
1257            && text.as_bytes()[pos + 1] == b'{'
1258            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
1259        {
1260            if pos > text_start {
1261                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1262            }
1263            log::debug!("Matched shortcode at pos {}: {}", pos, &name);
1264            emit_shortcode(builder, &name, attrs);
1265            pos += len;
1266            text_start = pos;
1267            continue;
1268        }
1269
1270        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
1271        if byte == b'`'
1272            && let Some(m) = try_parse_inline_executable(
1273                &text[pos..],
1274                config.extensions.rmarkdown_inline_code,
1275                config.extensions.quarto_inline_code,
1276            )
1277        {
1278            if pos > text_start {
1279                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1280            }
1281            log::debug!("Matched inline executable code at pos {}", pos);
1282            emit_inline_executable(builder, &m);
1283            pos += m.total_len;
1284            text_start = pos;
1285            continue;
1286        }
1287
1288        // Try code spans
1289        if byte == b'`'
1290            && let Some((len, content, backtick_count, attributes)) =
1291                try_parse_code_span(&text[pos..])
1292        {
1293            // Emit accumulated text
1294            if pos > text_start {
1295                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1296            }
1297
1298            log::debug!(
1299                "Matched code span at pos {}: {} backticks",
1300                pos,
1301                backtick_count
1302            );
1303
1304            // Check for raw inline
1305            if let Some(ref attrs) = attributes
1306                && config.extensions.raw_attribute
1307                && let Some(format) = is_raw_inline(attrs)
1308            {
1309                use super::raw_inline::emit_raw_inline;
1310                log::debug!("Matched raw inline span at pos {}: format={}", pos, format);
1311                emit_raw_inline(builder, content, backtick_count, format);
1312            } else if !config.extensions.inline_code_attributes && attributes.is_some() {
1313                let code_span_len = backtick_count * 2 + content.len();
1314                emit_code_span(builder, content, backtick_count, None);
1315                pos += code_span_len;
1316                text_start = pos;
1317                continue;
1318            } else {
1319                emit_code_span(builder, content, backtick_count, attributes);
1320            }
1321
1322            pos += len;
1323            text_start = pos;
1324            continue;
1325        }
1326
1327        // Try textual emoji aliases: :smile:
1328        if byte == b':'
1329            && config.extensions.emoji
1330            && is_emoji_boundary(text, pos)
1331            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
1332        {
1333            if pos > text_start {
1334                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1335            }
1336            log::debug!("Matched emoji at pos {}", pos);
1337            emit_emoji(builder, &text[pos..pos + len]);
1338            pos += len;
1339            text_start = pos;
1340            continue;
1341        }
1342
1343        // Try inline footnotes: ^[note]
1344        if byte == b'^'
1345            && pos + 1 < text.len()
1346            && text.as_bytes()[pos + 1] == b'['
1347            && config.extensions.inline_footnotes
1348            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
1349        {
1350            if pos > text_start {
1351                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1352            }
1353            log::debug!("Matched inline footnote at pos {}", pos);
1354            emit_inline_footnote(builder, content, config);
1355            pos += len;
1356            text_start = pos;
1357            continue;
1358        }
1359
1360        // Try superscript: ^text^
1361        if byte == b'^'
1362            && config.extensions.superscript
1363            && let Some((len, content)) = try_parse_superscript(&text[pos..])
1364        {
1365            if pos > text_start {
1366                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1367            }
1368            log::debug!("Matched superscript at pos {}", pos);
1369            emit_superscript(builder, content, config);
1370            pos += len;
1371            text_start = pos;
1372            continue;
1373        }
1374
1375        // Try bookdown definition: (\#label) or (ref:label)
1376        if byte == b'(' && config.extensions.bookdown_references {
1377            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
1378                if pos > text_start {
1379                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1380                }
1381                log::debug!("Matched bookdown definition at pos {}: {}", pos, label);
1382                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1383                pos += len;
1384                text_start = pos;
1385                continue;
1386            }
1387            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
1388                if pos > text_start {
1389                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1390                }
1391                log::debug!("Matched bookdown text reference at pos {}: {}", pos, label);
1392                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1393                pos += len;
1394                text_start = pos;
1395                continue;
1396            }
1397        }
1398
1399        // Try subscript: ~text~
1400        if byte == b'~'
1401            && config.extensions.subscript
1402            && let Some((len, content)) = try_parse_subscript(&text[pos..])
1403        {
1404            if pos > text_start {
1405                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1406            }
1407            log::debug!("Matched subscript at pos {}", pos);
1408            emit_subscript(builder, content, config);
1409            pos += len;
1410            text_start = pos;
1411            continue;
1412        }
1413
1414        // Try strikeout: ~~text~~
1415        if byte == b'~'
1416            && config.extensions.strikeout
1417            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
1418        {
1419            if pos > text_start {
1420                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1421            }
1422            log::debug!("Matched strikeout at pos {}", pos);
1423            emit_strikeout(builder, content, config);
1424            pos += len;
1425            text_start = pos;
1426            continue;
1427        }
1428
1429        // Try mark/highlight: ==text==
1430        if byte == b'='
1431            && config.extensions.mark
1432            && let Some((len, content)) = try_parse_mark(&text[pos..])
1433        {
1434            if pos > text_start {
1435                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1436            }
1437            log::debug!("Matched mark at pos {}", pos);
1438            emit_mark(builder, content, config);
1439            pos += len;
1440            text_start = pos;
1441            continue;
1442        }
1443
1444        // Try GFM inline math: $`...`$
1445        if byte == b'$'
1446            && config.extensions.tex_math_gfm
1447            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
1448        {
1449            if pos > text_start {
1450                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1451            }
1452            log::debug!("Matched GFM inline math at pos {}", pos);
1453            emit_gfm_inline_math(builder, content);
1454            pos += len;
1455            text_start = pos;
1456            continue;
1457        }
1458
1459        // Try math ($...$, $$...$$)
1460        if byte == b'$' && config.extensions.tex_math_dollars {
1461            // Try display math first ($$...$$)
1462            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
1463                // Emit accumulated text
1464                if pos > text_start {
1465                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1466                }
1467
1468                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
1469                log::debug!(
1470                    "Matched display math at pos {}: {} dollars",
1471                    pos,
1472                    dollar_count
1473                );
1474
1475                // Check for trailing attributes (Quarto cross-reference support)
1476                let after_math = &text[pos + len..];
1477                let attr_len = if config.extensions.quarto_crossrefs {
1478                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
1479                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
1480                        let trimmed_after = after_math.trim_start();
1481                        if let Some(open_brace_pos) = trimmed_after.find('{') {
1482                            let ws_before_brace = after_math.len() - trimmed_after.len();
1483                            let attr_text_len = trimmed_after[open_brace_pos..]
1484                                .find('}')
1485                                .map(|close| close + 1)
1486                                .unwrap_or(0);
1487                            ws_before_brace + open_brace_pos + attr_text_len
1488                        } else {
1489                            0
1490                        }
1491                    } else {
1492                        0
1493                    }
1494                } else {
1495                    0
1496                };
1497
1498                let total_len = len + attr_len;
1499                emit_display_math(builder, content, dollar_count);
1500
1501                // Emit attributes if present
1502                if attr_len > 0 {
1503                    use crate::parser::utils::attributes::{
1504                        emit_attributes, try_parse_trailing_attributes,
1505                    };
1506                    let attr_text = &text[pos + len..pos + total_len];
1507                    if let Some((attr_block, _text_before)) =
1508                        try_parse_trailing_attributes(attr_text)
1509                    {
1510                        let trimmed_after = attr_text.trim_start();
1511                        let ws_len = attr_text.len() - trimmed_after.len();
1512                        if ws_len > 0 {
1513                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1514                        }
1515                        emit_attributes(builder, &attr_block);
1516                    }
1517                }
1518
1519                pos += total_len;
1520                text_start = pos;
1521                continue;
1522            }
1523
1524            // Try inline math ($...$)
1525            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1526                // Emit accumulated text
1527                if pos > text_start {
1528                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1529                }
1530
1531                log::debug!("Matched inline math at pos {}", pos);
1532                emit_inline_math(builder, content);
1533                pos += len;
1534                text_start = pos;
1535                continue;
1536            }
1537
1538            // Neither display nor inline math matched - emit the $ as literal text
1539            // This ensures each $ gets its own TEXT token for CST compatibility
1540            if pos > text_start {
1541                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1542            }
1543            builder.token(SyntaxKind::TEXT.into(), "$");
1544            pos += 1;
1545            text_start = pos;
1546            continue;
1547        }
1548
1549        // Try autolinks: <url> or <email>
1550        if byte == b'<'
1551            && config.extensions.autolinks
1552            && let Some((len, url)) = try_parse_autolink(&text[pos..])
1553        {
1554            if pos > text_start {
1555                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1556            }
1557            log::debug!("Matched autolink at pos {}", pos);
1558            emit_autolink(builder, &text[pos..pos + len], url);
1559            pos += len;
1560            text_start = pos;
1561            continue;
1562        }
1563
1564        if config.extensions.autolink_bare_uris
1565            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1566        {
1567            if pos > text_start {
1568                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1569            }
1570            log::debug!("Matched bare URI at pos {}", pos);
1571            emit_bare_uri_link(builder, url, config);
1572            pos += len;
1573            text_start = pos;
1574            continue;
1575        }
1576
1577        // Try native spans: <span>text</span> (after autolink since both start with <)
1578        if byte == b'<'
1579            && config.extensions.native_spans
1580            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1581        {
1582            if pos > text_start {
1583                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1584            }
1585            log::debug!("Matched native span at pos {}", pos);
1586            emit_native_span(builder, content, &attributes, config);
1587            pos += len;
1588            text_start = pos;
1589            continue;
1590        }
1591
1592        // Images and links - process in order: inline image, reference image, footnote ref, inline link, reference link
1593        if byte == b'!' && pos + 1 < text.len() && text.as_bytes()[pos + 1] == b'[' {
1594            // Try inline image: ![alt](url)
1595            if let Some((len, alt_text, dest, attributes)) = try_parse_inline_image(&text[pos..]) {
1596                if pos > text_start {
1597                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1598                }
1599                log::debug!("Matched inline image at pos {}", pos);
1600                emit_inline_image(
1601                    builder,
1602                    &text[pos..pos + len],
1603                    alt_text,
1604                    dest,
1605                    attributes,
1606                    config,
1607                );
1608                pos += len;
1609                text_start = pos;
1610                continue;
1611            }
1612
1613            // Try reference image: ![alt][ref] or ![alt]
1614            if config.extensions.reference_links {
1615                let allow_shortcut = config.extensions.shortcut_reference_links;
1616                if let Some((len, alt_text, reference, is_implicit)) =
1617                    try_parse_reference_image(&text[pos..], allow_shortcut)
1618                {
1619                    if pos > text_start {
1620                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1621                    }
1622                    log::debug!("Matched reference image at pos {}", pos);
1623                    emit_reference_image(builder, alt_text, &reference, is_implicit, config);
1624                    pos += len;
1625                    text_start = pos;
1626                    continue;
1627                }
1628            }
1629        }
1630
1631        // Process bracket-starting elements
1632        if byte == b'[' {
1633            // Try footnote reference: [^id]
1634            if config.extensions.footnotes
1635                && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1636            {
1637                if pos > text_start {
1638                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1639                }
1640                log::debug!("Matched footnote reference at pos {}", pos);
1641                emit_footnote_reference(builder, &id);
1642                pos += len;
1643                text_start = pos;
1644                continue;
1645            }
1646
1647            // Try inline link: [text](url)
1648            if config.extensions.inline_links
1649                && let Some((len, link_text, dest, attributes)) =
1650                    try_parse_inline_link(&text[pos..])
1651            {
1652                if pos > text_start {
1653                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1654                }
1655                log::debug!("Matched inline link at pos {}", pos);
1656                emit_inline_link(
1657                    builder,
1658                    &text[pos..pos + len],
1659                    link_text,
1660                    dest,
1661                    attributes,
1662                    config,
1663                );
1664                pos += len;
1665                text_start = pos;
1666                continue;
1667            }
1668
1669            // Try reference link: [text][ref] or [text]
1670            if config.extensions.reference_links {
1671                let allow_shortcut = config.extensions.shortcut_reference_links;
1672                if let Some((len, link_text, reference, is_implicit)) =
1673                    try_parse_reference_link(&text[pos..], allow_shortcut)
1674                {
1675                    if pos > text_start {
1676                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1677                    }
1678                    log::debug!("Matched reference link at pos {}", pos);
1679                    emit_reference_link(builder, link_text, &reference, is_implicit, config);
1680                    pos += len;
1681                    text_start = pos;
1682                    continue;
1683                }
1684            }
1685
1686            // Try bracketed citation: [@cite]
1687            if config.extensions.citations
1688                && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1689            {
1690                if pos > text_start {
1691                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1692                }
1693                log::debug!("Matched bracketed citation at pos {}", pos);
1694                emit_bracketed_citation(builder, content);
1695                pos += len;
1696                text_start = pos;
1697                continue;
1698            }
1699        }
1700
1701        // Try bracketed spans: [text]{.class}
1702        // Must come after links/citations
1703        if byte == b'['
1704            && config.extensions.bracketed_spans
1705            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1706        {
1707            if pos > text_start {
1708                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1709            }
1710            log::debug!("Matched bracketed span at pos {}", pos);
1711            emit_bracketed_span(builder, &text_content, &attrs, config);
1712            pos += len;
1713            text_start = pos;
1714            continue;
1715        }
1716
1717        // Try bare citation: @cite (must come after bracketed elements)
1718        if byte == b'@'
1719            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1720            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1721        {
1722            let is_crossref =
1723                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1724            if is_crossref || config.extensions.citations {
1725                if pos > text_start {
1726                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1727                }
1728                if is_crossref {
1729                    log::debug!("Matched Quarto crossref at pos {}: {}", pos, &key);
1730                    super::citations::emit_crossref(builder, key, has_suppress);
1731                } else {
1732                    log::debug!("Matched bare citation at pos {}: {}", pos, &key);
1733                    emit_bare_citation(builder, key, has_suppress);
1734                }
1735                pos += len;
1736                text_start = pos;
1737                continue;
1738            }
1739        }
1740
1741        // Try suppress-author citation: -@cite
1742        if byte == b'-'
1743            && pos + 1 < text.len()
1744            && text.as_bytes()[pos + 1] == b'@'
1745            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1746            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1747        {
1748            let is_crossref =
1749                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1750            if is_crossref || config.extensions.citations {
1751                if pos > text_start {
1752                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1753                }
1754                if is_crossref {
1755                    log::debug!("Matched Quarto crossref at pos {}: {}", pos, &key);
1756                    super::citations::emit_crossref(builder, key, has_suppress);
1757                } else {
1758                    log::debug!("Matched suppress-author citation at pos {}: {}", pos, &key);
1759                    emit_bare_citation(builder, key, has_suppress);
1760                }
1761                pos += len;
1762                text_start = pos;
1763                continue;
1764            }
1765        }
1766
1767        // Try to parse emphasis at this position
1768        if byte == b'*' || byte == b'_' {
1769            // Count the delimiter run to avoid re-parsing
1770            let bytes = text.as_bytes();
1771            let mut delim_count = 0;
1772            while pos + delim_count < bytes.len() && bytes[pos + delim_count] == byte {
1773                delim_count += 1;
1774            }
1775
1776            // Emit any accumulated text before the delimiter
1777            if pos > text_start {
1778                log::debug!(
1779                    "Emitting TEXT before delimiter: {:?}",
1780                    &text[text_start..pos]
1781                );
1782                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1783                text_start = pos; // Update text_start after emission
1784            }
1785
1786            // Try to parse emphasis
1787            // Use nested variant (bypass opener validity) when in nested context
1788            let emphasis_result = if nested_emphasis {
1789                try_parse_emphasis_nested(text, pos, end, config, builder)
1790            } else {
1791                try_parse_emphasis(text, pos, end, config, builder)
1792            };
1793
1794            if let Some((consumed, _)) = emphasis_result {
1795                // Successfully parsed emphasis
1796                log::debug!(
1797                    "Parsed emphasis, consumed {} bytes from pos {}",
1798                    consumed,
1799                    pos
1800                );
1801                pos += consumed;
1802                text_start = pos;
1803            } else {
1804                // Failed to parse, delimiter run will be treated as regular text
1805                // Skip the ENTIRE delimiter run to avoid re-parsing parts of it
1806                log::debug!(
1807                    "Failed to parse emphasis at pos {}, skipping {} delimiters as literal",
1808                    pos,
1809                    delim_count
1810                );
1811                pos += delim_count;
1812                // DON'T update text_start - let the delimiters accumulate
1813            }
1814            continue;
1815        }
1816
1817        // Check for newlines - may need to emit as hard line break
1818        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1819            let text_before = &text[text_start..pos];
1820
1821            // Check for trailing spaces hard line break (always enabled in Pandoc)
1822            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1823            if trailing_spaces >= 2 {
1824                // Emit text before the trailing spaces
1825                let text_content = &text_before[..text_before.len() - trailing_spaces];
1826                if !text_content.is_empty() {
1827                    builder.token(SyntaxKind::TEXT.into(), text_content);
1828                }
1829                let spaces = " ".repeat(trailing_spaces);
1830                builder.token(
1831                    SyntaxKind::HARD_LINE_BREAK.into(),
1832                    &format!("{}\r\n", spaces),
1833                );
1834                pos += 2;
1835                text_start = pos;
1836                continue;
1837            }
1838
1839            // hard_line_breaks: treat all single newlines as hard line breaks
1840            if config.extensions.hard_line_breaks {
1841                if !text_before.is_empty() {
1842                    builder.token(SyntaxKind::TEXT.into(), text_before);
1843                }
1844                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1845                pos += 2;
1846                text_start = pos;
1847                continue;
1848            }
1849
1850            // Regular newline
1851            if !text_before.is_empty() {
1852                builder.token(SyntaxKind::TEXT.into(), text_before);
1853            }
1854            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1855            pos += 2;
1856            text_start = pos;
1857            continue;
1858        }
1859
1860        if byte == b'\n' {
1861            let text_before = &text[text_start..pos];
1862
1863            // Check for trailing spaces hard line break (always enabled in Pandoc)
1864            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1865            if trailing_spaces >= 2 {
1866                // Emit text before the trailing spaces
1867                let text_content = &text_before[..text_before.len() - trailing_spaces];
1868                if !text_content.is_empty() {
1869                    builder.token(SyntaxKind::TEXT.into(), text_content);
1870                }
1871                let spaces = " ".repeat(trailing_spaces);
1872                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1873                pos += 1;
1874                text_start = pos;
1875                continue;
1876            }
1877
1878            // hard_line_breaks: treat all single newlines as hard line breaks
1879            if config.extensions.hard_line_breaks {
1880                if !text_before.is_empty() {
1881                    builder.token(SyntaxKind::TEXT.into(), text_before);
1882                }
1883                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1884                pos += 1;
1885                text_start = pos;
1886                continue;
1887            }
1888
1889            // Regular newline
1890            if !text_before.is_empty() {
1891                builder.token(SyntaxKind::TEXT.into(), text_before);
1892            }
1893            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1894            pos += 1;
1895            text_start = pos;
1896            continue;
1897        }
1898
1899        // Regular character, keep accumulating
1900        pos += 1;
1901    }
1902
1903    // Emit any remaining text
1904    if pos > text_start && text_start < end {
1905        log::debug!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1906        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1907    }
1908
1909    log::debug!("parse_inline_range complete: start={}, end={}", start, end);
1910}
1911
#[cfg(test)]
mod tests {
    // Unit tests for the inline parser entry points defined in this file.
    // Every test checks losslessness (the tree's text equals the input) in
    // addition to the structural property it is named after.

    use super::*;
    use crate::syntax::{SyntaxKind, SyntaxNode};
    use rowan::GreenNode;

    /// Finishes `builder` and wraps the resulting green tree in a root `SyntaxNode`.
    fn finish_tree(builder: GreenNodeBuilder<'_>) -> SyntaxNode {
        let green: GreenNode = builder.finish();
        SyntaxNode::new_root(green)
    }

    /// Returns `true` if `root` itself or any node below it has the given `kind`.
    fn has_kind(root: &SyntaxNode, kind: SyntaxKind) -> bool {
        root.descendants().any(|n| n.kind() == kind)
    }

    /// Returns `true` if some `outer` node has an `inner` node strictly below it.
    ///
    /// `descendants()` starts with the node it is called on, so that first
    /// element is skipped to make the check mean "nested inside" rather than
    /// "is, or is nested inside".
    fn has_nested(root: &SyntaxNode, outer: SyntaxKind, inner: SyntaxKind) -> bool {
        root.descendants()
            .filter(|n| n.kind() == outer)
            .any(|o| o.descendants().skip(1).any(|d| d.kind() == inner))
    }

    /// `*test*` through the text entry point yields an EMPHASIS node.
    #[test]
    fn test_recursive_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        parse_inline_text_recursive(&mut builder, text, &config);

        let node = finish_tree(builder);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS node
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
    }

    /// `*foo **bar** baz*` through the text entry point yields both EMPHASIS and STRONG.
    #[test]
    fn test_recursive_nested() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Wrap in a PARAGRAPH node (inline content needs a parent)
        builder.start_node(SyntaxKind::PARAGRAPH.into());
        parse_inline_text_recursive(&mut builder, text, &config);
        builder.finish_node();

        let node = finish_tree(builder);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPHASIS and STRONG
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test that we can parse a simple emphasis case
    #[test]
    fn test_parse_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Try to parse emphasis at position 0
        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);

        // Should successfully parse
        assert_eq!(result, Some((6, 1))); // Consumed all 6 bytes, delimiter count 1

        // Check the generated CST
        let node = finish_tree(builder);

        // The root IS the EMPHASIS node
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);

        // Verify losslessness: CST text should match input
        assert_eq!(node.text().to_string(), text);
    }

    /// Test parsing nested emphasis/strong
    #[test]
    fn test_parse_nested_emphasis_strong() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Parse the whole range
        parse_inline_range(text, 0, text.len(), &config, &mut builder);

        let node = finish_tree(builder);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS and STRONG nodes
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo* bar**
    /// Expected: Strong[Emph[foo], bar]
    ///
    /// An earlier note on this test recorded a mis-parse as
    /// `*` + Strong[foo* bar]; that shape has no EMPHASIS node and is
    /// rejected by the assertions below.
    #[test]
    fn test_triple_emphasis_star_then_double_star() {
        let text = "***foo* bar**";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        builder.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_range(text, 0, text.len(), &config, &mut builder);
        builder.finish_node();

        let node = finish_tree(builder);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug dump, used only to make assertion failures readable.
        let structure = format!("{:#?}", node);

        // Should have both STRONG and EMPH
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );

        // STRONG should be outer, EMPH should be inner. This checks real
        // containment: merely appearing later in a pre-order walk would also
        // be true of a sibling EMPHASIS that follows the STRONG node.
        assert!(
            has_nested(&node, SyntaxKind::STRONG, SyntaxKind::EMPHASIS),
            "EMPH should be inside STRONG, not before it. Current structure:\n{}",
            structure
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo** bar*
    /// Expected: Emph[Strong[foo], bar]
    #[test]
    fn test_triple_emphasis_double_star_then_star() {
        let text = "***foo** bar*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        builder.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_range(text, 0, text.len(), &config, &mut builder);
        builder.finish_node();

        let node = finish_tree(builder);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug dump, used only to make assertion failures readable.
        let structure = format!("{:#?}", node);

        // Should have both EMPH and STRONG
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );

        // EMPH should be outer, STRONG should be inner (real containment,
        // not just pre-order position).
        assert!(
            has_nested(&node, SyntaxKind::EMPHASIS, SyntaxKind::STRONG),
            "STRONG should be inside EMPH. Current structure:\n{}",
            structure
        );
    }

    /// Test that display math with attributes parses correctly
    /// Regression test for equation_attributes_single_line golden test
    #[test]
    fn test_display_math_with_attributes() {
        let text = "$$ E = mc^2 $$ {#eq-einstein}";
        let mut config = ParserOptions::default();
        config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

        let mut builder = GreenNodeBuilder::new();
        builder.start_node(SyntaxKind::DOCUMENT.into()); // Inline content needs a parent node

        // Parse the whole text
        parse_inline_text_recursive(&mut builder, text, &config);

        builder.finish_node(); // Finish DOCUMENT
        let node = finish_tree(builder);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have DISPLAY_MATH node
        assert!(
            has_kind(&node, SyntaxKind::DISPLAY_MATH),
            "Should have DISPLAY_MATH node"
        );

        // Should have ATTRIBUTE node
        assert!(
            has_kind(&node, SyntaxKind::ATTRIBUTE),
            "Should have ATTRIBUTE node for {{#eq-einstein}}"
        );

        // Attributes should not be TEXT. TEXT is emitted as a token, so the
        // element following DISPLAY_MATH must be inspected with the
        // node-or-token API; `next_sibling()` only yields nodes and would
        // never see a TEXT token.
        let math_followed_by_text = node
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
            .filter_map(|math| math.next_sibling_or_token())
            .filter_map(|element| element.into_token())
            .any(|tok| tok.kind() == SyntaxKind::TEXT && tok.text().contains("{#eq-einstein}"));
        assert!(
            !math_followed_by_text,
            "Attributes should not be parsed as TEXT"
        );
    }
}
2173
/// `**bold with *italic***` must come out as Strong["bold with ", Emph["italic"]]:
/// the trailing `***` splits into `*` (closing the Emph) and `**` (closing the Strong).
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "**bold with *italic***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // No wrapper node is opened here; the parser output is used as the tree as-is.
    let input_end = input.len();
    parse_inline_range(input, 0, input_end, &options, &mut builder);

    let root = SyntaxNode::new_root(builder.finish());

    // Round-trip: the tree must reproduce the input exactly.
    assert_eq!(root.text().to_string(), input, "Should be lossless");

    // The outermost construct is the strong emphasis.
    let root_kind = root.kind();
    assert_eq!(
        root_kind,
        SyntaxKind::STRONG,
        "Root should be STRONG, got: {:?}",
        root_kind
    );

    // ...and the emphasis sits directly beneath it.
    let emphasis_is_child = root
        .children()
        .map(|child| child.kind())
        .any(|kind| kind == SyntaxKind::EMPHASIS);
    assert!(emphasis_is_child, "STRONG should contain EMPHASIS node");
}
2209
/// `*foo *` is emphasis under Pandoc rules: an asterisk closer is accepted
/// even when it is preceded by a space.
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "*foo *";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Attempt an emphasis parse starting at the first byte.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut builder);

    // All 6 bytes consumed, with a delimiter count of 1.
    let expected = Some((6, 1));
    assert_eq!(
        outcome, expected,
        "Should parse as emphasis, result: {:?}",
        outcome
    );

    // Inspect the tree that the successful parse produced.
    let root = SyntaxNode::new_root(builder.finish());

    // The emphasis node is itself the root.
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2244
/// `***foo** bar **baz***` must come out as Emph[Strong[foo], " bar ", Strong[baz]]
/// (the shape the original test comment says Pandoc produces).
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "***foo** bar **baz***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    parse_inline_range(input, 0, input.len(), &options, &mut builder);

    let root = SyntaxNode::new_root(builder.finish());

    // Exactly one EMPHASIS node anywhere in the tree (the root counts too).
    let all_emphasis: Vec<SyntaxNode> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    let emphasis_total = all_emphasis.len();
    assert_eq!(
        emphasis_total, 1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphasis_total
    );

    // That EMPHASIS node has two STRONG nodes as direct children.
    let outer_emphasis = &all_emphasis[0];
    let strong_children = outer_emphasis
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_children, 2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_children
    );

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2291
/// `***foo* bar *baz***` must come out as Strong[Emph[foo], " bar ", Emph[baz]]
/// (the shape the original test comment says Pandoc produces).
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "***foo* bar *baz***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    parse_inline_range(input, 0, input.len(), &options, &mut builder);

    let root = SyntaxNode::new_root(builder.finish());

    // Exactly one STRONG node anywhere in the tree (the root counts too).
    let all_strong: Vec<SyntaxNode> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    let strong_total = all_strong.len();
    assert_eq!(
        strong_total, 1,
        "Should have exactly one STRONG node, found: {}",
        strong_total
    );

    // That STRONG node has two EMPHASIS nodes as direct children.
    let outer_strong = &all_strong[0];
    let emphasis_children = outer_strong
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_children, 2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_children
    );

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2338
2339// Multiline emphasis tests
/// Single-delimiter emphasis may span a line break: the newline stays inside
/// the EMPHASIS node as part of its content.
#[test]
fn test_parse_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let text = "*text on\nline two*";
    let options = ParserOptions::default();
    let mut tree_builder = GreenNodeBuilder::new();

    // The parser must consume the whole input and report a delimiter count of 1.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut tree_builder);
    assert_eq!(
        outcome,
        Some((text.len(), 1)),
        "Emphasis should parse multiline content"
    );

    // The node emitted into the builder becomes the root of the finished tree.
    let root = SyntaxNode::new_root(tree_builder.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Lossless round-trip, with the line break still present.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
2374
/// Double-delimiter (strong) emphasis may span a line break: the newline stays
/// inside the STRONG node as part of its content.
#[test]
fn test_parse_strong_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let text = "**strong on\nline two**";
    let options = ParserOptions::default();
    let mut tree_builder = GreenNodeBuilder::new();

    // The parser must consume the whole input and report a delimiter count of 2.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut tree_builder);
    assert_eq!(
        outcome,
        Some((text.len(), 2)),
        "Strong emphasis should parse multiline content"
    );

    // The node emitted into the builder becomes the root of the finished tree.
    let root = SyntaxNode::new_root(tree_builder.finish());
    assert_eq!(root.kind(), SyntaxKind::STRONG);

    // Lossless round-trip, with the line break still present.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in strong content"
    );
}
2409
/// Triple-delimiter emphasis may span a line break; the resulting tree must
/// contain a STRONG node and keep the newline in its text.
#[test]
fn test_parse_triple_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let text = "***both on\nline two***";
    let options = ParserOptions::default();
    let mut tree_builder = GreenNodeBuilder::new();

    // The parser must consume the whole input and report a delimiter count of 3.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut tree_builder);
    assert_eq!(
        outcome,
        Some((text.len(), 3)),
        "Triple emphasis should parse multiline content"
    );

    let root = SyntaxNode::new_root(tree_builder.finish());

    // Triple delimiters combine strong + emph, so a STRONG node must exist somewhere.
    assert!(
        root.descendants()
            .any(|candidate| candidate.kind() == SyntaxKind::STRONG),
        "Should have STRONG node"
    );

    // Lossless round-trip, with the line break still present.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}
panache_parser/parser/inlines/core.rs

panache_parser/parser/inlines/
core.rs