panache_parser/parser/inlines/
core.rs

1//! Recursive emphasis parsing using Pandoc's algorithm.
2//!
3//! This module implements emphasis/strong emphasis parsing using a recursive
4//! descent approach based on Pandoc's Haskell implementation in
5//! `Readers/Markdown.hs:L1662-L1722`.
6//!
7//! **Key algorithm**: Left-to-right, greedy, first-match wins
8//! 1. Parse text left-to-right
9//! 2. When we see delimiters, try to parse emphasis (look for matching closer)
10//! 3. If successful, emit emphasis node and continue from after closer
11//! 4. If failed (no closer found), emit delimiter as literal and continue
12//! 5. Nested emphasis is handled naturally by recursive parsing of content
13//!
14//! **Example**: `*foo **bar* baz**`
15//! - See `*`, try to parse EMPH
16//! - Parse content: see `**`, try to parse STRONG
17//! - STRONG finds closer `**` at end → succeeds, emits STRONG[bar* baz]
18//! - Outer `*` can't find closer (all delimiters consumed) → fails, emits `*foo` as literal
19//! - Result: `*foo` + STRONG[bar* baz]
20//!
21//! This matches Pandoc's behavior exactly.
22
23use crate::options::ParserOptions;
24use crate::syntax::SyntaxKind;
25use rowan::GreenNodeBuilder;
26
27// Import inline element parsers from sibling modules
28use super::bookdown::{
29    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
30};
31use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
32use super::citations::{
33    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
34    try_parse_bracketed_citation,
35};
36use super::code_spans::{emit_code_span, try_parse_code_span};
37use super::emoji::{emit_emoji, try_parse_emoji};
38use super::escapes::{EscapeType, emit_escape, try_parse_escape};
39use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
40use super::inline_footnotes::{
41    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
42    try_parse_inline_footnote,
43};
44use super::latex::{parse_latex_command, try_parse_latex_command};
45use super::links::{
46    emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link, emit_reference_image,
47    emit_reference_link, try_parse_autolink, try_parse_bare_uri, try_parse_inline_image,
48    try_parse_inline_link, try_parse_reference_image, try_parse_reference_link,
49};
50use super::mark::{emit_mark, try_parse_mark};
51use super::math::{
52    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
53    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
54    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
55    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
56    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
57    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
58};
59use super::native_spans::{emit_native_span, try_parse_native_span};
60use super::raw_inline::is_raw_inline;
61use super::shortcodes::{emit_shortcode, try_parse_shortcode};
62use super::strikeout::{emit_strikeout, try_parse_strikeout};
63use super::subscript::{emit_subscript, try_parse_subscript};
64use super::superscript::{emit_superscript, try_parse_superscript};
65
66/// Parse inline text using the recursive emphasis algorithm.
67///
68/// This is the main entry point for parsing inline content with Pandoc-style
69/// recursive emphasis handling. It uses a greedy left-to-right, first-match-wins
70/// approach that matches Pandoc's behavior exactly.
71///
72/// **Algorithm**:
73/// 1. Parse text left-to-right trying each inline element type in precedence order
74/// 2. When we see `*` or `_`, try to parse emphasis recursively
75/// 3. Nested emphasis naturally consumes delimiters before outer matches
76/// 4. All inline elements (code, links, math, etc.) are parsed on-the-fly
77///
78/// # Arguments
79/// * `text` - The inline text to parse
80/// * `config` - Configuration for extensions and formatting
81/// * `builder` - The CST builder to emit nodes to
82pub fn parse_inline_text_recursive(
83    builder: &mut GreenNodeBuilder,
84    text: &str,
85    config: &ParserOptions,
86) {
87    log::trace!(
88        "Recursive inline parsing: {:?} ({} bytes)",
89        &text[..text.len().min(40)],
90        text.len()
91    );
92
93    parse_inline_range(text, 0, text.len(), config, builder);
94
95    log::trace!("Recursive inline parsing complete");
96}
97
98/// Parse inline elements from text content.
99/// This is a standalone function used for recursive inline parsing within blocks.
100///
101/// The `allow_reference_links` parameter is accepted for compatibility but not currently used.
102/// Set to `false` in nested contexts (inside link text, image alt, spans) to prevent recursive parsing.
103pub fn parse_inline_text(
104    builder: &mut GreenNodeBuilder,
105    text: &str,
106    config: &ParserOptions,
107    _allow_reference_links: bool,
108) {
109    log::trace!(
110        "Parsing inline text (recursive): {:?} ({} bytes)",
111        &text[..text.len().min(40)],
112        text.len()
113    );
114
115    // Use recursive parsing with Pandoc's algorithm for emphasis
116    parse_inline_text_recursive(builder, text, config);
117}
118
119/// Try to parse emphasis starting at the given position.
120///
121/// This is the entry point for recursive emphasis parsing, equivalent to
122/// Pandoc's `enclosure` function.
123///
124/// Returns Some((bytes_consumed, delim_count)) if emphasis was successfully parsed,
125/// or None if the delimiter should be treated as literal text.
126/// When returning None, the delim_count tells the caller how many delimiter
127/// characters to skip (to avoid re-parsing parts of a failed delimiter run).
128///
129/// # Arguments
130/// * `text` - The full text being parsed
131/// * `pos` - Current position in text (where the delimiter starts)
132/// * `end` - End boundary (don't search for closers beyond this)
133/// * `config` - Configuration
134/// * `builder` - CST builder
135///
136/// **Algorithm**:
137/// 1. Count opening delimiters
138/// 2. Check if followed by whitespace (if so, return None)
139/// 3. Dispatch to parse_one/two/three based on count
140/// 4. Those functions parse content and look for matching closer (within bounds)
141/// 5. If closer found, emit node and return bytes consumed
142/// 6. If not found, return None with delimiter count (caller skips entire run)
143pub fn try_parse_emphasis(
144    text: &str,
145    pos: usize,
146    end: usize,
147    config: &ParserOptions,
148    builder: &mut GreenNodeBuilder,
149) -> Option<(usize, usize)> {
150    let bytes = text.as_bytes();
151
152    if pos >= bytes.len() {
153        return None;
154    }
155
156    let delim_char = bytes[pos] as char;
157    if delim_char != '*' && delim_char != '_' {
158        return None;
159    }
160
161    // Count consecutive delimiters
162    let mut count = 0;
163    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
164        count += 1;
165    }
166
167    let after_pos = pos + count;
168
169    log::trace!(
170        "try_parse_emphasis: '{}' x {} at pos {}",
171        delim_char,
172        count,
173        pos
174    );
175
176    // Check if followed by whitespace (Pandoc rule: treat as literal)
177    if after_pos < text.len()
178        && let Some(next_char) = text[after_pos..].chars().next()
179        && next_char.is_whitespace()
180    {
181        log::trace!("Delimiter followed by whitespace, treating as literal");
182        return None;
183    }
184
185    // For underscores: check intraword_underscores extension (Pandoc lines 1668-1672)
186    // Can't open if preceded by alphanumeric (prevents foo_bar from parsing)
187    if delim_char == '_'
188        && pos > 0
189        && let Some(prev_char) = text[..pos].chars().last()
190        && prev_char.is_alphanumeric()
191    {
192        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
193        return None;
194    }
195
196    // Dispatch based on delimiter count
197    let result = match count {
198        1 => try_parse_one(text, pos, delim_char, end, config, builder),
199        2 => try_parse_two(text, pos, delim_char, end, config, builder),
200        3 => try_parse_three(text, pos, delim_char, end, config, builder),
201        _ => {
202            // 4+ delimiters: treat as literal (Pandoc behavior)
203            log::trace!("{} delimiters (4+), treating as literal", count);
204            None
205        }
206    };
207
208    // If parsing succeeded, return (bytes_consumed, delim_count)
209    // If failed, return None but the caller will know to skip `count` delimiters
210    result.map(|consumed| (consumed, count))
211}
212
213/// Try to parse emphasis in a nested context (bypassing opener validity checks).
214///
215/// This mirrors Pandoc's behavior where `one` can call `two c mempty` directly,
216/// bypassing the `enclosure` opener validity checks. This is needed because
217/// patterns like `***foo **bar** baz***` require `**` followed by space to be
218/// parsed as a nested strong opener.
219///
220/// Returns Some((bytes_consumed, delim_count)) if successful, None otherwise.
221fn try_parse_emphasis_nested(
222    text: &str,
223    pos: usize,
224    end: usize,
225    config: &ParserOptions,
226    builder: &mut GreenNodeBuilder,
227) -> Option<(usize, usize)> {
228    let bytes = text.as_bytes();
229
230    if pos >= bytes.len() {
231        return None;
232    }
233
234    let delim_char = bytes[pos] as char;
235    if delim_char != '*' && delim_char != '_' {
236        return None;
237    }
238
239    // Count consecutive delimiters
240    let mut count = 0;
241    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
242        count += 1;
243    }
244
245    log::trace!(
246        "try_parse_emphasis_nested: '{}' x {} at pos {}",
247        delim_char,
248        count,
249        pos
250    );
251
252    // For underscores: still check intraword_underscores (prevents foo_bar parsing)
253    // This check applies even in nested contexts
254    if delim_char == '_'
255        && pos > 0
256        && let Some(prev_char) = text[..pos].chars().last()
257        && prev_char.is_alphanumeric()
258    {
259        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
260        return None;
261    }
262
263    // NOTE: We intentionally skip the "delimiter followed by whitespace" check here.
264    // In nested contexts (inside `one` calling `two`), Pandoc allows openers
265    // followed by whitespace because the opener has already been matched.
266
267    // Dispatch based on delimiter count
268    let result = match count {
269        1 => try_parse_one(text, pos, delim_char, end, config, builder),
270        2 => try_parse_two(text, pos, delim_char, end, config, builder),
271        3 => try_parse_three(text, pos, delim_char, end, config, builder),
272        _ => {
273            // 4+ delimiters: treat as literal (Pandoc behavior)
274            log::trace!("{} delimiters (4+), treating as literal", count);
275            None
276        }
277    };
278
279    result.map(|consumed| (consumed, count))
280}
281
282/// Try to parse emphasis with *** opening delimiter.
283///
284/// Tries to match closers in order: *** → ** → *
285/// Returns Some(bytes_consumed) if successful, None otherwise.
286fn try_parse_three(
287    text: &str,
288    pos: usize,
289    delim_char: char,
290    end: usize,
291    config: &ParserOptions,
292    builder: &mut GreenNodeBuilder,
293) -> Option<usize> {
294    let content_start = pos + 3;
295    let one = delim_char.to_string();
296    let two = one.repeat(2);
297
298    log::trace!("try_parse_three: '{}' x 3 at pos {}", delim_char, pos);
299
300    // Pandoc algorithm (line 1695): Parse content UNTIL we see a VALID ender
301    // We loop through potential enders, checking if each is valid.
302    // Invalid enders (like `**` preceded by whitespace) are skipped.
303    let mut search_pos = content_start;
304
305    loop {
306        // Find next potential ender
307        let closer_start = match find_first_potential_ender(text, search_pos, delim_char, end) {
308            Some(p) => p,
309            None => {
310                log::trace!("No potential ender found for ***");
311                return None;
312            }
313        };
314
315        log::trace!("Potential ender at pos {}", closer_start);
316
317        // Count how many delimiters we have at closer_start
318        let bytes = text.as_bytes();
319        let mut closer_count = 0;
320        let mut check_pos = closer_start;
321        while check_pos < bytes.len() && bytes[check_pos] == delim_char as u8 {
322            closer_count += 1;
323            check_pos += 1;
324        }
325
326        log::trace!(
327            "Found {} x {} at pos {}",
328            delim_char,
329            closer_count,
330            closer_start
331        );
332
333        // Try to match closers in order: ***, **, * (Pandoc lines 1696-1698)
334
335        // Try *** (line 1696)
336        if closer_count >= 3 && is_valid_ender(text, closer_start, delim_char, 3) {
337            log::trace!("Matched *** closer, emitting Strong[Emph[content]]");
338
339            builder.start_node(SyntaxKind::STRONG.into());
340            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
341
342            builder.start_node(SyntaxKind::EMPHASIS.into());
343            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
344            parse_inline_range_nested(text, content_start, closer_start, config, builder);
345            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
346            builder.finish_node(); // EMPHASIS
347
348            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
349            builder.finish_node(); // STRONG
350
351            return Some(closer_start + 3 - pos);
352        }
353
354        // Try ** (line 1697)
355        if closer_count >= 2 && is_valid_ender(text, closer_start, delim_char, 2) {
356            log::trace!("Matched ** closer, wrapping as Strong and continuing with one");
357
358            let continue_pos = closer_start + 2;
359
360            if let Some(final_closer_pos) =
361                parse_until_closer_with_nested_two(text, continue_pos, delim_char, 1, end, config)
362            {
363                log::trace!(
364                    "Found * closer at pos {}, emitting Emph[Strong[...], ...]",
365                    final_closer_pos
366                );
367
368                builder.start_node(SyntaxKind::EMPHASIS.into());
369                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
370
371                builder.start_node(SyntaxKind::STRONG.into());
372                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
373                parse_inline_range_nested(text, content_start, closer_start, config, builder);
374                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
375                builder.finish_node(); // STRONG
376
377                // Parse additional content between ** and * (up to but not including the closer)
378                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
379
380                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
381                builder.finish_node(); // EMPHASIS
382
383                return Some(final_closer_pos + 1 - pos);
384            }
385
386            // Fallback: emit * + STRONG
387            log::trace!("No * closer found after **, emitting * + STRONG");
388            builder.token(SyntaxKind::TEXT.into(), &one);
389
390            builder.start_node(SyntaxKind::STRONG.into());
391            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
392            parse_inline_range_nested(text, content_start, closer_start, config, builder);
393            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
394            builder.finish_node(); // STRONG
395
396            return Some(closer_start + 2 - pos);
397        }
398
399        // Try * (line 1698)
400        if closer_count >= 1 && is_valid_ender(text, closer_start, delim_char, 1) {
401            log::trace!("Matched * closer, wrapping as Emph and continuing with two");
402
403            let continue_pos = closer_start + 1;
404
405            if let Some(final_closer_pos) =
406                parse_until_closer_with_nested_one(text, continue_pos, delim_char, 2, end, config)
407            {
408                log::trace!(
409                    "Found ** closer at pos {}, emitting Strong[Emph[...], ...]",
410                    final_closer_pos
411                );
412
413                builder.start_node(SyntaxKind::STRONG.into());
414                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
415
416                builder.start_node(SyntaxKind::EMPHASIS.into());
417                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
418                parse_inline_range_nested(text, content_start, closer_start, config, builder);
419                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
420                builder.finish_node(); // EMPHASIS
421
422                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
423
424                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
425                builder.finish_node(); // STRONG
426
427                return Some(final_closer_pos + 2 - pos);
428            }
429
430            // Fallback: emit ** + EMPH
431            log::trace!("No ** closer found after *, emitting ** + EMPH");
432            builder.token(SyntaxKind::TEXT.into(), &two);
433
434            builder.start_node(SyntaxKind::EMPHASIS.into());
435            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
436            parse_inline_range_nested(text, content_start, closer_start, config, builder);
437            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
438            builder.finish_node(); // EMPHASIS
439
440            return Some(closer_start + 1 - pos);
441        }
442
443        // No valid ender at this position - continue searching after this delimiter run
444        log::trace!(
445            "No valid ender at pos {}, continuing search from {}",
446            closer_start,
447            closer_start + closer_count
448        );
449        search_pos = closer_start + closer_count;
450    }
451}
452
/// Locate the first unescaped occurrence of `delim_char` in `text[start..end]`.
///
/// This is the scanning half of Pandoc's `many (notFollowedBy (ender c 1) >> inline)`:
/// content is skipped until a delimiter that could be an ender is reached.
///
/// A delimiter counts as escaped when it is directly preceded by an odd number of
/// backslashes. `end` is clamped to the length of `text`; an empty or inverted
/// range yields `None`.
///
/// Returns the byte offset of the delimiter, or `None` if there is none.
fn find_first_potential_ender(
    text: &str,
    start: usize,
    delim_char: char,
    end: usize,
) -> Option<usize> {
    let bytes = text.as_bytes();
    let needle = delim_char as u8;
    let limit = end.min(bytes.len());

    (start..limit).find(|&idx| {
        if bytes[idx] != needle {
            return false;
        }

        // Count the backslashes immediately in front of the delimiter; an even
        // number (including zero) means the delimiter itself is not escaped.
        let preceding_backslashes = bytes[..idx]
            .iter()
            .rev()
            .take_while(|&&b| b == b'\\')
            .count();
        preceding_backslashes % 2 == 0
    })
}
490
/// Decide whether the `delim_count` bytes at `pos` form a valid emphasis ender.
///
/// This is the counterpart of Pandoc's `ender c n`. The run is valid when:
///
/// * it lies fully inside `text` and every byte equals `delim_char`;
/// * it is not adjacent to another `delim_char` on either side, i.e. it is a
///   run of exactly `delim_count`;
/// * for `_` only: it is not preceded by whitespace and not followed by an
///   alphanumeric character. Asterisks have no such flanking requirement.
fn is_valid_ender(text: &str, pos: usize, delim_char: char, delim_count: usize) -> bool {
    let bytes = text.as_bytes();
    let delim = delim_char as u8;
    let after_pos = pos + delim_count;

    // The run has to fit in the text and consist solely of the delimiter.
    if after_pos > bytes.len() || bytes[pos..after_pos].iter().any(|&b| b != delim) {
        return false;
    }

    // A neighbouring delimiter on either side means the run is longer than requested.
    let delim_before = pos > 0 && bytes[pos - 1] == delim;
    let delim_after = bytes.get(after_pos) == Some(&delim);
    if delim_before || delim_after {
        return false;
    }

    // Only underscores carry the extra flanking rules.
    if delim_char != '_' {
        return true;
    }

    let invalid_flank = text[..pos]
        .chars()
        .next_back()
        .is_some_and(char::is_whitespace)
        || text[after_pos..]
            .chars()
            .next()
            .is_some_and(char::is_alphanumeric);

    !invalid_flank
}
539
540/// Try to parse emphasis with ** opening delimiter.
541///
542/// Tries to match ** closer only. No fallback.
543/// Returns Some(bytes_consumed) if successful, None otherwise.
544fn try_parse_two(
545    text: &str,
546    pos: usize,
547    delim_char: char,
548    end: usize,
549    config: &ParserOptions,
550    builder: &mut GreenNodeBuilder,
551) -> Option<usize> {
552    let content_start = pos + 2;
553
554    log::trace!("try_parse_two: '{}' x 2 at pos {}", delim_char, pos);
555
556    // Try to find ** closer, checking for nested * emphasis along the way
557    if let Some(closer_pos) =
558        parse_until_closer_with_nested_one(text, content_start, delim_char, 2, end, config)
559    {
560        log::trace!("Found ** closer at pos {}", closer_pos);
561
562        // Emit STRONG(content)
563        builder.start_node(SyntaxKind::STRONG.into());
564        builder.token(SyntaxKind::STRONG_MARKER.into(), &text[pos..pos + 2]);
565        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
566        builder.token(
567            SyntaxKind::STRONG_MARKER.into(),
568            &text[closer_pos..closer_pos + 2],
569        );
570        builder.finish_node(); // STRONG
571
572        return Some(closer_pos + 2 - pos);
573    }
574
575    // No closer found
576    log::trace!("No closer found for **");
577    None
578}
579
580/// Try to parse emphasis with * opening delimiter.
581///
582/// Tries to match * closer.
583/// Returns Some(bytes_consumed) if successful, None otherwise.
584///
585/// **Pandoc algorithm**: While parsing content, if we encounter **,
586/// try to parse it as `two` (strong) recursively. If `two` succeeds,
587/// it consumes the ** delimiters, potentially preventing us from finding
588/// a closer for the outer *. This creates priority where ** can "steal"
589/// matches from *.
590fn try_parse_one(
591    text: &str,
592    pos: usize,
593    delim_char: char,
594    end: usize,
595    config: &ParserOptions,
596    builder: &mut GreenNodeBuilder,
597) -> Option<usize> {
598    let content_start = pos + 1;
599
600    log::trace!("try_parse_one: '{}' x 1 at pos {}", delim_char, pos);
601
602    // Try to find * closer using Pandoc's algorithm with nested two attempts
603    if let Some(closer_pos) =
604        parse_until_closer_with_nested_two(text, content_start, delim_char, 1, end, config)
605    {
606        log::trace!("Found * closer at pos {}", closer_pos);
607
608        // Emit EMPH(content)
609        builder.start_node(SyntaxKind::EMPHASIS.into());
610        builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &text[pos..pos + 1]);
611        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
612        builder.token(
613            SyntaxKind::EMPHASIS_MARKER.into(),
614            &text[closer_pos..closer_pos + 1],
615        );
616        builder.finish_node(); // EMPHASIS
617
618        return Some(closer_pos + 1 - pos);
619    }
620
621    // No closer found
622    log::trace!("No closer found for *");
623    None
624}
625
626/// Parse inline content and look for a matching closer, with nested two attempts.
627///
628/// This implements Pandoc's algorithm from Markdown.hs lines 1712-1717:
629/// When parsing `*...*`, if we encounter `**` (and it's not followed by
630/// another `*` that would close the outer emphasis), try to parse it as
631/// `two c mempty` (strong). If `two` succeeds, those `**` delimiters are
632/// consumed, and we continue searching for the `*` closer.
633///
634/// This creates a priority system where `**` can "steal" matches from `*`.
635///
636/// Example: `*foo **bar* baz**`
637/// - When parsing the outer `*...*`, we encounter `**` at position 5
638/// - We try `two` which succeeds with `**bar* baz**`
639/// - Now there's no `*` closer for the outer `*`, so it fails
640/// - Result: literal `*foo ` + STRONG("bar* baz")
641///
642/// # Arguments
643/// * `end` - Don't search beyond this position (respects nesting boundaries)
644fn parse_until_closer_with_nested_two(
645    text: &str,
646    start: usize,
647    delim_char: char,
648    delim_count: usize,
649    end: usize,
650    config: &ParserOptions,
651) -> Option<usize> {
652    let bytes = text.as_bytes();
653    let mut pos = start;
654
655    while pos < end.min(text.len()) {
656        if bytes[pos] == b'`'
657            && let Some(m) = try_parse_inline_executable(
658                &text[pos..],
659                config.extensions.rmarkdown_inline_code,
660                config.extensions.quarto_inline_code,
661            )
662        {
663            log::trace!(
664                "Skipping inline executable span of {} bytes at pos {}",
665                m.total_len,
666                pos
667            );
668            pos += m.total_len;
669            continue;
670        }
671
672        // Skip over code spans - their content is protected from delimiter matching
673        if bytes[pos] == b'`'
674            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
675        {
676            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
677            pos += len;
678            continue;
679        }
680
681        // Skip over inline math - their content is protected from delimiter matching
682        if bytes[pos] == b'$'
683            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
684        {
685            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
686            pos += len;
687            continue;
688        }
689
690        // Skip over links - their content is protected from delimiter matching
691        if bytes[pos] == b'['
692            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
693        {
694            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
695            pos += len;
696            continue;
697        }
698
699        // Pandoc algorithm: If we're looking for a single delimiter (*) and
700        // encounter a double delimiter (**), try to parse it as `two` (strong).
701        // This happens BEFORE checking if pos is a closer for our current emphasis.
702        if delim_count == 1
703            && pos + 2 <= text.len()
704            && bytes[pos] == delim_char as u8
705            && bytes[pos + 1] == delim_char as u8
706        {
707            // First check if the first delimiter is escaped
708            let first_is_escaped = {
709                let mut backslash_count = 0;
710                let mut check_pos = pos;
711                while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
712                    backslash_count += 1;
713                    check_pos -= 1;
714                }
715                backslash_count % 2 == 1
716            };
717
718            if first_is_escaped {
719                // First * is escaped, skip it and continue
720                // The second * might be a closer or start of emphasis
721                log::trace!(
722                    "First * at pos {} is escaped, skipping to check second *",
723                    pos
724                );
725                pos = advance_char_boundary(text, pos, end);
726                continue;
727            }
728
729            // Check that there's NOT a third delimiter (which would make this
730            // part of a longer run that we shouldn't treat as `two`)
731            let no_third_delim = pos + 2 >= bytes.len() || bytes[pos + 2] != delim_char as u8;
732
733            if no_third_delim {
734                log::trace!(
735                    "try_parse_one: found ** at pos {}, attempting nested two",
736                    pos
737                );
738
739                // Try to parse as `two` (strong emphasis)
740                // We create a temporary builder to test if `two` succeeds
741                let mut temp_builder = GreenNodeBuilder::new();
742                if let Some(two_consumed) =
743                    try_parse_two(text, pos, delim_char, end, config, &mut temp_builder)
744                {
745                    // `two` succeeded! Those ** delimiters are consumed.
746                    // We skip past the `two` and continue searching for our `*` closer.
747                    log::trace!(
748                        "Nested two succeeded, consumed {} bytes, continuing search",
749                        two_consumed
750                    );
751                    pos += two_consumed;
752                    continue;
753                }
754                // `two` failed - this means the entire `one` parse should fail!
755                // In Pandoc, the `try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty)`
756                // alternative fails, and the first alternative `notFollowedBy (ender c 1) >> inline`
757                // also fails because we ARE followed by an ender (the first * of **).
758                // So the entire content parsing fails, and `one` returns failure.
759                log::trace!("Nested two failed at pos {}, entire one() should fail", pos);
760                return None;
761            }
762        }
763
764        // Check if we have a potential closer here
765        if pos + delim_count <= text.len() {
766            let mut matches = true;
767            for i in 0..delim_count {
768                if bytes[pos + i] != delim_char as u8 {
769                    matches = false;
770                    break;
771                }
772            }
773
774            if matches {
775                // IMPORTANT: Check that there are EXACTLY delim_count delimiters,
776                // not more. E.g., when looking for `*`, we shouldn't match
777                // `*` that's part of a longer run.
778
779                // Check: not escaped (preceded by odd number of backslashes)
780                let is_escaped = {
781                    let mut backslash_count = 0;
782                    let mut check_pos = pos;
783                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
784                        backslash_count += 1;
785                        check_pos -= 1;
786                    }
787                    backslash_count % 2 == 1 // Odd number = escaped
788                };
789
790                // Allow matching at the start OR end of a delimiter run.
791                // This lets `**` close at the end of `***` (after a nested `*` closes),
792                // while still avoiding matches in the middle of longer runs.
793                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
794                let after_pos = pos + delim_count;
795                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
796
797                if (at_run_start || at_run_end) && !is_escaped {
798                    // Found a potential closer!
799                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
800                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
801                    if delim_char == '_'
802                        && pos > start
803                        && let Some(prev_char) = text[..pos].chars().last()
804                        && prev_char.is_whitespace()
805                    {
806                        log::trace!(
807                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
808                            pos
809                        );
810                        // Not a valid closer, continue searching
811                        pos = advance_char_boundary(text, pos, end);
812                        continue;
813                    }
814
815                    log::trace!(
816                        "Found exact {} x {} closer at pos {}",
817                        delim_char,
818                        delim_count,
819                        pos
820                    );
821                    return Some(pos);
822                }
823            }
824        }
825
826        // Not a closer, move to next UTF-8 boundary.
827        pos = advance_char_boundary(text, pos, end);
828    }
829
830    None
831}
832
833/// Parse inline content and look for a matching closer, with nested one attempts.
834///
835/// This implements the symmetric case to `parse_until_closer_with_nested_two`:
836/// When parsing `**...**`, if we encounter `*` (and it's not followed by
837/// another `*` that would be part of our `**` closer), try to parse it as
838/// `one c mempty` (emphasis). If `one` succeeds, those `*` delimiters are
839/// consumed, and we continue searching for the `**` closer.
840///
841/// This ensures nested emphasis closes before the outer strong emphasis.
842///
843/// Example: `**bold with *italic***`
844/// - When parsing the outer `**...**, we scan for `**` closer
845/// - At position 12, we encounter a single `*` (start of `*italic`)
846/// - We try `one` which succeeds with `*italic*` (consuming the first `*` from `***`)
847/// - We continue scanning and find `**` at position 20 (the remaining `**` from `***`)
848/// - Result: STRONG["bold with " EMPHASIS["italic"]]
849///
850/// # Arguments
851/// * `end` - Don't search beyond this position (respects nesting boundaries)
852fn parse_until_closer_with_nested_one(
853    text: &str,
854    start: usize,
855    delim_char: char,
856    delim_count: usize,
857    end: usize,
858    config: &ParserOptions,
859) -> Option<usize> {
860    let bytes = text.as_bytes();
861    let mut pos = start;
862
863    while pos < end.min(text.len()) {
864        if bytes[pos] == b'`'
865            && let Some(m) = try_parse_inline_executable(
866                &text[pos..],
867                config.extensions.rmarkdown_inline_code,
868                config.extensions.quarto_inline_code,
869            )
870        {
871            log::trace!(
872                "Skipping inline executable span of {} bytes at pos {}",
873                m.total_len,
874                pos
875            );
876            pos += m.total_len;
877            continue;
878        }
879
880        // Skip over code spans - their content is protected from delimiter matching
881        if bytes[pos] == b'`'
882            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
883        {
884            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
885            pos += len;
886            continue;
887        }
888
889        // Skip over inline math - their content is protected from delimiter matching
890        if bytes[pos] == b'$'
891            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
892        {
893            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
894            pos += len;
895            continue;
896        }
897
898        // Skip over links - their content is protected from delimiter matching
899        if bytes[pos] == b'['
900            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
901        {
902            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
903            pos += len;
904            continue;
905        }
906
907        // Pandoc algorithm: If we're looking for a double delimiter (**) and
908        // encounter a single delimiter (*), check if it's a valid emphasis opener.
909        // If it is, try to parse it as `one` (emphasis). If `one` succeeds, skip
910        // over it. If `one` fails, the outer `two` also fails (delimiter poisoning).
911        // If the `*` is NOT a valid opener (e.g., followed by whitespace or escaped),
912        // skip it and continue looking for the `**` closer.
913        if delim_count == 2 && pos < text.len() && bytes[pos] == delim_char as u8 {
914            // Check that there's NOT a second delimiter immediately after
915            // (which would make this part of our `**` closer or another `**` opener)
916            let no_second_delim = pos + 1 >= bytes.len() || bytes[pos + 1] != delim_char as u8;
917
918            if no_second_delim {
919                // Check if this * is escaped (preceded by odd number of backslashes)
920                let is_escaped = {
921                    let mut backslash_count = 0;
922                    let mut check_pos = pos;
923                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
924                        backslash_count += 1;
925                        check_pos -= 1;
926                    }
927                    backslash_count % 2 == 1
928                };
929
930                if is_escaped {
931                    // Escaped delimiter - just literal text, skip it
932                    log::trace!("* at pos {} is escaped, skipping", pos);
933                    pos = advance_char_boundary(text, pos, end);
934                    continue;
935                }
936
937                // Check if this * is a valid emphasis opener (Pandoc's enclosure rule).
938                // A delimiter followed by whitespace is NOT an opener - it's literal text.
939                let after_delim = pos + 1;
940                let followed_by_whitespace = after_delim < text.len()
941                    && text[after_delim..]
942                        .chars()
943                        .next()
944                        .is_some_and(|c| c.is_whitespace());
945
946                if followed_by_whitespace {
947                    // Not a valid opener - just literal text, skip it
948                    log::trace!(
949                        "* at pos {} followed by whitespace, not an opener, skipping",
950                        pos
951                    );
952                    pos = advance_char_boundary(text, pos, end);
953                    continue;
954                }
955
956                log::trace!(
957                    "try_parse_two: found * at pos {}, attempting nested one",
958                    pos
959                );
960
961                // Try to parse as `one` (emphasis)
962                // We create a temporary builder to test if `one` succeeds
963                let mut temp_builder = GreenNodeBuilder::new();
964                if let Some(one_consumed) =
965                    try_parse_one(text, pos, delim_char, end, config, &mut temp_builder)
966                {
967                    // `one` succeeded! Those * delimiters are consumed.
968                    // We skip past the `one` and continue searching for our `**` closer.
969                    log::trace!(
970                        "Nested one succeeded, consumed {} bytes, continuing search",
971                        one_consumed
972                    );
973                    pos += one_consumed;
974                    continue;
975                }
976
977                // `one` failed to find a closer. According to Pandoc's algorithm,
978                // this means the outer `two` should also fail. An unmatched inner
979                // delimiter "poisons" the outer emphasis.
980                // Example: `**foo *bar**` - the `*` can't find a closer, so the
981                // outer `**` should fail and the whole thing becomes literal.
982                log::trace!(
983                    "Nested one failed at pos {}, poisoning outer two (no closer found)",
984                    pos
985                );
986                return None;
987            }
988        }
989
990        // Check if we have a potential closer here
991        if pos + delim_count <= text.len() {
992            let mut matches = true;
993            for i in 0..delim_count {
994                if bytes[pos + i] != delim_char as u8 {
995                    matches = false;
996                    break;
997                }
998            }
999
1000            if matches {
1001                // Check: not escaped (preceded by odd number of backslashes)
1002                let is_escaped = {
1003                    let mut backslash_count = 0;
1004                    let mut check_pos = pos;
1005                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
1006                        backslash_count += 1;
1007                        check_pos -= 1;
1008                    }
1009                    backslash_count % 2 == 1 // Odd number = escaped
1010                };
1011
1012                // Allow matching at the start OR end of a delimiter run.
1013                // This lets `**` close at the end of `***` (after a nested `*` closes),
1014                // while still avoiding matches in the middle of longer runs.
1015                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
1016                let after_pos = pos + delim_count;
1017                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
1018
1019                if (at_run_start || at_run_end) && !is_escaped {
1020                    // Found a potential closer!
1021                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
1022                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
1023                    if delim_char == '_'
1024                        && pos > start
1025                        && let Some(prev_char) = text[..pos].chars().last()
1026                        && prev_char.is_whitespace()
1027                    {
1028                        log::trace!(
1029                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
1030                            pos
1031                        );
1032                        // Not a valid closer, continue searching
1033                        pos = advance_char_boundary(text, pos, end);
1034                        continue;
1035                    }
1036
1037                    log::trace!(
1038                        "Found exact {} x {} closer at pos {}",
1039                        delim_char,
1040                        delim_count,
1041                        pos
1042                    );
1043                    return Some(pos);
1044                }
1045            }
1046        }
1047
1048        // Not a closer, move to next UTF-8 boundary.
1049        pos = advance_char_boundary(text, pos, end);
1050    }
1051
1052    None
1053}
1054
1055///
1056/// This is the recursive inline parser that handles all inline elements:
1057/// - Text
1058/// - Escapes (highest priority)
1059/// - Code spans
1060/// - Math (inline and display)
1061/// - Emphasis/strong (via try_parse_emphasis)
1062/// - Other inline elements
1063///
1064/// **Important**: This is where the greedy left-to-right parsing happens.
1065/// When we see `**`, we try to parse it as STRONG. If it succeeds, those
1066/// delimiters are consumed and won't be available for outer emphasis.
1067///
1068/// # Arguments
1069/// * `nested_emphasis` - If true, bypass opener validity checks for emphasis.
1070///   Set to true when called from within emphasis parsing (e.g., from try_parse_one/two/three).
1071fn parse_inline_range(
1072    text: &str,
1073    start: usize,
1074    end: usize,
1075    config: &ParserOptions,
1076    builder: &mut GreenNodeBuilder,
1077) {
1078    parse_inline_range_impl(text, start, end, config, builder, false)
1079}
1080
1081/// Same as `parse_inline_range` but bypasses opener validity checks for emphasis.
1082/// Used within emphasis parsing contexts (e.g., from try_parse_one/two/three).
1083fn parse_inline_range_nested(
1084    text: &str,
1085    start: usize,
1086    end: usize,
1087    config: &ParserOptions,
1088    builder: &mut GreenNodeBuilder,
1089) {
1090    parse_inline_range_impl(text, start, end, config, builder, true)
1091}
1092
/// Report whether an emoji alias may start at byte offset `pos` of `text`.
///
/// Returns `false` only when the byte directly before `pos` is an ASCII
/// letter, an ASCII digit, or `_`. Offset 0 always counts as a boundary, as
/// does any non-ASCII preceding byte.
///
/// # Panics
/// Panics if `pos` is greater than `text.len()`.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Nothing precedes the start of the text.
        None => true,
        Some(before) => {
            let prev_byte = text.as_bytes()[before];
            !(prev_byte.is_ascii_alphanumeric() || prev_byte == b'_')
        }
    }
}
1102
/// Step from byte offset `pos` past the character that starts there.
///
/// Returns `pos` unchanged when it is already at or beyond `end` or the end
/// of `text`. Otherwise returns `pos` plus the UTF-8 width of the character
/// at `pos`, capped at `end`.
///
/// # Panics
/// Panics if `pos` is inside `text` but not on a character boundary.
#[inline]
fn advance_char_boundary(text: &str, pos: usize, end: usize) -> usize {
    let limit = end.min(text.len());
    if pos >= limit {
        return pos;
    }

    // Width of the character starting at `pos`; falls back to one byte.
    let width = match text[pos..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    };

    end.min(pos + width)
}
1114
1115fn parse_inline_range_impl(
1116    text: &str,
1117    start: usize,
1118    end: usize,
1119    config: &ParserOptions,
1120    builder: &mut GreenNodeBuilder,
1121    nested_emphasis: bool,
1122) {
1123    log::trace!(
1124        "parse_inline_range: start={}, end={}, text={:?}",
1125        start,
1126        end,
1127        &text[start..end]
1128    );
1129    let mut pos = start;
1130    let mut text_start = start;
1131
1132    while pos < end {
1133        let byte = text.as_bytes()[pos];
1134
1135        // Backslash math (highest priority if enabled)
1136        if byte == b'\\' {
1137            // Try double backslash display math first: \\[...\\]
1138            if config.extensions.tex_math_double_backslash {
1139                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
1140                {
1141                    if pos > text_start {
1142                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1143                    }
1144                    log::trace!("Matched double backslash display math at pos {}", pos);
1145                    emit_double_backslash_display_math(builder, content);
1146                    pos += len;
1147                    text_start = pos;
1148                    continue;
1149                }
1150
1151                // Try double backslash inline math: \\(...\\)
1152                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
1153                    if pos > text_start {
1154                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1155                    }
1156                    log::trace!("Matched double backslash inline math at pos {}", pos);
1157                    emit_double_backslash_inline_math(builder, content);
1158                    pos += len;
1159                    text_start = pos;
1160                    continue;
1161                }
1162            }
1163
1164            // Try single backslash display math: \[...\]
1165            if config.extensions.tex_math_single_backslash {
1166                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
1167                {
1168                    if pos > text_start {
1169                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1170                    }
1171                    log::trace!("Matched single backslash display math at pos {}", pos);
1172                    emit_single_backslash_display_math(builder, content);
1173                    pos += len;
1174                    text_start = pos;
1175                    continue;
1176                }
1177
1178                // Try single backslash inline math: \(...\)
1179                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
1180                    if pos > text_start {
1181                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1182                    }
1183                    log::trace!("Matched single backslash inline math at pos {}", pos);
1184                    emit_single_backslash_inline_math(builder, content);
1185                    pos += len;
1186                    text_start = pos;
1187                    continue;
1188                }
1189            }
1190
1191            // Try math environments \begin{equation}...\end{equation}
1192            if config.extensions.raw_tex
1193                && let Some((len, begin_marker, content, end_marker)) =
1194                    try_parse_math_environment(&text[pos..])
1195            {
1196                if pos > text_start {
1197                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1198                }
1199                log::trace!("Matched math environment at pos {}", pos);
1200                emit_display_math_environment(builder, begin_marker, content, end_marker);
1201                pos += len;
1202                text_start = pos;
1203                continue;
1204            }
1205
1206            // Try bookdown reference: \@ref(label)
1207            if config.extensions.bookdown_references
1208                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
1209            {
1210                if pos > text_start {
1211                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1212                }
1213                log::trace!("Matched bookdown reference at pos {}: {}", pos, label);
1214                super::citations::emit_bookdown_crossref(builder, label);
1215                pos += len;
1216                text_start = pos;
1217                continue;
1218            }
1219
1220            // Try escapes (after bookdown refs and backslash math)
1221            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
1222                let escape_enabled = match escape_type {
1223                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
1224                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
1225                    EscapeType::Literal => {
1226                        // BASE_ESCAPABLE matches Pandoc's markdown_strict /
1227                        // original Markdown set, plus `|` and `~` which the
1228                        // formatter emits as escapes for pipe-table separators
1229                        // and strikethrough delimiters. Recognising those here
1230                        // keeps round-trips idempotent in flavors that don't
1231                        // enable all_symbols_escapable (notably GFM/CommonMark,
1232                        // which per spec allow any ASCII punctuation anyway).
1233                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!|~";
1234                        BASE_ESCAPABLE.contains(ch) || config.extensions.all_symbols_escapable
1235                    }
1236                };
1237                if !escape_enabled {
1238                    // Don't treat as hard line break - skip the escape and continue
1239                    // The backslash will be included in the next TEXT token
1240                    pos = advance_char_boundary(text, pos, end);
1241                    continue;
1242                }
1243
1244                // Emit accumulated text
1245                if pos > text_start {
1246                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1247                }
1248
1249                log::trace!("Matched escape at pos {}: \\{}", pos, ch);
1250                emit_escape(builder, ch, escape_type);
1251                pos += len;
1252                text_start = pos;
1253                continue;
1254            }
1255
1256            // Try LaTeX commands (after escapes, before shortcodes)
1257            if config.extensions.raw_tex
1258                && let Some(len) = try_parse_latex_command(&text[pos..])
1259            {
1260                if pos > text_start {
1261                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1262                }
1263                log::trace!("Matched LaTeX command at pos {}", pos);
1264                parse_latex_command(builder, &text[pos..], len);
1265                pos += len;
1266                text_start = pos;
1267                continue;
1268            }
1269        }
1270
1271        // Try Quarto shortcodes: {{< shortcode >}}
1272        if byte == b'{'
1273            && pos + 1 < text.len()
1274            && text.as_bytes()[pos + 1] == b'{'
1275            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
1276        {
1277            if pos > text_start {
1278                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1279            }
1280            log::trace!("Matched shortcode at pos {}: {}", pos, &name);
1281            emit_shortcode(builder, &name, attrs);
1282            pos += len;
1283            text_start = pos;
1284            continue;
1285        }
1286
1287        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
1288        if byte == b'`'
1289            && let Some(m) = try_parse_inline_executable(
1290                &text[pos..],
1291                config.extensions.rmarkdown_inline_code,
1292                config.extensions.quarto_inline_code,
1293            )
1294        {
1295            if pos > text_start {
1296                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1297            }
1298            log::trace!("Matched inline executable code at pos {}", pos);
1299            emit_inline_executable(builder, &m);
1300            pos += m.total_len;
1301            text_start = pos;
1302            continue;
1303        }
1304
1305        // Try code spans
1306        if byte == b'`'
1307            && let Some((len, content, backtick_count, attributes)) =
1308                try_parse_code_span(&text[pos..])
1309        {
1310            // Emit accumulated text
1311            if pos > text_start {
1312                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1313            }
1314
1315            log::trace!(
1316                "Matched code span at pos {}: {} backticks",
1317                pos,
1318                backtick_count
1319            );
1320
1321            // Check for raw inline
1322            if let Some(ref attrs) = attributes
1323                && config.extensions.raw_attribute
1324                && let Some(format) = is_raw_inline(attrs)
1325            {
1326                use super::raw_inline::emit_raw_inline;
1327                log::trace!("Matched raw inline span at pos {}: format={}", pos, format);
1328                emit_raw_inline(builder, content, backtick_count, format);
1329            } else if !config.extensions.inline_code_attributes && attributes.is_some() {
1330                let code_span_len = backtick_count * 2 + content.len();
1331                emit_code_span(builder, content, backtick_count, None);
1332                pos += code_span_len;
1333                text_start = pos;
1334                continue;
1335            } else {
1336                emit_code_span(builder, content, backtick_count, attributes);
1337            }
1338
1339            pos += len;
1340            text_start = pos;
1341            continue;
1342        }
1343
1344        // Try textual emoji aliases: :smile:
1345        if byte == b':'
1346            && config.extensions.emoji
1347            && is_emoji_boundary(text, pos)
1348            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
1349        {
1350            if pos > text_start {
1351                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1352            }
1353            log::trace!("Matched emoji at pos {}", pos);
1354            emit_emoji(builder, &text[pos..pos + len]);
1355            pos += len;
1356            text_start = pos;
1357            continue;
1358        }
1359
1360        // Try inline footnotes: ^[note]
1361        if byte == b'^'
1362            && pos + 1 < text.len()
1363            && text.as_bytes()[pos + 1] == b'['
1364            && config.extensions.inline_footnotes
1365            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
1366        {
1367            if pos > text_start {
1368                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1369            }
1370            log::trace!("Matched inline footnote at pos {}", pos);
1371            emit_inline_footnote(builder, content, config);
1372            pos += len;
1373            text_start = pos;
1374            continue;
1375        }
1376
1377        // Try superscript: ^text^
1378        if byte == b'^'
1379            && config.extensions.superscript
1380            && let Some((len, content)) = try_parse_superscript(&text[pos..])
1381        {
1382            if pos > text_start {
1383                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1384            }
1385            log::trace!("Matched superscript at pos {}", pos);
1386            emit_superscript(builder, content, config);
1387            pos += len;
1388            text_start = pos;
1389            continue;
1390        }
1391
1392        // Try bookdown definition: (\#label) or (ref:label)
1393        if byte == b'(' && config.extensions.bookdown_references {
1394            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
1395                if pos > text_start {
1396                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1397                }
1398                log::trace!("Matched bookdown definition at pos {}: {}", pos, label);
1399                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1400                pos += len;
1401                text_start = pos;
1402                continue;
1403            }
1404            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
1405                if pos > text_start {
1406                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1407                }
1408                log::trace!("Matched bookdown text reference at pos {}: {}", pos, label);
1409                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1410                pos += len;
1411                text_start = pos;
1412                continue;
1413            }
1414        }
1415
1416        // Try subscript: ~text~
1417        if byte == b'~'
1418            && config.extensions.subscript
1419            && let Some((len, content)) = try_parse_subscript(&text[pos..])
1420        {
1421            if pos > text_start {
1422                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1423            }
1424            log::trace!("Matched subscript at pos {}", pos);
1425            emit_subscript(builder, content, config);
1426            pos += len;
1427            text_start = pos;
1428            continue;
1429        }
1430
1431        // Try strikeout: ~~text~~
1432        if byte == b'~'
1433            && config.extensions.strikeout
1434            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
1435        {
1436            if pos > text_start {
1437                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1438            }
1439            log::trace!("Matched strikeout at pos {}", pos);
1440            emit_strikeout(builder, content, config);
1441            pos += len;
1442            text_start = pos;
1443            continue;
1444        }
1445
1446        // Try mark/highlight: ==text==
1447        if byte == b'='
1448            && config.extensions.mark
1449            && let Some((len, content)) = try_parse_mark(&text[pos..])
1450        {
1451            if pos > text_start {
1452                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1453            }
1454            log::trace!("Matched mark at pos {}", pos);
1455            emit_mark(builder, content, config);
1456            pos += len;
1457            text_start = pos;
1458            continue;
1459        }
1460
1461        // Try GFM inline math: $`...`$
1462        if byte == b'$'
1463            && config.extensions.tex_math_gfm
1464            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
1465        {
1466            if pos > text_start {
1467                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1468            }
1469            log::trace!("Matched GFM inline math at pos {}", pos);
1470            emit_gfm_inline_math(builder, content);
1471            pos += len;
1472            text_start = pos;
1473            continue;
1474        }
1475
1476        // Try math ($...$, $$...$$)
1477        if byte == b'$' && config.extensions.tex_math_dollars {
1478            // Try display math first ($$...$$)
1479            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
1480                // Emit accumulated text
1481                if pos > text_start {
1482                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1483                }
1484
1485                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
1486                log::trace!(
1487                    "Matched display math at pos {}: {} dollars",
1488                    pos,
1489                    dollar_count
1490                );
1491
1492                // Check for trailing attributes (Quarto cross-reference support)
1493                let after_math = &text[pos + len..];
1494                let attr_len = if config.extensions.quarto_crossrefs {
1495                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
1496                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
1497                        let trimmed_after = after_math.trim_start();
1498                        if let Some(open_brace_pos) = trimmed_after.find('{') {
1499                            let ws_before_brace = after_math.len() - trimmed_after.len();
1500                            let attr_text_len = trimmed_after[open_brace_pos..]
1501                                .find('}')
1502                                .map(|close| close + 1)
1503                                .unwrap_or(0);
1504                            ws_before_brace + open_brace_pos + attr_text_len
1505                        } else {
1506                            0
1507                        }
1508                    } else {
1509                        0
1510                    }
1511                } else {
1512                    0
1513                };
1514
1515                let total_len = len + attr_len;
1516                emit_display_math(builder, content, dollar_count);
1517
1518                // Emit attributes if present
1519                if attr_len > 0 {
1520                    use crate::parser::utils::attributes::{
1521                        emit_attributes, try_parse_trailing_attributes,
1522                    };
1523                    let attr_text = &text[pos + len..pos + total_len];
1524                    if let Some((attr_block, _text_before)) =
1525                        try_parse_trailing_attributes(attr_text)
1526                    {
1527                        let trimmed_after = attr_text.trim_start();
1528                        let ws_len = attr_text.len() - trimmed_after.len();
1529                        if ws_len > 0 {
1530                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1531                        }
1532                        emit_attributes(builder, &attr_block);
1533                    }
1534                }
1535
1536                pos += total_len;
1537                text_start = pos;
1538                continue;
1539            }
1540
1541            // Try inline math ($...$)
1542            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1543                // Emit accumulated text
1544                if pos > text_start {
1545                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1546                }
1547
1548                log::trace!("Matched inline math at pos {}", pos);
1549                emit_inline_math(builder, content);
1550                pos += len;
1551                text_start = pos;
1552                continue;
1553            }
1554
1555            // Neither display nor inline math matched - emit the $ as literal text
1556            // This ensures each $ gets its own TEXT token for CST compatibility
1557            if pos > text_start {
1558                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1559            }
1560            builder.token(SyntaxKind::TEXT.into(), "$");
1561            pos = advance_char_boundary(text, pos, end);
1562            text_start = pos;
1563            continue;
1564        }
1565
1566        // Try autolinks: <url> or <email>
1567        if byte == b'<'
1568            && config.extensions.autolinks
1569            && let Some((len, url)) = try_parse_autolink(&text[pos..])
1570        {
1571            if pos > text_start {
1572                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1573            }
1574            log::trace!("Matched autolink at pos {}", pos);
1575            emit_autolink(builder, &text[pos..pos + len], url);
1576            pos += len;
1577            text_start = pos;
1578            continue;
1579        }
1580
1581        if config.extensions.autolink_bare_uris
1582            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1583        {
1584            if pos > text_start {
1585                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1586            }
1587            log::trace!("Matched bare URI at pos {}", pos);
1588            emit_bare_uri_link(builder, url, config);
1589            pos += len;
1590            text_start = pos;
1591            continue;
1592        }
1593
1594        // Try native spans: <span>text</span> (after autolink since both start with <)
1595        if byte == b'<'
1596            && config.extensions.native_spans
1597            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1598        {
1599            if pos > text_start {
1600                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1601            }
1602            log::trace!("Matched native span at pos {}", pos);
1603            emit_native_span(builder, content, &attributes, config);
1604            pos += len;
1605            text_start = pos;
1606            continue;
1607        }
1608
1609        // Images and links - process in order: inline image, reference image, footnote ref, inline link, reference link
1610        if byte == b'!' && pos + 1 < text.len() && text.as_bytes()[pos + 1] == b'[' {
1611            // Try inline image: ![alt](url)
1612            if let Some((len, alt_text, dest, attributes)) = try_parse_inline_image(&text[pos..]) {
1613                if pos > text_start {
1614                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1615                }
1616                log::trace!("Matched inline image at pos {}", pos);
1617                emit_inline_image(
1618                    builder,
1619                    &text[pos..pos + len],
1620                    alt_text,
1621                    dest,
1622                    attributes,
1623                    config,
1624                );
1625                pos += len;
1626                text_start = pos;
1627                continue;
1628            }
1629
1630            // Try reference image: ![alt][ref] or ![alt]
1631            if config.extensions.reference_links {
1632                let allow_shortcut = config.extensions.shortcut_reference_links;
1633                if let Some((len, alt_text, reference, is_implicit)) =
1634                    try_parse_reference_image(&text[pos..], allow_shortcut)
1635                {
1636                    if pos > text_start {
1637                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1638                    }
1639                    log::trace!("Matched reference image at pos {}", pos);
1640                    emit_reference_image(builder, alt_text, &reference, is_implicit, config);
1641                    pos += len;
1642                    text_start = pos;
1643                    continue;
1644                }
1645            }
1646        }
1647
1648        // Process bracket-starting elements
1649        if byte == b'[' {
1650            // Try footnote reference: [^id]
1651            if config.extensions.footnotes
1652                && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1653            {
1654                if pos > text_start {
1655                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1656                }
1657                log::trace!("Matched footnote reference at pos {}", pos);
1658                emit_footnote_reference(builder, &id);
1659                pos += len;
1660                text_start = pos;
1661                continue;
1662            }
1663
1664            // Try inline link: [text](url)
1665            if config.extensions.inline_links
1666                && let Some((len, link_text, dest, attributes)) =
1667                    try_parse_inline_link(&text[pos..])
1668            {
1669                if pos > text_start {
1670                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1671                }
1672                log::trace!("Matched inline link at pos {}", pos);
1673                emit_inline_link(
1674                    builder,
1675                    &text[pos..pos + len],
1676                    link_text,
1677                    dest,
1678                    attributes,
1679                    config,
1680                );
1681                pos += len;
1682                text_start = pos;
1683                continue;
1684            }
1685
1686            // Try reference link: [text][ref] or [text]
1687            if config.extensions.reference_links {
1688                let allow_shortcut = config.extensions.shortcut_reference_links;
1689                if let Some((len, link_text, reference, is_implicit)) =
1690                    try_parse_reference_link(&text[pos..], allow_shortcut)
1691                {
1692                    if pos > text_start {
1693                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1694                    }
1695                    log::trace!("Matched reference link at pos {}", pos);
1696                    emit_reference_link(builder, link_text, &reference, is_implicit, config);
1697                    pos += len;
1698                    text_start = pos;
1699                    continue;
1700                }
1701            }
1702
1703            // Try bracketed citation: [@cite]
1704            if config.extensions.citations
1705                && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1706            {
1707                if pos > text_start {
1708                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1709                }
1710                log::trace!("Matched bracketed citation at pos {}", pos);
1711                emit_bracketed_citation(builder, content);
1712                pos += len;
1713                text_start = pos;
1714                continue;
1715            }
1716        }
1717
1718        // Try bracketed spans: [text]{.class}
1719        // Must come after links/citations
1720        if byte == b'['
1721            && config.extensions.bracketed_spans
1722            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1723        {
1724            if pos > text_start {
1725                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1726            }
1727            log::trace!("Matched bracketed span at pos {}", pos);
1728            emit_bracketed_span(builder, &text_content, &attrs, config);
1729            pos += len;
1730            text_start = pos;
1731            continue;
1732        }
1733
1734        // Try bare citation: @cite (must come after bracketed elements)
1735        if byte == b'@'
1736            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1737            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1738        {
1739            let is_crossref =
1740                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1741            if is_crossref || config.extensions.citations {
1742                if pos > text_start {
1743                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1744                }
1745                if is_crossref {
1746                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1747                    super::citations::emit_crossref(builder, key, has_suppress);
1748                } else {
1749                    log::trace!("Matched bare citation at pos {}: {}", pos, &key);
1750                    emit_bare_citation(builder, key, has_suppress);
1751                }
1752                pos += len;
1753                text_start = pos;
1754                continue;
1755            }
1756        }
1757
1758        // Try suppress-author citation: -@cite
1759        if byte == b'-'
1760            && pos + 1 < text.len()
1761            && text.as_bytes()[pos + 1] == b'@'
1762            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1763            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1764        {
1765            let is_crossref =
1766                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1767            if is_crossref || config.extensions.citations {
1768                if pos > text_start {
1769                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1770                }
1771                if is_crossref {
1772                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1773                    super::citations::emit_crossref(builder, key, has_suppress);
1774                } else {
1775                    log::trace!("Matched suppress-author citation at pos {}: {}", pos, &key);
1776                    emit_bare_citation(builder, key, has_suppress);
1777                }
1778                pos += len;
1779                text_start = pos;
1780                continue;
1781            }
1782        }
1783
1784        // Try to parse emphasis at this position
1785        if byte == b'*' || byte == b'_' {
1786            // Count the delimiter run to avoid re-parsing
1787            let bytes = text.as_bytes();
1788            let mut delim_count = 0;
1789            while pos + delim_count < bytes.len() && bytes[pos + delim_count] == byte {
1790                delim_count += 1;
1791            }
1792
1793            // Emit any accumulated text before the delimiter
1794            if pos > text_start {
1795                log::trace!(
1796                    "Emitting TEXT before delimiter: {:?}",
1797                    &text[text_start..pos]
1798                );
1799                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1800                text_start = pos; // Update text_start after emission
1801            }
1802
1803            // Try to parse emphasis
1804            // Use nested variant (bypass opener validity) when in nested context
1805            let emphasis_result = if nested_emphasis {
1806                try_parse_emphasis_nested(text, pos, end, config, builder)
1807            } else {
1808                try_parse_emphasis(text, pos, end, config, builder)
1809            };
1810
1811            if let Some((consumed, _)) = emphasis_result {
1812                // Successfully parsed emphasis
1813                log::trace!(
1814                    "Parsed emphasis, consumed {} bytes from pos {}",
1815                    consumed,
1816                    pos
1817                );
1818                pos += consumed;
1819                text_start = pos;
1820            } else {
1821                // Failed to parse, delimiter run will be treated as regular text
1822                // Skip the ENTIRE delimiter run to avoid re-parsing parts of it
1823                log::trace!(
1824                    "Failed to parse emphasis at pos {}, skipping {} delimiters as literal",
1825                    pos,
1826                    delim_count
1827                );
1828                pos += delim_count;
1829                // DON'T update text_start - let the delimiters accumulate
1830            }
1831            continue;
1832        }
1833
1834        // Check for newlines - may need to emit as hard line break
1835        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1836            let text_before = &text[text_start..pos];
1837
1838            // Check for trailing spaces hard line break (always enabled in Pandoc)
1839            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1840            if trailing_spaces >= 2 {
1841                // Emit text before the trailing spaces
1842                let text_content = &text_before[..text_before.len() - trailing_spaces];
1843                if !text_content.is_empty() {
1844                    builder.token(SyntaxKind::TEXT.into(), text_content);
1845                }
1846                let spaces = " ".repeat(trailing_spaces);
1847                builder.token(
1848                    SyntaxKind::HARD_LINE_BREAK.into(),
1849                    &format!("{}\r\n", spaces),
1850                );
1851                pos += 2;
1852                text_start = pos;
1853                continue;
1854            }
1855
1856            // hard_line_breaks: treat all single newlines as hard line breaks
1857            if config.extensions.hard_line_breaks {
1858                if !text_before.is_empty() {
1859                    builder.token(SyntaxKind::TEXT.into(), text_before);
1860                }
1861                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1862                pos += 2;
1863                text_start = pos;
1864                continue;
1865            }
1866
1867            // Regular newline
1868            if !text_before.is_empty() {
1869                builder.token(SyntaxKind::TEXT.into(), text_before);
1870            }
1871            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1872            pos += 2;
1873            text_start = pos;
1874            continue;
1875        }
1876
1877        if byte == b'\n' {
1878            let text_before = &text[text_start..pos];
1879
1880            // Check for trailing spaces hard line break (always enabled in Pandoc)
1881            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1882            if trailing_spaces >= 2 {
1883                // Emit text before the trailing spaces
1884                let text_content = &text_before[..text_before.len() - trailing_spaces];
1885                if !text_content.is_empty() {
1886                    builder.token(SyntaxKind::TEXT.into(), text_content);
1887                }
1888                let spaces = " ".repeat(trailing_spaces);
1889                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1890                pos += 1;
1891                text_start = pos;
1892                continue;
1893            }
1894
1895            // hard_line_breaks: treat all single newlines as hard line breaks
1896            if config.extensions.hard_line_breaks {
1897                if !text_before.is_empty() {
1898                    builder.token(SyntaxKind::TEXT.into(), text_before);
1899                }
1900                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1901                pos += 1;
1902                text_start = pos;
1903                continue;
1904            }
1905
1906            // Regular newline
1907            if !text_before.is_empty() {
1908                builder.token(SyntaxKind::TEXT.into(), text_before);
1909            }
1910            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1911            pos += 1;
1912            text_start = pos;
1913            continue;
1914        }
1915
1916        // Regular character, keep accumulating
1917        pos = advance_char_boundary(text, pos, end);
1918    }
1919
1920    // Emit any remaining text
1921    if pos > text_start && text_start < end {
1922        log::trace!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1923        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1924    }
1925
1926    log::trace!("parse_inline_range complete: start={}, end={}", start, end);
1927}
1928
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::{SyntaxKind, SyntaxNode};
    use rowan::GreenNode;

    /// Parses `text` with `parse_inline_text_recursive` inside a wrapper node of
    /// kind `root` (inline content needs a parent) and returns the syntax tree.
    fn parse_recursive_in(root: SyntaxKind, text: &str, config: &ParserOptions) -> SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(root.into());
        parse_inline_text_recursive(&mut builder, text, config);
        builder.finish_node();
        let green: GreenNode = builder.finish();
        SyntaxNode::new_root(green)
    }

    /// Parses the whole of `text` with `parse_inline_range` inside a wrapper
    /// node of kind `root` and returns the syntax tree.
    fn parse_range_in(root: SyntaxKind, text: &str, config: &ParserOptions) -> SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(root.into());
        parse_inline_range(text, 0, text.len(), config, &mut builder);
        builder.finish_node();
        let green: GreenNode = builder.finish();
        SyntaxNode::new_root(green)
    }

    /// Returns `true` if `root` or any node below it has the given `kind`.
    fn has_kind(root: &SyntaxNode, kind: SyntaxKind) -> bool {
        root.descendants().any(|n| n.kind() == kind)
    }

    /// Returns `true` if some node of kind `inner` has an ancestor of kind `outer`.
    ///
    /// This checks real containment. Comparing positions in a pre-order walk is
    /// not enough, because a later sibling also comes "after" an earlier node.
    fn is_nested_in(root: &SyntaxNode, inner: SyntaxKind, outer: SyntaxKind) -> bool {
        root.descendants()
            .filter(|n| n.kind() == inner)
            .any(|n| n.ancestors().any(|a| a.kind() == outer))
    }

    /// `*test*` parsed through the top-level entry point yields an EMPHASIS node.
    #[test]
    fn test_recursive_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // No wrapper node is started here; the parser output is used as the root.
        parse_inline_text_recursive(&mut builder, text, &config);

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS node
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
    }

    /// Strong emphasis nested inside emphasis produces both node kinds.
    #[test]
    fn test_recursive_nested() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();

        // Wrap in a PARAGRAPH node (inline content needs a parent)
        let node = parse_recursive_in(SyntaxKind::PARAGRAPH, text, &config);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPHASIS and STRONG
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test that we can parse a simple emphasis case
    #[test]
    fn test_parse_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Try to parse emphasis at position 0
        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);

        // Should successfully parse
        assert_eq!(result, Some((6, 1))); // Consumed all 6 bytes, delimiter count 1

        // Check the generated CST
        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // The root IS the EMPHASIS node
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);

        // Verify losslessness: CST text should match input
        assert_eq!(node.text().to_string(), text);
    }

    /// Test parsing nested emphasis/strong
    #[test]
    fn test_parse_nested_emphasis_strong() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Parse the whole range (no wrapper node is started here)
        parse_inline_range(text, 0, text.len(), &config, &mut builder);

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS and STRONG nodes
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo* bar**
    /// Expected: Strong[Emph[foo], bar]
    /// Guards against the misparse `*` + Strong[foo* bar]
    #[test]
    fn test_triple_emphasis_star_then_double_star() {
        let text = "***foo* bar**";
        let config = ParserOptions::default();
        let node = parse_range_in(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have both STRONG and EMPH
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );

        // STRONG should be outer, EMPH should be inner: require an actual
        // STRONG ancestor rather than mere ordering in the traversal.
        let structure = format!("{:#?}", node);
        assert!(
            is_nested_in(&node, SyntaxKind::EMPHASIS, SyntaxKind::STRONG),
            "EMPH should be inside STRONG, not before it. Current structure:\n{}",
            structure
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo** bar*
    /// Expected: Emph[Strong[foo], bar]
    #[test]
    fn test_triple_emphasis_double_star_then_star() {
        let text = "***foo** bar*";
        let config = ParserOptions::default();
        let node = parse_range_in(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPH and STRONG
        assert!(
            has_kind(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_kind(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );

        // EMPH should be outer, STRONG should be inner: require an actual
        // EMPHASIS ancestor rather than mere ordering in the traversal.
        let structure = format!("{:#?}", node);
        assert!(
            is_nested_in(&node, SyntaxKind::STRONG, SyntaxKind::EMPHASIS),
            "STRONG should be inside EMPH. Current structure:\n{}",
            structure
        );
    }

    /// Test that display math with attributes parses correctly
    /// Regression test for equation_attributes_single_line golden test
    #[test]
    fn test_display_math_with_attributes() {
        let text = "$$ E = mc^2 $$ {#eq-einstein}";
        let mut config = ParserOptions::default();
        config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

        // Need a root node around the inline content
        let node = parse_recursive_in(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have DISPLAY_MATH node
        assert!(
            has_kind(&node, SyntaxKind::DISPLAY_MATH),
            "Should have DISPLAY_MATH node"
        );

        // Should have ATTRIBUTE node
        assert!(
            has_kind(&node, SyntaxKind::ATTRIBUTE),
            "Should have ATTRIBUTE node for {{#eq-einstein}}"
        );

        // Attributes should not be TEXT. TEXT is emitted as a token, so the
        // element after the math node must be inspected with the token-aware
        // sibling accessor; `next_sibling()` only ever yields nodes.
        let math_followed_by_text = node
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
            .filter_map(|math| math.next_sibling_or_token())
            .filter_map(|element| element.into_token())
            .any(|token| {
                token.kind() == SyntaxKind::TEXT && token.text().contains("{#eq-einstein}")
            });
        assert!(
            !math_followed_by_text,
            "Attributes should not be parsed as TEXT"
        );
    }

    /// With GFM extensions, the destination of an inline link must not be
    /// turned into a second (bare URI) link.
    #[test]
    fn test_parse_inline_text_gfm_inline_link_destination_not_autolinked() {
        use crate::options::{Extensions, Flavor};

        let config = ParserOptions {
            flavor: Flavor::Gfm,
            extensions: Extensions::for_flavor(Flavor::Gfm),
            ..ParserOptions::default()
        };

        let root = parse_recursive_in(
            SyntaxKind::PARAGRAPH,
            "Second Link [link_text](https://link.com)",
            &config,
        );

        let links: Vec<_> = root
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::LINK)
            .collect();
        assert_eq!(
            links.len(),
            1,
            "Expected exactly one LINK node for inline link, not nested bare URI autolink"
        );

        // Collect the text of the link's LINK_TEXT and LINK_DEST children.
        let mut link_text = None::<String>;
        let mut link_dest = None::<String>;
        for child in links[0].children() {
            match child.kind() {
                SyntaxKind::LINK_TEXT => link_text = Some(child.text().to_string()),
                SyntaxKind::LINK_DEST => link_dest = Some(child.text().to_string()),
                _ => {}
            }
        }

        assert_eq!(link_text.as_deref(), Some("link_text"));
        assert_eq!(link_dest.as_deref(), Some("https://link.com"));
    }

    /// A lone multi-byte character must round-trip when bare-URI autolinking
    /// is enabled (the bare-URI probe runs at every position).
    #[test]
    fn test_autolink_bare_uri_utf8_boundary_safe() {
        let text = "§";
        let mut config = ParserOptions::default();
        config.extensions.autolink_bare_uris = true;

        let node = parse_recursive_in(SyntaxKind::DOCUMENT, text, &config);
        assert_eq!(node.text().to_string(), text);
    }

    /// Emphasis around a multi-byte character parses without panicking and
    /// reports the full byte length as consumed.
    #[test]
    fn test_parse_emphasis_unicode_content_no_panic() {
        let text = "*§*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
        assert_eq!(result, Some((text.len(), 1)));

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);
        assert_eq!(node.text().to_string(), text);
    }
}
2268
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    // Input: **bold with *italic***
    // Expected tree: Strong["bold with ", Emph["italic"]]
    // The closing *** run is split into * (ends Emph) followed by ** (ends Strong).
    let input = "**bold with *italic***";
    let options = ParserOptions::default();
    let mut tree_builder = GreenNodeBuilder::new();

    // parse_inline_range writes the inline content straight into the builder,
    // without adding a wrapper node of its own.
    parse_inline_range(input, 0, input.len(), &options, &mut tree_builder);

    let green: GreenNode = tree_builder.finish();
    let root = SyntaxNode::new_root(green);

    // Round-trip check: the tree text must reproduce the input exactly.
    assert_eq!(root.text().to_string(), input, "Should be lossless");

    // With no wrapper, the tree root is the STRONG node itself.
    let root_kind = root.kind();
    assert_eq!(
        root_kind,
        SyntaxKind::STRONG,
        "Root should be STRONG, got: {:?}",
        root_kind
    );

    // EMPHASIS must be a direct child node of STRONG.
    let emphasis_child = root
        .children()
        .find(|child| child.kind() == SyntaxKind::EMPHASIS);
    assert!(
        emphasis_child.is_some(),
        "STRONG should contain EMPHASIS node"
    );
}
2304
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc treats `*foo *` as emphasis: for asterisks the closer does not have to be
    // right-flanking.
    let input = "*foo *";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Attempt emphasis at offset 0; success means 6 bytes consumed with delimiter count 1.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut tree);
    assert_eq!(
        outcome,
        Some((6, 1)),
        "Should parse as emphasis, result: {:?}",
        outcome
    );

    // Inspect the CST: its root is the EMPHASIS node itself.
    let root = SyntaxNode::new_root(tree.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2339
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc parses `***foo** bar **baz***` as Emph[Strong[foo], " bar ", Strong[baz]].
    let input = "***foo** bar **baz***";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    parse_inline_range(input, 0, input.len(), &options, &mut tree);
    let root = SyntaxNode::new_root(tree.finish());

    // The tree must hold exactly one EMPHASIS node.
    let emphasis_found: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    let emphasis_total = emphasis_found.len();
    assert_eq!(
        emphasis_total, 1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphasis_total
    );

    // That EMPHASIS node must have exactly two STRONG nodes as direct children.
    let strong_total = emphasis_found[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_total, 2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_total
    );

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2386
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc parses `***foo* bar *baz***` as Strong[Emph[foo], " bar ", Emph[baz]].
    let input = "***foo* bar *baz***";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    parse_inline_range(input, 0, input.len(), &options, &mut tree);
    let root = SyntaxNode::new_root(tree.finish());

    // The tree must hold exactly one STRONG node.
    let strong_found: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    let strong_total = strong_found.len();
    assert_eq!(
        strong_total, 1,
        "Should have exactly one STRONG node, found: {}",
        strong_total
    );

    // That STRONG node must have exactly two EMPHASIS nodes as direct children.
    let emphasis_total = strong_found[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_total, 2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_total
    );

    // Round-trip check.
    assert_eq!(root.text().to_string(), input);
}
2433
2434// Multiline emphasis tests
#[test]
fn test_parse_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc allows newlines (soft breaks) inside emphasis.
    let input = "*text on\nline two*";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Every byte should be consumed, with a delimiter count of 1.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut tree);
    assert_eq!(
        outcome,
        Some((input.len(), 1)),
        "Emphasis should parse multiline content"
    );

    // The CST root is the EMPHASIS node.
    let root = SyntaxNode::new_root(tree.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Round-trip check, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
2469
#[test]
fn test_parse_strong_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc allows newlines inside strong emphasis as well.
    let input = "**strong on\nline two**";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Every byte should be consumed, with a delimiter count of 2.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut tree);
    assert_eq!(
        outcome,
        Some((input.len(), 2)),
        "Strong emphasis should parse multiline content"
    );

    // The CST root is the STRONG node.
    let root = SyntaxNode::new_root(tree.finish());
    assert_eq!(root.kind(), SyntaxKind::STRONG);

    // Round-trip check, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in strong content"
    );
}
2504
#[test]
fn test_parse_triple_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Triple-delimiter emphasis whose content spans a newline.
    let input = "***both on\nline two***";
    let options = ParserOptions::default();
    let mut tree = GreenNodeBuilder::new();

    // Every byte should be consumed, with a delimiter count of 3.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut tree);
    assert_eq!(
        outcome,
        Some((input.len(), 3)),
        "Triple emphasis should parse multiline content"
    );

    // A STRONG node must be present somewhere in the tree (triple = strong + emph).
    let root = SyntaxNode::new_root(tree.finish());
    let strong_present = root
        .descendants()
        .any(|candidate| candidate.kind() == SyntaxKind::STRONG);
    assert!(strong_present, "Should have STRONG node");

    // Round-trip check, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}
panache_parser/parser/inlines/core.rs

panache_parser/parser/inlines/
core.rs