panache_parser/parser/inlines/
core.rs

1//! Recursive emphasis parsing using Pandoc's algorithm.
2//!
3//! This module implements emphasis/strong emphasis parsing using a recursive
4//! descent approach based on Pandoc's Haskell implementation in
5//! `Readers/Markdown.hs:L1662-L1722`.
6//!
7//! **Key algorithm**: Left-to-right, greedy, first-match wins
8//! 1. Parse text left-to-right
9//! 2. When we see delimiters, try to parse emphasis (look for matching closer)
10//! 3. If successful, emit emphasis node and continue from after closer
11//! 4. If failed (no closer found), emit delimiter as literal and continue
12//! 5. Nested emphasis is handled naturally by recursive parsing of content
13//!
14//! **Example**: `*foo **bar* baz**`
15//! - See `*`, try to parse EMPH
16//! - Parse content: see `**`, try to parse STRONG
17//! - STRONG finds closer `**` at end → succeeds, emits STRONG[bar* baz]
18//! - Outer `*` can't find closer (all delimiters consumed) → fails, emits `*foo` as literal
19//! - Result: `*foo` + STRONG[bar* baz]
20//!
21//! This matches Pandoc's behavior exactly.
22
23use crate::options::ParserOptions;
24use crate::syntax::SyntaxKind;
25use rowan::GreenNodeBuilder;
26
27// Import inline element parsers from sibling modules
28use super::bookdown::{
29    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
30};
31use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
32use super::citations::{
33    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
34    try_parse_bracketed_citation,
35};
36use super::code_spans::{emit_code_span, try_parse_code_span};
37use super::emoji::{emit_emoji, try_parse_emoji};
38use super::escapes::{EscapeType, emit_escape, try_parse_escape};
39use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
40use super::inline_footnotes::{
41    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
42    try_parse_inline_footnote,
43};
44use super::latex::{parse_latex_command, try_parse_latex_command};
45use super::links::{
46    emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link, emit_reference_image,
47    emit_reference_link, try_parse_autolink, try_parse_bare_uri, try_parse_inline_image,
48    try_parse_inline_link, try_parse_reference_image, try_parse_reference_link,
49};
50use super::mark::{emit_mark, try_parse_mark};
51use super::math::{
52    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
53    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
54    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
55    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
56    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
57    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
58};
59use super::native_spans::{emit_native_span, try_parse_native_span};
60use super::raw_inline::is_raw_inline;
61use super::shortcodes::{emit_shortcode, try_parse_shortcode};
62use super::strikeout::{emit_strikeout, try_parse_strikeout};
63use super::subscript::{emit_subscript, try_parse_subscript};
64use super::superscript::{emit_superscript, try_parse_superscript};
65
66/// Parse inline text using the recursive emphasis algorithm.
67///
68/// This is the main entry point for parsing inline content with Pandoc-style
69/// recursive emphasis handling. It uses a greedy left-to-right, first-match-wins
70/// approach that matches Pandoc's behavior exactly.
71///
72/// **Algorithm**:
73/// 1. Parse text left-to-right trying each inline element type in precedence order
74/// 2. When we see `*` or `_`, try to parse emphasis recursively
75/// 3. Nested emphasis naturally consumes delimiters before outer matches
76/// 4. All inline elements (code, links, math, etc.) are parsed on-the-fly
77///
78/// # Arguments
79/// * `text` - The inline text to parse
80/// * `config` - Configuration for extensions and formatting
81/// * `builder` - The CST builder to emit nodes to
82pub fn parse_inline_text_recursive(
83    builder: &mut GreenNodeBuilder,
84    text: &str,
85    config: &ParserOptions,
86) {
87    log::trace!(
88        "Recursive inline parsing: {:?} ({} bytes)",
89        &text[..text.len().min(40)],
90        text.len()
91    );
92
93    parse_inline_range(text, 0, text.len(), config, builder);
94
95    log::trace!("Recursive inline parsing complete");
96}
97
98/// Parse inline elements from text content.
99/// This is a standalone function used for recursive inline parsing within blocks.
100///
101/// The `allow_reference_links` parameter is accepted for compatibility but not currently used.
102/// Set to `false` in nested contexts (inside link text, image alt, spans) to prevent recursive parsing.
103pub fn parse_inline_text(
104    builder: &mut GreenNodeBuilder,
105    text: &str,
106    config: &ParserOptions,
107    _allow_reference_links: bool,
108) {
109    log::trace!(
110        "Parsing inline text (recursive): {:?} ({} bytes)",
111        &text[..text.len().min(40)],
112        text.len()
113    );
114
115    // Use recursive parsing with Pandoc's algorithm for emphasis
116    parse_inline_text_recursive(builder, text, config);
117}
118
119/// Try to parse emphasis starting at the given position.
120///
121/// This is the entry point for recursive emphasis parsing, equivalent to
122/// Pandoc's `enclosure` function.
123///
124/// Returns Some((bytes_consumed, delim_count)) if emphasis was successfully parsed,
125/// or None if the delimiter should be treated as literal text.
126/// When returning None, the delim_count tells the caller how many delimiter
127/// characters to skip (to avoid re-parsing parts of a failed delimiter run).
128///
129/// # Arguments
130/// * `text` - The full text being parsed
131/// * `pos` - Current position in text (where the delimiter starts)
132/// * `end` - End boundary (don't search for closers beyond this)
133/// * `config` - Configuration
134/// * `builder` - CST builder
135///
136/// **Algorithm**:
137/// 1. Count opening delimiters
138/// 2. Check if followed by whitespace (if so, return None)
139/// 3. Dispatch to parse_one/two/three based on count
140/// 4. Those functions parse content and look for matching closer (within bounds)
141/// 5. If closer found, emit node and return bytes consumed
142/// 6. If not found, return None with delimiter count (caller skips entire run)
143pub fn try_parse_emphasis(
144    text: &str,
145    pos: usize,
146    end: usize,
147    config: &ParserOptions,
148    builder: &mut GreenNodeBuilder,
149) -> Option<(usize, usize)> {
150    let bytes = text.as_bytes();
151
152    if pos >= bytes.len() {
153        return None;
154    }
155
156    let delim_char = bytes[pos] as char;
157    if delim_char != '*' && delim_char != '_' {
158        return None;
159    }
160
161    // Count consecutive delimiters
162    let mut count = 0;
163    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
164        count += 1;
165    }
166
167    let after_pos = pos + count;
168
169    log::trace!(
170        "try_parse_emphasis: '{}' x {} at pos {}",
171        delim_char,
172        count,
173        pos
174    );
175
176    // Check if followed by whitespace (Pandoc rule: treat as literal)
177    if after_pos < text.len()
178        && let Some(next_char) = text[after_pos..].chars().next()
179        && next_char.is_whitespace()
180    {
181        log::trace!("Delimiter followed by whitespace, treating as literal");
182        return None;
183    }
184
185    // For underscores: check intraword_underscores extension (Pandoc lines 1668-1672)
186    // Can't open if preceded by alphanumeric (prevents foo_bar from parsing)
187    if delim_char == '_'
188        && pos > 0
189        && let Some(prev_char) = text[..pos].chars().last()
190        && prev_char.is_alphanumeric()
191    {
192        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
193        return None;
194    }
195
196    // Dispatch based on delimiter count
197    let result = match count {
198        1 => try_parse_one(text, pos, delim_char, end, config, builder),
199        2 => try_parse_two(text, pos, delim_char, end, config, builder),
200        3 => try_parse_three(text, pos, delim_char, end, config, builder),
201        _ => {
202            // 4+ delimiters: treat as literal (Pandoc behavior)
203            log::trace!("{} delimiters (4+), treating as literal", count);
204            None
205        }
206    };
207
208    // If parsing succeeded, return (bytes_consumed, delim_count)
209    // If failed, return None but the caller will know to skip `count` delimiters
210    result.map(|consumed| (consumed, count))
211}
212
213/// Try to parse emphasis in a nested context (bypassing opener validity checks).
214///
215/// This mirrors Pandoc's behavior where `one` can call `two c mempty` directly,
216/// bypassing the `enclosure` opener validity checks. This is needed because
217/// patterns like `***foo **bar** baz***` require `**` followed by space to be
218/// parsed as a nested strong opener.
219///
220/// Returns Some((bytes_consumed, delim_count)) if successful, None otherwise.
221fn try_parse_emphasis_nested(
222    text: &str,
223    pos: usize,
224    end: usize,
225    config: &ParserOptions,
226    builder: &mut GreenNodeBuilder,
227) -> Option<(usize, usize)> {
228    let bytes = text.as_bytes();
229
230    if pos >= bytes.len() {
231        return None;
232    }
233
234    let delim_char = bytes[pos] as char;
235    if delim_char != '*' && delim_char != '_' {
236        return None;
237    }
238
239    // Count consecutive delimiters
240    let mut count = 0;
241    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
242        count += 1;
243    }
244
245    log::trace!(
246        "try_parse_emphasis_nested: '{}' x {} at pos {}",
247        delim_char,
248        count,
249        pos
250    );
251
252    // For underscores: still check intraword_underscores (prevents foo_bar parsing)
253    // This check applies even in nested contexts
254    if delim_char == '_'
255        && pos > 0
256        && let Some(prev_char) = text[..pos].chars().last()
257        && prev_char.is_alphanumeric()
258    {
259        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
260        return None;
261    }
262
263    // NOTE: We intentionally skip the "delimiter followed by whitespace" check here.
264    // In nested contexts (inside `one` calling `two`), Pandoc allows openers
265    // followed by whitespace because the opener has already been matched.
266
267    // Dispatch based on delimiter count
268    let result = match count {
269        1 => try_parse_one(text, pos, delim_char, end, config, builder),
270        2 => try_parse_two(text, pos, delim_char, end, config, builder),
271        3 => try_parse_three(text, pos, delim_char, end, config, builder),
272        _ => {
273            // 4+ delimiters: treat as literal (Pandoc behavior)
274            log::trace!("{} delimiters (4+), treating as literal", count);
275            None
276        }
277    };
278
279    result.map(|consumed| (consumed, count))
280}
281
282/// Try to parse emphasis with *** opening delimiter.
283///
284/// Tries to match closers in order: *** → ** → *
285/// Returns Some(bytes_consumed) if successful, None otherwise.
286fn try_parse_three(
287    text: &str,
288    pos: usize,
289    delim_char: char,
290    end: usize,
291    config: &ParserOptions,
292    builder: &mut GreenNodeBuilder,
293) -> Option<usize> {
294    let content_start = pos + 3;
295    let one = delim_char.to_string();
296    let two = one.repeat(2);
297
298    log::trace!("try_parse_three: '{}' x 3 at pos {}", delim_char, pos);
299
300    // Pandoc algorithm (line 1695): Parse content UNTIL we see a VALID ender
301    // We loop through potential enders, checking if each is valid.
302    // Invalid enders (like `**` preceded by whitespace) are skipped.
303    let mut search_pos = content_start;
304
305    loop {
306        // Find next potential ender
307        let closer_start = match find_first_potential_ender(text, search_pos, delim_char, end) {
308            Some(p) => p,
309            None => {
310                log::trace!("No potential ender found for ***");
311                return None;
312            }
313        };
314
315        log::trace!("Potential ender at pos {}", closer_start);
316
317        // Count how many delimiters we have at closer_start
318        let bytes = text.as_bytes();
319        let mut closer_count = 0;
320        let mut check_pos = closer_start;
321        while check_pos < bytes.len() && bytes[check_pos] == delim_char as u8 {
322            closer_count += 1;
323            check_pos += 1;
324        }
325
326        log::trace!(
327            "Found {} x {} at pos {}",
328            delim_char,
329            closer_count,
330            closer_start
331        );
332
333        // Try to match closers in order: ***, **, * (Pandoc lines 1696-1698)
334
335        // Try *** (line 1696)
336        if closer_count >= 3 && is_valid_ender(text, closer_start, delim_char, 3) {
337            log::trace!("Matched *** closer, emitting Strong[Emph[content]]");
338
339            builder.start_node(SyntaxKind::STRONG.into());
340            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
341
342            builder.start_node(SyntaxKind::EMPHASIS.into());
343            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
344            parse_inline_range_nested(text, content_start, closer_start, config, builder);
345            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
346            builder.finish_node(); // EMPHASIS
347
348            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
349            builder.finish_node(); // STRONG
350
351            return Some(closer_start + 3 - pos);
352        }
353
354        // Try ** (line 1697)
355        if closer_count >= 2 && is_valid_ender(text, closer_start, delim_char, 2) {
356            log::trace!("Matched ** closer, wrapping as Strong and continuing with one");
357
358            let continue_pos = closer_start + 2;
359
360            if let Some(final_closer_pos) =
361                parse_until_closer_with_nested_two(text, continue_pos, delim_char, 1, end, config)
362            {
363                log::trace!(
364                    "Found * closer at pos {}, emitting Emph[Strong[...], ...]",
365                    final_closer_pos
366                );
367
368                builder.start_node(SyntaxKind::EMPHASIS.into());
369                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
370
371                builder.start_node(SyntaxKind::STRONG.into());
372                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
373                parse_inline_range_nested(text, content_start, closer_start, config, builder);
374                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
375                builder.finish_node(); // STRONG
376
377                // Parse additional content between ** and * (up to but not including the closer)
378                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
379
380                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
381                builder.finish_node(); // EMPHASIS
382
383                return Some(final_closer_pos + 1 - pos);
384            }
385
386            // Fallback: emit * + STRONG
387            log::trace!("No * closer found after **, emitting * + STRONG");
388            builder.token(SyntaxKind::TEXT.into(), &one);
389
390            builder.start_node(SyntaxKind::STRONG.into());
391            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
392            parse_inline_range_nested(text, content_start, closer_start, config, builder);
393            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
394            builder.finish_node(); // STRONG
395
396            return Some(closer_start + 2 - pos);
397        }
398
399        // Try * (line 1698)
400        if closer_count >= 1 && is_valid_ender(text, closer_start, delim_char, 1) {
401            log::trace!("Matched * closer, wrapping as Emph and continuing with two");
402
403            let continue_pos = closer_start + 1;
404
405            if let Some(final_closer_pos) =
406                parse_until_closer_with_nested_one(text, continue_pos, delim_char, 2, end, config)
407            {
408                log::trace!(
409                    "Found ** closer at pos {}, emitting Strong[Emph[...], ...]",
410                    final_closer_pos
411                );
412
413                builder.start_node(SyntaxKind::STRONG.into());
414                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
415
416                builder.start_node(SyntaxKind::EMPHASIS.into());
417                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
418                parse_inline_range_nested(text, content_start, closer_start, config, builder);
419                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
420                builder.finish_node(); // EMPHASIS
421
422                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
423
424                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
425                builder.finish_node(); // STRONG
426
427                return Some(final_closer_pos + 2 - pos);
428            }
429
430            // Fallback: emit ** + EMPH
431            log::trace!("No ** closer found after *, emitting ** + EMPH");
432            builder.token(SyntaxKind::TEXT.into(), &two);
433
434            builder.start_node(SyntaxKind::EMPHASIS.into());
435            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
436            parse_inline_range_nested(text, content_start, closer_start, config, builder);
437            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
438            builder.finish_node(); // EMPHASIS
439
440            return Some(closer_start + 1 - pos);
441        }
442
443        // No valid ender at this position - continue searching after this delimiter run
444        log::trace!(
445            "No valid ender at pos {}, continuing search from {}",
446            closer_start,
447            closer_start + closer_count
448        );
449        search_pos = closer_start + closer_count;
450    }
451}
452
/// Locate the first unescaped occurrence of the delimiter byte in
/// `text[start..min(end, text.len())]`.
///
/// This implements Pandoc's `many (notFollowedBy (ender c 1) >> inline)`:
/// content is consumed until a delimiter that could be an ender shows up.
///
/// A delimiter counts as escaped when it is directly preceded by an odd
/// number of backslashes. Returns the byte offset of the match, or `None`
/// when the range is empty or holds no unescaped delimiter.
fn find_first_potential_ender(
    text: &str,
    start: usize,
    delim_char: char,
    end: usize,
) -> Option<usize> {
    let raw = text.as_bytes();
    let delim = delim_char as u8;
    let limit = end.min(raw.len());

    (start..limit).find(|&idx| {
        // An even number of preceding backslashes (including zero) means the
        // backslashes escape each other and the delimiter itself is live.
        raw[idx] == delim
            && raw[..idx]
                .iter()
                .rev()
                .take_while(|&&b| b == b'\\')
                .count()
                % 2
                == 0
    })
}
490
/// Decide whether the delimiter run at `pos` is a valid ender of exactly
/// `delim_count` delimiters. This implements Pandoc's `ender c n` function.
///
/// The run is valid when:
/// * `text` holds `delim_count` delimiter bytes starting at `pos`,
/// * the byte right before and the byte right after the run are not the
///   delimiter (so the run is neither longer nor part of a longer run), and
/// * for `_` only: the run is not preceded by whitespace and not followed by
///   an alphanumeric character.
///
/// Asterisks have no flanking requirement.
fn is_valid_ender(text: &str, pos: usize, delim_char: char, delim_count: usize) -> bool {
    let raw = text.as_bytes();
    let delim = delim_char as u8;
    let after = pos + delim_count;

    // The whole run must fit inside the text and consist of delimiters only.
    match raw.get(pos..after) {
        Some(run) if run.iter().all(|&b| b == delim) => {}
        _ => return false,
    }

    // A neighbouring delimiter on either side means the real run is longer.
    let delimiter_before = pos > 0 && raw[pos - 1] == delim;
    let delimiter_after = raw.get(after) == Some(&delim);
    if delimiter_before || delimiter_after {
        return false;
    }

    // Pandoc's `ender` puts no flanking requirement on asterisks.
    if delim_char != '_' {
        return true;
    }

    // Underscores: reject when preceded by whitespace or followed by an
    // alphanumeric character. `||` short-circuits, so the text after the run
    // is only inspected when the first test passes.
    let rejected = text[..pos]
        .chars()
        .next_back()
        .is_some_and(char::is_whitespace)
        || text[after..]
            .chars()
            .next()
            .is_some_and(char::is_alphanumeric);

    !rejected
}
539
540/// Try to parse emphasis with ** opening delimiter.
541///
542/// Tries to match ** closer only. No fallback.
543/// Returns Some(bytes_consumed) if successful, None otherwise.
544fn try_parse_two(
545    text: &str,
546    pos: usize,
547    delim_char: char,
548    end: usize,
549    config: &ParserOptions,
550    builder: &mut GreenNodeBuilder,
551) -> Option<usize> {
552    let content_start = pos + 2;
553
554    log::trace!("try_parse_two: '{}' x 2 at pos {}", delim_char, pos);
555
556    // Try to find ** closer, checking for nested * emphasis along the way
557    if let Some(closer_pos) =
558        parse_until_closer_with_nested_one(text, content_start, delim_char, 2, end, config)
559    {
560        log::trace!("Found ** closer at pos {}", closer_pos);
561
562        // Emit STRONG(content)
563        builder.start_node(SyntaxKind::STRONG.into());
564        builder.token(SyntaxKind::STRONG_MARKER.into(), &text[pos..pos + 2]);
565        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
566        builder.token(
567            SyntaxKind::STRONG_MARKER.into(),
568            &text[closer_pos..closer_pos + 2],
569        );
570        builder.finish_node(); // STRONG
571
572        return Some(closer_pos + 2 - pos);
573    }
574
575    // No closer found
576    log::trace!("No closer found for **");
577    None
578}
579
580/// Try to parse emphasis with * opening delimiter.
581///
582/// Tries to match * closer.
583/// Returns Some(bytes_consumed) if successful, None otherwise.
584///
585/// **Pandoc algorithm**: While parsing content, if we encounter **,
586/// try to parse it as `two` (strong) recursively. If `two` succeeds,
587/// it consumes the ** delimiters, potentially preventing us from finding
588/// a closer for the outer *. This creates priority where ** can "steal"
589/// matches from *.
590fn try_parse_one(
591    text: &str,
592    pos: usize,
593    delim_char: char,
594    end: usize,
595    config: &ParserOptions,
596    builder: &mut GreenNodeBuilder,
597) -> Option<usize> {
598    let content_start = pos + 1;
599
600    log::trace!("try_parse_one: '{}' x 1 at pos {}", delim_char, pos);
601
602    // Try to find * closer using Pandoc's algorithm with nested two attempts
603    if let Some(closer_pos) =
604        parse_until_closer_with_nested_two(text, content_start, delim_char, 1, end, config)
605    {
606        log::trace!("Found * closer at pos {}", closer_pos);
607
608        // Emit EMPH(content)
609        builder.start_node(SyntaxKind::EMPHASIS.into());
610        builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &text[pos..pos + 1]);
611        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
612        builder.token(
613            SyntaxKind::EMPHASIS_MARKER.into(),
614            &text[closer_pos..closer_pos + 1],
615        );
616        builder.finish_node(); // EMPHASIS
617
618        return Some(closer_pos + 1 - pos);
619    }
620
621    // No closer found
622    log::trace!("No closer found for *");
623    None
624}
625
626/// Parse inline content and look for a matching closer, with nested two attempts.
627///
628/// This implements Pandoc's algorithm from Markdown.hs lines 1712-1717:
629/// When parsing `*...*`, if we encounter `**` (and it's not followed by
630/// another `*` that would close the outer emphasis), try to parse it as
631/// `two c mempty` (strong). If `two` succeeds, those `**` delimiters are
632/// consumed, and we continue searching for the `*` closer.
633///
634/// This creates a priority system where `**` can "steal" matches from `*`.
635///
636/// Example: `*foo **bar* baz**`
637/// - When parsing the outer `*...*`, we encounter `**` at position 5
638/// - We try `two` which succeeds with `**bar* baz**`
639/// - Now there's no `*` closer for the outer `*`, so it fails
640/// - Result: literal `*foo ` + STRONG("bar* baz")
641///
642/// # Arguments
643/// * `end` - Don't search beyond this position (respects nesting boundaries)
644fn parse_until_closer_with_nested_two(
645    text: &str,
646    start: usize,
647    delim_char: char,
648    delim_count: usize,
649    end: usize,
650    config: &ParserOptions,
651) -> Option<usize> {
652    let bytes = text.as_bytes();
653    let mut pos = start;
654
655    while pos < end.min(text.len()) {
656        if bytes[pos] == b'`'
657            && let Some(m) = try_parse_inline_executable(
658                &text[pos..],
659                config.extensions.rmarkdown_inline_code,
660                config.extensions.quarto_inline_code,
661            )
662        {
663            log::trace!(
664                "Skipping inline executable span of {} bytes at pos {}",
665                m.total_len,
666                pos
667            );
668            pos += m.total_len;
669            continue;
670        }
671
672        // Skip over code spans - their content is protected from delimiter matching
673        if bytes[pos] == b'`'
674            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
675        {
676            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
677            pos += len;
678            continue;
679        }
680
681        // Skip over inline math - their content is protected from delimiter matching
682        if bytes[pos] == b'$'
683            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
684        {
685            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
686            pos += len;
687            continue;
688        }
689
690        // Skip over links - their content is protected from delimiter matching
691        if bytes[pos] == b'['
692            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
693        {
694            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
695            pos += len;
696            continue;
697        }
698
699        // Pandoc algorithm: If we're looking for a single delimiter (*) and
700        // encounter a double delimiter (**), try to parse it as `two` (strong).
701        // This happens BEFORE checking if pos is a closer for our current emphasis.
702        if delim_count == 1
703            && pos + 2 <= text.len()
704            && bytes[pos] == delim_char as u8
705            && bytes[pos + 1] == delim_char as u8
706        {
707            // First check if the first delimiter is escaped
708            let first_is_escaped = {
709                let mut backslash_count = 0;
710                let mut check_pos = pos;
711                while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
712                    backslash_count += 1;
713                    check_pos -= 1;
714                }
715                backslash_count % 2 == 1
716            };
717
718            if first_is_escaped {
719                // First * is escaped, skip it and continue
720                // The second * might be a closer or start of emphasis
721                log::trace!(
722                    "First * at pos {} is escaped, skipping to check second *",
723                    pos
724                );
725                pos = advance_char_boundary(text, pos, end);
726                continue;
727            }
728
729            // Check that there's NOT a third delimiter (which would make this
730            // part of a longer run that we shouldn't treat as `two`)
731            let no_third_delim = pos + 2 >= bytes.len() || bytes[pos + 2] != delim_char as u8;
732
733            if no_third_delim {
734                log::trace!(
735                    "try_parse_one: found ** at pos {}, attempting nested two",
736                    pos
737                );
738
739                // Try to parse as `two` (strong emphasis)
740                // We create a temporary builder to test if `two` succeeds
741                let mut temp_builder = GreenNodeBuilder::new();
742                if let Some(two_consumed) =
743                    try_parse_two(text, pos, delim_char, end, config, &mut temp_builder)
744                {
745                    // `two` succeeded! Those ** delimiters are consumed.
746                    // We skip past the `two` and continue searching for our `*` closer.
747                    log::trace!(
748                        "Nested two succeeded, consumed {} bytes, continuing search",
749                        two_consumed
750                    );
751                    pos += two_consumed;
752                    continue;
753                }
754                // `two` failed - this means the entire `one` parse should fail!
755                // In Pandoc, the `try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty)`
756                // alternative fails, and the first alternative `notFollowedBy (ender c 1) >> inline`
757                // also fails because we ARE followed by an ender (the first * of **).
758                // So the entire content parsing fails, and `one` returns failure.
759                log::trace!("Nested two failed at pos {}, entire one() should fail", pos);
760                return None;
761            }
762        }
763
764        // Check if we have a potential closer here
765        if pos + delim_count <= text.len() {
766            let mut matches = true;
767            for i in 0..delim_count {
768                if bytes[pos + i] != delim_char as u8 {
769                    matches = false;
770                    break;
771                }
772            }
773
774            if matches {
775                // IMPORTANT: Check that there are EXACTLY delim_count delimiters,
776                // not more. E.g., when looking for `*`, we shouldn't match
777                // `*` that's part of a longer run.
778
779                // Check: not escaped (preceded by odd number of backslashes)
780                let is_escaped = {
781                    let mut backslash_count = 0;
782                    let mut check_pos = pos;
783                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
784                        backslash_count += 1;
785                        check_pos -= 1;
786                    }
787                    backslash_count % 2 == 1 // Odd number = escaped
788                };
789
790                // Allow matching at the start OR end of a delimiter run.
791                // This lets `**` close at the end of `***` (after a nested `*` closes),
792                // while still avoiding matches in the middle of longer runs.
793                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
794                let after_pos = pos + delim_count;
795                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
796
797                if (at_run_start || at_run_end) && !is_escaped {
798                    // Found a potential closer!
799                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
800                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
801                    if delim_char == '_'
802                        && pos > start
803                        && let Some(prev_char) = text[..pos].chars().last()
804                        && prev_char.is_whitespace()
805                    {
806                        log::trace!(
807                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
808                            pos
809                        );
810                        // Not a valid closer, continue searching
811                        pos = advance_char_boundary(text, pos, end);
812                        continue;
813                    }
814
815                    log::trace!(
816                        "Found exact {} x {} closer at pos {}",
817                        delim_char,
818                        delim_count,
819                        pos
820                    );
821                    return Some(pos);
822                }
823            }
824        }
825
826        // Not a closer, move to next UTF-8 boundary.
827        pos = advance_char_boundary(text, pos, end);
828    }
829
830    None
831}
832
833/// Parse inline content and look for a matching closer, with nested one attempts.
834///
835/// This implements the symmetric case to `parse_until_closer_with_nested_two`:
836/// When parsing `**...**`, if we encounter `*` (and it's not followed by
837/// another `*` that would be part of our `**` closer), try to parse it as
838/// `one c mempty` (emphasis). If `one` succeeds, those `*` delimiters are
839/// consumed, and we continue searching for the `**` closer.
840///
841/// This ensures nested emphasis closes before the outer strong emphasis.
842///
843/// Example: `**bold with *italic***`
844/// - When parsing the outer `**...**, we scan for `**` closer
845/// - At position 12, we encounter a single `*` (start of `*italic`)
846/// - We try `one` which succeeds with `*italic*` (consuming the first `*` from `***`)
847/// - We continue scanning and find `**` at position 20 (the remaining `**` from `***`)
848/// - Result: STRONG["bold with " EMPHASIS["italic"]]
849///
850/// # Arguments
851/// * `end` - Don't search beyond this position (respects nesting boundaries)
852fn parse_until_closer_with_nested_one(
853    text: &str,
854    start: usize,
855    delim_char: char,
856    delim_count: usize,
857    end: usize,
858    config: &ParserOptions,
859) -> Option<usize> {
860    let bytes = text.as_bytes();
861    let mut pos = start;
862
863    while pos < end.min(text.len()) {
864        if bytes[pos] == b'`'
865            && let Some(m) = try_parse_inline_executable(
866                &text[pos..],
867                config.extensions.rmarkdown_inline_code,
868                config.extensions.quarto_inline_code,
869            )
870        {
871            log::trace!(
872                "Skipping inline executable span of {} bytes at pos {}",
873                m.total_len,
874                pos
875            );
876            pos += m.total_len;
877            continue;
878        }
879
880        // Skip over code spans - their content is protected from delimiter matching
881        if bytes[pos] == b'`'
882            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
883        {
884            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
885            pos += len;
886            continue;
887        }
888
889        // Skip over inline math - their content is protected from delimiter matching
890        if bytes[pos] == b'$'
891            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
892        {
893            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
894            pos += len;
895            continue;
896        }
897
898        // Skip over links - their content is protected from delimiter matching
899        if bytes[pos] == b'['
900            && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..])
901        {
902            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
903            pos += len;
904            continue;
905        }
906
907        // Pandoc algorithm: If we're looking for a double delimiter (**) and
908        // encounter a single delimiter (*), check if it's a valid emphasis opener.
909        // If it is, try to parse it as `one` (emphasis). If `one` succeeds, skip
910        // over it. If `one` fails, the outer `two` also fails (delimiter poisoning).
911        // If the `*` is NOT a valid opener (e.g., followed by whitespace or escaped),
912        // skip it and continue looking for the `**` closer.
913        if delim_count == 2 && pos < text.len() && bytes[pos] == delim_char as u8 {
914            // Check that there's NOT a second delimiter immediately after
915            // (which would make this part of our `**` closer or another `**` opener)
916            let no_second_delim = pos + 1 >= bytes.len() || bytes[pos + 1] != delim_char as u8;
917
918            if no_second_delim {
919                // Check if this * is escaped (preceded by odd number of backslashes)
920                let is_escaped = {
921                    let mut backslash_count = 0;
922                    let mut check_pos = pos;
923                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
924                        backslash_count += 1;
925                        check_pos -= 1;
926                    }
927                    backslash_count % 2 == 1
928                };
929
930                if is_escaped {
931                    // Escaped delimiter - just literal text, skip it
932                    log::trace!("* at pos {} is escaped, skipping", pos);
933                    pos = advance_char_boundary(text, pos, end);
934                    continue;
935                }
936
937                // Check if this * is a valid emphasis opener (Pandoc's enclosure rule).
938                // A delimiter followed by whitespace is NOT an opener - it's literal text.
939                let after_delim = pos + 1;
940                let followed_by_whitespace = after_delim < text.len()
941                    && text[after_delim..]
942                        .chars()
943                        .next()
944                        .is_some_and(|c| c.is_whitespace());
945
946                if followed_by_whitespace {
947                    // Not a valid opener - just literal text, skip it
948                    log::trace!(
949                        "* at pos {} followed by whitespace, not an opener, skipping",
950                        pos
951                    );
952                    pos = advance_char_boundary(text, pos, end);
953                    continue;
954                }
955
956                log::trace!(
957                    "try_parse_two: found * at pos {}, attempting nested one",
958                    pos
959                );
960
961                // Try to parse as `one` (emphasis)
962                // We create a temporary builder to test if `one` succeeds
963                let mut temp_builder = GreenNodeBuilder::new();
964                if let Some(one_consumed) =
965                    try_parse_one(text, pos, delim_char, end, config, &mut temp_builder)
966                {
967                    // `one` succeeded! Those * delimiters are consumed.
968                    // We skip past the `one` and continue searching for our `**` closer.
969                    log::trace!(
970                        "Nested one succeeded, consumed {} bytes, continuing search",
971                        one_consumed
972                    );
973                    pos += one_consumed;
974                    continue;
975                }
976
977                // `one` failed to find a closer. According to Pandoc's algorithm,
978                // this means the outer `two` should also fail. An unmatched inner
979                // delimiter "poisons" the outer emphasis.
980                // Example: `**foo *bar**` - the `*` can't find a closer, so the
981                // outer `**` should fail and the whole thing becomes literal.
982                log::trace!(
983                    "Nested one failed at pos {}, poisoning outer two (no closer found)",
984                    pos
985                );
986                return None;
987            }
988        }
989
990        // Check if we have a potential closer here
991        if pos + delim_count <= text.len() {
992            let mut matches = true;
993            for i in 0..delim_count {
994                if bytes[pos + i] != delim_char as u8 {
995                    matches = false;
996                    break;
997                }
998            }
999
1000            if matches {
1001                // Check: not escaped (preceded by odd number of backslashes)
1002                let is_escaped = {
1003                    let mut backslash_count = 0;
1004                    let mut check_pos = pos;
1005                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
1006                        backslash_count += 1;
1007                        check_pos -= 1;
1008                    }
1009                    backslash_count % 2 == 1 // Odd number = escaped
1010                };
1011
1012                // Allow matching at the start OR end of a delimiter run.
1013                // This lets `**` close at the end of `***` (after a nested `*` closes),
1014                // while still avoiding matches in the middle of longer runs.
1015                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
1016                let after_pos = pos + delim_count;
1017                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
1018
1019                if (at_run_start || at_run_end) && !is_escaped {
1020                    // Found a potential closer!
1021                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
1022                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
1023                    if delim_char == '_'
1024                        && pos > start
1025                        && let Some(prev_char) = text[..pos].chars().last()
1026                        && prev_char.is_whitespace()
1027                    {
1028                        log::trace!(
1029                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
1030                            pos
1031                        );
1032                        // Not a valid closer, continue searching
1033                        pos = advance_char_boundary(text, pos, end);
1034                        continue;
1035                    }
1036
1037                    log::trace!(
1038                        "Found exact {} x {} closer at pos {}",
1039                        delim_char,
1040                        delim_count,
1041                        pos
1042                    );
1043                    return Some(pos);
1044                }
1045            }
1046        }
1047
1048        // Not a closer, move to next UTF-8 boundary.
1049        pos = advance_char_boundary(text, pos, end);
1050    }
1051
1052    None
1053}
1054
1055///
1056/// This is the recursive inline parser that handles all inline elements:
1057/// - Text
1058/// - Escapes (highest priority)
1059/// - Code spans
1060/// - Math (inline and display)
1061/// - Emphasis/strong (via try_parse_emphasis)
1062/// - Other inline elements
1063///
1064/// **Important**: This is where the greedy left-to-right parsing happens.
1065/// When we see `**`, we try to parse it as STRONG. If it succeeds, those
1066/// delimiters are consumed and won't be available for outer emphasis.
1067///
1068/// # Arguments
1069/// * `nested_emphasis` - If true, bypass opener validity checks for emphasis.
1070///   Set to true when called from within emphasis parsing (e.g., from try_parse_one/two/three).
1071fn parse_inline_range(
1072    text: &str,
1073    start: usize,
1074    end: usize,
1075    config: &ParserOptions,
1076    builder: &mut GreenNodeBuilder,
1077) {
1078    parse_inline_range_impl(text, start, end, config, builder, false)
1079}
1080
1081/// Same as `parse_inline_range` but bypasses opener validity checks for emphasis.
1082/// Used within emphasis parsing contexts (e.g., from try_parse_one/two/three).
1083fn parse_inline_range_nested(
1084    text: &str,
1085    start: usize,
1086    end: usize,
1087    config: &ParserOptions,
1088    builder: &mut GreenNodeBuilder,
1089) {
1090    parse_inline_range_impl(text, start, end, config, builder, true)
1091}
1092
/// Report whether an emoji alias may start at byte offset `pos`.
///
/// Returns `false` when the byte right before `pos` is an ASCII letter, an
/// ASCII digit or `_`, and `true` in every other case, including `pos == 0`.
/// Only that single preceding byte is inspected, so a byte belonging to a
/// multi-byte character never blocks a match.
///
/// Panics if `pos` is greater than `text.len()`.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Nothing precedes the start of the text.
        None => true,
        Some(prev_idx) => {
            let prev = text.as_bytes()[prev_idx];
            !(prev.is_ascii_alphanumeric() || prev == b'_')
        }
    }
}
1102
/// Step from `pos` over one character of `text`, never going past `end`.
///
/// Returns `pos` unchanged when it is already at or beyond `end` or the end
/// of `text`. Otherwise the result is `pos` plus the UTF-8 width of the
/// character starting there, clamped to `end`.
///
/// `pos` must lie on a character boundary of `text`; slicing panics if it
/// does not.
#[inline]
fn advance_char_boundary(text: &str, pos: usize, end: usize) -> usize {
    if pos < end && pos < text.len() {
        let width = match text[pos..].chars().next() {
            Some(ch) => ch.len_utf8(),
            None => 1,
        };
        (pos + width).min(end)
    } else {
        pos
    }
}
1114
1115fn parse_inline_range_impl(
1116    text: &str,
1117    start: usize,
1118    end: usize,
1119    config: &ParserOptions,
1120    builder: &mut GreenNodeBuilder,
1121    nested_emphasis: bool,
1122) {
1123    log::trace!(
1124        "parse_inline_range: start={}, end={}, text={:?}",
1125        start,
1126        end,
1127        &text[start..end]
1128    );
1129    let mut pos = start;
1130    let mut text_start = start;
1131
1132    while pos < end {
1133        let byte = text.as_bytes()[pos];
1134
1135        // Backslash math (highest priority if enabled)
1136        if byte == b'\\' {
1137            // Try double backslash display math first: \\[...\\]
1138            if config.extensions.tex_math_double_backslash {
1139                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
1140                {
1141                    if pos > text_start {
1142                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1143                    }
1144                    log::trace!("Matched double backslash display math at pos {}", pos);
1145                    emit_double_backslash_display_math(builder, content);
1146                    pos += len;
1147                    text_start = pos;
1148                    continue;
1149                }
1150
1151                // Try double backslash inline math: \\(...\\)
1152                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
1153                    if pos > text_start {
1154                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1155                    }
1156                    log::trace!("Matched double backslash inline math at pos {}", pos);
1157                    emit_double_backslash_inline_math(builder, content);
1158                    pos += len;
1159                    text_start = pos;
1160                    continue;
1161                }
1162            }
1163
1164            // Try single backslash display math: \[...\]
1165            if config.extensions.tex_math_single_backslash {
1166                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
1167                {
1168                    if pos > text_start {
1169                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1170                    }
1171                    log::trace!("Matched single backslash display math at pos {}", pos);
1172                    emit_single_backslash_display_math(builder, content);
1173                    pos += len;
1174                    text_start = pos;
1175                    continue;
1176                }
1177
1178                // Try single backslash inline math: \(...\)
1179                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
1180                    if pos > text_start {
1181                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1182                    }
1183                    log::trace!("Matched single backslash inline math at pos {}", pos);
1184                    emit_single_backslash_inline_math(builder, content);
1185                    pos += len;
1186                    text_start = pos;
1187                    continue;
1188                }
1189            }
1190
1191            // Try math environments \begin{equation}...\end{equation}
1192            if config.extensions.raw_tex
1193                && let Some((len, begin_marker, content, end_marker)) =
1194                    try_parse_math_environment(&text[pos..])
1195            {
1196                if pos > text_start {
1197                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1198                }
1199                log::trace!("Matched math environment at pos {}", pos);
1200                emit_display_math_environment(builder, begin_marker, content, end_marker);
1201                pos += len;
1202                text_start = pos;
1203                continue;
1204            }
1205
1206            // Try bookdown reference: \@ref(label)
1207            if config.extensions.bookdown_references
1208                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
1209            {
1210                if pos > text_start {
1211                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1212                }
1213                log::trace!("Matched bookdown reference at pos {}: {}", pos, label);
1214                super::citations::emit_bookdown_crossref(builder, label);
1215                pos += len;
1216                text_start = pos;
1217                continue;
1218            }
1219
1220            // Try escapes (after bookdown refs and backslash math)
1221            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
1222                let escape_enabled = match escape_type {
1223                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
1224                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
1225                    EscapeType::Literal => {
1226                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!";
1227                        BASE_ESCAPABLE.contains(ch) || config.extensions.all_symbols_escapable
1228                    }
1229                };
1230                if !escape_enabled {
1231                    // Don't treat as hard line break - skip the escape and continue
1232                    // The backslash will be included in the next TEXT token
1233                    pos = advance_char_boundary(text, pos, end);
1234                    continue;
1235                }
1236
1237                // Emit accumulated text
1238                if pos > text_start {
1239                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1240                }
1241
1242                log::trace!("Matched escape at pos {}: \\{}", pos, ch);
1243                emit_escape(builder, ch, escape_type);
1244                pos += len;
1245                text_start = pos;
1246                continue;
1247            }
1248
1249            // Try LaTeX commands (after escapes, before shortcodes)
1250            if config.extensions.raw_tex
1251                && let Some(len) = try_parse_latex_command(&text[pos..])
1252            {
1253                if pos > text_start {
1254                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1255                }
1256                log::trace!("Matched LaTeX command at pos {}", pos);
1257                parse_latex_command(builder, &text[pos..], len);
1258                pos += len;
1259                text_start = pos;
1260                continue;
1261            }
1262        }
1263
1264        // Try Quarto shortcodes: {{< shortcode >}}
1265        if byte == b'{'
1266            && pos + 1 < text.len()
1267            && text.as_bytes()[pos + 1] == b'{'
1268            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
1269        {
1270            if pos > text_start {
1271                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1272            }
1273            log::trace!("Matched shortcode at pos {}: {}", pos, &name);
1274            emit_shortcode(builder, &name, attrs);
1275            pos += len;
1276            text_start = pos;
1277            continue;
1278        }
1279
1280        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
1281        if byte == b'`'
1282            && let Some(m) = try_parse_inline_executable(
1283                &text[pos..],
1284                config.extensions.rmarkdown_inline_code,
1285                config.extensions.quarto_inline_code,
1286            )
1287        {
1288            if pos > text_start {
1289                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1290            }
1291            log::trace!("Matched inline executable code at pos {}", pos);
1292            emit_inline_executable(builder, &m);
1293            pos += m.total_len;
1294            text_start = pos;
1295            continue;
1296        }
1297
1298        // Try code spans
1299        if byte == b'`'
1300            && let Some((len, content, backtick_count, attributes)) =
1301                try_parse_code_span(&text[pos..])
1302        {
1303            // Emit accumulated text
1304            if pos > text_start {
1305                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1306            }
1307
1308            log::trace!(
1309                "Matched code span at pos {}: {} backticks",
1310                pos,
1311                backtick_count
1312            );
1313
1314            // Check for raw inline
1315            if let Some(ref attrs) = attributes
1316                && config.extensions.raw_attribute
1317                && let Some(format) = is_raw_inline(attrs)
1318            {
1319                use super::raw_inline::emit_raw_inline;
1320                log::trace!("Matched raw inline span at pos {}: format={}", pos, format);
1321                emit_raw_inline(builder, content, backtick_count, format);
1322            } else if !config.extensions.inline_code_attributes && attributes.is_some() {
1323                let code_span_len = backtick_count * 2 + content.len();
1324                emit_code_span(builder, content, backtick_count, None);
1325                pos += code_span_len;
1326                text_start = pos;
1327                continue;
1328            } else {
1329                emit_code_span(builder, content, backtick_count, attributes);
1330            }
1331
1332            pos += len;
1333            text_start = pos;
1334            continue;
1335        }
1336
1337        // Try textual emoji aliases: :smile:
1338        if byte == b':'
1339            && config.extensions.emoji
1340            && is_emoji_boundary(text, pos)
1341            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
1342        {
1343            if pos > text_start {
1344                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1345            }
1346            log::trace!("Matched emoji at pos {}", pos);
1347            emit_emoji(builder, &text[pos..pos + len]);
1348            pos += len;
1349            text_start = pos;
1350            continue;
1351        }
1352
1353        // Try inline footnotes: ^[note]
1354        if byte == b'^'
1355            && pos + 1 < text.len()
1356            && text.as_bytes()[pos + 1] == b'['
1357            && config.extensions.inline_footnotes
1358            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
1359        {
1360            if pos > text_start {
1361                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1362            }
1363            log::trace!("Matched inline footnote at pos {}", pos);
1364            emit_inline_footnote(builder, content, config);
1365            pos += len;
1366            text_start = pos;
1367            continue;
1368        }
1369
1370        // Try superscript: ^text^
1371        if byte == b'^'
1372            && config.extensions.superscript
1373            && let Some((len, content)) = try_parse_superscript(&text[pos..])
1374        {
1375            if pos > text_start {
1376                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1377            }
1378            log::trace!("Matched superscript at pos {}", pos);
1379            emit_superscript(builder, content, config);
1380            pos += len;
1381            text_start = pos;
1382            continue;
1383        }
1384
1385        // Try bookdown definition: (\#label) or (ref:label)
1386        if byte == b'(' && config.extensions.bookdown_references {
1387            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
1388                if pos > text_start {
1389                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1390                }
1391                log::trace!("Matched bookdown definition at pos {}: {}", pos, label);
1392                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1393                pos += len;
1394                text_start = pos;
1395                continue;
1396            }
1397            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
1398                if pos > text_start {
1399                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1400                }
1401                log::trace!("Matched bookdown text reference at pos {}: {}", pos, label);
1402                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1403                pos += len;
1404                text_start = pos;
1405                continue;
1406            }
1407        }
1408
1409        // Try subscript: ~text~
1410        if byte == b'~'
1411            && config.extensions.subscript
1412            && let Some((len, content)) = try_parse_subscript(&text[pos..])
1413        {
1414            if pos > text_start {
1415                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1416            }
1417            log::trace!("Matched subscript at pos {}", pos);
1418            emit_subscript(builder, content, config);
1419            pos += len;
1420            text_start = pos;
1421            continue;
1422        }
1423
1424        // Try strikeout: ~~text~~
1425        if byte == b'~'
1426            && config.extensions.strikeout
1427            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
1428        {
1429            if pos > text_start {
1430                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1431            }
1432            log::trace!("Matched strikeout at pos {}", pos);
1433            emit_strikeout(builder, content, config);
1434            pos += len;
1435            text_start = pos;
1436            continue;
1437        }
1438
1439        // Try mark/highlight: ==text==
1440        if byte == b'='
1441            && config.extensions.mark
1442            && let Some((len, content)) = try_parse_mark(&text[pos..])
1443        {
1444            if pos > text_start {
1445                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1446            }
1447            log::trace!("Matched mark at pos {}", pos);
1448            emit_mark(builder, content, config);
1449            pos += len;
1450            text_start = pos;
1451            continue;
1452        }
1453
1454        // Try GFM inline math: $`...`$
1455        if byte == b'$'
1456            && config.extensions.tex_math_gfm
1457            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
1458        {
1459            if pos > text_start {
1460                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1461            }
1462            log::trace!("Matched GFM inline math at pos {}", pos);
1463            emit_gfm_inline_math(builder, content);
1464            pos += len;
1465            text_start = pos;
1466            continue;
1467        }
1468
1469        // Try math ($...$, $$...$$)
1470        if byte == b'$' && config.extensions.tex_math_dollars {
1471            // Try display math first ($$...$$)
1472            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
1473                // Emit accumulated text
1474                if pos > text_start {
1475                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1476                }
1477
1478                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
1479                log::trace!(
1480                    "Matched display math at pos {}: {} dollars",
1481                    pos,
1482                    dollar_count
1483                );
1484
1485                // Check for trailing attributes (Quarto cross-reference support)
1486                let after_math = &text[pos + len..];
1487                let attr_len = if config.extensions.quarto_crossrefs {
1488                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
1489                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
1490                        let trimmed_after = after_math.trim_start();
1491                        if let Some(open_brace_pos) = trimmed_after.find('{') {
1492                            let ws_before_brace = after_math.len() - trimmed_after.len();
1493                            let attr_text_len = trimmed_after[open_brace_pos..]
1494                                .find('}')
1495                                .map(|close| close + 1)
1496                                .unwrap_or(0);
1497                            ws_before_brace + open_brace_pos + attr_text_len
1498                        } else {
1499                            0
1500                        }
1501                    } else {
1502                        0
1503                    }
1504                } else {
1505                    0
1506                };
1507
1508                let total_len = len + attr_len;
1509                emit_display_math(builder, content, dollar_count);
1510
1511                // Emit attributes if present
1512                if attr_len > 0 {
1513                    use crate::parser::utils::attributes::{
1514                        emit_attributes, try_parse_trailing_attributes,
1515                    };
1516                    let attr_text = &text[pos + len..pos + total_len];
1517                    if let Some((attr_block, _text_before)) =
1518                        try_parse_trailing_attributes(attr_text)
1519                    {
1520                        let trimmed_after = attr_text.trim_start();
1521                        let ws_len = attr_text.len() - trimmed_after.len();
1522                        if ws_len > 0 {
1523                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1524                        }
1525                        emit_attributes(builder, &attr_block);
1526                    }
1527                }
1528
1529                pos += total_len;
1530                text_start = pos;
1531                continue;
1532            }
1533
1534            // Try inline math ($...$)
1535            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1536                // Emit accumulated text
1537                if pos > text_start {
1538                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1539                }
1540
1541                log::trace!("Matched inline math at pos {}", pos);
1542                emit_inline_math(builder, content);
1543                pos += len;
1544                text_start = pos;
1545                continue;
1546            }
1547
1548            // Neither display nor inline math matched - emit the $ as literal text
1549            // This ensures each $ gets its own TEXT token for CST compatibility
1550            if pos > text_start {
1551                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1552            }
1553            builder.token(SyntaxKind::TEXT.into(), "$");
1554            pos = advance_char_boundary(text, pos, end);
1555            text_start = pos;
1556            continue;
1557        }
1558
1559        // Try autolinks: <url> or <email>
1560        if byte == b'<'
1561            && config.extensions.autolinks
1562            && let Some((len, url)) = try_parse_autolink(&text[pos..])
1563        {
1564            if pos > text_start {
1565                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1566            }
1567            log::trace!("Matched autolink at pos {}", pos);
1568            emit_autolink(builder, &text[pos..pos + len], url);
1569            pos += len;
1570            text_start = pos;
1571            continue;
1572        }
1573
1574        if config.extensions.autolink_bare_uris
1575            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1576        {
1577            if pos > text_start {
1578                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1579            }
1580            log::trace!("Matched bare URI at pos {}", pos);
1581            emit_bare_uri_link(builder, url, config);
1582            pos += len;
1583            text_start = pos;
1584            continue;
1585        }
1586
1587        // Try native spans: <span>text</span> (after autolink since both start with <)
1588        if byte == b'<'
1589            && config.extensions.native_spans
1590            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1591        {
1592            if pos > text_start {
1593                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1594            }
1595            log::trace!("Matched native span at pos {}", pos);
1596            emit_native_span(builder, content, &attributes, config);
1597            pos += len;
1598            text_start = pos;
1599            continue;
1600        }
1601
1602        // Images and links - process in order: inline image, reference image, footnote ref, inline link, reference link
1603        if byte == b'!' && pos + 1 < text.len() && text.as_bytes()[pos + 1] == b'[' {
1604            // Try inline image: ![alt](url)
1605            if let Some((len, alt_text, dest, attributes)) = try_parse_inline_image(&text[pos..]) {
1606                if pos > text_start {
1607                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1608                }
1609                log::trace!("Matched inline image at pos {}", pos);
1610                emit_inline_image(
1611                    builder,
1612                    &text[pos..pos + len],
1613                    alt_text,
1614                    dest,
1615                    attributes,
1616                    config,
1617                );
1618                pos += len;
1619                text_start = pos;
1620                continue;
1621            }
1622
1623            // Try reference image: ![alt][ref] or ![alt]
1624            if config.extensions.reference_links {
1625                let allow_shortcut = config.extensions.shortcut_reference_links;
1626                if let Some((len, alt_text, reference, is_implicit)) =
1627                    try_parse_reference_image(&text[pos..], allow_shortcut)
1628                {
1629                    if pos > text_start {
1630                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1631                    }
1632                    log::trace!("Matched reference image at pos {}", pos);
1633                    emit_reference_image(builder, alt_text, &reference, is_implicit, config);
1634                    pos += len;
1635                    text_start = pos;
1636                    continue;
1637                }
1638            }
1639        }
1640
1641        // Process bracket-starting elements
1642        if byte == b'[' {
1643            // Try footnote reference: [^id]
1644            if config.extensions.footnotes
1645                && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1646            {
1647                if pos > text_start {
1648                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1649                }
1650                log::trace!("Matched footnote reference at pos {}", pos);
1651                emit_footnote_reference(builder, &id);
1652                pos += len;
1653                text_start = pos;
1654                continue;
1655            }
1656
1657            // Try inline link: [text](url)
1658            if config.extensions.inline_links
1659                && let Some((len, link_text, dest, attributes)) =
1660                    try_parse_inline_link(&text[pos..])
1661            {
1662                if pos > text_start {
1663                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1664                }
1665                log::trace!("Matched inline link at pos {}", pos);
1666                emit_inline_link(
1667                    builder,
1668                    &text[pos..pos + len],
1669                    link_text,
1670                    dest,
1671                    attributes,
1672                    config,
1673                );
1674                pos += len;
1675                text_start = pos;
1676                continue;
1677            }
1678
1679            // Try reference link: [text][ref] or [text]
1680            if config.extensions.reference_links {
1681                let allow_shortcut = config.extensions.shortcut_reference_links;
1682                if let Some((len, link_text, reference, is_implicit)) =
1683                    try_parse_reference_link(&text[pos..], allow_shortcut)
1684                {
1685                    if pos > text_start {
1686                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1687                    }
1688                    log::trace!("Matched reference link at pos {}", pos);
1689                    emit_reference_link(builder, link_text, &reference, is_implicit, config);
1690                    pos += len;
1691                    text_start = pos;
1692                    continue;
1693                }
1694            }
1695
1696            // Try bracketed citation: [@cite]
1697            if config.extensions.citations
1698                && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1699            {
1700                if pos > text_start {
1701                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1702                }
1703                log::trace!("Matched bracketed citation at pos {}", pos);
1704                emit_bracketed_citation(builder, content);
1705                pos += len;
1706                text_start = pos;
1707                continue;
1708            }
1709        }
1710
1711        // Try bracketed spans: [text]{.class}
1712        // Must come after links/citations
1713        if byte == b'['
1714            && config.extensions.bracketed_spans
1715            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1716        {
1717            if pos > text_start {
1718                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1719            }
1720            log::trace!("Matched bracketed span at pos {}", pos);
1721            emit_bracketed_span(builder, &text_content, &attrs, config);
1722            pos += len;
1723            text_start = pos;
1724            continue;
1725        }
1726
1727        // Try bare citation: @cite (must come after bracketed elements)
1728        if byte == b'@'
1729            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1730            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1731        {
1732            let is_crossref =
1733                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1734            if is_crossref || config.extensions.citations {
1735                if pos > text_start {
1736                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1737                }
1738                if is_crossref {
1739                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1740                    super::citations::emit_crossref(builder, key, has_suppress);
1741                } else {
1742                    log::trace!("Matched bare citation at pos {}: {}", pos, &key);
1743                    emit_bare_citation(builder, key, has_suppress);
1744                }
1745                pos += len;
1746                text_start = pos;
1747                continue;
1748            }
1749        }
1750
1751        // Try suppress-author citation: -@cite
1752        if byte == b'-'
1753            && pos + 1 < text.len()
1754            && text.as_bytes()[pos + 1] == b'@'
1755            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1756            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1757        {
1758            let is_crossref =
1759                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1760            if is_crossref || config.extensions.citations {
1761                if pos > text_start {
1762                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1763                }
1764                if is_crossref {
1765                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1766                    super::citations::emit_crossref(builder, key, has_suppress);
1767                } else {
1768                    log::trace!("Matched suppress-author citation at pos {}: {}", pos, &key);
1769                    emit_bare_citation(builder, key, has_suppress);
1770                }
1771                pos += len;
1772                text_start = pos;
1773                continue;
1774            }
1775        }
1776
1777        // Try to parse emphasis at this position
1778        if byte == b'*' || byte == b'_' {
1779            // Count the delimiter run to avoid re-parsing
1780            let bytes = text.as_bytes();
1781            let mut delim_count = 0;
1782            while pos + delim_count < bytes.len() && bytes[pos + delim_count] == byte {
1783                delim_count += 1;
1784            }
1785
1786            // Emit any accumulated text before the delimiter
1787            if pos > text_start {
1788                log::trace!(
1789                    "Emitting TEXT before delimiter: {:?}",
1790                    &text[text_start..pos]
1791                );
1792                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1793                text_start = pos; // Update text_start after emission
1794            }
1795
1796            // Try to parse emphasis
1797            // Use nested variant (bypass opener validity) when in nested context
1798            let emphasis_result = if nested_emphasis {
1799                try_parse_emphasis_nested(text, pos, end, config, builder)
1800            } else {
1801                try_parse_emphasis(text, pos, end, config, builder)
1802            };
1803
1804            if let Some((consumed, _)) = emphasis_result {
1805                // Successfully parsed emphasis
1806                log::trace!(
1807                    "Parsed emphasis, consumed {} bytes from pos {}",
1808                    consumed,
1809                    pos
1810                );
1811                pos += consumed;
1812                text_start = pos;
1813            } else {
1814                // Failed to parse, delimiter run will be treated as regular text
1815                // Skip the ENTIRE delimiter run to avoid re-parsing parts of it
1816                log::trace!(
1817                    "Failed to parse emphasis at pos {}, skipping {} delimiters as literal",
1818                    pos,
1819                    delim_count
1820                );
1821                pos += delim_count;
1822                // DON'T update text_start - let the delimiters accumulate
1823            }
1824            continue;
1825        }
1826
1827        // Check for newlines - may need to emit as hard line break
1828        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1829            let text_before = &text[text_start..pos];
1830
1831            // Check for trailing spaces hard line break (always enabled in Pandoc)
1832            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1833            if trailing_spaces >= 2 {
1834                // Emit text before the trailing spaces
1835                let text_content = &text_before[..text_before.len() - trailing_spaces];
1836                if !text_content.is_empty() {
1837                    builder.token(SyntaxKind::TEXT.into(), text_content);
1838                }
1839                let spaces = " ".repeat(trailing_spaces);
1840                builder.token(
1841                    SyntaxKind::HARD_LINE_BREAK.into(),
1842                    &format!("{}\r\n", spaces),
1843                );
1844                pos += 2;
1845                text_start = pos;
1846                continue;
1847            }
1848
1849            // hard_line_breaks: treat all single newlines as hard line breaks
1850            if config.extensions.hard_line_breaks {
1851                if !text_before.is_empty() {
1852                    builder.token(SyntaxKind::TEXT.into(), text_before);
1853                }
1854                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1855                pos += 2;
1856                text_start = pos;
1857                continue;
1858            }
1859
1860            // Regular newline
1861            if !text_before.is_empty() {
1862                builder.token(SyntaxKind::TEXT.into(), text_before);
1863            }
1864            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1865            pos += 2;
1866            text_start = pos;
1867            continue;
1868        }
1869
1870        if byte == b'\n' {
1871            let text_before = &text[text_start..pos];
1872
1873            // Check for trailing spaces hard line break (always enabled in Pandoc)
1874            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1875            if trailing_spaces >= 2 {
1876                // Emit text before the trailing spaces
1877                let text_content = &text_before[..text_before.len() - trailing_spaces];
1878                if !text_content.is_empty() {
1879                    builder.token(SyntaxKind::TEXT.into(), text_content);
1880                }
1881                let spaces = " ".repeat(trailing_spaces);
1882                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1883                pos += 1;
1884                text_start = pos;
1885                continue;
1886            }
1887
1888            // hard_line_breaks: treat all single newlines as hard line breaks
1889            if config.extensions.hard_line_breaks {
1890                if !text_before.is_empty() {
1891                    builder.token(SyntaxKind::TEXT.into(), text_before);
1892                }
1893                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1894                pos += 1;
1895                text_start = pos;
1896                continue;
1897            }
1898
1899            // Regular newline
1900            if !text_before.is_empty() {
1901                builder.token(SyntaxKind::TEXT.into(), text_before);
1902            }
1903            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1904            pos += 1;
1905            text_start = pos;
1906            continue;
1907        }
1908
1909        // Regular character, keep accumulating
1910        pos = advance_char_boundary(text, pos, end);
1911    }
1912
1913    // Emit any remaining text
1914    if pos > text_start && text_start < end {
1915        log::trace!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1916        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1917    }
1918
1919    log::trace!("parse_inline_range complete: start={}, end={}", start, end);
1920}
1921
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::{SyntaxKind, SyntaxNode};

    /// Consumes `builder` and returns the root of the tree it produced.
    fn finish_tree(builder: GreenNodeBuilder<'static>) -> SyntaxNode {
        SyntaxNode::new_root(builder.finish())
    }

    /// Parses `text` with `parse_inline_text_recursive` inside a `wrapper` node.
    fn parse_text_wrapped(wrapper: SyntaxKind, text: &str, config: &ParserOptions) -> SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(wrapper.into());
        parse_inline_text_recursive(&mut builder, text, config);
        builder.finish_node();
        finish_tree(builder)
    }

    /// Parses all of `text` with `parse_inline_range` inside a `wrapper` node.
    fn parse_range_wrapped(wrapper: SyntaxKind, text: &str, config: &ParserOptions) -> SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(wrapper.into());
        parse_inline_range(text, 0, text.len(), config, &mut builder);
        builder.finish_node();
        finish_tree(builder)
    }

    /// Returns true when `root` or any node below it has the given kind.
    fn has_node(root: &SyntaxNode, kind: SyntaxKind) -> bool {
        root.descendants().any(|n| n.kind() == kind)
    }

    /// Returns true when some `outer` node has an `inner` node strictly inside
    /// its subtree. Unlike a pre-order "seen outer, then inner" scan, this does
    /// not accept an `inner` node that is merely a later sibling of `outer`.
    fn has_nested(root: &SyntaxNode, outer: SyntaxKind, inner: SyntaxKind) -> bool {
        root.descendants()
            .filter(|n| n.kind() == outer)
            // `descendants()` yields the node itself first; skip it.
            .any(|n| n.descendants().skip(1).any(|d| d.kind() == inner))
    }

    #[test]
    fn test_recursive_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // No wrapper node is opened; the parse output itself has to form the tree root.
        parse_inline_text_recursive(&mut builder, text, &config);
        let node = finish_tree(builder);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS node
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
    }

    #[test]
    fn test_recursive_nested() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();

        // Wrap in a PARAGRAPH node (inline content needs a parent)
        let node = parse_text_wrapped(SyntaxKind::PARAGRAPH, text, &config);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPHASIS and STRONG
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test that we can parse a simple emphasis case
    #[test]
    fn test_parse_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Try to parse emphasis at position 0
        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);

        // Should successfully parse
        assert_eq!(result, Some((6, 1))); // Consumed all 6 bytes, delimiter count 1

        // Check the generated CST
        let node = finish_tree(builder);

        // The root IS the EMPHASIS node
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);

        // Verify losslessness: CST text should match input
        assert_eq!(node.text().to_string(), text);
    }

    /// Test parsing nested emphasis/strong
    #[test]
    fn test_parse_nested_emphasis_strong() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Parse the whole range (no wrapper node is opened here)
        parse_inline_range(text, 0, text.len(), &config, &mut builder);
        let node = finish_tree(builder);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS and STRONG nodes
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo* bar**
    /// Expected: Strong[Emph[foo], bar]
    /// Guards against the mis-parse `*` + Strong[foo* bar].
    #[test]
    fn test_triple_emphasis_star_then_double_star() {
        let text = "***foo* bar**";
        let config = ParserOptions::default();
        let node = parse_range_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have both STRONG and EMPH
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );

        // STRONG should be outer, EMPH should be inner
        assert!(
            has_nested(&node, SyntaxKind::STRONG, SyntaxKind::EMPHASIS),
            "EMPH should be inside STRONG, not before it. Current structure:\n{:#?}",
            node
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo** bar*
    /// Expected: Emph[Strong[foo], bar]
    #[test]
    fn test_triple_emphasis_double_star_then_star() {
        let text = "***foo** bar*";
        let config = ParserOptions::default();
        let node = parse_range_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPH and STRONG
        assert!(
            has_node(&node, SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            has_node(&node, SyntaxKind::STRONG),
            "Should have STRONG node"
        );

        // EMPH should be outer, STRONG should be inner
        assert!(
            has_nested(&node, SyntaxKind::EMPHASIS, SyntaxKind::STRONG),
            "STRONG should be inside EMPH. Current structure:\n{:#?}",
            node
        );
    }

    /// Test that display math with attributes parses correctly
    /// Regression test for equation_attributes_single_line golden test
    #[test]
    fn test_display_math_with_attributes() {
        let text = "$$ E = mc^2 $$ {#eq-einstein}";
        let mut config = ParserOptions::default();
        config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

        // Parse the whole text under a DOCUMENT root
        let node = parse_text_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have DISPLAY_MATH node
        assert!(
            has_node(&node, SyntaxKind::DISPLAY_MATH),
            "Should have DISPLAY_MATH node"
        );

        // Should have ATTRIBUTE node
        assert!(
            has_node(&node, SyntaxKind::ATTRIBUTE),
            "Should have ATTRIBUTE node for {{#eq-einstein}}"
        );

        // Attributes should not be TEXT. TEXT is emitted as a token, so the
        // element following the math node must be inspected with
        // `next_sibling_or_token`; `next_sibling` only ever returns nodes.
        let math_followed_by_text = node
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
            .filter_map(|n| n.next_sibling_or_token())
            .filter_map(|element| element.into_token())
            .any(|token| {
                token.kind() == SyntaxKind::TEXT && token.text().contains("{#eq-einstein}")
            });
        assert!(
            !math_followed_by_text,
            "Attributes should not be parsed as TEXT"
        );
    }

    #[test]
    fn test_parse_inline_text_gfm_inline_link_destination_not_autolinked() {
        use crate::options::{Extensions, Flavor};

        let config = ParserOptions {
            flavor: Flavor::Gfm,
            extensions: Extensions::for_flavor(Flavor::Gfm),
            ..ParserOptions::default()
        };

        let root = parse_text_wrapped(
            SyntaxKind::PARAGRAPH,
            "Second Link [link_text](https://link.com)",
            &config,
        );

        let links: Vec<_> = root
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::LINK)
            .collect();
        assert_eq!(
            links.len(),
            1,
            "Expected exactly one LINK node for inline link, not nested bare URI autolink"
        );

        let mut link_text = None::<String>;
        let mut link_dest = None::<String>;

        for child in links[0].children() {
            match child.kind() {
                SyntaxKind::LINK_TEXT => link_text = Some(child.text().to_string()),
                SyntaxKind::LINK_DEST => link_dest = Some(child.text().to_string()),
                _ => {}
            }
        }

        assert_eq!(link_text.as_deref(), Some("link_text"));
        assert_eq!(link_dest.as_deref(), Some("https://link.com"));
    }

    #[test]
    fn test_autolink_bare_uri_utf8_boundary_safe() {
        // A lone multi-byte character with bare-URI autolinking enabled.
        let text = "§";
        let mut config = ParserOptions::default();
        config.extensions.autolink_bare_uris = true;

        let node = parse_text_wrapped(SyntaxKind::DOCUMENT, text, &config);
        assert_eq!(node.text().to_string(), text);
    }

    #[test]
    fn test_parse_emphasis_unicode_content_no_panic() {
        let text = "*§*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
        assert_eq!(result, Some((text.len(), 1)));

        let node = finish_tree(builder);
        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);
        assert_eq!(node.text().to_string(), text);
    }
}
2261
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Input: **bold with *italic***
    // Expected tree: Strong["bold with ", Emph["italic"]]; the trailing `***`
    // splits into `*` (closing Emph) followed by `**` (closing Strong).
    let input = "**bold with *italic***";
    let options = ParserOptions::default();
    let mut tree_builder = GreenNodeBuilder::new();

    // No wrapper node is opened; parse_inline_range writes straight into the builder.
    parse_inline_range(input, 0, input.len(), &options, &mut tree_builder);
    let root = SyntaxNode::new_root(tree_builder.finish());

    // Round-trip: the tree text must reproduce the input exactly.
    assert_eq!(root.text().to_string(), input, "Should be lossless");

    // With no wrapper, the STRONG node itself is the tree root.
    let root_kind = root.kind();
    assert_eq!(
        root_kind,
        SyntaxKind::STRONG,
        "Root should be STRONG, got: {:?}",
        root_kind
    );

    // EMPHASIS must be a direct child node of the STRONG root.
    let emphasis_child = root
        .children()
        .find(|child| child.kind() == SyntaxKind::EMPHASIS);
    assert!(
        emphasis_child.is_some(),
        "STRONG should contain EMPHASIS node"
    );
}
2297
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc treats `*foo *` as emphasis: for asterisks the closing delimiter
    // is not required to be right-flanking.
    let text = "*foo *";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Attempt the parse from offset 0. Success means all 6 bytes were
    // consumed with a delimiter count of 1.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((6, 1)),
        "Should parse as emphasis, result: {:?}",
        outcome
    );

    // The emphasis node itself is the root of the generated CST.
    let root = SyntaxNode::new_root(builder.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Round-trip check: the tree text equals the input.
    assert_eq!(root.text().to_string(), text);
}
2332
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc reads `***foo** bar **baz***` as
    // Emph[Strong[foo], " bar ", Strong[baz]].
    let text = "***foo** bar **baz***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    parse_inline_range(text, 0, text.len(), &options, &mut builder);
    let root = SyntaxNode::new_root(builder.finish());

    // Exactly one EMPHASIS node is expected in the whole tree.
    let emphasis_nodes: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    let emphasis_count = emphasis_nodes.len();
    assert_eq!(
        emphasis_count,
        1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphasis_count
    );

    // That EMPHASIS node must have exactly two STRONG nodes as direct children.
    let strong_count = emphasis_nodes[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_count,
        2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_count
    );

    // Round-trip check: the tree text equals the input.
    assert_eq!(root.text().to_string(), text);
}
2379
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Pandoc reads `***foo* bar *baz***` as
    // Strong[Emph[foo], " bar ", Emph[baz]].
    let text = "***foo* bar *baz***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    parse_inline_range(text, 0, text.len(), &options, &mut builder);
    let root = SyntaxNode::new_root(builder.finish());

    // Gather every STRONG node in the tree; exactly one is expected.
    let mut strong_nodes = Vec::new();
    for candidate in root.descendants() {
        if candidate.kind() == SyntaxKind::STRONG {
            strong_nodes.push(candidate);
        }
    }
    let strong_count = strong_nodes.len();
    assert_eq!(
        strong_count,
        1,
        "Should have exactly one STRONG node, found: {}",
        strong_count
    );

    // That STRONG node must have exactly two EMPHASIS nodes as direct children.
    let emphasis_count = strong_nodes[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_count,
        2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_count
    );

    // Round-trip check: the tree text equals the input.
    assert_eq!(root.text().to_string(), text);
}
2426
2427// Multiline emphasis tests
#[test]
fn test_parse_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Per the Pandoc spec, emphasis may contain newlines (soft breaks), so the
    // line break here is part of the emphasized content.
    let text = "*text on\nline two*";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte of the input should be consumed, with a delimiter count of 1.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((text.len(), 1)),
        "Emphasis should parse multiline content"
    );

    // The generated CST is rooted at the EMPHASIS node.
    let root = SyntaxNode::new_root(builder.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Lossless round-trip, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
2462
#[test]
fn test_parse_strong_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Per the Pandoc spec, strong emphasis may contain newlines as well.
    let text = "**strong on\nline two**";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte of the input should be consumed, with a delimiter count of 2.
    let expected_end = text.len();
    let outcome = try_parse_emphasis(text, 0, expected_end, &options, &mut builder);
    assert_eq!(
        outcome,
        Some((expected_end, 2)),
        "Strong emphasis should parse multiline content"
    );

    // The generated CST is rooted at the STRONG node.
    let root = SyntaxNode::new_root(builder.finish());
    assert_eq!(root.kind(), SyntaxKind::STRONG);

    // Lossless round-trip, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in strong content"
    );
}
2497
#[test]
fn test_parse_triple_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    // Triple-delimiter emphasis whose content spans a line break.
    let text = "***both on\nline two***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte of the input should be consumed, with a delimiter count of 3.
    let outcome = try_parse_emphasis(text, 0, text.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((text.len(), 3)),
        "Triple emphasis should parse multiline content"
    );

    let root = SyntaxNode::new_root(builder.finish());

    // Triple = strong + emph; only the presence of a STRONG node is checked.
    let mut strong_found = false;
    for candidate in root.descendants() {
        if candidate.kind() == SyntaxKind::STRONG {
            strong_found = true;
            break;
        }
    }
    assert!(strong_found, "Should have STRONG node");

    // Lossless round-trip, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, text);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}
panache_parser/parser/inlines/core.rs

panache_parser/parser/inlines/
core.rs