panache_parser/parser/inlines/
core.rs

1//! Recursive emphasis parsing using Pandoc's algorithm.
2//!
3//! This module implements emphasis/strong emphasis parsing using a recursive
4//! descent approach based on Pandoc's Haskell implementation in
5//! `Readers/Markdown.hs:L1662-L1722`.
6//!
7//! **Key algorithm**: Left-to-right, greedy, first-match wins
8//! 1. Parse text left-to-right
9//! 2. When we see delimiters, try to parse emphasis (look for matching closer)
10//! 3. If successful, emit emphasis node and continue from after closer
11//! 4. If failed (no closer found), emit delimiter as literal and continue
12//! 5. Nested emphasis is handled naturally by recursive parsing of content
13//!
14//! **Example**: `*foo **bar* baz**`
15//! - See `*`, try to parse EMPH
16//! - Parse content: see `**`, try to parse STRONG
17//! - STRONG finds closer `**` at end → succeeds, emits STRONG[bar* baz]
18//! - Outer `*` can't find closer (all delimiters consumed) → fails, emits `*foo` as literal
19//! - Result: `*foo` + STRONG[bar* baz]
20//!
21//! This matches Pandoc's behavior exactly.
22
23use crate::options::{Dialect, ParserOptions};
24use crate::syntax::SyntaxKind;
25use rowan::GreenNodeBuilder;
26
27// Import inline element parsers from sibling modules
28use super::bookdown::{
29    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
30};
31use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
32use super::citations::{
33    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
34    try_parse_bracketed_citation,
35};
36use super::code_spans::{emit_code_span, try_parse_code_span};
37use super::emoji::{emit_emoji, try_parse_emoji};
38use super::escapes::{EscapeType, emit_escape, try_parse_escape};
39use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
40use super::inline_footnotes::{
41    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
42    try_parse_inline_footnote,
43};
44use super::inline_html::{emit_inline_html, try_parse_inline_html};
45use super::latex::{parse_latex_command, try_parse_latex_command};
46use super::links::{
47    emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link, emit_reference_image,
48    emit_reference_link, try_parse_autolink, try_parse_bare_uri, try_parse_inline_image,
49    try_parse_inline_link, try_parse_reference_image, try_parse_reference_link,
50};
51use super::mark::{emit_mark, try_parse_mark};
52use super::math::{
53    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
54    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
55    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
56    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
57    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
58    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
59};
60use super::native_spans::{emit_native_span, try_parse_native_span};
61use super::raw_inline::is_raw_inline;
62use super::shortcodes::{emit_shortcode, try_parse_shortcode};
63use super::strikeout::{emit_strikeout, try_parse_strikeout};
64use super::subscript::{emit_subscript, try_parse_subscript};
65use super::superscript::{emit_superscript, try_parse_superscript};
66
67/// Parse inline text using the recursive emphasis algorithm.
68///
69/// This is the main entry point for parsing inline content with Pandoc-style
70/// recursive emphasis handling. It uses a greedy left-to-right, first-match-wins
71/// approach that matches Pandoc's behavior exactly.
72///
73/// **Algorithm**:
74/// 1. Parse text left-to-right trying each inline element type in precedence order
75/// 2. When we see `*` or `_`, try to parse emphasis recursively
76/// 3. Nested emphasis naturally consumes delimiters before outer matches
77/// 4. All inline elements (code, links, math, etc.) are parsed on-the-fly
78///
79/// # Arguments
80/// * `text` - The inline text to parse
81/// * `config` - Configuration for extensions and formatting
82/// * `builder` - The CST builder to emit nodes to
83pub fn parse_inline_text_recursive(
84    builder: &mut GreenNodeBuilder,
85    text: &str,
86    config: &ParserOptions,
87) {
88    log::trace!(
89        "Recursive inline parsing: {:?} ({} bytes)",
90        &text[..text.len().min(40)],
91        text.len()
92    );
93
94    parse_inline_range(text, 0, text.len(), config, builder);
95
96    log::trace!("Recursive inline parsing complete");
97}
98
99/// Parse inline elements from text content nested inside a link/image/span.
100///
101/// Used for recursive inline parsing of link text, image alt, span content, etc.
102/// Suppresses constructs that would create nested links (CommonMark §6.3 forbids
103/// links inside links), notably extended bare-URI autolinks under GFM.
104///
105/// The `_allow_reference_links` parameter is accepted for compatibility and is
106/// currently unused.
107pub fn parse_inline_text(
108    builder: &mut GreenNodeBuilder,
109    text: &str,
110    config: &ParserOptions,
111    _allow_reference_links: bool,
112) {
113    log::trace!(
114        "Parsing inline text (nested in link): {:?} ({} bytes)",
115        &text[..text.len().min(40)],
116        text.len()
117    );
118
119    parse_inline_range_impl(text, 0, text.len(), config, builder, false, true);
120}
121
122/// Try to parse emphasis starting at the given position.
123///
124/// This is the entry point for recursive emphasis parsing, equivalent to
125/// Pandoc's `enclosure` function.
126///
127/// Returns Some((bytes_consumed, delim_count)) if emphasis was successfully parsed,
128/// or None if the delimiter should be treated as literal text.
129/// When returning None, the delim_count tells the caller how many delimiter
130/// characters to skip (to avoid re-parsing parts of a failed delimiter run).
131///
132/// # Arguments
133/// * `text` - The full text being parsed
134/// * `pos` - Current position in text (where the delimiter starts)
135/// * `end` - End boundary (don't search for closers beyond this)
136/// * `config` - Configuration
137/// * `builder` - CST builder
138///
139/// **Algorithm**:
140/// 1. Count opening delimiters
141/// 2. Check if followed by whitespace (if so, return None)
142/// 3. Dispatch to parse_one/two/three based on count
143/// 4. Those functions parse content and look for matching closer (within bounds)
144/// 5. If closer found, emit node and return bytes consumed
145/// 6. If not found, return None with delimiter count (caller skips entire run)
146pub fn try_parse_emphasis(
147    text: &str,
148    pos: usize,
149    end: usize,
150    config: &ParserOptions,
151    builder: &mut GreenNodeBuilder,
152) -> Option<(usize, usize)> {
153    let bytes = text.as_bytes();
154
155    if pos >= bytes.len() {
156        return None;
157    }
158
159    let delim_char = bytes[pos] as char;
160    if delim_char != '*' && delim_char != '_' {
161        return None;
162    }
163
164    // Count consecutive delimiters
165    let mut count = 0;
166    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
167        count += 1;
168    }
169
170    let after_pos = pos + count;
171
172    log::trace!(
173        "try_parse_emphasis: '{}' x {} at pos {}",
174        delim_char,
175        count,
176        pos
177    );
178
179    // Check if followed by whitespace (Pandoc rule: treat as literal)
180    if after_pos < text.len()
181        && let Some(next_char) = text[after_pos..].chars().next()
182        && next_char.is_whitespace()
183    {
184        log::trace!("Delimiter followed by whitespace, treating as literal");
185        return None;
186    }
187
188    // For underscores: check intraword_underscores extension (Pandoc lines 1668-1672)
189    // Can't open if preceded by alphanumeric (prevents foo_bar from parsing)
190    if delim_char == '_'
191        && pos > 0
192        && let Some(prev_char) = text[..pos].chars().last()
193        && prev_char.is_alphanumeric()
194    {
195        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
196        return None;
197    }
198
199    // Dispatch based on delimiter count
200    let result = match count {
201        1 => try_parse_one(text, pos, delim_char, end, config, builder),
202        2 => try_parse_two(text, pos, delim_char, end, config, builder),
203        3 => try_parse_three(text, pos, delim_char, end, config, builder),
204        _ => {
205            // 4+ delimiters: treat as literal (Pandoc behavior)
206            log::trace!("{} delimiters (4+), treating as literal", count);
207            None
208        }
209    };
210
211    // If parsing succeeded, return (bytes_consumed, delim_count)
212    // If failed, return None but the caller will know to skip `count` delimiters
213    result.map(|consumed| (consumed, count))
214}
215
216/// Try to parse emphasis in a nested context (bypassing opener validity checks).
217///
218/// This mirrors Pandoc's behavior where `one` can call `two c mempty` directly,
219/// bypassing the `enclosure` opener validity checks. This is needed because
220/// patterns like `***foo **bar** baz***` require `**` followed by space to be
221/// parsed as a nested strong opener.
222///
223/// Returns Some((bytes_consumed, delim_count)) if successful, None otherwise.
224fn try_parse_emphasis_nested(
225    text: &str,
226    pos: usize,
227    end: usize,
228    config: &ParserOptions,
229    builder: &mut GreenNodeBuilder,
230) -> Option<(usize, usize)> {
231    let bytes = text.as_bytes();
232
233    if pos >= bytes.len() {
234        return None;
235    }
236
237    let delim_char = bytes[pos] as char;
238    if delim_char != '*' && delim_char != '_' {
239        return None;
240    }
241
242    // Count consecutive delimiters
243    let mut count = 0;
244    while pos + count < bytes.len() && bytes[pos + count] == bytes[pos] {
245        count += 1;
246    }
247
248    log::trace!(
249        "try_parse_emphasis_nested: '{}' x {} at pos {}",
250        delim_char,
251        count,
252        pos
253    );
254
255    // For underscores: still check intraword_underscores (prevents foo_bar parsing)
256    // This check applies even in nested contexts
257    if delim_char == '_'
258        && pos > 0
259        && let Some(prev_char) = text[..pos].chars().last()
260        && prev_char.is_alphanumeric()
261    {
262        log::trace!("Underscore preceded by alphanumeric, can't open (intraword)");
263        return None;
264    }
265
266    // NOTE: We intentionally skip the "delimiter followed by whitespace" check here.
267    // In nested contexts (inside `one` calling `two`), Pandoc allows openers
268    // followed by whitespace because the opener has already been matched.
269
270    // Dispatch based on delimiter count
271    let result = match count {
272        1 => try_parse_one(text, pos, delim_char, end, config, builder),
273        2 => try_parse_two(text, pos, delim_char, end, config, builder),
274        3 => try_parse_three(text, pos, delim_char, end, config, builder),
275        _ => {
276            // 4+ delimiters: treat as literal (Pandoc behavior)
277            log::trace!("{} delimiters (4+), treating as literal", count);
278            None
279        }
280    };
281
282    result.map(|consumed| (consumed, count))
283}
284
285/// Try to parse emphasis with *** opening delimiter.
286///
287/// Tries to match closers in order: *** → ** → *
288/// Returns Some(bytes_consumed) if successful, None otherwise.
289fn try_parse_three(
290    text: &str,
291    pos: usize,
292    delim_char: char,
293    end: usize,
294    config: &ParserOptions,
295    builder: &mut GreenNodeBuilder,
296) -> Option<usize> {
297    let content_start = pos + 3;
298    let one = delim_char.to_string();
299    let two = one.repeat(2);
300
301    log::trace!("try_parse_three: '{}' x 3 at pos {}", delim_char, pos);
302
303    // Pandoc algorithm (line 1695): Parse content UNTIL we see a VALID ender
304    // We loop through potential enders, checking if each is valid.
305    // Invalid enders (like `**` preceded by whitespace) are skipped.
306    let mut search_pos = content_start;
307
308    loop {
309        // Find next potential ender
310        let closer_start = match find_first_potential_ender(text, search_pos, delim_char, end) {
311            Some(p) => p,
312            None => {
313                log::trace!("No potential ender found for ***");
314                return None;
315            }
316        };
317
318        log::trace!("Potential ender at pos {}", closer_start);
319
320        // Count how many delimiters we have at closer_start
321        let bytes = text.as_bytes();
322        let mut closer_count = 0;
323        let mut check_pos = closer_start;
324        while check_pos < bytes.len() && bytes[check_pos] == delim_char as u8 {
325            closer_count += 1;
326            check_pos += 1;
327        }
328
329        log::trace!(
330            "Found {} x {} at pos {}",
331            delim_char,
332            closer_count,
333            closer_start
334        );
335
336        // Try to match closers in order: ***, **, * (Pandoc lines 1696-1698)
337
338        // Try *** (line 1696)
339        if closer_count >= 3 && is_valid_ender(text, closer_start, delim_char, 3) {
340            log::trace!("Matched *** closer, emitting Strong[Emph[content]]");
341
342            builder.start_node(SyntaxKind::STRONG.into());
343            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
344
345            builder.start_node(SyntaxKind::EMPHASIS.into());
346            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
347            parse_inline_range_nested(text, content_start, closer_start, config, builder);
348            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
349            builder.finish_node(); // EMPHASIS
350
351            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
352            builder.finish_node(); // STRONG
353
354            return Some(closer_start + 3 - pos);
355        }
356
357        // Try ** (line 1697)
358        if closer_count >= 2 && is_valid_ender(text, closer_start, delim_char, 2) {
359            log::trace!("Matched ** closer, wrapping as Strong and continuing with one");
360
361            let continue_pos = closer_start + 2;
362
363            if let Some(final_closer_pos) =
364                parse_until_closer_with_nested_two(text, continue_pos, delim_char, 1, end, config)
365            {
366                log::trace!(
367                    "Found * closer at pos {}, emitting Emph[Strong[...], ...]",
368                    final_closer_pos
369                );
370
371                builder.start_node(SyntaxKind::EMPHASIS.into());
372                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
373
374                builder.start_node(SyntaxKind::STRONG.into());
375                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
376                parse_inline_range_nested(text, content_start, closer_start, config, builder);
377                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
378                builder.finish_node(); // STRONG
379
380                // Parse additional content between ** and * (up to but not including the closer)
381                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
382
383                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
384                builder.finish_node(); // EMPHASIS
385
386                return Some(final_closer_pos + 1 - pos);
387            }
388
389            // Fallback: emit * + STRONG
390            log::trace!("No * closer found after **, emitting * + STRONG");
391            builder.token(SyntaxKind::TEXT.into(), &one);
392
393            builder.start_node(SyntaxKind::STRONG.into());
394            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
395            parse_inline_range_nested(text, content_start, closer_start, config, builder);
396            builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
397            builder.finish_node(); // STRONG
398
399            return Some(closer_start + 2 - pos);
400        }
401
402        // Try * (line 1698)
403        if closer_count >= 1 && is_valid_ender(text, closer_start, delim_char, 1) {
404            log::trace!("Matched * closer, wrapping as Emph and continuing with two");
405
406            let continue_pos = closer_start + 1;
407
408            if let Some(final_closer_pos) =
409                parse_until_closer_with_nested_one(text, continue_pos, delim_char, 2, end, config)
410            {
411                log::trace!(
412                    "Found ** closer at pos {}, emitting Strong[Emph[...], ...]",
413                    final_closer_pos
414                );
415
416                builder.start_node(SyntaxKind::STRONG.into());
417                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
418
419                builder.start_node(SyntaxKind::EMPHASIS.into());
420                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
421                parse_inline_range_nested(text, content_start, closer_start, config, builder);
422                builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
423                builder.finish_node(); // EMPHASIS
424
425                parse_inline_range_nested(text, continue_pos, final_closer_pos, config, builder);
426
427                builder.token(SyntaxKind::STRONG_MARKER.into(), &two);
428                builder.finish_node(); // STRONG
429
430                return Some(final_closer_pos + 2 - pos);
431            }
432
433            // Fallback: emit ** + EMPH
434            log::trace!("No ** closer found after *, emitting ** + EMPH");
435            builder.token(SyntaxKind::TEXT.into(), &two);
436
437            builder.start_node(SyntaxKind::EMPHASIS.into());
438            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
439            parse_inline_range_nested(text, content_start, closer_start, config, builder);
440            builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &one);
441            builder.finish_node(); // EMPHASIS
442
443            return Some(closer_start + 1 - pos);
444        }
445
446        // No valid ender at this position - continue searching after this delimiter run
447        log::trace!(
448            "No valid ender at pos {}, continuing search from {}",
449            closer_start,
450            closer_start + closer_count
451        );
452        search_pos = closer_start + closer_count;
453    }
454}
455
/// Locate the first unescaped occurrence of `delim_char` in `text`, scanning
/// byte positions from `start` up to (not including) `min(end, text.len())`.
///
/// This implements Pandoc's `many (notFollowedBy (ender c 1) >> inline)`:
/// content is consumed until a delimiter that could be an ender is reached.
///
/// A delimiter counts as escaped when it is directly preceded by an odd
/// number of backslashes. Returns the byte offset of the match, or `None`
/// when the range holds no unescaped delimiter.
fn find_first_potential_ender(
    text: &str,
    start: usize,
    delim_char: char,
    end: usize,
) -> Option<usize> {
    let raw = text.as_bytes();
    let target = delim_char as u8;
    let limit = end.min(text.len());

    (start..limit).find(|&idx| {
        if raw[idx] != target {
            return false;
        }
        // Count the unbroken run of backslashes immediately before `idx`;
        // an odd count means the delimiter itself is escaped.
        let backslashes = raw[..idx]
            .iter()
            .rev()
            .take_while(|&&b| b == b'\\')
            .count();
        backslashes % 2 == 0
    })
}
493
/// Decide whether the delimiter run at `pos` is a valid ender of exactly
/// `delim_count` delimiters. This implements Pandoc's `ender c n` function.
///
/// The run is valid when:
/// * `text[pos..pos + delim_count]` lies inside `text` and consists solely of
///   `delim_char`;
/// * the bytes directly before and directly after the run are not
///   `delim_char` (so the run is exactly `delim_count` long);
/// * for `_` only: the run is not directly preceded by whitespace and not
///   directly followed by an alphanumeric character. Asterisks have no such
///   flanking requirement.
fn is_valid_ender(text: &str, pos: usize, delim_char: char, delim_count: usize) -> bool {
    let raw = text.as_bytes();
    let target = delim_char as u8;
    let run_end = pos + delim_count;

    // The whole run has to fit inside the text and be made of the delimiter.
    let Some(run) = raw.get(pos..run_end) else {
        return false;
    };
    if run.iter().any(|&b| b != target) {
        return false;
    }

    // Exactly `delim_count` delimiters: no neighbour on either side.
    if pos > 0 && raw[pos - 1] == target {
        return false;
    }
    if raw.get(run_end) == Some(&target) {
        return false;
    }

    // Only underscores carry the extra flanking rules.
    if delim_char != '_' {
        return true;
    }

    // Right-flanking: must not come directly after whitespace.
    if text[..pos]
        .chars()
        .next_back()
        .is_some_and(char::is_whitespace)
    {
        return false;
    }

    // Must not run straight into an alphanumeric character.
    if text[run_end..]
        .chars()
        .next()
        .is_some_and(char::is_alphanumeric)
    {
        return false;
    }

    true
}
542
543/// Try to parse emphasis with ** opening delimiter.
544///
545/// Tries to match ** closer only. No fallback.
546/// Returns Some(bytes_consumed) if successful, None otherwise.
547fn try_parse_two(
548    text: &str,
549    pos: usize,
550    delim_char: char,
551    end: usize,
552    config: &ParserOptions,
553    builder: &mut GreenNodeBuilder,
554) -> Option<usize> {
555    let content_start = pos + 2;
556
557    log::trace!("try_parse_two: '{}' x 2 at pos {}", delim_char, pos);
558
559    // Try to find ** closer, checking for nested * emphasis along the way
560    if let Some(closer_pos) =
561        parse_until_closer_with_nested_one(text, content_start, delim_char, 2, end, config)
562    {
563        log::trace!("Found ** closer at pos {}", closer_pos);
564
565        // Emit STRONG(content)
566        builder.start_node(SyntaxKind::STRONG.into());
567        builder.token(SyntaxKind::STRONG_MARKER.into(), &text[pos..pos + 2]);
568        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
569        builder.token(
570            SyntaxKind::STRONG_MARKER.into(),
571            &text[closer_pos..closer_pos + 2],
572        );
573        builder.finish_node(); // STRONG
574
575        return Some(closer_pos + 2 - pos);
576    }
577
578    // No closer found
579    log::trace!("No closer found for **");
580    None
581}
582
583/// Try to parse emphasis with * opening delimiter.
584///
585/// Tries to match * closer.
586/// Returns Some(bytes_consumed) if successful, None otherwise.
587///
588/// **Pandoc algorithm**: While parsing content, if we encounter **,
589/// try to parse it as `two` (strong) recursively. If `two` succeeds,
590/// it consumes the ** delimiters, potentially preventing us from finding
591/// a closer for the outer *. This creates priority where ** can "steal"
592/// matches from *.
593fn try_parse_one(
594    text: &str,
595    pos: usize,
596    delim_char: char,
597    end: usize,
598    config: &ParserOptions,
599    builder: &mut GreenNodeBuilder,
600) -> Option<usize> {
601    let content_start = pos + 1;
602
603    log::trace!("try_parse_one: '{}' x 1 at pos {}", delim_char, pos);
604
605    // Try to find * closer using Pandoc's algorithm with nested two attempts
606    if let Some(closer_pos) =
607        parse_until_closer_with_nested_two(text, content_start, delim_char, 1, end, config)
608    {
609        log::trace!("Found * closer at pos {}", closer_pos);
610
611        // Emit EMPH(content)
612        builder.start_node(SyntaxKind::EMPHASIS.into());
613        builder.token(SyntaxKind::EMPHASIS_MARKER.into(), &text[pos..pos + 1]);
614        parse_inline_range_nested(text, content_start, closer_pos, config, builder);
615        builder.token(
616            SyntaxKind::EMPHASIS_MARKER.into(),
617            &text[closer_pos..closer_pos + 1],
618        );
619        builder.finish_node(); // EMPHASIS
620
621        return Some(closer_pos + 1 - pos);
622    }
623
624    // No closer found
625    log::trace!("No closer found for *");
626    None
627}
628
629/// Parse inline content and look for a matching closer, with nested two attempts.
630///
631/// This implements Pandoc's algorithm from Markdown.hs lines 1712-1717:
632/// When parsing `*...*`, if we encounter `**` (and it's not followed by
633/// another `*` that would close the outer emphasis), try to parse it as
634/// `two c mempty` (strong). If `two` succeeds, those `**` delimiters are
635/// consumed, and we continue searching for the `*` closer.
636///
637/// This creates a priority system where `**` can "steal" matches from `*`.
638///
639/// Example: `*foo **bar* baz**`
640/// - When parsing the outer `*...*`, we encounter `**` at position 5
641/// - We try `two` which succeeds with `**bar* baz**`
642/// - Now there's no `*` closer for the outer `*`, so it fails
643/// - Result: literal `*foo ` + STRONG("bar* baz")
644///
645/// # Arguments
646/// * `end` - Don't search beyond this position (respects nesting boundaries)
647fn parse_until_closer_with_nested_two(
648    text: &str,
649    start: usize,
650    delim_char: char,
651    delim_count: usize,
652    end: usize,
653    config: &ParserOptions,
654) -> Option<usize> {
655    let bytes = text.as_bytes();
656    let mut pos = start;
657
658    while pos < end.min(text.len()) {
659        if bytes[pos] == b'`'
660            && let Some(m) = try_parse_inline_executable(
661                &text[pos..],
662                config.extensions.rmarkdown_inline_code,
663                config.extensions.quarto_inline_code,
664            )
665        {
666            log::trace!(
667                "Skipping inline executable span of {} bytes at pos {}",
668                m.total_len,
669                pos
670            );
671            pos += m.total_len;
672            continue;
673        }
674
675        // Skip over code spans - their content is protected from delimiter matching
676        if bytes[pos] == b'`'
677            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
678        {
679            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
680            pos += len;
681            continue;
682        }
683
684        // Skip over inline math - their content is protected from delimiter matching
685        if bytes[pos] == b'$'
686            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
687        {
688            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
689            pos += len;
690            continue;
691        }
692
693        // Skip over links - their content is protected from delimiter matching
694        if bytes[pos] == b'['
695            && let Some((len, _, _, _)) = try_parse_inline_link(
696                &text[pos..],
697                config.dialect == crate::options::Dialect::CommonMark,
698            )
699        {
700            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
701            pos += len;
702            continue;
703        }
704
705        // Pandoc algorithm: If we're looking for a single delimiter (*) and
706        // encounter a double delimiter (**), try to parse it as `two` (strong).
707        // This happens BEFORE checking if pos is a closer for our current emphasis.
708        if delim_count == 1
709            && pos + 2 <= text.len()
710            && bytes[pos] == delim_char as u8
711            && bytes[pos + 1] == delim_char as u8
712        {
713            // First check if the first delimiter is escaped
714            let first_is_escaped = {
715                let mut backslash_count = 0;
716                let mut check_pos = pos;
717                while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
718                    backslash_count += 1;
719                    check_pos -= 1;
720                }
721                backslash_count % 2 == 1
722            };
723
724            if first_is_escaped {
725                // First * is escaped, skip it and continue
726                // The second * might be a closer or start of emphasis
727                log::trace!(
728                    "First * at pos {} is escaped, skipping to check second *",
729                    pos
730                );
731                pos = advance_char_boundary(text, pos, end);
732                continue;
733            }
734
735            // Check that there's NOT a third delimiter (which would make this
736            // part of a longer run that we shouldn't treat as `two`)
737            let no_third_delim = pos + 2 >= bytes.len() || bytes[pos + 2] != delim_char as u8;
738
739            if no_third_delim {
740                log::trace!(
741                    "try_parse_one: found ** at pos {}, attempting nested two",
742                    pos
743                );
744
745                // Try to parse as `two` (strong emphasis)
746                // We create a temporary builder to test if `two` succeeds
747                let mut temp_builder = GreenNodeBuilder::new();
748                if let Some(two_consumed) =
749                    try_parse_two(text, pos, delim_char, end, config, &mut temp_builder)
750                {
751                    // `two` succeeded! Those ** delimiters are consumed.
752                    // We skip past the `two` and continue searching for our `*` closer.
753                    log::trace!(
754                        "Nested two succeeded, consumed {} bytes, continuing search",
755                        two_consumed
756                    );
757                    pos += two_consumed;
758                    continue;
759                }
760                // `two` failed - this means the entire `one` parse should fail!
761                // In Pandoc, the `try (string [c,c] >> notFollowedBy (ender c 1) >> two c mempty)`
762                // alternative fails, and the first alternative `notFollowedBy (ender c 1) >> inline`
763                // also fails because we ARE followed by an ender (the first * of **).
764                // So the entire content parsing fails, and `one` returns failure.
765                log::trace!("Nested two failed at pos {}, entire one() should fail", pos);
766                return None;
767            }
768        }
769
770        // Check if we have a potential closer here
771        if pos + delim_count <= text.len() {
772            let mut matches = true;
773            for i in 0..delim_count {
774                if bytes[pos + i] != delim_char as u8 {
775                    matches = false;
776                    break;
777                }
778            }
779
780            if matches {
781                // IMPORTANT: Check that there are EXACTLY delim_count delimiters,
782                // not more. E.g., when looking for `*`, we shouldn't match
783                // `*` that's part of a longer run.
784
785                // Check: not escaped (preceded by odd number of backslashes)
786                let is_escaped = {
787                    let mut backslash_count = 0;
788                    let mut check_pos = pos;
789                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
790                        backslash_count += 1;
791                        check_pos -= 1;
792                    }
793                    backslash_count % 2 == 1 // Odd number = escaped
794                };
795
796                // Allow matching at the start OR end of a delimiter run.
797                // This lets `**` close at the end of `***` (after a nested `*` closes),
798                // while still avoiding matches in the middle of longer runs.
799                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
800                let after_pos = pos + delim_count;
801                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
802
803                if (at_run_start || at_run_end) && !is_escaped {
804                    // Found a potential closer!
805                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
806                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
807                    if delim_char == '_'
808                        && pos > start
809                        && let Some(prev_char) = text[..pos].chars().last()
810                        && prev_char.is_whitespace()
811                    {
812                        log::trace!(
813                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
814                            pos
815                        );
816                        // Not a valid closer, continue searching
817                        pos = advance_char_boundary(text, pos, end);
818                        continue;
819                    }
820
821                    log::trace!(
822                        "Found exact {} x {} closer at pos {}",
823                        delim_char,
824                        delim_count,
825                        pos
826                    );
827                    return Some(pos);
828                }
829            }
830        }
831
832        // Not a closer, move to next UTF-8 boundary.
833        pos = advance_char_boundary(text, pos, end);
834    }
835
836    None
837}
838
839/// Parse inline content and look for a matching closer, with nested one attempts.
840///
841/// This implements the symmetric case to `parse_until_closer_with_nested_two`:
842/// When parsing `**...**`, if we encounter `*` (and it's not followed by
843/// another `*` that would be part of our `**` closer), try to parse it as
844/// `one c mempty` (emphasis). If `one` succeeds, those `*` delimiters are
845/// consumed, and we continue searching for the `**` closer.
846///
847/// This ensures nested emphasis closes before the outer strong emphasis.
848///
849/// Example: `**bold with *italic***`
850/// - When parsing the outer `**...**, we scan for `**` closer
851/// - At position 12, we encounter a single `*` (start of `*italic`)
852/// - We try `one` which succeeds with `*italic*` (consuming the first `*` from `***`)
853/// - We continue scanning and find `**` at position 20 (the remaining `**` from `***`)
854/// - Result: STRONG["bold with " EMPHASIS["italic"]]
855///
856/// # Arguments
857/// * `end` - Don't search beyond this position (respects nesting boundaries)
858fn parse_until_closer_with_nested_one(
859    text: &str,
860    start: usize,
861    delim_char: char,
862    delim_count: usize,
863    end: usize,
864    config: &ParserOptions,
865) -> Option<usize> {
866    let bytes = text.as_bytes();
867    let mut pos = start;
868
869    while pos < end.min(text.len()) {
870        if bytes[pos] == b'`'
871            && let Some(m) = try_parse_inline_executable(
872                &text[pos..],
873                config.extensions.rmarkdown_inline_code,
874                config.extensions.quarto_inline_code,
875            )
876        {
877            log::trace!(
878                "Skipping inline executable span of {} bytes at pos {}",
879                m.total_len,
880                pos
881            );
882            pos += m.total_len;
883            continue;
884        }
885
886        // Skip over code spans - their content is protected from delimiter matching
887        if bytes[pos] == b'`'
888            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
889        {
890            log::trace!("Skipping code span of {} bytes at pos {}", len, pos);
891            pos += len;
892            continue;
893        }
894
895        // Skip over inline math - their content is protected from delimiter matching
896        if bytes[pos] == b'$'
897            && let Some((len, _)) = try_parse_inline_math(&text[pos..])
898        {
899            log::trace!("Skipping inline math of {} bytes at pos {}", len, pos);
900            pos += len;
901            continue;
902        }
903
904        // Skip over links - their content is protected from delimiter matching
905        if bytes[pos] == b'['
906            && let Some((len, _, _, _)) = try_parse_inline_link(
907                &text[pos..],
908                config.dialect == crate::options::Dialect::CommonMark,
909            )
910        {
911            log::trace!("Skipping inline link of {} bytes at pos {}", len, pos);
912            pos += len;
913            continue;
914        }
915
916        // Pandoc algorithm: If we're looking for a double delimiter (**) and
917        // encounter a single delimiter (*), check if it's a valid emphasis opener.
918        // If it is, try to parse it as `one` (emphasis). If `one` succeeds, skip
919        // over it. If `one` fails, the outer `two` also fails (delimiter poisoning).
920        // If the `*` is NOT a valid opener (e.g., followed by whitespace or escaped),
921        // skip it and continue looking for the `**` closer.
922        if delim_count == 2 && pos < text.len() && bytes[pos] == delim_char as u8 {
923            // Check that there's NOT a second delimiter immediately after
924            // (which would make this part of our `**` closer or another `**` opener)
925            let no_second_delim = pos + 1 >= bytes.len() || bytes[pos + 1] != delim_char as u8;
926
927            if no_second_delim {
928                // Check if this * is escaped (preceded by odd number of backslashes)
929                let is_escaped = {
930                    let mut backslash_count = 0;
931                    let mut check_pos = pos;
932                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
933                        backslash_count += 1;
934                        check_pos -= 1;
935                    }
936                    backslash_count % 2 == 1
937                };
938
939                if is_escaped {
940                    // Escaped delimiter - just literal text, skip it
941                    log::trace!("* at pos {} is escaped, skipping", pos);
942                    pos = advance_char_boundary(text, pos, end);
943                    continue;
944                }
945
946                // Check if this * is a valid emphasis opener (Pandoc's enclosure rule).
947                // A delimiter followed by whitespace is NOT an opener - it's literal text.
948                let after_delim = pos + 1;
949                let followed_by_whitespace = after_delim < text.len()
950                    && text[after_delim..]
951                        .chars()
952                        .next()
953                        .is_some_and(|c| c.is_whitespace());
954
955                if followed_by_whitespace {
956                    // Not a valid opener - just literal text, skip it
957                    log::trace!(
958                        "* at pos {} followed by whitespace, not an opener, skipping",
959                        pos
960                    );
961                    pos = advance_char_boundary(text, pos, end);
962                    continue;
963                }
964
965                log::trace!(
966                    "try_parse_two: found * at pos {}, attempting nested one",
967                    pos
968                );
969
970                // Try to parse as `one` (emphasis)
971                // We create a temporary builder to test if `one` succeeds
972                let mut temp_builder = GreenNodeBuilder::new();
973                if let Some(one_consumed) =
974                    try_parse_one(text, pos, delim_char, end, config, &mut temp_builder)
975                {
976                    // `one` succeeded! Those * delimiters are consumed.
977                    // We skip past the `one` and continue searching for our `**` closer.
978                    log::trace!(
979                        "Nested one succeeded, consumed {} bytes, continuing search",
980                        one_consumed
981                    );
982                    pos += one_consumed;
983                    continue;
984                }
985
986                // `one` failed to find a closer. According to Pandoc's algorithm,
987                // this means the outer `two` should also fail. An unmatched inner
988                // delimiter "poisons" the outer emphasis.
989                // Example: `**foo *bar**` - the `*` can't find a closer, so the
990                // outer `**` should fail and the whole thing becomes literal.
991                log::trace!(
992                    "Nested one failed at pos {}, poisoning outer two (no closer found)",
993                    pos
994                );
995                return None;
996            }
997        }
998
999        // Check if we have a potential closer here
1000        if pos + delim_count <= text.len() {
1001            let mut matches = true;
1002            for i in 0..delim_count {
1003                if bytes[pos + i] != delim_char as u8 {
1004                    matches = false;
1005                    break;
1006                }
1007            }
1008
1009            if matches {
1010                // Check: not escaped (preceded by odd number of backslashes)
1011                let is_escaped = {
1012                    let mut backslash_count = 0;
1013                    let mut check_pos = pos;
1014                    while check_pos > 0 && bytes[check_pos - 1] == b'\\' {
1015                        backslash_count += 1;
1016                        check_pos -= 1;
1017                    }
1018                    backslash_count % 2 == 1 // Odd number = escaped
1019                };
1020
1021                // Allow matching at the start OR end of a delimiter run.
1022                // This lets `**` close at the end of `***` (after a nested `*` closes),
1023                // while still avoiding matches in the middle of longer runs.
1024                let at_run_start = pos == 0 || bytes[pos - 1] != delim_char as u8;
1025                let after_pos = pos + delim_count;
1026                let at_run_end = after_pos >= bytes.len() || bytes[after_pos] != delim_char as u8;
1027
1028                if (at_run_start || at_run_end) && !is_escaped {
1029                    // Found a potential closer!
1030                    // For underscores, check right-flanking: closer must be preceded by non-whitespace
1031                    // For asterisks, Pandoc doesn't require right-flanking (see ender function in Markdown.hs)
1032                    if delim_char == '_'
1033                        && pos > start
1034                        && let Some(prev_char) = text[..pos].chars().last()
1035                        && prev_char.is_whitespace()
1036                    {
1037                        log::trace!(
1038                            "Underscore closer preceded by whitespace at pos {}, not right-flanking",
1039                            pos
1040                        );
1041                        // Not a valid closer, continue searching
1042                        pos = advance_char_boundary(text, pos, end);
1043                        continue;
1044                    }
1045
1046                    log::trace!(
1047                        "Found exact {} x {} closer at pos {}",
1048                        delim_char,
1049                        delim_count,
1050                        pos
1051                    );
1052                    return Some(pos);
1053                }
1054            }
1055        }
1056
1057        // Not a closer, move to next UTF-8 boundary.
1058        pos = advance_char_boundary(text, pos, end);
1059    }
1060
1061    None
1062}
1063
1064///
1065/// This is the recursive inline parser that handles all inline elements:
1066/// - Text
1067/// - Escapes (highest priority)
1068/// - Code spans
1069/// - Math (inline and display)
1070/// - Emphasis/strong (via try_parse_emphasis)
1071/// - Other inline elements
1072///
1073/// **Important**: This is where the greedy left-to-right parsing happens.
1074/// When we see `**`, we try to parse it as STRONG. If it succeeds, those
1075/// delimiters are consumed and won't be available for outer emphasis.
1076///
1077/// # Arguments
1078/// * `nested_emphasis` - If true, bypass opener validity checks for emphasis.
1079///   Set to true when called from within emphasis parsing (e.g., from try_parse_one/two/three).
1080fn parse_inline_range(
1081    text: &str,
1082    start: usize,
1083    end: usize,
1084    config: &ParserOptions,
1085    builder: &mut GreenNodeBuilder,
1086) {
1087    parse_inline_range_impl(text, start, end, config, builder, false, false)
1088}
1089
1090/// Same as `parse_inline_range` but bypasses opener validity checks for emphasis.
1091/// Used within emphasis parsing contexts (e.g., from try_parse_one/two/three).
1092fn parse_inline_range_nested(
1093    text: &str,
1094    start: usize,
1095    end: usize,
1096    config: &ParserOptions,
1097    builder: &mut GreenNodeBuilder,
1098) {
1099    parse_inline_range_impl(text, start, end, config, builder, true, false)
1100}
1101
/// Report whether an emoji alias may begin at byte offset `pos` of `text`.
///
/// Returns `true` at the very start of `text`, and otherwise whenever the byte
/// directly before `pos` is neither an ASCII letter or digit nor `_`.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    match pos.checked_sub(1) {
        // Nothing precedes the start of the text.
        None => true,
        Some(prev_idx) => {
            let prev = text.as_bytes()[prev_idx];
            !(prev.is_ascii_alphanumeric() || prev == b'_')
        }
    }
}
1111
/// Step `pos` forward by one character of `text`, never going past `end`.
///
/// Returns `pos` unchanged when it is already at or beyond `end` or the end of
/// `text`. Otherwise returns `pos` plus the UTF-8 length of the character that
/// starts at `pos`, clamped to `end`.
#[inline]
fn advance_char_boundary(text: &str, pos: usize, end: usize) -> usize {
    let limit = end.min(text.len());
    if pos >= limit {
        return pos;
    }

    // Width in bytes of the character starting at `pos`; fall back to a
    // single byte if no character is found there.
    let step = match text[pos..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    };
    end.min(pos + step)
}
1123
1124fn parse_inline_range_impl(
1125    text: &str,
1126    start: usize,
1127    end: usize,
1128    config: &ParserOptions,
1129    builder: &mut GreenNodeBuilder,
1130    nested_emphasis: bool,
1131    nested_in_link: bool,
1132) {
1133    log::trace!(
1134        "parse_inline_range: start={}, end={}, text={:?}",
1135        start,
1136        end,
1137        &text[start..end]
1138    );
1139    let mut pos = start;
1140    let mut text_start = start;
1141
1142    while pos < end {
1143        let byte = text.as_bytes()[pos];
1144
1145        // Backslash math (highest priority if enabled)
1146        if byte == b'\\' {
1147            // Try double backslash display math first: \\[...\\]
1148            if config.extensions.tex_math_double_backslash {
1149                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
1150                {
1151                    if pos > text_start {
1152                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1153                    }
1154                    log::trace!("Matched double backslash display math at pos {}", pos);
1155                    emit_double_backslash_display_math(builder, content);
1156                    pos += len;
1157                    text_start = pos;
1158                    continue;
1159                }
1160
1161                // Try double backslash inline math: \\(...\\)
1162                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
1163                    if pos > text_start {
1164                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1165                    }
1166                    log::trace!("Matched double backslash inline math at pos {}", pos);
1167                    emit_double_backslash_inline_math(builder, content);
1168                    pos += len;
1169                    text_start = pos;
1170                    continue;
1171                }
1172            }
1173
1174            // Try single backslash display math: \[...\]
1175            if config.extensions.tex_math_single_backslash {
1176                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
1177                {
1178                    if pos > text_start {
1179                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1180                    }
1181                    log::trace!("Matched single backslash display math at pos {}", pos);
1182                    emit_single_backslash_display_math(builder, content);
1183                    pos += len;
1184                    text_start = pos;
1185                    continue;
1186                }
1187
1188                // Try single backslash inline math: \(...\)
1189                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
1190                    if pos > text_start {
1191                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1192                    }
1193                    log::trace!("Matched single backslash inline math at pos {}", pos);
1194                    emit_single_backslash_inline_math(builder, content);
1195                    pos += len;
1196                    text_start = pos;
1197                    continue;
1198                }
1199            }
1200
1201            // Try math environments \begin{equation}...\end{equation}
1202            if config.extensions.raw_tex
1203                && let Some((len, begin_marker, content, end_marker)) =
1204                    try_parse_math_environment(&text[pos..])
1205            {
1206                if pos > text_start {
1207                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1208                }
1209                log::trace!("Matched math environment at pos {}", pos);
1210                emit_display_math_environment(builder, begin_marker, content, end_marker);
1211                pos += len;
1212                text_start = pos;
1213                continue;
1214            }
1215
1216            // Try bookdown reference: \@ref(label)
1217            if config.extensions.bookdown_references
1218                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
1219            {
1220                if pos > text_start {
1221                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1222                }
1223                log::trace!("Matched bookdown reference at pos {}: {}", pos, label);
1224                super::citations::emit_bookdown_crossref(builder, label);
1225                pos += len;
1226                text_start = pos;
1227                continue;
1228            }
1229
1230            // Try escapes (after bookdown refs and backslash math)
1231            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
1232                let escape_enabled = match escape_type {
1233                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
1234                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
1235                    EscapeType::Literal => {
1236                        // BASE_ESCAPABLE matches Pandoc's markdown_strict /
1237                        // original Markdown set, plus `|` and `~` which the
1238                        // formatter emits as escapes for pipe-table separators
1239                        // and strikethrough delimiters. Recognising those here
1240                        // keeps round-trips idempotent in flavors that don't
1241                        // enable all_symbols_escapable.
1242                        //
1243                        // Under CommonMark dialect, the spec (§2.4) explicitly
1244                        // allows ANY ASCII punctuation to be backslash-escaped,
1245                        // independent of the all_symbols_escapable extension
1246                        // (which also widens to whitespace, a Pandoc-only
1247                        // construct).
1248                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!|~";
1249                        BASE_ESCAPABLE.contains(ch)
1250                            || config.extensions.all_symbols_escapable
1251                            || (config.dialect == crate::Dialect::CommonMark
1252                                && ch.is_ascii_punctuation())
1253                    }
1254                };
1255                if !escape_enabled {
1256                    // Don't treat as hard line break - skip the escape and continue
1257                    // The backslash will be included in the next TEXT token
1258                    pos = advance_char_boundary(text, pos, end);
1259                    continue;
1260                }
1261
1262                // Emit accumulated text
1263                if pos > text_start {
1264                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1265                }
1266
1267                log::trace!("Matched escape at pos {}: \\{}", pos, ch);
1268                emit_escape(builder, ch, escape_type);
1269                pos += len;
1270                text_start = pos;
1271                continue;
1272            }
1273
1274            // Try LaTeX commands (after escapes, before shortcodes)
1275            if config.extensions.raw_tex
1276                && let Some(len) = try_parse_latex_command(&text[pos..])
1277            {
1278                if pos > text_start {
1279                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1280                }
1281                log::trace!("Matched LaTeX command at pos {}", pos);
1282                parse_latex_command(builder, &text[pos..], len);
1283                pos += len;
1284                text_start = pos;
1285                continue;
1286            }
1287        }
1288
1289        // Try Quarto shortcodes: {{< shortcode >}}
1290        if byte == b'{'
1291            && pos + 1 < text.len()
1292            && text.as_bytes()[pos + 1] == b'{'
1293            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
1294        {
1295            if pos > text_start {
1296                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1297            }
1298            log::trace!("Matched shortcode at pos {}: {}", pos, &name);
1299            emit_shortcode(builder, &name, attrs);
1300            pos += len;
1301            text_start = pos;
1302            continue;
1303        }
1304
1305        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
1306        if byte == b'`'
1307            && let Some(m) = try_parse_inline_executable(
1308                &text[pos..],
1309                config.extensions.rmarkdown_inline_code,
1310                config.extensions.quarto_inline_code,
1311            )
1312        {
1313            if pos > text_start {
1314                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1315            }
1316            log::trace!("Matched inline executable code at pos {}", pos);
1317            emit_inline_executable(builder, &m);
1318            pos += m.total_len;
1319            text_start = pos;
1320            continue;
1321        }
1322
1323        // Try code spans
1324        if byte == b'`' {
1325            if let Some((len, content, backtick_count, attributes)) =
1326                try_parse_code_span(&text[pos..])
1327            {
1328                // Emit accumulated text
1329                if pos > text_start {
1330                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1331                }
1332
1333                log::trace!(
1334                    "Matched code span at pos {}: {} backticks",
1335                    pos,
1336                    backtick_count
1337                );
1338
1339                // Check for raw inline
1340                if let Some(ref attrs) = attributes
1341                    && config.extensions.raw_attribute
1342                    && let Some(format) = is_raw_inline(attrs)
1343                {
1344                    use super::raw_inline::emit_raw_inline;
1345                    log::trace!("Matched raw inline span at pos {}: format={}", pos, format);
1346                    emit_raw_inline(builder, content, backtick_count, format);
1347                } else if !config.extensions.inline_code_attributes && attributes.is_some() {
1348                    let code_span_len = backtick_count * 2 + content.len();
1349                    emit_code_span(builder, content, backtick_count, None);
1350                    pos += code_span_len;
1351                    text_start = pos;
1352                    continue;
1353                } else {
1354                    emit_code_span(builder, content, backtick_count, attributes);
1355                }
1356
1357                pos += len;
1358                text_start = pos;
1359                continue;
1360            }
1361
1362            // Unmatched backtick run.
1363            //
1364            // CommonMark (and GFM) treat the whole run as literal text — the
1365            // run cannot be re-entered as a shorter opener. Pandoc-markdown
1366            // instead lets a longer run shadow a shorter one (e.g.
1367            // `` ```foo`` `` parses as `` ` `` + ``<code>foo</code>``), so
1368            // for the Pandoc dialect we fall through and advance one byte at
1369            // a time, allowing the inner run to be tried on a later iteration.
1370            if config.dialect == Dialect::CommonMark {
1371                let run_len = text[pos..].bytes().take_while(|&b| b == b'`').count();
1372                pos += run_len;
1373                continue;
1374            }
1375        }
1376
1377        // Try textual emoji aliases: :smile:
1378        if byte == b':'
1379            && config.extensions.emoji
1380            && is_emoji_boundary(text, pos)
1381            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
1382        {
1383            if pos > text_start {
1384                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1385            }
1386            log::trace!("Matched emoji at pos {}", pos);
1387            emit_emoji(builder, &text[pos..pos + len]);
1388            pos += len;
1389            text_start = pos;
1390            continue;
1391        }
1392
1393        // Try inline footnotes: ^[note]
1394        if byte == b'^'
1395            && pos + 1 < text.len()
1396            && text.as_bytes()[pos + 1] == b'['
1397            && config.extensions.inline_footnotes
1398            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
1399        {
1400            if pos > text_start {
1401                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1402            }
1403            log::trace!("Matched inline footnote at pos {}", pos);
1404            emit_inline_footnote(builder, content, config);
1405            pos += len;
1406            text_start = pos;
1407            continue;
1408        }
1409
1410        // Try superscript: ^text^
1411        if byte == b'^'
1412            && config.extensions.superscript
1413            && let Some((len, content)) = try_parse_superscript(&text[pos..])
1414        {
1415            if pos > text_start {
1416                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1417            }
1418            log::trace!("Matched superscript at pos {}", pos);
1419            emit_superscript(builder, content, config);
1420            pos += len;
1421            text_start = pos;
1422            continue;
1423        }
1424
1425        // Try bookdown definition: (\#label) or (ref:label)
1426        if byte == b'(' && config.extensions.bookdown_references {
1427            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
1428                if pos > text_start {
1429                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1430                }
1431                log::trace!("Matched bookdown definition at pos {}: {}", pos, label);
1432                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1433                pos += len;
1434                text_start = pos;
1435                continue;
1436            }
1437            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
1438                if pos > text_start {
1439                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1440                }
1441                log::trace!("Matched bookdown text reference at pos {}: {}", pos, label);
1442                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
1443                pos += len;
1444                text_start = pos;
1445                continue;
1446            }
1447        }
1448
1449        // Try subscript: ~text~
1450        if byte == b'~'
1451            && config.extensions.subscript
1452            && let Some((len, content)) = try_parse_subscript(&text[pos..])
1453        {
1454            if pos > text_start {
1455                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1456            }
1457            log::trace!("Matched subscript at pos {}", pos);
1458            emit_subscript(builder, content, config);
1459            pos += len;
1460            text_start = pos;
1461            continue;
1462        }
1463
1464        // Try strikeout: ~~text~~
1465        if byte == b'~'
1466            && config.extensions.strikeout
1467            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
1468        {
1469            if pos > text_start {
1470                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1471            }
1472            log::trace!("Matched strikeout at pos {}", pos);
1473            emit_strikeout(builder, content, config);
1474            pos += len;
1475            text_start = pos;
1476            continue;
1477        }
1478
1479        // Try mark/highlight: ==text==
1480        if byte == b'='
1481            && config.extensions.mark
1482            && let Some((len, content)) = try_parse_mark(&text[pos..])
1483        {
1484            if pos > text_start {
1485                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1486            }
1487            log::trace!("Matched mark at pos {}", pos);
1488            emit_mark(builder, content, config);
1489            pos += len;
1490            text_start = pos;
1491            continue;
1492        }
1493
1494        // Try GFM inline math: $`...`$
1495        if byte == b'$'
1496            && config.extensions.tex_math_gfm
1497            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
1498        {
1499            if pos > text_start {
1500                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1501            }
1502            log::trace!("Matched GFM inline math at pos {}", pos);
1503            emit_gfm_inline_math(builder, content);
1504            pos += len;
1505            text_start = pos;
1506            continue;
1507        }
1508
1509        // Try math ($...$, $$...$$)
1510        if byte == b'$' && config.extensions.tex_math_dollars {
1511            // Try display math first ($$...$$)
1512            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
1513                // Emit accumulated text
1514                if pos > text_start {
1515                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1516                }
1517
1518                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
1519                log::trace!(
1520                    "Matched display math at pos {}: {} dollars",
1521                    pos,
1522                    dollar_count
1523                );
1524
1525                // Check for trailing attributes (Quarto cross-reference support)
1526                let after_math = &text[pos + len..];
1527                let attr_len = if config.extensions.quarto_crossrefs {
1528                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
1529                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
1530                        let trimmed_after = after_math.trim_start();
1531                        if let Some(open_brace_pos) = trimmed_after.find('{') {
1532                            let ws_before_brace = after_math.len() - trimmed_after.len();
1533                            let attr_text_len = trimmed_after[open_brace_pos..]
1534                                .find('}')
1535                                .map(|close| close + 1)
1536                                .unwrap_or(0);
1537                            ws_before_brace + open_brace_pos + attr_text_len
1538                        } else {
1539                            0
1540                        }
1541                    } else {
1542                        0
1543                    }
1544                } else {
1545                    0
1546                };
1547
1548                let total_len = len + attr_len;
1549                emit_display_math(builder, content, dollar_count);
1550
1551                // Emit attributes if present
1552                if attr_len > 0 {
1553                    use crate::parser::utils::attributes::{
1554                        emit_attributes, try_parse_trailing_attributes,
1555                    };
1556                    let attr_text = &text[pos + len..pos + total_len];
1557                    if let Some((attr_block, _text_before)) =
1558                        try_parse_trailing_attributes(attr_text)
1559                    {
1560                        let trimmed_after = attr_text.trim_start();
1561                        let ws_len = attr_text.len() - trimmed_after.len();
1562                        if ws_len > 0 {
1563                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1564                        }
1565                        emit_attributes(builder, &attr_block);
1566                    }
1567                }
1568
1569                pos += total_len;
1570                text_start = pos;
1571                continue;
1572            }
1573
1574            // Try inline math ($...$)
1575            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1576                // Emit accumulated text
1577                if pos > text_start {
1578                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1579                }
1580
1581                log::trace!("Matched inline math at pos {}", pos);
1582                emit_inline_math(builder, content);
1583                pos += len;
1584                text_start = pos;
1585                continue;
1586            }
1587
1588            // Neither display nor inline math matched - emit the $ as literal text
1589            // This ensures each $ gets its own TEXT token for CST compatibility
1590            if pos > text_start {
1591                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1592            }
1593            builder.token(SyntaxKind::TEXT.into(), "$");
1594            pos = advance_char_boundary(text, pos, end);
1595            text_start = pos;
1596            continue;
1597        }
1598
1599        // Try autolinks: <url> or <email>
1600        if byte == b'<'
1601            && config.extensions.autolinks
1602            && let Some((len, url)) = try_parse_autolink(
1603                &text[pos..],
1604                config.dialect == crate::options::Dialect::CommonMark,
1605            )
1606        {
1607            if pos > text_start {
1608                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1609            }
1610            log::trace!("Matched autolink at pos {}", pos);
1611            emit_autolink(builder, &text[pos..pos + len], url);
1612            pos += len;
1613            text_start = pos;
1614            continue;
1615        }
1616
1617        if !nested_in_link
1618            && config.extensions.autolink_bare_uris
1619            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1620        {
1621            if pos > text_start {
1622                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1623            }
1624            log::trace!("Matched bare URI at pos {}", pos);
1625            emit_bare_uri_link(builder, url, config);
1626            pos += len;
1627            text_start = pos;
1628            continue;
1629        }
1630
1631        // Try native spans: <span>text</span> (after autolink since both start with <)
1632        if byte == b'<'
1633            && config.extensions.native_spans
1634            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1635        {
1636            if pos > text_start {
1637                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1638            }
1639            log::trace!("Matched native span at pos {}", pos);
1640            emit_native_span(builder, content, &attributes, config);
1641            pos += len;
1642            text_start = pos;
1643            continue;
1644        }
1645
1646        // Try inline raw HTML (CommonMark §6.6 / Pandoc raw_html). Must run
1647        // after autolinks (more specific) and native spans (Pandoc
1648        // <span>…</span> wrapper) since all three start with `<`.
1649        if byte == b'<'
1650            && config.extensions.raw_html
1651            && let Some(len) = try_parse_inline_html(&text[pos..])
1652        {
1653            if pos > text_start {
1654                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1655            }
1656            log::trace!("Matched inline raw HTML at pos {}", pos);
1657            emit_inline_html(builder, &text[pos..pos + len]);
1658            pos += len;
1659            text_start = pos;
1660            continue;
1661        }
1662
1663        // Images and links - process in order: inline image, reference image, footnote ref, inline link, reference link
1664        if byte == b'!' && pos + 1 < text.len() && text.as_bytes()[pos + 1] == b'[' {
1665            // Try inline image: ![alt](url)
1666            if let Some((len, alt_text, dest, attributes)) = try_parse_inline_image(&text[pos..]) {
1667                if pos > text_start {
1668                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1669                }
1670                log::trace!("Matched inline image at pos {}", pos);
1671                emit_inline_image(
1672                    builder,
1673                    &text[pos..pos + len],
1674                    alt_text,
1675                    dest,
1676                    attributes,
1677                    config,
1678                );
1679                pos += len;
1680                text_start = pos;
1681                continue;
1682            }
1683
1684            // Try reference image: ![alt][ref] or ![alt]
1685            if config.extensions.reference_links {
1686                let allow_shortcut = config.extensions.shortcut_reference_links;
1687                if let Some((len, alt_text, reference, is_implicit)) =
1688                    try_parse_reference_image(&text[pos..], allow_shortcut)
1689                {
1690                    if pos > text_start {
1691                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1692                    }
1693                    log::trace!("Matched reference image at pos {}", pos);
1694                    emit_reference_image(builder, alt_text, &reference, is_implicit, config);
1695                    pos += len;
1696                    text_start = pos;
1697                    continue;
1698                }
1699            }
1700        }
1701
1702        // Process bracket-starting elements
1703        if byte == b'[' {
1704            // Try footnote reference: [^id]
1705            if config.extensions.footnotes
1706                && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1707            {
1708                if pos > text_start {
1709                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1710                }
1711                log::trace!("Matched footnote reference at pos {}", pos);
1712                emit_footnote_reference(builder, &id);
1713                pos += len;
1714                text_start = pos;
1715                continue;
1716            }
1717
1718            // Try inline link: [text](url)
1719            if config.extensions.inline_links
1720                && let Some((len, link_text, dest, attributes)) = try_parse_inline_link(
1721                    &text[pos..],
1722                    config.dialect == crate::options::Dialect::CommonMark,
1723                )
1724            {
1725                if pos > text_start {
1726                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1727                }
1728                log::trace!("Matched inline link at pos {}", pos);
1729                emit_inline_link(
1730                    builder,
1731                    &text[pos..pos + len],
1732                    link_text,
1733                    dest,
1734                    attributes,
1735                    config,
1736                );
1737                pos += len;
1738                text_start = pos;
1739                continue;
1740            }
1741
1742            // Try reference link: [text][ref] or [text]
1743            if config.extensions.reference_links {
1744                let allow_shortcut = config.extensions.shortcut_reference_links;
1745                if let Some((len, link_text, reference, is_implicit)) =
1746                    try_parse_reference_link(&text[pos..], allow_shortcut)
1747                {
1748                    if pos > text_start {
1749                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1750                    }
1751                    log::trace!("Matched reference link at pos {}", pos);
1752                    emit_reference_link(builder, link_text, &reference, is_implicit, config);
1753                    pos += len;
1754                    text_start = pos;
1755                    continue;
1756                }
1757            }
1758
1759            // Try bracketed citation: [@cite]
1760            if config.extensions.citations
1761                && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1762            {
1763                if pos > text_start {
1764                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1765                }
1766                log::trace!("Matched bracketed citation at pos {}", pos);
1767                emit_bracketed_citation(builder, content);
1768                pos += len;
1769                text_start = pos;
1770                continue;
1771            }
1772        }
1773
1774        // Try bracketed spans: [text]{.class}
1775        // Must come after links/citations
1776        if byte == b'['
1777            && config.extensions.bracketed_spans
1778            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1779        {
1780            if pos > text_start {
1781                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1782            }
1783            log::trace!("Matched bracketed span at pos {}", pos);
1784            emit_bracketed_span(builder, &text_content, &attrs, config);
1785            pos += len;
1786            text_start = pos;
1787            continue;
1788        }
1789
1790        // Try bare citation: @cite (must come after bracketed elements)
1791        if byte == b'@'
1792            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1793            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1794        {
1795            let is_crossref =
1796                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1797            if is_crossref || config.extensions.citations {
1798                if pos > text_start {
1799                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1800                }
1801                if is_crossref {
1802                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1803                    super::citations::emit_crossref(builder, key, has_suppress);
1804                } else {
1805                    log::trace!("Matched bare citation at pos {}: {}", pos, &key);
1806                    emit_bare_citation(builder, key, has_suppress);
1807                }
1808                pos += len;
1809                text_start = pos;
1810                continue;
1811            }
1812        }
1813
1814        // Try suppress-author citation: -@cite
1815        if byte == b'-'
1816            && pos + 1 < text.len()
1817            && text.as_bytes()[pos + 1] == b'@'
1818            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1819            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1820        {
1821            let is_crossref =
1822                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1823            if is_crossref || config.extensions.citations {
1824                if pos > text_start {
1825                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1826                }
1827                if is_crossref {
1828                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1829                    super::citations::emit_crossref(builder, key, has_suppress);
1830                } else {
1831                    log::trace!("Matched suppress-author citation at pos {}: {}", pos, &key);
1832                    emit_bare_citation(builder, key, has_suppress);
1833                }
1834                pos += len;
1835                text_start = pos;
1836                continue;
1837            }
1838        }
1839
1840        // Try to parse emphasis at this position
1841        if byte == b'*' || byte == b'_' {
1842            // Count the delimiter run to avoid re-parsing
1843            let bytes = text.as_bytes();
1844            let mut delim_count = 0;
1845            while pos + delim_count < bytes.len() && bytes[pos + delim_count] == byte {
1846                delim_count += 1;
1847            }
1848
1849            // Emit any accumulated text before the delimiter
1850            if pos > text_start {
1851                log::trace!(
1852                    "Emitting TEXT before delimiter: {:?}",
1853                    &text[text_start..pos]
1854                );
1855                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1856                text_start = pos; // Update text_start after emission
1857            }
1858
1859            // Try to parse emphasis
1860            // Use nested variant (bypass opener validity) when in nested context
1861            let emphasis_result = if nested_emphasis {
1862                try_parse_emphasis_nested(text, pos, end, config, builder)
1863            } else {
1864                try_parse_emphasis(text, pos, end, config, builder)
1865            };
1866
1867            if let Some((consumed, _)) = emphasis_result {
1868                // Successfully parsed emphasis
1869                log::trace!(
1870                    "Parsed emphasis, consumed {} bytes from pos {}",
1871                    consumed,
1872                    pos
1873                );
1874                pos += consumed;
1875                text_start = pos;
1876            } else {
1877                // Failed to parse, delimiter run will be treated as regular text
1878                // Skip the ENTIRE delimiter run to avoid re-parsing parts of it
1879                log::trace!(
1880                    "Failed to parse emphasis at pos {}, skipping {} delimiters as literal",
1881                    pos,
1882                    delim_count
1883                );
1884                pos += delim_count;
1885                // DON'T update text_start - let the delimiters accumulate
1886            }
1887            continue;
1888        }
1889
1890        // Check for newlines - may need to emit as hard line break
1891        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1892            let text_before = &text[text_start..pos];
1893
1894            // Check for trailing spaces hard line break (always enabled in Pandoc)
1895            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1896            if trailing_spaces >= 2 {
1897                // Emit text before the trailing spaces
1898                let text_content = &text_before[..text_before.len() - trailing_spaces];
1899                if !text_content.is_empty() {
1900                    builder.token(SyntaxKind::TEXT.into(), text_content);
1901                }
1902                let spaces = " ".repeat(trailing_spaces);
1903                builder.token(
1904                    SyntaxKind::HARD_LINE_BREAK.into(),
1905                    &format!("{}\r\n", spaces),
1906                );
1907                pos += 2;
1908                text_start = pos;
1909                continue;
1910            }
1911
1912            // hard_line_breaks: treat all single newlines as hard line breaks
1913            if config.extensions.hard_line_breaks {
1914                if !text_before.is_empty() {
1915                    builder.token(SyntaxKind::TEXT.into(), text_before);
1916                }
1917                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1918                pos += 2;
1919                text_start = pos;
1920                continue;
1921            }
1922
1923            // Regular newline
1924            if !text_before.is_empty() {
1925                builder.token(SyntaxKind::TEXT.into(), text_before);
1926            }
1927            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1928            pos += 2;
1929            text_start = pos;
1930            continue;
1931        }
1932
1933        if byte == b'\n' {
1934            let text_before = &text[text_start..pos];
1935
1936            // Check for trailing spaces hard line break (always enabled in Pandoc)
1937            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1938            if trailing_spaces >= 2 {
1939                // Emit text before the trailing spaces
1940                let text_content = &text_before[..text_before.len() - trailing_spaces];
1941                if !text_content.is_empty() {
1942                    builder.token(SyntaxKind::TEXT.into(), text_content);
1943                }
1944                let spaces = " ".repeat(trailing_spaces);
1945                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1946                pos += 1;
1947                text_start = pos;
1948                continue;
1949            }
1950
1951            // hard_line_breaks: treat all single newlines as hard line breaks
1952            if config.extensions.hard_line_breaks {
1953                if !text_before.is_empty() {
1954                    builder.token(SyntaxKind::TEXT.into(), text_before);
1955                }
1956                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1957                pos += 1;
1958                text_start = pos;
1959                continue;
1960            }
1961
1962            // Regular newline
1963            if !text_before.is_empty() {
1964                builder.token(SyntaxKind::TEXT.into(), text_before);
1965            }
1966            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1967            pos += 1;
1968            text_start = pos;
1969            continue;
1970        }
1971
1972        // Regular character, keep accumulating
1973        pos = advance_char_boundary(text, pos, end);
1974    }
1975
1976    // Emit any remaining text
1977    if pos > text_start && text_start < end {
1978        log::trace!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1979        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1980    }
1981
1982    log::trace!("parse_inline_range complete: start={}, end={}", start, end);
1983}
1984
1985#[cfg(test)]
1986mod tests {
1987    use super::*;
1988    use crate::syntax::{SyntaxKind, SyntaxNode};
1989    use rowan::GreenNode;
1990
/// Parses the single emphasis run `*test*` through
/// `parse_inline_text_recursive` and checks that the resulting tree is
/// lossless and contains an EMPHASIS node.
#[test]
fn test_recursive_simple_emphasis() {
    let text = "*test*";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Wrap in a PARAGRAPH node (inline content needs a parent). Without a
    // parent, `builder.finish()` only succeeds when the parser happens to emit
    // exactly one top-level node, so a parser regression would show up as a
    // panic inside the builder instead of as one of the assertions below.
    builder.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(&mut builder, text, &config);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Should be lossless
    assert_eq!(node.text().to_string(), text);

    // Should have EMPHASIS node
    let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
    assert!(has_emph, "Should have EMPHASIS node");
}
2009
/// `*foo **bar** baz*` parsed via `parse_inline_text_recursive` must
/// round-trip and contain both an EMPHASIS and a STRONG node.
#[test]
fn test_recursive_nested() {
    let input = "*foo **bar** baz*";
    let options = ParserOptions::default();

    // Inline content needs a parent, so everything goes under a PARAGRAPH.
    let mut cst = GreenNodeBuilder::new();
    cst.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(&mut cst, input, &options);
    cst.finish_node();
    let root = SyntaxNode::new_root(cst.finish());

    // The tree text must reproduce the input exactly.
    assert_eq!(root.text().to_string(), input);

    // Both node kinds have to show up somewhere in the tree.
    let contains_kind = |kind: SyntaxKind| root.descendants().any(|n| n.kind() == kind);
    let emphasis_present = contains_kind(SyntaxKind::EMPHASIS);
    let strong_present = contains_kind(SyntaxKind::STRONG);

    assert!(emphasis_present, "Should have EMPHASIS node");
    assert!(strong_present, "Should have STRONG node");
}
2034
/// Calls `try_parse_emphasis` directly on `*test*`, starting at offset 0.
#[test]
fn test_parse_simple_emphasis() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "*test*";
    let options = ParserOptions::default();
    let mut cst = GreenNodeBuilder::new();

    // Expect the whole input to be taken: 6 bytes consumed, delimiter count 1.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut cst);
    assert_eq!(outcome, Some((6, 1)));

    // No wrapper node was started, so the EMPHASIS node itself is the root.
    let root = SyntaxNode::new_root(cst.finish());
    assert_eq!(root.kind(), SyntaxKind::EMPHASIS);

    // Losslessness: the CST text equals the input.
    assert_eq!(root.text().to_string(), input);
}
2062
/// Test parsing nested emphasis/strong through `parse_inline_range` over the
/// full byte range of `*foo **bar** baz*`.
#[test]
fn test_parse_nested_emphasis_strong() {
    let text = "*foo **bar** baz*";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Wrap in a PARAGRAPH node (inline content needs a parent). Without a
    // parent, `builder.finish()` only succeeds when the parser happens to emit
    // exactly one top-level node, so a parser regression would show up as a
    // panic inside the builder instead of as one of the assertions below.
    builder.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_range(text, 0, text.len(), &config, &mut builder);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Verify losslessness
    assert_eq!(node.text().to_string(), text);

    // Should have EMPHASIS and STRONG nodes
    let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
    let has_strong = node.descendants().any(|n| n.kind() == SyntaxKind::STRONG);

    assert!(has_emph, "Should have EMPHASIS node");
    assert!(has_strong, "Should have STRONG node");
}
2088
/// Test Pandoc's "three" algorithm: `***foo* bar**`
/// Expected: Strong[Emph[foo], bar]
/// Guards against the mis-parse `*` + Strong[foo* bar].
#[test]
fn test_triple_emphasis_star_then_double_star() {
    let text = "***foo* bar**";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    builder.start_node(SyntaxKind::DOCUMENT.into());
    parse_inline_range(text, 0, text.len(), &config, &mut builder);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Verify losslessness
    assert_eq!(node.text().to_string(), text);

    // Debug dump of the tree, used only to make failures readable.
    let structure = format!("{:#?}", node);

    // Should have both STRONG and EMPH. Checked on node kinds rather than on
    // the debug dump, where a substring match could hit an unrelated kind name.
    let has_strong = node.descendants().any(|n| n.kind() == SyntaxKind::STRONG);
    let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
    assert!(
        has_strong,
        "Should have STRONG node. Current structure:\n{}",
        structure
    );
    assert!(
        has_emph,
        "Should have EMPHASIS node. Current structure:\n{}",
        structure
    );

    // STRONG should be outer, EMPH should be inner. This is a real containment
    // check: some EMPHASIS node must have a STRONG ancestor. Merely seeing
    // STRONG before EMPHASIS in traversal order would also accept siblings.
    let emph_inside_strong = node
        .descendants()
        .filter(|n| n.kind() == SyntaxKind::EMPHASIS)
        .any(|emph| emph.ancestors().any(|a| a.kind() == SyntaxKind::STRONG));

    assert!(
        emph_inside_strong,
        "EMPH should be inside STRONG, not before it. Current structure:\n{}",
        structure
    );
}
2140
/// Test Pandoc's "three" algorithm: `***foo** bar*`
/// Expected: Emph[Strong[foo], bar]
#[test]
fn test_triple_emphasis_double_star_then_star() {
    let text = "***foo** bar*";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    builder.start_node(SyntaxKind::DOCUMENT.into());
    parse_inline_range(text, 0, text.len(), &config, &mut builder);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Verify losslessness
    assert_eq!(node.text().to_string(), text);

    // Debug dump of the tree, used only to make failures readable.
    let structure = format!("{:#?}", node);

    // Should have both EMPH and STRONG. Checked on node kinds rather than on
    // the debug dump, where a substring match could hit an unrelated kind name.
    let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
    let has_strong = node.descendants().any(|n| n.kind() == SyntaxKind::STRONG);
    assert!(
        has_emph,
        "Should have EMPHASIS node. Current structure:\n{}",
        structure
    );
    assert!(
        has_strong,
        "Should have STRONG node. Current structure:\n{}",
        structure
    );

    // EMPH should be outer, STRONG should be inner. This is a real containment
    // check: some STRONG node must have an EMPHASIS ancestor. Merely seeing
    // EMPHASIS before STRONG in traversal order would also accept siblings.
    let strong_inside_emph = node
        .descendants()
        .filter(|n| n.kind() == SyntaxKind::STRONG)
        .any(|strong| strong.ancestors().any(|a| a.kind() == SyntaxKind::EMPHASIS));

    assert!(
        strong_inside_emph,
        "STRONG should be inside EMPH. Current structure:\n{}",
        structure
    );
}
2189
/// Test that display math with attributes parses correctly
/// Regression test for equation_attributes_single_line golden test
#[test]
fn test_display_math_with_attributes() {
    let text = "$$ E = mc^2 $$ {#eq-einstein}";
    let mut config = ParserOptions::default();
    config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

    let mut builder = GreenNodeBuilder::new();
    builder.start_node(SyntaxKind::DOCUMENT.into()); // Need a root node

    // Parse the whole text
    parse_inline_text_recursive(&mut builder, text, &config);

    builder.finish_node(); // Finish DOCUMENT
    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Verify losslessness
    assert_eq!(node.text().to_string(), text);

    // Should have DISPLAY_MATH node
    let has_display_math = node
        .descendants()
        .any(|n| n.kind() == SyntaxKind::DISPLAY_MATH);
    assert!(has_display_math, "Should have DISPLAY_MATH node");

    // Should have ATTRIBUTE node
    let has_attributes = node
        .descendants()
        .any(|n| n.kind() == SyntaxKind::ATTRIBUTE);
    assert!(
        has_attributes,
        "Should have ATTRIBUTE node for {{#eq-einstein}}"
    );

    // Attributes should not be TEXT. TEXT is emitted as a token, so the check
    // has to look at tokens: node-only sibling navigation never yields a TEXT
    // element and would make this assertion unable to fail. Only the tokens
    // that sit next to the math node (children of its parent) are inspected,
    // so tokens nested inside the ATTRIBUTE node are not considered.
    let attrs_leaked_as_text = node
        .descendants()
        .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
        .filter_map(|math| math.parent())
        .flat_map(|parent| parent.children_with_tokens())
        .filter_map(|element| element.into_token())
        .any(|tok| tok.kind() == SyntaxKind::TEXT && tok.text().contains("{#eq-einstein}"));
    assert!(
        !attrs_leaked_as_text,
        "Attributes should not be parsed as TEXT"
    );
}
2245
/// With GFM flavor, dialect and extensions, an inline link whose destination
/// is a URL must yield exactly one LINK node carrying the expected LINK_TEXT
/// and LINK_DEST children.
#[test]
fn test_parse_inline_text_gfm_inline_link_destination_not_autolinked() {
    use crate::options::{Dialect, Extensions, Flavor};

    let options = ParserOptions {
        flavor: Flavor::Gfm,
        dialect: Dialect::for_flavor(Flavor::Gfm),
        extensions: Extensions::for_flavor(Flavor::Gfm),
        ..ParserOptions::default()
    };

    let mut cst = GreenNodeBuilder::new();
    cst.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(
        &mut cst,
        "Second Link [link_text](https://link.com)",
        &options,
    );
    cst.finish_node();
    let root = SyntaxNode::new_root(cst.finish());

    let link_nodes: Vec<_> = root
        .descendants()
        .filter(|n| n.kind() == SyntaxKind::LINK)
        .collect();
    assert_eq!(
        link_nodes.len(),
        1,
        "Expected exactly one LINK node for inline link, not nested bare URI autolink"
    );

    let link = &link_nodes[0];

    // Text of the last direct child of `link` with the requested kind, if any.
    let last_child_text = |kind: SyntaxKind| {
        link.children()
            .filter(|child| child.kind() == kind)
            .last()
            .map(|child| child.text().to_string())
    };
    let link_text = last_child_text(SyntaxKind::LINK_TEXT);
    let link_dest = last_child_text(SyntaxKind::LINK_DEST);

    assert_eq!(link_text.as_deref(), Some("link_text"));
    assert_eq!(link_dest.as_deref(), Some("https://link.com"));
}
2293
2294    #[test]
2295    fn test_autolink_bare_uri_utf8_boundary_safe() {
2296        let text = "§";
2297        let mut config = ParserOptions::default();
2298        config.extensions.autolink_bare_uris = true;
2299        let mut builder = GreenNodeBuilder::new();
2300
2301        builder.start_node(SyntaxKind::DOCUMENT.into());
2302        parse_inline_text_recursive(&mut builder, text, &config);
2303        builder.finish_node();
2304
2305        let green: GreenNode = builder.finish();
2306        let node = SyntaxNode::new_root(green);
2307        assert_eq!(node.text().to_string(), text);
2308    }
2309
2310    #[test]
2311    fn test_parse_emphasis_unicode_content_no_panic() {
2312        let text = "*§*";
2313        let config = ParserOptions::default();
2314        let mut builder = GreenNodeBuilder::new();
2315
2316        let result = try_parse_emphasis(text, 0, text.len(), &config, &mut builder);
2317        assert_eq!(result, Some((text.len(), 1)));
2318
2319        let green: GreenNode = builder.finish();
2320        let node = SyntaxNode::new_root(green);
2321        assert_eq!(node.kind(), SyntaxKind::EMPHASIS);
2322        assert_eq!(node.text().to_string(), text);
2323    }
2324}
2325
/// `**bold with *italic***` should parse as Strong["bold with ", Emph["italic"]]:
/// the trailing `***` is read as `*` (closes Emph) followed by `**` (closes Strong).
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "**bold with *italic***";
    let options = ParserOptions::default();

    // `parse_inline_range` emits inline content directly (no wrapper node),
    // so the finished tree's root is whatever the parser emitted.
    let mut builder = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut builder);
    let root = SyntaxNode::new_root(builder.finish());

    // Lossless round-trip.
    assert_eq!(root.text().to_string(), input, "Should be lossless");

    let root_kind = root.kind();
    assert_eq!(
        root_kind,
        SyntaxKind::STRONG,
        "Root should be STRONG, got: {:?}",
        root_kind
    );

    // The EMPHASIS must be a direct child of the STRONG root.
    let nested_emphasis = root
        .children()
        .find(|child| child.kind() == SyntaxKind::EMPHASIS);
    assert!(
        nested_emphasis.is_some(),
        "STRONG should contain EMPHASIS node"
    );
}
2361
/// `*foo *` should parse as emphasis (Pandoc behavior): for asterisks, Pandoc
/// doesn't require the closer to be right-flanking.
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "*foo *";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Attempt emphasis starting at byte 0; success means all 6 bytes are
    // consumed with a delimiter count of 1.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((input.len(), 1)),
        "Should parse as emphasis, result: {:?}",
        outcome
    );

    // The emitted EMPHASIS node is itself the root of the CST.
    let emphasis = SyntaxNode::new_root(builder.finish());
    assert_eq!(emphasis.kind(), SyntaxKind::EMPHASIS);

    // Lossless round-trip.
    assert_eq!(emphasis.text().to_string(), input);
}
2396
/// `***foo** bar **baz***` should parse as Emph[Strong[foo], " bar ", Strong[baz]]
/// (the structure Pandoc produces).
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "***foo** bar **baz***";
    let options = ParserOptions::default();

    let mut builder = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut builder);
    let root = SyntaxNode::new_root(builder.finish());

    // Exactly one EMPHASIS node in the whole tree (root included).
    let emphasis_nodes: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    let emphasis_count = emphasis_nodes.len();
    assert_eq!(
        emphasis_count, 1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphasis_count
    );

    // That EMPHASIS node must have two STRONG nodes as direct children.
    let emphasis = &emphasis_nodes[0];
    let strong_child_count = emphasis
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_child_count, 2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_child_count
    );

    // Lossless round-trip.
    assert_eq!(root.text().to_string(), input);
}
2443
/// `***foo* bar *baz***` should parse as Strong[Emph[foo], " bar ", Emph[baz]]
/// (the structure Pandoc produces).
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "***foo* bar *baz***";
    let options = ParserOptions::default();

    let mut builder = GreenNodeBuilder::new();
    parse_inline_range(input, 0, input.len(), &options, &mut builder);
    let root = SyntaxNode::new_root(builder.finish());

    // Exactly one STRONG node in the whole tree (root included).
    let strong_nodes: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    let strong_count = strong_nodes.len();
    assert_eq!(
        strong_count, 1,
        "Should have exactly one STRONG node, found: {}",
        strong_count
    );

    // That STRONG node must have two EMPHASIS nodes as direct children.
    let strong = &strong_nodes[0];
    let emphasis_child_count = strong
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_child_count, 2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_child_count
    );

    // Lossless round-trip.
    assert_eq!(root.text().to_string(), input);
}
2490
2491// Multiline emphasis tests
/// Per the Pandoc spec, emphasis may span a newline (soft break): `*…\n…*`
/// must parse as one `EMPHASIS` node with the newline preserved.
#[test]
fn test_parse_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "*text on\nline two*";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte must be consumed, with a delimiter count of 1.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((input.len(), 1)),
        "Emphasis should parse multiline content"
    );

    // The emitted node is the root of the CST and must be EMPHASIS.
    let emphasis = SyntaxNode::new_root(builder.finish());
    assert_eq!(emphasis.kind(), SyntaxKind::EMPHASIS);

    // Lossless round-trip, including the embedded newline.
    let rendered = emphasis.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
2526
/// Per the Pandoc spec, strong emphasis may span a newline: `**…\n…**` must
/// parse as one `STRONG` node with the newline preserved.
#[test]
fn test_parse_strong_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "**strong on\nline two**";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte must be consumed, with a delimiter count of 2.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((input.len(), 2)),
        "Strong emphasis should parse multiline content"
    );

    // The emitted node is the root of the CST and must be STRONG.
    let strong = SyntaxNode::new_root(builder.finish());
    assert_eq!(strong.kind(), SyntaxKind::STRONG);

    // Lossless round-trip, including the embedded newline.
    let rendered = strong.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in strong content"
    );
}
2561
/// Triple-delimiter emphasis may span a newline: `***…\n…***` must be fully
/// consumed (delimiter count 3), produce both a `STRONG` and an `EMPHASIS`
/// node (triple = strong + emph), and round-trip losslessly.
#[test]
fn test_parse_triple_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let input = "***both on\nline two***";
    let options = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    // Every byte must be consumed, with a delimiter count of 3.
    let outcome = try_parse_emphasis(input, 0, input.len(), &options, &mut builder);
    assert_eq!(
        outcome,
        Some((input.len(), 3)),
        "Triple emphasis should parse multiline content"
    );

    let root = SyntaxNode::new_root(builder.finish());

    // Triple delimiters mean strong + emph, so both node kinds must appear
    // somewhere in the tree (`descendants` includes the root itself). The
    // nesting order is deliberately not asserted here.
    let contains_kind = |wanted: SyntaxKind| root.descendants().any(|n| n.kind() == wanted);
    assert!(contains_kind(SyntaxKind::STRONG), "Should have STRONG node");
    assert!(
        contains_kind(SyntaxKind::EMPHASIS),
        "Should have EMPHASIS node"
    );

    // Lossless round-trip, including the embedded newline.
    let rendered = root.text().to_string();
    assert_eq!(rendered, input);
    assert!(
        rendered.contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}
panache_parser/parser/inlines/core.rs

panache_parser/parser/inlines/
core.rs