Skip to main content

panache_parser/parser/inlines/
core.rs

1//! Inline emission walk.
2//!
3//! Consumes the IR plans built by [`super::inline_ir::build_full_plans`]
4//! (emphasis pairings, bracket resolutions, standalone Pandoc constructs)
5//! and emits the inline CST tokens / nodes in source order. Resolution
6//! decisions for emphasis, brackets, and standalone Pandoc constructs
7//! are entirely IR-driven for both dialects; the dispatcher's
8//! `try_parse_*` recognizers are still called to *parse* a matched byte
9//! range into a CST subtree, but "what is this byte range?" is answered
10//! exclusively by the IR.
11
12use crate::options::{Dialect, ParserOptions};
13use crate::syntax::SyntaxKind;
14use rowan::GreenNodeBuilder;
15
16use super::inline_ir::{
17    BracketPlan, ConstructDispo, ConstructPlan, DelimChar, EmphasisKind, EmphasisPlan,
18};
19
20// Import inline element parsers from sibling modules
21use super::bookdown::{
22    try_parse_bookdown_definition, try_parse_bookdown_reference, try_parse_bookdown_text_reference,
23};
24use super::bracketed_spans::{emit_bracketed_span, try_parse_bracketed_span};
25use super::citations::{
26    emit_bare_citation, emit_bracketed_citation, try_parse_bare_citation,
27    try_parse_bracketed_citation,
28};
29use super::code_spans::{emit_code_span, try_parse_code_span};
30use super::emoji::{emit_emoji, try_parse_emoji};
31use super::escapes::{EscapeType, emit_escape, try_parse_escape};
32use super::inline_executable::{emit_inline_executable, try_parse_inline_executable};
33use super::inline_footnotes::{
34    emit_footnote_reference, emit_inline_footnote, try_parse_footnote_reference,
35    try_parse_inline_footnote,
36};
37use super::inline_html::{emit_inline_html, try_parse_inline_html};
38use super::latex::{parse_latex_command, try_parse_latex_command};
39use super::links::{
40    LinkScanContext, emit_autolink, emit_bare_uri_link, emit_inline_image, emit_inline_link,
41    emit_reference_image, emit_reference_link, try_parse_autolink, try_parse_bare_uri,
42    try_parse_inline_image, try_parse_inline_link, try_parse_reference_image,
43    try_parse_reference_link,
44};
45use super::mark::{emit_mark, try_parse_mark};
46use super::math::{
47    emit_display_math, emit_display_math_environment, emit_double_backslash_display_math,
48    emit_double_backslash_inline_math, emit_gfm_inline_math, emit_inline_math,
49    emit_single_backslash_display_math, emit_single_backslash_inline_math, try_parse_display_math,
50    try_parse_double_backslash_display_math, try_parse_double_backslash_inline_math,
51    try_parse_gfm_inline_math, try_parse_inline_math, try_parse_math_environment,
52    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
53};
54use super::native_spans::{emit_native_span, try_parse_native_span};
55use super::raw_inline::is_raw_inline;
56use super::shortcodes::{emit_shortcode, try_parse_shortcode};
57use super::strikeout::{emit_strikeout, try_parse_strikeout};
58use super::subscript::{emit_subscript, try_parse_subscript};
59use super::superscript::{emit_superscript, try_parse_superscript};
60
61/// Parse inline text into the CST builder.
62///
63/// Top-level entry point for inline parsing. Builds the IR plans
64/// (emphasis pairings, bracket resolutions, standalone Pandoc constructs)
65/// once via [`super::inline_ir::build_full_plans`], then walks the byte
66/// range left-to-right consulting those plans plus the dispatcher's
67/// ordered-try chain for non-IR-resolved constructs (autolinks, code
68/// spans, escapes, math, etc.). Dialect-specific behavior is selected
69/// inside `build_full_plans`.
70///
71/// # Arguments
72/// * `text` - The inline text to parse
73/// * `config` - Configuration for extensions and formatting
74/// * `builder` - The CST builder to emit nodes to
75pub fn parse_inline_text_recursive(
76    builder: &mut GreenNodeBuilder,
77    text: &str,
78    config: &ParserOptions,
79) {
80    log::trace!(
81        "Recursive inline parsing: {:?} ({} bytes)",
82        &text[..text.len().min(40)],
83        text.len()
84    );
85
86    let mask = structural_byte_mask(config);
87    if try_emit_plain_text_fast_path_with_mask(builder, text, &mask) {
88        log::trace!("Recursive inline parsing complete (plain-text fast path)");
89        return;
90    }
91
92    let plans = super::inline_ir::build_full_plans(text, 0, text.len(), config);
93    parse_inline_range_impl(
94        text,
95        0,
96        text.len(),
97        config,
98        builder,
99        false,
100        &plans.emphasis,
101        &plans.brackets,
102        &plans.constructs,
103        false,
104        &mask,
105    );
106
107    log::trace!("Recursive inline parsing complete");
108}
109
110/// Parse inline elements from text content nested inside a link/image/span.
111///
112/// Used for recursive inline parsing of link text, image alt, span content, etc.
113/// Suppresses constructs that would create nested links (CommonMark §6.3 forbids
114/// links inside links), notably extended bare-URI autolinks under GFM.
115///
116/// `suppress_inner_links` should be `true` when the recursion is for a
117/// LINK or REFERENCE-LINK's text, where inner link / reference-link
118/// brackets must emit as literal text (pandoc-native:
119/// `[link [inner](u2)](u1)` → outer `Link` with `Str "[inner](u2)"`).
120/// Image alt text and all non-link contexts pass `false`:
121/// pandoc-native verifies `![alt with [inner](u)](u2)` keeps the inner
122/// `Link`, and bracketed spans / native spans / inline footnotes /
123/// emphasis all allow nested links.
124pub fn parse_inline_text(
125    builder: &mut GreenNodeBuilder,
126    text: &str,
127    config: &ParserOptions,
128    suppress_inner_links: bool,
129) {
130    log::trace!(
131        "Parsing inline text (nested in link): {:?} ({} bytes)",
132        &text[..text.len().min(40)],
133        text.len()
134    );
135
136    let mask = structural_byte_mask(config);
137    if try_emit_plain_text_fast_path_with_mask(builder, text, &mask) {
138        return;
139    }
140
141    let plans = super::inline_ir::build_full_plans(text, 0, text.len(), config);
142    parse_inline_range_impl(
143        text,
144        0,
145        text.len(),
146        config,
147        builder,
148        true,
149        &plans.emphasis,
150        &plans.brackets,
151        &plans.constructs,
152        suppress_inner_links,
153        &mask,
154    );
155}
156
157/// Plain-text fast path for inline ranges with no structural bytes.
158///
159/// Returns `true` if the range was emitted as a single `TEXT` token and
160/// the caller should skip the IR + dispatcher pipeline. Returns `false`
161/// if any structural byte appears (or the range is empty), letting the
162/// caller proceed normally. Empty input returns `false` so the caller's
163/// existing "no events → no output" path is preserved exactly.
164///
165/// The structural byte set is computed from `config.dialect` and
166/// `config.extensions` so prose containing dialect-irrelevant punctuation
167/// (e.g. `-` outside a citation flavor) doesn't unnecessarily disable the
168/// fast path. `\n` and `\r` are always structural — multi-line inline
169/// content must still split into TEXT + NEWLINE tokens like the slow path.
170fn try_emit_plain_text_fast_path_with_mask(
171    builder: &mut GreenNodeBuilder,
172    text: &str,
173    mask: &[bool; 256],
174) -> bool {
175    if text.is_empty() {
176        return false;
177    }
178    for &b in text.as_bytes() {
179        if mask[b as usize] {
180            return false;
181        }
182    }
183    builder.token(SyntaxKind::TEXT.into(), text);
184    true
185}
186
187/// Build a 256-entry byte mask: `mask[b]` is `true` iff byte `b` could
188/// trigger any IR-recognised construct or dispatcher branch under the
189/// current dialect/extensions. Used by the plain-text fast path to scan
190/// inline ranges in a single pass.
191fn structural_byte_mask(config: &ParserOptions) -> [bool; 256] {
192    let mut mask = [false; 256];
193    let exts = &config.extensions;
194    let pandoc = config.dialect == Dialect::Pandoc;
195
196    // Always structural: line breaks (CST splits TEXT/NEWLINE), backslash
197    // (escape / hard break / backslash-math / latex / bookdown ref),
198    // backtick (code span / inline executable), `*`/`_` (emphasis is a
199    // core CommonMark construct, not extension-gated), and `[`/`]` if
200    // any bracket-shaped construct is reachable.
201    mask[b'\n' as usize] = true;
202    mask[b'\r' as usize] = true;
203    mask[b'\\' as usize] = true;
204    mask[b'`' as usize] = true;
205    mask[b'*' as usize] = true;
206    mask[b'_' as usize] = true;
207
208    // Brackets: the IR/dispatcher only acts on `[`/`]` if some
209    // bracket-shaped feature is reachable. `!` is the leading byte of
210    // `![alt]` image brackets — the IR's `BracketPlan` keys image
211    // openers at the `!` position, so the dispatcher must stop here
212    // to consult the plan.
213    if exts.inline_links
214        || exts.reference_links
215        || exts.inline_images
216        || exts.bracketed_spans
217        || exts.footnotes
218        || exts.citations
219    {
220        mask[b'[' as usize] = true;
221        mask[b']' as usize] = true;
222    }
223    if exts.inline_images || exts.reference_links {
224        mask[b'!' as usize] = true;
225    }
226
227    // `<` covers autolinks, raw HTML, and Pandoc native spans.
228    if exts.autolinks || exts.raw_html || exts.native_spans {
229        mask[b'<' as usize] = true;
230    }
231
232    // `^` covers Pandoc inline footnotes (`^[...]`), CM inline footnotes
233    // (when explicitly enabled), and superscript (`^text^`).
234    if exts.inline_footnotes || exts.superscript {
235        mask[b'^' as usize] = true;
236    }
237
238    // `@` and `-` cover Pandoc citation forms (`@cite`, `-@cite`,
239    // `[@cite]`). Under Pandoc dialect, the IR's `ConstructPlan` keys
240    // bare citations at the `@` or `-` position, so the dispatcher
241    // must stop at either to consult the plan. Including `-` is
242    // pessimistic — most prose hyphens won't form `-@` — but missing
243    // it would skip past valid suppress-author citations.
244    if exts.citations || exts.quarto_crossrefs {
245        mask[b'@' as usize] = true;
246        if pandoc {
247            mask[b'-' as usize] = true;
248        }
249    }
250
251    // `$` covers dollar-math and GFM math.
252    if exts.tex_math_dollars || exts.tex_math_gfm {
253        mask[b'$' as usize] = true;
254    }
255
256    // `~` covers subscript and strikeout (both `~text~` and `~~text~~`).
257    if exts.subscript || exts.strikeout {
258        mask[b'~' as usize] = true;
259    }
260
261    if exts.mark {
262        mask[b'=' as usize] = true;
263    }
264    if exts.emoji {
265        mask[b':' as usize] = true;
266    }
267    if exts.bookdown_references {
268        mask[b'(' as usize] = true;
269    }
270    // `{{< ... >}}` shortcodes: the dispatcher tries them on any
271    // `{` regardless of the `quarto_shortcodes` extension flag, so
272    // `{` must always be flagged here.
273    mask[b'{' as usize] = true;
274
275    // Bare-URI autolinks (`http://...` without `<>`) have no
276    // leading-byte gate in the dispatcher — `try_parse_bare_uri`
277    // probes for a URI scheme starting at every byte. Flag all
278    // ASCII alphabetic bytes so the bulk-skip stops on every
279    // potential scheme starter. This effectively disables the
280    // bulk-skip benefit for prose under GFM-style flavors but
281    // preserves correctness; ASCII digits / punctuation / non-ASCII
282    // bytes still skip cleanly.
283    if exts.autolink_bare_uris {
284        for b in b'a'..=b'z' {
285            mask[b as usize] = true;
286        }
287        for b in b'A'..=b'Z' {
288            mask[b as usize] = true;
289        }
290    }
291
292    mask
293}
294
/// Report whether byte offset `pos` in `text` is a valid left boundary
/// for an emoji shortcode.
///
/// Returns `false` when the byte immediately before `pos` is an ASCII
/// letter, an ASCII digit, or `_`; returns `true` otherwise. A `pos` of
/// `0` has no preceding byte and is a boundary. A `pos` past the end of
/// `text` likewise has no preceding byte and is reported as a boundary
/// instead of panicking on an out-of-range index.
///
/// Only the single preceding byte is inspected, so a preceding non-ASCII
/// character (whose bytes are all >= 0x80) never blocks the boundary.
fn is_emoji_boundary(text: &str, pos: usize) -> bool {
    let prev_byte = pos
        .checked_sub(1)
        .and_then(|idx| text.as_bytes().get(idx));
    match prev_byte {
        Some(&prev) => !(prev.is_ascii_alphanumeric() || prev == b'_'),
        None => true,
    }
}
304
/// Step forward from `pos` by one character, never moving past `end`.
///
/// Returns `pos` unchanged when `pos >= end` or `pos >= text.len()`.
/// Otherwise returns the first byte offset greater than `pos` that is a
/// UTF-8 char boundary of `text`, clamped to `end`.
///
/// The next boundary is found with `str::is_char_boundary` rather than by
/// slicing `text[pos..]`, so a `pos` that sits inside a multi-byte
/// sequence advances to the following boundary instead of panicking.
#[inline]
fn advance_char_boundary(text: &str, pos: usize, end: usize) -> usize {
    if pos >= end || pos >= text.len() {
        return pos;
    }
    // `pos < end`, so `pos + 1 <= end`. The scan stops at the next
    // boundary (`text.len()` is always one) or at `end`, whichever
    // comes first.
    let mut next = pos + 1;
    while next < end && !text.is_char_boundary(next) {
        next += 1;
    }
    next
}
316
317#[allow(clippy::too_many_arguments)]
318fn parse_inline_range_impl(
319    text: &str,
320    start: usize,
321    end: usize,
322    config: &ParserOptions,
323    builder: &mut GreenNodeBuilder,
324    nested_in_link: bool,
325    plan: &EmphasisPlan,
326    bracket_plan: &BracketPlan,
327    construct_plan: &ConstructPlan,
328    suppress_inner_links: bool,
329    mask: &[bool; 256],
330) {
331    log::trace!(
332        "parse_inline_range: start={}, end={}, text={:?}",
333        start,
334        end,
335        &text[start..end]
336    );
337    let mut pos = start;
338    let mut text_start = start;
339    let bytes = text.as_bytes();
340
341    while pos < end {
342        // Bulk-skip plain bytes between structural bytes. Plans
343        // (`construct_plan`, `bracket_plan`, emphasis `plan`) only
344        // resolve at structural byte positions, so skipping here
345        // never elides a real match. `text_start` is preserved
346        // across the skip; the next emitted construct flushes the
347        // accumulated TEXT span.
348        if !mask[bytes[pos] as usize] {
349            let mut next = pos + 1;
350            while next < end && !mask[bytes[next] as usize] {
351                next += 1;
352            }
353            pos = next;
354            if pos >= end {
355                break;
356            }
357        }
358        // IR-driven dispatch: if the IR identified a Pandoc standalone
359        // construct starting here, emit it directly. Bypasses the
360        // dispatcher's ordered-try chain for inline footnotes, native
361        // spans, footnote references, citations, and bracketed spans
362        // under `Dialect::Pandoc`. The IR scan gates these on
363        // `!is_commonmark` and the relevant extension flag, so this
364        // branch is empty under CommonMark dialect (where the legacy
365        // dispatcher branches still run when the extension is enabled).
366        if let Some(dispo) = construct_plan.lookup(pos) {
367            match *dispo {
368                ConstructDispo::InlineFootnote { end: dispo_end } => {
369                    if dispo_end <= end
370                        && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
371                        && pos + len == dispo_end
372                    {
373                        if pos > text_start {
374                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
375                        }
376                        log::trace!("IR: matched inline footnote at pos {}", pos);
377                        emit_inline_footnote(builder, content, config);
378                        pos += len;
379                        text_start = pos;
380                        continue;
381                    }
382                }
383                ConstructDispo::NativeSpan { end: dispo_end } => {
384                    if dispo_end <= end
385                        && let Some((len, content, attributes)) =
386                            try_parse_native_span(&text[pos..])
387                        && pos + len == dispo_end
388                    {
389                        if pos > text_start {
390                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
391                        }
392                        log::trace!("IR: matched native span at pos {}", pos);
393                        emit_native_span(builder, content, &attributes, config);
394                        pos += len;
395                        text_start = pos;
396                        continue;
397                    }
398                }
399                ConstructDispo::FootnoteReference { end: dispo_end } => {
400                    if dispo_end <= end
401                        && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
402                        && pos + len == dispo_end
403                    {
404                        if pos > text_start {
405                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
406                        }
407                        log::trace!("IR: matched footnote reference at pos {}", pos);
408                        emit_footnote_reference(builder, &id);
409                        pos += len;
410                        text_start = pos;
411                        continue;
412                    }
413                }
414                ConstructDispo::BracketedCitation { end: dispo_end } => {
415                    if dispo_end <= end
416                        && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
417                        && pos + len == dispo_end
418                    {
419                        if pos > text_start {
420                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
421                        }
422                        log::trace!("IR: matched bracketed citation at pos {}", pos);
423                        emit_bracketed_citation(builder, content);
424                        pos += len;
425                        text_start = pos;
426                        continue;
427                    }
428                }
429                ConstructDispo::BareCitation { end: dispo_end } => {
430                    if dispo_end <= end
431                        && let Some((len, key, has_suppress)) =
432                            try_parse_bare_citation(&text[pos..])
433                        && pos + len == dispo_end
434                    {
435                        let is_crossref = config.extensions.quarto_crossrefs
436                            && super::citations::is_quarto_crossref_key(key);
437                        if is_crossref || config.extensions.citations {
438                            if pos > text_start {
439                                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
440                            }
441                            if is_crossref {
442                                log::trace!("IR: matched Quarto crossref at pos {}: {}", pos, key);
443                                super::citations::emit_crossref(builder, key, has_suppress);
444                            } else {
445                                log::trace!("IR: matched bare citation at pos {}: {}", pos, key);
446                                emit_bare_citation(builder, key, has_suppress);
447                            }
448                            pos += len;
449                            text_start = pos;
450                            continue;
451                        }
452                    }
453                }
454                ConstructDispo::BracketedSpan { end: dispo_end } => {
455                    if dispo_end <= end
456                        && let Some((len, content, attrs)) = try_parse_bracketed_span(&text[pos..])
457                        && pos + len == dispo_end
458                    {
459                        if pos > text_start {
460                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
461                        }
462                        log::trace!("IR: matched bracketed span at pos {}", pos);
463                        emit_bracketed_span(builder, &content, &attrs, config);
464                        pos += len;
465                        text_start = pos;
466                        continue;
467                    }
468                }
469            }
470        }
471
472        // IR-driven bracket dispatch: if the IR's `process_brackets`
473        // resolved a bracket pair starting at this position, emit it
474        // directly via the appropriate helper. The
475        // dispatcher's `try_parse_*` recognizers compute the actual
476        // byte length and extract content / attributes; the IR's
477        // `suffix_end` is used to constrain the dispatcher's match
478        // shape so the two pipelines agree on which link variant
479        // resolved (e.g. `[foo][bar]` with `bar` undefined and `foo`
480        // defined: IR resolves `[foo]` as shortcut, but the
481        // dispatcher's `try_parse_reference_link` would otherwise
482        // greedily return the full-ref shape). Suppression of inner
483        // LINK / REFERENCE LINK during LINK-text recursion is applied
484        // here (pandoc-native: outer-wins for nested links).
485        //
486        // Pandoc-extended `{.attrs}` after a link can extend the
487        // dispatcher's match length past the IR's `suffix_end`. The
488        // dispatcher's len is therefore constrained to
489        // `[suffix_end, end]` rather than required to equal
490        // `suffix_end` exactly.
491        if let Some(super::inline_ir::BracketDispo::Open {
492            is_image,
493            suffix_end,
494            ..
495        }) = bracket_plan.lookup(pos)
496        {
497            let is_image = *is_image;
498            let dispo_suffix_end = *suffix_end;
499            let suppress = suppress_inner_links && !is_image;
500            if !suppress {
501                let ctx = LinkScanContext::from_options(config);
502                let allow_shortcut = config.extensions.shortcut_reference_links;
503                let is_commonmark = config.dialect == Dialect::CommonMark;
504                if is_image {
505                    if config.extensions.inline_images
506                        && let Some((len, alt_text, dest, attributes)) =
507                            try_parse_inline_image(&text[pos..], ctx)
508                        && pos + len >= dispo_suffix_end
509                        && pos + len <= end
510                    {
511                        if pos > text_start {
512                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
513                        }
514                        log::trace!("IR: matched inline image at pos {}", pos);
515                        emit_inline_image(
516                            builder,
517                            &text[pos..pos + len],
518                            alt_text,
519                            dest,
520                            attributes,
521                            config,
522                        );
523                        pos += len;
524                        text_start = pos;
525                        continue;
526                    }
527                    if config.extensions.reference_links
528                        && let Some((len, alt_text, reference, is_shortcut)) =
529                            try_parse_reference_image(&text[pos..], allow_shortcut)
530                        && pos + len == dispo_suffix_end
531                        && pos + len <= end
532                    {
533                        if pos > text_start {
534                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
535                        }
536                        log::trace!("IR: matched reference image at pos {}", pos);
537                        emit_reference_image(builder, alt_text, &reference, is_shortcut, config);
538                        pos += len;
539                        text_start = pos;
540                        continue;
541                    }
542                } else {
543                    if config.extensions.inline_links
544                        && let Some((len, link_text, dest, attributes)) =
545                            try_parse_inline_link(&text[pos..], is_commonmark, ctx)
546                        && pos + len >= dispo_suffix_end
547                        && pos + len <= end
548                    {
549                        if pos > text_start {
550                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
551                        }
552                        log::trace!("IR: matched inline link at pos {}", pos);
553                        emit_inline_link(
554                            builder,
555                            &text[pos..pos + len],
556                            link_text,
557                            dest,
558                            attributes,
559                            config,
560                        );
561                        pos += len;
562                        text_start = pos;
563                        continue;
564                    }
565                    if config.extensions.reference_links
566                        && let Some((len, link_text, reference, is_shortcut)) =
567                            try_parse_reference_link(
568                                &text[pos..],
569                                allow_shortcut,
570                                config.extensions.inline_links,
571                                ctx,
572                            )
573                        && pos + len == dispo_suffix_end
574                        && pos + len <= end
575                    {
576                        if pos > text_start {
577                            builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
578                        }
579                        log::trace!("IR: matched reference link at pos {}", pos);
580                        emit_reference_link(builder, link_text, &reference, is_shortcut, config);
581                        pos += len;
582                        text_start = pos;
583                        continue;
584                    }
585                }
586            }
587        }
588
589        let byte = text.as_bytes()[pos];
590
591        // Backslash math (highest priority if enabled)
592        if byte == b'\\' {
593            // Try double backslash display math first: \\[...\\]
594            if config.extensions.tex_math_double_backslash {
595                if let Some((len, content)) = try_parse_double_backslash_display_math(&text[pos..])
596                {
597                    if pos > text_start {
598                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
599                    }
600                    log::trace!("Matched double backslash display math at pos {}", pos);
601                    emit_double_backslash_display_math(builder, content);
602                    pos += len;
603                    text_start = pos;
604                    continue;
605                }
606
607                // Try double backslash inline math: \\(...\\)
608                if let Some((len, content)) = try_parse_double_backslash_inline_math(&text[pos..]) {
609                    if pos > text_start {
610                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
611                    }
612                    log::trace!("Matched double backslash inline math at pos {}", pos);
613                    emit_double_backslash_inline_math(builder, content);
614                    pos += len;
615                    text_start = pos;
616                    continue;
617                }
618            }
619
620            // Try single backslash display math: \[...\]
621            if config.extensions.tex_math_single_backslash {
622                if let Some((len, content)) = try_parse_single_backslash_display_math(&text[pos..])
623                {
624                    if pos > text_start {
625                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
626                    }
627                    log::trace!("Matched single backslash display math at pos {}", pos);
628                    emit_single_backslash_display_math(builder, content);
629                    pos += len;
630                    text_start = pos;
631                    continue;
632                }
633
634                // Try single backslash inline math: \(...\)
635                if let Some((len, content)) = try_parse_single_backslash_inline_math(&text[pos..]) {
636                    if pos > text_start {
637                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
638                    }
639                    log::trace!("Matched single backslash inline math at pos {}", pos);
640                    emit_single_backslash_inline_math(builder, content);
641                    pos += len;
642                    text_start = pos;
643                    continue;
644                }
645            }
646
647            // Try math environments \begin{equation}...\end{equation}
648            if config.extensions.raw_tex
649                && let Some((len, begin_marker, content, end_marker)) =
650                    try_parse_math_environment(&text[pos..])
651            {
652                if pos > text_start {
653                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
654                }
655                log::trace!("Matched math environment at pos {}", pos);
656                emit_display_math_environment(builder, begin_marker, content, end_marker);
657                pos += len;
658                text_start = pos;
659                continue;
660            }
661
662            // Try bookdown reference: \@ref(label)
663            if config.extensions.bookdown_references
664                && let Some((len, label)) = try_parse_bookdown_reference(&text[pos..])
665            {
666                if pos > text_start {
667                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
668                }
669                log::trace!("Matched bookdown reference at pos {}: {}", pos, label);
670                super::citations::emit_bookdown_crossref(builder, label);
671                pos += len;
672                text_start = pos;
673                continue;
674            }
675
676            // Try escapes (after bookdown refs and backslash math)
677            if let Some((len, ch, escape_type)) = try_parse_escape(&text[pos..]) {
678                let escape_enabled = match escape_type {
679                    EscapeType::HardLineBreak => config.extensions.escaped_line_breaks,
680                    EscapeType::NonbreakingSpace => config.extensions.all_symbols_escapable,
681                    EscapeType::Literal => {
682                        // BASE_ESCAPABLE matches Pandoc's markdown_strict /
683                        // original Markdown set, plus `|` and `~` which the
684                        // formatter emits as escapes for pipe-table separators
685                        // and strikethrough delimiters. Recognising those here
686                        // keeps round-trips idempotent in flavors that don't
687                        // enable all_symbols_escapable.
688                        //
689                        // Under CommonMark dialect, the spec (§2.4) explicitly
690                        // allows ANY ASCII punctuation to be backslash-escaped,
691                        // independent of the all_symbols_escapable extension
692                        // (which also widens to whitespace, a Pandoc-only
693                        // construct).
694                        const BASE_ESCAPABLE: &str = "\\`*_{}[]()>#+-.!|~";
695                        BASE_ESCAPABLE.contains(ch)
696                            || config.extensions.all_symbols_escapable
697                            || (config.dialect == crate::Dialect::CommonMark
698                                && ch.is_ascii_punctuation())
699                    }
700                };
701                if !escape_enabled {
702                    // Don't treat as hard line break - skip the escape and continue
703                    // The backslash will be included in the next TEXT token
704                    pos = advance_char_boundary(text, pos, end);
705                    continue;
706                }
707
708                // Emit accumulated text
709                if pos > text_start {
710                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
711                }
712
713                log::trace!("Matched escape at pos {}: \\{}", pos, ch);
714                emit_escape(builder, ch, escape_type);
715                pos += len;
716                text_start = pos;
717                continue;
718            }
719
720            // Try LaTeX commands (after escapes, before shortcodes)
721            if config.extensions.raw_tex
722                && let Some(len) = try_parse_latex_command(&text[pos..])
723            {
724                if pos > text_start {
725                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
726                }
727                log::trace!("Matched LaTeX command at pos {}", pos);
728                parse_latex_command(builder, &text[pos..], len);
729                pos += len;
730                text_start = pos;
731                continue;
732            }
733        }
734
735        // Try Quarto shortcodes: {{< shortcode >}}
736        if byte == b'{'
737            && pos + 1 < text.len()
738            && text.as_bytes()[pos + 1] == b'{'
739            && let Some((len, name, attrs)) = try_parse_shortcode(&text[pos..])
740        {
741            if pos > text_start {
742                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
743            }
744            log::trace!("Matched shortcode at pos {}: {}", pos, &name);
745            emit_shortcode(builder, &name, attrs);
746            pos += len;
747            text_start = pos;
748            continue;
749        }
750
751        // Try inline executable code spans (`... `r expr`` and `... `{r} expr``)
752        if byte == b'`'
753            && let Some(m) = try_parse_inline_executable(
754                &text[pos..],
755                config.extensions.rmarkdown_inline_code,
756                config.extensions.quarto_inline_code,
757            )
758        {
759            if pos > text_start {
760                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
761            }
762            log::trace!("Matched inline executable code at pos {}", pos);
763            emit_inline_executable(builder, &m);
764            pos += m.total_len;
765            text_start = pos;
766            continue;
767        }
768
769        // Try code spans
770        if byte == b'`' {
771            if let Some((len, content, backtick_count, attributes)) =
772                try_parse_code_span(&text[pos..])
773            {
774                // Emit accumulated text
775                if pos > text_start {
776                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
777                }
778
779                log::trace!(
780                    "Matched code span at pos {}: {} backticks",
781                    pos,
782                    backtick_count
783                );
784
785                // Check for raw inline
786                if let Some(ref attrs) = attributes
787                    && config.extensions.raw_attribute
788                    && let Some(format) = is_raw_inline(attrs)
789                {
790                    use super::raw_inline::emit_raw_inline;
791                    log::trace!("Matched raw inline span at pos {}: format={}", pos, format);
792                    emit_raw_inline(builder, content, backtick_count, format);
793                } else if !config.extensions.inline_code_attributes && attributes.is_some() {
794                    let code_span_len = backtick_count * 2 + content.len();
795                    emit_code_span(builder, content, backtick_count, None);
796                    pos += code_span_len;
797                    text_start = pos;
798                    continue;
799                } else {
800                    emit_code_span(builder, content, backtick_count, attributes);
801                }
802
803                pos += len;
804                text_start = pos;
805                continue;
806            }
807
808            // Unmatched backtick run.
809            //
810            // CommonMark (and GFM) treat the whole run as literal text — the
811            // run cannot be re-entered as a shorter opener. Pandoc-markdown
812            // instead lets a longer run shadow a shorter one (e.g.
813            // `` ```foo`` `` parses as `` ` `` + ``<code>foo</code>``), so
814            // for the Pandoc dialect we fall through and advance one byte at
815            // a time, allowing the inner run to be tried on a later iteration.
816            if config.dialect == Dialect::CommonMark {
817                let run_len = text[pos..].bytes().take_while(|&b| b == b'`').count();
818                pos += run_len;
819                continue;
820            }
821        }
822
823        // Try textual emoji aliases: :smile:
824        if byte == b':'
825            && config.extensions.emoji
826            && is_emoji_boundary(text, pos)
827            && let Some((len, _alias)) = try_parse_emoji(&text[pos..])
828        {
829            if pos > text_start {
830                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
831            }
832            log::trace!("Matched emoji at pos {}", pos);
833            emit_emoji(builder, &text[pos..pos + len]);
834            pos += len;
835            text_start = pos;
836            continue;
837        }
838
839        // Try inline footnotes: ^[note]. Under Pandoc dialect this is
840        // consumed via the IR's `ConstructPlan` at the top of the loop;
841        // this dispatcher branch only fires for CommonMark dialect with
842        // the extension explicitly enabled.
843        if byte == b'^'
844            && pos + 1 < text.len()
845            && text.as_bytes()[pos + 1] == b'['
846            && config.dialect == Dialect::CommonMark
847            && config.extensions.inline_footnotes
848            && let Some((len, content)) = try_parse_inline_footnote(&text[pos..])
849        {
850            if pos > text_start {
851                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
852            }
853            log::trace!("Matched inline footnote at pos {}", pos);
854            emit_inline_footnote(builder, content, config);
855            pos += len;
856            text_start = pos;
857            continue;
858        }
859
860        // Try superscript: ^text^
861        if byte == b'^'
862            && config.extensions.superscript
863            && let Some((len, content)) = try_parse_superscript(&text[pos..])
864        {
865            if pos > text_start {
866                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
867            }
868            log::trace!("Matched superscript at pos {}", pos);
869            emit_superscript(builder, content, config);
870            pos += len;
871            text_start = pos;
872            continue;
873        }
874
875        // Try bookdown definition: (\#label) or (ref:label)
876        if byte == b'(' && config.extensions.bookdown_references {
877            if let Some((len, label)) = try_parse_bookdown_definition(&text[pos..]) {
878                if pos > text_start {
879                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
880                }
881                log::trace!("Matched bookdown definition at pos {}: {}", pos, label);
882                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
883                pos += len;
884                text_start = pos;
885                continue;
886            }
887            if let Some((len, label)) = try_parse_bookdown_text_reference(&text[pos..]) {
888                if pos > text_start {
889                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
890                }
891                log::trace!("Matched bookdown text reference at pos {}: {}", pos, label);
892                builder.token(SyntaxKind::TEXT.into(), &text[pos..pos + len]);
893                pos += len;
894                text_start = pos;
895                continue;
896            }
897        }
898
899        // Try subscript: ~text~
900        if byte == b'~'
901            && config.extensions.subscript
902            && let Some((len, content)) = try_parse_subscript(&text[pos..])
903        {
904            if pos > text_start {
905                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
906            }
907            log::trace!("Matched subscript at pos {}", pos);
908            emit_subscript(builder, content, config);
909            pos += len;
910            text_start = pos;
911            continue;
912        }
913
914        // Try strikeout: ~~text~~
915        if byte == b'~'
916            && config.extensions.strikeout
917            && let Some((len, content)) = try_parse_strikeout(&text[pos..])
918        {
919            if pos > text_start {
920                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
921            }
922            log::trace!("Matched strikeout at pos {}", pos);
923            emit_strikeout(builder, content, config);
924            pos += len;
925            text_start = pos;
926            continue;
927        }
928
929        // Try mark/highlight: ==text==
930        if byte == b'='
931            && config.extensions.mark
932            && let Some((len, content)) = try_parse_mark(&text[pos..])
933        {
934            if pos > text_start {
935                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
936            }
937            log::trace!("Matched mark at pos {}", pos);
938            emit_mark(builder, content, config);
939            pos += len;
940            text_start = pos;
941            continue;
942        }
943
944        // Try GFM inline math: $`...`$
945        if byte == b'$'
946            && config.extensions.tex_math_gfm
947            && let Some((len, content)) = try_parse_gfm_inline_math(&text[pos..])
948        {
949            if pos > text_start {
950                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
951            }
952            log::trace!("Matched GFM inline math at pos {}", pos);
953            emit_gfm_inline_math(builder, content);
954            pos += len;
955            text_start = pos;
956            continue;
957        }
958
959        // Try math ($...$, $$...$$)
960        if byte == b'$' && config.extensions.tex_math_dollars {
961            // Try display math first ($$...$$)
962            if let Some((len, content)) = try_parse_display_math(&text[pos..]) {
963                // Emit accumulated text
964                if pos > text_start {
965                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
966                }
967
968                let dollar_count = text[pos..].chars().take_while(|&c| c == '$').count();
969                log::trace!(
970                    "Matched display math at pos {}: {} dollars",
971                    pos,
972                    dollar_count
973                );
974
975                // Check for trailing attributes (Quarto cross-reference support)
976                let after_math = &text[pos + len..];
977                let attr_len = if config.extensions.quarto_crossrefs {
978                    use crate::parser::utils::attributes::try_parse_trailing_attributes;
979                    if let Some((_attr_block, _)) = try_parse_trailing_attributes(after_math) {
980                        let trimmed_after = after_math.trim_start();
981                        if let Some(open_brace_pos) = trimmed_after.find('{') {
982                            let ws_before_brace = after_math.len() - trimmed_after.len();
983                            let attr_text_len = trimmed_after[open_brace_pos..]
984                                .find('}')
985                                .map(|close| close + 1)
986                                .unwrap_or(0);
987                            ws_before_brace + open_brace_pos + attr_text_len
988                        } else {
989                            0
990                        }
991                    } else {
992                        0
993                    }
994                } else {
995                    0
996                };
997
998                let total_len = len + attr_len;
999                emit_display_math(builder, content, dollar_count);
1000
1001                // Emit attributes if present
1002                if attr_len > 0 {
1003                    use crate::parser::utils::attributes::{
1004                        emit_attributes, try_parse_trailing_attributes,
1005                    };
1006                    let attr_text = &text[pos + len..pos + total_len];
1007                    if let Some((attr_block, _text_before)) =
1008                        try_parse_trailing_attributes(attr_text)
1009                    {
1010                        let trimmed_after = attr_text.trim_start();
1011                        let ws_len = attr_text.len() - trimmed_after.len();
1012                        if ws_len > 0 {
1013                            builder.token(SyntaxKind::WHITESPACE.into(), &attr_text[..ws_len]);
1014                        }
1015                        emit_attributes(builder, &attr_block);
1016                    }
1017                }
1018
1019                pos += total_len;
1020                text_start = pos;
1021                continue;
1022            }
1023
1024            // Try inline math ($...$)
1025            if let Some((len, content)) = try_parse_inline_math(&text[pos..]) {
1026                // Emit accumulated text
1027                if pos > text_start {
1028                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1029                }
1030
1031                log::trace!("Matched inline math at pos {}", pos);
1032                emit_inline_math(builder, content);
1033                pos += len;
1034                text_start = pos;
1035                continue;
1036            }
1037
1038            // Neither display nor inline math matched - emit the $ as literal text
1039            // This ensures each $ gets its own TEXT token for CST compatibility
1040            if pos > text_start {
1041                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1042            }
1043            builder.token(SyntaxKind::TEXT.into(), "$");
1044            pos = advance_char_boundary(text, pos, end);
1045            text_start = pos;
1046            continue;
1047        }
1048
1049        // Try autolinks: <url> or <email>
1050        if byte == b'<'
1051            && config.extensions.autolinks
1052            && let Some((len, url)) = try_parse_autolink(
1053                &text[pos..],
1054                config.dialect == crate::options::Dialect::CommonMark,
1055            )
1056        {
1057            if pos > text_start {
1058                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1059            }
1060            log::trace!("Matched autolink at pos {}", pos);
1061            emit_autolink(builder, &text[pos..pos + len], url);
1062            pos += len;
1063            text_start = pos;
1064            continue;
1065        }
1066
1067        if !nested_in_link
1068            && config.extensions.autolink_bare_uris
1069            && let Some((len, url)) = try_parse_bare_uri(&text[pos..])
1070        {
1071            if pos > text_start {
1072                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1073            }
1074            log::trace!("Matched bare URI at pos {}", pos);
1075            emit_bare_uri_link(builder, url, config);
1076            pos += len;
1077            text_start = pos;
1078            continue;
1079        }
1080
1081        // Try native spans: <span>text</span> (after autolink since both
1082        // start with <). Under Pandoc dialect this is consumed via the
1083        // IR's `ConstructPlan` at the top of the loop; this dispatcher
1084        // branch only fires for CommonMark dialect with the extension
1085        // explicitly enabled.
1086        if byte == b'<'
1087            && config.dialect == Dialect::CommonMark
1088            && config.extensions.native_spans
1089            && let Some((len, content, attributes)) = try_parse_native_span(&text[pos..])
1090        {
1091            if pos > text_start {
1092                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1093            }
1094            log::trace!("Matched native span at pos {}", pos);
1095            emit_native_span(builder, content, &attributes, config);
1096            pos += len;
1097            text_start = pos;
1098            continue;
1099        }
1100
1101        // Try inline raw HTML (CommonMark §6.6 / Pandoc raw_html). Must run
1102        // after autolinks (more specific) and native spans (Pandoc
1103        // <span>…</span> wrapper) since all three start with `<`.
1104        if byte == b'<'
1105            && config.extensions.raw_html
1106            && let Some(len) = try_parse_inline_html(&text[pos..])
1107        {
1108            if pos > text_start {
1109                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1110            }
1111            log::trace!("Matched inline raw HTML at pos {}", pos);
1112            emit_inline_html(builder, &text[pos..pos + len]);
1113            pos += len;
1114            text_start = pos;
1115            continue;
1116        }
1117
1118        // Bracket-starting elements: inline / reference links and
1119        // images are dispatched via the IR-driven arm at the top of
1120        // the loop, gated by the IR's `BracketPlan`. Only dialect-CM-
1121        // specific Pandoc-extension constructs that share the `[...]`
1122        // shape (footnote refs, bracketed citations) need a CM-gated
1123        // dispatcher branch — under Pandoc dialect they're consumed
1124        // via the IR's `ConstructPlan` instead.
1125        if byte == b'['
1126            && config.dialect == Dialect::CommonMark
1127            && config.extensions.footnotes
1128            && let Some((len, id)) = try_parse_footnote_reference(&text[pos..])
1129        {
1130            if pos > text_start {
1131                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1132            }
1133            log::trace!("Matched footnote reference at pos {}", pos);
1134            emit_footnote_reference(builder, &id);
1135            pos += len;
1136            text_start = pos;
1137            continue;
1138        }
1139        if byte == b'['
1140            && config.dialect == Dialect::CommonMark
1141            && config.extensions.citations
1142            && let Some((len, content)) = try_parse_bracketed_citation(&text[pos..])
1143        {
1144            if pos > text_start {
1145                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1146            }
1147            log::trace!("Matched bracketed citation at pos {}", pos);
1148            emit_bracketed_citation(builder, content);
1149            pos += len;
1150            text_start = pos;
1151            continue;
1152        }
1153
1154        // Try bracketed spans: [text]{.class}. Must come after
1155        // links/citations. Under Pandoc dialect this is consumed via
1156        // the IR's `ConstructPlan` at the top of the loop; this
1157        // dispatcher branch only fires for CommonMark dialect with the
1158        // extension explicitly enabled.
1159        if config.dialect == Dialect::CommonMark
1160            && byte == b'['
1161            && config.extensions.bracketed_spans
1162            && let Some((len, text_content, attrs)) = try_parse_bracketed_span(&text[pos..])
1163        {
1164            if pos > text_start {
1165                builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1166            }
1167            log::trace!("Matched bracketed span at pos {}", pos);
1168            emit_bracketed_span(builder, &text_content, &attrs, config);
1169            pos += len;
1170            text_start = pos;
1171            continue;
1172        }
1173
1174        // Try bare citation: @cite (must come after bracketed elements).
1175        // Under Pandoc dialect this is consumed via the IR's
1176        // `ConstructPlan` at the top of the loop; this dispatcher branch
1177        // only fires for CommonMark dialect with the extension
1178        // explicitly enabled.
1179        if config.dialect == Dialect::CommonMark
1180            && byte == b'@'
1181            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1182            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1183        {
1184            let is_crossref =
1185                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1186            if is_crossref || config.extensions.citations {
1187                if pos > text_start {
1188                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1189                }
1190                if is_crossref {
1191                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1192                    super::citations::emit_crossref(builder, key, has_suppress);
1193                } else {
1194                    log::trace!("Matched bare citation at pos {}: {}", pos, &key);
1195                    emit_bare_citation(builder, key, has_suppress);
1196                }
1197                pos += len;
1198                text_start = pos;
1199                continue;
1200            }
1201        }
1202
1203        // Try suppress-author citation: -@cite. Under Pandoc dialect
1204        // this is consumed via the IR's `ConstructPlan` at the top of
1205        // the loop; this dispatcher branch only fires for CommonMark
1206        // dialect with the extension explicitly enabled.
1207        if config.dialect == Dialect::CommonMark
1208            && byte == b'-'
1209            && pos + 1 < text.len()
1210            && text.as_bytes()[pos + 1] == b'@'
1211            && (config.extensions.citations || config.extensions.quarto_crossrefs)
1212            && let Some((len, key, has_suppress)) = try_parse_bare_citation(&text[pos..])
1213        {
1214            let is_crossref =
1215                config.extensions.quarto_crossrefs && super::citations::is_quarto_crossref_key(key);
1216            if is_crossref || config.extensions.citations {
1217                if pos > text_start {
1218                    builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1219                }
1220                if is_crossref {
1221                    log::trace!("Matched Quarto crossref at pos {}: {}", pos, &key);
1222                    super::citations::emit_crossref(builder, key, has_suppress);
1223                } else {
1224                    log::trace!("Matched suppress-author citation at pos {}: {}", pos, &key);
1225                    emit_bare_citation(builder, key, has_suppress);
1226                }
1227                pos += len;
1228                text_start = pos;
1229                continue;
1230            }
1231        }
1232
1233        // Emphasis emission, plan-driven. The IR's emphasis pass has
1234        // already decided every delimiter byte's disposition (open
1235        // marker, close marker, or unmatched literal); consult the
1236        // plan here instead of re-scanning.
1237        if byte == b'*' || byte == b'_' {
1238            match plan.lookup(pos) {
1239                Some(DelimChar::Open {
1240                    len,
1241                    partner,
1242                    partner_len,
1243                    kind,
1244                }) => {
1245                    if pos > text_start {
1246                        builder.token(SyntaxKind::TEXT.into(), &text[text_start..pos]);
1247                    }
1248                    let len = len as usize;
1249                    let partner_len = partner_len as usize;
1250                    let (wrapper_kind, marker_kind) = match kind {
1251                        EmphasisKind::Strong => (SyntaxKind::STRONG, SyntaxKind::STRONG_MARKER),
1252                        EmphasisKind::Emph => (SyntaxKind::EMPHASIS, SyntaxKind::EMPHASIS_MARKER),
1253                    };
1254                    builder.start_node(wrapper_kind.into());
1255                    builder.token(marker_kind.into(), &text[pos..pos + len]);
1256                    parse_inline_range_impl(
1257                        text,
1258                        pos + len,
1259                        partner,
1260                        config,
1261                        builder,
1262                        nested_in_link,
1263                        plan,
1264                        bracket_plan,
1265                        construct_plan,
1266                        suppress_inner_links,
1267                        mask,
1268                    );
1269                    builder.token(marker_kind.into(), &text[partner..partner + partner_len]);
1270                    builder.finish_node();
1271                    pos = partner + partner_len;
1272                    text_start = pos;
1273                    continue;
1274                }
1275                Some(DelimChar::Close) => {
1276                    // Defensive: a close should be jumped past by its
1277                    // matching open. If we hit one anyway (e.g. when the
1278                    // outer caller's range starts mid-pair), let it be
1279                    // emitted as part of the surrounding text by simply
1280                    // advancing. text_start stays put so the byte folds
1281                    // into the next TEXT flush.
1282                    pos += 1;
1283                    continue;
1284                }
1285                Some(DelimChar::Literal) | None => {
1286                    // Unmatched delim chars at this position behave as
1287                    // literal text. Don't emit yet — let them coalesce
1288                    // with surrounding plain bytes via the existing
1289                    // text_start flushing so the CST keeps the same TEXT
1290                    // token granularity Pandoc fixtures expect.
1291                    let bytes = text.as_bytes();
1292                    let mut end_pos = pos + 1;
1293                    while end_pos < end && bytes[end_pos] == byte {
1294                        match plan.lookup(end_pos) {
1295                            Some(DelimChar::Literal) | None => end_pos += 1,
1296                            _ => break,
1297                        }
1298                    }
1299                    pos = end_pos;
1300                    continue;
1301                }
1302            }
1303        }
1304
1305        // Check for newlines - may need to emit as hard line break
1306        if byte == b'\r' && pos + 1 < end && text.as_bytes()[pos + 1] == b'\n' {
1307            let text_before = &text[text_start..pos];
1308
1309            // Check for trailing spaces hard line break (always enabled in Pandoc)
1310            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1311            if trailing_spaces >= 2 {
1312                // Emit text before the trailing spaces
1313                let text_content = &text_before[..text_before.len() - trailing_spaces];
1314                if !text_content.is_empty() {
1315                    builder.token(SyntaxKind::TEXT.into(), text_content);
1316                }
1317                let spaces = " ".repeat(trailing_spaces);
1318                builder.token(
1319                    SyntaxKind::HARD_LINE_BREAK.into(),
1320                    &format!("{}\r\n", spaces),
1321                );
1322                pos += 2;
1323                text_start = pos;
1324                continue;
1325            }
1326
1327            // hard_line_breaks: treat all single newlines as hard line breaks
1328            if config.extensions.hard_line_breaks {
1329                if !text_before.is_empty() {
1330                    builder.token(SyntaxKind::TEXT.into(), text_before);
1331                }
1332                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\r\n");
1333                pos += 2;
1334                text_start = pos;
1335                continue;
1336            }
1337
1338            // Regular newline
1339            if !text_before.is_empty() {
1340                builder.token(SyntaxKind::TEXT.into(), text_before);
1341            }
1342            builder.token(SyntaxKind::NEWLINE.into(), "\r\n");
1343            pos += 2;
1344            text_start = pos;
1345            continue;
1346        }
1347
1348        if byte == b'\n' {
1349            let text_before = &text[text_start..pos];
1350
1351            // Check for trailing spaces hard line break (always enabled in Pandoc)
1352            let trailing_spaces = text_before.chars().rev().take_while(|&c| c == ' ').count();
1353            if trailing_spaces >= 2 {
1354                // Emit text before the trailing spaces
1355                let text_content = &text_before[..text_before.len() - trailing_spaces];
1356                if !text_content.is_empty() {
1357                    builder.token(SyntaxKind::TEXT.into(), text_content);
1358                }
1359                let spaces = " ".repeat(trailing_spaces);
1360                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), &format!("{}\n", spaces));
1361                pos += 1;
1362                text_start = pos;
1363                continue;
1364            }
1365
1366            // hard_line_breaks: treat all single newlines as hard line breaks
1367            if config.extensions.hard_line_breaks {
1368                if !text_before.is_empty() {
1369                    builder.token(SyntaxKind::TEXT.into(), text_before);
1370                }
1371                builder.token(SyntaxKind::HARD_LINE_BREAK.into(), "\n");
1372                pos += 1;
1373                text_start = pos;
1374                continue;
1375            }
1376
1377            // Regular newline
1378            if !text_before.is_empty() {
1379                builder.token(SyntaxKind::TEXT.into(), text_before);
1380            }
1381            builder.token(SyntaxKind::NEWLINE.into(), "\n");
1382            pos += 1;
1383            text_start = pos;
1384            continue;
1385        }
1386
1387        // Regular character, keep accumulating
1388        pos = advance_char_boundary(text, pos, end);
1389    }
1390
1391    // Emit any remaining text
1392    if pos > text_start && text_start < end {
1393        log::trace!("Emitting remaining TEXT: {:?}", &text[text_start..end]);
1394        builder.token(SyntaxKind::TEXT.into(), &text[text_start..end]);
1395    }
1396
1397    log::trace!("parse_inline_range complete: start={}, end={}", start, end);
1398}
1399
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::{SyntaxKind, SyntaxNode};
    use rowan::GreenNode;

    /// Parses `text` as inline content inside a single `wrapper` node and
    /// returns the root of the resulting syntax tree.
    fn parse_wrapped(wrapper: SyntaxKind, text: &str, config: &ParserOptions) -> SyntaxNode {
        let mut builder = GreenNodeBuilder::new();
        builder.start_node(wrapper.into());
        parse_inline_text_recursive(&mut builder, text, config);
        builder.finish_node();

        let green: GreenNode = builder.finish();
        SyntaxNode::new_root(green)
    }

    /// Returns `true` when at least one `outer` node has an `inner` node
    /// among its descendants, i.e. `inner` is genuinely nested in `outer`.
    fn has_nested(root: &SyntaxNode, outer: SyntaxKind, inner: SyntaxKind) -> bool {
        root.descendants()
            .filter(|candidate| candidate.kind() == outer)
            .any(|parent| parent.descendants().any(|n| n.kind() == inner))
    }

    /// A single `*…*` pair produces an EMPHASIS node and round-trips.
    #[test]
    fn test_recursive_simple_emphasis() {
        let text = "*test*";
        let config = ParserOptions::default();
        let mut builder = GreenNodeBuilder::new();

        // Deliberately no wrapper node here: the inline output itself is
        // used as the root of the tree.
        parse_inline_text_recursive(&mut builder, text, &config);

        let green: GreenNode = builder.finish();
        let node = SyntaxNode::new_root(green);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have EMPHASIS node
        let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
        assert!(has_emph, "Should have EMPHASIS node");
    }

    /// `**…**` inside `*…*` produces both a STRONG and an EMPHASIS node.
    #[test]
    fn test_recursive_nested() {
        let text = "*foo **bar** baz*";
        let config = ParserOptions::default();

        // Wrap in a PARAGRAPH node (inline content needs a parent)
        let node = parse_wrapped(SyntaxKind::PARAGRAPH, text, &config);

        // Should be lossless
        assert_eq!(node.text().to_string(), text);

        // Should have both EMPHASIS and STRONG
        let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
        let has_strong = node.descendants().any(|n| n.kind() == SyntaxKind::STRONG);

        assert!(has_emph, "Should have EMPHASIS node");
        assert!(has_strong, "Should have STRONG node");
    }

    /// Test Pandoc's "three" algorithm: ***foo* bar**
    /// Expected: Strong[Emph[foo], bar]
    #[test]
    fn test_triple_emphasis_star_then_double_star() {
        let text = "***foo* bar**";
        let config = ParserOptions::default();
        let node = parse_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug rendering of the tree, used in the failure message below.
        let structure = format!("{:#?}", node);

        // Should have both STRONG and EMPH
        assert!(
            node.descendants().any(|n| n.kind() == SyntaxKind::STRONG),
            "Should have STRONG node"
        );
        assert!(
            node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );

        // STRONG should be outer, EMPH should be inner. Checking actual
        // containment (rather than mere traversal order) also rejects a
        // tree where the two nodes are siblings.
        assert!(
            has_nested(&node, SyntaxKind::STRONG, SyntaxKind::EMPHASIS),
            "EMPH should be inside STRONG, not before it. Current structure:\n{}",
            structure
        );
    }

    /// Test Pandoc's "three" algorithm: ***foo** bar*
    /// Expected: Emph[Strong[foo], bar]
    #[test]
    fn test_triple_emphasis_double_star_then_star() {
        let text = "***foo** bar*";
        let config = ParserOptions::default();
        let node = parse_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Debug rendering of the tree, used in the failure message below.
        let structure = format!("{:#?}", node);

        // Should have both EMPH and STRONG
        assert!(
            node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS),
            "Should have EMPHASIS node"
        );
        assert!(
            node.descendants().any(|n| n.kind() == SyntaxKind::STRONG),
            "Should have STRONG node"
        );

        // EMPH should be outer, STRONG should be inner (real containment,
        // not just traversal order).
        assert!(
            has_nested(&node, SyntaxKind::EMPHASIS, SyntaxKind::STRONG),
            "STRONG should be inside EMPH. Current structure:\n{}",
            structure
        );
    }

    /// Test that display math with attributes parses correctly
    /// Regression test for equation_attributes_single_line golden test
    #[test]
    fn test_display_math_with_attributes() {
        let text = "$$ E = mc^2 $$ {#eq-einstein}";
        let mut config = ParserOptions::default();
        config.extensions.quarto_crossrefs = true; // Enable Quarto cross-references

        // Parse the whole text under a DOCUMENT root
        let node = parse_wrapped(SyntaxKind::DOCUMENT, text, &config);

        // Verify losslessness
        assert_eq!(node.text().to_string(), text);

        // Should have DISPLAY_MATH node
        let has_display_math = node
            .descendants()
            .any(|n| n.kind() == SyntaxKind::DISPLAY_MATH);
        assert!(has_display_math, "Should have DISPLAY_MATH node");

        // Should have ATTRIBUTE node
        let has_attributes = node
            .descendants()
            .any(|n| n.kind() == SyntaxKind::ATTRIBUTE);
        assert!(
            has_attributes,
            "Should have ATTRIBUTE node for {{#eq-einstein}}"
        );

        // Attributes should not be TEXT. TEXT is emitted as a *token*, and
        // `next_sibling()` only yields nodes, so the neighbour has to be
        // looked up with `next_sibling_or_token()` for this check to be
        // able to fail at all.
        let math_followed_by_text = node
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::DISPLAY_MATH)
            .filter_map(|math| math.next_sibling_or_token())
            .filter_map(|element| element.into_token())
            .any(|token| {
                token.kind() == SyntaxKind::TEXT && token.text().contains("{#eq-einstein}")
            });
        assert!(
            !math_followed_by_text,
            "Attributes should not be parsed as TEXT"
        );
    }

    /// Under the GFM flavor, the destination of an inline link must not be
    /// turned into a second, nested bare-URI LINK node.
    #[test]
    fn test_parse_inline_text_gfm_inline_link_destination_not_autolinked() {
        use crate::options::{Dialect, Extensions, Flavor};

        let config = ParserOptions {
            flavor: Flavor::Gfm,
            dialect: Dialect::for_flavor(Flavor::Gfm),
            extensions: Extensions::for_flavor(Flavor::Gfm),
            ..ParserOptions::default()
        };

        let root = parse_wrapped(
            SyntaxKind::PARAGRAPH,
            "Second Link [link_text](https://link.com)",
            &config,
        );

        let links: Vec<_> = root
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::LINK)
            .collect();
        assert_eq!(
            links.len(),
            1,
            "Expected exactly one LINK node for inline link, not nested bare URI autolink"
        );

        let link = &links[0];
        let mut link_text = None::<String>;
        let mut link_dest = None::<String>;

        // Pick the text/destination parts out of the LINK's direct children.
        for child in link.children() {
            match child.kind() {
                SyntaxKind::LINK_TEXT => link_text = Some(child.text().to_string()),
                SyntaxKind::LINK_DEST => link_dest = Some(child.text().to_string()),
                _ => {}
            }
        }

        assert_eq!(link_text.as_deref(), Some("link_text"));
        assert_eq!(link_dest.as_deref(), Some("https://link.com"));
    }

    /// A lone multi-byte character must round-trip when bare-URI
    /// autolinking is enabled (no slicing inside a UTF-8 sequence).
    #[test]
    fn test_autolink_bare_uri_utf8_boundary_safe() {
        let text = "§";
        let mut config = ParserOptions::default();
        config.extensions.autolink_bare_uris = true;

        let node = parse_wrapped(SyntaxKind::DOCUMENT, text, &config);
        assert_eq!(node.text().to_string(), text);
    }

    /// Emphasis around a multi-byte character parses without panicking and
    /// round-trips.
    #[test]
    fn test_parse_emphasis_unicode_content_no_panic() {
        let text = "*§*";
        let config = ParserOptions::default();

        let node = parse_wrapped(SyntaxKind::PARAGRAPH, text, &config);
        let has_emph = node.descendants().any(|n| n.kind() == SyntaxKind::EMPHASIS);
        assert!(has_emph, "Should have EMPHASIS node");
        assert_eq!(node.text().to_string(), text);
    }
}
1687
/// `**bold with *italic***` should parse as
/// `Strong["bold with ", Emph["italic"]]`: the closing `***` is read as
/// `*` (closes Emph) followed by `**` (closes Strong).
#[test]
fn test_two_with_nested_one_and_triple_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let source = "**bold with *italic***";
    let options = ParserOptions::default();

    // Build the tree under a PARAGRAPH parent.
    let root = {
        let mut tree = GreenNodeBuilder::new();
        tree.start_node(SyntaxKind::PARAGRAPH.into());
        parse_inline_text_recursive(&mut tree, source, &options);
        tree.finish_node();
        SyntaxNode::new_root(tree.finish())
    };

    assert_eq!(root.text().to_string(), source, "Should be lossless");

    let strongs: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    assert_eq!(strongs.len(), 1, "Should have exactly one STRONG node");

    let emphasis_inside = strongs[0]
        .descendants()
        .any(|inner| inner.kind() == SyntaxKind::EMPHASIS);
    assert!(emphasis_inside, "STRONG should contain EMPHASIS node");
}
1724
/// `*foo *` should parse as emphasis (Pandoc behavior): for asterisks,
/// Pandoc doesn't require the closer to be right-flanking.
#[test]
fn test_emphasis_with_trailing_space_before_closer() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let source = "*foo *";
    let options = ParserOptions::default();

    // Build the tree under a PARAGRAPH parent.
    let root = {
        let mut tree = GreenNodeBuilder::new();
        tree.start_node(SyntaxKind::PARAGRAPH.into());
        parse_inline_text_recursive(&mut tree, source, &options);
        tree.finish_node();
        SyntaxNode::new_root(tree.finish())
    };

    let emphasis_found = root
        .descendants()
        .any(|candidate| candidate.kind() == SyntaxKind::EMPHASIS);
    assert!(emphasis_found, "Should have EMPHASIS node");
    assert_eq!(root.text().to_string(), source);
}
1749
/// `***foo** bar **baz***` should parse as
/// `Emph[Strong[foo], " bar ", Strong[baz]]` (matches Pandoc's output).
#[test]
fn test_triple_emphasis_all_strong_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let source = "***foo** bar **baz***";
    let options = ParserOptions::default();

    // Build the tree under a DOCUMENT parent.
    let root = {
        let mut tree = GreenNodeBuilder::new();
        tree.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_text_recursive(&mut tree, source, &options);
        tree.finish_node();
        SyntaxNode::new_root(tree.finish())
    };

    // Exactly one EMPHASIS node is expected in the whole tree.
    let emphases: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::EMPHASIS)
        .collect();
    assert_eq!(
        emphases.len(),
        1,
        "Should have exactly one EMPHASIS node, found: {}",
        emphases.len()
    );

    // That EMPHASIS must have two STRONG nodes as direct children.
    let strong_children = emphases[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::STRONG)
        .count();
    assert_eq!(
        strong_children,
        2,
        "EMPHASIS should contain two STRONG nodes, found: {}",
        strong_children
    );

    // The tree must reproduce the input exactly.
    assert_eq!(root.text().to_string(), source);
}
1798
/// `***foo* bar *baz***` should parse as
/// `Strong[Emph[foo], " bar ", Emph[baz]]` (matches Pandoc's output).
#[test]
fn test_triple_emphasis_all_emph_nested() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;

    let source = "***foo* bar *baz***";
    let options = ParserOptions::default();

    // Build the tree under a DOCUMENT parent.
    let root = {
        let mut tree = GreenNodeBuilder::new();
        tree.start_node(SyntaxKind::DOCUMENT.into());
        parse_inline_text_recursive(&mut tree, source, &options);
        tree.finish_node();
        SyntaxNode::new_root(tree.finish())
    };

    // Exactly one STRONG node is expected in the whole tree.
    let strongs: Vec<_> = root
        .descendants()
        .filter(|candidate| candidate.kind() == SyntaxKind::STRONG)
        .collect();
    assert_eq!(
        strongs.len(),
        1,
        "Should have exactly one STRONG node, found: {}",
        strongs.len()
    );

    // That STRONG must have two EMPHASIS nodes as direct children.
    let emphasis_children = strongs[0]
        .children()
        .filter(|child| child.kind() == SyntaxKind::EMPHASIS)
        .count();
    assert_eq!(
        emphasis_children,
        2,
        "STRONG should contain two EMPHASIS nodes, found: {}",
        emphasis_children
    );

    // The tree must reproduce the input exactly.
    assert_eq!(root.text().to_string(), source);
}
1847
1848// Multiline emphasis tests
/// Per Pandoc spec, emphasis CAN contain newlines (soft breaks): the
/// EMPHASIS node itself must span the line break.
#[test]
fn test_parse_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "*text on\nline two*";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    builder.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(&mut builder, text, &config);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    let emphasis = node
        .descendants()
        .find(|n| n.kind() == SyntaxKind::EMPHASIS)
        .expect("Should have EMPHASIS node");

    assert_eq!(node.text().to_string(), text);

    // Checking the whole tree for '\n' would be implied by the lossless
    // assertion above; inspect the EMPHASIS node's own text instead so the
    // check fails if the emphasis stops at the line break.
    assert!(
        emphasis.text().to_string().contains('\n'),
        "Should preserve newline in emphasis content"
    );
}
1876
/// Per Pandoc spec, strong emphasis CAN contain newlines: the STRONG node
/// itself must span the line break.
#[test]
fn test_parse_strong_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "**strong on\nline two**";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    builder.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(&mut builder, text, &config);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    let strong = node
        .descendants()
        .find(|n| n.kind() == SyntaxKind::STRONG)
        .expect("Should have STRONG node");

    assert_eq!(node.text().to_string(), text);

    // Checking the whole tree for '\n' would be implied by the lossless
    // assertion above; inspect the STRONG node's own text instead so the
    // check fails if the strong span stops at the line break.
    assert!(
        strong.text().to_string().contains('\n'),
        "Should preserve newline in strong content"
    );
}
1904
/// Triple emphasis (`***…***`) may contain newlines: the STRONG node
/// produced for it must span the line break.
#[test]
fn test_parse_triple_emphasis_multiline() {
    use crate::options::ParserOptions;
    use crate::syntax::SyntaxNode;
    use rowan::GreenNode;

    let text = "***both on\nline two***";
    let config = ParserOptions::default();
    let mut builder = GreenNodeBuilder::new();

    builder.start_node(SyntaxKind::PARAGRAPH.into());
    parse_inline_text_recursive(&mut builder, text, &config);
    builder.finish_node();

    let green: GreenNode = builder.finish();
    let node = SyntaxNode::new_root(green);

    // Should have STRONG node (triple = strong + emph)
    let strong = node
        .descendants()
        .find(|n| n.kind() == SyntaxKind::STRONG)
        .expect("Should have STRONG node");

    assert_eq!(node.text().to_string(), text);

    // Checking the whole tree for '\n' would be implied by the lossless
    // assertion above; inspect the STRONG node's own text instead so the
    // check fails if the span stops at the line break.
    assert!(
        strong.text().to_string().contains('\n'),
        "Should preserve newline in triple emphasis content"
    );
}