Skip to main content

panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use super::sink::InlineSink;
16use crate::options::ParserOptions;
17use crate::syntax::SyntaxKind;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::{emit_attribute_node, try_parse_trailing_attributes};
21
/// Flags that control which inline spans the link-bracket scanner treats as
/// opaque (so a `]` inside them does not terminate the link/image text).
///
/// - `skip_raw_html` is universal across dialects: pandoc-markdown and
///   CommonMark both refuse to close link text inside a raw HTML span (e.g.
///   `[foo <bar attr="](baz)">`), per CommonMark spec example #524 / #536.
/// - `skip_autolinks` is **CommonMark-only**. Pandoc-markdown does *not*
///   treat `<scheme://...>` as opaque inside link text, so the same input
///   produces a different parse under each dialect (CommonMark spec example
///   #526 / #538). Always derive this from
///   `extensions.autolinks && dialect == Dialect::CommonMark`.
/// - `disallow_inner_links` is a **CommonMark-only** structural rule (§6.4):
///   "Links may not contain other links, at any level of nesting." When the
///   candidate link/image text contains a valid inline link or image, the
///   outer match is rejected so the inner-most definition is used instead
///   (spec examples #518–#520, #532). Pandoc-markdown allows nested links,
///   so the flag is `false` there.
///
/// The struct is `Copy`, so it is passed by value through the scanner helpers.
#[derive(Clone, Copy)]
pub struct LinkScanContext {
    /// When `true`, an inline raw HTML span is skipped as a unit while
    /// scanning link/image text for its closing `]`.
    pub skip_raw_html: bool,
    /// When `true`, an angle-bracket autolink is skipped as a unit while
    /// scanning link/image text. Tried before the raw HTML check.
    pub skip_autolinks: bool,
    /// When `true`, an inline link whose text itself contains a valid inline
    /// link is rejected so the inner link wins.
    pub disallow_inner_links: bool,
    /// Dialect controlling which HTML constructs the raw-HTML opacity check
    /// recognizes. Pandoc-markdown excludes bare declarations and CDATA
    /// from its inline raw HTML grammar.
    pub dialect: crate::options::Dialect,
}
49
50impl Default for LinkScanContext {
51    fn default() -> Self {
52        Self {
53            skip_raw_html: false,
54            skip_autolinks: false,
55            disallow_inner_links: false,
56            dialect: crate::options::Dialect::Pandoc,
57        }
58    }
59}
60
61impl LinkScanContext {
62    pub fn from_options(config: &ParserOptions) -> Self {
63        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
64        Self {
65            skip_raw_html: config.extensions.raw_html,
66            skip_autolinks: config.extensions.autolinks && is_commonmark,
67            disallow_inner_links: is_commonmark,
68            dialect: config.dialect,
69        }
70    }
71}
72
73/// Find the closing `]` of a link/image text span, starting from `start`.
74///
75/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
76/// a backtick run starting a valid code span is encountered, the entire span
77/// (including any trailing attribute block) is skipped — per CommonMark §6
78/// precedence, code spans bind tighter than links/images, so a `]` *inside*
79/// a code span cannot terminate the link's text. The same opacity applies to
80/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
81/// Returns the byte offset of the closing `]` within `text`, or `None` if no
82/// unmatched `]` is reached.
83fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
84    let bytes = text.as_bytes();
85    let mut bracket_depth = 0;
86    let mut escape_next = false;
87    let mut i = start;
88
89    while i < bytes.len() {
90        let b = bytes[i];
91
92        if escape_next {
93            escape_next = false;
94            i += step(text, i);
95            continue;
96        }
97
98        match b {
99            b'\\' => {
100                escape_next = true;
101                i += 1;
102            }
103            b'`' => {
104                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
105                    i += len;
106                } else {
107                    i += 1;
108                }
109            }
110            b'<' => {
111                // Order matters: autolinks are the more specific `<...>`
112                // shape (URI/email between angle brackets), so try that
113                // before falling through to general inline raw HTML which
114                // would also match `<bar attr="...">`-style tags.
115                if ctx.skip_autolinks
116                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
117                {
118                    i += len;
119                } else if ctx.skip_raw_html
120                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
121                {
122                    i += len;
123                } else {
124                    i += 1;
125                }
126            }
127            b'[' => {
128                bracket_depth += 1;
129                i += 1;
130            }
131            b']' => {
132                if bracket_depth == 0 {
133                    return Some(i);
134                }
135                bracket_depth -= 1;
136                i += 1;
137            }
138            _ => i += step(text, i),
139        }
140    }
141    None
142}
143
144/// Find the closing `)` of a link/image destination, given the text *after*
145/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
146/// destinations (`<...>` may legitimately contain unbalanced parens — see
147/// spec example #499). Returns the byte offset of the closing `)` within the
148/// passed slice, or `None` if not found.
149fn find_dest_close_paren(remaining: &str) -> Option<usize> {
150    let bytes = remaining.as_bytes();
151    let mut paren_depth = 0;
152    let mut escape_next = false;
153    let mut in_quotes = false;
154    let mut in_angle = false;
155    let mut i = 0;
156
157    while i < bytes.len() {
158        let b = bytes[i];
159
160        if escape_next {
161            escape_next = false;
162            i += step(remaining, i);
163            continue;
164        }
165
166        match b {
167            b'\\' => {
168                escape_next = true;
169                i += 1;
170            }
171            b'<' if !in_quotes && !in_angle => {
172                in_angle = true;
173                i += 1;
174            }
175            b'>' if in_angle => {
176                in_angle = false;
177                i += 1;
178            }
179            b'"' if !in_angle => {
180                in_quotes = !in_quotes;
181                i += 1;
182            }
183            b'(' if !in_quotes && !in_angle => {
184                paren_depth += 1;
185                i += 1;
186            }
187            b')' if !in_quotes && !in_angle => {
188                if paren_depth == 0 {
189                    return Some(i);
190                }
191                paren_depth -= 1;
192                i += 1;
193            }
194            _ => i += step(remaining, i),
195        }
196    }
197    None
198}
199
/// Number of bytes occupied by the UTF-8 character that begins at byte
/// index `i` of `s`, or `1` when `i` is at the end of the string.
///
/// Lets index-based scanners move forward one whole character at a time.
/// `i` must lie on a character boundary; slicing panics otherwise.
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
206
207/// CommonMark §6.4: "Links may not contain other links, at any level of
208/// nesting. If multiple otherwise valid link definitions appear nested inside
209/// each other, the inner-most definition is used." This helper scans a
210/// candidate link text for any `[` that starts a valid inline link; when
211/// found, the outer link must be rejected so the inner-most wins (spec
212/// examples #518–#519, #532).
213///
214/// Images themselves do not count as inner links — a link can contain an
215/// image (#517, #531). A link *inside* an image's alt text, however, still
216/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
217/// the helper recurses into image alt text looking for inner links.
218///
219/// Reference-link nesting (#533, #569, #571) requires resolving labels
220/// against the document's reference-definition map, which the parser does
221/// not have at this point — those cases remain unhandled and need a later
222/// stack-based pass.
223fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
224    let bytes = text.as_bytes();
225    let mut i = 0;
226    let mut escape_next = false;
227    while i < bytes.len() {
228        let b = bytes[i];
229        if escape_next {
230            escape_next = false;
231            i += step(text, i);
232            continue;
233        }
234        match b {
235            b'\\' => {
236                escape_next = true;
237                i += 1;
238            }
239            b'`' => {
240                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
241                    i += len;
242                } else {
243                    i += 1;
244                }
245            }
246            b'<' => {
247                if ctx.skip_autolinks
248                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
249                {
250                    i += len;
251                } else if ctx.skip_raw_html
252                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
253                {
254                    i += len;
255                } else {
256                    i += 1;
257                }
258            }
259            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
260                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
261                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
262                        return true;
263                    }
264                    i += len;
265                } else {
266                    i += 2;
267                }
268            }
269            b'[' => {
270                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
271                    return true;
272                }
273                i += 1;
274            }
275            _ => i += step(text, i),
276        }
277    }
278    false
279}
280
281/// Try to parse an inline image starting at the current position.
282///
283/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
284/// Can also have trailing attributes: `![alt](url){#id .class}`.
285/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
286///
287/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
288/// see `LinkScanContext`.
289pub fn try_parse_inline_image(
290    text: &str,
291    ctx: LinkScanContext,
292) -> Option<(usize, &str, &str, Option<&str>)> {
293    if !text.starts_with("![") {
294        return None;
295    }
296
297    // Find the closing ]
298    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
299    let alt_text = &text[2..close_bracket];
300
301    // Check for immediate ( after ]
302    let after_bracket = close_bracket + 1;
303    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
304        return None;
305    }
306
307    // Find closing ) for destination (reuse same logic as links)
308    let dest_start = after_bracket + 1;
309    let remaining = &text[dest_start..];
310
311    let close_paren = find_dest_close_paren(remaining)?;
312    let dest_content = &remaining[..close_paren];
313
314    // Check for trailing attributes {#id .class key=value}
315    let after_paren = dest_start + close_paren + 1;
316    let after_close = &text[after_paren..];
317
318    // Attributes must start immediately after closing paren (no whitespace/newlines)
319    if after_close.starts_with('{') {
320        // Find the closing brace
321        if let Some(close_brace_pos) = after_close.find('}') {
322            let attr_text = &after_close[..=close_brace_pos];
323            // Try to parse as attributes to validate
324            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
325                let total_len = after_paren + close_brace_pos + 1;
326                // Return raw attribute string for lossless parsing
327                let raw_attrs = attr_text;
328                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
329            }
330        }
331    }
332
333    // No attributes, just return the image
334    let total_len = after_paren;
335    Some((total_len, alt_text, dest_content, None))
336}
337
338/// Emit an inline image node to the builder.
339/// Note: alt_text may contain inline elements and should be parsed recursively.
340pub fn emit_inline_image(
341    builder: &mut impl InlineSink,
342    _text: &str,
343    alt_text: &str,
344    dest: &str,
345    raw_attributes: Option<&str>,
346    config: &ParserOptions,
347    suppress_footnote_refs: bool,
348) {
349    builder.start_node(SyntaxKind::IMAGE_LINK.into());
350
351    // Opening ![
352    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
353    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
354    builder.finish_node();
355
356    // Alt text (recursively parse inline elements)
357    builder.start_node(SyntaxKind::IMAGE_ALT.into());
358    // Use the standalone parse_inline_text function for recursive parsing
359    // Note: nested contexts don't resolve references
360    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
361    builder.finish_node();
362
363    // Closing ]
364    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
365
366    // Opening (
367    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
368
369    // Destination
370    builder.start_node(SyntaxKind::LINK_DEST.into());
371    builder.token(SyntaxKind::TEXT.into(), dest);
372    builder.finish_node();
373
374    // Closing )
375    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
376
377    // Emit raw attributes if present (preserve original formatting)
378    if let Some(raw_attrs) = raw_attributes {
379        emit_attribute_node(builder, raw_attrs);
380    }
381
382    builder.finish_node();
383}
384
385/// Try to parse an automatic link starting at the current position.
386///
387/// Automatic links have the form `<url>` (URI autolink) or `<email>`
388/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
389/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
390/// ASCII chars; email local parts cannot contain backslashes). Pandoc
391/// markdown is laxer — it accepts Unicode in email addresses, for
392/// example — so non-CommonMark callers fall back to the heuristic
393/// "contains `:` or `@`" check that the parser used historically.
394pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
395    if !text.starts_with('<') {
396        return None;
397    }
398
399    let close_pos = text[1..].find('>')?;
400    let content = &text[1..1 + close_pos];
401
402    if content.is_empty() {
403        return None;
404    }
405    if content.contains(|c: char| c.is_whitespace()) {
406        return None;
407    }
408
409    if is_commonmark {
410        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
411            return None;
412        }
413    } else if !content.contains(':') && !content.contains('@') {
414        return None;
415    }
416
417    Some((close_pos + 2, content))
418}
419
/// CommonMark §6.4 URI autolink check.
///
/// `s` is valid when it consists of:
/// - a scheme of 2-32 characters: an ASCII letter followed by characters
///   from `[a-zA-Z0-9+.-]`;
/// - a `:`;
/// - a URI body of zero or more bytes, none of which is an ASCII control
///   character, a space, `<`, or `>`.
///
/// Bytes at or above `0x80` (non-ASCII UTF-8) are allowed in the body.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    if !bytes.first().is_some_and(u8::is_ascii_alphabetic) {
        return false;
    }

    // Length of the scheme: the leading letter plus every following
    // scheme character.
    let scheme_len = 1 + bytes[1..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
        .count();
    if !(2..=32).contains(&scheme_len) || bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `b > 0x20` rejects the C0 control characters *and* the space itself;
    // 0x7f is DEL.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
450
451/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
452/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
453///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
454fn is_valid_email_autolink(s: &str) -> bool {
455    let Some(at) = s.find('@') else {
456        return false;
457    };
458    let local = &s[..at];
459    let domain = &s[at + 1..];
460    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
461        return false;
462    }
463    if domain.is_empty() {
464        return false;
465    }
466    domain.split('.').all(is_valid_email_label)
467}
468
/// Whether `b` may appear in the local part (before `@`) of an email
/// autolink: an ASCII letter or digit, or one of ``.!#$%&'*+/=?^_`{|}~-``.
fn is_email_local_byte(b: u8) -> bool {
    const PUNCTUATION: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || PUNCTUATION.contains(&b)
}
497
/// Whether `label` is a valid dot-separated component of an email
/// autolink's domain: 1-63 bytes, starting and ending with an ASCII letter
/// or digit, with only ASCII letters, digits, or `-` in between.
///
/// A one-byte label is handled as its own case: it has no interior, so it
/// only needs to be alphanumeric. (Slicing `bytes[1..len - 1]` on a single
/// byte would ask for the range `1..0` and panic.)
fn is_valid_email_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    if bytes.len() > 63 {
        return false;
    }
    match bytes {
        [] => false,
        [only] => only.is_ascii_alphanumeric(),
        [first, interior @ .., last] => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                && interior
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
        }
    }
}
513
514/// Emit an automatic link node to the builder.
515pub fn emit_autolink(builder: &mut impl InlineSink, _text: &str, url: &str) {
516    builder.start_node(SyntaxKind::AUTO_LINK.into());
517
518    // Opening <
519    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
520    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
521    builder.finish_node();
522
523    // URL content
524    builder.token(SyntaxKind::TEXT.into(), url);
525
526    // Closing >
527    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
528    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
529    builder.finish_node();
530
531    builder.finish_node();
532}
533
534// Recognized URI schemes for pandoc's `autolink_bare_uris` extension.
535// Generated at build time by `build.rs`
536// from the vendored IANA registry plus pandoc's nonstandard additions,
537// as a sorted `const BARE_URI_SCHEMES: &[&str]`.
538// A `scheme:` prefix outside this set stays literal.
539include!(concat!(env!("OUT_DIR"), "/uri_schemes.rs"));
540
541/// Returns `true` if `scheme` (matched case-insensitively) is a recognized URI scheme.
542/// See [`BARE_URI_SCHEMES`].
543fn is_known_bare_uri_scheme(scheme: &str) -> bool {
544    let lower = scheme.to_ascii_lowercase();
545    BARE_URI_SCHEMES.binary_search(&lower.as_str()).is_ok()
546}
547
548pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
549    let mut chars = text.char_indices();
550    let (_, first) = chars.next()?;
551    if !first.is_ascii_alphabetic() {
552        return None;
553    }
554
555    let mut scheme_end = None;
556    for (idx, ch) in text.char_indices() {
557        if ch == ':' {
558            scheme_end = Some(idx);
559            break;
560        }
561        if !ch.is_ascii_alphanumeric() && ch != '+' && ch != '-' && ch != '.' {
562            return None;
563        }
564    }
565    let scheme_end = scheme_end?;
566    if scheme_end == 0 {
567        return None;
568    }
569
570    if !is_known_bare_uri_scheme(&text[..scheme_end]) {
571        return None;
572    }
573
574    let mut end = scheme_end + 1;
575    let bytes = text.as_bytes();
576    while end < text.len() {
577        let b = bytes[end];
578        if b.is_ascii_whitespace() {
579            break;
580        }
581        if matches!(b, b'<' | b'>' | b'`' | b'"' | b'\'') {
582            break;
583        }
584        end += 1;
585    }
586
587    if end == scheme_end + 1 {
588        return None;
589    }
590
591    let mut trimmed = end;
592    while trimmed > scheme_end + 1 {
593        let ch = text[..trimmed].chars().last().unwrap();
594        if matches!(ch, '.' | ',' | ';' | ':' | ')' | ']' | '}') {
595            trimmed -= ch.len_utf8();
596        } else {
597            break;
598        }
599    }
600
601    if trimmed <= scheme_end + 1 {
602        return None;
603    }
604
605    // If trimming terminal punctuation leaves a dangling backslash, the match
606    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
607    if text[..trimmed].ends_with('\\') {
608        return None;
609    }
610
611    Some((trimmed, &text[..trimmed]))
612}
613
614/// Try to parse an inline link starting at the current position.
615///
616/// Inline links have the form `[text](url)` or `[text](url "title")`.
617/// Can also have trailing attributes: `[text](url){#id .class}`.
618/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
619///
620/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
621/// the bare destination form may not contain spaces or ASCII control
622/// characters and must have balanced parentheses; if a title follows it
623/// must be properly delimited; only whitespace is allowed before/after.
624/// Pandoc-markdown is more permissive, so leave this off for that dialect.
625pub fn try_parse_inline_link(
626    text: &str,
627    strict_dest: bool,
628    ctx: LinkScanContext,
629) -> Option<(usize, &str, &str, Option<&str>)> {
630    if !text.starts_with('[') {
631        return None;
632    }
633
634    // Find the closing ]
635    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
636    let link_text = &text[1..close_bracket];
637
638    // Check for immediate ( after ]
639    let after_bracket = close_bracket + 1;
640    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
641        return None;
642    }
643
644    // Find closing ) for destination
645    let dest_start = after_bracket + 1;
646    let remaining = &text[dest_start..];
647
648    let close_paren = find_dest_close_paren(remaining)?;
649    let dest_content = &remaining[..close_paren];
650
651    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
652        return None;
653    }
654
655    // CommonMark §6.4: outer link is rejected when its text contains a valid
656    // inner inline link or image, so the inner-most definition wins.
657    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
658        return None;
659    }
660
661    // Check for trailing attributes {#id .class key=value}
662    let after_paren = dest_start + close_paren + 1;
663    let after_close = &text[after_paren..];
664
665    // Attributes must start immediately after closing paren (no whitespace/newlines)
666    if after_close.starts_with('{') {
667        // Find the closing brace
668        if let Some(close_brace_pos) = after_close.find('}') {
669            let attr_text = &after_close[..=close_brace_pos];
670            // Try to parse as attributes to validate
671            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
672                let total_len = after_paren + close_brace_pos + 1;
673                // Return raw attribute string for lossless parsing
674                let raw_attrs = attr_text;
675                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
676            }
677        }
678    }
679
680    // No attributes, just return the link
681    let total_len = after_paren;
682    Some((total_len, link_text, dest_content, None))
683}
684
685/// CommonMark §6.4 destination + optional title validation. The text passed
686/// in is whatever the parser captured between `(` and `)`. A valid form is:
687/// `[ws] destination [ws title [ws]]` where:
688/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
689///   parentheses (escaped parens permitted);
690/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
691/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
692/// - any text outside that structure invalidates the link.
693fn dest_and_title_ok_commonmark(content: &str) -> bool {
694    let trimmed = trim_start_link_ws(content);
695    if trimmed.is_empty() {
696        return true;
697    }
698
699    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
700        let mut escape = false;
701        let mut end_byte = None;
702        for (i, c) in rest.char_indices() {
703            if escape {
704                escape = false;
705                continue;
706            }
707            match c {
708                '\\' => escape = true,
709                '\n' | '<' => return false,
710                '>' => {
711                    end_byte = Some(i);
712                    break;
713                }
714                _ => {}
715            }
716        }
717        match end_byte {
718            Some(e) => &rest[e + 1..],
719            None => return false,
720        }
721    } else {
722        let mut escape = false;
723        let mut depth: i32 = 0;
724        let mut end = trimmed.len();
725        for (i, c) in trimmed.char_indices() {
726            if escape {
727                escape = false;
728                continue;
729            }
730            match c {
731                '\\' => escape = true,
732                ' ' | '\t' | '\n' => {
733                    end = i;
734                    break;
735                }
736                _ if c.is_ascii_control() => return false,
737                '(' => depth += 1,
738                ')' => {
739                    if depth == 0 {
740                        end = i;
741                        break;
742                    }
743                    depth -= 1;
744                }
745                _ => {}
746            }
747        }
748        if depth != 0 {
749            return false;
750        }
751        if end == 0 {
752            // bare destination must be nonempty if the field is non-blank
753            return false;
754        }
755        &trimmed[end..]
756    };
757
758    let after_dest = trim_start_link_ws(after_dest);
759    if after_dest.is_empty() {
760        return true;
761    }
762
763    let bytes = after_dest.as_bytes();
764    let close = match bytes[0] {
765        b'"' => b'"',
766        b'\'' => b'\'',
767        b'(' => b')',
768        _ => return false,
769    };
770    let opens_paren = bytes[0] == b'(';
771    let mut escape = false;
772    let mut title_close_pos = None;
773    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
774        if escape {
775            escape = false;
776            continue;
777        }
778        if b == b'\\' {
779            escape = true;
780            continue;
781        }
782        if opens_paren && b == b'(' {
783            return false;
784        }
785        if b == close {
786            title_close_pos = Some(i);
787            break;
788        }
789    }
790    let close_idx = match title_close_pos {
791        Some(p) => p,
792        None => return false,
793    };
794
795    let after_title = &after_dest[close_idx + 1..];
796    is_link_ws_only(after_title)
797}
798
/// Strip leading ASCII space/tab/newline bytes. Byte-level equivalent of
/// `s.trim_start_matches([' ', '\t', '\n'])`; called for every
/// CommonMark inline-link destination/title scan, so the slice-pattern
/// MultiCharEqSearcher overhead matters.
///
/// Implemented without `unsafe`: only single-byte ASCII characters are
/// skipped, so the resulting offset is always a character boundary and the
/// ordinary checked slice cannot panic. The check is a single O(1) test.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let skipped = s
        .bytes()
        .take_while(|&b| b == b' ' || b == b'\t' || b == b'\n')
        .count();
    &s[skipped..]
}
818
/// Whether `s` consists solely of ASCII space, tab, and newline bytes.
/// The empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|b| matches!(b, b' ' | b'\t' | b'\n'))
}
825
826/// Emit an inline link node to the builder.
827/// Note: link_text may contain inline elements and should be parsed recursively.
828pub fn emit_inline_link(
829    builder: &mut impl InlineSink,
830    _text: &str,
831    link_text: &str,
832    dest: &str,
833    raw_attributes: Option<&str>,
834    config: &ParserOptions,
835    suppress_footnote_refs: bool,
836) {
837    builder.start_node(SyntaxKind::LINK.into());
838
839    // Opening [
840    builder.start_node(SyntaxKind::LINK_START.into());
841    builder.token(SyntaxKind::LINK_START.into(), "[");
842    builder.finish_node();
843
844    // Link text (recursively parse inline elements). Pandoc-native:
845    // links cannot contain other links, so suppress inner LINK / ref-link
846    // recognition during the recursion. Images, emphasis, code, etc. are
847    // still recognised. CommonMark relies on outer-level process_brackets
848    // to prevent nested links, but the flag is harmless under CM.
849    builder.start_node(SyntaxKind::LINK_TEXT.into());
850    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
851    builder.finish_node();
852
853    // Closing ]
854    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
855
856    // Opening (
857    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
858
859    // Destination
860    builder.start_node(SyntaxKind::LINK_DEST.into());
861    builder.token(SyntaxKind::TEXT.into(), dest);
862    builder.finish_node();
863
864    // Closing )
865    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
866
867    // Emit raw attributes if present (preserve original formatting)
868    if let Some(raw_attrs) = raw_attributes {
869        emit_attribute_node(builder, raw_attrs);
870    }
871
872    builder.finish_node();
873}
874
875/// Emit a bare-URI autolink (pandoc's `autolink_bare_uris`).
876///
877/// A bare URI like `https://example.com` carries no syntactic markers in the
878/// source, so the CST must contain exactly its bytes — nothing else. We emit it
879/// as a marker-less [`AUTO_LINK`](SyntaxKind::AUTO_LINK) holding a single `TEXT`
880/// token: lossless, and a faithful structural sibling of the angle-bracket
881/// autolink (`<url>`), which the same node represents with `AUTO_LINK_MARKER`
882/// tokens around the text. Downstream (formatter, pandoc AST, HTML renderer)
883/// derives the destination from the `TEXT` token and re-emits markers verbatim,
884/// so a bare URI round-trips to `url` while `<url>` round-trips to `<url>`.
885///
886/// Emitting a `LINK` with fabricated `[`/`]`/`(`/`)` tokens (the previous
887/// approach) duplicated the URL and inflated the node's text range, breaking
888/// losslessness and desyncing every byte offset after the URI.
889pub fn emit_bare_uri_link(builder: &mut impl InlineSink, uri: &str, _config: &ParserOptions) {
890    builder.start_node(SyntaxKind::AUTO_LINK.into());
891    builder.token(SyntaxKind::TEXT.into(), uri);
892    builder.finish_node();
893}
894
895/// Try to parse a reference link starting at the current position.
896///
897/// Reference links have three forms:
898/// - Explicit: `[text][label]`
899/// - Implicit: `[text][]` (label = text)
900/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
901///
902/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
903/// The label is what should be looked up in the registry.
904pub fn try_parse_reference_link(
905    text: &str,
906    allow_shortcut: bool,
907    inline_link_attempted: bool,
908    allow_spaced: bool,
909    ctx: LinkScanContext,
910) -> Option<(usize, &str, String, &str, bool)> {
911    if !text.starts_with('[') {
912        return None;
913    }
914
915    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
916    if text.len() > 1 {
917        let bytes = text.as_bytes();
918        if bytes[1] == b'@' {
919            return None;
920        }
921        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
922            return None;
923        }
924    }
925
926    // Find the closing ] for the text. Uses the shared helper so that a
927    // `]` inside a code span doesn't terminate the link text (CommonMark
928    // §6 — code spans bind tighter than links). See spec examples #342
929    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
930    // opaque per `ctx`.
931    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
932    let link_text = &text[1..close_bracket];
933
934    // CommonMark §6.4: outer reference link is rejected when its text contains
935    // a valid inner inline link/image (spec example #532). Reference-link
936    // nesting (#533/#569/#571) is not handled here; it requires resolving
937    // labels against the document refdef map.
938    if ctx.disallow_inner_links
939        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
940    {
941        return None;
942    }
943
944    // Check what follows the ]
945    let after_bracket = close_bracket + 1;
946
947    // `[content]{...}` is reserved for bracketed spans / attribute
948    // trailers, never a shortcut.
949    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
950        return None;
951    }
952
953    // `[text](...)` is the inline-link shape. CommonMark spec example
954    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
955    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
956    // the upstream inline-link parse was rejected by `strict_dest`. We
957    // only fall through to shortcut here when the caller has already
958    // tried the inline-link form (`inline_link_attempted`) — otherwise
959    // disabling the `inline_links` extension would silently let
960    // `[text](url)` become a shortcut + literal text, which the
961    // `inline_links_disabled_keeps_inline_link_literal` test guards
962    // against.
963    if after_bracket < text.len()
964        && text[after_bracket..].starts_with('(')
965        && (!allow_shortcut || !inline_link_attempted)
966    {
967        return None;
968    }
969
970    // Pandoc `spaced_reference_links`: allow whitespace (space, tab, and a
971    // single LF — block parsing already enforces blank-line boundaries) between
972    // the link-text `]` and the label `[`. Without the extension, gap stays
973    // empty and the next byte must be `[` directly.
974    let gap_end = if allow_spaced {
975        let bytes = text.as_bytes();
976        let mut p = after_bracket;
977        let mut saw_newline = false;
978        while p < bytes.len() {
979            match bytes[p] {
980                b' ' | b'\t' => p += 1,
981                b'\n' if !saw_newline => {
982                    saw_newline = true;
983                    p += 1;
984                }
985                _ => break,
986            }
987        }
988        p
989    } else {
990        after_bracket
991    };
992    let gap = &text[after_bracket..gap_end];
993
994    // Check for explicit reference [text][label] or implicit [text][]
995    if gap_end < text.len() && text[gap_end..].starts_with('[') {
996        // Find the closing ] for the label
997        let label_start = gap_end + 1;
998        let mut label_end = None;
999
1000        for (i, ch) in text[label_start..].char_indices() {
1001            if ch == ']' {
1002                label_end = Some(i + label_start);
1003                break;
1004            }
1005            // Labels can't contain newlines
1006            if ch == '\n' {
1007                return None;
1008            }
1009        }
1010
1011        let label_end = label_end?;
1012        let label = &text[label_start..label_end];
1013
1014        // Total length includes both bracket pairs (and any gap between them)
1015        let total_len = label_end + 1;
1016
1017        // Implicit reference: empty label means emit [text][]
1018        if label.is_empty() {
1019            return Some((total_len, link_text, String::new(), gap, false));
1020        }
1021
1022        // Explicit reference: use the provided label
1023        Some((total_len, link_text, label.to_string(), gap, false))
1024    } else if allow_shortcut {
1025        // Shortcut reference: [text] with no second bracket pair
1026        // The text is both the display text and the label. Any whitespace we
1027        // tentatively consumed for the spaced-form lookahead belongs to the
1028        // surrounding text, so we report the shortcut at its strict length.
1029        if link_text.is_empty() {
1030            return None;
1031        }
1032        Some((after_bracket, link_text, link_text.to_string(), "", true))
1033    } else {
1034        // No second bracket pair and shortcut not allowed - not a reference link
1035        None
1036    }
1037}
1038
1039/// Emit a reference link node to the builder.
1040/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
1041/// `gap` carries any whitespace consumed between the link-text `]` and the
1042/// label `[` under `spaced_reference_links`; empty otherwise.
1043pub fn emit_reference_link(
1044    builder: &mut impl InlineSink,
1045    link_text: &str,
1046    label: &str,
1047    gap: &str,
1048    is_shortcut: bool,
1049    config: &ParserOptions,
1050    suppress_footnote_refs: bool,
1051) {
1052    builder.start_node(SyntaxKind::LINK.into());
1053
1054    // Opening [
1055    builder.start_node(SyntaxKind::LINK_START.into());
1056    builder.token(SyntaxKind::LINK_START.into(), "[");
1057    builder.finish_node();
1058
1059    // Link text (recursively parse inline elements). Pandoc-native:
1060    // links cannot contain other links, so suppress inner LINK / ref-link
1061    // recognition during the recursion. Images, emphasis, code, etc. are
1062    // still recognised.
1063    builder.start_node(SyntaxKind::LINK_TEXT.into());
1064    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
1065    builder.finish_node();
1066
1067    // Closing ] and reference label
1068    builder.token(SyntaxKind::TEXT.into(), "]");
1069
1070    if !is_shortcut {
1071        // Explicit or implicit reference: [text][label] or [text][]
1072        emit_reference_link_gap(builder, gap);
1073        builder.token(SyntaxKind::TEXT.into(), "[");
1074        builder.start_node(SyntaxKind::LINK_REF.into());
1075        // For implicit references, label is empty and we emit [text][]
1076        // For explicit references, emit the label to get [text][label]
1077        if !label.is_empty() {
1078            builder.token(SyntaxKind::TEXT.into(), label);
1079        }
1080        builder.finish_node();
1081        builder.token(SyntaxKind::TEXT.into(), "]");
1082    }
1083    // For shortcut references, just [text] - no second bracket pair
1084
1085    builder.finish_node();
1086}
1087
1088/// Emit the whitespace gap between `]` and `[` of a spaced reference link,
1089/// preserving exact bytes by splitting into WHITESPACE / NEWLINE tokens.
1090fn emit_reference_link_gap(builder: &mut impl InlineSink, gap: &str) {
1091    if gap.is_empty() {
1092        return;
1093    }
1094    let bytes = gap.as_bytes();
1095    let mut start = 0;
1096    while start < bytes.len() {
1097        match bytes[start] {
1098            b'\r' => {
1099                let end = if start + 1 < bytes.len() && bytes[start + 1] == b'\n' {
1100                    start + 2
1101                } else {
1102                    start + 1
1103                };
1104                builder.token(SyntaxKind::NEWLINE.into(), &gap[start..end]);
1105                start = end;
1106            }
1107            b'\n' => {
1108                builder.token(SyntaxKind::NEWLINE.into(), &gap[start..start + 1]);
1109                start += 1;
1110            }
1111            _ => {
1112                let mut end = start + 1;
1113                while end < bytes.len() && !matches!(bytes[end], b'\r' | b'\n') {
1114                    end += 1;
1115                }
1116                builder.token(SyntaxKind::WHITESPACE.into(), &gap[start..end]);
1117                start = end;
1118            }
1119        }
1120    }
1121}
1122
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]` (explicit), `![alt][]` (implicit), or `![alt]` (shortcut).
///
/// # Parameters
/// - `text`: input that must begin with `![`.
/// - `allow_shortcut`: accept the bare `![alt]` form.
/// - `allow_spaced`: Pandoc `spaced_reference_links` — permit spaces, tabs
///   and at most one line ending (LF or CRLF) between `]` and `[`.
///
/// # Returns
/// `Some((total_len, alt_text, label, gap, is_shortcut))`:
/// - `total_len`: bytes consumed from `text`.
/// - `alt_text`: bytes between `![` and the matching `]` (nested brackets
///   allowed, backslash escapes skipped).
/// - `label`: the bracketed label verbatim; for the implicit and shortcut
///   forms, a copy of `alt_text`.
/// - `gap`: whitespace consumed between `]` and `[`; empty when
///   `allow_spaced` is off and always empty for shortcuts.
/// - `is_shortcut`: `true` only for the `![alt]` form.
///
/// Returns `None` when the brackets are unbalanced, the label is unterminated
/// or crosses a line ending, `![alt]` is directly followed by `(`, or no
/// bracketed label follows and shortcuts are disabled.
///
/// NOTE(review): because the implicit form reports `alt_text` as its label,
/// `![x][]` and `![x][x]` yield identical tuples; consumers that compare
/// `label` with `alt_text` cannot tell them apart — confirm against callers.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
    allow_spaced: bool,
) -> Option<(usize, &str, String, &str, bool)> {
    let bytes = text.as_bytes();
    if bytes.len() < 4 || !text.starts_with("![") {
        return None;
    }

    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1;

    // Find the end of the alt text (allowing nested brackets).
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped char
            _ => {}
        }
        pos += 1;
    }

    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` now sits just past the `]` that closed the alt text.
    let after_alt_close = pos;
    let alt_text = &text[alt_start..after_alt_close - 1];

    // Pandoc `spaced_reference_links` applies to reference images too: allow
    // whitespace (space, tab, one line ending) between `]` and `[`. CRLF is
    // treated as a single line ending so it behaves like LF.
    if allow_spaced {
        let mut saw_newline = false;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' if !saw_newline => {
                    saw_newline = true;
                    pos += 1;
                }
                b'\r' if !saw_newline && bytes.get(pos + 1) == Some(&b'\n') => {
                    saw_newline = true;
                    pos += 2;
                }
                _ => break,
            }
        }
    }
    let gap = &text[after_alt_close..pos];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if bytes.get(pos) == Some(&b'[') {
        let label_start = pos + 1;

        // The label ends at the first `]`; hitting a line ending or the end
        // of input first means this is not a reference.
        let label_len = bytes[label_start..]
            .iter()
            .position(|&b| matches!(b, b']' | b'\n' | b'\r'))?;
        let label_end = label_start + label_len;
        if bytes[label_end] != b']' {
            return None;
        }

        // Preserve the label's original bytes; an empty label (implicit
        // reference) falls back to the alt text.
        let label_text = &text[label_start..label_end];
        let label = if label_text.is_empty() {
            alt_text
        } else {
            label_text
        }
        .to_string();

        return Some((label_end + 1, alt_text, label, gap, false));
    }

    // Shortcut reference: `![alt]` (only if enabled). Any whitespace we
    // tentatively consumed past the alt-text `]` belongs to surrounding text,
    // so the shortcut is always reported at its strict length — including
    // when that whitespace runs to the end of the input.
    if allow_shortcut {
        // `![alt](` is the inline-image shape, not a reference.
        if bytes.get(after_alt_close) == Some(&b'(') {
            return None;
        }
        return Some((after_alt_close, alt_text, alt_text.to_string(), "", true));
    }

    None
}
1228
1229/// Emit a reference image node with registry lookup. `gap` carries whitespace
1230/// consumed between `]` and `[` under `spaced_reference_links`; empty otherwise.
1231pub fn emit_reference_image(
1232    builder: &mut impl InlineSink,
1233    alt_text: &str,
1234    label: &str,
1235    gap: &str,
1236    is_shortcut: bool,
1237    config: &ParserOptions,
1238    suppress_footnote_refs: bool,
1239) {
1240    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1241
1242    // Emit as reference image (preserve original syntax)
1243    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1244    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1245    builder.finish_node();
1246
1247    // Alt text (recursively parse inline elements)
1248    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1249    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
1250    builder.finish_node();
1251
1252    // Closing ] and reference label
1253    builder.token(SyntaxKind::TEXT.into(), "]");
1254
1255    if !is_shortcut {
1256        // Explicit or implicit reference: ![alt][label] or ![alt][]
1257        emit_reference_link_gap(builder, gap);
1258        builder.token(SyntaxKind::TEXT.into(), "[");
1259        builder.start_node(SyntaxKind::LINK_REF.into());
1260        // For implicit references, emit empty label (label == alt means implicit from parser)
1261        if label != alt_text {
1262            builder.token(SyntaxKind::TEXT.into(), label);
1263        }
1264        builder.finish_node();
1265        builder.token(SyntaxKind::TEXT.into(), "]");
1266    }
1267    // For shortcut references, just ![alt] - no second bracket pair
1268
1269    builder.finish_node();
1270}
1271
1272/// Emit an `UNRESOLVED_REFERENCE` node for a Pandoc bracket-shape
1273/// pattern whose label didn't resolve. The wrapper covers the original
1274/// bracket bytes; the inner text recurses through normal inline
1275/// parsing (with inner-link suppression so a stray inner inline link
1276/// doesn't reorder semantics relative to pandoc-native).
1277///
1278/// `source` is `text[start..end]` — the full bracket-shape pattern.
1279/// `text_content` is the inner text between the outer `[` and `]`
1280/// (the bytes used for inline recursion). `label_suffix` carries the
1281/// `[label]` / `[]` suffix bytes verbatim, or `None` for shortcut form.
1282pub fn emit_unresolved_reference(
1283    builder: &mut impl InlineSink,
1284    is_image: bool,
1285    text_content: &str,
1286    label_suffix: Option<&str>,
1287    config: &ParserOptions,
1288    suppress_footnote_refs: bool,
1289) {
1290    builder.start_node(SyntaxKind::UNRESOLVED_REFERENCE.into());
1291
1292    if is_image {
1293        builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1294        builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1295        builder.finish_node();
1296        builder.start_node(SyntaxKind::IMAGE_ALT.into());
1297        parse_inline_text(builder, text_content, config, false, suppress_footnote_refs);
1298        builder.finish_node();
1299    } else {
1300        builder.start_node(SyntaxKind::LINK_START.into());
1301        builder.token(SyntaxKind::LINK_START.into(), "[");
1302        builder.finish_node();
1303        builder.start_node(SyntaxKind::LINK_TEXT.into());
1304        parse_inline_text(builder, text_content, config, true, suppress_footnote_refs);
1305        builder.finish_node();
1306    }
1307
1308    builder.token(SyntaxKind::TEXT.into(), "]");
1309
1310    if let Some(suffix) = label_suffix {
1311        // suffix is either "[label]" or "[]"; preserve original bytes.
1312        // Split as `[` + LINK_REF(label) + `]` so wrapper accessors find
1313        // the label via `support::child::<LinkRef>()`.
1314        debug_assert!(suffix.starts_with('[') && suffix.ends_with(']'));
1315        builder.token(SyntaxKind::TEXT.into(), "[");
1316        let label = &suffix[1..suffix.len() - 1];
1317        builder.start_node(SyntaxKind::LINK_REF.into());
1318        if !label.is_empty() {
1319            builder.token(SyntaxKind::TEXT.into(), label);
1320        }
1321        builder.finish_node();
1322        builder.token(SyntaxKind::TEXT.into(), "]");
1323    }
1324
1325    builder.finish_node();
1326}
1327
1328#[cfg(test)]
1329mod tests {
1330    use super::*;
1331
1332    #[test]
1333    fn test_parse_autolink_url() {
1334        let input = "<https://example.com>";
1335        assert_eq!(
1336            try_parse_autolink(input, false),
1337            Some((21, "https://example.com"))
1338        );
1339        assert_eq!(
1340            try_parse_autolink(input, true),
1341            Some((21, "https://example.com"))
1342        );
1343    }
1344
1345    #[test]
1346    fn test_parse_autolink_email() {
1347        let input = "<user@example.com>";
1348        assert_eq!(
1349            try_parse_autolink(input, false),
1350            Some((18, "user@example.com"))
1351        );
1352        assert_eq!(
1353            try_parse_autolink(input, true),
1354            Some((18, "user@example.com"))
1355        );
1356    }
1357
1358    #[test]
1359    fn test_parse_autolink_no_close() {
1360        let input = "<https://example.com";
1361        assert_eq!(try_parse_autolink(input, false), None);
1362        assert_eq!(try_parse_autolink(input, true), None);
1363    }
1364
1365    #[test]
1366    fn test_parse_autolink_with_space() {
1367        let input = "<https://example.com >";
1368        assert_eq!(try_parse_autolink(input, false), None);
1369        assert_eq!(try_parse_autolink(input, true), None);
1370    }
1371
1372    #[test]
1373    fn test_parse_autolink_not_url_or_email() {
1374        let input = "<notaurl>";
1375        assert_eq!(try_parse_autolink(input, false), None);
1376        assert_eq!(try_parse_autolink(input, true), None);
1377    }
1378
1379    #[test]
1380    fn test_parse_autolink_commonmark_strict_scheme() {
1381        // Scheme too short (1 char) — invalid under CommonMark, lax-accepted
1382        // under Pandoc dialect (matches historical behavior).
1383        let input = "<m:abc>";
1384        assert_eq!(try_parse_autolink(input, true), None);
1385        assert_eq!(try_parse_autolink(input, false), Some((7, "m:abc")));
1386    }
1387
1388    #[test]
1389    fn test_parse_autolink_commonmark_email_disallows_backslash() {
1390        let input = "<foo\\+@bar.example.com>";
1391        assert_eq!(try_parse_autolink(input, true), None);
1392        assert_eq!(
1393            try_parse_autolink(input, false),
1394            Some((23, "foo\\+@bar.example.com"))
1395        );
1396    }
1397
1398    #[test]
1399    fn test_parse_inline_link_simple() {
1400        let input = "[text](url)";
1401        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1402        assert_eq!(result, Some((11, "text", "url", None)));
1403    }
1404
1405    #[test]
1406    fn test_parse_inline_link_with_title() {
1407        let input = r#"[text](url "title")"#;
1408        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1409        assert_eq!(result, Some((19, "text", r#"url "title""#, None)));
1410    }
1411
1412    #[test]
1413    fn test_parse_inline_link_with_nested_brackets() {
1414        let input = "[outer [inner] text](url)";
1415        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1416        assert_eq!(result, Some((25, "outer [inner] text", "url", None)));
1417    }
1418
1419    #[test]
1420    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
1421        let input = "[text] (url)";
1422        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1423        assert_eq!(result, None);
1424    }
1425
1426    #[test]
1427    fn test_parse_inline_link_no_closing_bracket() {
1428        let input = "[text(url)";
1429        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1430        assert_eq!(result, None);
1431    }
1432
1433    #[test]
1434    fn test_parse_inline_link_no_closing_paren() {
1435        let input = "[text](url";
1436        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1437        assert_eq!(result, None);
1438    }
1439
1440    #[test]
1441    fn test_parse_inline_link_escaped_bracket() {
1442        let input = r"[text\]more](url)";
1443        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1444        assert_eq!(result, Some((17, r"text\]more", "url", None)));
1445    }
1446
1447    #[test]
1448    fn test_parse_inline_link_parens_in_url() {
1449        let input = "[text](url(with)parens)";
1450        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1451        assert_eq!(result, Some((23, "text", "url(with)parens", None)));
1452    }
1453
1454    #[test]
1455    fn test_parse_inline_image_simple() {
1456        let input = "![alt](image.jpg)";
1457        let result = try_parse_inline_image(input, LinkScanContext::default());
1458        assert_eq!(result, Some((17, "alt", "image.jpg", None)));
1459    }
1460
1461    #[test]
1462    fn test_parse_inline_image_with_title() {
1463        let input = r#"![alt](image.jpg "A title")"#;
1464        let result = try_parse_inline_image(input, LinkScanContext::default());
1465        assert_eq!(result, Some((27, "alt", r#"image.jpg "A title""#, None)));
1466    }
1467
1468    #[test]
1469    fn test_parse_inline_image_with_nested_brackets() {
1470        let input = "![outer [inner] alt](image.jpg)";
1471        let result = try_parse_inline_image(input, LinkScanContext::default());
1472        assert_eq!(result, Some((31, "outer [inner] alt", "image.jpg", None)));
1473    }
1474
1475    #[test]
1476    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
1477        let input = r"a:\]";
1478        let result = try_parse_bare_uri(input);
1479        assert_eq!(result, None);
1480    }
1481
1482    #[test]
1483    fn test_parse_bare_uri_rejects_unknown_scheme() {
1484        assert_eq!(try_parse_bare_uri("Note:**"), None);
1485        assert_eq!(try_parse_bare_uri("Note:foo"), None);
1486        assert_eq!(try_parse_bare_uri("foo:bar"), None);
1487    }
1488
1489    #[test]
1490    fn test_parse_bare_uri_accepts_known_schemes() {
1491        assert_eq!(
1492            try_parse_bare_uri("http://example.com"),
1493            Some((18, "http://example.com"))
1494        );
1495        assert_eq!(
1496            try_parse_bare_uri("HTTPS://EXAMPLE.COM"),
1497            Some((19, "HTTPS://EXAMPLE.COM"))
1498        );
1499        assert_eq!(
1500            try_parse_bare_uri("mailto:a@b.com"),
1501            Some((14, "mailto:a@b.com"))
1502        );
1503        assert_eq!(try_parse_bare_uri("doi:10.1/x"), Some((10, "doi:10.1/x")));
1504    }
1505
1506    #[test]
1507    fn bare_uri_scheme_table_is_well_formed() {
1508        assert!(
1509            BARE_URI_SCHEMES.len() > 300,
1510            "only {} schemes",
1511            BARE_URI_SCHEMES.len()
1512        );
1513        assert!(BARE_URI_SCHEMES.windows(2).all(|w| w[0] < w[1]));
1514        for known in ["http", "https", "mailto", "ftp", "mongodb", "shttp"] {
1515            assert!(is_known_bare_uri_scheme(known), "missing scheme {known}");
1516        }
1517        for extra in ["doi", "gemini", "isbn", "pmid"] {
1518            assert!(is_known_bare_uri_scheme(extra), "missing scheme {extra}");
1519        }
1520        assert!(!is_known_bare_uri_scheme("note"));
1521    }
1522
1523    #[test]
1524    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
1525        let input = "![alt] (image.jpg)";
1526        let result = try_parse_inline_image(input, LinkScanContext::default());
1527        assert_eq!(result, None);
1528    }
1529
1530    #[test]
1531    fn test_parse_inline_image_no_closing_bracket() {
1532        let input = "![alt(image.jpg)";
1533        let result = try_parse_inline_image(input, LinkScanContext::default());
1534        assert_eq!(result, None);
1535    }
1536
1537    #[test]
1538    fn test_parse_inline_image_no_closing_paren() {
1539        let input = "![alt](image.jpg";
1540        let result = try_parse_inline_image(input, LinkScanContext::default());
1541        assert_eq!(result, None);
1542    }
1543
1544    #[test]
1545    fn test_parse_inline_image_with_simple_class() {
1546        let input = "![alt](img.png){.large}";
1547        let result = try_parse_inline_image(input, LinkScanContext::default());
1548        let (len, alt, dest, attrs) = result.unwrap();
1549        assert_eq!(len, 23);
1550        assert_eq!(alt, "alt");
1551        assert_eq!(dest, "img.png");
1552        assert!(attrs.is_some());
1553        let attrs = attrs.unwrap();
1554        assert_eq!(attrs, "{.large}");
1555    }
1556
1557    #[test]
1558    fn test_parse_inline_image_with_id() {
1559        let input = "![Figure 1](fig1.png){#fig-1}";
1560        let result = try_parse_inline_image(input, LinkScanContext::default());
1561        let (len, alt, dest, attrs) = result.unwrap();
1562        assert_eq!(len, 29);
1563        assert_eq!(alt, "Figure 1");
1564        assert_eq!(dest, "fig1.png");
1565        assert!(attrs.is_some());
1566        let attrs = attrs.unwrap();
1567        assert_eq!(attrs, "{#fig-1}");
1568    }
1569
1570    #[test]
1571    fn test_parse_inline_image_with_full_attributes() {
1572        let input = "![alt](img.png){#fig .large width=\"80%\"}";
1573        let result = try_parse_inline_image(input, LinkScanContext::default());
1574        let (len, alt, dest, attrs) = result.unwrap();
1575        assert_eq!(len, 40);
1576        assert_eq!(alt, "alt");
1577        assert_eq!(dest, "img.png");
1578        assert!(attrs.is_some());
1579        let attrs = attrs.unwrap();
1580        assert_eq!(attrs, "{#fig .large width=\"80%\"}");
1581    }
1582
1583    #[test]
1584    fn test_parse_inline_image_attributes_must_be_adjacent() {
1585        // Space between ) and { should not parse as attributes
1586        let input = "![alt](img.png) {.large}";
1587        let result = try_parse_inline_image(input, LinkScanContext::default());
1588        assert_eq!(result, Some((15, "alt", "img.png", None)));
1589    }
1590
1591    // Link attribute tests
1592    #[test]
1593    fn test_parse_inline_link_with_id() {
1594        let input = "[text](url){#link-1}";
1595        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1596        let (len, text, dest, attrs) = result.unwrap();
1597        assert_eq!(len, 20);
1598        assert_eq!(text, "text");
1599        assert_eq!(dest, "url");
1600        assert!(attrs.is_some());
1601        let attrs = attrs.unwrap();
1602        assert_eq!(attrs, "{#link-1}");
1603    }
1604
1605    #[test]
1606    fn test_parse_inline_link_with_full_attributes() {
1607        let input = "[text](url){#link .external target=\"_blank\"}";
1608        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1609        let (len, text, dest, attrs) = result.unwrap();
1610        assert_eq!(len, 44);
1611        assert_eq!(text, "text");
1612        assert_eq!(dest, "url");
1613        assert!(attrs.is_some());
1614        let attrs = attrs.unwrap();
1615        assert_eq!(attrs, "{#link .external target=\"_blank\"}");
1616    }
1617
1618    #[test]
1619    fn test_parse_inline_link_attributes_must_be_adjacent() {
1620        // Space between ) and { should not parse as attributes
1621        let input = "[text](url) {.class}";
1622        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1623        assert_eq!(result, Some((11, "text", "url", None)));
1624    }
1625
1626    #[test]
1627    fn test_parse_inline_link_with_title_and_attributes() {
1628        let input = r#"[text](url "title"){.external}"#;
1629        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1630        let (len, text, dest, attrs) = result.unwrap();
1631        assert_eq!(len, 30);
1632        assert_eq!(text, "text");
1633        assert_eq!(dest, r#"url "title""#);
1634        assert!(attrs.is_some());
1635        let attrs = attrs.unwrap();
1636        assert_eq!(attrs, "{.external}");
1637    }
1638
1639    // Reference link tests
1640    #[test]
1641    fn test_parse_reference_link_explicit() {
1642        let input = "[link text][label]";
1643        let result =
1644            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1645        assert_eq!(
1646            result,
1647            Some((18, "link text", "label".to_string(), "", false))
1648        );
1649    }
1650
1651    #[test]
1652    fn test_parse_reference_link_implicit() {
1653        let input = "[link text][]";
1654        let result =
1655            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1656        assert_eq!(result, Some((13, "link text", String::new(), "", false)));
1657    }
1658
1659    #[test]
1660    fn test_parse_reference_link_explicit_same_label_as_text() {
1661        let input = "[stack][stack]";
1662        let result =
1663            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1664        assert_eq!(result, Some((14, "stack", "stack".to_string(), "", false)));
1665    }
1666
1667    #[test]
1668    fn test_parse_reference_link_shortcut() {
1669        let input = "[link text] rest";
1670        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1671        assert_eq!(
1672            result,
1673            Some((11, "link text", "link text".to_string(), "", true))
1674        );
1675    }
1676
1677    #[test]
1678    fn test_parse_reference_link_shortcut_rejects_empty_label() {
1679        let input = "[] rest";
1680        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1681        assert_eq!(result, None);
1682    }
1683
1684    #[test]
1685    fn test_parse_reference_link_shortcut_disabled() {
1686        let input = "[link text] rest";
1687        let result =
1688            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1689        assert_eq!(result, None);
1690    }
1691
1692    #[test]
1693    fn test_parse_reference_link_not_inline_link() {
1694        // With shortcut disabled, `[text](url)` is rejected so the inline
1695        // link form upstream gets exclusive ownership.
1696        let input = "[text](url)";
1697        let result =
1698            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1699        assert_eq!(result, None);
1700    }
1701
1702    #[test]
1703    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
1704        // CommonMark spec example #568: when an inline-link attempt would
1705        // fail (here we model the reachability — the caller tries inline
1706        // link first; if that returns None, we should still see `[text]`
1707        // as a shortcut and leave `(url)` to be parsed as following text).
1708        let input = "[text](url)";
1709        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1710        assert_eq!(result, Some((6, "text", "text".to_string(), "", true)));
1711    }
1712
1713    #[test]
1714    fn test_parse_reference_link_with_nested_brackets() {
1715        let input = "[outer [inner] text][ref]";
1716        let result =
1717            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1718        assert_eq!(
1719            result,
1720            Some((25, "outer [inner] text", "ref".to_string(), "", false))
1721        );
1722    }
1723
1724    #[test]
1725    fn test_parse_reference_link_label_no_newline() {
1726        let input = "[text][label\nmore]";
1727        let result =
1728            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1729        assert_eq!(result, None);
1730    }
1731
1732    #[test]
1733    fn test_parse_reference_link_spaced_disabled() {
1734        // Without `spaced_reference_links`, a space between brackets blocks the
1735        // explicit form; shortcut takes over so `[foo]` matches at length 5.
1736        let input = "[foo] [bar]";
1737        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1738        assert_eq!(result, Some((5, "foo", "foo".to_string(), "", true)));
1739    }
1740
1741    #[test]
1742    fn test_parse_reference_link_spaced_single_space() {
1743        let input = "[foo] [bar]";
1744        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1745        assert_eq!(result, Some((11, "foo", "bar".to_string(), " ", false)));
1746    }
1747
1748    #[test]
1749    fn test_parse_reference_link_spaced_multiple_spaces_and_tab() {
1750        let input = "[foo]  \t[bar]";
1751        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1752        assert_eq!(result, Some((13, "foo", "bar".to_string(), "  \t", false)));
1753    }
1754
1755    #[test]
1756    fn test_parse_reference_link_spaced_newline() {
1757        let input = "[foo]\n[bar]";
1758        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1759        assert_eq!(result, Some((11, "foo", "bar".to_string(), "\n", false)));
1760    }
1761
1762    #[test]
1763    fn test_parse_reference_link_spaced_implicit() {
1764        // Pandoc: with the extension, `[foo] []` resolves to implicit `[foo][]`.
1765        let input = "[foo] []";
1766        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1767        assert_eq!(result, Some((8, "foo", String::new(), " ", false)));
1768    }
1769
1770    // Reference image tests
1771    #[test]
1772    fn test_parse_reference_image_explicit() {
1773        let input = "![alt text][label]";
1774        let result = try_parse_reference_image(input, false, false);
1775        assert_eq!(
1776            result,
1777            Some((18, "alt text", "label".to_string(), "", false))
1778        );
1779    }
1780
1781    #[test]
1782    fn test_parse_reference_image_implicit() {
1783        let input = "![alt text][]";
1784        let result = try_parse_reference_image(input, false, false);
1785        assert_eq!(
1786            result,
1787            Some((13, "alt text", "alt text".to_string(), "", false))
1788        );
1789    }
1790
1791    #[test]
1792    fn test_parse_reference_image_shortcut() {
1793        let input = "![alt text] rest";
1794        let result = try_parse_reference_image(input, true, false);
1795        assert_eq!(
1796            result,
1797            Some((11, "alt text", "alt text".to_string(), "", true))
1798        );
1799    }
1800
1801    #[test]
1802    fn test_parse_reference_image_shortcut_disabled() {
1803        let input = "![alt text] rest";
1804        let result = try_parse_reference_image(input, false, false);
1805        assert_eq!(result, None);
1806    }
1807
1808    #[test]
1809    fn test_parse_reference_image_not_inline() {
1810        // Should not match inline images with (url)
1811        let input = "![alt](url)";
1812        let result = try_parse_reference_image(input, true, false);
1813        assert_eq!(result, None);
1814    }
1815
1816    #[test]
1817    fn test_parse_reference_image_with_nested_brackets() {
1818        let input = "![alt [nested] text][ref]";
1819        let result = try_parse_reference_image(input, false, false);
1820        assert_eq!(
1821            result,
1822            Some((25, "alt [nested] text", "ref".to_string(), "", false))
1823        );
1824    }
1825
1826    #[test]
1827    fn test_parse_reference_image_spaced() {
1828        let input = "![alt] [ref]";
1829        let result = try_parse_reference_image(input, true, true);
1830        assert_eq!(result, Some((12, "alt", "ref".to_string(), " ", false)));
1831    }
1832
1833    #[test]
1834    fn test_reference_link_label_with_crlf() {
1835        // Reference link labels should not span lines with CRLF
1836        let input = "[foo\r\nbar]";
1837        let result =
1838            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1839
1840        // Should fail to parse because label contains line break
1841        assert_eq!(
1842            result, None,
1843            "Should not parse reference link with CRLF in label"
1844        );
1845    }
1846
1847    #[test]
1848    fn test_reference_link_label_with_lf() {
1849        // Reference link labels should not span lines with LF either
1850        let input = "[foo\nbar]";
1851        let result =
1852            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1853
1854        // Should fail to parse because label contains line break
1855        assert_eq!(
1856            result, None,
1857            "Should not parse reference link with LF in label"
1858        );
1859    }
1860
1861    // Multiline link text tests
1862    #[test]
1863    fn test_parse_inline_link_multiline_text() {
1864        // Per Pandoc spec, link text CAN contain newlines (soft breaks)
1865        let input = "[text on\nline two](url)";
1866        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1867        assert_eq!(
1868            result,
1869            Some((23, "text on\nline two", "url", None)),
1870            "Link text should allow newlines"
1871        );
1872    }
1873
1874    #[test]
1875    fn test_parse_inline_link_multiline_with_formatting() {
1876        // Link text with newlines and other inline elements
1877        let input =
1878            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
1879        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1880        assert!(result.is_some(), "Link text with newlines should parse");
1881        let (len, text, _dest, _attrs) = result.unwrap();
1882        assert!(text.contains('\n'), "Link text should preserve newline");
1883        assert_eq!(len, input.len());
1884    }
1885
1886    #[test]
1887    fn test_parse_inline_image_multiline_alt() {
1888        // Per Pandoc spec, image alt text CAN contain newlines
1889        let input = "![alt on\nline two](img.png)";
1890        let result = try_parse_inline_image(input, LinkScanContext::default());
1891        assert_eq!(
1892            result,
1893            Some((27, "alt on\nline two", "img.png", None)),
1894            "Image alt text should allow newlines"
1895        );
1896    }
1897
1898    #[test]
1899    fn test_parse_inline_image_multiline_with_attributes() {
1900        // Image with multiline alt text and attributes
1901        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
1902        let result = try_parse_inline_image(input, LinkScanContext::default());
1903        assert!(
1904            result.is_some(),
1905            "Image alt with newlines and attributes should parse"
1906        );
1907        let (len, alt, dest, attrs) = result.unwrap();
1908        assert!(alt.contains('\n'), "Alt text should preserve newline");
1909        assert_eq!(dest, "../images/fig.png");
1910        assert_eq!(attrs, Some("{width=70%}"));
1911        assert_eq!(len, input.len());
1912    }
1913
1914    #[test]
1915    fn test_parse_inline_link_with_attributes_after_newline() {
1916        // Test for regression: when text is concatenated with newlines,
1917        // attributes after ) should still be recognized
1918        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
1919        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1920        assert!(
1921            result.is_some(),
1922            "Link with attributes should parse even with following text"
1923        );
1924        let (len, text, dest, attrs) = result.unwrap();
1925        assert_eq!(text, "A network graph.");
1926        assert_eq!(dest, "../images/networkfig.png");
1927        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
1928        assert_eq!(
1929            len, 55,
1930            "Length should include attributes (up to closing brace)"
1931        );
1932    }
1933}