Skip to main content

panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use crate::options::ParserOptions;
16use crate::syntax::SyntaxKind;
17use rowan::GreenNodeBuilder;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::try_parse_trailing_attributes;
21
/// Flags that control which inline spans the link-bracket scanner treats as
/// opaque (so a `]` inside them does not terminate the link/image text).
///
/// - `skip_raw_html` is universal across dialects: pandoc-markdown and
///   CommonMark both refuse to close link text inside a raw HTML span (e.g.
///   `[foo <bar attr="](baz)">`), per CommonMark spec example #524 / #536.
/// - `skip_autolinks` is **CommonMark-only**. Pandoc-markdown does *not*
///   treat `<scheme://...>` as opaque inside link text, so the same input
///   produces a different parse under each dialect (CommonMark spec example
///   #526 / #538). Always derive this from
///   `extensions.autolinks && dialect == Dialect::CommonMark`.
/// - `disallow_inner_links` is **CommonMark-only** structural rule (§6.4):
///   "Links may not contain other links, at any level of nesting." When the
///   candidate link/image text contains a valid inline link or image, the
///   outer match is rejected so the inner-most definition is used instead
///   (spec examples #518–#520, #532). Pandoc-markdown allows nested links,
///   so the flag is `false` there.
///
/// The struct is `Copy`, so it is passed by value through the scanner
/// helpers in this module.
#[derive(Clone, Copy)]
pub struct LinkScanContext {
    /// When `true`, the bracket scanner skips over a whole raw inline HTML
    /// span starting at `<` instead of looking for `]` inside it.
    pub skip_raw_html: bool,
    /// When `true`, the bracket scanner skips over a whole autolink span
    /// (`<...>`) before it considers raw HTML.
    pub skip_autolinks: bool,
    /// When `true`, a candidate inline link whose text already contains a
    /// valid inline link is rejected so the inner link wins.
    pub disallow_inner_links: bool,
    /// Dialect controlling which HTML constructs the raw-HTML opacity check
    /// recognizes. Pandoc-markdown excludes bare declarations and CDATA
    /// from its inline raw HTML grammar.
    pub dialect: crate::options::Dialect,
}
49
50impl Default for LinkScanContext {
51    fn default() -> Self {
52        Self {
53            skip_raw_html: false,
54            skip_autolinks: false,
55            disallow_inner_links: false,
56            dialect: crate::options::Dialect::Pandoc,
57        }
58    }
59}
60
61impl LinkScanContext {
62    pub fn from_options(config: &ParserOptions) -> Self {
63        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
64        Self {
65            skip_raw_html: config.extensions.raw_html,
66            skip_autolinks: config.extensions.autolinks && is_commonmark,
67            disallow_inner_links: is_commonmark,
68            dialect: config.dialect,
69        }
70    }
71}
72
73/// Find the closing `]` of a link/image text span, starting from `start`.
74///
75/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
76/// a backtick run starting a valid code span is encountered, the entire span
77/// (including any trailing attribute block) is skipped — per CommonMark §6
78/// precedence, code spans bind tighter than links/images, so a `]` *inside*
79/// a code span cannot terminate the link's text. The same opacity applies to
80/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
81/// Returns the byte offset of the closing `]` within `text`, or `None` if no
82/// unmatched `]` is reached.
83fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
84    let bytes = text.as_bytes();
85    let mut bracket_depth = 0;
86    let mut escape_next = false;
87    let mut i = start;
88
89    while i < bytes.len() {
90        let b = bytes[i];
91
92        if escape_next {
93            escape_next = false;
94            i += step(text, i);
95            continue;
96        }
97
98        match b {
99            b'\\' => {
100                escape_next = true;
101                i += 1;
102            }
103            b'`' => {
104                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
105                    i += len;
106                } else {
107                    i += 1;
108                }
109            }
110            b'<' => {
111                // Order matters: autolinks are the more specific `<...>`
112                // shape (URI/email between angle brackets), so try that
113                // before falling through to general inline raw HTML which
114                // would also match `<bar attr="...">`-style tags.
115                if ctx.skip_autolinks
116                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
117                {
118                    i += len;
119                } else if ctx.skip_raw_html
120                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
121                {
122                    i += len;
123                } else {
124                    i += 1;
125                }
126            }
127            b'[' => {
128                bracket_depth += 1;
129                i += 1;
130            }
131            b']' => {
132                if bracket_depth == 0 {
133                    return Some(i);
134                }
135                bracket_depth -= 1;
136                i += 1;
137            }
138            _ => i += step(text, i),
139        }
140    }
141    None
142}
143
144/// Find the closing `)` of a link/image destination, given the text *after*
145/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
146/// destinations (`<...>` may legitimately contain unbalanced parens — see
147/// spec example #499). Returns the byte offset of the closing `)` within the
148/// passed slice, or `None` if not found.
149fn find_dest_close_paren(remaining: &str) -> Option<usize> {
150    let bytes = remaining.as_bytes();
151    let mut paren_depth = 0;
152    let mut escape_next = false;
153    let mut in_quotes = false;
154    let mut in_angle = false;
155    let mut i = 0;
156
157    while i < bytes.len() {
158        let b = bytes[i];
159
160        if escape_next {
161            escape_next = false;
162            i += step(remaining, i);
163            continue;
164        }
165
166        match b {
167            b'\\' => {
168                escape_next = true;
169                i += 1;
170            }
171            b'<' if !in_quotes && !in_angle => {
172                in_angle = true;
173                i += 1;
174            }
175            b'>' if in_angle => {
176                in_angle = false;
177                i += 1;
178            }
179            b'"' if !in_angle => {
180                in_quotes = !in_quotes;
181                i += 1;
182            }
183            b'(' if !in_quotes && !in_angle => {
184                paren_depth += 1;
185                i += 1;
186            }
187            b')' if !in_quotes && !in_angle => {
188                if paren_depth == 0 {
189                    return Some(i);
190                }
191                paren_depth -= 1;
192                i += 1;
193            }
194            _ => i += step(remaining, i),
195        }
196    }
197    None
198}
199
/// Number of bytes occupied by the character that begins at byte offset `i`
/// of `s`, or `1` when `i` is the end of the string.
///
/// Lets index-driven scanners advance one whole character at a time so they
/// never land inside a multi-byte UTF-8 sequence. `i` must itself be a
/// character boundary no greater than `s.len()`; otherwise the slice panics.
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
206
207/// CommonMark §6.4: "Links may not contain other links, at any level of
208/// nesting. If multiple otherwise valid link definitions appear nested inside
209/// each other, the inner-most definition is used." This helper scans a
210/// candidate link text for any `[` that starts a valid inline link; when
211/// found, the outer link must be rejected so the inner-most wins (spec
212/// examples #518–#519, #532).
213///
214/// Images themselves do not count as inner links — a link can contain an
215/// image (#517, #531). A link *inside* an image's alt text, however, still
216/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
217/// the helper recurses into image alt text looking for inner links.
218///
219/// Reference-link nesting (#533, #569, #571) requires resolving labels
220/// against the document's reference-definition map, which the parser does
221/// not have at this point — those cases remain unhandled and need a later
222/// stack-based pass.
223fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
224    let bytes = text.as_bytes();
225    let mut i = 0;
226    let mut escape_next = false;
227    while i < bytes.len() {
228        let b = bytes[i];
229        if escape_next {
230            escape_next = false;
231            i += step(text, i);
232            continue;
233        }
234        match b {
235            b'\\' => {
236                escape_next = true;
237                i += 1;
238            }
239            b'`' => {
240                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
241                    i += len;
242                } else {
243                    i += 1;
244                }
245            }
246            b'<' => {
247                if ctx.skip_autolinks
248                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
249                {
250                    i += len;
251                } else if ctx.skip_raw_html
252                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
253                {
254                    i += len;
255                } else {
256                    i += 1;
257                }
258            }
259            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
260                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
261                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
262                        return true;
263                    }
264                    i += len;
265                } else {
266                    i += 2;
267                }
268            }
269            b'[' => {
270                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
271                    return true;
272                }
273                i += 1;
274            }
275            _ => i += step(text, i),
276        }
277    }
278    false
279}
280
281/// Try to parse an inline image starting at the current position.
282///
283/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
284/// Can also have trailing attributes: `![alt](url){#id .class}`.
285/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
286///
287/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
288/// see `LinkScanContext`.
289pub fn try_parse_inline_image(
290    text: &str,
291    ctx: LinkScanContext,
292) -> Option<(usize, &str, &str, Option<&str>)> {
293    if !text.starts_with("![") {
294        return None;
295    }
296
297    // Find the closing ]
298    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
299    let alt_text = &text[2..close_bracket];
300
301    // Check for immediate ( after ]
302    let after_bracket = close_bracket + 1;
303    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
304        return None;
305    }
306
307    // Find closing ) for destination (reuse same logic as links)
308    let dest_start = after_bracket + 1;
309    let remaining = &text[dest_start..];
310
311    let close_paren = find_dest_close_paren(remaining)?;
312    let dest_content = &remaining[..close_paren];
313
314    // Check for trailing attributes {#id .class key=value}
315    let after_paren = dest_start + close_paren + 1;
316    let after_close = &text[after_paren..];
317
318    // Attributes must start immediately after closing paren (no whitespace/newlines)
319    if after_close.starts_with('{') {
320        // Find the closing brace
321        if let Some(close_brace_pos) = after_close.find('}') {
322            let attr_text = &after_close[..=close_brace_pos];
323            // Try to parse as attributes to validate
324            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
325                let total_len = after_paren + close_brace_pos + 1;
326                // Return raw attribute string for lossless parsing
327                let raw_attrs = attr_text;
328                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
329            }
330        }
331    }
332
333    // No attributes, just return the image
334    let total_len = after_paren;
335    Some((total_len, alt_text, dest_content, None))
336}
337
338/// Emit an inline image node to the builder.
339/// Note: alt_text may contain inline elements and should be parsed recursively.
340pub fn emit_inline_image(
341    builder: &mut GreenNodeBuilder,
342    _text: &str,
343    alt_text: &str,
344    dest: &str,
345    raw_attributes: Option<&str>,
346    config: &ParserOptions,
347) {
348    builder.start_node(SyntaxKind::IMAGE_LINK.into());
349
350    // Opening ![
351    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
352    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
353    builder.finish_node();
354
355    // Alt text (recursively parse inline elements)
356    builder.start_node(SyntaxKind::IMAGE_ALT.into());
357    // Use the standalone parse_inline_text function for recursive parsing
358    // Note: nested contexts don't resolve references
359    parse_inline_text(builder, alt_text, config, false);
360    builder.finish_node();
361
362    // Closing ]
363    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
364
365    // Opening (
366    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
367
368    // Destination
369    builder.start_node(SyntaxKind::LINK_DEST.into());
370    builder.token(SyntaxKind::TEXT.into(), dest);
371    builder.finish_node();
372
373    // Closing )
374    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
375
376    // Emit raw attributes if present (preserve original formatting)
377    if let Some(raw_attrs) = raw_attributes {
378        builder.start_node(SyntaxKind::ATTRIBUTE.into());
379        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
380        builder.finish_node();
381    }
382
383    builder.finish_node();
384}
385
386/// Try to parse an automatic link starting at the current position.
387///
388/// Automatic links have the form `<url>` (URI autolink) or `<email>`
389/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
390/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
391/// ASCII chars; email local parts cannot contain backslashes). Pandoc
392/// markdown is laxer — it accepts Unicode in email addresses, for
393/// example — so non-CommonMark callers fall back to the heuristic
394/// "contains `:` or `@`" check that the parser used historically.
395pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
396    if !text.starts_with('<') {
397        return None;
398    }
399
400    let close_pos = text[1..].find('>')?;
401    let content = &text[1..1 + close_pos];
402
403    if content.is_empty() {
404        return None;
405    }
406    if content.contains(|c: char| c.is_whitespace()) {
407        return None;
408    }
409
410    if is_commonmark {
411        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
412            return None;
413        }
414    } else if !content.contains(':') && !content.contains('@') {
415        return None;
416    }
417
418    Some((close_pos + 2, content))
419}
420
/// CommonMark URI autolink check for the text between `<` and `>`.
///
/// The text must be `scheme ":" body` where
/// - `scheme` is 2–32 bytes long, starts with an ASCII letter and continues
///   with ASCII letters, digits, `+`, `-` or `.`;
/// - `body` (possibly empty) contains no ASCII control character, no space,
///   and neither `<` nor `>`. Non-ASCII bytes are allowed.
///
/// Returns `true` when `s` matches that grammar.
fn is_valid_uri_autolink(s: &str) -> bool {
    // The scheme ends at the first `:`; any other non-scheme character
    // before it makes the scheme check below fail.
    let (scheme, body) = match s.split_once(':') {
        Some(parts) => parts,
        None => return false,
    };

    let scheme_ok = (2..=32).contains(&scheme.len())
        && scheme.as_bytes()[0].is_ascii_alphabetic()
        && scheme
            .bytes()
            .skip(1)
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'));

    // `<= 0x20` rejects the space character as well as the C0 controls,
    // matching the documented grammar.
    scheme_ok
        && body
            .bytes()
            .all(|b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
451
452/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
453/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
454///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
455fn is_valid_email_autolink(s: &str) -> bool {
456    let Some(at) = s.find('@') else {
457        return false;
458    };
459    let local = &s[..at];
460    let domain = &s[at + 1..];
461    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
462        return false;
463    }
464    if domain.is_empty() {
465        return false;
466    }
467    domain.split('.').all(is_valid_email_label)
468}
469
/// Whether `b` may appear in the local part (before the `@`) of an email
/// autolink: an ASCII letter or digit, or one of
/// ``. ! # $ % & ' * + / = ? ^ _ ` { | } ~ -``.
fn is_email_local_byte(b: u8) -> bool {
    const PUNCTUATION: &[u8] = b".!#$%&'*+/=?^_`{|}~-";

    b.is_ascii_alphanumeric() || PUNCTUATION.contains(&b)
}
498
/// Whether `label` is a valid domain label of an email autolink.
///
/// A label is 1–63 bytes long, begins and ends with an ASCII letter or
/// digit, and may contain `-` only between those two end characters. A
/// one-character label is valid when that character is a letter or digit.
fn is_valid_email_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    if bytes.len() > 63 {
        return false;
    }

    // Slice patterns cover the empty and one-byte cases explicitly, so no
    // range is ever built with its start past its end.
    match bytes {
        [] => false,
        [only] => only.is_ascii_alphanumeric(),
        [first, middle @ .., last] => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                && middle
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
        }
    }
}
514
515/// Emit an automatic link node to the builder.
516pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
517    builder.start_node(SyntaxKind::AUTO_LINK.into());
518
519    // Opening <
520    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
521    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
522    builder.finish_node();
523
524    // URL content
525    builder.token(SyntaxKind::TEXT.into(), url);
526
527    // Closing >
528    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
529    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
530    builder.finish_node();
531
532    builder.finish_node();
533}
534
/// Recognise a bare (unbracketed) URI at the start of `text`.
///
/// The URI is `scheme ":" body`:
/// - the scheme starts with an ASCII letter and continues with ASCII
///   letters, digits, `+`, `-` or `.` up to the first `:`;
/// - the body runs until ASCII whitespace or one of `<`, `>`, `` ` ``, `"`,
///   `'`, and must not be empty;
/// - trailing `.`, `,`, `;`, `:`, `)`, `]`, `}` are then dropped from the
///   body, which must still be non-empty afterwards.
///
/// If what remains ends in a backslash, the trimmed punctuation was escaped
/// (e.g. `a:\]`), so the text stays literal and `None` is returned.
///
/// Returns `(length, uri)` where `uri` is the first `length` bytes of `text`.
pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
    if !text.chars().next()?.is_ascii_alphabetic() {
        return None;
    }

    // The first character outside the scheme alphabet has to be the colon.
    let is_scheme_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.');
    let colon = text.find(|c: char| !is_scheme_char(c))?;
    if text.as_bytes()[colon] != b':' {
        return None;
    }

    let body_start = colon + 1;
    let tail = &text.as_bytes()[body_start..];
    let body_len = tail
        .iter()
        .position(|&b| b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'`' | b'"' | b'\''))
        .unwrap_or(tail.len());

    // Terminal punctuation belongs to the surrounding prose, not the URI.
    let body = text[body_start..body_start + body_len]
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | ':' | ')' | ']' | '}'));

    if body.is_empty() || body.ends_with('\\') {
        return None;
    }

    let end = body_start + body.len();
    Some((end, &text[..end]))
}
596
597/// Try to parse an inline link starting at the current position.
598///
599/// Inline links have the form `[text](url)` or `[text](url "title")`.
600/// Can also have trailing attributes: `[text](url){#id .class}`.
601/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
602///
603/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
604/// the bare destination form may not contain spaces or ASCII control
605/// characters and must have balanced parentheses; if a title follows it
606/// must be properly delimited; only whitespace is allowed before/after.
607/// Pandoc-markdown is more permissive, so leave this off for that dialect.
608pub fn try_parse_inline_link(
609    text: &str,
610    strict_dest: bool,
611    ctx: LinkScanContext,
612) -> Option<(usize, &str, &str, Option<&str>)> {
613    if !text.starts_with('[') {
614        return None;
615    }
616
617    // Find the closing ]
618    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
619    let link_text = &text[1..close_bracket];
620
621    // Check for immediate ( after ]
622    let after_bracket = close_bracket + 1;
623    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
624        return None;
625    }
626
627    // Find closing ) for destination
628    let dest_start = after_bracket + 1;
629    let remaining = &text[dest_start..];
630
631    let close_paren = find_dest_close_paren(remaining)?;
632    let dest_content = &remaining[..close_paren];
633
634    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
635        return None;
636    }
637
638    // CommonMark §6.4: outer link is rejected when its text contains a valid
639    // inner inline link or image, so the inner-most definition wins.
640    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
641        return None;
642    }
643
644    // Check for trailing attributes {#id .class key=value}
645    let after_paren = dest_start + close_paren + 1;
646    let after_close = &text[after_paren..];
647
648    // Attributes must start immediately after closing paren (no whitespace/newlines)
649    if after_close.starts_with('{') {
650        // Find the closing brace
651        if let Some(close_brace_pos) = after_close.find('}') {
652            let attr_text = &after_close[..=close_brace_pos];
653            // Try to parse as attributes to validate
654            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
655                let total_len = after_paren + close_brace_pos + 1;
656                // Return raw attribute string for lossless parsing
657                let raw_attrs = attr_text;
658                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
659            }
660        }
661    }
662
663    // No attributes, just return the link
664    let total_len = after_paren;
665    Some((total_len, link_text, dest_content, None))
666}
667
668/// CommonMark §6.4 destination + optional title validation. The text passed
669/// in is whatever the parser captured between `(` and `)`. A valid form is:
670/// `[ws] destination [ws title [ws]]` where:
671/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
672///   parentheses (escaped parens permitted);
673/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
674/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
675/// - any text outside that structure invalidates the link.
676fn dest_and_title_ok_commonmark(content: &str) -> bool {
677    let trimmed = trim_start_link_ws(content);
678    if trimmed.is_empty() {
679        return true;
680    }
681
682    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
683        let mut escape = false;
684        let mut end_byte = None;
685        for (i, c) in rest.char_indices() {
686            if escape {
687                escape = false;
688                continue;
689            }
690            match c {
691                '\\' => escape = true,
692                '\n' | '<' => return false,
693                '>' => {
694                    end_byte = Some(i);
695                    break;
696                }
697                _ => {}
698            }
699        }
700        match end_byte {
701            Some(e) => &rest[e + 1..],
702            None => return false,
703        }
704    } else {
705        let mut escape = false;
706        let mut depth: i32 = 0;
707        let mut end = trimmed.len();
708        for (i, c) in trimmed.char_indices() {
709            if escape {
710                escape = false;
711                continue;
712            }
713            match c {
714                '\\' => escape = true,
715                ' ' | '\t' | '\n' => {
716                    end = i;
717                    break;
718                }
719                _ if c.is_ascii_control() => return false,
720                '(' => depth += 1,
721                ')' => {
722                    if depth == 0 {
723                        end = i;
724                        break;
725                    }
726                    depth -= 1;
727                }
728                _ => {}
729            }
730        }
731        if depth != 0 {
732            return false;
733        }
734        if end == 0 {
735            // bare destination must be nonempty if the field is non-blank
736            return false;
737        }
738        &trimmed[end..]
739    };
740
741    let after_dest = trim_start_link_ws(after_dest);
742    if after_dest.is_empty() {
743        return true;
744    }
745
746    let bytes = after_dest.as_bytes();
747    let close = match bytes[0] {
748        b'"' => b'"',
749        b'\'' => b'\'',
750        b'(' => b')',
751        _ => return false,
752    };
753    let opens_paren = bytes[0] == b'(';
754    let mut escape = false;
755    let mut title_close_pos = None;
756    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
757        if escape {
758            escape = false;
759            continue;
760        }
761        if b == b'\\' {
762            escape = true;
763            continue;
764        }
765        if opens_paren && b == b'(' {
766            return false;
767        }
768        if b == close {
769            title_close_pos = Some(i);
770            break;
771        }
772    }
773    let close_idx = match title_close_pos {
774        Some(p) => p,
775        None => return false,
776    };
777
778    let after_title = &after_dest[close_idx + 1..];
779    is_link_ws_only(after_title)
780}
781
/// Strip leading ASCII space, tab and newline bytes from `s`.
///
/// Byte-level equivalent of `s.trim_start_matches([' ', '\t', '\n'])`. It
/// runs for every CommonMark inline-link destination/title scan, so it
/// counts bytes directly instead of going through the multi-char pattern
/// searcher. No other whitespace (e.g. `\r` or non-ASCII spaces) is removed.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let skipped = s
        .bytes()
        .take_while(|&b| matches!(b, b' ' | b'\t' | b'\n'))
        .count();
    // Only single-byte ASCII characters were counted, so `skipped` is always
    // a character boundary and plain slicing cannot panic — no `unsafe`
    // needed.
    &s[skipped..]
}
801
/// Whether `s` consists solely of ASCII space, tab and newline bytes.
/// The empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|b| matches!(b, b' ' | b'\t' | b'\n'))
}
808
809/// Emit an inline link node to the builder.
810/// Note: link_text may contain inline elements and should be parsed recursively.
811pub fn emit_inline_link(
812    builder: &mut GreenNodeBuilder,
813    _text: &str,
814    link_text: &str,
815    dest: &str,
816    raw_attributes: Option<&str>,
817    config: &ParserOptions,
818) {
819    builder.start_node(SyntaxKind::LINK.into());
820
821    // Opening [
822    builder.start_node(SyntaxKind::LINK_START.into());
823    builder.token(SyntaxKind::LINK_START.into(), "[");
824    builder.finish_node();
825
826    // Link text (recursively parse inline elements). Pandoc-native:
827    // links cannot contain other links, so suppress inner LINK / ref-link
828    // recognition during the recursion. Images, emphasis, code, etc. are
829    // still recognised. CommonMark relies on outer-level process_brackets
830    // to prevent nested links, but the flag is harmless under CM.
831    builder.start_node(SyntaxKind::LINK_TEXT.into());
832    parse_inline_text(builder, link_text, config, true);
833    builder.finish_node();
834
835    // Closing ]
836    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
837
838    // Opening (
839    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
840
841    // Destination
842    builder.start_node(SyntaxKind::LINK_DEST.into());
843    builder.token(SyntaxKind::TEXT.into(), dest);
844    builder.finish_node();
845
846    // Closing )
847    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
848
849    // Emit raw attributes if present (preserve original formatting)
850    if let Some(raw_attrs) = raw_attributes {
851        builder.start_node(SyntaxKind::ATTRIBUTE.into());
852        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
853        builder.finish_node();
854    }
855
856    builder.finish_node();
857}
858
859pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
860    builder.start_node(SyntaxKind::LINK.into());
861
862    builder.start_node(SyntaxKind::LINK_START.into());
863    builder.token(SyntaxKind::LINK_START.into(), "[");
864    builder.finish_node();
865
866    builder.start_node(SyntaxKind::LINK_TEXT.into());
867    builder.token(SyntaxKind::TEXT.into(), uri);
868    builder.finish_node();
869
870    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
871    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
872
873    builder.start_node(SyntaxKind::LINK_DEST.into());
874    builder.token(SyntaxKind::TEXT.into(), uri);
875    builder.finish_node();
876
877    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
878
879    builder.finish_node();
880}
881
882/// Try to parse a reference link starting at the current position.
883///
884/// Reference links have three forms:
885/// - Explicit: `[text][label]`
886/// - Implicit: `[text][]` (label = text)
887/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
888///
889/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
890/// The label is what should be looked up in the registry.
891pub fn try_parse_reference_link(
892    text: &str,
893    allow_shortcut: bool,
894    inline_link_attempted: bool,
895    ctx: LinkScanContext,
896) -> Option<(usize, &str, String, bool)> {
897    if !text.starts_with('[') {
898        return None;
899    }
900
901    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
902    if text.len() > 1 {
903        let bytes = text.as_bytes();
904        if bytes[1] == b'@' {
905            return None;
906        }
907        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
908            return None;
909        }
910    }
911
912    // Find the closing ] for the text. Uses the shared helper so that a
913    // `]` inside a code span doesn't terminate the link text (CommonMark
914    // §6 — code spans bind tighter than links). See spec examples #342
915    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
916    // opaque per `ctx`.
917    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
918    let link_text = &text[1..close_bracket];
919
920    // CommonMark §6.4: outer reference link is rejected when its text contains
921    // a valid inner inline link/image (spec example #532). Reference-link
922    // nesting (#533/#569/#571) is not handled here; it requires resolving
923    // labels against the document refdef map.
924    if ctx.disallow_inner_links
925        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
926    {
927        return None;
928    }
929
930    // Check what follows the ]
931    let after_bracket = close_bracket + 1;
932
933    // `[content]{...}` is reserved for bracketed spans / attribute
934    // trailers, never a shortcut.
935    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
936        return None;
937    }
938
939    // `[text](...)` is the inline-link shape. CommonMark spec example
940    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
941    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
942    // the upstream inline-link parse was rejected by `strict_dest`. We
943    // only fall through to shortcut here when the caller has already
944    // tried the inline-link form (`inline_link_attempted`) — otherwise
945    // disabling the `inline_links` extension would silently let
946    // `[text](url)` become a shortcut + literal text, which the
947    // `inline_links_disabled_keeps_inline_link_literal` test guards
948    // against.
949    if after_bracket < text.len()
950        && text[after_bracket..].starts_with('(')
951        && (!allow_shortcut || !inline_link_attempted)
952    {
953        return None;
954    }
955
956    // Check for explicit reference [text][label] or implicit [text][]
957    if after_bracket < text.len() && text[after_bracket..].starts_with('[') {
958        // Find the closing ] for the label
959        let label_start = after_bracket + 1;
960        let mut label_end = None;
961
962        for (i, ch) in text[label_start..].char_indices() {
963            if ch == ']' {
964                label_end = Some(i + label_start);
965                break;
966            }
967            // Labels can't contain newlines
968            if ch == '\n' {
969                return None;
970            }
971        }
972
973        let label_end = label_end?;
974        let label = &text[label_start..label_end];
975
976        // Total length includes both bracket pairs
977        let total_len = label_end + 1;
978
979        // Implicit reference: empty label means emit [text][]
980        if label.is_empty() {
981            return Some((total_len, link_text, String::new(), false));
982        }
983
984        // Explicit reference: use the provided label
985        Some((total_len, link_text, label.to_string(), false))
986    } else if allow_shortcut {
987        // Shortcut reference: [text] with no second bracket pair
988        // The text is both the display text and the label
989        if link_text.is_empty() {
990            return None;
991        }
992        Some((after_bracket, link_text, link_text.to_string(), true))
993    } else {
994        // No second bracket pair and shortcut not allowed - not a reference link
995        None
996    }
997}
998
999/// Emit a reference link node to the builder.
1000/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
1001pub fn emit_reference_link(
1002    builder: &mut GreenNodeBuilder,
1003    link_text: &str,
1004    label: &str,
1005    is_shortcut: bool,
1006    config: &ParserOptions,
1007) {
1008    builder.start_node(SyntaxKind::LINK.into());
1009
1010    // Opening [
1011    builder.start_node(SyntaxKind::LINK_START.into());
1012    builder.token(SyntaxKind::LINK_START.into(), "[");
1013    builder.finish_node();
1014
1015    // Link text (recursively parse inline elements). Pandoc-native:
1016    // links cannot contain other links, so suppress inner LINK / ref-link
1017    // recognition during the recursion. Images, emphasis, code, etc. are
1018    // still recognised.
1019    builder.start_node(SyntaxKind::LINK_TEXT.into());
1020    parse_inline_text(builder, link_text, config, true);
1021    builder.finish_node();
1022
1023    // Closing ] and reference label
1024    builder.token(SyntaxKind::TEXT.into(), "]");
1025
1026    if !is_shortcut {
1027        // Explicit or implicit reference: [text][label] or [text][]
1028        builder.token(SyntaxKind::TEXT.into(), "[");
1029        builder.start_node(SyntaxKind::LINK_REF.into());
1030        // For implicit references, label is empty and we emit [text][]
1031        // For explicit references, emit the label to get [text][label]
1032        if !label.is_empty() {
1033            builder.token(SyntaxKind::TEXT.into(), label);
1034        }
1035        builder.finish_node();
1036        builder.token(SyntaxKind::TEXT.into(), "]");
1037    }
1038    // For shortcut references, just [text] - no second bracket pair
1039
1040    builder.finish_node();
1041}
1042
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]` (explicit), `![alt][]` (implicit) or `![alt]` (shortcut).
///
/// `allow_shortcut` enables the shortcut form. A shortcut is never reported
/// when the alt text is directly followed by `(`, because that is the inline
/// image shape.
///
/// Returns `Some((total_len, alt_text, label, is_shortcut))` on success:
/// - `total_len` is the number of bytes consumed from `text`.
/// - `alt_text` is the slice between `![` and its matching `]`; nested
///   brackets are allowed and a backslash escapes the following byte.
/// - `label` is the explicit label as written, or a copy of `alt_text` for
///   the implicit and shortcut forms.
///
/// Returns `None` when the input is shorter than four bytes, does not start
/// with `![`, has unbalanced brackets, has a label that is unterminated or
/// spans a line break, or is a shortcut while shortcuts are disabled.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    let bytes = text.as_bytes();
    if bytes.len() < 4 || !text.starts_with("![") {
        return None;
    }

    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1usize;

    // Walk to the `]` matching the opening `![`, tracking nested brackets.
    // Working on bytes is safe: `[`, `]` and `\` are ASCII and never occur
    // inside a multi-byte UTF-8 sequence.
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped byte
            _ => {}
        }
        pos += 1;
    }

    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` now sits just past the closing `]`.
    let alt_text = &text[alt_start..pos - 1];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if bytes.get(pos) == Some(&b'[') {
        let label_start = pos + 1;

        // The label ends at the first `]`; hitting a line break first (or
        // running out of input) means there is no valid label.
        let offset = bytes[label_start..]
            .iter()
            .position(|&b| matches!(b, b']' | b'\n' | b'\r'))?;
        let label_end = label_start + offset;
        if bytes[label_end] != b']' {
            return None;
        }

        let label_text = &text[label_start..label_end];

        // Keep the label exactly as written (original case). An empty label
        // is the implicit form, reported with the alt text as its label.
        let label = if label_text.is_empty() {
            alt_text.to_string()
        } else {
            label_text.to_string()
        };

        return Some((label_end + 1, alt_text, label, false));
    }

    // Shortcut `![alt]`: only when enabled, and never when `(` follows
    // (inline image). End of input right after `]` is a valid shortcut too;
    // previously it was rejected by an unconditional end-of-input bail-out,
    // unlike the reference-link parser.
    if !allow_shortcut || bytes.get(pos) == Some(&b'(') {
        return None;
    }

    Some((pos, alt_text, alt_text.to_string(), true))
}
1124
1125/// Emit a reference image node with registry lookup.
1126pub fn emit_reference_image(
1127    builder: &mut GreenNodeBuilder,
1128    alt_text: &str,
1129    label: &str,
1130    is_shortcut: bool,
1131    config: &ParserOptions,
1132) {
1133    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1134
1135    // Emit as reference image (preserve original syntax)
1136    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1137    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1138    builder.finish_node();
1139
1140    // Alt text (recursively parse inline elements)
1141    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1142    parse_inline_text(builder, alt_text, config, false);
1143    builder.finish_node();
1144
1145    // Closing ] and reference label
1146    builder.token(SyntaxKind::TEXT.into(), "]");
1147
1148    if !is_shortcut {
1149        // Explicit or implicit reference: ![alt][label] or ![alt][]
1150        builder.token(SyntaxKind::TEXT.into(), "[");
1151        builder.start_node(SyntaxKind::LINK_REF.into());
1152        // For implicit references, emit empty label (label == alt means implicit from parser)
1153        if label != alt_text {
1154            builder.token(SyntaxKind::TEXT.into(), label);
1155        }
1156        builder.finish_node();
1157        builder.token(SyntaxKind::TEXT.into(), "]");
1158    }
1159    // For shortcut references, just ![alt] - no second bracket pair
1160
1161    builder.finish_node();
1162}
1163
1164/// Emit an `UNRESOLVED_REFERENCE` node for a Pandoc bracket-shape
1165/// pattern whose label didn't resolve. The wrapper covers the original
1166/// bracket bytes; the inner text recurses through normal inline
1167/// parsing (with inner-link suppression so a stray inner inline link
1168/// doesn't reorder semantics relative to pandoc-native).
1169///
1170/// `source` is `text[start..end]` — the full bracket-shape pattern.
1171/// `text_content` is the inner text between the outer `[` and `]`
1172/// (the bytes used for inline recursion). `label_suffix` carries the
1173/// `[label]` / `[]` suffix bytes verbatim, or `None` for shortcut form.
1174pub fn emit_unresolved_reference(
1175    builder: &mut GreenNodeBuilder,
1176    is_image: bool,
1177    text_content: &str,
1178    label_suffix: Option<&str>,
1179    config: &ParserOptions,
1180) {
1181    builder.start_node(SyntaxKind::UNRESOLVED_REFERENCE.into());
1182
1183    if is_image {
1184        builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1185        builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1186        builder.finish_node();
1187        builder.start_node(SyntaxKind::IMAGE_ALT.into());
1188        parse_inline_text(builder, text_content, config, false);
1189        builder.finish_node();
1190    } else {
1191        builder.start_node(SyntaxKind::LINK_START.into());
1192        builder.token(SyntaxKind::LINK_START.into(), "[");
1193        builder.finish_node();
1194        builder.start_node(SyntaxKind::LINK_TEXT.into());
1195        parse_inline_text(builder, text_content, config, true);
1196        builder.finish_node();
1197    }
1198
1199    builder.token(SyntaxKind::TEXT.into(), "]");
1200
1201    if let Some(suffix) = label_suffix {
1202        // suffix is either "[label]" or "[]"; preserve original bytes.
1203        // Split as `[` + LINK_REF(label) + `]` so wrapper accessors find
1204        // the label via `support::child::<LinkRef>()`.
1205        debug_assert!(suffix.starts_with('[') && suffix.ends_with(']'));
1206        builder.token(SyntaxKind::TEXT.into(), "[");
1207        let label = &suffix[1..suffix.len() - 1];
1208        builder.start_node(SyntaxKind::LINK_REF.into());
1209        if !label.is_empty() {
1210            builder.token(SyntaxKind::TEXT.into(), label);
1211        }
1212        builder.finish_node();
1213        builder.token(SyntaxKind::TEXT.into(), "]");
1214    }
1215
1216    builder.finish_node();
1217}
1218
#[cfg(test)]
mod tests {
    use super::*;

    /// Default scan context shared by the inline link/image tests.
    fn ctx() -> LinkScanContext {
        LinkScanContext::default()
    }

    /// Run the reference-link parser with `inline_link_attempted = true` and
    /// the default scan context, which is what every test below uses.
    fn ref_link(input: &str, allow_shortcut: bool) -> Option<(usize, &str, String, bool)> {
        try_parse_reference_link(input, allow_shortcut, true, ctx())
    }

    // Autolink tests. The boolean flag is exercised both ways; per the
    // strict-scheme test below, `true` is the CommonMark-strict setting.
    #[test]
    fn test_parse_autolink_url() {
        let expected = Some((21, "https://example.com"));
        for strict in [false, true] {
            assert_eq!(try_parse_autolink("<https://example.com>", strict), expected);
        }
    }

    #[test]
    fn test_parse_autolink_email() {
        let expected = Some((18, "user@example.com"));
        for strict in [false, true] {
            assert_eq!(try_parse_autolink("<user@example.com>", strict), expected);
        }
    }

    #[test]
    fn test_parse_autolink_no_close() {
        for strict in [false, true] {
            assert_eq!(try_parse_autolink("<https://example.com", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_with_space() {
        for strict in [false, true] {
            assert_eq!(try_parse_autolink("<https://example.com >", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_not_url_or_email() {
        for strict in [false, true] {
            assert_eq!(try_parse_autolink("<notaurl>", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_commonmark_strict_scheme() {
        // A one-character scheme is invalid under CommonMark but is
        // lax-accepted under the Pandoc dialect (historical behavior).
        let one_char_scheme = "<m:abc>";
        assert_eq!(try_parse_autolink(one_char_scheme, true), None);
        assert_eq!(
            try_parse_autolink(one_char_scheme, false),
            Some((7, "m:abc"))
        );
    }

    #[test]
    fn test_parse_autolink_commonmark_email_disallows_backslash() {
        let escaped_email = "<foo\\+@bar.example.com>";
        assert_eq!(try_parse_autolink(escaped_email, true), None);
        assert_eq!(
            try_parse_autolink(escaped_email, false),
            Some((23, "foo\\+@bar.example.com"))
        );
    }

    // Inline link tests
    #[test]
    fn test_parse_inline_link_simple() {
        assert_eq!(
            try_parse_inline_link("[text](url)", false, ctx()),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title")"#, false, ctx()),
            Some((19, "text", r#"url "title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_link("[outer [inner] text](url)", false, ctx()),
            Some((25, "outer [inner] text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_link("[text] (url)", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_bracket() {
        assert_eq!(try_parse_inline_link("[text(url)", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_paren() {
        assert_eq!(try_parse_inline_link("[text](url", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_escaped_bracket() {
        assert_eq!(
            try_parse_inline_link(r"[text\]more](url)", false, ctx()),
            Some((17, r"text\]more", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_parens_in_url() {
        assert_eq!(
            try_parse_inline_link("[text](url(with)parens)", false, ctx()),
            Some((23, "text", "url(with)parens", None))
        );
    }

    // Inline image tests
    #[test]
    fn test_parse_inline_image_simple() {
        assert_eq!(
            try_parse_inline_image("![alt](image.jpg)", ctx()),
            Some((17, "alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_title() {
        assert_eq!(
            try_parse_inline_image(r#"![alt](image.jpg "A title")"#, ctx()),
            Some((27, "alt", r#"image.jpg "A title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_image("![outer [inner] alt](image.jpg)", ctx()),
            Some((31, "outer [inner] alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
        assert_eq!(try_parse_bare_uri(r"a:\]"), None);
    }

    #[test]
    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_image("![alt] (image.jpg)", ctx()), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_bracket() {
        assert_eq!(try_parse_inline_image("![alt(image.jpg)", ctx()), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_paren() {
        assert_eq!(try_parse_inline_image("![alt](image.jpg", ctx()), None);
    }

    // Image attribute tests
    #[test]
    fn test_parse_inline_image_with_simple_class() {
        assert_eq!(
            try_parse_inline_image("![alt](img.png){.large}", ctx()),
            Some((23, "alt", "img.png", Some("{.large}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_id() {
        assert_eq!(
            try_parse_inline_image("![Figure 1](fig1.png){#fig-1}", ctx()),
            Some((29, "Figure 1", "fig1.png", Some("{#fig-1}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_full_attributes() {
        assert_eq!(
            try_parse_inline_image("![alt](img.png){#fig .large width=\"80%\"}", ctx()),
            Some((
                40,
                "alt",
                "img.png",
                Some("{#fig .large width=\"80%\"}")
            ))
        );
    }

    #[test]
    fn test_parse_inline_image_attributes_must_be_adjacent() {
        // A space between `)` and `{` means the braces are not attributes.
        assert_eq!(
            try_parse_inline_image("![alt](img.png) {.large}", ctx()),
            Some((15, "alt", "img.png", None))
        );
    }

    // Link attribute tests
    #[test]
    fn test_parse_inline_link_with_id() {
        assert_eq!(
            try_parse_inline_link("[text](url){#link-1}", false, ctx()),
            Some((20, "text", "url", Some("{#link-1}")))
        );
    }

    #[test]
    fn test_parse_inline_link_with_full_attributes() {
        assert_eq!(
            try_parse_inline_link(
                "[text](url){#link .external target=\"_blank\"}",
                false,
                ctx()
            ),
            Some((
                44,
                "text",
                "url",
                Some("{#link .external target=\"_blank\"}")
            ))
        );
    }

    #[test]
    fn test_parse_inline_link_attributes_must_be_adjacent() {
        // A space between `)` and `{` means the braces are not attributes.
        assert_eq!(
            try_parse_inline_link("[text](url) {.class}", false, ctx()),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title_and_attributes() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title"){.external}"#, false, ctx()),
            Some((30, "text", r#"url "title""#, Some("{.external}")))
        );
    }

    // Reference link tests
    #[test]
    fn test_parse_reference_link_explicit() {
        assert_eq!(
            ref_link("[link text][label]", false),
            Some((18, "link text", "label".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_implicit() {
        assert_eq!(
            ref_link("[link text][]", false),
            Some((13, "link text", String::new(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_explicit_same_label_as_text() {
        assert_eq!(
            ref_link("[stack][stack]", false),
            Some((14, "stack", "stack".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut() {
        assert_eq!(
            ref_link("[link text] rest", true),
            Some((11, "link text", "link text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_rejects_empty_label() {
        assert_eq!(ref_link("[] rest", true), None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_disabled() {
        assert_eq!(ref_link("[link text] rest", false), None);
    }

    #[test]
    fn test_parse_reference_link_not_inline_link() {
        // With shortcuts off, `[text](url)` is rejected so that the inline
        // link form upstream keeps exclusive ownership of it.
        assert_eq!(ref_link("[text](url)", false), None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
        // CommonMark spec example #568: the caller tries the inline link
        // first; once that has failed, `[text]` must still be seen as a
        // shortcut, leaving `(url)` to be parsed as following text.
        assert_eq!(
            ref_link("[text](url)", true),
            Some((6, "text", "text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_with_nested_brackets() {
        assert_eq!(
            ref_link("[outer [inner] text][ref]", false),
            Some((25, "outer [inner] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_label_no_newline() {
        assert_eq!(ref_link("[text][label\nmore]", false), None);
    }

    // Reference image tests
    #[test]
    fn test_parse_reference_image_explicit() {
        assert_eq!(
            try_parse_reference_image("![alt text][label]", false),
            Some((18, "alt text", "label".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_implicit() {
        assert_eq!(
            try_parse_reference_image("![alt text][]", false),
            Some((13, "alt text", "alt text".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut() {
        assert_eq!(
            try_parse_reference_image("![alt text] rest", true),
            Some((11, "alt text", "alt text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut_disabled() {
        assert_eq!(try_parse_reference_image("![alt text] rest", false), None);
    }

    #[test]
    fn test_parse_reference_image_not_inline() {
        // An inline image with `(url)` must not be taken as a reference.
        assert_eq!(try_parse_reference_image("![alt](url)", true), None);
    }

    #[test]
    fn test_parse_reference_image_with_nested_brackets() {
        assert_eq!(
            try_parse_reference_image("![alt [nested] text][ref]", false),
            Some((25, "alt [nested] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_reference_link_label_with_crlf() {
        // A reference link label must not span lines separated by CRLF.
        assert_eq!(
            ref_link("[foo\r\nbar]", false),
            None,
            "Should not parse reference link with CRLF in label"
        );
    }

    #[test]
    fn test_reference_link_label_with_lf() {
        // Nor may it span lines separated by a bare LF.
        assert_eq!(
            ref_link("[foo\nbar]", false),
            None,
            "Should not parse reference link with LF in label"
        );
    }

    // Multiline link text tests
    #[test]
    fn test_parse_inline_link_multiline_text() {
        // Per the Pandoc spec, link text CAN contain newlines (soft breaks).
        assert_eq!(
            try_parse_inline_link("[text on\nline two](url)", false, ctx()),
            Some((23, "text on\nline two", "url", None)),
            "Link text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_link_multiline_with_formatting() {
        // Link text containing a newline alongside ordinary prose.
        let input =
            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
        let (len, text, _dest, _attrs) = try_parse_inline_link(input, false, ctx())
            .expect("Link text with newlines should parse");
        assert!(text.contains('\n'), "Link text should preserve newline");
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_image_multiline_alt() {
        // Per the Pandoc spec, image alt text CAN contain newlines.
        assert_eq!(
            try_parse_inline_image("![alt on\nline two](img.png)", ctx()),
            Some((27, "alt on\nline two", "img.png", None)),
            "Image alt text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_image_multiline_with_attributes() {
        // Multiline alt text followed by an attribute block.
        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
        let (len, alt, dest, attrs) = try_parse_inline_image(input, ctx())
            .expect("Image alt with newlines and attributes should parse");
        assert!(alt.contains('\n'), "Alt text should preserve newline");
        assert_eq!(dest, "../images/fig.png");
        assert_eq!(attrs, Some("{width=70%}"));
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_link_with_attributes_after_newline() {
        // Regression: when text is concatenated with newlines, attributes
        // directly after `)` must still be recognized.
        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
        let (len, text, dest, attrs) = try_parse_inline_link(input, false, ctx())
            .expect("Link with attributes should parse even with following text");
        assert_eq!(text, "A network graph.");
        assert_eq!(dest, "../images/networkfig.png");
        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
        assert_eq!(
            len, 55,
            "Length should include attributes (up to closing brace)"
        );
    }
}