panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use crate::options::ParserOptions;
16use crate::syntax::SyntaxKind;
17use rowan::GreenNodeBuilder;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::{emit_attribute_node, try_parse_trailing_attributes};
21
/// Flags that control which inline spans the link-bracket scanner treats as
/// opaque (so a `]` inside them does not terminate the link/image text).
///
/// - `skip_raw_html` is universal across dialects: pandoc-markdown and
///   CommonMark both refuse to close link text inside a raw HTML span (e.g.
///   `[foo <bar attr="](baz)">`), per CommonMark spec example #524 / #536.
/// - `skip_autolinks` is **CommonMark-only**. Pandoc-markdown does *not*
///   treat `<scheme://...>` as opaque inside link text, so the same input
///   produces a different parse under each dialect (CommonMark spec example
///   #526 / #538). Always derive this from
///   `extensions.autolinks && dialect == Dialect::CommonMark`.
/// - `disallow_inner_links` is a **CommonMark-only** structural rule (§6.4):
///   "Links may not contain other links, at any level of nesting." When the
///   candidate link/image text contains a valid inline link or image, the
///   outer match is rejected so the inner-most definition is used instead
///   (spec examples #518–#520, #532). Pandoc-markdown allows nested links,
///   so the flag is `false` there.
///
/// The struct is `Copy`, so it is passed by value through the scanner
/// helpers in this module.
#[derive(Clone, Copy)]
pub struct LinkScanContext {
    /// When `true`, the bracket scanner steps over a whole inline raw HTML
    /// span instead of inspecting the bytes inside it.
    pub skip_raw_html: bool,
    /// When `true`, the bracket scanner steps over a whole `<...>` autolink
    /// (validated with the strict CommonMark grammar) before it considers
    /// raw HTML.
    pub skip_autolinks: bool,
    /// When `true`, a candidate link whose text contains a valid inline link
    /// is rejected so that the inner link wins.
    pub disallow_inner_links: bool,
    /// Dialect controlling which HTML constructs the raw-HTML opacity check
    /// recognizes. Pandoc-markdown excludes bare declarations and CDATA
    /// from its inline raw HTML grammar.
    pub dialect: crate::options::Dialect,
}
49
50impl Default for LinkScanContext {
51    fn default() -> Self {
52        Self {
53            skip_raw_html: false,
54            skip_autolinks: false,
55            disallow_inner_links: false,
56            dialect: crate::options::Dialect::Pandoc,
57        }
58    }
59}
60
61impl LinkScanContext {
62    pub fn from_options(config: &ParserOptions) -> Self {
63        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
64        Self {
65            skip_raw_html: config.extensions.raw_html,
66            skip_autolinks: config.extensions.autolinks && is_commonmark,
67            disallow_inner_links: is_commonmark,
68            dialect: config.dialect,
69        }
70    }
71}
72
73/// Find the closing `]` of a link/image text span, starting from `start`.
74///
75/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
76/// a backtick run starting a valid code span is encountered, the entire span
77/// (including any trailing attribute block) is skipped — per CommonMark §6
78/// precedence, code spans bind tighter than links/images, so a `]` *inside*
79/// a code span cannot terminate the link's text. The same opacity applies to
80/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
81/// Returns the byte offset of the closing `]` within `text`, or `None` if no
82/// unmatched `]` is reached.
83fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
84    let bytes = text.as_bytes();
85    let mut bracket_depth = 0;
86    let mut escape_next = false;
87    let mut i = start;
88
89    while i < bytes.len() {
90        let b = bytes[i];
91
92        if escape_next {
93            escape_next = false;
94            i += step(text, i);
95            continue;
96        }
97
98        match b {
99            b'\\' => {
100                escape_next = true;
101                i += 1;
102            }
103            b'`' => {
104                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
105                    i += len;
106                } else {
107                    i += 1;
108                }
109            }
110            b'<' => {
111                // Order matters: autolinks are the more specific `<...>`
112                // shape (URI/email between angle brackets), so try that
113                // before falling through to general inline raw HTML which
114                // would also match `<bar attr="...">`-style tags.
115                if ctx.skip_autolinks
116                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
117                {
118                    i += len;
119                } else if ctx.skip_raw_html
120                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
121                {
122                    i += len;
123                } else {
124                    i += 1;
125                }
126            }
127            b'[' => {
128                bracket_depth += 1;
129                i += 1;
130            }
131            b']' => {
132                if bracket_depth == 0 {
133                    return Some(i);
134                }
135                bracket_depth -= 1;
136                i += 1;
137            }
138            _ => i += step(text, i),
139        }
140    }
141    None
142}
143
144/// Find the closing `)` of a link/image destination, given the text *after*
145/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
146/// destinations (`<...>` may legitimately contain unbalanced parens — see
147/// spec example #499). Returns the byte offset of the closing `)` within the
148/// passed slice, or `None` if not found.
149fn find_dest_close_paren(remaining: &str) -> Option<usize> {
150    let bytes = remaining.as_bytes();
151    let mut paren_depth = 0;
152    let mut escape_next = false;
153    let mut in_quotes = false;
154    let mut in_angle = false;
155    let mut i = 0;
156
157    while i < bytes.len() {
158        let b = bytes[i];
159
160        if escape_next {
161            escape_next = false;
162            i += step(remaining, i);
163            continue;
164        }
165
166        match b {
167            b'\\' => {
168                escape_next = true;
169                i += 1;
170            }
171            b'<' if !in_quotes && !in_angle => {
172                in_angle = true;
173                i += 1;
174            }
175            b'>' if in_angle => {
176                in_angle = false;
177                i += 1;
178            }
179            b'"' if !in_angle => {
180                in_quotes = !in_quotes;
181                i += 1;
182            }
183            b'(' if !in_quotes && !in_angle => {
184                paren_depth += 1;
185                i += 1;
186            }
187            b')' if !in_quotes && !in_angle => {
188                if paren_depth == 0 {
189                    return Some(i);
190                }
191                paren_depth -= 1;
192                i += 1;
193            }
194            _ => i += step(remaining, i),
195        }
196    }
197    None
198}
199
/// Number of bytes occupied by the UTF-8 character that begins at byte
/// index `i` of `s`.
///
/// Lets the byte-indexed scanners advance one whole character at a time
/// without landing in the middle of a multi-byte sequence. When `i` is at
/// the end of `s` there is no character, and `1` is returned so callers
/// still make progress. `i` must lie on a character boundary.
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
206
207/// CommonMark §6.4: "Links may not contain other links, at any level of
208/// nesting. If multiple otherwise valid link definitions appear nested inside
209/// each other, the inner-most definition is used." This helper scans a
210/// candidate link text for any `[` that starts a valid inline link; when
211/// found, the outer link must be rejected so the inner-most wins (spec
212/// examples #518–#519, #532).
213///
214/// Images themselves do not count as inner links — a link can contain an
215/// image (#517, #531). A link *inside* an image's alt text, however, still
216/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
217/// the helper recurses into image alt text looking for inner links.
218///
219/// Reference-link nesting (#533, #569, #571) requires resolving labels
220/// against the document's reference-definition map, which the parser does
221/// not have at this point — those cases remain unhandled and need a later
222/// stack-based pass.
223fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
224    let bytes = text.as_bytes();
225    let mut i = 0;
226    let mut escape_next = false;
227    while i < bytes.len() {
228        let b = bytes[i];
229        if escape_next {
230            escape_next = false;
231            i += step(text, i);
232            continue;
233        }
234        match b {
235            b'\\' => {
236                escape_next = true;
237                i += 1;
238            }
239            b'`' => {
240                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
241                    i += len;
242                } else {
243                    i += 1;
244                }
245            }
246            b'<' => {
247                if ctx.skip_autolinks
248                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
249                {
250                    i += len;
251                } else if ctx.skip_raw_html
252                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
253                {
254                    i += len;
255                } else {
256                    i += 1;
257                }
258            }
259            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
260                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
261                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
262                        return true;
263                    }
264                    i += len;
265                } else {
266                    i += 2;
267                }
268            }
269            b'[' => {
270                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
271                    return true;
272                }
273                i += 1;
274            }
275            _ => i += step(text, i),
276        }
277    }
278    false
279}
280
281/// Try to parse an inline image starting at the current position.
282///
283/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
284/// Can also have trailing attributes: `![alt](url){#id .class}`.
285/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
286///
287/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
288/// see `LinkScanContext`.
289pub fn try_parse_inline_image(
290    text: &str,
291    ctx: LinkScanContext,
292) -> Option<(usize, &str, &str, Option<&str>)> {
293    if !text.starts_with("![") {
294        return None;
295    }
296
297    // Find the closing ]
298    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
299    let alt_text = &text[2..close_bracket];
300
301    // Check for immediate ( after ]
302    let after_bracket = close_bracket + 1;
303    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
304        return None;
305    }
306
307    // Find closing ) for destination (reuse same logic as links)
308    let dest_start = after_bracket + 1;
309    let remaining = &text[dest_start..];
310
311    let close_paren = find_dest_close_paren(remaining)?;
312    let dest_content = &remaining[..close_paren];
313
314    // Check for trailing attributes {#id .class key=value}
315    let after_paren = dest_start + close_paren + 1;
316    let after_close = &text[after_paren..];
317
318    // Attributes must start immediately after closing paren (no whitespace/newlines)
319    if after_close.starts_with('{') {
320        // Find the closing brace
321        if let Some(close_brace_pos) = after_close.find('}') {
322            let attr_text = &after_close[..=close_brace_pos];
323            // Try to parse as attributes to validate
324            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
325                let total_len = after_paren + close_brace_pos + 1;
326                // Return raw attribute string for lossless parsing
327                let raw_attrs = attr_text;
328                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
329            }
330        }
331    }
332
333    // No attributes, just return the image
334    let total_len = after_paren;
335    Some((total_len, alt_text, dest_content, None))
336}
337
338/// Emit an inline image node to the builder.
339/// Note: alt_text may contain inline elements and should be parsed recursively.
340pub fn emit_inline_image(
341    builder: &mut GreenNodeBuilder,
342    _text: &str,
343    alt_text: &str,
344    dest: &str,
345    raw_attributes: Option<&str>,
346    config: &ParserOptions,
347    suppress_footnote_refs: bool,
348) {
349    builder.start_node(SyntaxKind::IMAGE_LINK.into());
350
351    // Opening ![
352    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
353    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
354    builder.finish_node();
355
356    // Alt text (recursively parse inline elements)
357    builder.start_node(SyntaxKind::IMAGE_ALT.into());
358    // Use the standalone parse_inline_text function for recursive parsing
359    // Note: nested contexts don't resolve references
360    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
361    builder.finish_node();
362
363    // Closing ]
364    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
365
366    // Opening (
367    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
368
369    // Destination
370    builder.start_node(SyntaxKind::LINK_DEST.into());
371    builder.token(SyntaxKind::TEXT.into(), dest);
372    builder.finish_node();
373
374    // Closing )
375    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
376
377    // Emit raw attributes if present (preserve original formatting)
378    if let Some(raw_attrs) = raw_attributes {
379        emit_attribute_node(builder, raw_attrs);
380    }
381
382    builder.finish_node();
383}
384
385/// Try to parse an automatic link starting at the current position.
386///
387/// Automatic links have the form `<url>` (URI autolink) or `<email>`
388/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
389/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
390/// ASCII chars; email local parts cannot contain backslashes). Pandoc
391/// markdown is laxer — it accepts Unicode in email addresses, for
392/// example — so non-CommonMark callers fall back to the heuristic
393/// "contains `:` or `@`" check that the parser used historically.
394pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
395    if !text.starts_with('<') {
396        return None;
397    }
398
399    let close_pos = text[1..].find('>')?;
400    let content = &text[1..1 + close_pos];
401
402    if content.is_empty() {
403        return None;
404    }
405    if content.contains(|c: char| c.is_whitespace()) {
406        return None;
407    }
408
409    if is_commonmark {
410        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
411            return None;
412        }
413    } else if !content.contains(':') && !content.contains('@') {
414        return None;
415    }
416
417    Some((close_pos + 2, content))
418}
419
/// CommonMark §6.4 URI autolink check for the text between `<` and `>`.
///
/// The text must be `scheme ":" body` where:
/// - the scheme is 2-32 characters long, starts with an ASCII letter and
///   continues with `[a-zA-Z0-9+.-]`;
/// - the body (possibly empty) contains no ASCII control character, no
///   space, and neither `<` nor `>`. Non-ASCII bytes are allowed.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    if !bytes.first().is_some_and(u8::is_ascii_alphabetic) {
        return false;
    }

    // Length of the scheme: the leading letter plus every following
    // scheme character.
    let scheme_len = 1 + bytes[1..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
        .count();
    if !(2..=32).contains(&scheme_len) {
        return false;
    }
    if bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `> 0x20` rejects the C0 controls *and* the space character, matching
    // the documented grammar even when the caller has not pre-filtered
    // whitespace.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
450
451/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
452/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
453///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
454fn is_valid_email_autolink(s: &str) -> bool {
455    let Some(at) = s.find('@') else {
456        return false;
457    };
458    let local = &s[..at];
459    let domain = &s[at + 1..];
460    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
461        return false;
462    }
463    if domain.is_empty() {
464        return false;
465    }
466    domain.split('.').all(is_valid_email_label)
467}
468
/// Whether `b` may appear in the local part (before the `@`) of an email
/// autolink: an ASCII letter or digit, or one of the punctuation bytes
/// ``.!#$%&'*+/=?^_`{|}~-``.
fn is_email_local_byte(b: u8) -> bool {
    const PUNCTUATION: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || PUNCTUATION.contains(&b)
}
497
/// Whether `label` is a valid dot-separated component of an email
/// autolink's domain: 1-63 bytes long, beginning and ending with an ASCII
/// letter or digit, with only ASCII letters, digits and `-` in between.
fn is_valid_email_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    // `first`/`last` double as the emptiness check; for a one-byte label
    // they are the same byte.
    let (Some(first), Some(last)) = (bytes.first(), bytes.last()) else {
        return false;
    };

    // Checking every byte (rather than slicing out the interior) avoids an
    // inverted `1..0` range — and therefore a panic — on one-byte labels.
    // The two ends are already constrained more tightly just above.
    bytes.len() <= 63
        && first.is_ascii_alphanumeric()
        && last.is_ascii_alphanumeric()
        && bytes
            .iter()
            .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
}
513
514/// Emit an automatic link node to the builder.
515pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
516    builder.start_node(SyntaxKind::AUTO_LINK.into());
517
518    // Opening <
519    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
520    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
521    builder.finish_node();
522
523    // URL content
524    builder.token(SyntaxKind::TEXT.into(), url);
525
526    // Closing >
527    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
528    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
529    builder.finish_node();
530
531    builder.finish_node();
532}
533
/// Try to parse a bare (un-bracketed) URI at the start of `text`.
///
/// The URI must begin with a scheme — an ASCII letter followed by ASCII
/// letters, digits, `+`, `-` or `.` — and a `:`. The body then extends up to
/// the first ASCII whitespace byte or one of `<`, `>`, `` ` ``, `"`, `'`.
/// Trailing `.`, `,`, `;`, `:`, `)`, `]` and `}` are trimmed off the body,
/// which must remain non-empty.
///
/// Returns `(length, uri)` where `uri` is the matched prefix of `text` and
/// `length` is its byte length.
pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
    if !text.chars().next()?.is_ascii_alphabetic() {
        return None;
    }

    // The first character outside the scheme alphabet has to be the colon.
    let colon =
        text.find(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')))?;
    if !text[colon..].starts_with(':') {
        return None;
    }

    // Every terminator is a single ASCII byte, so a byte scan always stops
    // on a character boundary.
    let body_start = colon + 1;
    let body_len = text.as_bytes()[body_start..]
        .iter()
        .position(|&b| b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'`' | b'"' | b'\''))
        .unwrap_or(text.len() - body_start);
    if body_len == 0 {
        return None;
    }

    // Trailing punctuation is not treated as part of the URI. Only the body
    // is trimmed, never the scheme or its colon.
    let body = text[body_start..body_start + body_len]
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | ':' | ')' | ']' | '}'));
    if body.is_empty() {
        return None;
    }

    // If trimming terminal punctuation leaves a dangling backslash, the match
    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
    let uri = &text[..body_start + body.len()];
    if uri.ends_with('\\') {
        return None;
    }

    Some((uri.len(), uri))
}
595
596/// Try to parse an inline link starting at the current position.
597///
598/// Inline links have the form `[text](url)` or `[text](url "title")`.
599/// Can also have trailing attributes: `[text](url){#id .class}`.
600/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
601///
602/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
603/// the bare destination form may not contain spaces or ASCII control
604/// characters and must have balanced parentheses; if a title follows it
605/// must be properly delimited; only whitespace is allowed before/after.
606/// Pandoc-markdown is more permissive, so leave this off for that dialect.
607pub fn try_parse_inline_link(
608    text: &str,
609    strict_dest: bool,
610    ctx: LinkScanContext,
611) -> Option<(usize, &str, &str, Option<&str>)> {
612    if !text.starts_with('[') {
613        return None;
614    }
615
616    // Find the closing ]
617    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
618    let link_text = &text[1..close_bracket];
619
620    // Check for immediate ( after ]
621    let after_bracket = close_bracket + 1;
622    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
623        return None;
624    }
625
626    // Find closing ) for destination
627    let dest_start = after_bracket + 1;
628    let remaining = &text[dest_start..];
629
630    let close_paren = find_dest_close_paren(remaining)?;
631    let dest_content = &remaining[..close_paren];
632
633    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
634        return None;
635    }
636
637    // CommonMark §6.4: outer link is rejected when its text contains a valid
638    // inner inline link or image, so the inner-most definition wins.
639    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
640        return None;
641    }
642
643    // Check for trailing attributes {#id .class key=value}
644    let after_paren = dest_start + close_paren + 1;
645    let after_close = &text[after_paren..];
646
647    // Attributes must start immediately after closing paren (no whitespace/newlines)
648    if after_close.starts_with('{') {
649        // Find the closing brace
650        if let Some(close_brace_pos) = after_close.find('}') {
651            let attr_text = &after_close[..=close_brace_pos];
652            // Try to parse as attributes to validate
653            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
654                let total_len = after_paren + close_brace_pos + 1;
655                // Return raw attribute string for lossless parsing
656                let raw_attrs = attr_text;
657                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
658            }
659        }
660    }
661
662    // No attributes, just return the link
663    let total_len = after_paren;
664    Some((total_len, link_text, dest_content, None))
665}
666
/// CommonMark §6.4 destination + optional title validation. The text passed
/// in is whatever the parser captured between `(` and `)`. A valid form is:
/// `[ws] destination [ws title [ws]]` where:
/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
///   parentheses (escaped parens permitted);
/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
/// - any text outside that structure invalidates the link.
///
/// Returns `true` when `content` fits that shape. Blank content (empty or
/// only space/tab/newline) is accepted.
fn dest_and_title_ok_commonmark(content: &str) -> bool {
    let trimmed = trim_start_link_ws(content);
    if trimmed.is_empty() {
        return true;
    }

    // Step 1: consume the destination and keep whatever follows it.
    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
        // Bracketed form: scan for the first unescaped `>`.
        let mut escape = false;
        let mut end_byte = None;
        for (i, c) in rest.char_indices() {
            if escape {
                // The character after a backslash is taken literally.
                escape = false;
                continue;
            }
            match c {
                '\\' => escape = true,
                '\n' | '<' => return false,
                '>' => {
                    end_byte = Some(i);
                    break;
                }
                _ => {}
            }
        }
        match end_byte {
            Some(e) => &rest[e + 1..],
            // No closing `>` at all.
            None => return false,
        }
    } else {
        // Bare form: runs until space/tab/newline, an unmatched `)`, or the
        // end of the text.
        let mut escape = false;
        let mut depth: i32 = 0;
        let mut end = trimmed.len();
        for (i, c) in trimmed.char_indices() {
            if escape {
                // The character after a backslash is taken literally, so an
                // escaped paren does not affect `depth`.
                escape = false;
                continue;
            }
            match c {
                '\\' => escape = true,
                // Arm order matters: tab and newline are ASCII control
                // characters too, but here they end the destination rather
                // than invalidate it.
                ' ' | '\t' | '\n' => {
                    end = i;
                    break;
                }
                _ if c.is_ascii_control() => return false,
                '(' => depth += 1,
                ')' => {
                    if depth == 0 {
                        // Unmatched `)`: the destination stops here and the
                        // `)` is left for the title check below, which
                        // rejects it.
                        end = i;
                        break;
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }
        if depth != 0 {
            // Some `(` was never closed.
            return false;
        }
        if end == 0 {
            // bare destination must be nonempty if the field is non-blank
            return false;
        }
        &trimmed[end..]
    };

    // Step 2: a destination on its own (plus trailing whitespace) is fine.
    let after_dest = trim_start_link_ws(after_dest);
    if after_dest.is_empty() {
        return true;
    }

    // Step 3: anything else must be a title, opened by `"`, `'` or `(`.
    let bytes = after_dest.as_bytes();
    let close = match bytes[0] {
        b'"' => b'"',
        b'\'' => b'\'',
        b'(' => b')',
        _ => return false,
    };
    let opens_paren = bytes[0] == b'(';
    let mut escape = false;
    let mut title_close_pos = None;
    // Start at 1 to skip the opening delimiter itself.
    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
        if escape {
            escape = false;
            continue;
        }
        if b == b'\\' {
            escape = true;
            continue;
        }
        if opens_paren && b == b'(' {
            // An unescaped `(` inside a parenthesized title is rejected.
            return false;
        }
        if b == close {
            title_close_pos = Some(i);
            break;
        }
    }
    let close_idx = match title_close_pos {
        Some(p) => p,
        // Title was opened but never closed.
        None => return false,
    };

    // Step 4: only whitespace may follow the closing title delimiter.
    let after_title = &after_dest[close_idx + 1..];
    is_link_ws_only(after_title)
}
780
/// Strip leading ASCII space/tab/newline bytes. Byte-level equivalent of
/// `s.trim_start_matches([' ', '\t', '\n'])`; called for every
/// CommonMark inline-link destination/title scan, so the slice-pattern
/// MultiCharEqSearcher overhead matters.
///
/// Other whitespace (e.g. `\r` or non-ASCII spaces) is left in place.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let skipped = s
        .bytes()
        .take_while(|&b| matches!(b, b' ' | b'\t' | b'\n'))
        .count();
    // Only single-byte ASCII characters were skipped, so `skipped` is always
    // on a character boundary and plain (safe) slicing cannot panic. The
    // boundary check it performs is O(1), so no `unsafe` is needed here.
    &s[skipped..]
}
800
/// Whether `s` consists solely of ASCII space, tab and newline bytes.
/// An empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|b| matches!(b, b' ' | b'\t' | b'\n'))
}
807
808/// Emit an inline link node to the builder.
809/// Note: link_text may contain inline elements and should be parsed recursively.
810pub fn emit_inline_link(
811    builder: &mut GreenNodeBuilder,
812    _text: &str,
813    link_text: &str,
814    dest: &str,
815    raw_attributes: Option<&str>,
816    config: &ParserOptions,
817    suppress_footnote_refs: bool,
818) {
819    builder.start_node(SyntaxKind::LINK.into());
820
821    // Opening [
822    builder.start_node(SyntaxKind::LINK_START.into());
823    builder.token(SyntaxKind::LINK_START.into(), "[");
824    builder.finish_node();
825
826    // Link text (recursively parse inline elements). Pandoc-native:
827    // links cannot contain other links, so suppress inner LINK / ref-link
828    // recognition during the recursion. Images, emphasis, code, etc. are
829    // still recognised. CommonMark relies on outer-level process_brackets
830    // to prevent nested links, but the flag is harmless under CM.
831    builder.start_node(SyntaxKind::LINK_TEXT.into());
832    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
833    builder.finish_node();
834
835    // Closing ]
836    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
837
838    // Opening (
839    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
840
841    // Destination
842    builder.start_node(SyntaxKind::LINK_DEST.into());
843    builder.token(SyntaxKind::TEXT.into(), dest);
844    builder.finish_node();
845
846    // Closing )
847    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
848
849    // Emit raw attributes if present (preserve original formatting)
850    if let Some(raw_attrs) = raw_attributes {
851        emit_attribute_node(builder, raw_attrs);
852    }
853
854    builder.finish_node();
855}
856
857pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
858    builder.start_node(SyntaxKind::LINK.into());
859
860    builder.start_node(SyntaxKind::LINK_START.into());
861    builder.token(SyntaxKind::LINK_START.into(), "[");
862    builder.finish_node();
863
864    builder.start_node(SyntaxKind::LINK_TEXT.into());
865    builder.token(SyntaxKind::TEXT.into(), uri);
866    builder.finish_node();
867
868    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
869    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
870
871    builder.start_node(SyntaxKind::LINK_DEST.into());
872    builder.token(SyntaxKind::TEXT.into(), uri);
873    builder.finish_node();
874
875    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
876
877    builder.finish_node();
878}
879
880/// Try to parse a reference link starting at the current position.
881///
882/// Reference links have three forms:
883/// - Explicit: `[text][label]`
884/// - Implicit: `[text][]` (label = text)
885/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
886///
887/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
888/// The label is what should be looked up in the registry.
889pub fn try_parse_reference_link(
890    text: &str,
891    allow_shortcut: bool,
892    inline_link_attempted: bool,
893    ctx: LinkScanContext,
894) -> Option<(usize, &str, String, bool)> {
895    if !text.starts_with('[') {
896        return None;
897    }
898
899    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
900    if text.len() > 1 {
901        let bytes = text.as_bytes();
902        if bytes[1] == b'@' {
903            return None;
904        }
905        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
906            return None;
907        }
908    }
909
910    // Find the closing ] for the text. Uses the shared helper so that a
911    // `]` inside a code span doesn't terminate the link text (CommonMark
912    // §6 — code spans bind tighter than links). See spec examples #342
913    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
914    // opaque per `ctx`.
915    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
916    let link_text = &text[1..close_bracket];
917
918    // CommonMark §6.4: outer reference link is rejected when its text contains
919    // a valid inner inline link/image (spec example #532). Reference-link
920    // nesting (#533/#569/#571) is not handled here; it requires resolving
921    // labels against the document refdef map.
922    if ctx.disallow_inner_links
923        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
924    {
925        return None;
926    }
927
928    // Check what follows the ]
929    let after_bracket = close_bracket + 1;
930
931    // `[content]{...}` is reserved for bracketed spans / attribute
932    // trailers, never a shortcut.
933    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
934        return None;
935    }
936
937    // `[text](...)` is the inline-link shape. CommonMark spec example
938    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
939    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
940    // the upstream inline-link parse was rejected by `strict_dest`. We
941    // only fall through to shortcut here when the caller has already
942    // tried the inline-link form (`inline_link_attempted`) — otherwise
943    // disabling the `inline_links` extension would silently let
944    // `[text](url)` become a shortcut + literal text, which the
945    // `inline_links_disabled_keeps_inline_link_literal` test guards
946    // against.
947    if after_bracket < text.len()
948        && text[after_bracket..].starts_with('(')
949        && (!allow_shortcut || !inline_link_attempted)
950    {
951        return None;
952    }
953
954    // Check for explicit reference [text][label] or implicit [text][]
955    if after_bracket < text.len() && text[after_bracket..].starts_with('[') {
956        // Find the closing ] for the label
957        let label_start = after_bracket + 1;
958        let mut label_end = None;
959
960        for (i, ch) in text[label_start..].char_indices() {
961            if ch == ']' {
962                label_end = Some(i + label_start);
963                break;
964            }
965            // Labels can't contain newlines
966            if ch == '\n' {
967                return None;
968            }
969        }
970
971        let label_end = label_end?;
972        let label = &text[label_start..label_end];
973
974        // Total length includes both bracket pairs
975        let total_len = label_end + 1;
976
977        // Implicit reference: empty label means emit [text][]
978        if label.is_empty() {
979            return Some((total_len, link_text, String::new(), false));
980        }
981
982        // Explicit reference: use the provided label
983        Some((total_len, link_text, label.to_string(), false))
984    } else if allow_shortcut {
985        // Shortcut reference: [text] with no second bracket pair
986        // The text is both the display text and the label
987        if link_text.is_empty() {
988            return None;
989        }
990        Some((after_bracket, link_text, link_text.to_string(), true))
991    } else {
992        // No second bracket pair and shortcut not allowed - not a reference link
993        None
994    }
995}
996
997/// Emit a reference link node to the builder.
998/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
999pub fn emit_reference_link(
1000    builder: &mut GreenNodeBuilder,
1001    link_text: &str,
1002    label: &str,
1003    is_shortcut: bool,
1004    config: &ParserOptions,
1005    suppress_footnote_refs: bool,
1006) {
1007    builder.start_node(SyntaxKind::LINK.into());
1008
1009    // Opening [
1010    builder.start_node(SyntaxKind::LINK_START.into());
1011    builder.token(SyntaxKind::LINK_START.into(), "[");
1012    builder.finish_node();
1013
1014    // Link text (recursively parse inline elements). Pandoc-native:
1015    // links cannot contain other links, so suppress inner LINK / ref-link
1016    // recognition during the recursion. Images, emphasis, code, etc. are
1017    // still recognised.
1018    builder.start_node(SyntaxKind::LINK_TEXT.into());
1019    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
1020    builder.finish_node();
1021
1022    // Closing ] and reference label
1023    builder.token(SyntaxKind::TEXT.into(), "]");
1024
1025    if !is_shortcut {
1026        // Explicit or implicit reference: [text][label] or [text][]
1027        builder.token(SyntaxKind::TEXT.into(), "[");
1028        builder.start_node(SyntaxKind::LINK_REF.into());
1029        // For implicit references, label is empty and we emit [text][]
1030        // For explicit references, emit the label to get [text][label]
1031        if !label.is_empty() {
1032            builder.token(SyntaxKind::TEXT.into(), label);
1033        }
1034        builder.finish_node();
1035        builder.token(SyntaxKind::TEXT.into(), "]");
1036    }
1037    // For shortcut references, just [text] - no second bracket pair
1038
1039    builder.finish_node();
1040}
1041
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]`, `![alt][]`, or `![alt]`.
///
/// Returns `Some((total_len, alt_text, label, is_shortcut))` on success:
/// - `total_len` is the number of bytes consumed from `text`.
/// - `alt_text` is the text between `![` and its matching `]`; nested
///   brackets are allowed and a backslash escapes the following byte.
/// - `label` is the explicit label with its original case preserved. For
///   the implicit (`![alt][]`) and shortcut (`![alt]`) forms it is a copy
///   of `alt_text`.
/// - `is_shortcut` is true only for the `![alt]` form.
///
/// The shortcut form is recognised only when `allow_shortcut` is true, the
/// alt text is non-empty (an empty label can never be looked up, matching
/// `try_parse_reference_link`), and the closing `]` is not followed by `(`
/// (that shape is an inline image). It is accepted both mid-text and at the
/// very end of `text`.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    let bytes = text.as_bytes();
    // Shortest possible match is `![x]`.
    if bytes.len() < 4 || bytes[0] != b'!' || bytes[1] != b'[' {
        return None;
    }

    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1;

    // Find the end of the alt text (allowing nested brackets).
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped char
            _ => {}
        }
        pos += 1;
    }

    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` is one past the matching `]`.
    let alt_text = &text[alt_start..pos - 1];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if bytes.get(pos) == Some(&b'[') {
        let label_start = pos + 1;

        // The label runs to the first `]`; it may not contain a line break.
        let label_len = bytes[label_start..]
            .iter()
            .position(|&b| b == b']' || b == b'\n' || b == b'\r')?;
        let label_end = label_start + label_len;
        if bytes[label_end] != b']' {
            return None;
        }

        let label_text = &text[label_start..label_end];

        // An empty label means implicit reference: the alt text doubles as
        // the label. Otherwise keep the label exactly as written.
        let label = if label_text.is_empty() {
            alt_text
        } else {
            label_text
        };

        return Some((label_end + 1, alt_text, label.to_string(), false));
    }

    // Shortcut `![alt]`. This also covers the case where the closing `]` is
    // the last byte of `text`; no following byte is required.
    if !allow_shortcut || alt_text.is_empty() {
        return None;
    }

    // `![alt](...)` is an inline image, not a reference.
    if bytes.get(pos) == Some(&b'(') {
        return None;
    }

    Some((pos, alt_text, alt_text.to_string(), true))
}
1123
1124/// Emit a reference image node with registry lookup.
1125pub fn emit_reference_image(
1126    builder: &mut GreenNodeBuilder,
1127    alt_text: &str,
1128    label: &str,
1129    is_shortcut: bool,
1130    config: &ParserOptions,
1131    suppress_footnote_refs: bool,
1132) {
1133    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1134
1135    // Emit as reference image (preserve original syntax)
1136    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1137    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1138    builder.finish_node();
1139
1140    // Alt text (recursively parse inline elements)
1141    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1142    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
1143    builder.finish_node();
1144
1145    // Closing ] and reference label
1146    builder.token(SyntaxKind::TEXT.into(), "]");
1147
1148    if !is_shortcut {
1149        // Explicit or implicit reference: ![alt][label] or ![alt][]
1150        builder.token(SyntaxKind::TEXT.into(), "[");
1151        builder.start_node(SyntaxKind::LINK_REF.into());
1152        // For implicit references, emit empty label (label == alt means implicit from parser)
1153        if label != alt_text {
1154            builder.token(SyntaxKind::TEXT.into(), label);
1155        }
1156        builder.finish_node();
1157        builder.token(SyntaxKind::TEXT.into(), "]");
1158    }
1159    // For shortcut references, just ![alt] - no second bracket pair
1160
1161    builder.finish_node();
1162}
1163
1164/// Emit an `UNRESOLVED_REFERENCE` node for a Pandoc bracket-shape
1165/// pattern whose label didn't resolve. The wrapper covers the original
1166/// bracket bytes; the inner text recurses through normal inline
1167/// parsing (with inner-link suppression so a stray inner inline link
1168/// doesn't reorder semantics relative to pandoc-native).
1169///
1170/// `source` is `text[start..end]` — the full bracket-shape pattern.
1171/// `text_content` is the inner text between the outer `[` and `]`
1172/// (the bytes used for inline recursion). `label_suffix` carries the
1173/// `[label]` / `[]` suffix bytes verbatim, or `None` for shortcut form.
1174pub fn emit_unresolved_reference(
1175    builder: &mut GreenNodeBuilder,
1176    is_image: bool,
1177    text_content: &str,
1178    label_suffix: Option<&str>,
1179    config: &ParserOptions,
1180    suppress_footnote_refs: bool,
1181) {
1182    builder.start_node(SyntaxKind::UNRESOLVED_REFERENCE.into());
1183
1184    if is_image {
1185        builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1186        builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1187        builder.finish_node();
1188        builder.start_node(SyntaxKind::IMAGE_ALT.into());
1189        parse_inline_text(builder, text_content, config, false, suppress_footnote_refs);
1190        builder.finish_node();
1191    } else {
1192        builder.start_node(SyntaxKind::LINK_START.into());
1193        builder.token(SyntaxKind::LINK_START.into(), "[");
1194        builder.finish_node();
1195        builder.start_node(SyntaxKind::LINK_TEXT.into());
1196        parse_inline_text(builder, text_content, config, true, suppress_footnote_refs);
1197        builder.finish_node();
1198    }
1199
1200    builder.token(SyntaxKind::TEXT.into(), "]");
1201
1202    if let Some(suffix) = label_suffix {
1203        // suffix is either "[label]" or "[]"; preserve original bytes.
1204        // Split as `[` + LINK_REF(label) + `]` so wrapper accessors find
1205        // the label via `support::child::<LinkRef>()`.
1206        debug_assert!(suffix.starts_with('[') && suffix.ends_with(']'));
1207        builder.token(SyntaxKind::TEXT.into(), "[");
1208        let label = &suffix[1..suffix.len() - 1];
1209        builder.start_node(SyntaxKind::LINK_REF.into());
1210        if !label.is_empty() {
1211            builder.token(SyntaxKind::TEXT.into(), label);
1212        }
1213        builder.finish_node();
1214        builder.token(SyntaxKind::TEXT.into(), "]");
1215    }
1216
1217    builder.finish_node();
1218}
1219
// Unit tests for the scanner functions in this file: autolinks, inline
// links/images (with and without attributes), reference links/images, and
// multi-line link text.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_autolink_url() {
        let input = "<https://example.com>";
        assert_eq!(
            try_parse_autolink(input, false),
            Some((21, "https://example.com"))
        );
        assert_eq!(
            try_parse_autolink(input, true),
            Some((21, "https://example.com"))
        );
    }

    #[test]
    fn test_parse_autolink_email() {
        let input = "<user@example.com>";
        assert_eq!(
            try_parse_autolink(input, false),
            Some((18, "user@example.com"))
        );
        assert_eq!(
            try_parse_autolink(input, true),
            Some((18, "user@example.com"))
        );
    }

    #[test]
    fn test_parse_autolink_no_close() {
        let input = "<https://example.com";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_with_space() {
        let input = "<https://example.com >";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_not_url_or_email() {
        let input = "<notaurl>";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_commonmark_strict_scheme() {
        // Scheme too short (1 char) — invalid under CommonMark, lax-accepted
        // under Pandoc dialect (matches historical behavior).
        let input = "<m:abc>";
        assert_eq!(try_parse_autolink(input, true), None);
        assert_eq!(try_parse_autolink(input, false), Some((7, "m:abc")));
    }

    #[test]
    fn test_parse_autolink_commonmark_email_disallows_backslash() {
        let input = "<foo\\+@bar.example.com>";
        assert_eq!(try_parse_autolink(input, true), None);
        assert_eq!(
            try_parse_autolink(input, false),
            Some((23, "foo\\+@bar.example.com"))
        );
    }

    #[test]
    fn test_parse_inline_link_simple() {
        let input = "[text](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((11, "text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_with_title() {
        let input = r#"[text](url "title")"#;
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((19, "text", r#"url "title""#, None)));
    }

    #[test]
    fn test_parse_inline_link_with_nested_brackets() {
        let input = "[outer [inner] text](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((25, "outer [inner] text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
        let input = "[text] (url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_bracket() {
        let input = "[text(url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_paren() {
        let input = "[text](url";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_escaped_bracket() {
        let input = r"[text\]more](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((17, r"text\]more", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_parens_in_url() {
        let input = "[text](url(with)parens)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((23, "text", "url(with)parens", None)));
    }

    #[test]
    fn test_parse_inline_image_simple() {
        let input = "![alt](image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((17, "alt", "image.jpg", None)));
    }

    #[test]
    fn test_parse_inline_image_with_title() {
        let input = r#"![alt](image.jpg "A title")"#;
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((27, "alt", r#"image.jpg "A title""#, None)));
    }

    #[test]
    fn test_parse_inline_image_with_nested_brackets() {
        let input = "![outer [inner] alt](image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((31, "outer [inner] alt", "image.jpg", None)));
    }

    #[test]
    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
        let input = r"a:\]";
        let result = try_parse_bare_uri(input);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
        let input = "![alt] (image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_bracket() {
        let input = "![alt(image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_paren() {
        let input = "![alt](image.jpg";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_with_simple_class() {
        let input = "![alt](img.png){.large}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 23);
        assert_eq!(alt, "alt");
        assert_eq!(dest, "img.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{.large}");
    }

    #[test]
    fn test_parse_inline_image_with_id() {
        let input = "![Figure 1](fig1.png){#fig-1}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 29);
        assert_eq!(alt, "Figure 1");
        assert_eq!(dest, "fig1.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#fig-1}");
    }

    #[test]
    fn test_parse_inline_image_with_full_attributes() {
        let input = "![alt](img.png){#fig .large width=\"80%\"}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 40);
        assert_eq!(alt, "alt");
        assert_eq!(dest, "img.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#fig .large width=\"80%\"}");
    }

    #[test]
    fn test_parse_inline_image_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        let input = "![alt](img.png) {.large}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((15, "alt", "img.png", None)));
    }

    // Link attribute tests
    #[test]
    fn test_parse_inline_link_with_id() {
        let input = "[text](url){#link-1}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 20);
        assert_eq!(text, "text");
        assert_eq!(dest, "url");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#link-1}");
    }

    #[test]
    fn test_parse_inline_link_with_full_attributes() {
        let input = "[text](url){#link .external target=\"_blank\"}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 44);
        assert_eq!(text, "text");
        assert_eq!(dest, "url");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#link .external target=\"_blank\"}");
    }

    #[test]
    fn test_parse_inline_link_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        let input = "[text](url) {.class}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((11, "text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_with_title_and_attributes() {
        let input = r#"[text](url "title"){.external}"#;
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 30);
        assert_eq!(text, "text");
        assert_eq!(dest, r#"url "title""#);
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{.external}");
    }

    // Reference link tests
    #[test]
    fn test_parse_reference_link_explicit() {
        let input = "[link text][label]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((18, "link text", "label".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_link_implicit() {
        let input = "[link text][]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((13, "link text", String::new(), false)));
    }

    #[test]
    fn test_parse_reference_link_explicit_same_label_as_text() {
        let input = "[stack][stack]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((14, "stack", "stack".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_link_shortcut() {
        let input = "[link text] rest";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(
            result,
            Some((11, "link text", "link text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_rejects_empty_label() {
        let input = "[] rest";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_disabled() {
        let input = "[link text] rest";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_not_inline_link() {
        // With shortcut disabled, `[text](url)` is rejected so the inline
        // link form upstream gets exclusive ownership.
        let input = "[text](url)";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
        // CommonMark spec example #568: when an inline-link attempt would
        // fail (here we model the reachability — the caller tries inline
        // link first; if that returns None, we should still see `[text]`
        // as a shortcut and leave `(url)` to be parsed as following text).
        let input = "[text](url)";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(result, Some((6, "text", "text".to_string(), true)));
    }

    #[test]
    fn test_parse_reference_link_with_nested_brackets() {
        let input = "[outer [inner] text][ref]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(
            result,
            Some((25, "outer [inner] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_label_no_newline() {
        let input = "[text][label\nmore]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    // Reference image tests
    #[test]
    fn test_parse_reference_image_explicit() {
        let input = "![alt text][label]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(result, Some((18, "alt text", "label".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_image_implicit() {
        let input = "![alt text][]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(
            result,
            Some((13, "alt text", "alt text".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut() {
        let input = "![alt text] rest";
        let result = try_parse_reference_image(input, true);
        assert_eq!(result, Some((11, "alt text", "alt text".to_string(), true)));
    }

    #[test]
    fn test_parse_reference_image_shortcut_disabled() {
        let input = "![alt text] rest";
        let result = try_parse_reference_image(input, false);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_image_not_inline() {
        // Should not match inline images with (url)
        let input = "![alt](url)";
        let result = try_parse_reference_image(input, true);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_image_with_nested_brackets() {
        let input = "![alt [nested] text][ref]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(
            result,
            Some((25, "alt [nested] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_reference_link_label_with_crlf() {
        // Reference link labels should not span lines with CRLF
        let input = "[foo\r\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());

        // No second bracket pair and shortcuts are disabled, so this is
        // rejected without ever scanning a label.
        assert_eq!(
            result, None,
            "Should not parse reference link with CRLF in label"
        );

        // Exercise the label scanner itself: an explicit label that spans
        // a CRLF line break must be rejected.
        let input = "[text][foo\r\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(
            result, None,
            "Should not parse reference link with CRLF in label"
        );
    }

    #[test]
    fn test_reference_link_label_with_lf() {
        // Reference link labels should not span lines with LF either
        let input = "[foo\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());

        // No second bracket pair and shortcuts are disabled, so this is
        // rejected without ever scanning a label.
        assert_eq!(
            result, None,
            "Should not parse reference link with LF in label"
        );

        // Exercise the label scanner itself: an explicit label that spans
        // an LF line break must be rejected.
        let input = "[text][foo\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(
            result, None,
            "Should not parse reference link with LF in label"
        );
    }

    // Multiline link text tests
    #[test]
    fn test_parse_inline_link_multiline_text() {
        // Per Pandoc spec, link text CAN contain newlines (soft breaks)
        let input = "[text on\nline two](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(
            result,
            Some((23, "text on\nline two", "url", None)),
            "Link text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_link_multiline_with_formatting() {
        // Link text with newlines and other inline elements
        let input =
            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert!(result.is_some(), "Link text with newlines should parse");
        let (len, text, _dest, _attrs) = result.unwrap();
        assert!(text.contains('\n'), "Link text should preserve newline");
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_image_multiline_alt() {
        // Per Pandoc spec, image alt text CAN contain newlines
        let input = "![alt on\nline two](img.png)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(
            result,
            Some((27, "alt on\nline two", "img.png", None)),
            "Image alt text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_image_multiline_with_attributes() {
        // Image with multiline alt text and attributes
        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert!(
            result.is_some(),
            "Image alt with newlines and attributes should parse"
        );
        let (len, alt, dest, attrs) = result.unwrap();
        assert!(alt.contains('\n'), "Alt text should preserve newline");
        assert_eq!(dest, "../images/fig.png");
        assert_eq!(attrs, Some("{width=70%}"));
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_link_with_attributes_after_newline() {
        // Test for regression: when text is concatenated with newlines,
        // attributes after ) should still be recognized
        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert!(
            result.is_some(),
            "Link with attributes should parse even with following text"
        );
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(text, "A network graph.");
        assert_eq!(dest, "../images/networkfig.png");
        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
        assert_eq!(
            len, 55,
            "Length should include attributes (up to closing brace)"
        );
    }
}
panache_parser/parser/inlines/links.rs

panache_parser/parser/inlines/
links.rs