panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use crate::options::ParserOptions;
16use crate::syntax::SyntaxKind;
17use rowan::GreenNodeBuilder;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::try_parse_trailing_attributes;
21
/// Flags that control which inline spans the link-bracket scanner treats as
/// opaque (so a `]` inside them does not terminate the link/image text).
///
/// - `skip_raw_html` is universal across dialects: pandoc-markdown and
///   CommonMark both refuse to close link text inside a raw HTML span (e.g.
///   `[foo <bar attr="](baz)">`), per CommonMark spec example #524 / #536.
///   `from_options` copies it from `extensions.raw_html`.
/// - `skip_autolinks` is **CommonMark-only**. Pandoc-markdown does *not*
///   treat `<scheme://...>` as opaque inside link text, so the same input
///   produces a different parse under each dialect (CommonMark spec example
///   #526 / #538). Always derive this from
///   `extensions.autolinks && dialect == Dialect::CommonMark`.
/// - `disallow_inner_links` is a **CommonMark-only** structural rule (§6.4):
///   "Links may not contain other links, at any level of nesting." When the
///   candidate link text contains a valid inline link, the outer match is
///   rejected so the inner-most definition is used instead (spec examples
///   #518–#520, #532). An inner image does not trigger the rejection by
///   itself; it only does so when its alt text in turn contains an inline
///   link. Pandoc-markdown allows nested links, so the flag is `false` there.
#[derive(Clone, Copy)]
pub struct LinkScanContext {
    /// Treat inline raw HTML spans as opaque while scanning for `]`.
    pub skip_raw_html: bool,
    /// Treat `<...>` autolinks as opaque while scanning for `]`
    /// (CommonMark only).
    pub skip_autolinks: bool,
    /// Reject an outer link whose text contains a valid inline link
    /// (CommonMark only).
    pub disallow_inner_links: bool,
    /// Dialect controlling which HTML constructs the raw-HTML opacity check
    /// recognizes. Pandoc-markdown excludes bare declarations and CDATA
    /// from its inline raw HTML grammar.
    pub dialect: crate::options::Dialect,
}
49
50impl Default for LinkScanContext {
51    fn default() -> Self {
52        Self {
53            skip_raw_html: false,
54            skip_autolinks: false,
55            disallow_inner_links: false,
56            dialect: crate::options::Dialect::Pandoc,
57        }
58    }
59}
60
61impl LinkScanContext {
62    pub fn from_options(config: &ParserOptions) -> Self {
63        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
64        Self {
65            skip_raw_html: config.extensions.raw_html,
66            skip_autolinks: config.extensions.autolinks && is_commonmark,
67            disallow_inner_links: is_commonmark,
68            dialect: config.dialect,
69        }
70    }
71}
72
73/// Find the closing `]` of a link/image text span, starting from `start`.
74///
75/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
76/// a backtick run starting a valid code span is encountered, the entire span
77/// (including any trailing attribute block) is skipped — per CommonMark §6
78/// precedence, code spans bind tighter than links/images, so a `]` *inside*
79/// a code span cannot terminate the link's text. The same opacity applies to
80/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
81/// Returns the byte offset of the closing `]` within `text`, or `None` if no
82/// unmatched `]` is reached.
83fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
84    let bytes = text.as_bytes();
85    let mut bracket_depth = 0;
86    let mut escape_next = false;
87    let mut i = start;
88
89    while i < bytes.len() {
90        let b = bytes[i];
91
92        if escape_next {
93            escape_next = false;
94            i += step(text, i);
95            continue;
96        }
97
98        match b {
99            b'\\' => {
100                escape_next = true;
101                i += 1;
102            }
103            b'`' => {
104                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
105                    i += len;
106                } else {
107                    i += 1;
108                }
109            }
110            b'<' => {
111                // Order matters: autolinks are the more specific `<...>`
112                // shape (URI/email between angle brackets), so try that
113                // before falling through to general inline raw HTML which
114                // would also match `<bar attr="...">`-style tags.
115                if ctx.skip_autolinks
116                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
117                {
118                    i += len;
119                } else if ctx.skip_raw_html
120                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
121                {
122                    i += len;
123                } else {
124                    i += 1;
125                }
126            }
127            b'[' => {
128                bracket_depth += 1;
129                i += 1;
130            }
131            b']' => {
132                if bracket_depth == 0 {
133                    return Some(i);
134                }
135                bracket_depth -= 1;
136                i += 1;
137            }
138            _ => i += step(text, i),
139        }
140    }
141    None
142}
143
144/// Find the closing `)` of a link/image destination, given the text *after*
145/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
146/// destinations (`<...>` may legitimately contain unbalanced parens — see
147/// spec example #499). Returns the byte offset of the closing `)` within the
148/// passed slice, or `None` if not found.
149fn find_dest_close_paren(remaining: &str) -> Option<usize> {
150    let bytes = remaining.as_bytes();
151    let mut paren_depth = 0;
152    let mut escape_next = false;
153    let mut in_quotes = false;
154    let mut in_angle = false;
155    let mut i = 0;
156
157    while i < bytes.len() {
158        let b = bytes[i];
159
160        if escape_next {
161            escape_next = false;
162            i += step(remaining, i);
163            continue;
164        }
165
166        match b {
167            b'\\' => {
168                escape_next = true;
169                i += 1;
170            }
171            b'<' if !in_quotes && !in_angle => {
172                in_angle = true;
173                i += 1;
174            }
175            b'>' if in_angle => {
176                in_angle = false;
177                i += 1;
178            }
179            b'"' if !in_angle => {
180                in_quotes = !in_quotes;
181                i += 1;
182            }
183            b'(' if !in_quotes && !in_angle => {
184                paren_depth += 1;
185                i += 1;
186            }
187            b')' if !in_quotes && !in_angle => {
188                if paren_depth == 0 {
189                    return Some(i);
190                }
191                paren_depth -= 1;
192                i += 1;
193            }
194            _ => i += step(remaining, i),
195        }
196    }
197    None
198}
199
/// Width in bytes of the UTF-8 character that begins at byte index `i` of
/// `s`, or `1` when `i` is at the end of the string.
///
/// Lets byte-index loops advance one whole character at a time. `i` must
/// lie on a character boundary (slicing panics otherwise).
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
206
207/// CommonMark §6.4: "Links may not contain other links, at any level of
208/// nesting. If multiple otherwise valid link definitions appear nested inside
209/// each other, the inner-most definition is used." This helper scans a
210/// candidate link text for any `[` that starts a valid inline link; when
211/// found, the outer link must be rejected so the inner-most wins (spec
212/// examples #518–#519, #532).
213///
214/// Images themselves do not count as inner links — a link can contain an
215/// image (#517, #531). A link *inside* an image's alt text, however, still
216/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
217/// the helper recurses into image alt text looking for inner links.
218///
219/// Reference-link nesting (#533, #569, #571) requires resolving labels
220/// against the document's reference-definition map, which the parser does
221/// not have at this point — those cases remain unhandled and need a later
222/// stack-based pass.
223fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
224    let bytes = text.as_bytes();
225    let mut i = 0;
226    let mut escape_next = false;
227    while i < bytes.len() {
228        let b = bytes[i];
229        if escape_next {
230            escape_next = false;
231            i += step(text, i);
232            continue;
233        }
234        match b {
235            b'\\' => {
236                escape_next = true;
237                i += 1;
238            }
239            b'`' => {
240                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
241                    i += len;
242                } else {
243                    i += 1;
244                }
245            }
246            b'<' => {
247                if ctx.skip_autolinks
248                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
249                {
250                    i += len;
251                } else if ctx.skip_raw_html
252                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
253                {
254                    i += len;
255                } else {
256                    i += 1;
257                }
258            }
259            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
260                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
261                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
262                        return true;
263                    }
264                    i += len;
265                } else {
266                    i += 2;
267                }
268            }
269            b'[' => {
270                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
271                    return true;
272                }
273                i += 1;
274            }
275            _ => i += step(text, i),
276        }
277    }
278    false
279}
280
281/// Try to parse an inline image starting at the current position.
282///
283/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
284/// Can also have trailing attributes: `![alt](url){#id .class}`.
285/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
286///
287/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
288/// see `LinkScanContext`.
289pub fn try_parse_inline_image(
290    text: &str,
291    ctx: LinkScanContext,
292) -> Option<(usize, &str, &str, Option<&str>)> {
293    if !text.starts_with("![") {
294        return None;
295    }
296
297    // Find the closing ]
298    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
299    let alt_text = &text[2..close_bracket];
300
301    // Check for immediate ( after ]
302    let after_bracket = close_bracket + 1;
303    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
304        return None;
305    }
306
307    // Find closing ) for destination (reuse same logic as links)
308    let dest_start = after_bracket + 1;
309    let remaining = &text[dest_start..];
310
311    let close_paren = find_dest_close_paren(remaining)?;
312    let dest_content = &remaining[..close_paren];
313
314    // Check for trailing attributes {#id .class key=value}
315    let after_paren = dest_start + close_paren + 1;
316    let after_close = &text[after_paren..];
317
318    // Attributes must start immediately after closing paren (no whitespace/newlines)
319    if after_close.starts_with('{') {
320        // Find the closing brace
321        if let Some(close_brace_pos) = after_close.find('}') {
322            let attr_text = &after_close[..=close_brace_pos];
323            // Try to parse as attributes to validate
324            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
325                let total_len = after_paren + close_brace_pos + 1;
326                // Return raw attribute string for lossless parsing
327                let raw_attrs = attr_text;
328                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
329            }
330        }
331    }
332
333    // No attributes, just return the image
334    let total_len = after_paren;
335    Some((total_len, alt_text, dest_content, None))
336}
337
338/// Emit an inline image node to the builder.
339/// Note: alt_text may contain inline elements and should be parsed recursively.
340pub fn emit_inline_image(
341    builder: &mut GreenNodeBuilder,
342    _text: &str,
343    alt_text: &str,
344    dest: &str,
345    raw_attributes: Option<&str>,
346    config: &ParserOptions,
347    suppress_footnote_refs: bool,
348) {
349    builder.start_node(SyntaxKind::IMAGE_LINK.into());
350
351    // Opening ![
352    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
353    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
354    builder.finish_node();
355
356    // Alt text (recursively parse inline elements)
357    builder.start_node(SyntaxKind::IMAGE_ALT.into());
358    // Use the standalone parse_inline_text function for recursive parsing
359    // Note: nested contexts don't resolve references
360    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
361    builder.finish_node();
362
363    // Closing ]
364    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
365
366    // Opening (
367    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
368
369    // Destination
370    builder.start_node(SyntaxKind::LINK_DEST.into());
371    builder.token(SyntaxKind::TEXT.into(), dest);
372    builder.finish_node();
373
374    // Closing )
375    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
376
377    // Emit raw attributes if present (preserve original formatting)
378    if let Some(raw_attrs) = raw_attributes {
379        builder.start_node(SyntaxKind::ATTRIBUTE.into());
380        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
381        builder.finish_node();
382    }
383
384    builder.finish_node();
385}
386
387/// Try to parse an automatic link starting at the current position.
388///
389/// Automatic links have the form `<url>` (URI autolink) or `<email>`
390/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
391/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
392/// ASCII chars; email local parts cannot contain backslashes). Pandoc
393/// markdown is laxer — it accepts Unicode in email addresses, for
394/// example — so non-CommonMark callers fall back to the heuristic
395/// "contains `:` or `@`" check that the parser used historically.
396pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
397    if !text.starts_with('<') {
398        return None;
399    }
400
401    let close_pos = text[1..].find('>')?;
402    let content = &text[1..1 + close_pos];
403
404    if content.is_empty() {
405        return None;
406    }
407    if content.contains(|c: char| c.is_whitespace()) {
408        return None;
409    }
410
411    if is_commonmark {
412        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
413            return None;
414        }
415    } else if !content.contains(':') && !content.contains('@') {
416        return None;
417    }
418
419    Some((close_pos + 2, content))
420}
421
/// CommonMark §6.4 URI autolink check for the text between `<` and `>`.
///
/// Grammar: a scheme of 2-32 characters (an ASCII letter followed by
/// `[a-zA-Z0-9+.-]`), then `:`, then a URI body made of any bytes except
/// ASCII control characters, space, `<`, and `>`. The body may be empty.
///
/// Returns `true` when `s` matches that grammar in full.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    if !bytes.first().is_some_and(u8::is_ascii_alphabetic) {
        return false;
    }

    // Scheme length = leading letter + following run of scheme characters.
    let scheme_len = 1 + bytes[1..]
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric() || matches!(**b, b'+' | b'-' | b'.'))
        .count();
    if !(2..=32).contains(&scheme_len) || bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `> 0x20` rejects the space character as well as the C0 controls; the
    // grammar excludes both.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
452
453/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
454/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
455///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
456fn is_valid_email_autolink(s: &str) -> bool {
457    let Some(at) = s.find('@') else {
458        return false;
459    };
460    let local = &s[..at];
461    let domain = &s[at + 1..];
462    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
463        return false;
464    }
465    if domain.is_empty() {
466        return false;
467    }
468    domain.split('.').all(is_valid_email_label)
469}
470
/// Whether `b` may appear in the local part (before `@`) of an email
/// autolink: an ASCII letter or digit, or one of
/// ``.!#$%&'*+/=?^_`{|}~-``.
fn is_email_local_byte(b: u8) -> bool {
    const PUNCTUATION: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || PUNCTUATION.contains(&b)
}
499
/// Whether `label` is a valid `.`-separated domain label of an email
/// autolink: 1 to 63 bytes long, starting and ending with an ASCII letter or
/// digit, with only ASCII letters, digits, and `-` in between.
///
/// A one-byte label such as `a` is valid.
fn is_valid_email_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    let (Some(first), Some(last)) = (bytes.first(), bytes.last()) else {
        return false;
    };

    // Checking every byte (rather than slicing out the interior) keeps the
    // one-byte case panic-free; the first/last bytes are further restricted
    // to alphanumerics by the dedicated checks.
    bytes.len() <= 63
        && first.is_ascii_alphanumeric()
        && last.is_ascii_alphanumeric()
        && bytes
            .iter()
            .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
}
515
516/// Emit an automatic link node to the builder.
517pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
518    builder.start_node(SyntaxKind::AUTO_LINK.into());
519
520    // Opening <
521    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
522    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
523    builder.finish_node();
524
525    // URL content
526    builder.token(SyntaxKind::TEXT.into(), url);
527
528    // Closing >
529    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
530    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
531    builder.finish_node();
532
533    builder.finish_node();
534}
535
/// Try to parse a bare (unbracketed) URI at the start of `text`.
///
/// The URI must begin with a scheme — an ASCII letter followed by ASCII
/// letters, digits, `+`, `-`, or `.` — terminated by `:`. The body after
/// the colon extends to the first ASCII whitespace byte or one of
/// `<`, `>`, `` ` ``, `"`, `'`. Trailing `.`, `,`, `;`, `:`, `)`, `]`, `}`
/// characters are then trimmed from the body.
///
/// Returns `(length, uri)` for the trimmed match, or `None` when there is
/// no scheme, the body is empty (before or after trimming), or the trimmed
/// match ends in a backslash.
pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
    if !text.starts_with(|c: char| c.is_ascii_alphabetic()) {
        return None;
    }

    // The first character that cannot be part of a scheme must be the colon.
    let colon = text.find(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')))?;
    if !text[colon..].starts_with(':') {
        return None;
    }

    let body_start = colon + 1;
    let body = &text[body_start..];
    let body_len = body
        .bytes()
        .position(|b| b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'`' | b'"' | b'\''))
        .unwrap_or(body.len());
    if body_len == 0 {
        return None;
    }

    let kept = body[..body_len].trim_end_matches(['.', ',', ';', ':', ')', ']', '}']);
    if kept.is_empty() {
        return None;
    }

    // If trimming terminal punctuation leaves a dangling backslash, the match
    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
    if kept.ends_with('\\') {
        return None;
    }

    let total = body_start + kept.len();
    Some((total, &text[..total]))
}
597
598/// Try to parse an inline link starting at the current position.
599///
600/// Inline links have the form `[text](url)` or `[text](url "title")`.
601/// Can also have trailing attributes: `[text](url){#id .class}`.
602/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
603///
604/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
605/// the bare destination form may not contain spaces or ASCII control
606/// characters and must have balanced parentheses; if a title follows it
607/// must be properly delimited; only whitespace is allowed before/after.
608/// Pandoc-markdown is more permissive, so leave this off for that dialect.
609pub fn try_parse_inline_link(
610    text: &str,
611    strict_dest: bool,
612    ctx: LinkScanContext,
613) -> Option<(usize, &str, &str, Option<&str>)> {
614    if !text.starts_with('[') {
615        return None;
616    }
617
618    // Find the closing ]
619    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
620    let link_text = &text[1..close_bracket];
621
622    // Check for immediate ( after ]
623    let after_bracket = close_bracket + 1;
624    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
625        return None;
626    }
627
628    // Find closing ) for destination
629    let dest_start = after_bracket + 1;
630    let remaining = &text[dest_start..];
631
632    let close_paren = find_dest_close_paren(remaining)?;
633    let dest_content = &remaining[..close_paren];
634
635    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
636        return None;
637    }
638
639    // CommonMark §6.4: outer link is rejected when its text contains a valid
640    // inner inline link or image, so the inner-most definition wins.
641    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
642        return None;
643    }
644
645    // Check for trailing attributes {#id .class key=value}
646    let after_paren = dest_start + close_paren + 1;
647    let after_close = &text[after_paren..];
648
649    // Attributes must start immediately after closing paren (no whitespace/newlines)
650    if after_close.starts_with('{') {
651        // Find the closing brace
652        if let Some(close_brace_pos) = after_close.find('}') {
653            let attr_text = &after_close[..=close_brace_pos];
654            // Try to parse as attributes to validate
655            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
656                let total_len = after_paren + close_brace_pos + 1;
657                // Return raw attribute string for lossless parsing
658                let raw_attrs = attr_text;
659                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
660            }
661        }
662    }
663
664    // No attributes, just return the link
665    let total_len = after_paren;
666    Some((total_len, link_text, dest_content, None))
667}
668
669/// CommonMark §6.4 destination + optional title validation. The text passed
670/// in is whatever the parser captured between `(` and `)`. A valid form is:
671/// `[ws] destination [ws title [ws]]` where:
672/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
673///   parentheses (escaped parens permitted);
674/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
675/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
676/// - any text outside that structure invalidates the link.
677fn dest_and_title_ok_commonmark(content: &str) -> bool {
678    let trimmed = trim_start_link_ws(content);
679    if trimmed.is_empty() {
680        return true;
681    }
682
683    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
684        let mut escape = false;
685        let mut end_byte = None;
686        for (i, c) in rest.char_indices() {
687            if escape {
688                escape = false;
689                continue;
690            }
691            match c {
692                '\\' => escape = true,
693                '\n' | '<' => return false,
694                '>' => {
695                    end_byte = Some(i);
696                    break;
697                }
698                _ => {}
699            }
700        }
701        match end_byte {
702            Some(e) => &rest[e + 1..],
703            None => return false,
704        }
705    } else {
706        let mut escape = false;
707        let mut depth: i32 = 0;
708        let mut end = trimmed.len();
709        for (i, c) in trimmed.char_indices() {
710            if escape {
711                escape = false;
712                continue;
713            }
714            match c {
715                '\\' => escape = true,
716                ' ' | '\t' | '\n' => {
717                    end = i;
718                    break;
719                }
720                _ if c.is_ascii_control() => return false,
721                '(' => depth += 1,
722                ')' => {
723                    if depth == 0 {
724                        end = i;
725                        break;
726                    }
727                    depth -= 1;
728                }
729                _ => {}
730            }
731        }
732        if depth != 0 {
733            return false;
734        }
735        if end == 0 {
736            // bare destination must be nonempty if the field is non-blank
737            return false;
738        }
739        &trimmed[end..]
740    };
741
742    let after_dest = trim_start_link_ws(after_dest);
743    if after_dest.is_empty() {
744        return true;
745    }
746
747    let bytes = after_dest.as_bytes();
748    let close = match bytes[0] {
749        b'"' => b'"',
750        b'\'' => b'\'',
751        b'(' => b')',
752        _ => return false,
753    };
754    let opens_paren = bytes[0] == b'(';
755    let mut escape = false;
756    let mut title_close_pos = None;
757    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
758        if escape {
759            escape = false;
760            continue;
761        }
762        if b == b'\\' {
763            escape = true;
764            continue;
765        }
766        if opens_paren && b == b'(' {
767            return false;
768        }
769        if b == close {
770            title_close_pos = Some(i);
771            break;
772        }
773    }
774    let close_idx = match title_close_pos {
775        Some(p) => p,
776        None => return false,
777    };
778
779    let after_title = &after_dest[close_idx + 1..];
780    is_link_ws_only(after_title)
781}
782
/// Strip leading ASCII space/tab/newline bytes. Byte-level equivalent of
/// `s.trim_start_matches([' ', '\t', '\n'])`; called for every
/// CommonMark inline-link destination/title scan, so the slice-pattern
/// MultiCharEqSearcher overhead matters.
///
/// Returns the suffix of `s` starting at the first byte that is not one of
/// those three, or the empty string when `s` consists only of them.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let start = s
        .bytes()
        .position(|b| !matches!(b, b' ' | b'\t' | b'\n'))
        .unwrap_or(s.len());
    // Only single-byte ASCII characters were skipped, so `start` is always
    // on a character boundary and plain slicing cannot panic; no `unsafe`
    // is needed.
    &s[start..]
}
802
/// Whether `s` consists solely of ASCII space, tab, and newline bytes.
/// The empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|byte| matches!(byte, b' ' | b'\t' | b'\n'))
}
809
810/// Emit an inline link node to the builder.
811/// Note: link_text may contain inline elements and should be parsed recursively.
812pub fn emit_inline_link(
813    builder: &mut GreenNodeBuilder,
814    _text: &str,
815    link_text: &str,
816    dest: &str,
817    raw_attributes: Option<&str>,
818    config: &ParserOptions,
819    suppress_footnote_refs: bool,
820) {
821    builder.start_node(SyntaxKind::LINK.into());
822
823    // Opening [
824    builder.start_node(SyntaxKind::LINK_START.into());
825    builder.token(SyntaxKind::LINK_START.into(), "[");
826    builder.finish_node();
827
828    // Link text (recursively parse inline elements). Pandoc-native:
829    // links cannot contain other links, so suppress inner LINK / ref-link
830    // recognition during the recursion. Images, emphasis, code, etc. are
831    // still recognised. CommonMark relies on outer-level process_brackets
832    // to prevent nested links, but the flag is harmless under CM.
833    builder.start_node(SyntaxKind::LINK_TEXT.into());
834    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
835    builder.finish_node();
836
837    // Closing ]
838    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
839
840    // Opening (
841    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
842
843    // Destination
844    builder.start_node(SyntaxKind::LINK_DEST.into());
845    builder.token(SyntaxKind::TEXT.into(), dest);
846    builder.finish_node();
847
848    // Closing )
849    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
850
851    // Emit raw attributes if present (preserve original formatting)
852    if let Some(raw_attrs) = raw_attributes {
853        builder.start_node(SyntaxKind::ATTRIBUTE.into());
854        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
855        builder.finish_node();
856    }
857
858    builder.finish_node();
859}
860
861pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
862    builder.start_node(SyntaxKind::LINK.into());
863
864    builder.start_node(SyntaxKind::LINK_START.into());
865    builder.token(SyntaxKind::LINK_START.into(), "[");
866    builder.finish_node();
867
868    builder.start_node(SyntaxKind::LINK_TEXT.into());
869    builder.token(SyntaxKind::TEXT.into(), uri);
870    builder.finish_node();
871
872    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
873    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
874
875    builder.start_node(SyntaxKind::LINK_DEST.into());
876    builder.token(SyntaxKind::TEXT.into(), uri);
877    builder.finish_node();
878
879    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
880
881    builder.finish_node();
882}
883
884/// Try to parse a reference link starting at the current position.
885///
886/// Reference links have three forms:
887/// - Explicit: `[text][label]`
888/// - Implicit: `[text][]` (label = text)
889/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
890///
891/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
892/// The label is what should be looked up in the registry.
893pub fn try_parse_reference_link(
894    text: &str,
895    allow_shortcut: bool,
896    inline_link_attempted: bool,
897    ctx: LinkScanContext,
898) -> Option<(usize, &str, String, bool)> {
899    if !text.starts_with('[') {
900        return None;
901    }
902
903    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
904    if text.len() > 1 {
905        let bytes = text.as_bytes();
906        if bytes[1] == b'@' {
907            return None;
908        }
909        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
910            return None;
911        }
912    }
913
914    // Find the closing ] for the text. Uses the shared helper so that a
915    // `]` inside a code span doesn't terminate the link text (CommonMark
916    // §6 — code spans bind tighter than links). See spec examples #342
917    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
918    // opaque per `ctx`.
919    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
920    let link_text = &text[1..close_bracket];
921
922    // CommonMark §6.4: outer reference link is rejected when its text contains
923    // a valid inner inline link/image (spec example #532). Reference-link
924    // nesting (#533/#569/#571) is not handled here; it requires resolving
925    // labels against the document refdef map.
926    if ctx.disallow_inner_links
927        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
928    {
929        return None;
930    }
931
932    // Check what follows the ]
933    let after_bracket = close_bracket + 1;
934
935    // `[content]{...}` is reserved for bracketed spans / attribute
936    // trailers, never a shortcut.
937    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
938        return None;
939    }
940
941    // `[text](...)` is the inline-link shape. CommonMark spec example
942    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
943    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
944    // the upstream inline-link parse was rejected by `strict_dest`. We
945    // only fall through to shortcut here when the caller has already
946    // tried the inline-link form (`inline_link_attempted`) — otherwise
947    // disabling the `inline_links` extension would silently let
948    // `[text](url)` become a shortcut + literal text, which the
949    // `inline_links_disabled_keeps_inline_link_literal` test guards
950    // against.
951    if after_bracket < text.len()
952        && text[after_bracket..].starts_with('(')
953        && (!allow_shortcut || !inline_link_attempted)
954    {
955        return None;
956    }
957
958    // Check for explicit reference [text][label] or implicit [text][]
959    if after_bracket < text.len() && text[after_bracket..].starts_with('[') {
960        // Find the closing ] for the label
961        let label_start = after_bracket + 1;
962        let mut label_end = None;
963
964        for (i, ch) in text[label_start..].char_indices() {
965            if ch == ']' {
966                label_end = Some(i + label_start);
967                break;
968            }
969            // Labels can't contain newlines
970            if ch == '\n' {
971                return None;
972            }
973        }
974
975        let label_end = label_end?;
976        let label = &text[label_start..label_end];
977
978        // Total length includes both bracket pairs
979        let total_len = label_end + 1;
980
981        // Implicit reference: empty label means emit [text][]
982        if label.is_empty() {
983            return Some((total_len, link_text, String::new(), false));
984        }
985
986        // Explicit reference: use the provided label
987        Some((total_len, link_text, label.to_string(), false))
988    } else if allow_shortcut {
989        // Shortcut reference: [text] with no second bracket pair
990        // The text is both the display text and the label
991        if link_text.is_empty() {
992            return None;
993        }
994        Some((after_bracket, link_text, link_text.to_string(), true))
995    } else {
996        // No second bracket pair and shortcut not allowed - not a reference link
997        None
998    }
999}
1000
1001/// Emit a reference link node to the builder.
1002/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
1003pub fn emit_reference_link(
1004    builder: &mut GreenNodeBuilder,
1005    link_text: &str,
1006    label: &str,
1007    is_shortcut: bool,
1008    config: &ParserOptions,
1009    suppress_footnote_refs: bool,
1010) {
1011    builder.start_node(SyntaxKind::LINK.into());
1012
1013    // Opening [
1014    builder.start_node(SyntaxKind::LINK_START.into());
1015    builder.token(SyntaxKind::LINK_START.into(), "[");
1016    builder.finish_node();
1017
1018    // Link text (recursively parse inline elements). Pandoc-native:
1019    // links cannot contain other links, so suppress inner LINK / ref-link
1020    // recognition during the recursion. Images, emphasis, code, etc. are
1021    // still recognised.
1022    builder.start_node(SyntaxKind::LINK_TEXT.into());
1023    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
1024    builder.finish_node();
1025
1026    // Closing ] and reference label
1027    builder.token(SyntaxKind::TEXT.into(), "]");
1028
1029    if !is_shortcut {
1030        // Explicit or implicit reference: [text][label] or [text][]
1031        builder.token(SyntaxKind::TEXT.into(), "[");
1032        builder.start_node(SyntaxKind::LINK_REF.into());
1033        // For implicit references, label is empty and we emit [text][]
1034        // For explicit references, emit the label to get [text][label]
1035        if !label.is_empty() {
1036            builder.token(SyntaxKind::TEXT.into(), label);
1037        }
1038        builder.finish_node();
1039        builder.token(SyntaxKind::TEXT.into(), "]");
1040    }
1041    // For shortcut references, just [text] - no second bracket pair
1042
1043    builder.finish_node();
1044}
1045
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]`, `![alt][]`, or `![alt]`.
///
/// Returns `Some((total_len, alt_text, label, is_shortcut))` on success:
/// - `total_len` is the number of bytes consumed from `text`;
/// - `alt_text` is the slice between `![` and its matching `]` (nested
///   brackets are balanced, backslash-escaped bytes are skipped);
/// - `label` is the explicit label with its original case preserved, or a
///   copy of `alt_text` for the implicit (`[]`) and shortcut forms;
/// - `is_shortcut` is `true` only for the `![alt]` form.
///
/// The shortcut form is only recognised when `allow_shortcut` is set and the
/// alt text is not directly followed by `(` (that shape is an inline image).
/// A shortcut image that ends exactly at the end of `text` is accepted too,
/// matching how shortcut reference links behave at end of input.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    let bytes = text.as_bytes();
    if bytes.len() < 4 || bytes[0] != b'!' || bytes[1] != b'[' {
        return None;
    }

    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1;

    // Walk to the `]` that balances the opening `![`, allowing nested
    // brackets inside the alt text.
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped byte
            _ => {}
        }
        pos += 1;
    }

    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` now sits one past the closing `]`.
    let alt_text = &text[alt_start..pos - 1];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if bytes.get(pos) == Some(&b'[') {
        let label_start = pos + 1;

        // The label ends at the first `]`; it may not contain nested
        // brackets' closers or line breaks, and an unterminated label is
        // not retried as a shortcut.
        let label_len = bytes[label_start..]
            .iter()
            .position(|&b| matches!(b, b']' | b'\n' | b'\r'))?;
        let label_end = label_start + label_len;
        if bytes[label_end] != b']' {
            return None;
        }

        // An empty label means implicit reference: the alt text stands in
        // as the label so callers can detect the form by equality.
        let label_text = &text[label_start..label_end];
        let label = if label_text.is_empty() {
            alt_text
        } else {
            label_text
        };

        return Some((label_end + 1, alt_text, label.to_string(), false));
    }

    // Shortcut `![alt]` (only if enabled), but never when `(` follows:
    // that is the inline-image shape. Reaching the end of `text` right
    // after the closing `]` is a valid shortcut as well.
    if allow_shortcut && bytes.get(pos) != Some(&b'(') {
        return Some((pos, alt_text, alt_text.to_string(), true));
    }

    None
}
1127
1128/// Emit a reference image node with registry lookup.
1129pub fn emit_reference_image(
1130    builder: &mut GreenNodeBuilder,
1131    alt_text: &str,
1132    label: &str,
1133    is_shortcut: bool,
1134    config: &ParserOptions,
1135    suppress_footnote_refs: bool,
1136) {
1137    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1138
1139    // Emit as reference image (preserve original syntax)
1140    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1141    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1142    builder.finish_node();
1143
1144    // Alt text (recursively parse inline elements)
1145    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1146    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
1147    builder.finish_node();
1148
1149    // Closing ] and reference label
1150    builder.token(SyntaxKind::TEXT.into(), "]");
1151
1152    if !is_shortcut {
1153        // Explicit or implicit reference: ![alt][label] or ![alt][]
1154        builder.token(SyntaxKind::TEXT.into(), "[");
1155        builder.start_node(SyntaxKind::LINK_REF.into());
1156        // For implicit references, emit empty label (label == alt means implicit from parser)
1157        if label != alt_text {
1158            builder.token(SyntaxKind::TEXT.into(), label);
1159        }
1160        builder.finish_node();
1161        builder.token(SyntaxKind::TEXT.into(), "]");
1162    }
1163    // For shortcut references, just ![alt] - no second bracket pair
1164
1165    builder.finish_node();
1166}
1167
1168/// Emit an `UNRESOLVED_REFERENCE` node for a Pandoc bracket-shape
1169/// pattern whose label didn't resolve. The wrapper covers the original
1170/// bracket bytes; the inner text recurses through normal inline
1171/// parsing (with inner-link suppression so a stray inner inline link
1172/// doesn't reorder semantics relative to pandoc-native).
1173///
1174/// `source` is `text[start..end]` — the full bracket-shape pattern.
1175/// `text_content` is the inner text between the outer `[` and `]`
1176/// (the bytes used for inline recursion). `label_suffix` carries the
1177/// `[label]` / `[]` suffix bytes verbatim, or `None` for shortcut form.
1178pub fn emit_unresolved_reference(
1179    builder: &mut GreenNodeBuilder,
1180    is_image: bool,
1181    text_content: &str,
1182    label_suffix: Option<&str>,
1183    config: &ParserOptions,
1184    suppress_footnote_refs: bool,
1185) {
1186    builder.start_node(SyntaxKind::UNRESOLVED_REFERENCE.into());
1187
1188    if is_image {
1189        builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1190        builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1191        builder.finish_node();
1192        builder.start_node(SyntaxKind::IMAGE_ALT.into());
1193        parse_inline_text(builder, text_content, config, false, suppress_footnote_refs);
1194        builder.finish_node();
1195    } else {
1196        builder.start_node(SyntaxKind::LINK_START.into());
1197        builder.token(SyntaxKind::LINK_START.into(), "[");
1198        builder.finish_node();
1199        builder.start_node(SyntaxKind::LINK_TEXT.into());
1200        parse_inline_text(builder, text_content, config, true, suppress_footnote_refs);
1201        builder.finish_node();
1202    }
1203
1204    builder.token(SyntaxKind::TEXT.into(), "]");
1205
1206    if let Some(suffix) = label_suffix {
1207        // suffix is either "[label]" or "[]"; preserve original bytes.
1208        // Split as `[` + LINK_REF(label) + `]` so wrapper accessors find
1209        // the label via `support::child::<LinkRef>()`.
1210        debug_assert!(suffix.starts_with('[') && suffix.ends_with(']'));
1211        builder.token(SyntaxKind::TEXT.into(), "[");
1212        let label = &suffix[1..suffix.len() - 1];
1213        builder.start_node(SyntaxKind::LINK_REF.into());
1214        if !label.is_empty() {
1215            builder.token(SyntaxKind::TEXT.into(), label);
1216        }
1217        builder.finish_node();
1218        builder.token(SyntaxKind::TEXT.into(), "]");
1219    }
1220
1221    builder.finish_node();
1222}
1223
#[cfg(test)]
mod tests {
    use super::*;

    /// Default scan context shared by the link/image parser tests.
    fn ctx() -> LinkScanContext {
        LinkScanContext::default()
    }

    // ---- Automatic links -------------------------------------------------
    //
    // The boolean passed to `try_parse_autolink` is exercised with both
    // values; the strict-scheme test below documents `true` as the
    // CommonMark behaviour and `false` as the lax Pandoc behaviour.

    #[test]
    fn test_parse_autolink_url() {
        for &strict in &[false, true] {
            assert_eq!(
                try_parse_autolink("<https://example.com>", strict),
                Some((21, "https://example.com"))
            );
        }
    }

    #[test]
    fn test_parse_autolink_email() {
        for &strict in &[false, true] {
            assert_eq!(
                try_parse_autolink("<user@example.com>", strict),
                Some((18, "user@example.com"))
            );
        }
    }

    #[test]
    fn test_parse_autolink_no_close() {
        for &strict in &[false, true] {
            assert_eq!(try_parse_autolink("<https://example.com", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_with_space() {
        for &strict in &[false, true] {
            assert_eq!(try_parse_autolink("<https://example.com >", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_not_url_or_email() {
        for &strict in &[false, true] {
            assert_eq!(try_parse_autolink("<notaurl>", strict), None);
        }
    }

    #[test]
    fn test_parse_autolink_commonmark_strict_scheme() {
        // Scheme too short (1 char) — invalid under CommonMark, lax-accepted
        // under Pandoc dialect (matches historical behavior).
        let one_char_scheme = "<m:abc>";
        assert_eq!(try_parse_autolink(one_char_scheme, true), None);
        assert_eq!(
            try_parse_autolink(one_char_scheme, false),
            Some((7, "m:abc"))
        );
    }

    #[test]
    fn test_parse_autolink_commonmark_email_disallows_backslash() {
        let escaped_email = "<foo\\+@bar.example.com>";
        assert_eq!(try_parse_autolink(escaped_email, true), None);
        assert_eq!(
            try_parse_autolink(escaped_email, false),
            Some((23, "foo\\+@bar.example.com"))
        );
    }

    // ---- Inline links ----------------------------------------------------

    #[test]
    fn test_parse_inline_link_simple() {
        assert_eq!(
            try_parse_inline_link("[text](url)", false, ctx()),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title")"#, false, ctx()),
            Some((19, "text", r#"url "title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_link("[outer [inner] text](url)", false, ctx()),
            Some((25, "outer [inner] text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_link("[text] (url)", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_bracket() {
        assert_eq!(try_parse_inline_link("[text(url)", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_paren() {
        assert_eq!(try_parse_inline_link("[text](url", false, ctx()), None);
    }

    #[test]
    fn test_parse_inline_link_escaped_bracket() {
        assert_eq!(
            try_parse_inline_link(r"[text\]more](url)", false, ctx()),
            Some((17, r"text\]more", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_parens_in_url() {
        assert_eq!(
            try_parse_inline_link("[text](url(with)parens)", false, ctx()),
            Some((23, "text", "url(with)parens", None))
        );
    }

    // ---- Inline images ---------------------------------------------------

    #[test]
    fn test_parse_inline_image_simple() {
        assert_eq!(
            try_parse_inline_image("![alt](image.jpg)", ctx()),
            Some((17, "alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_title() {
        assert_eq!(
            try_parse_inline_image(r#"![alt](image.jpg "A title")"#, ctx()),
            Some((27, "alt", r#"image.jpg "A title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_image("![outer [inner] alt](image.jpg)", ctx()),
            Some((31, "outer [inner] alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
        assert_eq!(try_parse_bare_uri(r"a:\]"), None);
    }

    #[test]
    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_image("![alt] (image.jpg)", ctx()), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_bracket() {
        assert_eq!(try_parse_inline_image("![alt(image.jpg)", ctx()), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_paren() {
        assert_eq!(try_parse_inline_image("![alt](image.jpg", ctx()), None);
    }

    // ---- Image attributes ------------------------------------------------

    #[test]
    fn test_parse_inline_image_with_simple_class() {
        assert_eq!(
            try_parse_inline_image("![alt](img.png){.large}", ctx()),
            Some((23, "alt", "img.png", Some("{.large}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_id() {
        assert_eq!(
            try_parse_inline_image("![Figure 1](fig1.png){#fig-1}", ctx()),
            Some((29, "Figure 1", "fig1.png", Some("{#fig-1}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_full_attributes() {
        assert_eq!(
            try_parse_inline_image("![alt](img.png){#fig .large width=\"80%\"}", ctx()),
            Some((
                40,
                "alt",
                "img.png",
                Some("{#fig .large width=\"80%\"}")
            ))
        );
    }

    #[test]
    fn test_parse_inline_image_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        assert_eq!(
            try_parse_inline_image("![alt](img.png) {.large}", ctx()),
            Some((15, "alt", "img.png", None))
        );
    }

    // ---- Link attributes -------------------------------------------------

    #[test]
    fn test_parse_inline_link_with_id() {
        assert_eq!(
            try_parse_inline_link("[text](url){#link-1}", false, ctx()),
            Some((20, "text", "url", Some("{#link-1}")))
        );
    }

    #[test]
    fn test_parse_inline_link_with_full_attributes() {
        assert_eq!(
            try_parse_inline_link(
                "[text](url){#link .external target=\"_blank\"}",
                false,
                ctx()
            ),
            Some((
                44,
                "text",
                "url",
                Some("{#link .external target=\"_blank\"}")
            ))
        );
    }

    #[test]
    fn test_parse_inline_link_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        assert_eq!(
            try_parse_inline_link("[text](url) {.class}", false, ctx()),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title_and_attributes() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title"){.external}"#, false, ctx()),
            Some((30, "text", r#"url "title""#, Some("{.external}")))
        );
    }

    // ---- Reference links -------------------------------------------------

    #[test]
    fn test_parse_reference_link_explicit() {
        assert_eq!(
            try_parse_reference_link("[link text][label]", false, true, ctx()),
            Some((18, "link text", "label".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_implicit() {
        assert_eq!(
            try_parse_reference_link("[link text][]", false, true, ctx()),
            Some((13, "link text", String::new(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_explicit_same_label_as_text() {
        assert_eq!(
            try_parse_reference_link("[stack][stack]", false, true, ctx()),
            Some((14, "stack", "stack".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut() {
        assert_eq!(
            try_parse_reference_link("[link text] rest", true, true, ctx()),
            Some((11, "link text", "link text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_rejects_empty_label() {
        assert_eq!(try_parse_reference_link("[] rest", true, true, ctx()), None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_disabled() {
        assert_eq!(
            try_parse_reference_link("[link text] rest", false, true, ctx()),
            None
        );
    }

    #[test]
    fn test_parse_reference_link_not_inline_link() {
        // With shortcut disabled, `[text](url)` is rejected so the inline
        // link form upstream gets exclusive ownership.
        assert_eq!(
            try_parse_reference_link("[text](url)", false, true, ctx()),
            None
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
        // CommonMark spec example #568: when an inline-link attempt would
        // fail (here we model the reachability — the caller tries inline
        // link first; if that returns None, we should still see `[text]`
        // as a shortcut and leave `(url)` to be parsed as following text).
        assert_eq!(
            try_parse_reference_link("[text](url)", true, true, ctx()),
            Some((6, "text", "text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_with_nested_brackets() {
        assert_eq!(
            try_parse_reference_link("[outer [inner] text][ref]", false, true, ctx()),
            Some((25, "outer [inner] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_label_no_newline() {
        assert_eq!(
            try_parse_reference_link("[text][label\nmore]", false, true, ctx()),
            None
        );
    }

    // ---- Reference images ------------------------------------------------

    #[test]
    fn test_parse_reference_image_explicit() {
        assert_eq!(
            try_parse_reference_image("![alt text][label]", false),
            Some((18, "alt text", "label".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_implicit() {
        assert_eq!(
            try_parse_reference_image("![alt text][]", false),
            Some((13, "alt text", "alt text".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut() {
        assert_eq!(
            try_parse_reference_image("![alt text] rest", true),
            Some((11, "alt text", "alt text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut_disabled() {
        assert_eq!(try_parse_reference_image("![alt text] rest", false), None);
    }

    #[test]
    fn test_parse_reference_image_not_inline() {
        // Should not match inline images with (url)
        assert_eq!(try_parse_reference_image("![alt](url)", true), None);
    }

    #[test]
    fn test_parse_reference_image_with_nested_brackets() {
        assert_eq!(
            try_parse_reference_image("![alt [nested] text][ref]", false),
            Some((25, "alt [nested] text", "ref".to_string(), false))
        );
    }

    // ---- Line breaks inside reference labels -----------------------------
    //
    // These use the explicit `[text][label]` form so that the label scan is
    // what produces the rejection. A bare `[foo\r\nbar]` with shortcut
    // references disabled returns `None` no matter how line breaks are
    // handled, so it would not exercise the rule.

    #[test]
    fn test_reference_link_label_with_crlf() {
        // Reference link labels should not span lines with CRLF
        let result = try_parse_reference_link("[text][foo\r\nbar]", false, true, ctx());

        // Should fail to parse because label contains line break
        assert_eq!(
            result, None,
            "Should not parse reference link with CRLF in label"
        );
    }

    #[test]
    fn test_reference_link_label_with_lf() {
        // Reference link labels should not span lines with LF either
        let result = try_parse_reference_link("[text][foo\nbar]", false, true, ctx());

        // Should fail to parse because label contains line break
        assert_eq!(
            result, None,
            "Should not parse reference link with LF in label"
        );
    }

    // ---- Multiline link text / alt text ----------------------------------

    #[test]
    fn test_parse_inline_link_multiline_text() {
        // Per Pandoc spec, link text CAN contain newlines (soft breaks)
        assert_eq!(
            try_parse_inline_link("[text on\nline two](url)", false, ctx()),
            Some((23, "text on\nline two", "url", None)),
            "Link text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_link_multiline_with_formatting() {
        // Link text with newlines and other inline elements
        let input =
            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
        let (len, text, _dest, _attrs) = try_parse_inline_link(input, false, ctx())
            .expect("Link text with newlines should parse");
        assert!(text.contains('\n'), "Link text should preserve newline");
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_image_multiline_alt() {
        // Per Pandoc spec, image alt text CAN contain newlines
        assert_eq!(
            try_parse_inline_image("![alt on\nline two](img.png)", ctx()),
            Some((27, "alt on\nline two", "img.png", None)),
            "Image alt text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_image_multiline_with_attributes() {
        // Image with multiline alt text and attributes
        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
        let (len, alt, dest, attrs) = try_parse_inline_image(input, ctx())
            .expect("Image alt with newlines and attributes should parse");
        assert!(alt.contains('\n'), "Alt text should preserve newline");
        assert_eq!(dest, "../images/fig.png");
        assert_eq!(attrs, Some("{width=70%}"));
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_link_with_attributes_after_newline() {
        // Test for regression: when text is concatenated with newlines,
        // attributes after ) should still be recognized
        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
        let (len, text, dest, attrs) = try_parse_inline_link(input, false, ctx())
            .expect("Link with attributes should parse even with following text");
        assert_eq!(text, "A network graph.");
        assert_eq!(dest, "../images/networkfig.png");
        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
        assert_eq!(
            len, 55,
            "Length should include attributes (up to closing brace)"
        );
    }
}
panache_parser/parser/inlines/links.rs

panache_parser/parser/inlines/
links.rs