panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use crate::options::ParserOptions;
16use crate::syntax::SyntaxKind;
17use rowan::GreenNodeBuilder;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::try_parse_trailing_attributes;
21
/// Switches for the link/image bracket scanner.
///
/// The first two flags decide which inline spans are treated as opaque while
/// searching for the `]` that ends a link or image text (a `]` inside an
/// opaque span cannot close the text). The third enables a structural rule
/// about nested links. See [`LinkScanContext::from_options`] for how the
/// flags are derived from the parser configuration.
#[derive(Clone, Copy, Default)]
pub struct LinkScanContext {
    /// Treat raw inline HTML as opaque.
    ///
    /// Universal across dialects: pandoc-markdown and CommonMark both refuse
    /// to close link text inside a raw HTML span such as
    /// `[foo <bar attr="](baz)">` (CommonMark spec examples #524 / #536).
    pub skip_raw_html: bool,
    /// Treat `<scheme://...>` autolinks as opaque. **CommonMark-only.**
    ///
    /// Pandoc-markdown does *not* treat autolinks as opaque inside link
    /// text, so the same input parses differently per dialect (CommonMark
    /// spec examples #526 / #538). Always derive this from
    /// `extensions.autolinks && dialect == Dialect::CommonMark`.
    pub skip_autolinks: bool,
    /// Reject a link whose text contains another valid inline link.
    /// **CommonMark-only.**
    ///
    /// CommonMark: "Links may not contain other links, at any level of
    /// nesting." When the candidate link/image text contains a valid inline
    /// link or image, the outer match is rejected so the inner-most
    /// definition is used instead (spec examples #518–#520, #532).
    /// Pandoc-markdown allows nested links, so the flag is `false` there.
    pub disallow_inner_links: bool,
}
45
46impl LinkScanContext {
47    pub fn from_options(config: &ParserOptions) -> Self {
48        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
49        Self {
50            skip_raw_html: config.extensions.raw_html,
51            skip_autolinks: config.extensions.autolinks && is_commonmark,
52            disallow_inner_links: is_commonmark,
53        }
54    }
55}
56
57/// Find the closing `]` of a link/image text span, starting from `start`.
58///
59/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
60/// a backtick run starting a valid code span is encountered, the entire span
61/// (including any trailing attribute block) is skipped — per CommonMark §6
62/// precedence, code spans bind tighter than links/images, so a `]` *inside*
63/// a code span cannot terminate the link's text. The same opacity applies to
64/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
65/// Returns the byte offset of the closing `]` within `text`, or `None` if no
66/// unmatched `]` is reached.
67fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
68    let bytes = text.as_bytes();
69    let mut bracket_depth = 0;
70    let mut escape_next = false;
71    let mut i = start;
72
73    while i < bytes.len() {
74        let b = bytes[i];
75
76        if escape_next {
77            escape_next = false;
78            i += step(text, i);
79            continue;
80        }
81
82        match b {
83            b'\\' => {
84                escape_next = true;
85                i += 1;
86            }
87            b'`' => {
88                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
89                    i += len;
90                } else {
91                    i += 1;
92                }
93            }
94            b'<' => {
95                // Order matters: autolinks are the more specific `<...>`
96                // shape (URI/email between angle brackets), so try that
97                // before falling through to general inline raw HTML which
98                // would also match `<bar attr="...">`-style tags.
99                if ctx.skip_autolinks
100                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
101                {
102                    i += len;
103                } else if ctx.skip_raw_html
104                    && let Some(len) = try_parse_inline_html(&text[i..])
105                {
106                    i += len;
107                } else {
108                    i += 1;
109                }
110            }
111            b'[' => {
112                bracket_depth += 1;
113                i += 1;
114            }
115            b']' => {
116                if bracket_depth == 0 {
117                    return Some(i);
118                }
119                bracket_depth -= 1;
120                i += 1;
121            }
122            _ => i += step(text, i),
123        }
124    }
125    None
126}
127
128/// Find the closing `)` of a link/image destination, given the text *after*
129/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
130/// destinations (`<...>` may legitimately contain unbalanced parens — see
131/// spec example #499). Returns the byte offset of the closing `)` within the
132/// passed slice, or `None` if not found.
133fn find_dest_close_paren(remaining: &str) -> Option<usize> {
134    let bytes = remaining.as_bytes();
135    let mut paren_depth = 0;
136    let mut escape_next = false;
137    let mut in_quotes = false;
138    let mut in_angle = false;
139    let mut i = 0;
140
141    while i < bytes.len() {
142        let b = bytes[i];
143
144        if escape_next {
145            escape_next = false;
146            i += step(remaining, i);
147            continue;
148        }
149
150        match b {
151            b'\\' => {
152                escape_next = true;
153                i += 1;
154            }
155            b'<' if !in_quotes && !in_angle => {
156                in_angle = true;
157                i += 1;
158            }
159            b'>' if in_angle => {
160                in_angle = false;
161                i += 1;
162            }
163            b'"' if !in_angle => {
164                in_quotes = !in_quotes;
165                i += 1;
166            }
167            b'(' if !in_quotes && !in_angle => {
168                paren_depth += 1;
169                i += 1;
170            }
171            b')' if !in_quotes && !in_angle => {
172                if paren_depth == 0 {
173                    return Some(i);
174                }
175                paren_depth -= 1;
176                i += 1;
177            }
178            _ => i += step(remaining, i),
179        }
180    }
181    None
182}
183
/// Width in bytes of the UTF-8 character that begins at byte index `i` of `s`.
///
/// Lets an index-driven loop advance one whole character at a time without
/// landing in the middle of a multi-byte sequence. When `i == s.len()` there
/// is no character to measure and `1` is returned.
///
/// # Panics
///
/// Panics if `i` is past the end of `s` or not on a character boundary,
/// because `s[i..]` is sliced directly.
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
190
191/// CommonMark §6.4: "Links may not contain other links, at any level of
192/// nesting. If multiple otherwise valid link definitions appear nested inside
193/// each other, the inner-most definition is used." This helper scans a
194/// candidate link text for any `[` that starts a valid inline link; when
195/// found, the outer link must be rejected so the inner-most wins (spec
196/// examples #518–#519, #532).
197///
198/// Images themselves do not count as inner links — a link can contain an
199/// image (#517, #531). A link *inside* an image's alt text, however, still
200/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
201/// the helper recurses into image alt text looking for inner links.
202///
203/// Reference-link nesting (#533, #569, #571) requires resolving labels
204/// against the document's reference-definition map, which the parser does
205/// not have at this point — those cases remain unhandled and need a later
206/// stack-based pass.
207fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
208    let bytes = text.as_bytes();
209    let mut i = 0;
210    let mut escape_next = false;
211    while i < bytes.len() {
212        let b = bytes[i];
213        if escape_next {
214            escape_next = false;
215            i += step(text, i);
216            continue;
217        }
218        match b {
219            b'\\' => {
220                escape_next = true;
221                i += 1;
222            }
223            b'`' => {
224                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
225                    i += len;
226                } else {
227                    i += 1;
228                }
229            }
230            b'<' => {
231                if ctx.skip_autolinks
232                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
233                {
234                    i += len;
235                } else if ctx.skip_raw_html
236                    && let Some(len) = try_parse_inline_html(&text[i..])
237                {
238                    i += len;
239                } else {
240                    i += 1;
241                }
242            }
243            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
244                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
245                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
246                        return true;
247                    }
248                    i += len;
249                } else {
250                    i += 2;
251                }
252            }
253            b'[' => {
254                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
255                    return true;
256                }
257                i += 1;
258            }
259            _ => i += step(text, i),
260        }
261    }
262    false
263}
264
265/// Try to parse an inline image starting at the current position.
266///
267/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
268/// Can also have trailing attributes: `![alt](url){#id .class}`.
269/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
270///
271/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
272/// see `LinkScanContext`.
273pub fn try_parse_inline_image(
274    text: &str,
275    ctx: LinkScanContext,
276) -> Option<(usize, &str, &str, Option<&str>)> {
277    if !text.starts_with("![") {
278        return None;
279    }
280
281    // Find the closing ]
282    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
283    let alt_text = &text[2..close_bracket];
284
285    // Check for immediate ( after ]
286    let after_bracket = close_bracket + 1;
287    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
288        return None;
289    }
290
291    // Find closing ) for destination (reuse same logic as links)
292    let dest_start = after_bracket + 1;
293    let remaining = &text[dest_start..];
294
295    let close_paren = find_dest_close_paren(remaining)?;
296    let dest_content = &remaining[..close_paren];
297
298    // Check for trailing attributes {#id .class key=value}
299    let after_paren = dest_start + close_paren + 1;
300    let after_close = &text[after_paren..];
301
302    // Attributes must start immediately after closing paren (no whitespace/newlines)
303    if after_close.starts_with('{') {
304        // Find the closing brace
305        if let Some(close_brace_pos) = after_close.find('}') {
306            let attr_text = &after_close[..=close_brace_pos];
307            // Try to parse as attributes to validate
308            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
309                let total_len = after_paren + close_brace_pos + 1;
310                // Return raw attribute string for lossless parsing
311                let raw_attrs = attr_text;
312                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
313            }
314        }
315    }
316
317    // No attributes, just return the image
318    let total_len = after_paren;
319    Some((total_len, alt_text, dest_content, None))
320}
321
322/// Emit an inline image node to the builder.
323/// Note: alt_text may contain inline elements and should be parsed recursively.
324pub fn emit_inline_image(
325    builder: &mut GreenNodeBuilder,
326    _text: &str,
327    alt_text: &str,
328    dest: &str,
329    raw_attributes: Option<&str>,
330    config: &ParserOptions,
331) {
332    builder.start_node(SyntaxKind::IMAGE_LINK.into());
333
334    // Opening ![
335    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
336    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
337    builder.finish_node();
338
339    // Alt text (recursively parse inline elements)
340    builder.start_node(SyntaxKind::IMAGE_ALT.into());
341    // Use the standalone parse_inline_text function for recursive parsing
342    // Note: nested contexts don't resolve references
343    parse_inline_text(builder, alt_text, config, false);
344    builder.finish_node();
345
346    // Closing ]
347    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
348
349    // Opening (
350    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
351
352    // Destination
353    builder.start_node(SyntaxKind::LINK_DEST.into());
354    builder.token(SyntaxKind::TEXT.into(), dest);
355    builder.finish_node();
356
357    // Closing )
358    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
359
360    // Emit raw attributes if present (preserve original formatting)
361    if let Some(raw_attrs) = raw_attributes {
362        builder.start_node(SyntaxKind::ATTRIBUTE.into());
363        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
364        builder.finish_node();
365    }
366
367    builder.finish_node();
368}
369
370/// Try to parse an automatic link starting at the current position.
371///
372/// Automatic links have the form `<url>` (URI autolink) or `<email>`
373/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
374/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
375/// ASCII chars; email local parts cannot contain backslashes). Pandoc
376/// markdown is laxer — it accepts Unicode in email addresses, for
377/// example — so non-CommonMark callers fall back to the heuristic
378/// "contains `:` or `@`" check that the parser used historically.
379pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
380    if !text.starts_with('<') {
381        return None;
382    }
383
384    let close_pos = text[1..].find('>')?;
385    let content = &text[1..1 + close_pos];
386
387    if content.is_empty() {
388        return None;
389    }
390    if content.contains(|c: char| c.is_whitespace()) {
391        return None;
392    }
393
394    if is_commonmark {
395        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
396            return None;
397        }
398    } else if !content.contains(':') && !content.contains('@') {
399        return None;
400    }
401
402    Some((close_pos + 2, content))
403}
404
/// CommonMark URI autolink check for the text between `<` and `>`.
///
/// Grammar enforced:
///
/// - scheme: 2–32 characters, an ASCII letter followed by any of
///   `[a-zA-Z0-9+.-]`;
/// - a `:` right after the scheme;
/// - body: zero or more characters, none of which is an ASCII control
///   character, a space, `<` or `>`. Non-ASCII bytes are allowed.
///
/// Spaces are rejected here rather than left to the caller, so the function
/// honours its own contract when used on its own.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    if !bytes.first().is_some_and(u8::is_ascii_alphabetic) {
        return false;
    }

    // The leading letter plus every following scheme character.
    let scheme_len = 1 + bytes[1..]
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
        .count();

    if !(2..=32).contains(&scheme_len) || bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `b > 0x20` excludes both the C0 control range and the space itself.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
435
436/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
437/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
438///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
439fn is_valid_email_autolink(s: &str) -> bool {
440    let Some(at) = s.find('@') else {
441        return false;
442    };
443    let local = &s[..at];
444    let domain = &s[at + 1..];
445    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
446        return false;
447    }
448    if domain.is_empty() {
449        return false;
450    }
451    domain.split('.').all(is_valid_email_label)
452}
453
/// Whether `b` may appear in the local part (before the `@`) of an email
/// autolink: an ASCII letter or digit, or one of
/// ``. ! # $ % & ' * + / = ? ^ _ ` { | } ~ -``.
fn is_email_local_byte(b: u8) -> bool {
    const PUNCTUATION: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || PUNCTUATION.contains(&b)
}
482
/// Whether `label` is a valid domain label for an email autolink.
///
/// A label is 1–63 bytes long, starts and ends with an ASCII letter or
/// digit, and contains only ASCII letters, digits and `-` in between.
///
/// The slice pattern handles one-byte labels (e.g. the `b` in `a@b.c`)
/// explicitly; slicing `bytes[1..len - 1]` on such a label would be
/// `bytes[1..0]`, which panics.
fn is_valid_email_label(label: &str) -> bool {
    match label.as_bytes() {
        [] => false,
        bytes if bytes.len() > 63 => false,
        [only] => only.is_ascii_alphanumeric(),
        [first, middle @ .., last] => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                && middle
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
        }
    }
}
498
499/// Emit an automatic link node to the builder.
500pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
501    builder.start_node(SyntaxKind::AUTO_LINK.into());
502
503    // Opening <
504    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
505    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
506    builder.finish_node();
507
508    // URL content
509    builder.token(SyntaxKind::TEXT.into(), url);
510
511    // Closing >
512    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
513    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
514    builder.finish_node();
515
516    builder.finish_node();
517}
518
/// Attempt to recognise a bare (un-bracketed) URI at the very start of `text`.
///
/// The accepted shape is `scheme:body`:
///
/// - the scheme starts with an ASCII letter and continues with ASCII
///   letters, digits, `+`, `-` or `.` up to the first `:`;
/// - the body runs until ASCII whitespace, one of `<`, `>`, `` ` ``, `"`,
///   `'`, or the end of the text, and must not be empty;
/// - trailing `.`, `,`, `;`, `:`, `)`, `]`, `}` are trimmed off the body,
///   which must still be non-empty afterwards.
///
/// Returns `Some((length, uri))` where `uri == &text[..length]`, or `None`
/// when `text` does not start with such a URI.
pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
    if !text.chars().next()?.is_ascii_alphabetic() {
        return None;
    }

    // The first non-scheme character has to be the `:` separator.
    let colon =
        text.find(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')))?;
    if text.as_bytes()[colon] != b':' {
        return None;
    }

    // Only ASCII bytes stop the body, so its end is always a char boundary.
    let body_start = colon + 1;
    let body_len = text.as_bytes()[body_start..]
        .iter()
        .take_while(|&&b| !(b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'`' | b'"' | b'\'')))
        .count();
    if body_len == 0 {
        return None;
    }

    // Terminal punctuation is assumed to belong to the surrounding prose.
    let body = text[body_start..body_start + body_len]
        .trim_end_matches(|c: char| matches!(c, '.' | ',' | ';' | ':' | ')' | ']' | '}'));
    if body.is_empty() {
        return None;
    }

    // If trimming terminal punctuation leaves a dangling backslash, the match
    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
    if body.ends_with('\\') {
        return None;
    }

    let length = body_start + body.len();
    Some((length, &text[..length]))
}
580
581/// Try to parse an inline link starting at the current position.
582///
583/// Inline links have the form `[text](url)` or `[text](url "title")`.
584/// Can also have trailing attributes: `[text](url){#id .class}`.
585/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
586///
587/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
588/// the bare destination form may not contain spaces or ASCII control
589/// characters and must have balanced parentheses; if a title follows it
590/// must be properly delimited; only whitespace is allowed before/after.
591/// Pandoc-markdown is more permissive, so leave this off for that dialect.
592pub fn try_parse_inline_link(
593    text: &str,
594    strict_dest: bool,
595    ctx: LinkScanContext,
596) -> Option<(usize, &str, &str, Option<&str>)> {
597    if !text.starts_with('[') {
598        return None;
599    }
600
601    // Find the closing ]
602    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
603    let link_text = &text[1..close_bracket];
604
605    // Check for immediate ( after ]
606    let after_bracket = close_bracket + 1;
607    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
608        return None;
609    }
610
611    // Find closing ) for destination
612    let dest_start = after_bracket + 1;
613    let remaining = &text[dest_start..];
614
615    let close_paren = find_dest_close_paren(remaining)?;
616    let dest_content = &remaining[..close_paren];
617
618    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
619        return None;
620    }
621
622    // CommonMark §6.4: outer link is rejected when its text contains a valid
623    // inner inline link or image, so the inner-most definition wins.
624    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
625        return None;
626    }
627
628    // Check for trailing attributes {#id .class key=value}
629    let after_paren = dest_start + close_paren + 1;
630    let after_close = &text[after_paren..];
631
632    // Attributes must start immediately after closing paren (no whitespace/newlines)
633    if after_close.starts_with('{') {
634        // Find the closing brace
635        if let Some(close_brace_pos) = after_close.find('}') {
636            let attr_text = &after_close[..=close_brace_pos];
637            // Try to parse as attributes to validate
638            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
639                let total_len = after_paren + close_brace_pos + 1;
640                // Return raw attribute string for lossless parsing
641                let raw_attrs = attr_text;
642                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
643            }
644        }
645    }
646
647    // No attributes, just return the link
648    let total_len = after_paren;
649    Some((total_len, link_text, dest_content, None))
650}
651
652/// CommonMark §6.4 destination + optional title validation. The text passed
653/// in is whatever the parser captured between `(` and `)`. A valid form is:
654/// `[ws] destination [ws title [ws]]` where:
655/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
656///   parentheses (escaped parens permitted);
657/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
658/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
659/// - any text outside that structure invalidates the link.
660fn dest_and_title_ok_commonmark(content: &str) -> bool {
661    let trimmed = trim_start_link_ws(content);
662    if trimmed.is_empty() {
663        return true;
664    }
665
666    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
667        let mut escape = false;
668        let mut end_byte = None;
669        for (i, c) in rest.char_indices() {
670            if escape {
671                escape = false;
672                continue;
673            }
674            match c {
675                '\\' => escape = true,
676                '\n' | '<' => return false,
677                '>' => {
678                    end_byte = Some(i);
679                    break;
680                }
681                _ => {}
682            }
683        }
684        match end_byte {
685            Some(e) => &rest[e + 1..],
686            None => return false,
687        }
688    } else {
689        let mut escape = false;
690        let mut depth: i32 = 0;
691        let mut end = trimmed.len();
692        for (i, c) in trimmed.char_indices() {
693            if escape {
694                escape = false;
695                continue;
696            }
697            match c {
698                '\\' => escape = true,
699                ' ' | '\t' | '\n' => {
700                    end = i;
701                    break;
702                }
703                _ if c.is_ascii_control() => return false,
704                '(' => depth += 1,
705                ')' => {
706                    if depth == 0 {
707                        end = i;
708                        break;
709                    }
710                    depth -= 1;
711                }
712                _ => {}
713            }
714        }
715        if depth != 0 {
716            return false;
717        }
718        if end == 0 {
719            // bare destination must be nonempty if the field is non-blank
720            return false;
721        }
722        &trimmed[end..]
723    };
724
725    let after_dest = trim_start_link_ws(after_dest);
726    if after_dest.is_empty() {
727        return true;
728    }
729
730    let bytes = after_dest.as_bytes();
731    let close = match bytes[0] {
732        b'"' => b'"',
733        b'\'' => b'\'',
734        b'(' => b')',
735        _ => return false,
736    };
737    let opens_paren = bytes[0] == b'(';
738    let mut escape = false;
739    let mut title_close_pos = None;
740    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
741        if escape {
742            escape = false;
743            continue;
744        }
745        if b == b'\\' {
746            escape = true;
747            continue;
748        }
749        if opens_paren && b == b'(' {
750            return false;
751        }
752        if b == close {
753            title_close_pos = Some(i);
754            break;
755        }
756    }
757    let close_idx = match title_close_pos {
758        Some(p) => p,
759        None => return false,
760    };
761
762    let after_title = &after_dest[close_idx + 1..];
763    is_link_ws_only(after_title)
764}
765
/// Strip leading ASCII space/tab/newline bytes.
///
/// Byte-level equivalent of `s.trim_start_matches([' ', '\t', '\n'])`;
/// called for every CommonMark inline-link destination/title scan, so the
/// slice-pattern MultiCharEqSearcher overhead matters. The scan stays
/// byte-level, and the result is produced by ordinary slicing rather than
/// `unsafe` code.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let skipped = s
        .bytes()
        .take_while(|&b| b == b' ' || b == b'\t' || b == b'\n')
        .count();
    // Only single-byte ASCII characters were skipped, so `skipped` always
    // falls on a char boundary and this slice cannot panic.
    &s[skipped..]
}
785
/// Whether `s` consists solely of ASCII space, tab and newline bytes.
/// The empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|b| matches!(b, b' ' | b'\t' | b'\n'))
}
792
793/// Emit an inline link node to the builder.
794/// Note: link_text may contain inline elements and should be parsed recursively.
795pub fn emit_inline_link(
796    builder: &mut GreenNodeBuilder,
797    _text: &str,
798    link_text: &str,
799    dest: &str,
800    raw_attributes: Option<&str>,
801    config: &ParserOptions,
802) {
803    builder.start_node(SyntaxKind::LINK.into());
804
805    // Opening [
806    builder.start_node(SyntaxKind::LINK_START.into());
807    builder.token(SyntaxKind::LINK_START.into(), "[");
808    builder.finish_node();
809
810    // Link text (recursively parse inline elements). Pandoc-native:
811    // links cannot contain other links, so suppress inner LINK / ref-link
812    // recognition during the recursion. Images, emphasis, code, etc. are
813    // still recognised. CommonMark relies on outer-level process_brackets
814    // to prevent nested links, but the flag is harmless under CM.
815    builder.start_node(SyntaxKind::LINK_TEXT.into());
816    parse_inline_text(builder, link_text, config, true);
817    builder.finish_node();
818
819    // Closing ]
820    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
821
822    // Opening (
823    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
824
825    // Destination
826    builder.start_node(SyntaxKind::LINK_DEST.into());
827    builder.token(SyntaxKind::TEXT.into(), dest);
828    builder.finish_node();
829
830    // Closing )
831    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
832
833    // Emit raw attributes if present (preserve original formatting)
834    if let Some(raw_attrs) = raw_attributes {
835        builder.start_node(SyntaxKind::ATTRIBUTE.into());
836        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
837        builder.finish_node();
838    }
839
840    builder.finish_node();
841}
842
843pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
844    builder.start_node(SyntaxKind::LINK.into());
845
846    builder.start_node(SyntaxKind::LINK_START.into());
847    builder.token(SyntaxKind::LINK_START.into(), "[");
848    builder.finish_node();
849
850    builder.start_node(SyntaxKind::LINK_TEXT.into());
851    builder.token(SyntaxKind::TEXT.into(), uri);
852    builder.finish_node();
853
854    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
855    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
856
857    builder.start_node(SyntaxKind::LINK_DEST.into());
858    builder.token(SyntaxKind::TEXT.into(), uri);
859    builder.finish_node();
860
861    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
862
863    builder.finish_node();
864}
865
866/// Try to parse a reference link starting at the current position.
867///
868/// Reference links have three forms:
869/// - Explicit: `[text][label]`
870/// - Implicit: `[text][]` (label = text)
871/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
872///
873/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
874/// The label is what should be looked up in the registry.
875pub fn try_parse_reference_link(
876    text: &str,
877    allow_shortcut: bool,
878    inline_link_attempted: bool,
879    ctx: LinkScanContext,
880) -> Option<(usize, &str, String, bool)> {
881    if !text.starts_with('[') {
882        return None;
883    }
884
885    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
886    if text.len() > 1 {
887        let bytes = text.as_bytes();
888        if bytes[1] == b'@' {
889            return None;
890        }
891        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
892            return None;
893        }
894    }
895
896    // Find the closing ] for the text. Uses the shared helper so that a
897    // `]` inside a code span doesn't terminate the link text (CommonMark
898    // §6 — code spans bind tighter than links). See spec examples #342
899    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
900    // opaque per `ctx`.
901    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
902    let link_text = &text[1..close_bracket];
903
904    // CommonMark §6.4: outer reference link is rejected when its text contains
905    // a valid inner inline link/image (spec example #532). Reference-link
906    // nesting (#533/#569/#571) is not handled here; it requires resolving
907    // labels against the document refdef map.
908    if ctx.disallow_inner_links
909        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
910    {
911        return None;
912    }
913
914    // Check what follows the ]
915    let after_bracket = close_bracket + 1;
916
917    // `[content]{...}` is reserved for bracketed spans / attribute
918    // trailers, never a shortcut.
919    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
920        return None;
921    }
922
923    // `[text](...)` is the inline-link shape. CommonMark spec example
924    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
925    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
926    // the upstream inline-link parse was rejected by `strict_dest`. We
927    // only fall through to shortcut here when the caller has already
928    // tried the inline-link form (`inline_link_attempted`) — otherwise
929    // disabling the `inline_links` extension would silently let
930    // `[text](url)` become a shortcut + literal text, which the
931    // `inline_links_disabled_keeps_inline_link_literal` test guards
932    // against.
933    if after_bracket < text.len()
934        && text[after_bracket..].starts_with('(')
935        && (!allow_shortcut || !inline_link_attempted)
936    {
937        return None;
938    }
939
940    // Check for explicit reference [text][label] or implicit [text][]
941    if after_bracket < text.len() && text[after_bracket..].starts_with('[') {
942        // Find the closing ] for the label
943        let label_start = after_bracket + 1;
944        let mut label_end = None;
945
946        for (i, ch) in text[label_start..].char_indices() {
947            if ch == ']' {
948                label_end = Some(i + label_start);
949                break;
950            }
951            // Labels can't contain newlines
952            if ch == '\n' {
953                return None;
954            }
955        }
956
957        let label_end = label_end?;
958        let label = &text[label_start..label_end];
959
960        // Total length includes both bracket pairs
961        let total_len = label_end + 1;
962
963        // Implicit reference: empty label means emit [text][]
964        if label.is_empty() {
965            return Some((total_len, link_text, String::new(), false));
966        }
967
968        // Explicit reference: use the provided label
969        Some((total_len, link_text, label.to_string(), false))
970    } else if allow_shortcut {
971        // Shortcut reference: [text] with no second bracket pair
972        // The text is both the display text and the label
973        if link_text.is_empty() {
974            return None;
975        }
976        Some((after_bracket, link_text, link_text.to_string(), true))
977    } else {
978        // No second bracket pair and shortcut not allowed - not a reference link
979        None
980    }
981}
982
983/// Emit a reference link node to the builder.
984/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
985pub fn emit_reference_link(
986    builder: &mut GreenNodeBuilder,
987    link_text: &str,
988    label: &str,
989    is_shortcut: bool,
990    config: &ParserOptions,
991) {
992    builder.start_node(SyntaxKind::LINK.into());
993
994    // Opening [
995    builder.start_node(SyntaxKind::LINK_START.into());
996    builder.token(SyntaxKind::LINK_START.into(), "[");
997    builder.finish_node();
998
999    // Link text (recursively parse inline elements). Pandoc-native:
1000    // links cannot contain other links, so suppress inner LINK / ref-link
1001    // recognition during the recursion. Images, emphasis, code, etc. are
1002    // still recognised.
1003    builder.start_node(SyntaxKind::LINK_TEXT.into());
1004    parse_inline_text(builder, link_text, config, true);
1005    builder.finish_node();
1006
1007    // Closing ] and reference label
1008    builder.token(SyntaxKind::TEXT.into(), "]");
1009
1010    if !is_shortcut {
1011        // Explicit or implicit reference: [text][label] or [text][]
1012        builder.token(SyntaxKind::TEXT.into(), "[");
1013        builder.start_node(SyntaxKind::LINK_REF.into());
1014        // For implicit references, label is empty and we emit [text][]
1015        // For explicit references, emit the label to get [text][label]
1016        if !label.is_empty() {
1017            builder.token(SyntaxKind::TEXT.into(), label);
1018        }
1019        builder.finish_node();
1020        builder.token(SyntaxKind::TEXT.into(), "]");
1021    }
1022    // For shortcut references, just [text] - no second bracket pair
1023
1024    builder.finish_node();
1025}
1026
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]`, `![alt][]`, or `![alt]`.
///
/// - `text`: input that must start with `![`.
/// - `allow_shortcut`: whether the shortcut form `![alt]` may be recognised.
///
/// Returns `Some((total_len, alt_text, label, is_shortcut))` on success,
/// where `total_len` is the number of bytes consumed. For the implicit and
/// shortcut forms the returned `label` is a copy of the alt text; for the
/// explicit form it is the label exactly as written.
///
/// Returns `None` when:
/// - the alt-text brackets are unbalanced,
/// - an explicit label is unterminated or contains `\n` / `\r`,
/// - the bracket pair is followed by `(` (that is the inline-image shape),
/// - shortcuts are disabled and no second bracket pair follows, or
/// - a shortcut would have an empty alt text (an empty label can never name
///   a definition; this mirrors `try_parse_reference_link`).
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    if !text.starts_with("![") {
        return None;
    }
    let bytes = text.as_bytes();

    // Find the end of the alt text, allowing nested brackets. Running off
    // the end of the input while a bracket is still open means the image is
    // unclosed.
    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1usize;
    while bracket_depth > 0 {
        match *bytes.get(pos)? {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped char
            _ => {}
        }
        pos += 1;
    }

    // `pos` is now one past the closing `]` of the alt text.
    let alt_text = &text[alt_start..pos - 1];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if bytes.get(pos) == Some(&b'[') {
        let label_start = pos + 1;

        // The label ends at the first `]` (no nested brackets) and may not
        // contain a line ending.
        let offset = text[label_start..].find(|c: char| matches!(c, ']' | '\n' | '\r'))?;
        let label_end = label_start + offset;
        if bytes[label_end] != b']' {
            return None;
        }

        // Preserve the label as written; an empty label (implicit
        // reference) falls back to the alt text.
        let label_text = &text[label_start..label_end];
        let label = if label_text.is_empty() {
            alt_text
        } else {
            label_text
        };

        return Some((label_end + 1, alt_text, label.to_string(), false));
    }

    // Shortcut `![alt]`: only if enabled, never with an empty alt text, and
    // not when followed by `(` — that is an inline image. Reaching the end
    // of the input right after `]` is fine.
    if !allow_shortcut || alt_text.is_empty() || bytes.get(pos) == Some(&b'(') {
        return None;
    }

    Some((pos, alt_text, alt_text.to_string(), true))
}
1108
1109/// Emit a reference image node with registry lookup.
1110pub fn emit_reference_image(
1111    builder: &mut GreenNodeBuilder,
1112    alt_text: &str,
1113    label: &str,
1114    is_shortcut: bool,
1115    config: &ParserOptions,
1116) {
1117    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1118
1119    // Emit as reference image (preserve original syntax)
1120    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1121    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1122    builder.finish_node();
1123
1124    // Alt text (recursively parse inline elements)
1125    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1126    parse_inline_text(builder, alt_text, config, false);
1127    builder.finish_node();
1128
1129    // Closing ] and reference label
1130    builder.token(SyntaxKind::TEXT.into(), "]");
1131
1132    if !is_shortcut {
1133        // Explicit or implicit reference: ![alt][label] or ![alt][]
1134        builder.token(SyntaxKind::TEXT.into(), "[");
1135        builder.start_node(SyntaxKind::LINK_REF.into());
1136        // For implicit references, emit empty label (label == alt means implicit from parser)
1137        if label != alt_text {
1138            builder.token(SyntaxKind::TEXT.into(), label);
1139        }
1140        builder.finish_node();
1141        builder.token(SyntaxKind::TEXT.into(), "]");
1142    }
1143    // For shortcut references, just ![alt] - no second bracket pair
1144
1145    builder.finish_node();
1146}
1147
/// Unit tests for the scanner functions in this module: autolinks, inline
/// links/images (with titles and attributes), bare URIs, and reference
/// links/images. Each test calls a `try_parse_*` function directly and
/// checks the consumed length and the extracted slices.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_autolink_url() {
        let input = "<https://example.com>";
        assert_eq!(
            try_parse_autolink(input, false),
            Some((21, "https://example.com"))
        );
        assert_eq!(
            try_parse_autolink(input, true),
            Some((21, "https://example.com"))
        );
    }

    #[test]
    fn test_parse_autolink_email() {
        let input = "<user@example.com>";
        assert_eq!(
            try_parse_autolink(input, false),
            Some((18, "user@example.com"))
        );
        assert_eq!(
            try_parse_autolink(input, true),
            Some((18, "user@example.com"))
        );
    }

    #[test]
    fn test_parse_autolink_no_close() {
        let input = "<https://example.com";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_with_space() {
        let input = "<https://example.com >";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_not_url_or_email() {
        let input = "<notaurl>";
        assert_eq!(try_parse_autolink(input, false), None);
        assert_eq!(try_parse_autolink(input, true), None);
    }

    #[test]
    fn test_parse_autolink_commonmark_strict_scheme() {
        // Scheme too short (1 char) — invalid under CommonMark, lax-accepted
        // under Pandoc dialect (matches historical behavior).
        let input = "<m:abc>";
        assert_eq!(try_parse_autolink(input, true), None);
        assert_eq!(try_parse_autolink(input, false), Some((7, "m:abc")));
    }

    #[test]
    fn test_parse_autolink_commonmark_email_disallows_backslash() {
        let input = "<foo\\+@bar.example.com>";
        assert_eq!(try_parse_autolink(input, true), None);
        assert_eq!(
            try_parse_autolink(input, false),
            Some((23, "foo\\+@bar.example.com"))
        );
    }

    #[test]
    fn test_parse_inline_link_simple() {
        let input = "[text](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((11, "text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_with_title() {
        let input = r#"[text](url "title")"#;
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((19, "text", r#"url "title""#, None)));
    }

    #[test]
    fn test_parse_inline_link_with_nested_brackets() {
        let input = "[outer [inner] text](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((25, "outer [inner] text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
        let input = "[text] (url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_bracket() {
        let input = "[text(url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_paren() {
        let input = "[text](url";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_link_escaped_bracket() {
        let input = r"[text\]more](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((17, r"text\]more", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_parens_in_url() {
        let input = "[text](url(with)parens)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((23, "text", "url(with)parens", None)));
    }

    #[test]
    fn test_parse_inline_image_simple() {
        let input = "![alt](image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((17, "alt", "image.jpg", None)));
    }

    #[test]
    fn test_parse_inline_image_with_title() {
        let input = r#"![alt](image.jpg "A title")"#;
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((27, "alt", r#"image.jpg "A title""#, None)));
    }

    #[test]
    fn test_parse_inline_image_with_nested_brackets() {
        let input = "![outer [inner] alt](image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((31, "outer [inner] alt", "image.jpg", None)));
    }

    #[test]
    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
        let input = r"a:\]";
        let result = try_parse_bare_uri(input);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
        let input = "![alt] (image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_bracket() {
        let input = "![alt(image.jpg)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_paren() {
        let input = "![alt](image.jpg";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_inline_image_with_simple_class() {
        let input = "![alt](img.png){.large}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 23);
        assert_eq!(alt, "alt");
        assert_eq!(dest, "img.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{.large}");
    }

    #[test]
    fn test_parse_inline_image_with_id() {
        let input = "![Figure 1](fig1.png){#fig-1}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 29);
        assert_eq!(alt, "Figure 1");
        assert_eq!(dest, "fig1.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#fig-1}");
    }

    #[test]
    fn test_parse_inline_image_with_full_attributes() {
        let input = "![alt](img.png){#fig .large width=\"80%\"}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        let (len, alt, dest, attrs) = result.unwrap();
        assert_eq!(len, 40);
        assert_eq!(alt, "alt");
        assert_eq!(dest, "img.png");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#fig .large width=\"80%\"}");
    }

    #[test]
    fn test_parse_inline_image_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        let input = "![alt](img.png) {.large}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(result, Some((15, "alt", "img.png", None)));
    }

    // Link attribute tests
    #[test]
    fn test_parse_inline_link_with_id() {
        let input = "[text](url){#link-1}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 20);
        assert_eq!(text, "text");
        assert_eq!(dest, "url");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#link-1}");
    }

    #[test]
    fn test_parse_inline_link_with_full_attributes() {
        let input = "[text](url){#link .external target=\"_blank\"}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 44);
        assert_eq!(text, "text");
        assert_eq!(dest, "url");
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{#link .external target=\"_blank\"}");
    }

    #[test]
    fn test_parse_inline_link_attributes_must_be_adjacent() {
        // Space between ) and { should not parse as attributes
        let input = "[text](url) {.class}";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(result, Some((11, "text", "url", None)));
    }

    #[test]
    fn test_parse_inline_link_with_title_and_attributes() {
        let input = r#"[text](url "title"){.external}"#;
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(len, 30);
        assert_eq!(text, "text");
        assert_eq!(dest, r#"url "title""#);
        assert!(attrs.is_some());
        let attrs = attrs.unwrap();
        assert_eq!(attrs, "{.external}");
    }

    // Reference link tests
    #[test]
    fn test_parse_reference_link_explicit() {
        let input = "[link text][label]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((18, "link text", "label".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_link_implicit() {
        let input = "[link text][]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((13, "link text", String::new(), false)));
    }

    #[test]
    fn test_parse_reference_link_explicit_same_label_as_text() {
        let input = "[stack][stack]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, Some((14, "stack", "stack".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_link_shortcut() {
        let input = "[link text] rest";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(
            result,
            Some((11, "link text", "link text".to_string(), true))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_rejects_empty_label() {
        let input = "[] rest";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_disabled() {
        let input = "[link text] rest";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_not_inline_link() {
        // With shortcut disabled, `[text](url)` is rejected so the inline
        // link form upstream gets exclusive ownership.
        let input = "[text](url)";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
        // CommonMark spec example #568: when an inline-link attempt would
        // fail (here we model the reachability — the caller tries inline
        // link first; if that returns None, we should still see `[text]`
        // as a shortcut and leave `(url)` to be parsed as following text).
        let input = "[text](url)";
        let result = try_parse_reference_link(input, true, true, LinkScanContext::default());
        assert_eq!(result, Some((6, "text", "text".to_string(), true)));
    }

    #[test]
    fn test_parse_reference_link_with_nested_brackets() {
        let input = "[outer [inner] text][ref]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(
            result,
            Some((25, "outer [inner] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_label_no_newline() {
        let input = "[text][label\nmore]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());
        assert_eq!(result, None);
    }

    // Reference image tests
    #[test]
    fn test_parse_reference_image_explicit() {
        let input = "![alt text][label]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(result, Some((18, "alt text", "label".to_string(), false)));
    }

    #[test]
    fn test_parse_reference_image_implicit() {
        let input = "![alt text][]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(
            result,
            Some((13, "alt text", "alt text".to_string(), false))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut() {
        let input = "![alt text] rest";
        let result = try_parse_reference_image(input, true);
        assert_eq!(result, Some((11, "alt text", "alt text".to_string(), true)));
    }

    #[test]
    fn test_parse_reference_image_shortcut_disabled() {
        let input = "![alt text] rest";
        let result = try_parse_reference_image(input, false);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_image_not_inline() {
        // Should not match inline images with (url)
        let input = "![alt](url)";
        let result = try_parse_reference_image(input, true);
        assert_eq!(result, None);
    }

    #[test]
    fn test_parse_reference_image_with_nested_brackets() {
        let input = "![alt [nested] text][ref]";
        let result = try_parse_reference_image(input, false);
        assert_eq!(
            result,
            Some((25, "alt [nested] text", "ref".to_string(), false))
        );
    }

    #[test]
    fn test_reference_link_label_with_crlf() {
        // Reference link labels should not span lines with CRLF
        let input = "[foo\r\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());

        // Should fail to parse because label contains line break
        assert_eq!(
            result, None,
            "Should not parse reference link with CRLF in label"
        );

        // The single-bracket input above has no second bracket pair and
        // shortcuts are disabled, so it is rejected before any label is
        // scanned. Also cover the explicit `[text][label]` form, where the
        // label itself contains the line break.
        let explicit = "[text][foo\r\nbar]";
        assert_eq!(
            try_parse_reference_link(explicit, false, true, LinkScanContext::default()),
            None,
            "Should not parse explicit reference link with CRLF in label"
        );
    }

    #[test]
    fn test_reference_link_label_with_lf() {
        // Reference link labels should not span lines with LF either
        let input = "[foo\nbar]";
        let result = try_parse_reference_link(input, false, true, LinkScanContext::default());

        // Should fail to parse because label contains line break
        assert_eq!(
            result, None,
            "Should not parse reference link with LF in label"
        );

        // Same as the CRLF case: exercise the explicit form so the label
        // scanner is what rejects the input.
        let explicit = "[text][foo\nbar]";
        assert_eq!(
            try_parse_reference_link(explicit, false, true, LinkScanContext::default()),
            None,
            "Should not parse explicit reference link with LF in label"
        );
    }

    // Multiline link text tests
    #[test]
    fn test_parse_inline_link_multiline_text() {
        // Per Pandoc spec, link text CAN contain newlines (soft breaks)
        let input = "[text on\nline two](url)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert_eq!(
            result,
            Some((23, "text on\nline two", "url", None)),
            "Link text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_link_multiline_with_formatting() {
        // Link text with newlines and other inline elements
        let input =
            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert!(result.is_some(), "Link text with newlines should parse");
        let (len, text, _dest, _attrs) = result.unwrap();
        assert!(text.contains('\n'), "Link text should preserve newline");
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_image_multiline_alt() {
        // Per Pandoc spec, image alt text CAN contain newlines
        let input = "![alt on\nline two](img.png)";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert_eq!(
            result,
            Some((27, "alt on\nline two", "img.png", None)),
            "Image alt text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_image_multiline_with_attributes() {
        // Image with multiline alt text and attributes
        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
        let result = try_parse_inline_image(input, LinkScanContext::default());
        assert!(
            result.is_some(),
            "Image alt with newlines and attributes should parse"
        );
        let (len, alt, dest, attrs) = result.unwrap();
        assert!(alt.contains('\n'), "Alt text should preserve newline");
        assert_eq!(dest, "../images/fig.png");
        assert_eq!(attrs, Some("{width=70%}"));
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_link_with_attributes_after_newline() {
        // Test for regression: when text is concatenated with newlines,
        // attributes after ) should still be recognized
        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
        let result = try_parse_inline_link(input, false, LinkScanContext::default());
        assert!(
            result.is_some(),
            "Link with attributes should parse even with following text"
        );
        let (len, text, dest, attrs) = result.unwrap();
        assert_eq!(text, "A network graph.");
        assert_eq!(dest, "../images/networkfig.png");
        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
        assert_eq!(
            len, 55,
            "Length should include attributes (up to closing brace)"
        );
    }
}
panache_parser/parser/inlines/links.rs

panache_parser/parser/inlines/
links.rs