panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::code_spans::try_parse_code_span;
13use super::core::parse_inline_text;
14use super::inline_html::try_parse_inline_html;
15use crate::options::ParserOptions;
16use crate::syntax::SyntaxKind;
17use rowan::GreenNodeBuilder;
18
19// Import attribute parsing
20use crate::parser::utils::attributes::{emit_attribute_node, try_parse_trailing_attributes};
21
/// Flags that control which inline spans the link-bracket scanner treats as
/// opaque (so a `]` inside them does not terminate the link/image text).
///
/// - `skip_raw_html` is universal across dialects: pandoc-markdown and
///   CommonMark both refuse to close link text inside a raw HTML span (e.g.
///   `[foo <bar attr="](baz)">`), per CommonMark spec example #524 / #536.
/// - `skip_autolinks` is **CommonMark-only**. Pandoc-markdown does *not*
///   treat `<scheme://...>` as opaque inside link text, so the same input
///   produces a different parse under each dialect (CommonMark spec example
///   #526 / #538). Always derive this from
///   `extensions.autolinks && dialect == Dialect::CommonMark`.
/// - `disallow_inner_links` is a **CommonMark-only** structural rule (§6.4):
///   "Links may not contain other links, at any level of nesting." When the
///   candidate link/image text contains a valid inline link or image, the
///   outer match is rejected so the inner-most definition is used instead
///   (spec examples #518–#520, #532). Pandoc-markdown allows nested links,
///   so the flag is `false` there.
///
/// The struct is `Copy`, so the scanners in this module take it by value.
#[derive(Clone, Copy)]
pub struct LinkScanContext {
    /// Treat inline raw HTML spans as opaque while scanning for `]`.
    pub skip_raw_html: bool,
    /// Treat `<...>` autolinks as opaque while scanning for `]`.
    pub skip_autolinks: bool,
    /// Reject an outer link whose text contains a valid inner inline link.
    pub disallow_inner_links: bool,
    /// Dialect controlling which HTML constructs the raw-HTML opacity check
    /// recognizes. Pandoc-markdown excludes bare declarations and CDATA
    /// from its inline raw HTML grammar.
    pub dialect: crate::options::Dialect,
}
49
50impl Default for LinkScanContext {
51    fn default() -> Self {
52        Self {
53            skip_raw_html: false,
54            skip_autolinks: false,
55            disallow_inner_links: false,
56            dialect: crate::options::Dialect::Pandoc,
57        }
58    }
59}
60
61impl LinkScanContext {
62    pub fn from_options(config: &ParserOptions) -> Self {
63        let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
64        Self {
65            skip_raw_html: config.extensions.raw_html,
66            skip_autolinks: config.extensions.autolinks && is_commonmark,
67            disallow_inner_links: is_commonmark,
68            dialect: config.dialect,
69        }
70    }
71}
72
73/// Find the closing `]` of a link/image text span, starting from `start`.
74///
75/// Walks `text[start..]` tracking nested brackets and backslash escapes. When
76/// a backtick run starting a valid code span is encountered, the entire span
77/// (including any trailing attribute block) is skipped — per CommonMark §6
78/// precedence, code spans bind tighter than links/images, so a `]` *inside*
79/// a code span cannot terminate the link's text. The same opacity applies to
80/// raw HTML and (CommonMark-only) autolink spans gated through `ctx`.
81/// Returns the byte offset of the closing `]` within `text`, or `None` if no
82/// unmatched `]` is reached.
83fn find_link_close_bracket(text: &str, start: usize, ctx: LinkScanContext) -> Option<usize> {
84    let bytes = text.as_bytes();
85    let mut bracket_depth = 0;
86    let mut escape_next = false;
87    let mut i = start;
88
89    while i < bytes.len() {
90        let b = bytes[i];
91
92        if escape_next {
93            escape_next = false;
94            i += step(text, i);
95            continue;
96        }
97
98        match b {
99            b'\\' => {
100                escape_next = true;
101                i += 1;
102            }
103            b'`' => {
104                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
105                    i += len;
106                } else {
107                    i += 1;
108                }
109            }
110            b'<' => {
111                // Order matters: autolinks are the more specific `<...>`
112                // shape (URI/email between angle brackets), so try that
113                // before falling through to general inline raw HTML which
114                // would also match `<bar attr="...">`-style tags.
115                if ctx.skip_autolinks
116                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
117                {
118                    i += len;
119                } else if ctx.skip_raw_html
120                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
121                {
122                    i += len;
123                } else {
124                    i += 1;
125                }
126            }
127            b'[' => {
128                bracket_depth += 1;
129                i += 1;
130            }
131            b']' => {
132                if bracket_depth == 0 {
133                    return Some(i);
134                }
135                bracket_depth -= 1;
136                i += 1;
137            }
138            _ => i += step(text, i),
139        }
140    }
141    None
142}
143
144/// Find the closing `)` of a link/image destination, given the text *after*
145/// the opening `(`. Tracks paren nesting, quoted titles, and angle-bracketed
146/// destinations (`<...>` may legitimately contain unbalanced parens — see
147/// spec example #499). Returns the byte offset of the closing `)` within the
148/// passed slice, or `None` if not found.
149fn find_dest_close_paren(remaining: &str) -> Option<usize> {
150    let bytes = remaining.as_bytes();
151    let mut paren_depth = 0;
152    let mut escape_next = false;
153    let mut in_quotes = false;
154    let mut in_angle = false;
155    let mut i = 0;
156
157    while i < bytes.len() {
158        let b = bytes[i];
159
160        if escape_next {
161            escape_next = false;
162            i += step(remaining, i);
163            continue;
164        }
165
166        match b {
167            b'\\' => {
168                escape_next = true;
169                i += 1;
170            }
171            b'<' if !in_quotes && !in_angle => {
172                in_angle = true;
173                i += 1;
174            }
175            b'>' if in_angle => {
176                in_angle = false;
177                i += 1;
178            }
179            b'"' if !in_angle => {
180                in_quotes = !in_quotes;
181                i += 1;
182            }
183            b'(' if !in_quotes && !in_angle => {
184                paren_depth += 1;
185                i += 1;
186            }
187            b')' if !in_quotes && !in_angle => {
188                if paren_depth == 0 {
189                    return Some(i);
190                }
191                paren_depth -= 1;
192                i += 1;
193            }
194            _ => i += step(remaining, i),
195        }
196    }
197    None
198}
199
/// Width in bytes of the UTF-8 character that begins at byte index `i` of
/// `s`, or `1` when `i` is at the end of the string. Index-based scanners
/// use it to advance one whole character at a time so they never land in
/// the middle of a multi-byte sequence.
fn step(s: &str, i: usize) -> usize {
    match s[i..].chars().next() {
        Some(ch) => ch.len_utf8(),
        None => 1,
    }
}
206
207/// CommonMark §6.4: "Links may not contain other links, at any level of
208/// nesting. If multiple otherwise valid link definitions appear nested inside
209/// each other, the inner-most definition is used." This helper scans a
210/// candidate link text for any `[` that starts a valid inline link; when
211/// found, the outer link must be rejected so the inner-most wins (spec
212/// examples #518–#519, #532).
213///
214/// Images themselves do not count as inner links — a link can contain an
215/// image (#517, #531). A link *inside* an image's alt text, however, still
216/// deactivates outer link openers per CommonMark's bracket-scanner rules, so
217/// the helper recurses into image alt text looking for inner links.
218///
219/// Reference-link nesting (#533, #569, #571) requires resolving labels
220/// against the document's reference-definition map, which the parser does
221/// not have at this point — those cases remain unhandled and need a later
222/// stack-based pass.
223fn link_text_contains_inner_link(text: &str, ctx: LinkScanContext, strict_dest: bool) -> bool {
224    let bytes = text.as_bytes();
225    let mut i = 0;
226    let mut escape_next = false;
227    while i < bytes.len() {
228        let b = bytes[i];
229        if escape_next {
230            escape_next = false;
231            i += step(text, i);
232            continue;
233        }
234        match b {
235            b'\\' => {
236                escape_next = true;
237                i += 1;
238            }
239            b'`' => {
240                if let Some((len, _, _, _)) = try_parse_code_span(&text[i..]) {
241                    i += len;
242                } else {
243                    i += 1;
244                }
245            }
246            b'<' => {
247                if ctx.skip_autolinks
248                    && let Some((len, _)) = try_parse_autolink(&text[i..], true)
249                {
250                    i += len;
251                } else if ctx.skip_raw_html
252                    && let Some(len) = try_parse_inline_html(&text[i..], ctx.dialect)
253                {
254                    i += len;
255                } else {
256                    i += 1;
257                }
258            }
259            b'!' if i + 1 < bytes.len() && bytes[i + 1] == b'[' => {
260                if let Some((len, alt, _, _)) = try_parse_inline_image(&text[i..], ctx) {
261                    if link_text_contains_inner_link(alt, ctx, strict_dest) {
262                        return true;
263                    }
264                    i += len;
265                } else {
266                    i += 2;
267                }
268            }
269            b'[' => {
270                if try_parse_inline_link(&text[i..], strict_dest, ctx).is_some() {
271                    return true;
272                }
273                i += 1;
274            }
275            _ => i += step(text, i),
276        }
277    }
278    false
279}
280
281/// Try to parse an inline image starting at the current position.
282///
283/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
284/// Can also have trailing attributes: `![alt](url){#id .class}`.
285/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
286///
287/// `ctx` controls bracket-scanner opacity for raw HTML / autolink spans;
288/// see `LinkScanContext`.
289pub fn try_parse_inline_image(
290    text: &str,
291    ctx: LinkScanContext,
292) -> Option<(usize, &str, &str, Option<&str>)> {
293    if !text.starts_with("![") {
294        return None;
295    }
296
297    // Find the closing ]
298    let close_bracket = find_link_close_bracket(text, 2, ctx)?;
299    let alt_text = &text[2..close_bracket];
300
301    // Check for immediate ( after ]
302    let after_bracket = close_bracket + 1;
303    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
304        return None;
305    }
306
307    // Find closing ) for destination (reuse same logic as links)
308    let dest_start = after_bracket + 1;
309    let remaining = &text[dest_start..];
310
311    let close_paren = find_dest_close_paren(remaining)?;
312    let dest_content = &remaining[..close_paren];
313
314    // Check for trailing attributes {#id .class key=value}
315    let after_paren = dest_start + close_paren + 1;
316    let after_close = &text[after_paren..];
317
318    // Attributes must start immediately after closing paren (no whitespace/newlines)
319    if after_close.starts_with('{') {
320        // Find the closing brace
321        if let Some(close_brace_pos) = after_close.find('}') {
322            let attr_text = &after_close[..=close_brace_pos];
323            // Try to parse as attributes to validate
324            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
325                let total_len = after_paren + close_brace_pos + 1;
326                // Return raw attribute string for lossless parsing
327                let raw_attrs = attr_text;
328                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
329            }
330        }
331    }
332
333    // No attributes, just return the image
334    let total_len = after_paren;
335    Some((total_len, alt_text, dest_content, None))
336}
337
338/// Emit an inline image node to the builder.
339/// Note: alt_text may contain inline elements and should be parsed recursively.
340pub fn emit_inline_image(
341    builder: &mut GreenNodeBuilder,
342    _text: &str,
343    alt_text: &str,
344    dest: &str,
345    raw_attributes: Option<&str>,
346    config: &ParserOptions,
347    suppress_footnote_refs: bool,
348) {
349    builder.start_node(SyntaxKind::IMAGE_LINK.into());
350
351    // Opening ![
352    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
353    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
354    builder.finish_node();
355
356    // Alt text (recursively parse inline elements)
357    builder.start_node(SyntaxKind::IMAGE_ALT.into());
358    // Use the standalone parse_inline_text function for recursive parsing
359    // Note: nested contexts don't resolve references
360    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
361    builder.finish_node();
362
363    // Closing ]
364    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
365
366    // Opening (
367    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
368
369    // Destination
370    builder.start_node(SyntaxKind::LINK_DEST.into());
371    builder.token(SyntaxKind::TEXT.into(), dest);
372    builder.finish_node();
373
374    // Closing )
375    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
376
377    // Emit raw attributes if present (preserve original formatting)
378    if let Some(raw_attrs) = raw_attributes {
379        emit_attribute_node(builder, raw_attrs);
380    }
381
382    builder.finish_node();
383}
384
385/// Try to parse an automatic link starting at the current position.
386///
387/// Automatic links have the form `<url>` (URI autolink) or `<email>`
388/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
389/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
390/// ASCII chars; email local parts cannot contain backslashes). Pandoc
391/// markdown is laxer — it accepts Unicode in email addresses, for
392/// example — so non-CommonMark callers fall back to the heuristic
393/// "contains `:` or `@`" check that the parser used historically.
394pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
395    if !text.starts_with('<') {
396        return None;
397    }
398
399    let close_pos = text[1..].find('>')?;
400    let content = &text[1..1 + close_pos];
401
402    if content.is_empty() {
403        return None;
404    }
405    if content.contains(|c: char| c.is_whitespace()) {
406        return None;
407    }
408
409    if is_commonmark {
410        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
411            return None;
412        }
413    } else if !content.contains(':') && !content.contains('@') {
414        return None;
415    }
416
417    Some((close_pos + 2, content))
418}
419
/// CommonMark §6.4 URI autolink check for the text between `<` and `>`.
///
/// The grammar is `scheme ":" body` where:
/// - the scheme is 2–32 characters long, starts with an ASCII letter, and
///   continues with ASCII letters, digits, `+`, `-`, or `.`;
/// - the body (possibly empty) contains no ASCII control character, no
///   space, and no `<` or `>`. Non-ASCII bytes are allowed.
///
/// Space is rejected here as well, so the function matches the documented
/// grammar on its own rather than relying on callers to filter whitespace
/// beforehand.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    let Some((first, rest)) = bytes.split_first() else {
        return false;
    };
    if !first.is_ascii_alphabetic() {
        return false;
    }

    // Scheme length = leading letter + the run of scheme characters after it.
    let scheme_len = 1 + rest
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric() || matches!(**b, b'+' | b'-' | b'.'))
        .count();
    if !(2..=32).contains(&scheme_len) {
        return false;
    }
    if bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `b > 0x20` excludes the C0 controls and the space character itself.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
450
451/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
452/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
453///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
454fn is_valid_email_autolink(s: &str) -> bool {
455    let Some(at) = s.find('@') else {
456        return false;
457    };
458    let local = &s[..at];
459    let domain = &s[at + 1..];
460    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
461        return false;
462    }
463    if domain.is_empty() {
464        return false;
465    }
466    domain.split('.').all(is_valid_email_label)
467}
468
/// Whether `b` may appear in the local part (before `@`) of an email
/// autolink: an ASCII letter or digit, or one of
/// ``. ! # $ % & ' * + / = ? ^ _ ` { | } ~ -``.
fn is_email_local_byte(b: u8) -> bool {
    const SPECIALS: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || SPECIALS.contains(&b)
}
497
/// Whether `label` is a valid domain label for an email autolink: 1–63
/// bytes, starting and ending with an ASCII letter or digit, with only ASCII
/// letters, digits, or `-` in between.
///
/// A one-byte label is handled explicitly. Slicing the "interior" as
/// `bytes[1..len - 1]` would be the inverted range `1..0` for such a label
/// and panic.
fn is_valid_email_label(label: &str) -> bool {
    match label.as_bytes() {
        [] => false,
        bytes if bytes.len() > 63 => false,
        [only] => only.is_ascii_alphanumeric(),
        [first, interior @ .., last] => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                && interior
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
        }
    }
}
513
514/// Emit an automatic link node to the builder.
515pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
516    builder.start_node(SyntaxKind::AUTO_LINK.into());
517
518    // Opening <
519    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
520    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
521    builder.finish_node();
522
523    // URL content
524    builder.token(SyntaxKind::TEXT.into(), url);
525
526    // Closing >
527    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
528    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
529    builder.finish_node();
530
531    builder.finish_node();
532}
533
534// Recognized URI schemes for pandoc's `autolink_bare_uris` extension.
535// Generated at build time by `build.rs`
536// from the vendored IANA registry plus pandoc's nonstandard additions,
537// as a sorted `const BARE_URI_SCHEMES: &[&str]`.
538// A `scheme:` prefix outside this set stays literal.
539include!(concat!(env!("OUT_DIR"), "/uri_schemes.rs"));
540
541/// Returns `true` if `scheme` (matched case-insensitively) is a recognized URI scheme.
542/// See [`BARE_URI_SCHEMES`].
543fn is_known_bare_uri_scheme(scheme: &str) -> bool {
544    let lower = scheme.to_ascii_lowercase();
545    BARE_URI_SCHEMES.binary_search(&lower.as_str()).is_ok()
546}
547
548pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
549    let mut chars = text.char_indices();
550    let (_, first) = chars.next()?;
551    if !first.is_ascii_alphabetic() {
552        return None;
553    }
554
555    let mut scheme_end = None;
556    for (idx, ch) in text.char_indices() {
557        if ch == ':' {
558            scheme_end = Some(idx);
559            break;
560        }
561        if !ch.is_ascii_alphanumeric() && ch != '+' && ch != '-' && ch != '.' {
562            return None;
563        }
564    }
565    let scheme_end = scheme_end?;
566    if scheme_end == 0 {
567        return None;
568    }
569
570    if !is_known_bare_uri_scheme(&text[..scheme_end]) {
571        return None;
572    }
573
574    let mut end = scheme_end + 1;
575    let bytes = text.as_bytes();
576    while end < text.len() {
577        let b = bytes[end];
578        if b.is_ascii_whitespace() {
579            break;
580        }
581        if matches!(b, b'<' | b'>' | b'`' | b'"' | b'\'') {
582            break;
583        }
584        end += 1;
585    }
586
587    if end == scheme_end + 1 {
588        return None;
589    }
590
591    let mut trimmed = end;
592    while trimmed > scheme_end + 1 {
593        let ch = text[..trimmed].chars().last().unwrap();
594        if matches!(ch, '.' | ',' | ';' | ':' | ')' | ']' | '}') {
595            trimmed -= ch.len_utf8();
596        } else {
597            break;
598        }
599    }
600
601    if trimmed <= scheme_end + 1 {
602        return None;
603    }
604
605    // If trimming terminal punctuation leaves a dangling backslash, the match
606    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
607    if text[..trimmed].ends_with('\\') {
608        return None;
609    }
610
611    Some((trimmed, &text[..trimmed]))
612}
613
614/// Try to parse an inline link starting at the current position.
615///
616/// Inline links have the form `[text](url)` or `[text](url "title")`.
617/// Can also have trailing attributes: `[text](url){#id .class}`.
618/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
619///
620/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
621/// the bare destination form may not contain spaces or ASCII control
622/// characters and must have balanced parentheses; if a title follows it
623/// must be properly delimited; only whitespace is allowed before/after.
624/// Pandoc-markdown is more permissive, so leave this off for that dialect.
625pub fn try_parse_inline_link(
626    text: &str,
627    strict_dest: bool,
628    ctx: LinkScanContext,
629) -> Option<(usize, &str, &str, Option<&str>)> {
630    if !text.starts_with('[') {
631        return None;
632    }
633
634    // Find the closing ]
635    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
636    let link_text = &text[1..close_bracket];
637
638    // Check for immediate ( after ]
639    let after_bracket = close_bracket + 1;
640    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
641        return None;
642    }
643
644    // Find closing ) for destination
645    let dest_start = after_bracket + 1;
646    let remaining = &text[dest_start..];
647
648    let close_paren = find_dest_close_paren(remaining)?;
649    let dest_content = &remaining[..close_paren];
650
651    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
652        return None;
653    }
654
655    // CommonMark §6.4: outer link is rejected when its text contains a valid
656    // inner inline link or image, so the inner-most definition wins.
657    if ctx.disallow_inner_links && link_text_contains_inner_link(link_text, ctx, strict_dest) {
658        return None;
659    }
660
661    // Check for trailing attributes {#id .class key=value}
662    let after_paren = dest_start + close_paren + 1;
663    let after_close = &text[after_paren..];
664
665    // Attributes must start immediately after closing paren (no whitespace/newlines)
666    if after_close.starts_with('{') {
667        // Find the closing brace
668        if let Some(close_brace_pos) = after_close.find('}') {
669            let attr_text = &after_close[..=close_brace_pos];
670            // Try to parse as attributes to validate
671            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
672                let total_len = after_paren + close_brace_pos + 1;
673                // Return raw attribute string for lossless parsing
674                let raw_attrs = attr_text;
675                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
676            }
677        }
678    }
679
680    // No attributes, just return the link
681    let total_len = after_paren;
682    Some((total_len, link_text, dest_content, None))
683}
684
685/// CommonMark §6.4 destination + optional title validation. The text passed
686/// in is whatever the parser captured between `(` and `)`. A valid form is:
687/// `[ws] destination [ws title [ws]]` where:
688/// - bare destination has no spaces, tabs, ASCII control chars, and balanced
689///   parentheses (escaped parens permitted);
690/// - bracketed destination is `<...>` with no newlines and no unescaped `<>`;
691/// - the optional title is delimited by `"..."`, `'...'`, or `(...)`;
692/// - any text outside that structure invalidates the link.
693fn dest_and_title_ok_commonmark(content: &str) -> bool {
694    let trimmed = trim_start_link_ws(content);
695    if trimmed.is_empty() {
696        return true;
697    }
698
699    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
700        let mut escape = false;
701        let mut end_byte = None;
702        for (i, c) in rest.char_indices() {
703            if escape {
704                escape = false;
705                continue;
706            }
707            match c {
708                '\\' => escape = true,
709                '\n' | '<' => return false,
710                '>' => {
711                    end_byte = Some(i);
712                    break;
713                }
714                _ => {}
715            }
716        }
717        match end_byte {
718            Some(e) => &rest[e + 1..],
719            None => return false,
720        }
721    } else {
722        let mut escape = false;
723        let mut depth: i32 = 0;
724        let mut end = trimmed.len();
725        for (i, c) in trimmed.char_indices() {
726            if escape {
727                escape = false;
728                continue;
729            }
730            match c {
731                '\\' => escape = true,
732                ' ' | '\t' | '\n' => {
733                    end = i;
734                    break;
735                }
736                _ if c.is_ascii_control() => return false,
737                '(' => depth += 1,
738                ')' => {
739                    if depth == 0 {
740                        end = i;
741                        break;
742                    }
743                    depth -= 1;
744                }
745                _ => {}
746            }
747        }
748        if depth != 0 {
749            return false;
750        }
751        if end == 0 {
752            // bare destination must be nonempty if the field is non-blank
753            return false;
754        }
755        &trimmed[end..]
756    };
757
758    let after_dest = trim_start_link_ws(after_dest);
759    if after_dest.is_empty() {
760        return true;
761    }
762
763    let bytes = after_dest.as_bytes();
764    let close = match bytes[0] {
765        b'"' => b'"',
766        b'\'' => b'\'',
767        b'(' => b')',
768        _ => return false,
769    };
770    let opens_paren = bytes[0] == b'(';
771    let mut escape = false;
772    let mut title_close_pos = None;
773    for (i, &b) in after_dest.as_bytes().iter().enumerate().skip(1) {
774        if escape {
775            escape = false;
776            continue;
777        }
778        if b == b'\\' {
779            escape = true;
780            continue;
781        }
782        if opens_paren && b == b'(' {
783            return false;
784        }
785        if b == close {
786            title_close_pos = Some(i);
787            break;
788        }
789    }
790    let close_idx = match title_close_pos {
791        Some(p) => p,
792        None => return false,
793    };
794
795    let after_title = &after_dest[close_idx + 1..];
796    is_link_ws_only(after_title)
797}
798
/// Strip leading ASCII space/tab/newline bytes. Byte-level equivalent of
/// `s.trim_start_matches([' ', '\t', '\n'])`; called for every
/// CommonMark inline-link destination/title scan, so the slice-pattern
/// MultiCharEqSearcher overhead matters.
///
/// Only single-byte ASCII characters are skipped, so the resulting offset is
/// always a character boundary and the string can be sliced safely — no
/// `unsafe` is needed.
#[inline]
fn trim_start_link_ws(s: &str) -> &str {
    let skipped = s
        .bytes()
        .take_while(|b| matches!(b, b' ' | b'\t' | b'\n'))
        .count();
    &s[skipped..]
}
818
/// Whether `s` consists solely of ASCII space, tab, or newline bytes. An
/// empty string qualifies.
#[inline]
fn is_link_ws_only(s: &str) -> bool {
    s.bytes().all(|b| matches!(b, b' ' | b'\t' | b'\n'))
}
825
826/// Emit an inline link node to the builder.
827/// Note: link_text may contain inline elements and should be parsed recursively.
828pub fn emit_inline_link(
829    builder: &mut GreenNodeBuilder,
830    _text: &str,
831    link_text: &str,
832    dest: &str,
833    raw_attributes: Option<&str>,
834    config: &ParserOptions,
835    suppress_footnote_refs: bool,
836) {
837    builder.start_node(SyntaxKind::LINK.into());
838
839    // Opening [
840    builder.start_node(SyntaxKind::LINK_START.into());
841    builder.token(SyntaxKind::LINK_START.into(), "[");
842    builder.finish_node();
843
844    // Link text (recursively parse inline elements). Pandoc-native:
845    // links cannot contain other links, so suppress inner LINK / ref-link
846    // recognition during the recursion. Images, emphasis, code, etc. are
847    // still recognised. CommonMark relies on outer-level process_brackets
848    // to prevent nested links, but the flag is harmless under CM.
849    builder.start_node(SyntaxKind::LINK_TEXT.into());
850    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
851    builder.finish_node();
852
853    // Closing ]
854    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
855
856    // Opening (
857    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
858
859    // Destination
860    builder.start_node(SyntaxKind::LINK_DEST.into());
861    builder.token(SyntaxKind::TEXT.into(), dest);
862    builder.finish_node();
863
864    // Closing )
865    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
866
867    // Emit raw attributes if present (preserve original formatting)
868    if let Some(raw_attrs) = raw_attributes {
869        emit_attribute_node(builder, raw_attrs);
870    }
871
872    builder.finish_node();
873}
874
875pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
876    builder.start_node(SyntaxKind::LINK.into());
877
878    builder.start_node(SyntaxKind::LINK_START.into());
879    builder.token(SyntaxKind::LINK_START.into(), "[");
880    builder.finish_node();
881
882    builder.start_node(SyntaxKind::LINK_TEXT.into());
883    builder.token(SyntaxKind::TEXT.into(), uri);
884    builder.finish_node();
885
886    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
887    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
888
889    builder.start_node(SyntaxKind::LINK_DEST.into());
890    builder.token(SyntaxKind::TEXT.into(), uri);
891    builder.finish_node();
892
893    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
894
895    builder.finish_node();
896}
897
898/// Try to parse a reference link starting at the current position.
899///
900/// Reference links have three forms:
901/// - Explicit: `[text][label]`
902/// - Implicit: `[text][]` (label = text)
903/// - Shortcut: `[text]` (if shortcut_reference_links enabled)
904///
905/// Returns Some((length, text_content, label, is_shortcut)) if a valid reference link is found.
906/// The label is what should be looked up in the registry.
907pub fn try_parse_reference_link(
908    text: &str,
909    allow_shortcut: bool,
910    inline_link_attempted: bool,
911    allow_spaced: bool,
912    ctx: LinkScanContext,
913) -> Option<(usize, &str, String, &str, bool)> {
914    if !text.starts_with('[') {
915        return None;
916    }
917
918    // Don't match citations (which start with [@) or suppress-author citations (which start with [-@)
919    if text.len() > 1 {
920        let bytes = text.as_bytes();
921        if bytes[1] == b'@' {
922            return None;
923        }
924        if bytes[1] == b'-' && text.len() > 2 && bytes[2] == b'@' {
925            return None;
926        }
927    }
928
929    // Find the closing ] for the text. Uses the shared helper so that a
930    // `]` inside a code span doesn't terminate the link text (CommonMark
931    // §6 — code spans bind tighter than links). See spec examples #342
932    // and #525. Raw HTML and (CommonMark-only) autolink spans are also
933    // opaque per `ctx`.
934    let close_bracket = find_link_close_bracket(text, 1, ctx)?;
935    let link_text = &text[1..close_bracket];
936
937    // CommonMark §6.4: outer reference link is rejected when its text contains
938    // a valid inner inline link/image (spec example #532). Reference-link
939    // nesting (#533/#569/#571) is not handled here; it requires resolving
940    // labels against the document refdef map.
941    if ctx.disallow_inner_links
942        && link_text_contains_inner_link(link_text, ctx, ctx.disallow_inner_links)
943    {
944        return None;
945    }
946
947    // Check what follows the ]
948    let after_bracket = close_bracket + 1;
949
950    // `[content]{...}` is reserved for bracketed spans / attribute
951    // trailers, never a shortcut.
952    if after_bracket < text.len() && text[after_bracket..].starts_with('{') {
953        return None;
954    }
955
956    // `[text](...)` is the inline-link shape. CommonMark spec example
957    // #568 (`[foo](not a link)` with `[foo]: /url`) requires the shortcut
958    // to succeed for `[foo]`, leaving `(not a link)` as literal text when
959    // the upstream inline-link parse was rejected by `strict_dest`. We
960    // only fall through to shortcut here when the caller has already
961    // tried the inline-link form (`inline_link_attempted`) — otherwise
962    // disabling the `inline_links` extension would silently let
963    // `[text](url)` become a shortcut + literal text, which the
964    // `inline_links_disabled_keeps_inline_link_literal` test guards
965    // against.
966    if after_bracket < text.len()
967        && text[after_bracket..].starts_with('(')
968        && (!allow_shortcut || !inline_link_attempted)
969    {
970        return None;
971    }
972
973    // Pandoc `spaced_reference_links`: allow whitespace (space, tab, and a
974    // single LF — block parsing already enforces blank-line boundaries) between
975    // the link-text `]` and the label `[`. Without the extension, gap stays
976    // empty and the next byte must be `[` directly.
977    let gap_end = if allow_spaced {
978        let bytes = text.as_bytes();
979        let mut p = after_bracket;
980        let mut saw_newline = false;
981        while p < bytes.len() {
982            match bytes[p] {
983                b' ' | b'\t' => p += 1,
984                b'\n' if !saw_newline => {
985                    saw_newline = true;
986                    p += 1;
987                }
988                _ => break,
989            }
990        }
991        p
992    } else {
993        after_bracket
994    };
995    let gap = &text[after_bracket..gap_end];
996
997    // Check for explicit reference [text][label] or implicit [text][]
998    if gap_end < text.len() && text[gap_end..].starts_with('[') {
999        // Find the closing ] for the label
1000        let label_start = gap_end + 1;
1001        let mut label_end = None;
1002
1003        for (i, ch) in text[label_start..].char_indices() {
1004            if ch == ']' {
1005                label_end = Some(i + label_start);
1006                break;
1007            }
1008            // Labels can't contain newlines
1009            if ch == '\n' {
1010                return None;
1011            }
1012        }
1013
1014        let label_end = label_end?;
1015        let label = &text[label_start..label_end];
1016
1017        // Total length includes both bracket pairs (and any gap between them)
1018        let total_len = label_end + 1;
1019
1020        // Implicit reference: empty label means emit [text][]
1021        if label.is_empty() {
1022            return Some((total_len, link_text, String::new(), gap, false));
1023        }
1024
1025        // Explicit reference: use the provided label
1026        Some((total_len, link_text, label.to_string(), gap, false))
1027    } else if allow_shortcut {
1028        // Shortcut reference: [text] with no second bracket pair
1029        // The text is both the display text and the label. Any whitespace we
1030        // tentatively consumed for the spaced-form lookahead belongs to the
1031        // surrounding text, so we report the shortcut at its strict length.
1032        if link_text.is_empty() {
1033            return None;
1034        }
1035        Some((after_bracket, link_text, link_text.to_string(), "", true))
1036    } else {
1037        // No second bracket pair and shortcut not allowed - not a reference link
1038        None
1039    }
1040}
1041
1042/// Emit a reference link node to the builder.
1043/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
1044/// `gap` carries any whitespace consumed between the link-text `]` and the
1045/// label `[` under `spaced_reference_links`; empty otherwise.
1046pub fn emit_reference_link(
1047    builder: &mut GreenNodeBuilder,
1048    link_text: &str,
1049    label: &str,
1050    gap: &str,
1051    is_shortcut: bool,
1052    config: &ParserOptions,
1053    suppress_footnote_refs: bool,
1054) {
1055    builder.start_node(SyntaxKind::LINK.into());
1056
1057    // Opening [
1058    builder.start_node(SyntaxKind::LINK_START.into());
1059    builder.token(SyntaxKind::LINK_START.into(), "[");
1060    builder.finish_node();
1061
1062    // Link text (recursively parse inline elements). Pandoc-native:
1063    // links cannot contain other links, so suppress inner LINK / ref-link
1064    // recognition during the recursion. Images, emphasis, code, etc. are
1065    // still recognised.
1066    builder.start_node(SyntaxKind::LINK_TEXT.into());
1067    parse_inline_text(builder, link_text, config, true, suppress_footnote_refs);
1068    builder.finish_node();
1069
1070    // Closing ] and reference label
1071    builder.token(SyntaxKind::TEXT.into(), "]");
1072
1073    if !is_shortcut {
1074        // Explicit or implicit reference: [text][label] or [text][]
1075        emit_reference_link_gap(builder, gap);
1076        builder.token(SyntaxKind::TEXT.into(), "[");
1077        builder.start_node(SyntaxKind::LINK_REF.into());
1078        // For implicit references, label is empty and we emit [text][]
1079        // For explicit references, emit the label to get [text][label]
1080        if !label.is_empty() {
1081            builder.token(SyntaxKind::TEXT.into(), label);
1082        }
1083        builder.finish_node();
1084        builder.token(SyntaxKind::TEXT.into(), "]");
1085    }
1086    // For shortcut references, just [text] - no second bracket pair
1087
1088    builder.finish_node();
1089}
1090
1091/// Emit the whitespace gap between `]` and `[` of a spaced reference link,
1092/// preserving exact bytes by splitting into WHITESPACE / NEWLINE tokens.
1093fn emit_reference_link_gap(builder: &mut GreenNodeBuilder, gap: &str) {
1094    if gap.is_empty() {
1095        return;
1096    }
1097    let bytes = gap.as_bytes();
1098    let mut start = 0;
1099    while start < bytes.len() {
1100        match bytes[start] {
1101            b'\r' => {
1102                let end = if start + 1 < bytes.len() && bytes[start + 1] == b'\n' {
1103                    start + 2
1104                } else {
1105                    start + 1
1106                };
1107                builder.token(SyntaxKind::NEWLINE.into(), &gap[start..end]);
1108                start = end;
1109            }
1110            b'\n' => {
1111                builder.token(SyntaxKind::NEWLINE.into(), &gap[start..start + 1]);
1112                start += 1;
1113            }
1114            _ => {
1115                let mut end = start + 1;
1116                while end < bytes.len() && !matches!(bytes[end], b'\r' | b'\n') {
1117                    end += 1;
1118                }
1119                builder.token(SyntaxKind::WHITESPACE.into(), &gap[start..end]);
1120                start = end;
1121            }
1122        }
1123    }
1124}
1125
/// Try to parse a reference-style image at the start of `text`:
/// `![alt][ref]`, `![alt][]`, or (when `allow_shortcut` is set) `![alt]`.
///
/// Returns `Some((total_len, alt_text, label, gap, is_shortcut))`:
/// - `total_len` is the number of bytes of `text` covered by the match.
/// - `alt_text` is the content between `![` and its matching `]`; nested
///   brackets are balanced and a backslash skips the byte after it.
/// - `label` is the content of the second bracket pair, verbatim. For the
///   implicit form `![alt][]` and for shortcuts it is a copy of `alt_text`.
/// - `gap` is the whitespace between `]` and `[` consumed under
///   `spaced_reference_links` (`allow_spaced`); empty otherwise, and always
///   empty for shortcuts.
/// - `is_shortcut` is `true` only for the `![alt]` form.
///
/// A shortcut always ends right after the alt-text `]`: whitespace that was
/// tentatively scanned while looking for a spaced label belongs to the
/// surrounding text, whether or not anything follows it.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
    allow_spaced: bool,
) -> Option<(usize, &str, String, &str, bool)> {
    let bytes = text.as_bytes();
    if bytes.len() < 4 || bytes[0] != b'!' || bytes[1] != b'[' {
        return None;
    }

    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1;

    // Find the end of the alt text (allowing nested brackets).
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped byte
            _ => {}
        }
        pos += 1;
    }

    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` now sits just past the closing `]` of the alt text.
    let alt_text = &text[alt_start..pos - 1];
    let after_alt_close = pos;

    // Pandoc `spaced_reference_links` applies to reference images too: allow
    // whitespace (space, tab, single LF) between `]` and `[`.
    if allow_spaced {
        let mut saw_newline = false;
        while pos < bytes.len() {
            match bytes[pos] {
                b' ' | b'\t' => pos += 1,
                b'\n' if !saw_newline => {
                    saw_newline = true;
                    pos += 1;
                }
                _ => break,
            }
        }
    }
    let gap = &text[after_alt_close..pos];

    // Explicit `![alt][label]` or implicit `![alt][]`.
    if pos < bytes.len() && bytes[pos] == b'[' {
        let label_start = pos + 1;

        // The label ends at the first `]`; a CR/LF before it (or running out
        // of input) means this is not a reference image.
        let label_len = bytes[label_start..]
            .iter()
            .position(|&byte| matches!(byte, b']' | b'\n' | b'\r'))?;
        let label_end = label_start + label_len;
        if bytes[label_end] != b']' {
            return None;
        }

        let label_text = &text[label_start..label_end];

        // Keep the original label text (case preserved). An empty label is
        // the implicit form, reported with the alt text as its label.
        let label = if label_text.is_empty() {
            alt_text.to_string()
        } else {
            label_text.to_string()
        };

        return Some((label_end + 1, alt_text, label, gap, false));
    }

    // Shortcut `![alt]` (only if enabled).
    if !allow_shortcut {
        return None;
    }

    // `![alt](` is the inline-image shape, never a shortcut reference.
    if bytes.get(after_alt_close) == Some(&b'(') {
        return None;
    }

    // Report the shortcut at its strict length. This also covers input that
    // ends inside the tentatively consumed gap (e.g. `![alt] `), which used
    // to be rejected even though `![alt] x` was accepted.
    Some((after_alt_close, alt_text, alt_text.to_string(), "", true))
}
1231
1232/// Emit a reference image node with registry lookup. `gap` carries whitespace
1233/// consumed between `]` and `[` under `spaced_reference_links`; empty otherwise.
1234pub fn emit_reference_image(
1235    builder: &mut GreenNodeBuilder,
1236    alt_text: &str,
1237    label: &str,
1238    gap: &str,
1239    is_shortcut: bool,
1240    config: &ParserOptions,
1241    suppress_footnote_refs: bool,
1242) {
1243    builder.start_node(SyntaxKind::IMAGE_LINK.into());
1244
1245    // Emit as reference image (preserve original syntax)
1246    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1247    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1248    builder.finish_node();
1249
1250    // Alt text (recursively parse inline elements)
1251    builder.start_node(SyntaxKind::IMAGE_ALT.into());
1252    parse_inline_text(builder, alt_text, config, false, suppress_footnote_refs);
1253    builder.finish_node();
1254
1255    // Closing ] and reference label
1256    builder.token(SyntaxKind::TEXT.into(), "]");
1257
1258    if !is_shortcut {
1259        // Explicit or implicit reference: ![alt][label] or ![alt][]
1260        emit_reference_link_gap(builder, gap);
1261        builder.token(SyntaxKind::TEXT.into(), "[");
1262        builder.start_node(SyntaxKind::LINK_REF.into());
1263        // For implicit references, emit empty label (label == alt means implicit from parser)
1264        if label != alt_text {
1265            builder.token(SyntaxKind::TEXT.into(), label);
1266        }
1267        builder.finish_node();
1268        builder.token(SyntaxKind::TEXT.into(), "]");
1269    }
1270    // For shortcut references, just ![alt] - no second bracket pair
1271
1272    builder.finish_node();
1273}
1274
1275/// Emit an `UNRESOLVED_REFERENCE` node for a Pandoc bracket-shape
1276/// pattern whose label didn't resolve. The wrapper covers the original
1277/// bracket bytes; the inner text recurses through normal inline
1278/// parsing (with inner-link suppression so a stray inner inline link
1279/// doesn't reorder semantics relative to pandoc-native).
1280///
1281/// `source` is `text[start..end]` — the full bracket-shape pattern.
1282/// `text_content` is the inner text between the outer `[` and `]`
1283/// (the bytes used for inline recursion). `label_suffix` carries the
1284/// `[label]` / `[]` suffix bytes verbatim, or `None` for shortcut form.
1285pub fn emit_unresolved_reference(
1286    builder: &mut GreenNodeBuilder,
1287    is_image: bool,
1288    text_content: &str,
1289    label_suffix: Option<&str>,
1290    config: &ParserOptions,
1291    suppress_footnote_refs: bool,
1292) {
1293    builder.start_node(SyntaxKind::UNRESOLVED_REFERENCE.into());
1294
1295    if is_image {
1296        builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
1297        builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
1298        builder.finish_node();
1299        builder.start_node(SyntaxKind::IMAGE_ALT.into());
1300        parse_inline_text(builder, text_content, config, false, suppress_footnote_refs);
1301        builder.finish_node();
1302    } else {
1303        builder.start_node(SyntaxKind::LINK_START.into());
1304        builder.token(SyntaxKind::LINK_START.into(), "[");
1305        builder.finish_node();
1306        builder.start_node(SyntaxKind::LINK_TEXT.into());
1307        parse_inline_text(builder, text_content, config, true, suppress_footnote_refs);
1308        builder.finish_node();
1309    }
1310
1311    builder.token(SyntaxKind::TEXT.into(), "]");
1312
1313    if let Some(suffix) = label_suffix {
1314        // suffix is either "[label]" or "[]"; preserve original bytes.
1315        // Split as `[` + LINK_REF(label) + `]` so wrapper accessors find
1316        // the label via `support::child::<LinkRef>()`.
1317        debug_assert!(suffix.starts_with('[') && suffix.ends_with(']'));
1318        builder.token(SyntaxKind::TEXT.into(), "[");
1319        let label = &suffix[1..suffix.len() - 1];
1320        builder.start_node(SyntaxKind::LINK_REF.into());
1321        if !label.is_empty() {
1322            builder.token(SyntaxKind::TEXT.into(), label);
1323        }
1324        builder.finish_node();
1325        builder.token(SyntaxKind::TEXT.into(), "]");
1326    }
1327
1328    builder.finish_node();
1329}
1330
1331#[cfg(test)]
1332mod tests {
1333    use super::*;
1334
1335    #[test]
1336    fn test_parse_autolink_url() {
1337        let input = "<https://example.com>";
1338        assert_eq!(
1339            try_parse_autolink(input, false),
1340            Some((21, "https://example.com"))
1341        );
1342        assert_eq!(
1343            try_parse_autolink(input, true),
1344            Some((21, "https://example.com"))
1345        );
1346    }
1347
1348    #[test]
1349    fn test_parse_autolink_email() {
1350        let input = "<user@example.com>";
1351        assert_eq!(
1352            try_parse_autolink(input, false),
1353            Some((18, "user@example.com"))
1354        );
1355        assert_eq!(
1356            try_parse_autolink(input, true),
1357            Some((18, "user@example.com"))
1358        );
1359    }
1360
1361    #[test]
1362    fn test_parse_autolink_no_close() {
1363        let input = "<https://example.com";
1364        assert_eq!(try_parse_autolink(input, false), None);
1365        assert_eq!(try_parse_autolink(input, true), None);
1366    }
1367
1368    #[test]
1369    fn test_parse_autolink_with_space() {
1370        let input = "<https://example.com >";
1371        assert_eq!(try_parse_autolink(input, false), None);
1372        assert_eq!(try_parse_autolink(input, true), None);
1373    }
1374
1375    #[test]
1376    fn test_parse_autolink_not_url_or_email() {
1377        let input = "<notaurl>";
1378        assert_eq!(try_parse_autolink(input, false), None);
1379        assert_eq!(try_parse_autolink(input, true), None);
1380    }
1381
1382    #[test]
1383    fn test_parse_autolink_commonmark_strict_scheme() {
1384        // Scheme too short (1 char) — invalid under CommonMark, lax-accepted
1385        // under Pandoc dialect (matches historical behavior).
1386        let input = "<m:abc>";
1387        assert_eq!(try_parse_autolink(input, true), None);
1388        assert_eq!(try_parse_autolink(input, false), Some((7, "m:abc")));
1389    }
1390
1391    #[test]
1392    fn test_parse_autolink_commonmark_email_disallows_backslash() {
1393        let input = "<foo\\+@bar.example.com>";
1394        assert_eq!(try_parse_autolink(input, true), None);
1395        assert_eq!(
1396            try_parse_autolink(input, false),
1397            Some((23, "foo\\+@bar.example.com"))
1398        );
1399    }
1400
1401    #[test]
1402    fn test_parse_inline_link_simple() {
1403        let input = "[text](url)";
1404        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1405        assert_eq!(result, Some((11, "text", "url", None)));
1406    }
1407
1408    #[test]
1409    fn test_parse_inline_link_with_title() {
1410        let input = r#"[text](url "title")"#;
1411        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1412        assert_eq!(result, Some((19, "text", r#"url "title""#, None)));
1413    }
1414
1415    #[test]
1416    fn test_parse_inline_link_with_nested_brackets() {
1417        let input = "[outer [inner] text](url)";
1418        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1419        assert_eq!(result, Some((25, "outer [inner] text", "url", None)));
1420    }
1421
1422    #[test]
1423    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
1424        let input = "[text] (url)";
1425        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1426        assert_eq!(result, None);
1427    }
1428
1429    #[test]
1430    fn test_parse_inline_link_no_closing_bracket() {
1431        let input = "[text(url)";
1432        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1433        assert_eq!(result, None);
1434    }
1435
1436    #[test]
1437    fn test_parse_inline_link_no_closing_paren() {
1438        let input = "[text](url";
1439        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1440        assert_eq!(result, None);
1441    }
1442
1443    #[test]
1444    fn test_parse_inline_link_escaped_bracket() {
1445        let input = r"[text\]more](url)";
1446        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1447        assert_eq!(result, Some((17, r"text\]more", "url", None)));
1448    }
1449
1450    #[test]
1451    fn test_parse_inline_link_parens_in_url() {
1452        let input = "[text](url(with)parens)";
1453        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1454        assert_eq!(result, Some((23, "text", "url(with)parens", None)));
1455    }
1456
1457    #[test]
1458    fn test_parse_inline_image_simple() {
1459        let input = "![alt](image.jpg)";
1460        let result = try_parse_inline_image(input, LinkScanContext::default());
1461        assert_eq!(result, Some((17, "alt", "image.jpg", None)));
1462    }
1463
1464    #[test]
1465    fn test_parse_inline_image_with_title() {
1466        let input = r#"![alt](image.jpg "A title")"#;
1467        let result = try_parse_inline_image(input, LinkScanContext::default());
1468        assert_eq!(result, Some((27, "alt", r#"image.jpg "A title""#, None)));
1469    }
1470
1471    #[test]
1472    fn test_parse_inline_image_with_nested_brackets() {
1473        let input = "![outer [inner] alt](image.jpg)";
1474        let result = try_parse_inline_image(input, LinkScanContext::default());
1475        assert_eq!(result, Some((31, "outer [inner] alt", "image.jpg", None)));
1476    }
1477
1478    #[test]
1479    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
1480        let input = r"a:\]";
1481        let result = try_parse_bare_uri(input);
1482        assert_eq!(result, None);
1483    }
1484
1485    #[test]
1486    fn test_parse_bare_uri_rejects_unknown_scheme() {
1487        assert_eq!(try_parse_bare_uri("Note:**"), None);
1488        assert_eq!(try_parse_bare_uri("Note:foo"), None);
1489        assert_eq!(try_parse_bare_uri("foo:bar"), None);
1490    }
1491
1492    #[test]
1493    fn test_parse_bare_uri_accepts_known_schemes() {
1494        assert_eq!(
1495            try_parse_bare_uri("http://example.com"),
1496            Some((18, "http://example.com"))
1497        );
1498        assert_eq!(
1499            try_parse_bare_uri("HTTPS://EXAMPLE.COM"),
1500            Some((19, "HTTPS://EXAMPLE.COM"))
1501        );
1502        assert_eq!(
1503            try_parse_bare_uri("mailto:a@b.com"),
1504            Some((14, "mailto:a@b.com"))
1505        );
1506        assert_eq!(try_parse_bare_uri("doi:10.1/x"), Some((10, "doi:10.1/x")));
1507    }
1508
1509    #[test]
1510    fn bare_uri_scheme_table_is_well_formed() {
1511        assert!(
1512            BARE_URI_SCHEMES.len() > 300,
1513            "only {} schemes",
1514            BARE_URI_SCHEMES.len()
1515        );
1516        assert!(BARE_URI_SCHEMES.windows(2).all(|w| w[0] < w[1]));
1517        for known in ["http", "https", "mailto", "ftp", "mongodb", "shttp"] {
1518            assert!(is_known_bare_uri_scheme(known), "missing scheme {known}");
1519        }
1520        for extra in ["doi", "gemini", "isbn", "pmid"] {
1521            assert!(is_known_bare_uri_scheme(extra), "missing scheme {extra}");
1522        }
1523        assert!(!is_known_bare_uri_scheme("note"));
1524    }
1525
1526    #[test]
1527    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
1528        let input = "![alt] (image.jpg)";
1529        let result = try_parse_inline_image(input, LinkScanContext::default());
1530        assert_eq!(result, None);
1531    }
1532
1533    #[test]
1534    fn test_parse_inline_image_no_closing_bracket() {
1535        let input = "![alt(image.jpg)";
1536        let result = try_parse_inline_image(input, LinkScanContext::default());
1537        assert_eq!(result, None);
1538    }
1539
1540    #[test]
1541    fn test_parse_inline_image_no_closing_paren() {
1542        let input = "![alt](image.jpg";
1543        let result = try_parse_inline_image(input, LinkScanContext::default());
1544        assert_eq!(result, None);
1545    }
1546
1547    #[test]
1548    fn test_parse_inline_image_with_simple_class() {
1549        let input = "![alt](img.png){.large}";
1550        let result = try_parse_inline_image(input, LinkScanContext::default());
1551        let (len, alt, dest, attrs) = result.unwrap();
1552        assert_eq!(len, 23);
1553        assert_eq!(alt, "alt");
1554        assert_eq!(dest, "img.png");
1555        assert!(attrs.is_some());
1556        let attrs = attrs.unwrap();
1557        assert_eq!(attrs, "{.large}");
1558    }
1559
1560    #[test]
1561    fn test_parse_inline_image_with_id() {
1562        let input = "![Figure 1](fig1.png){#fig-1}";
1563        let result = try_parse_inline_image(input, LinkScanContext::default());
1564        let (len, alt, dest, attrs) = result.unwrap();
1565        assert_eq!(len, 29);
1566        assert_eq!(alt, "Figure 1");
1567        assert_eq!(dest, "fig1.png");
1568        assert!(attrs.is_some());
1569        let attrs = attrs.unwrap();
1570        assert_eq!(attrs, "{#fig-1}");
1571    }
1572
1573    #[test]
1574    fn test_parse_inline_image_with_full_attributes() {
1575        let input = "![alt](img.png){#fig .large width=\"80%\"}";
1576        let result = try_parse_inline_image(input, LinkScanContext::default());
1577        let (len, alt, dest, attrs) = result.unwrap();
1578        assert_eq!(len, 40);
1579        assert_eq!(alt, "alt");
1580        assert_eq!(dest, "img.png");
1581        assert!(attrs.is_some());
1582        let attrs = attrs.unwrap();
1583        assert_eq!(attrs, "{#fig .large width=\"80%\"}");
1584    }
1585
1586    #[test]
1587    fn test_parse_inline_image_attributes_must_be_adjacent() {
1588        // Space between ) and { should not parse as attributes
1589        let input = "![alt](img.png) {.large}";
1590        let result = try_parse_inline_image(input, LinkScanContext::default());
1591        assert_eq!(result, Some((15, "alt", "img.png", None)));
1592    }
1593
1594    // Link attribute tests
1595    #[test]
1596    fn test_parse_inline_link_with_id() {
1597        let input = "[text](url){#link-1}";
1598        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1599        let (len, text, dest, attrs) = result.unwrap();
1600        assert_eq!(len, 20);
1601        assert_eq!(text, "text");
1602        assert_eq!(dest, "url");
1603        assert!(attrs.is_some());
1604        let attrs = attrs.unwrap();
1605        assert_eq!(attrs, "{#link-1}");
1606    }
1607
1608    #[test]
1609    fn test_parse_inline_link_with_full_attributes() {
1610        let input = "[text](url){#link .external target=\"_blank\"}";
1611        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1612        let (len, text, dest, attrs) = result.unwrap();
1613        assert_eq!(len, 44);
1614        assert_eq!(text, "text");
1615        assert_eq!(dest, "url");
1616        assert!(attrs.is_some());
1617        let attrs = attrs.unwrap();
1618        assert_eq!(attrs, "{#link .external target=\"_blank\"}");
1619    }
1620
1621    #[test]
1622    fn test_parse_inline_link_attributes_must_be_adjacent() {
1623        // Space between ) and { should not parse as attributes
1624        let input = "[text](url) {.class}";
1625        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1626        assert_eq!(result, Some((11, "text", "url", None)));
1627    }
1628
1629    #[test]
1630    fn test_parse_inline_link_with_title_and_attributes() {
1631        let input = r#"[text](url "title"){.external}"#;
1632        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1633        let (len, text, dest, attrs) = result.unwrap();
1634        assert_eq!(len, 30);
1635        assert_eq!(text, "text");
1636        assert_eq!(dest, r#"url "title""#);
1637        assert!(attrs.is_some());
1638        let attrs = attrs.unwrap();
1639        assert_eq!(attrs, "{.external}");
1640    }
1641
1642    // Reference link tests
1643    #[test]
1644    fn test_parse_reference_link_explicit() {
1645        let input = "[link text][label]";
1646        let result =
1647            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1648        assert_eq!(
1649            result,
1650            Some((18, "link text", "label".to_string(), "", false))
1651        );
1652    }
1653
1654    #[test]
1655    fn test_parse_reference_link_implicit() {
1656        let input = "[link text][]";
1657        let result =
1658            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1659        assert_eq!(result, Some((13, "link text", String::new(), "", false)));
1660    }
1661
1662    #[test]
1663    fn test_parse_reference_link_explicit_same_label_as_text() {
1664        let input = "[stack][stack]";
1665        let result =
1666            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1667        assert_eq!(result, Some((14, "stack", "stack".to_string(), "", false)));
1668    }
1669
1670    #[test]
1671    fn test_parse_reference_link_shortcut() {
1672        let input = "[link text] rest";
1673        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1674        assert_eq!(
1675            result,
1676            Some((11, "link text", "link text".to_string(), "", true))
1677        );
1678    }
1679
1680    #[test]
1681    fn test_parse_reference_link_shortcut_rejects_empty_label() {
1682        let input = "[] rest";
1683        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1684        assert_eq!(result, None);
1685    }
1686
1687    #[test]
1688    fn test_parse_reference_link_shortcut_disabled() {
1689        let input = "[link text] rest";
1690        let result =
1691            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1692        assert_eq!(result, None);
1693    }
1694
1695    #[test]
1696    fn test_parse_reference_link_not_inline_link() {
1697        // With shortcut disabled, `[text](url)` is rejected so the inline
1698        // link form upstream gets exclusive ownership.
1699        let input = "[text](url)";
1700        let result =
1701            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1702        assert_eq!(result, None);
1703    }
1704
1705    #[test]
1706    fn test_parse_reference_link_shortcut_falls_through_inline_link() {
1707        // CommonMark spec example #568: when an inline-link attempt would
1708        // fail (here we model the reachability — the caller tries inline
1709        // link first; if that returns None, we should still see `[text]`
1710        // as a shortcut and leave `(url)` to be parsed as following text).
1711        let input = "[text](url)";
1712        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1713        assert_eq!(result, Some((6, "text", "text".to_string(), "", true)));
1714    }
1715
1716    #[test]
1717    fn test_parse_reference_link_with_nested_brackets() {
1718        let input = "[outer [inner] text][ref]";
1719        let result =
1720            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1721        assert_eq!(
1722            result,
1723            Some((25, "outer [inner] text", "ref".to_string(), "", false))
1724        );
1725    }
1726
1727    #[test]
1728    fn test_parse_reference_link_label_no_newline() {
1729        let input = "[text][label\nmore]";
1730        let result =
1731            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1732        assert_eq!(result, None);
1733    }
1734
1735    #[test]
1736    fn test_parse_reference_link_spaced_disabled() {
1737        // Without `spaced_reference_links`, a space between brackets blocks the
1738        // explicit form; shortcut takes over so `[foo]` matches at length 5.
1739        let input = "[foo] [bar]";
1740        let result = try_parse_reference_link(input, true, true, false, LinkScanContext::default());
1741        assert_eq!(result, Some((5, "foo", "foo".to_string(), "", true)));
1742    }
1743
1744    #[test]
1745    fn test_parse_reference_link_spaced_single_space() {
1746        let input = "[foo] [bar]";
1747        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1748        assert_eq!(result, Some((11, "foo", "bar".to_string(), " ", false)));
1749    }
1750
1751    #[test]
1752    fn test_parse_reference_link_spaced_multiple_spaces_and_tab() {
1753        let input = "[foo]  \t[bar]";
1754        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1755        assert_eq!(result, Some((13, "foo", "bar".to_string(), "  \t", false)));
1756    }
1757
1758    #[test]
1759    fn test_parse_reference_link_spaced_newline() {
1760        let input = "[foo]\n[bar]";
1761        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1762        assert_eq!(result, Some((11, "foo", "bar".to_string(), "\n", false)));
1763    }
1764
1765    #[test]
1766    fn test_parse_reference_link_spaced_implicit() {
1767        // Pandoc: with the extension, `[foo] []` resolves to implicit `[foo][]`.
1768        let input = "[foo] []";
1769        let result = try_parse_reference_link(input, true, true, true, LinkScanContext::default());
1770        assert_eq!(result, Some((8, "foo", String::new(), " ", false)));
1771    }
1772
1773    // Reference image tests
1774    #[test]
1775    fn test_parse_reference_image_explicit() {
1776        let input = "![alt text][label]";
1777        let result = try_parse_reference_image(input, false, false);
1778        assert_eq!(
1779            result,
1780            Some((18, "alt text", "label".to_string(), "", false))
1781        );
1782    }
1783
1784    #[test]
1785    fn test_parse_reference_image_implicit() {
1786        let input = "![alt text][]";
1787        let result = try_parse_reference_image(input, false, false);
1788        assert_eq!(
1789            result,
1790            Some((13, "alt text", "alt text".to_string(), "", false))
1791        );
1792    }
1793
1794    #[test]
1795    fn test_parse_reference_image_shortcut() {
1796        let input = "![alt text] rest";
1797        let result = try_parse_reference_image(input, true, false);
1798        assert_eq!(
1799            result,
1800            Some((11, "alt text", "alt text".to_string(), "", true))
1801        );
1802    }
1803
1804    #[test]
1805    fn test_parse_reference_image_shortcut_disabled() {
1806        let input = "![alt text] rest";
1807        let result = try_parse_reference_image(input, false, false);
1808        assert_eq!(result, None);
1809    }
1810
1811    #[test]
1812    fn test_parse_reference_image_not_inline() {
1813        // Should not match inline images with (url)
1814        let input = "![alt](url)";
1815        let result = try_parse_reference_image(input, true, false);
1816        assert_eq!(result, None);
1817    }
1818
1819    #[test]
1820    fn test_parse_reference_image_with_nested_brackets() {
1821        let input = "![alt [nested] text][ref]";
1822        let result = try_parse_reference_image(input, false, false);
1823        assert_eq!(
1824            result,
1825            Some((25, "alt [nested] text", "ref".to_string(), "", false))
1826        );
1827    }
1828
1829    #[test]
1830    fn test_parse_reference_image_spaced() {
1831        let input = "![alt] [ref]";
1832        let result = try_parse_reference_image(input, true, true);
1833        assert_eq!(result, Some((12, "alt", "ref".to_string(), " ", false)));
1834    }
1835
1836    #[test]
1837    fn test_reference_link_label_with_crlf() {
1838        // Reference link labels should not span lines with CRLF
1839        let input = "[foo\r\nbar]";
1840        let result =
1841            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1842
1843        // Should fail to parse because label contains line break
1844        assert_eq!(
1845            result, None,
1846            "Should not parse reference link with CRLF in label"
1847        );
1848    }
1849
1850    #[test]
1851    fn test_reference_link_label_with_lf() {
1852        // Reference link labels should not span lines with LF either
1853        let input = "[foo\nbar]";
1854        let result =
1855            try_parse_reference_link(input, false, true, false, LinkScanContext::default());
1856
1857        // Should fail to parse because label contains line break
1858        assert_eq!(
1859            result, None,
1860            "Should not parse reference link with LF in label"
1861        );
1862    }
1863
1864    // Multiline link text tests
1865    #[test]
1866    fn test_parse_inline_link_multiline_text() {
1867        // Per Pandoc spec, link text CAN contain newlines (soft breaks)
1868        let input = "[text on\nline two](url)";
1869        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1870        assert_eq!(
1871            result,
1872            Some((23, "text on\nline two", "url", None)),
1873            "Link text should allow newlines"
1874        );
1875    }
1876
1877    #[test]
1878    fn test_parse_inline_link_multiline_with_formatting() {
1879        // Link text with newlines and other inline elements
1880        let input =
1881            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
1882        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1883        assert!(result.is_some(), "Link text with newlines should parse");
1884        let (len, text, _dest, _attrs) = result.unwrap();
1885        assert!(text.contains('\n'), "Link text should preserve newline");
1886        assert_eq!(len, input.len());
1887    }
1888
1889    #[test]
1890    fn test_parse_inline_image_multiline_alt() {
1891        // Per Pandoc spec, image alt text CAN contain newlines
1892        let input = "![alt on\nline two](img.png)";
1893        let result = try_parse_inline_image(input, LinkScanContext::default());
1894        assert_eq!(
1895            result,
1896            Some((27, "alt on\nline two", "img.png", None)),
1897            "Image alt text should allow newlines"
1898        );
1899    }
1900
1901    #[test]
1902    fn test_parse_inline_image_multiline_with_attributes() {
1903        // Image with multiline alt text and attributes
1904        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
1905        let result = try_parse_inline_image(input, LinkScanContext::default());
1906        assert!(
1907            result.is_some(),
1908            "Image alt with newlines and attributes should parse"
1909        );
1910        let (len, alt, dest, attrs) = result.unwrap();
1911        assert!(alt.contains('\n'), "Alt text should preserve newline");
1912        assert_eq!(dest, "../images/fig.png");
1913        assert_eq!(attrs, Some("{width=70%}"));
1914        assert_eq!(len, input.len());
1915    }
1916
1917    #[test]
1918    fn test_parse_inline_link_with_attributes_after_newline() {
1919        // Test for regression: when text is concatenated with newlines,
1920        // attributes after ) should still be recognized
1921        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
1922        let result = try_parse_inline_link(input, false, LinkScanContext::default());
1923        assert!(
1924            result.is_some(),
1925            "Link with attributes should parse even with following text"
1926        );
1927        let (len, text, dest, attrs) = result.unwrap();
1928        assert_eq!(text, "A network graph.");
1929        assert_eq!(dest, "../images/networkfig.png");
1930        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
1931        assert_eq!(
1932            len, 55,
1933            "Length should include attributes (up to closing brace)"
1934        );
1935    }
1936}
panache_parser/parser/inlines/links.rs

panache_parser/parser/inlines/
links.rs