Skip to main content

panache_parser/parser/inlines/
links.rs

1//! Parsing for links, images, and automatic links.
2//!
3//! Implements:
4//! - Automatic links: `<http://example.com>` and `<user@example.com>`
5//! - Inline links: `[text](url)` and `[text](url "title")`
6//! - Link attributes: `[text](url){#id .class key=value}`
7//! - Inline images: `![alt](url)` and `![alt](url "title")`
8//! - Image attributes: `![alt](url){#id .class key=value}`
9//! - Reference links: `[text][ref]`, `[text][]`, `[text]`
10//! - Reference images: `![alt][ref]`, `![alt][]`, `![alt]`
11
12use super::core::parse_inline_text;
13use crate::options::ParserOptions;
14use crate::syntax::SyntaxKind;
15use rowan::GreenNodeBuilder;
16
17// Import attribute parsing
18use crate::parser::utils::attributes::try_parse_trailing_attributes;
19
20/// Try to parse an inline image starting at the current position.
21///
22/// Inline images have the form `![alt](url)` or `![alt](url "title")`.
23/// Can also have trailing attributes: `![alt](url){#id .class}`.
24/// Returns Some((length, alt_text, dest_content, raw_attributes)) if a valid image is found.
25pub fn try_parse_inline_image(text: &str) -> Option<(usize, &str, &str, Option<&str>)> {
26    if !text.starts_with("![") {
27        return None;
28    }
29
30    // Find the closing ]
31    let mut bracket_depth = 0;
32    let mut escape_next = false;
33    let mut close_bracket_pos = None;
34
35    for (i, ch) in text[2..].char_indices() {
36        if escape_next {
37            escape_next = false;
38            continue;
39        }
40
41        match ch {
42            '\\' => escape_next = true,
43            '[' => bracket_depth += 1,
44            ']' => {
45                if bracket_depth == 0 {
46                    close_bracket_pos = Some(i + 2);
47                    break;
48                }
49                bracket_depth -= 1;
50            }
51            _ => {}
52        }
53    }
54
55    let close_bracket = close_bracket_pos?;
56    let alt_text = &text[2..close_bracket];
57
58    // Check for immediate ( after ]
59    let after_bracket = close_bracket + 1;
60    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
61        return None;
62    }
63
64    // Find closing ) for destination (reuse same logic as links)
65    let dest_start = after_bracket + 1;
66    let remaining = &text[dest_start..];
67
68    let mut paren_depth = 0;
69    let mut escape_next = false;
70    let mut in_quotes = false;
71    let mut close_paren_pos = None;
72
73    for (i, ch) in remaining.char_indices() {
74        if escape_next {
75            escape_next = false;
76            continue;
77        }
78
79        match ch {
80            '\\' => escape_next = true,
81            '"' => in_quotes = !in_quotes,
82            '(' if !in_quotes => paren_depth += 1,
83            ')' if !in_quotes => {
84                if paren_depth == 0 {
85                    close_paren_pos = Some(i);
86                    break;
87                }
88                paren_depth -= 1;
89            }
90            _ => {}
91        }
92    }
93
94    let close_paren = close_paren_pos?;
95    let dest_content = &remaining[..close_paren];
96
97    // Check for trailing attributes {#id .class key=value}
98    let after_paren = dest_start + close_paren + 1;
99    let after_close = &text[after_paren..];
100
101    // Attributes must start immediately after closing paren (no whitespace/newlines)
102    if after_close.starts_with('{') {
103        // Find the closing brace
104        if let Some(close_brace_pos) = after_close.find('}') {
105            let attr_text = &after_close[..=close_brace_pos];
106            // Try to parse as attributes to validate
107            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
108                let total_len = after_paren + close_brace_pos + 1;
109                // Return raw attribute string for lossless parsing
110                let raw_attrs = attr_text;
111                return Some((total_len, alt_text, dest_content, Some(raw_attrs)));
112            }
113        }
114    }
115
116    // No attributes, just return the image
117    let total_len = after_paren;
118    Some((total_len, alt_text, dest_content, None))
119}
120
121/// Emit an inline image node to the builder.
122/// Note: alt_text may contain inline elements and should be parsed recursively.
123pub fn emit_inline_image(
124    builder: &mut GreenNodeBuilder,
125    _text: &str,
126    alt_text: &str,
127    dest: &str,
128    raw_attributes: Option<&str>,
129    config: &ParserOptions,
130) {
131    builder.start_node(SyntaxKind::IMAGE_LINK.into());
132
133    // Opening ![
134    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
135    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
136    builder.finish_node();
137
138    // Alt text (recursively parse inline elements)
139    builder.start_node(SyntaxKind::IMAGE_ALT.into());
140    // Use the standalone parse_inline_text function for recursive parsing
141    // Note: nested contexts don't resolve references
142    parse_inline_text(builder, alt_text, config, false);
143    builder.finish_node();
144
145    // Closing ]
146    builder.token(SyntaxKind::IMAGE_ALT_END.into(), "]");
147
148    // Opening (
149    builder.token(SyntaxKind::IMAGE_DEST_START.into(), "(");
150
151    // Destination
152    builder.start_node(SyntaxKind::LINK_DEST.into());
153    builder.token(SyntaxKind::TEXT.into(), dest);
154    builder.finish_node();
155
156    // Closing )
157    builder.token(SyntaxKind::IMAGE_DEST_END.into(), ")");
158
159    // Emit raw attributes if present (preserve original formatting)
160    if let Some(raw_attrs) = raw_attributes {
161        builder.start_node(SyntaxKind::ATTRIBUTE.into());
162        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
163        builder.finish_node();
164    }
165
166    builder.finish_node();
167}
168
169/// Try to parse an automatic link starting at the current position.
170///
171/// Automatic links have the form `<url>` (URI autolink) or `<email>`
172/// (email autolink) per CommonMark §6.4. Under `Dialect::CommonMark` the
173/// scheme/email grammar is enforced strictly (e.g. scheme must be 2-32
174/// ASCII chars; email local parts cannot contain backslashes). Pandoc
175/// markdown is laxer — it accepts Unicode in email addresses, for
176/// example — so non-CommonMark callers fall back to the heuristic
177/// "contains `:` or `@`" check that the parser used historically.
178pub fn try_parse_autolink(text: &str, is_commonmark: bool) -> Option<(usize, &str)> {
179    if !text.starts_with('<') {
180        return None;
181    }
182
183    let close_pos = text[1..].find('>')?;
184    let content = &text[1..1 + close_pos];
185
186    if content.is_empty() {
187        return None;
188    }
189    if content.contains(|c: char| c.is_whitespace()) {
190        return None;
191    }
192
193    if is_commonmark {
194        if !is_valid_uri_autolink(content) && !is_valid_email_autolink(content) {
195            return None;
196        }
197    } else if !content.contains(':') && !content.contains('@') {
198        return None;
199    }
200
201    Some((close_pos + 2, content))
202}
203
/// Check `s` against the CommonMark URI-autolink grammar.
///
/// - The scheme is 2-32 bytes long: an ASCII letter followed by ASCII
///   letters, digits, `+`, `.` or `-`.
/// - The scheme is followed by `:`.
/// - The remainder (possibly empty) contains no ASCII control character, no
///   space, no `<` and no `>`. Non-ASCII bytes are allowed.
///
/// The space check is performed here as well, so the function honours its
/// contract even when the caller has not already filtered whitespace.
fn is_valid_uri_autolink(s: &str) -> bool {
    let bytes = s.as_bytes();
    if !matches!(bytes.first(), Some(b) if b.is_ascii_alphabetic()) {
        return false;
    }

    // Length of the scheme: the leading letter plus every following scheme byte.
    let scheme_len = 1 + bytes[1..]
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric() || matches!(**b, b'+' | b'-' | b'.'))
        .count();
    if !(2..=32).contains(&scheme_len) {
        return false;
    }

    // The byte that ended the scheme scan has to be the separating colon.
    if bytes.get(scheme_len) != Some(&b':') {
        return false;
    }

    // `b > 0x20` rejects both the C0 control range and the space character.
    bytes[scheme_len + 1..]
        .iter()
        .all(|&b| b > 0x20 && b != 0x7f && b != b'<' && b != b'>')
}
234
235/// CommonMark §6.4 email autolink, matching the HTML5 non-normative regex:
236/// `^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
237///  (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$`.
238fn is_valid_email_autolink(s: &str) -> bool {
239    let Some(at) = s.find('@') else {
240        return false;
241    };
242    let local = &s[..at];
243    let domain = &s[at + 1..];
244    if local.is_empty() || !local.bytes().all(is_email_local_byte) {
245        return false;
246    }
247    if domain.is_empty() {
248        return false;
249    }
250    domain.split('.').all(is_valid_email_label)
251}
252
/// Report whether `b` may appear in the local part (before `@`) of an email
/// autolink: ASCII letters, ASCII digits, or one of
/// ``.!#$%&'*+/=?^_`{|}~-``.
fn is_email_local_byte(b: u8) -> bool {
    // Punctuation permitted in addition to ASCII alphanumerics.
    const EXTRA: &[u8] = b".!#$%&'*+/=?^_`{|}~-";
    b.is_ascii_alphanumeric() || EXTRA.contains(&b)
}
281
/// Check one dot-separated label of an email autolink's domain.
///
/// A valid label is 1-63 bytes long, starts and ends with an ASCII
/// alphanumeric byte, and contains only ASCII alphanumerics or `-` in
/// between.
///
/// Single-byte labels (as in `x.org`) are handled without slicing an
/// "interior" range: for a one-byte label that range would be `1..0`, which
/// panics.
fn is_valid_email_label(label: &str) -> bool {
    let bytes = label.as_bytes();
    if bytes.len() > 63 {
        return false;
    }
    match (bytes.first(), bytes.last()) {
        (Some(first), Some(last)) => {
            first.is_ascii_alphanumeric()
                && last.is_ascii_alphanumeric()
                // The two ends are already known to be alphanumeric, so
                // checking every byte is equivalent to checking the interior.
                && bytes
                    .iter()
                    .all(|b| b.is_ascii_alphanumeric() || *b == b'-')
        }
        // Empty label.
        _ => false,
    }
}
297
298/// Emit an automatic link node to the builder.
299pub fn emit_autolink(builder: &mut GreenNodeBuilder, _text: &str, url: &str) {
300    builder.start_node(SyntaxKind::AUTO_LINK.into());
301
302    // Opening <
303    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
304    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), "<");
305    builder.finish_node();
306
307    // URL content
308    builder.token(SyntaxKind::TEXT.into(), url);
309
310    // Closing >
311    builder.start_node(SyntaxKind::AUTO_LINK_MARKER.into());
312    builder.token(SyntaxKind::AUTO_LINK_MARKER.into(), ">");
313    builder.finish_node();
314
315    builder.finish_node();
316}
317
/// Recognise a bare (undelimited) URI at the start of `text`.
///
/// The URI consists of:
/// - a scheme: an ASCII letter followed by ASCII letters, digits, `+`, `-`
///   or `.`, terminated by `:`;
/// - a non-empty body running up to the first ASCII whitespace byte or one of
///   `<`, `>`, `` ` ``, `"`, `'`.
///
/// Trailing `.`, `,`, `;`, `:`, `)`, `]` and `}` are then stripped from the
/// body, since they are treated as surrounding punctuation rather than part
/// of the URI.
///
/// Returns `Some((length, uri))` with `uri == &text[..length]`, or `None` when
/// no bare URI starts here, when nothing is left of the body after stripping,
/// or when stripping leaves a dangling backslash.
pub fn try_parse_bare_uri(text: &str) -> Option<(usize, &str)> {
    if !text.chars().next()?.is_ascii_alphabetic() {
        return None;
    }

    // The first character that cannot belong to a scheme must be the colon.
    let scheme_end = text
        .find(|c: char| !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')))?;
    if !text[scheme_end..].starts_with(':') {
        return None;
    }

    let body_start = scheme_end + 1;
    let rest = &text[body_start..];

    // The body stops at ASCII whitespace or at a delimiter that cannot be
    // part of a bare URI. All of these are single ASCII bytes, so the cut is
    // always on a character boundary.
    let body_len = rest
        .bytes()
        .position(|b| b.is_ascii_whitespace() || matches!(b, b'<' | b'>' | b'`' | b'"' | b'\''))
        .unwrap_or(rest.len());
    if body_len == 0 {
        return None;
    }

    // Drop sentence punctuation and closing brackets hanging off the end.
    // Trimming is confined to the body, so the scheme's colon is never eaten.
    let body = rest[..body_len].trim_end_matches(['.', ',', ';', ':', ')', ']', '}']);
    if body.is_empty() {
        return None;
    }

    // If trimming terminal punctuation leaves a dangling backslash, the match
    // came from escaped punctuation (e.g., `a:\]`) and should stay literal.
    if body.ends_with('\\') {
        return None;
    }

    let uri_len = body_start + body.len();
    Some((uri_len, &text[..uri_len]))
}
379
380/// Try to parse an inline link starting at the current position.
381///
382/// Inline links have the form `[text](url)` or `[text](url "title")`.
383/// Can also have trailing attributes: `[text](url){#id .class}`.
384/// Returns Some((length, text_content, dest_content, raw_attributes)) if a valid link is found.
385///
386/// `strict_dest` enables CommonMark §6.4 destination-and-title validation:
387/// the bare destination form may not contain spaces or ASCII control
388/// characters and must have balanced parentheses; if a title follows it
389/// must be properly delimited; only whitespace is allowed before/after.
390/// Pandoc-markdown is more permissive, so leave this off for that dialect.
391pub fn try_parse_inline_link(
392    text: &str,
393    strict_dest: bool,
394) -> Option<(usize, &str, &str, Option<&str>)> {
395    if !text.starts_with('[') {
396        return None;
397    }
398
399    // Find the closing ]
400    let mut bracket_depth = 0;
401    let mut escape_next = false;
402    let mut close_bracket_pos = None;
403
404    for (i, ch) in text[1..].char_indices() {
405        if escape_next {
406            escape_next = false;
407            continue;
408        }
409
410        match ch {
411            '\\' => escape_next = true,
412            '[' => bracket_depth += 1,
413            ']' => {
414                if bracket_depth == 0 {
415                    close_bracket_pos = Some(i + 1);
416                    break;
417                }
418                bracket_depth -= 1;
419            }
420            _ => {}
421        }
422    }
423
424    let close_bracket = close_bracket_pos?;
425    let link_text = &text[1..close_bracket];
426
427    // Check for immediate ( after ]
428    let after_bracket = close_bracket + 1;
429    if text.len() <= after_bracket || !text[after_bracket..].starts_with('(') {
430        return None;
431    }
432
433    // Find closing ) for destination
434    let dest_start = after_bracket + 1;
435    let remaining = &text[dest_start..];
436
437    let mut paren_depth = 0;
438    let mut escape_next = false;
439    let mut in_quotes = false;
440    let mut close_paren_pos = None;
441
442    for (i, ch) in remaining.char_indices() {
443        if escape_next {
444            escape_next = false;
445            continue;
446        }
447
448        match ch {
449            '\\' => escape_next = true,
450            '"' => in_quotes = !in_quotes,
451            '(' if !in_quotes => paren_depth += 1,
452            ')' if !in_quotes => {
453                if paren_depth == 0 {
454                    close_paren_pos = Some(i);
455                    break;
456                }
457                paren_depth -= 1;
458            }
459            _ => {}
460        }
461    }
462
463    let close_paren = close_paren_pos?;
464    let dest_content = &remaining[..close_paren];
465
466    if strict_dest && !dest_and_title_ok_commonmark(dest_content) {
467        return None;
468    }
469
470    // Check for trailing attributes {#id .class key=value}
471    let after_paren = dest_start + close_paren + 1;
472    let after_close = &text[after_paren..];
473
474    // Attributes must start immediately after closing paren (no whitespace/newlines)
475    if after_close.starts_with('{') {
476        // Find the closing brace
477        if let Some(close_brace_pos) = after_close.find('}') {
478            let attr_text = &after_close[..=close_brace_pos];
479            // Try to parse as attributes to validate
480            if let Some((_attrs, _)) = try_parse_trailing_attributes(attr_text) {
481                let total_len = after_paren + close_brace_pos + 1;
482                // Return raw attribute string for lossless parsing
483                let raw_attrs = attr_text;
484                return Some((total_len, link_text, dest_content, Some(raw_attrs)));
485            }
486        }
487    }
488
489    // No attributes, just return the link
490    let total_len = after_paren;
491    Some((total_len, link_text, dest_content, None))
492}
493
/// CommonMark link destination + optional title validation. `content` is
/// whatever the parser captured between `(` and `)`. A valid form is
/// `[ws] destination [ws title [ws]]` where:
/// - a bare destination has no spaces, tabs, newlines or ASCII control
///   characters, and balanced parentheses (escaped parens permitted);
/// - a bracketed destination is `<...>` with no newlines and no unescaped
///   `<` / `>` inside;
/// - the optional title is delimited by `"..."`, `'...'`, or `(...)` and must
///   be separated from the destination by at least one space, tab or newline;
/// - any text outside that structure invalidates the link.
///
/// Blank `content` (empty or whitespace only) is accepted.
fn dest_and_title_ok_commonmark(content: &str) -> bool {
    const WS: [char; 3] = [' ', '\t', '\n'];

    let trimmed = content.trim_start_matches(WS);
    if trimmed.is_empty() {
        return true;
    }

    let after_dest = if let Some(rest) = trimmed.strip_prefix('<') {
        // Bracketed destination: scan for the unescaped closing `>`.
        let mut escape = false;
        let mut end_byte = None;
        for (i, c) in rest.char_indices() {
            if escape {
                escape = false;
                continue;
            }
            match c {
                '\\' => escape = true,
                '\n' | '<' => return false,
                '>' => {
                    end_byte = Some(i);
                    break;
                }
                _ => {}
            }
        }
        match end_byte {
            Some(e) => &rest[e + 1..],
            None => return false,
        }
    } else {
        // Bare destination: runs until whitespace or an unbalanced `)`.
        let mut escape = false;
        let mut depth: i32 = 0;
        let mut end = trimmed.len();
        for (i, c) in trimmed.char_indices() {
            if escape {
                escape = false;
                continue;
            }
            match c {
                '\\' => escape = true,
                ' ' | '\t' | '\n' => {
                    end = i;
                    break;
                }
                _ if c.is_ascii_control() => return false,
                '(' => depth += 1,
                ')' => {
                    if depth == 0 {
                        end = i;
                        break;
                    }
                    depth -= 1;
                }
                _ => {}
            }
        }
        if depth != 0 {
            return false;
        }
        if end == 0 {
            // bare destination must be nonempty if the field is non-blank
            return false;
        }
        &trimmed[end..]
    };

    let title = after_dest.trim_start_matches(WS);
    if title.is_empty() {
        return true;
    }
    // Something follows the destination. If no whitespace was skipped, it
    // touches the destination directly (e.g. `<url>"title"`), which the spec
    // does not allow for a title.
    if title.len() == after_dest.len() {
        return false;
    }

    let title_bytes = title.as_bytes();
    let close = match title_bytes[0] {
        b'"' => b'"',
        b'\'' => b'\'',
        b'(' => b')',
        _ => return false,
    };
    let opens_paren = title_bytes[0] == b'(';

    // Find the unescaped closing delimiter; a parenthesised title may not
    // contain another unescaped `(`.
    let mut escape = false;
    let mut title_close_pos = None;
    for (i, &b) in title_bytes.iter().enumerate().skip(1) {
        if escape {
            escape = false;
            continue;
        }
        if b == b'\\' {
            escape = true;
            continue;
        }
        if opens_paren && b == b'(' {
            return false;
        }
        if b == close {
            title_close_pos = Some(i);
            break;
        }
    }
    let Some(close_idx) = title_close_pos else {
        return false;
    };

    // Only whitespace may trail the title.
    title[close_idx + 1..].trim_matches(WS).is_empty()
}
607
608/// Emit an inline link node to the builder.
609/// Note: link_text may contain inline elements and should be parsed recursively.
610pub fn emit_inline_link(
611    builder: &mut GreenNodeBuilder,
612    _text: &str,
613    link_text: &str,
614    dest: &str,
615    raw_attributes: Option<&str>,
616    config: &ParserOptions,
617) {
618    builder.start_node(SyntaxKind::LINK.into());
619
620    // Opening [
621    builder.start_node(SyntaxKind::LINK_START.into());
622    builder.token(SyntaxKind::LINK_START.into(), "[");
623    builder.finish_node();
624
625    // Link text (recursively parse inline elements)
626    builder.start_node(SyntaxKind::LINK_TEXT.into());
627    // Use the standalone parse_inline_text function for recursive parsing
628    parse_inline_text(builder, link_text, config, false);
629    builder.finish_node();
630
631    // Closing ]
632    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
633
634    // Opening (
635    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
636
637    // Destination
638    builder.start_node(SyntaxKind::LINK_DEST.into());
639    builder.token(SyntaxKind::TEXT.into(), dest);
640    builder.finish_node();
641
642    // Closing )
643    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
644
645    // Emit raw attributes if present (preserve original formatting)
646    if let Some(raw_attrs) = raw_attributes {
647        builder.start_node(SyntaxKind::ATTRIBUTE.into());
648        builder.token(SyntaxKind::ATTRIBUTE.into(), raw_attrs);
649        builder.finish_node();
650    }
651
652    builder.finish_node();
653}
654
655pub fn emit_bare_uri_link(builder: &mut GreenNodeBuilder, uri: &str, _config: &ParserOptions) {
656    builder.start_node(SyntaxKind::LINK.into());
657
658    builder.start_node(SyntaxKind::LINK_START.into());
659    builder.token(SyntaxKind::LINK_START.into(), "[");
660    builder.finish_node();
661
662    builder.start_node(SyntaxKind::LINK_TEXT.into());
663    builder.token(SyntaxKind::TEXT.into(), uri);
664    builder.finish_node();
665
666    builder.token(SyntaxKind::LINK_TEXT_END.into(), "]");
667    builder.token(SyntaxKind::LINK_DEST_START.into(), "(");
668
669    builder.start_node(SyntaxKind::LINK_DEST.into());
670    builder.token(SyntaxKind::TEXT.into(), uri);
671    builder.finish_node();
672
673    builder.token(SyntaxKind::LINK_DEST_END.into(), ")");
674
675    builder.finish_node();
676}
677
/// Recognise a reference link at the very start of `text`.
///
/// Three forms exist:
/// - explicit: `[text][label]`
/// - implicit: `[text][]` (returned with an empty label)
/// - shortcut: `[text]` (only when `allow_shortcut` is set; the text doubles
///   as the label)
///
/// Not treated as reference links: anything starting with `[@` or `[-@`
/// (citations), a first bracket pair followed directly by `(` (inline link)
/// or by `{` (bracketed span), a label containing a newline, and a shortcut
/// with empty text.
///
/// Returns `Some((length, text_content, label, is_shortcut))`, where `length`
/// is the number of bytes consumed and `label` is the text written between
/// the second pair of brackets (or the link text for a shortcut).
pub fn try_parse_reference_link(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    let inner = text.strip_prefix('[')?;

    // `[@...` and `[-@...` (suppress-author) are citations, not links.
    if inner.starts_with('@') || inner.starts_with("-@") {
        return None;
    }

    // Find the `]` balancing the opener: nested `[...]` pairs are skipped and
    // whatever character follows a backslash is ignored.
    let mut open_brackets = 0usize;
    let mut escaped = false;
    let text_len = inner.char_indices().find_map(|(idx, c)| {
        if escaped {
            escaped = false;
            return None;
        }
        match c {
            '\\' => escaped = true,
            '[' => open_brackets += 1,
            ']' if open_brackets == 0 => return Some(idx),
            ']' => open_brackets -= 1,
            _ => {}
        }
        None
    })?;
    let link_text = &inner[..text_len];

    // Byte offset in `text` just past the first `]`.
    let after_bracket = "[".len() + text_len + "]".len();
    let tail = &inner[text_len + 1..];

    // `(` means inline link, `{` means bracketed span: neither is ours.
    if tail.starts_with('(') || tail.starts_with('{') {
        return None;
    }

    if let Some(label_region) = tail.strip_prefix('[') {
        // The label ends at the first `]`; hitting a newline first (or
        // running out of input) means there is no valid label.
        let stop = label_region.find([']', '\n'])?;
        if label_region[stop..].starts_with('\n') {
            return None;
        }
        let label = &label_region[..stop];

        // Both bracket pairs are consumed. An empty label (implicit form) is
        // passed through as an empty string.
        let total_len = after_bracket + "[".len() + stop + "]".len();
        return Some((total_len, link_text, label.to_string(), false));
    }

    // No second bracket pair: only a shortcut reference remains possible.
    if allow_shortcut && !link_text.is_empty() {
        return Some((after_bracket, link_text, link_text.to_string(), true));
    }
    None
}
789
790/// Emit a reference link node to the builder.
791/// Preserves the original reference syntax (explicit [text][ref], implicit [text][], or shortcut [text]).
792pub fn emit_reference_link(
793    builder: &mut GreenNodeBuilder,
794    link_text: &str,
795    label: &str,
796    is_shortcut: bool,
797    config: &ParserOptions,
798) {
799    builder.start_node(SyntaxKind::LINK.into());
800
801    // Opening [
802    builder.start_node(SyntaxKind::LINK_START.into());
803    builder.token(SyntaxKind::LINK_START.into(), "[");
804    builder.finish_node();
805
806    // Link text (recursively parse inline elements)
807    builder.start_node(SyntaxKind::LINK_TEXT.into());
808    parse_inline_text(builder, link_text, config, false);
809    builder.finish_node();
810
811    // Closing ] and reference label
812    builder.token(SyntaxKind::TEXT.into(), "]");
813
814    if !is_shortcut {
815        // Explicit or implicit reference: [text][label] or [text][]
816        builder.token(SyntaxKind::TEXT.into(), "[");
817        builder.start_node(SyntaxKind::LINK_REF.into());
818        // For implicit references, label is empty and we emit [text][]
819        // For explicit references, emit the label to get [text][label]
820        if !label.is_empty() {
821            builder.token(SyntaxKind::TEXT.into(), label);
822        }
823        builder.finish_node();
824        builder.token(SyntaxKind::TEXT.into(), "]");
825    }
826    // For shortcut references, just [text] - no second bracket pair
827
828    builder.finish_node();
829}
830
/// Recognise a reference-style image at the very start of `text`.
///
/// Three forms exist:
/// - explicit: `![alt][label]`
/// - implicit: `![alt][]` (the alt text is returned as the label)
/// - shortcut: `![alt]` (only when `allow_shortcut` is set; the alt text is
///   returned as the label). A shortcut is accepted whether it is followed by
///   more text or sits at the very end of `text`.
///
/// `![alt]` followed directly by `(` is an inline image and is rejected here.
/// Labels may not contain `]`, `\n` or `\r`.
///
/// Returns `Some((total_len, alt_text, label, is_shortcut))`, where
/// `total_len` is the number of bytes consumed, or `None` when no reference
/// image starts here.
pub fn try_parse_reference_image(
    text: &str,
    allow_shortcut: bool,
) -> Option<(usize, &str, String, bool)> {
    let bytes = text.as_bytes();
    // The shortest input handled is four bytes long, e.g. `![a]`.
    if bytes.len() < 4 || !text.starts_with("![") {
        return None;
    }

    // Walk to the `]` that balances the opening `![`, allowing nested
    // brackets. A backslash skips the byte after it.
    let alt_start = 2;
    let mut pos = alt_start;
    let mut bracket_depth = 1;
    while pos < bytes.len() && bracket_depth > 0 {
        match bytes[pos] {
            b'[' => bracket_depth += 1,
            b']' => bracket_depth -= 1,
            b'\\' if pos + 1 < bytes.len() => pos += 1, // skip escaped char
            _ => {}
        }
        pos += 1;
    }
    if bracket_depth > 0 {
        return None; // Unclosed brackets
    }

    // `pos` now sits just past the closing `]`.
    let alt_text = &text[alt_start..pos - 1];

    match bytes.get(pos) {
        // Explicit or implicit reference: a second bracket pair follows.
        Some(b'[') => {
            let label_start = pos + 1;
            let label_len = bytes[label_start..]
                .iter()
                .position(|&b| matches!(b, b']' | b'\n' | b'\r'))?;
            let label_end = label_start + label_len;
            if bytes[label_end] != b']' {
                // The label ran into a line break before closing.
                return None;
            }

            let label_text = &text[label_start..label_end];
            // For implicit references, use alt text as label for the
            // equality check done when emitting; otherwise preserve the
            // label exactly as written.
            let label = if label_text.is_empty() {
                alt_text
            } else {
                label_text
            };
            Some((label_end + 1, alt_text, label.to_string(), false))
        }
        // `![alt](...)` is an inline image, never a reference.
        Some(b'(') => None,
        // Shortcut reference: nothing relevant follows (this includes the end
        // of input).
        _ if allow_shortcut => Some((pos, alt_text, alt_text.to_string(), true)),
        _ => None,
    }
}
912
913/// Emit a reference image node with registry lookup.
914pub fn emit_reference_image(
915    builder: &mut GreenNodeBuilder,
916    alt_text: &str,
917    label: &str,
918    is_shortcut: bool,
919    config: &ParserOptions,
920) {
921    builder.start_node(SyntaxKind::IMAGE_LINK.into());
922
923    // Emit as reference image (preserve original syntax)
924    builder.start_node(SyntaxKind::IMAGE_LINK_START.into());
925    builder.token(SyntaxKind::IMAGE_LINK_START.into(), "![");
926    builder.finish_node();
927
928    // Alt text (recursively parse inline elements)
929    builder.start_node(SyntaxKind::IMAGE_ALT.into());
930    parse_inline_text(builder, alt_text, config, false);
931    builder.finish_node();
932
933    // Closing ] and reference label
934    builder.token(SyntaxKind::TEXT.into(), "]");
935
936    if !is_shortcut {
937        // Explicit or implicit reference: ![alt][label] or ![alt][]
938        builder.token(SyntaxKind::TEXT.into(), "[");
939        builder.start_node(SyntaxKind::LINK_REF.into());
940        // For implicit references, emit empty label (label == alt means implicit from parser)
941        if label != alt_text {
942            builder.token(SyntaxKind::TEXT.into(), label);
943        }
944        builder.finish_node();
945        builder.token(SyntaxKind::TEXT.into(), "]");
946    }
947    // For shortcut references, just ![alt] - no second bracket pair
948
949    builder.finish_node();
950}
951
#[cfg(test)]
mod tests {
    //! Unit tests for the link, image, autolink and bare-URI recognizers.
    //!
    //! Each parser returns `None` when the input does not match; on a match the
    //! first tuple field is compared against the byte length of the matched text.

    use super::*;

    // ---- Automatic links -------------------------------------------------

    #[test]
    fn test_parse_autolink_url() {
        // Same outcome for both values of the boolean flag.
        for flag in [false, true] {
            assert_eq!(
                try_parse_autolink("<https://example.com>", flag),
                Some((21, "https://example.com"))
            );
        }
    }

    #[test]
    fn test_parse_autolink_email() {
        for flag in [false, true] {
            assert_eq!(
                try_parse_autolink("<user@example.com>", flag),
                Some((18, "user@example.com"))
            );
        }
    }

    #[test]
    fn test_parse_autolink_no_close() {
        for flag in [false, true] {
            assert_eq!(try_parse_autolink("<https://example.com", flag), None);
        }
    }

    #[test]
    fn test_parse_autolink_with_space() {
        for flag in [false, true] {
            assert_eq!(try_parse_autolink("<https://example.com >", flag), None);
        }
    }

    #[test]
    fn test_parse_autolink_not_url_or_email() {
        for flag in [false, true] {
            assert_eq!(try_parse_autolink("<notaurl>", flag), None);
        }
    }

    #[test]
    fn test_parse_autolink_commonmark_strict_scheme() {
        // A one-character scheme is invalid under CommonMark but is accepted
        // leniently under the Pandoc dialect (matches historical behavior).
        let one_char_scheme = "<m:abc>";
        assert_eq!(try_parse_autolink(one_char_scheme, true), None);
        assert_eq!(
            try_parse_autolink(one_char_scheme, false),
            Some((7, "m:abc"))
        );
    }

    #[test]
    fn test_parse_autolink_commonmark_email_disallows_backslash() {
        let escaped_email = "<foo\\+@bar.example.com>";
        assert_eq!(try_parse_autolink(escaped_email, true), None);
        assert_eq!(
            try_parse_autolink(escaped_email, false),
            Some((23, "foo\\+@bar.example.com"))
        );
    }

    // ---- Inline links ----------------------------------------------------

    #[test]
    fn test_parse_inline_link_simple() {
        assert_eq!(
            try_parse_inline_link("[text](url)", false),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title")"#, false),
            Some((19, "text", r#"url "title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_link("[outer [inner] text](url)", false),
            Some((25, "outer [inner] text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_link("[text] (url)", false), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_bracket() {
        assert_eq!(try_parse_inline_link("[text(url)", false), None);
    }

    #[test]
    fn test_parse_inline_link_no_closing_paren() {
        assert_eq!(try_parse_inline_link("[text](url", false), None);
    }

    #[test]
    fn test_parse_inline_link_escaped_bracket() {
        assert_eq!(
            try_parse_inline_link(r"[text\]more](url)", false),
            Some((17, r"text\]more", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_parens_in_url() {
        assert_eq!(
            try_parse_inline_link("[text](url(with)parens)", false),
            Some((23, "text", "url(with)parens", None))
        );
    }

    // ---- Inline images ---------------------------------------------------

    #[test]
    fn test_parse_inline_image_simple() {
        assert_eq!(
            try_parse_inline_image("![alt](image.jpg)"),
            Some((17, "alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_title() {
        assert_eq!(
            try_parse_inline_image(r#"![alt](image.jpg "A title")"#),
            Some((27, "alt", r#"image.jpg "A title""#, None))
        );
    }

    #[test]
    fn test_parse_inline_image_with_nested_brackets() {
        assert_eq!(
            try_parse_inline_image("![outer [inner] alt](image.jpg)"),
            Some((31, "outer [inner] alt", "image.jpg", None))
        );
    }

    #[test]
    fn test_parse_bare_uri_rejects_dangling_backslash_after_trim() {
        assert_eq!(try_parse_bare_uri(r"a:\]"), None);
    }

    #[test]
    fn test_parse_inline_image_no_space_between_brackets_and_parens() {
        assert_eq!(try_parse_inline_image("![alt] (image.jpg)"), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_bracket() {
        assert_eq!(try_parse_inline_image("![alt(image.jpg)"), None);
    }

    #[test]
    fn test_parse_inline_image_no_closing_paren() {
        assert_eq!(try_parse_inline_image("![alt](image.jpg"), None);
    }

    #[test]
    fn test_parse_inline_image_with_simple_class() {
        assert_eq!(
            try_parse_inline_image("![alt](img.png){.large}"),
            Some((23, "alt", "img.png", Some("{.large}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_id() {
        assert_eq!(
            try_parse_inline_image("![Figure 1](fig1.png){#fig-1}"),
            Some((29, "Figure 1", "fig1.png", Some("{#fig-1}")))
        );
    }

    #[test]
    fn test_parse_inline_image_with_full_attributes() {
        assert_eq!(
            try_parse_inline_image(r#"![alt](img.png){#fig .large width="80%"}"#),
            Some((
                40,
                "alt",
                "img.png",
                Some(r#"{#fig .large width="80%"}"#)
            ))
        );
    }

    #[test]
    fn test_parse_inline_image_attributes_must_be_adjacent() {
        // A space between `)` and `{` means the braces are not attributes.
        assert_eq!(
            try_parse_inline_image("![alt](img.png) {.large}"),
            Some((15, "alt", "img.png", None))
        );
    }

    // ---- Inline link attributes -----------------------------------------

    #[test]
    fn test_parse_inline_link_with_id() {
        assert_eq!(
            try_parse_inline_link("[text](url){#link-1}", false),
            Some((20, "text", "url", Some("{#link-1}")))
        );
    }

    #[test]
    fn test_parse_inline_link_with_full_attributes() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url){#link .external target="_blank"}"#, false),
            Some((
                44,
                "text",
                "url",
                Some(r#"{#link .external target="_blank"}"#)
            ))
        );
    }

    #[test]
    fn test_parse_inline_link_attributes_must_be_adjacent() {
        // A space between `)` and `{` means the braces are not attributes.
        assert_eq!(
            try_parse_inline_link("[text](url) {.class}", false),
            Some((11, "text", "url", None))
        );
    }

    #[test]
    fn test_parse_inline_link_with_title_and_attributes() {
        assert_eq!(
            try_parse_inline_link(r#"[text](url "title"){.external}"#, false),
            Some((30, "text", r#"url "title""#, Some("{.external}")))
        );
    }

    // ---- Reference links -------------------------------------------------

    #[test]
    fn test_parse_reference_link_explicit() {
        assert_eq!(
            try_parse_reference_link("[link text][label]", false),
            Some((18, "link text", String::from("label"), false))
        );
    }

    #[test]
    fn test_parse_reference_link_implicit() {
        assert_eq!(
            try_parse_reference_link("[link text][]", false),
            Some((13, "link text", String::new(), false))
        );
    }

    #[test]
    fn test_parse_reference_link_explicit_same_label_as_text() {
        assert_eq!(
            try_parse_reference_link("[stack][stack]", false),
            Some((14, "stack", String::from("stack"), false))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut() {
        assert_eq!(
            try_parse_reference_link("[link text] rest", true),
            Some((11, "link text", String::from("link text"), true))
        );
    }

    #[test]
    fn test_parse_reference_link_shortcut_rejects_empty_label() {
        assert_eq!(try_parse_reference_link("[] rest", true), None);
    }

    #[test]
    fn test_parse_reference_link_shortcut_disabled() {
        assert_eq!(try_parse_reference_link("[link text] rest", false), None);
    }

    #[test]
    fn test_parse_reference_link_not_inline_link() {
        // `[text](url)` is an inline link, never a reference.
        assert_eq!(try_parse_reference_link("[text](url)", true), None);
    }

    #[test]
    fn test_parse_reference_link_with_nested_brackets() {
        assert_eq!(
            try_parse_reference_link("[outer [inner] text][ref]", false),
            Some((25, "outer [inner] text", String::from("ref"), false))
        );
    }

    #[test]
    fn test_parse_reference_link_label_no_newline() {
        assert_eq!(try_parse_reference_link("[text][label\nmore]", false), None);
    }

    // ---- Reference images ------------------------------------------------

    #[test]
    fn test_parse_reference_image_explicit() {
        assert_eq!(
            try_parse_reference_image("![alt text][label]", false),
            Some((18, "alt text", String::from("label"), false))
        );
    }

    #[test]
    fn test_parse_reference_image_implicit() {
        // Unlike links, the implicit image form reports the alt text as its label.
        assert_eq!(
            try_parse_reference_image("![alt text][]", false),
            Some((13, "alt text", String::from("alt text"), false))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut() {
        assert_eq!(
            try_parse_reference_image("![alt text] rest", true),
            Some((11, "alt text", String::from("alt text"), true))
        );
    }

    #[test]
    fn test_parse_reference_image_shortcut_disabled() {
        assert_eq!(try_parse_reference_image("![alt text] rest", false), None);
    }

    #[test]
    fn test_parse_reference_image_not_inline() {
        // `![alt](url)` is an inline image, never a reference.
        assert_eq!(try_parse_reference_image("![alt](url)", true), None);
    }

    #[test]
    fn test_parse_reference_image_with_nested_brackets() {
        assert_eq!(
            try_parse_reference_image("![alt [nested] text][ref]", false),
            Some((25, "alt [nested] text", String::from("ref"), false))
        );
    }

    // ---- Line breaks inside reference labels ----------------------------

    #[test]
    fn test_reference_link_label_with_crlf() {
        // Labels must not span lines, including CRLF-terminated ones.
        // NOTE(review): shortcuts are disabled here (`false`), and a bare `[..]` is
        // already rejected in that mode (see
        // `test_parse_reference_link_shortcut_disabled`), so this may not exercise
        // the line-break rule — confirm whether `true` was intended.
        assert_eq!(
            try_parse_reference_link("[foo\r\nbar]", false),
            None,
            "Should not parse reference link with CRLF in label"
        );
    }

    #[test]
    fn test_reference_link_label_with_lf() {
        // Labels must not span lines with a plain LF either.
        // NOTE(review): shortcuts are disabled here (`false`), and a bare `[..]` is
        // already rejected in that mode (see
        // `test_parse_reference_link_shortcut_disabled`), so this may not exercise
        // the line-break rule — confirm whether `true` was intended.
        assert_eq!(
            try_parse_reference_link("[foo\nbar]", false),
            None,
            "Should not parse reference link with LF in label"
        );
    }

    // ---- Multiline link text / alt text ---------------------------------

    #[test]
    fn test_parse_inline_link_multiline_text() {
        // Per the Pandoc spec, link text may contain newlines (soft breaks).
        assert_eq!(
            try_parse_inline_link("[text on\nline two](url)", false),
            Some((23, "text on\nline two", "url", None)),
            "Link text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_link_multiline_with_formatting() {
        // Longer link text that wraps onto a second line.
        let input =
            "[A network graph. Different edges\nwith probability](../images/networkfig.png)";
        let (len, text, _dest, _attrs) =
            try_parse_inline_link(input, false).expect("Link text with newlines should parse");
        assert!(text.contains('\n'), "Link text should preserve newline");
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_image_multiline_alt() {
        // Per the Pandoc spec, image alt text may contain newlines.
        assert_eq!(
            try_parse_inline_image("![alt on\nline two](img.png)"),
            Some((27, "alt on\nline two", "img.png", None)),
            "Image alt text should allow newlines"
        );
    }

    #[test]
    fn test_parse_inline_image_multiline_with_attributes() {
        // Multiline alt text combined with trailing attributes.
        let input = "![network graph\ndiagram](../images/fig.png){width=70%}";
        let (len, alt, dest, attrs) = try_parse_inline_image(input)
            .expect("Image alt with newlines and attributes should parse");
        assert!(alt.contains('\n'), "Alt text should preserve newline");
        assert_eq!(dest, "../images/fig.png");
        assert_eq!(attrs, Some("{width=70%}"));
        assert_eq!(len, input.len());
    }

    #[test]
    fn test_parse_inline_link_with_attributes_after_newline() {
        // Regression: when following lines are concatenated onto the input,
        // attributes directly after `)` must still be recognized.
        let input = "[A network graph.](../images/networkfig.png){width=70%}\nA word\n";
        let (len, text, dest, attrs) = try_parse_inline_link(input, false)
            .expect("Link with attributes should parse even with following text");
        assert_eq!(text, "A network graph.");
        assert_eq!(dest, "../images/networkfig.png");
        assert_eq!(attrs, Some("{width=70%}"), "Attributes should be captured");
        assert_eq!(
            len, 55,
            "Length should include attributes (up to closing brace)"
        );
    }
}