panache_parser/parser/blocks/
reference_links.rs

//! Reference definition and footnote parsing functions.
//!
//! Reference definitions have the form:
//! ```markdown
//! [label]: url "optional title"
//! [label]: url 'optional title'
//! [label]: url (optional title)
//! [label]: <url> "title"
//! ```
//!
//! Footnote definitions have the form:
//! ```markdown
//! [^id]: Footnote content here.
//!     Can continue on multiple lines
//!     as long as they're indented.
//! ```
17
18/// Try to parse a reference definition starting at the current position.
19/// Returns Some((bytes_consumed, label, url, title)) on success.
20///
21/// `text` may span multiple lines. The destination and title may each be
22/// preceded by at most one newline (per CommonMark §4.7). Blank lines
23/// terminate the definition: callers should stop the input at the first
24/// blank line so the parser cannot cross one.
25///
26/// `dialect` controls a CommonMark-only constraint (§4.7): the title, if
27/// present on the same line as the destination, must be separated from the
28/// destination by at least one space or tab. Pandoc-markdown accepts the
29/// title even when it's directly attached (e.g. `[foo]: <bar>(baz)`).
30///
31/// Syntax:
32/// ```markdown
33/// [label]: url "title"
34/// [label]: <url> 'title'
35/// [label]:
36///   url
37///   "title"
38/// ```
39pub fn try_parse_reference_definition(
40    text: &str,
41    dialect: crate::options::Dialect,
42) -> Option<(usize, String, String, Option<String>)> {
43    try_parse_reference_definition_with_mode(text, true, dialect)
44}
45
46/// Multimarkdown-flavored variant: tolerates trailing content after the title
47/// on the same line (e.g. `[ref]: /url "title" width=20px ...`). Callers in
48/// the MMD code path then keep collecting attribute-continuation lines.
49pub fn try_parse_reference_definition_lax(
50    text: &str,
51    dialect: crate::options::Dialect,
52) -> Option<(usize, String, String, Option<String>)> {
53    try_parse_reference_definition_with_mode(text, false, dialect)
54}
55
56fn try_parse_reference_definition_with_mode(
57    text: &str,
58    strict_eol: bool,
59    dialect: crate::options::Dialect,
60) -> Option<(usize, String, String, Option<String>)> {
61    let spans = reference_definition_spans(text, strict_eol, dialect)?;
62    let label = text[spans.indent + 1..spans.label_close].to_string();
63    let url = if spans.url_is_angle {
64        text[spans.url.start + 1..spans.url.end - 1].to_string()
65    } else {
66        text[spans.url.clone()].to_string()
67    };
68    let title = spans
69        .title
70        .as_ref()
71        .map(|r| text[r.start + 1..r.end - 1].to_string());
72    Some((spans.consumed, label, url, title))
73}
74
/// Where each component of a recognized reference definition lives, expressed
/// as byte offsets into the `text` handed to [`reference_definition_spans`].
///
/// Both phases of reference handling read from this one structure:
/// *detection* (`try_parse_reference_definition`, which slices out the
/// component strings) and *emission* (`emit_reference_definition_lines`, which
/// wraps the same ranges in `REFERENCE_URL` / `REFERENCE_TITLE` CST nodes).
/// Sharing a single walker is what keeps detection and emission from drifting
/// apart, as the dispatcher's doc comment warns.
#[derive(Clone, Debug)]
pub(crate) struct ReferenceSpans {
    /// Number of leading spaces before `[` (0..=3). Because spaces are one
    /// byte each, this is also the byte index of `[`.
    pub indent: usize,
    /// Byte index of the `]` that closes the label.
    pub label_close: usize,
    /// Byte index of the `:` following the label.
    pub colon: usize,
    /// Byte range of the destination; the surrounding `<>` are *included*
    /// when the destination is angle-bracketed.
    pub url: std::ops::Range<usize>,
    /// `true` when the destination was written as `<…>`.
    pub url_is_angle: bool,
    /// Byte range of the title *including* its quote/paren delimiters, or
    /// `None` when the definition has no title.
    pub title: Option<std::ops::Range<usize>>,
    /// Total number of bytes consumed (same value as the legacy
    /// `bytes_consumed`).
    pub consumed: usize,
}
101
102/// Scan a reference definition and record the byte spans of its components.
103///
104/// The walk is identical to the legacy string-returning parser — it just
105/// records offsets instead of allocating component strings, so detection and
106/// emission stay byte-for-byte consistent. See [`ReferenceSpans`].
107pub(crate) fn reference_definition_spans(
108    text: &str,
109    strict_eol: bool,
110    dialect: crate::options::Dialect,
111) -> Option<ReferenceSpans> {
112    let leading_spaces = text.chars().take_while(|&c| c == ' ').count();
113    if leading_spaces > 3 {
114        return None;
115    }
116    let inner = &text[leading_spaces..];
117    let bytes = inner.as_bytes();
118
119    // Must start at beginning of line with [
120    if bytes.is_empty() || bytes[0] != b'[' {
121        return None;
122    }
123
124    // Check if it's a footnote definition [^id]: - not a reference definition
125    if bytes.len() >= 2 && bytes[1] == b'^' {
126        return None;
127    }
128
129    // Find the closing ] for the label. Labels may span lines (CommonMark
130    // §4.7) but a blank line inside the label terminates the attempt. We also
131    // reject unescaped `[` inside the label per spec.
132    let mut pos = 1;
133    let mut escape_next = false;
134
135    while pos < bytes.len() {
136        if escape_next {
137            escape_next = false;
138            pos += 1;
139            continue;
140        }
141
142        match bytes[pos] {
143            b'\\' => {
144                escape_next = true;
145                pos += 1;
146            }
147            b']' => {
148                break;
149            }
150            b'[' => {
151                return None;
152            }
153            b'\n' | b'\r' => {
154                let nl_end =
155                    if bytes[pos] == b'\r' && pos + 1 < bytes.len() && bytes[pos + 1] == b'\n' {
156                        pos + 2
157                    } else {
158                        pos + 1
159                    };
160                let mut probe = nl_end;
161                while probe < bytes.len() && matches!(bytes[probe], b' ' | b'\t') {
162                    probe += 1;
163                }
164                if probe >= bytes.len() || bytes[probe] == b'\n' || bytes[probe] == b'\r' {
165                    return None;
166                }
167                pos = nl_end;
168            }
169            _ => {
170                pos += 1;
171            }
172        }
173    }
174
175    if pos >= bytes.len() || bytes[pos] != b']' {
176        return None;
177    }
178
179    let label = &inner[1..pos];
180    if label.trim().is_empty() {
181        return None;
182    }
183    let label_close = leading_spaces + pos;
184
185    pos += 1; // Skip ]
186
187    // Must be followed by :
188    if pos >= bytes.len() || bytes[pos] != b':' {
189        return None;
190    }
191    let colon = leading_spaces + pos;
192    pos += 1;
193
194    // Skip ws + at most one newline + ws to the URL.
195    pos = skip_ws_one_newline(bytes, pos)?;
196
197    // Parse URL
198    let url_start = pos;
199    let url_is_angle = pos < bytes.len() && bytes[pos] == b'<';
200
201    if url_is_angle {
202        pos += 1;
203        while pos < bytes.len() && bytes[pos] != b'>' && bytes[pos] != b'\n' && bytes[pos] != b'\r'
204        {
205            pos += 1;
206        }
207        if pos >= bytes.len() || bytes[pos] != b'>' {
208            return None;
209        }
210        pos += 1; // Skip >
211    } else {
212        while pos < bytes.len() && !matches!(bytes[pos], b' ' | b'\t' | b'\n' | b'\r') {
213            pos += 1;
214        }
215        if pos == url_start {
216            return None;
217        }
218    }
219    let url = (leading_spaces + url_start)..(leading_spaces + pos);
220
221    // After URL, try optional title. If a title attempt is malformed but we
222    // had to cross a newline to reach it, fall back to "no title, end of URL
223    // line" — the next line is then parsed independently (e.g.
224    // `[foo]: /url\n"title" ok\n` → ref def `[foo]: /url`, paragraph
225    // `"title" ok`).
226    let after_url = pos;
227    let url_line_end = consume_to_eol(bytes, after_url);
228    let url_line_end_lax = if strict_eol {
229        url_line_end
230    } else {
231        Some(consume_to_eol_lax(bytes, after_url))
232    };
233
234    let mut title: Option<std::ops::Range<usize>> = None;
235    let mut end_pos: Option<usize> = None;
236
237    if let Some(title_start) = skip_ws_one_newline(bytes, after_url) {
238        let crossed_newline = bytes[after_url..title_start]
239            .iter()
240            .any(|&b| b == b'\n' || b == b'\r');
241        // CommonMark §4.7: when the title is on the same line as the
242        // destination, it must be separated from the destination by at least
243        // one space or tab. `<bar>(baz)` (no whitespace between `>` and `(`)
244        // is therefore not a valid LRD under CommonMark; Pandoc accepts it.
245        let cmark_requires_separator = dialect == crate::options::Dialect::CommonMark
246            && !crossed_newline
247            && title_start == after_url;
248        if cmark_requires_separator {
249            return Some(ReferenceSpans {
250                indent: leading_spaces,
251                label_close,
252                colon,
253                url,
254                url_is_angle,
255                title: None,
256                consumed: leading_spaces + url_line_end_lax?,
257            });
258        }
259        let mut title_pos = title_start;
260        match parse_title(bytes, &mut title_pos) {
261            Some(Some(range)) => {
262                let line_end = if strict_eol {
263                    consume_to_eol(bytes, title_pos)
264                } else {
265                    Some(consume_to_eol_lax(bytes, title_pos))
266                };
267                if let Some(end) = line_end {
268                    title = Some((leading_spaces + range.start)..(leading_spaces + range.end));
269                    end_pos = Some(end);
270                } else if !crossed_newline {
271                    return None;
272                }
273            }
274            None => {
275                if !crossed_newline {
276                    return None;
277                }
278            }
279            Some(None) => {}
280        }
281    }
282
283    let end = match end_pos {
284        Some(p) => p,
285        None => url_line_end_lax?,
286    };
287
288    Some(ReferenceSpans {
289        indent: leading_spaces,
290        label_close,
291        colon,
292        url,
293        url_is_angle,
294        title,
295        consumed: leading_spaces + end,
296    })
297}
298
/// Lenient counterpart of `consume_to_eol`: advance from `pos` to just past
/// the next line ending (`\n`, `\r\n` or a lone `\r`), no matter what content
/// precedes it on the line. When no line ending follows, the end of `bytes`
/// is returned; a `pos` already beyond the input is returned unchanged.
fn consume_to_eol_lax(bytes: &[u8], pos: usize) -> usize {
    let tail = match bytes.get(pos..) {
        Some(tail) => tail,
        None => return pos,
    };
    match tail.iter().position(|&b| b == b'\n' || b == b'\r') {
        None => bytes.len(),
        Some(offset) => {
            // A CRLF pair counts as a single line ending.
            let width = if tail[offset..].starts_with(b"\r\n") { 2 } else { 1 };
            pos + offset + width
        }
    }
}
314
/// Skip spaces/tabs starting at `pos`, then step over a single line ending
/// (`\n`, `\r\n` or a lone `\r`) if one follows.
///
/// Returns the resulting offset, or `None` when something other than
/// whitespace appears before the line ending. Reaching the end of input
/// counts as success.
fn consume_to_eol(bytes: &[u8], pos: usize) -> Option<usize> {
    let tail = bytes.get(pos..).unwrap_or(&[]);
    let blanks = tail
        .iter()
        .take_while(|&&b| b == b' ' || b == b'\t')
        .count();
    let at = pos + blanks;

    match &tail[blanks..] {
        [] => Some(at),
        [b'\r', b'\n', ..] => Some(at + 2),
        [b'\n', ..] | [b'\r', ..] => Some(at + 1),
        _ => None,
    }
}
336
/// Skip spaces/tabs, then optionally one line ending followed by more
/// spaces/tabs — the "optional spaces or tabs (including up to one
/// [line ending])" rule of CommonMark §4.7.
///
/// Returns the offset of the first byte after the skipped run, or `None`
/// when a *second* line ending follows (i.e. a blank line), which terminates
/// the definition.
fn skip_ws_one_newline(bytes: &[u8], pos: usize) -> Option<usize> {
    // First offset at or after `at` that is not a space or tab.
    let blanks_end = |mut at: usize| {
        while matches!(bytes.get(at), Some(&b' ') | Some(&b'\t')) {
            at += 1;
        }
        at
    };
    let at_line_ending = |at: usize| matches!(bytes.get(at), Some(&b'\n') | Some(&b'\r'));

    let first = blanks_end(pos);
    if !at_line_ending(first) {
        return Some(first);
    }

    // Step over the line ending, treating CRLF as one unit.
    let is_crlf = bytes[first] == b'\r' && bytes.get(first + 1) == Some(&b'\n');
    let next = blanks_end(first + if is_crlf { 2 } else { 1 });

    if at_line_ending(next) {
        None
    } else {
        Some(next)
    }
}
360
/// Report whether `line` looks like a Multimarkdown link-attribute
/// continuation line.
///
/// The line must begin with a space or tab and, once trimmed, consist solely
/// of one or more `key=value` pairs. A key is a non-empty run of bytes up to
/// `=`. A value is either a non-empty run of non-blank bytes or a non-empty
/// string enclosed in matching `"` or `'` quotes (which may contain blanks).
pub fn line_is_mmd_link_attribute_continuation(line: &str) -> bool {
    let blank = |b: u8| b == b' ' || b == b'\t';

    if !line.bytes().next().map_or(false, blank) {
        return false;
    }

    // `rest` always holds the not-yet-validated tail of the trimmed line.
    let mut rest = line.trim().as_bytes();
    let mut pairs = 0usize;

    loop {
        // Blanks between tokens.
        let gap = rest.iter().take_while(|&&b| blank(b)).count();
        rest = &rest[gap..];
        if rest.is_empty() {
            break;
        }

        // Key: non-empty, and terminated by `=` rather than a blank or the end.
        let key_len = rest
            .iter()
            .take_while(|&&b| b != b'=' && !blank(b))
            .count();
        if key_len == 0 || rest.get(key_len) != Some(&b'=') {
            return false;
        }
        rest = &rest[key_len + 1..];

        // Value: quoted or bare, never empty.
        let value_len = match rest.first() {
            None => return false,
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                match rest[1..].iter().position(|&b| b == quote) {
                    // Content length plus the two quote bytes.
                    Some(content_len) if content_len > 0 => content_len + 2,
                    _ => return false,
                }
            }
            Some(_) => {
                let bare_len = rest.iter().take_while(|&&b| !blank(b)).count();
                if bare_len == 0 {
                    return false;
                }
                bare_len
            }
        };
        rest = &rest[value_len..];
        pairs += 1;
    }

    pairs > 0
}
425
/// Parse an optional title after the URL.
/// Titles can be in double quotes, single quotes, or parentheses.
///
/// Leading spaces, tabs and line endings before the title are skipped. A
/// backslash hides the following byte, so an escaped closing delimiter does
/// not end the title. The scan does not stop at line endings, so a title may
/// span lines.
///
/// Returns `Some(Some(range))` with the title's *outer* byte range (delimiters
/// included, relative to `bytes`) when a title is found, `Some(None)` if there
/// is no title, and `None` if a title is started but malformed (no closing
/// delimiter before the end of input).
///
/// `*pos` is only written on success, where it is advanced past the closing
/// delimiter and any trailing space/tab. On `Some(None)` and `None` it is left
/// exactly as the caller passed it, so the caller can always resume from a
/// known position.
fn parse_title(bytes: &[u8], pos: &mut usize) -> Option<Option<std::ops::Range<usize>>> {
    // Work on a local cursor; `*pos` is committed only when a title is found.
    let mut cursor = *pos;

    // Skip whitespace (including newlines for multi-line titles).
    while matches!(
        bytes.get(cursor),
        Some(&b' ') | Some(&b'\t') | Some(&b'\n') | Some(&b'\r')
    ) {
        cursor += 1;
    }

    // Anything other than a title opener (including end of input) means
    // "no title", which is not an error.
    let closing = match bytes.get(cursor) {
        Some(&b'"') => b'"',
        Some(&b'\'') => b'\'',
        Some(&b'(') => b')',
        _ => return Some(None),
    };

    let open = cursor;
    cursor += 1; // Skip the opening delimiter.

    while let Some(&byte) = bytes.get(cursor) {
        if byte == b'\\' {
            // Skip the backslash together with the byte it escapes.
            cursor += 2;
            continue;
        }
        cursor += 1;
        if byte == closing {
            let close_end = cursor;

            // Skip trailing spaces/tabs so the caller lands on the line ending.
            while matches!(bytes.get(cursor), Some(&b' ') | Some(&b'\t')) {
                cursor += 1;
            }

            *pos = cursor;
            return Some(Some(open..close_end));
        }
    }

    // No closing delimiter found.
    None
}
496
/// Recognize a footnote marker `[^id]:` at the very start of `line`.
///
/// Returns `Some((id, content_start))` on a match, where `content_start` is
/// the byte offset in `line` of the first byte after the marker and any
/// spaces/tabs that follow it (which may be the line ending or the end of the
/// line). Returns `None` when the line does not begin with `[^`, the id is
/// empty, the id is not closed by `]` on the same line, or `]` is not
/// followed by `:`.
///
/// Syntax:
/// ```markdown
/// [^id]: Footnote content.
/// ```
pub fn try_parse_footnote_marker(line: &str) -> Option<(String, usize)> {
    const OPENER: &str = "[^";
    const CLOSER: &str = "]:";

    let after_opener = line.strip_prefix(OPENER)?;

    // The id runs up to the first `]`; hitting a line ending first is a miss.
    let id_len = after_opener
        .bytes()
        .position(|b| matches!(b, b']' | b'\n' | b'\r'))?;
    if id_len == 0 || after_opener.as_bytes()[id_len] != b']' {
        return None;
    }

    let (id, tail) = after_opener.split_at(id_len);
    let after_marker = tail.strip_prefix(CLOSER)?;

    // Spaces/tabs between the marker and the content belong to the marker.
    let padding = after_marker
        .bytes()
        .take_while(|&b| b == b' ' || b == b'\t')
        .count();

    let content_start = OPENER.len() + id_len + CLOSER.len() + padding;
    Some((id.to_string(), content_start))
}
542
#[cfg(test)]
mod tests {
    use super::{line_is_mmd_link_attribute_continuation, try_parse_reference_definition};
    use crate::syntax::SyntaxKind;

    /// Project a parse result down to the `(url, title)` pair the assertions
    /// compare against.
    fn url_and_title(
        parsed: &Option<(usize, String, String, Option<String>)>,
    ) -> Option<(&str, Option<&str>)> {
        parsed
            .as_ref()
            .map(|(_, _, url, title)| (url.as_str(), title.as_deref()))
    }

    #[test]
    fn test_footnote_definition_body_layout_is_lossless() {
        let source = "[^note-on-refs]:\n    Note that if `--file-scope` is used,\n";
        let parsed = crate::parse(source, Some(crate::ParserOptions::default()));
        assert_eq!(parsed.text().to_string(), source);
    }

    #[test]
    fn test_footnote_definition_marker_emits_structural_tokens() {
        let source = "[^note-on-refs]: body\n";
        let parsed = crate::parse(source, Some(crate::ParserOptions::default()));
        let definition = parsed
            .descendants()
            .find(|node| node.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");

        // Only direct token children count; nested nodes are ignored.
        let direct_token_kinds: Vec<_> = definition
            .children_with_tokens()
            .filter_map(|element| element.into_token())
            .map(|token| token.kind())
            .collect();

        let required = [
            SyntaxKind::FOOTNOTE_LABEL_START,
            SyntaxKind::FOOTNOTE_LABEL_ID,
            SyntaxKind::FOOTNOTE_LABEL_END,
            SyntaxKind::FOOTNOTE_LABEL_COLON,
        ];
        for kind in &required {
            assert!(direct_token_kinds.contains(kind));
        }
    }

    #[test]
    fn footnote_multiline_dollar_math_parses_as_display_math_not_tex_block() {
        let source = "[^note]: Intro line before math:\n    $$\n    \\begin{aligned} a &= b \\\\ c &= d \\end{aligned}\n    $$\n";
        let parsed = crate::parse(source, Some(crate::ParserOptions::default()));

        let definition = parsed
            .descendants()
            .find(|node| node.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");

        let display_math_found = definition
            .descendants()
            .any(|node| node.kind() == SyntaxKind::DISPLAY_MATH);
        assert!(
            display_math_found,
            "Expected DISPLAY_MATH in footnote definition, got:\n{}",
            parsed
        );

        let tex_block_found = definition
            .descendants()
            .any(|node| node.kind() == SyntaxKind::TEX_BLOCK);
        assert!(
            !tex_block_found,
            "Did not expect TEX_BLOCK in footnote definition for $$...$$ math, got:\n{}",
            parsed
        );
    }

    #[test]
    fn test_reference_definition_with_up_to_three_leading_spaces() {
        // Three spaces of indentation are still a definition.
        let three = try_parse_reference_definition("   [foo]: #bar", crate::options::Dialect::Pandoc);
        assert!(three.is_some());

        // Four spaces are not.
        let four = try_parse_reference_definition("    [foo]: #bar", crate::options::Dialect::Pandoc);
        assert!(four.is_none());
    }

    #[test]
    fn test_reference_definition_commonmark_requires_separator_before_title() {
        let attached = "[foo]: <bar>(baz)\n";
        let separated = "[foo]: <bar> (baz)\n";

        // Pandoc: title `(baz)` directly attached after `<bar>` is accepted.
        let pandoc = try_parse_reference_definition(attached, crate::options::Dialect::Pandoc);
        assert_eq!(url_and_title(&pandoc), Some(("bar", Some("baz"))));

        // CommonMark: same input is not a valid LRD because the title `(baz)`
        // is not space-separated from the destination; the parser rejects the
        // candidate so the dispatcher falls back to a paragraph.
        let cmark = try_parse_reference_definition(attached, crate::options::Dialect::CommonMark);
        assert!(cmark.is_none());

        // CommonMark with a space before the title does parse as an LRD with a
        // title.
        let cmark_ok =
            try_parse_reference_definition(separated, crate::options::Dialect::CommonMark);
        assert_eq!(url_and_title(&cmark_ok), Some(("bar", Some("baz"))));
    }

    #[test]
    fn test_reference_definition_emits_structured_url_and_title() {
        let source = "[ref]: <https://example.com> \"The Title\"\n";
        let parsed = crate::parse(source, Some(crate::ParserOptions::default()));
        assert_eq!(parsed.text().to_string(), source, "must stay lossless");

        let definition = parsed
            .descendants()
            .find(|node| node.kind() == SyntaxKind::REFERENCE_DEFINITION)
            .expect("reference definition");

        let url_node = definition
            .children()
            .find(|node| node.kind() == SyntaxKind::REFERENCE_URL)
            .expect("REFERENCE_URL node");
        assert_eq!(url_node.text().to_string(), "<https://example.com>");

        // Angle brackets are kept inside the node as their own delimiter tokens.
        let has_dest_start = url_node
            .children_with_tokens()
            .any(|element| element.kind() == SyntaxKind::LINK_DEST_START);
        assert!(has_dest_start);
        let has_dest_end = url_node
            .children_with_tokens()
            .any(|element| element.kind() == SyntaxKind::LINK_DEST_END);
        assert!(has_dest_end);

        let title_node = definition
            .children()
            .find(|node| node.kind() == SyntaxKind::REFERENCE_TITLE)
            .expect("REFERENCE_TITLE node");
        assert_eq!(title_node.text().to_string(), "\"The Title\"");
    }

    #[test]
    fn test_reference_definition_without_title_omits_title_node() {
        let source = "[ref]: /url\n";
        let parsed = crate::parse(source, Some(crate::ParserOptions::default()));
        assert_eq!(parsed.text().to_string(), source, "must stay lossless");

        let definition = parsed
            .descendants()
            .find(|node| node.kind() == SyntaxKind::REFERENCE_DEFINITION)
            .expect("reference definition");

        let url_node = definition
            .children()
            .find(|node| node.kind() == SyntaxKind::REFERENCE_URL)
            .expect("REFERENCE_URL node");
        assert_eq!(url_node.text().to_string(), "/url");

        let has_title_node = definition
            .children()
            .any(|node| node.kind() == SyntaxKind::REFERENCE_TITLE);
        assert!(!has_title_node, "no title => no REFERENCE_TITLE node");
    }

    #[test]
    fn mmd_link_attribute_continuation_detects_valid_tokens() {
        let accepted = [
            "    width=20px height=30px id=myId",
            "\tclass=\"myClass1 myClass2\"",
        ];
        for line in &accepted {
            assert!(line_is_mmd_link_attribute_continuation(line));
        }
    }

    #[test]
    fn mmd_link_attribute_continuation_rejects_non_attribute_lines() {
        let rejected = ["not-indented width=20px", "    not-an-attr token"];
        for line in &rejected {
            assert!(!line_is_mmd_link_attribute_continuation(line));
        }
    }
}
panache_parser/parser/blocks/reference_links.rs

panache_parser/parser/blocks/
reference_links.rs