// panache_parser/parser/blocks/reference_links.rs

//! Reference definition and footnote parsing functions.
//!
//! Reference definitions have the form:
//! ```markdown
//! [label]: url "optional title"
//! [label]: url 'optional title'
//! [label]: url (optional title)
//! [label]: <url> "title"
//! ```
//!
//! Footnote definitions have the form:
//! ```markdown
//! [^id]: Footnote content here.
//!     Can continue on multiple lines
//!     as long as they're indented.
//! ```

18/// Try to parse a reference definition starting at the current position.
19/// Returns Some((bytes_consumed, label, url, title)) on success.
20///
21/// `text` may span multiple lines. The destination and title may each be
22/// preceded by at most one newline (per CommonMark §4.7). Blank lines
23/// terminate the definition: callers should stop the input at the first
24/// blank line so the parser cannot cross one.
25///
26/// `dialect` controls a CommonMark-only constraint (§4.7): the title, if
27/// present on the same line as the destination, must be separated from the
28/// destination by at least one space or tab. Pandoc-markdown accepts the
29/// title even when it's directly attached (e.g. `[foo]: <bar>(baz)`).
30///
31/// Syntax:
32/// ```markdown
33/// [label]: url "title"
34/// [label]: <url> 'title'
35/// [label]:
36///   url
37///   "title"
38/// ```
39pub fn try_parse_reference_definition(
40    text: &str,
41    dialect: crate::options::Dialect,
42) -> Option<(usize, String, String, Option<String>)> {
43    try_parse_reference_definition_with_mode(text, true, dialect)
44}
45
46/// Multimarkdown-flavored variant: tolerates trailing content after the title
47/// on the same line (e.g. `[ref]: /url "title" width=20px ...`). Callers in
48/// the MMD code path then keep collecting attribute-continuation lines.
49pub fn try_parse_reference_definition_lax(
50    text: &str,
51    dialect: crate::options::Dialect,
52) -> Option<(usize, String, String, Option<String>)> {
53    try_parse_reference_definition_with_mode(text, false, dialect)
54}
55
56fn try_parse_reference_definition_with_mode(
57    text: &str,
58    strict_eol: bool,
59    dialect: crate::options::Dialect,
60) -> Option<(usize, String, String, Option<String>)> {
61    let leading_spaces = text.chars().take_while(|&c| c == ' ').count();
62    if leading_spaces > 3 {
63        return None;
64    }
65    let inner = &text[leading_spaces..];
66    let bytes = inner.as_bytes();
67
68    // Must start at beginning of line with [
69    if bytes.is_empty() || bytes[0] != b'[' {
70        return None;
71    }
72
73    // Check if it's a footnote definition [^id]: - not a reference definition
74    if bytes.len() >= 2 && bytes[1] == b'^' {
75        return None;
76    }
77
78    // Find the closing ] for the label. Labels may span lines (CommonMark
79    // §4.7) but a blank line inside the label terminates the attempt. We also
80    // reject unescaped `[` inside the label per spec.
81    let mut pos = 1;
82    let mut escape_next = false;
83
84    while pos < bytes.len() {
85        if escape_next {
86            escape_next = false;
87            pos += 1;
88            continue;
89        }
90
91        match bytes[pos] {
92            b'\\' => {
93                escape_next = true;
94                pos += 1;
95            }
96            b']' => {
97                break;
98            }
99            b'[' => {
100                return None;
101            }
102            b'\n' | b'\r' => {
103                let nl_end =
104                    if bytes[pos] == b'\r' && pos + 1 < bytes.len() && bytes[pos + 1] == b'\n' {
105                        pos + 2
106                    } else {
107                        pos + 1
108                    };
109                let mut probe = nl_end;
110                while probe < bytes.len() && matches!(bytes[probe], b' ' | b'\t') {
111                    probe += 1;
112                }
113                if probe >= bytes.len() || bytes[probe] == b'\n' || bytes[probe] == b'\r' {
114                    return None;
115                }
116                pos = nl_end;
117            }
118            _ => {
119                pos += 1;
120            }
121        }
122    }
123
124    if pos >= bytes.len() || bytes[pos] != b']' {
125        return None;
126    }
127
128    let label = &inner[1..pos];
129    if label.trim().is_empty() {
130        return None;
131    }
132
133    pos += 1; // Skip ]
134
135    // Must be followed by :
136    if pos >= bytes.len() || bytes[pos] != b':' {
137        return None;
138    }
139    pos += 1;
140
141    // Skip ws + at most one newline + ws to the URL.
142    pos = skip_ws_one_newline(bytes, pos)?;
143
144    // Parse URL
145    let url_start = pos;
146
147    let url = if pos < bytes.len() && bytes[pos] == b'<' {
148        pos += 1;
149        let url_content_start = pos;
150        while pos < bytes.len() && bytes[pos] != b'>' && bytes[pos] != b'\n' && bytes[pos] != b'\r'
151        {
152            pos += 1;
153        }
154        if pos >= bytes.len() || bytes[pos] != b'>' {
155            return None;
156        }
157        let url = inner[url_content_start..pos].to_string();
158        pos += 1; // Skip >
159        url
160    } else {
161        while pos < bytes.len() && !matches!(bytes[pos], b' ' | b'\t' | b'\n' | b'\r') {
162            pos += 1;
163        }
164        if pos == url_start {
165            return None;
166        }
167        inner[url_start..pos].to_string()
168    };
169
170    // After URL, try optional title. If a title attempt is malformed but we
171    // had to cross a newline to reach it, fall back to "no title, end of URL
172    // line" — the next line is then parsed independently (e.g.
173    // `[foo]: /url\n"title" ok\n` → ref def `[foo]: /url`, paragraph
174    // `"title" ok`).
175    let after_url = pos;
176    let url_line_end = consume_to_eol(bytes, after_url);
177    let url_line_end_lax = if strict_eol {
178        url_line_end
179    } else {
180        Some(consume_to_eol_lax(bytes, after_url))
181    };
182
183    let mut title: Option<String> = None;
184    let mut end_pos: Option<usize> = None;
185
186    if let Some(title_start) = skip_ws_one_newline(bytes, after_url) {
187        let crossed_newline = bytes[after_url..title_start]
188            .iter()
189            .any(|&b| b == b'\n' || b == b'\r');
190        // CommonMark §4.7: when the title is on the same line as the
191        // destination, it must be separated from the destination by at least
192        // one space or tab. `<bar>(baz)` (no whitespace between `>` and `(`)
193        // is therefore not a valid LRD under CommonMark; Pandoc accepts it.
194        let cmark_requires_separator = dialect == crate::options::Dialect::CommonMark
195            && !crossed_newline
196            && title_start == after_url;
197        if cmark_requires_separator {
198            return Some((
199                leading_spaces + url_line_end_lax?,
200                label.to_string(),
201                url,
202                None,
203            ));
204        }
205        let mut title_pos = title_start;
206        match parse_title(inner, bytes, &mut title_pos) {
207            Some(Some(t)) => {
208                let line_end = if strict_eol {
209                    consume_to_eol(bytes, title_pos)
210                } else {
211                    Some(consume_to_eol_lax(bytes, title_pos))
212                };
213                if let Some(end) = line_end {
214                    title = Some(t);
215                    end_pos = Some(end);
216                } else if !crossed_newline {
217                    return None;
218                }
219            }
220            None => {
221                if !crossed_newline {
222                    return None;
223                }
224            }
225            Some(None) => {}
226        }
227    }
228
229    let end = match end_pos {
230        Some(p) => p,
231        None => url_line_end_lax?,
232    };
233
234    Some((leading_spaces + end, label.to_string(), url, title))
235}
236
/// Lenient sibling of `consume_to_eol`: moves to the end of the current line
/// no matter what the line still contains.
///
/// Returns the index just past the line ending (`\n`, `\r`, or `\r\n` counted
/// as one), or `bytes.len()` when no line ending follows `pos`. A `pos`
/// beyond the end of `bytes` is returned unchanged.
fn consume_to_eol_lax(bytes: &[u8], pos: usize) -> usize {
    let rest = match bytes.get(pos..) {
        Some(rest) => rest,
        None => return pos,
    };
    match rest.iter().position(|&b| b == b'\n' || b == b'\r') {
        None => bytes.len(),
        Some(offset) => {
            let eol = pos + offset;
            let is_crlf = bytes[eol] == b'\r' && bytes.get(eol + 1) == Some(&b'\n');
            eol + if is_crlf { 2 } else { 1 }
        }
    }
}

/// Steps over spaces/tabs starting at `pos` and then over a single line
/// ending, if one is there.
///
/// Yields `Some(index)` just past the line ending (`\r\n` counts as one), or
/// at the end of input when only blanks remained. Yields `None` when any
/// other byte shows up before the line ending.
fn consume_to_eol(bytes: &[u8], pos: usize) -> Option<usize> {
    let blanks = bytes.get(pos..).map_or(0, |tail| {
        tail.iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count()
    });
    let at = pos + blanks;
    match bytes.get(at).copied() {
        None => Some(at),
        Some(b'\n') => Some(at + 1),
        Some(b'\r') if bytes.get(at + 1) == Some(&b'\n') => Some(at + 2),
        Some(b'\r') => Some(at + 1),
        Some(_) => None,
    }
}

/// Implements the "optional spaces or tabs (including up to one line
/// ending)" rule of CommonMark §4.7.
///
/// Starting at `pos`, blanks are skipped; if a line ending follows it is
/// consumed (`\r\n` as a unit) together with the blanks after it. The result
/// is `Some(index)` of the first byte not skipped, or `None` when a second
/// line ending turns up, i.e. when a blank line would have to be crossed.
fn skip_ws_one_newline(bytes: &[u8], pos: usize) -> Option<usize> {
    let skip_blanks = |from: usize| -> usize {
        let mut i = from;
        while matches!(bytes.get(i), Some(b' ') | Some(b'\t')) {
            i += 1;
        }
        i
    };
    let at_line_ending = |i: usize| matches!(bytes.get(i), Some(b'\n') | Some(b'\r'));

    let first_stop = skip_blanks(pos);
    if !at_line_ending(first_stop) {
        return Some(first_stop);
    }

    let is_crlf = bytes[first_stop] == b'\r' && bytes.get(first_stop + 1) == Some(&b'\n');
    let second_stop = skip_blanks(first_stop + if is_crlf { 2 } else { 1 });
    if at_line_ending(second_stop) {
        None
    } else {
        Some(second_stop)
    }
}

/// Reports whether `line` looks like a continuation line holding link
/// attributes.
///
/// The line has to start with a space or tab and, once trimmed, consist of
/// one or more `key=value` tokens and nothing else. A key is a non-empty run
/// of bytes up to `=`. A value is either a non-empty unquoted run up to the
/// next space/tab, or a non-empty string enclosed in matching `"` or `'`
/// (which may contain spaces).
pub fn line_is_mmd_link_attribute_continuation(line: &str) -> bool {
    if !line.starts_with(|c: char| c == ' ' || c == '\t') {
        return false;
    }

    let is_blank = |b: u8| b == b' ' || b == b'\t';
    // `rest` always holds the part of the trimmed line not yet consumed.
    let mut rest = line.trim().as_bytes();
    let mut pairs = 0usize;

    loop {
        // Drop separators between tokens.
        let gap = rest.iter().take_while(|&&b| is_blank(b)).count();
        rest = &rest[gap..];
        if rest.is_empty() {
            break;
        }

        // Key: everything before `=`; it may not be empty or contain blanks.
        let key_len = rest
            .iter()
            .take_while(|&&b| b != b'=' && !is_blank(b))
            .count();
        if key_len == 0 || rest.get(key_len) != Some(&b'=') {
            return false;
        }
        rest = &rest[key_len + 1..];

        // Value: quoted or bare, never empty.
        let value_len = match rest.first() {
            None => return false,
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                match rest[1..].iter().position(|&b| b == quote) {
                    // Opening quote + content + closing quote.
                    Some(content_len) if content_len > 0 => content_len + 2,
                    // Empty or unterminated quoted value.
                    _ => return false,
                }
            }
            Some(_) => {
                let bare_len = rest.iter().take_while(|&&b| !is_blank(b)).count();
                if bare_len == 0 {
                    return false;
                }
                bare_len
            }
        };
        rest = &rest[value_len..];
        pairs += 1;
    }

    pairs > 0
}

/// Reads an optional link title beginning at `*pos`.
///
/// A title is delimited by `"…"`, `'…'` or `(…)`; a backslash shields the
/// byte after it, so an escaped delimiter does not close the title. Line
/// endings inside the delimiters are treated like any other title byte.
///
/// `text` and `bytes` must describe the same string; `bytes` is used for
/// scanning and `text` for slicing out the title.
///
/// Outcomes:
/// * `Some(Some(title))` — a complete title was read; `*pos` is left after
///   the closing delimiter and any spaces/tabs that follow it.
/// * `Some(None)` — no title starts here. `*pos` is restored to its value on
///   entry when a non-title byte was found; when only whitespace remained,
///   `*pos` is left at the end of input.
/// * `None` — a title was opened but never closed.
fn parse_title(text: &str, bytes: &[u8], pos: &mut usize) -> Option<Option<String>> {
    let entry = *pos;

    // Step over any whitespace (line endings included) in front of the title.
    let mut cursor = entry;
    while cursor < bytes.len() && matches!(bytes[cursor], b' ' | b'\t' | b'\n' | b'\r') {
        cursor += 1;
    }

    let opener = match bytes.get(cursor) {
        Some(&b) => b,
        None => {
            // Nothing but whitespace was left.
            *pos = cursor;
            return Some(None);
        }
    };

    let closer = match opener {
        b'"' | b'\'' => opener,
        b'(' => b')',
        _ => {
            // Whatever follows is not a title; hand the position back.
            *pos = entry;
            return Some(None);
        }
    };

    // Search for the matching delimiter, honouring backslash escapes.
    let body_start = cursor + 1;
    let mut i = body_start;
    while i < bytes.len() {
        let current = bytes[i];
        if current == b'\\' {
            // Skip the backslash together with the byte it escapes.
            i += 2;
            continue;
        }
        if current == closer {
            // Swallow blanks trailing the closing delimiter.
            let mut after = i + 1;
            while after < bytes.len() && matches!(bytes[after], b' ' | b'\t') {
                after += 1;
            }
            *pos = after;
            return Some(Some(text[body_start..i].to_string()));
        }
        i += 1;
    }

    // The input ended before the title was closed.
    *pos = bytes.len();
    None
}

/// Recognises a footnote marker `[^id]:` at the very start of `line`.
///
/// On success returns `Some((id, content_start))`, where `content_start` is
/// the byte offset in `line` of the first byte after the colon and any
/// spaces/tabs following it. Returns `None` when `line` does not start with
/// `[^`, the id is empty or runs into a line ending before `]`, or `]` is not
/// immediately followed by `:`.
///
/// Syntax:
/// ```markdown
/// [^id]: Footnote content.
/// ```
pub fn try_parse_footnote_marker(line: &str) -> Option<(String, usize)> {
    const OPENER: &str = "[^";

    // Anything shorter than four bytes cannot hold a marker.
    if line.len() < 4 {
        return None;
    }
    let after_opener = line.strip_prefix(OPENER)?;

    // The id runs up to `]` and may not cross a line ending.
    let stop = after_opener.find(|c: char| matches!(c, ']' | '\n' | '\r'))?;
    if after_opener.as_bytes()[stop] != b']' {
        return None;
    }
    let id = &after_opener[..stop];
    if id.is_empty() {
        return None;
    }

    // `]` has to be followed directly by `:`.
    let after_colon = after_opener[stop + 1..].strip_prefix(':')?;

    // Blanks between the colon and the content belong to the marker.
    let blanks = after_colon
        .bytes()
        .take_while(|&b| b == b' ' || b == b'\t')
        .count();

    // Opener + id + `]` + `:` + blanks.
    let content_start = OPENER.len() + stop + 2 + blanks;
    Some((id.to_string(), content_start))
}

#[cfg(test)]
mod tests {
    use super::{line_is_mmd_link_attribute_continuation, try_parse_reference_definition};
    use crate::options::Dialect;
    use crate::syntax::SyntaxKind;

    /// Owned `(url, title)` view of a parse result, for compact comparisons.
    fn url_and_title(
        parsed: &Option<(usize, String, String, Option<String>)>,
    ) -> Option<(String, Option<String>)> {
        parsed
            .as_ref()
            .map(|(_, _, url, title)| (url.clone(), title.clone()))
    }

    // Parsing a footnote whose body starts on the next line must round-trip
    // the input text exactly.
    #[test]
    fn test_footnote_definition_body_layout_is_lossless() {
        let input = "[^note-on-refs]:\n    Note that if `--file-scope` is used,\n";
        let options = Some(crate::ParserOptions::default());
        let tree = crate::parse(input, options);
        assert_eq!(tree.text().to_string(), input);
    }

    // The marker `[^id]:` is expected to surface as four dedicated tokens
    // directly under the footnote definition node.
    #[test]
    fn test_footnote_definition_marker_emits_structural_tokens() {
        let input = "[^note-on-refs]: body\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");
        let token_kinds: Vec<_> = def
            .children_with_tokens()
            .filter_map(|element| element.into_token().map(|token| token.kind()))
            .collect();
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_START));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_ID));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_END));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_COLON));
    }

    // `$$ … $$` inside a footnote body must come out as display math even
    // when it wraps a `\begin{…}` environment.
    #[test]
    fn footnote_multiline_dollar_math_parses_as_display_math_not_tex_block() {
        let input = "[^note]: Intro line before math:\n    $$\n    \\begin{aligned} a &= b \\\\ c &= d \\end{aligned}\n    $$\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));

        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");

        let contains_kind = |kind: SyntaxKind| def.descendants().any(|n| n.kind() == kind);
        let has_display_math = contains_kind(SyntaxKind::DISPLAY_MATH);
        let has_tex_block = contains_kind(SyntaxKind::TEX_BLOCK);

        assert!(
            has_display_math,
            "Expected DISPLAY_MATH in footnote definition, got:\n{}",
            tree
        );
        assert!(
            !has_tex_block,
            "Did not expect TEX_BLOCK in footnote definition for $$...$$ math, got:\n{}",
            tree
        );
    }

    // Three leading spaces are still a definition; four are not.
    #[test]
    fn test_reference_definition_with_up_to_three_leading_spaces() {
        let dialect = Dialect::Pandoc;
        let three_spaces = try_parse_reference_definition("   [foo]: #bar", dialect);
        let four_spaces = try_parse_reference_definition("    [foo]: #bar", dialect);
        assert!(three_spaces.is_some());
        assert!(four_spaces.is_none());
    }

    #[test]
    fn test_reference_definition_commonmark_requires_separator_before_title() {
        let bar_with_baz_title = Some(("bar".to_string(), Some("baz".to_string())));

        // Pandoc: title `(baz)` directly attached after `<bar>` is accepted.
        let pandoc = try_parse_reference_definition("[foo]: <bar>(baz)\n", Dialect::Pandoc);
        assert_eq!(url_and_title(&pandoc), bar_with_baz_title);

        // CommonMark: same input is not a valid LRD because the title `(baz)`
        // is not space-separated from the destination; the parser rejects the
        // candidate so the dispatcher falls back to a paragraph.
        let cmark = try_parse_reference_definition("[foo]: <bar>(baz)\n", Dialect::CommonMark);
        assert!(cmark.is_none());

        // CommonMark with a space before the title does parse as an LRD with a
        // title.
        let cmark_ok = try_parse_reference_definition("[foo]: <bar> (baz)\n", Dialect::CommonMark);
        assert_eq!(url_and_title(&cmark_ok), bar_with_baz_title);
    }

    #[test]
    fn mmd_link_attribute_continuation_detects_valid_tokens() {
        let unquoted = "    width=20px height=30px id=myId";
        let quoted = "\tclass=\"myClass1 myClass2\"";
        assert!(line_is_mmd_link_attribute_continuation(unquoted));
        assert!(line_is_mmd_link_attribute_continuation(quoted));
    }

    #[test]
    fn mmd_link_attribute_continuation_rejects_non_attribute_lines() {
        let not_indented = "not-indented width=20px";
        let not_key_value = "    not-an-attr token";
        assert!(!line_is_mmd_link_attribute_continuation(not_indented));
        assert!(!line_is_mmd_link_attribute_continuation(not_key_value));
    }
}
// panache_parser/parser/blocks/reference_links.rs