Skip to main content

panache_parser/parser/inlines/
refdef_map.rs

1//! Document-level link reference definition map for CommonMark inline
2//! parsing.
3//!
4//! CommonMark §6.3 says reference links are valid only when the label
5//! matches a definition that appears anywhere in the document, including
6//! after the use site. The block-level parser already recognises
7//! `[label]: dest` lines and emits them as separate blocks, but inline
8//! parsing has historically treated every `[bracket pair]` as opaque on
9//! shape alone — without checking whether the label resolves.
10//!
11//! The fix is a single forward scan over the input *before* inline
12//! parsing runs, collecting every refdef label into a [`RefdefMap`].
13//! The IR's bracket resolution pass consults this map to decide whether
14//! a `[...]` (or `[...][...]`) opens a link or falls through to literal
15//! text.
16//!
17//! ## Scope
18//!
19//! - The set is computed once per `Parser::parse` call from the original
20//!   input string and shared (via `Arc`) with every inline parsing
21//!   invocation that needs it. Inline fragments (e.g. heading text,
22//!   paragraph text, table cell text) do not contain refdef definitions
23//!   themselves, so a *fragment-level* scan is insufficient.
24//!
25//! - Labels are normalised per CommonMark §4.7: case-folded, leading and
26//!   trailing whitespace stripped, internal whitespace runs collapsed to
27//!   a single space. The same normalisation applies on the lookup side
28//!   in the bracket resolution pass.
29//!
30//! - The scan does not attempt to detect refdefs inside code fences or
31//!   raw HTML blocks; it accepts a small over-approximation in exchange
32//!   for being a context-free linear walk. A bracket label that happens
33//!   to *spell* a defined refdef inside a fenced code block would still
34//!   resolve correctly under emission because emission walks the CST,
35//!   which already excludes the fenced region. The over-approximation
36//!   only matters if a bogus refdef-shaped line *outside* a code block
37//!   would shadow real text — that case is also wrong under CommonMark
38//!   semantics, so the approximation is fine.
39
40use crate::options::Dialect;
41use std::collections::HashSet;
42use std::sync::Arc;
43
44use crate::parser::blocks::reference_links::try_parse_reference_definition;
45
/// Shared, immutable set of normalised refdef labels for one document.
///
/// Despite the `Map` in the name this is a *set*: only the labels are
/// kept, which is enough to answer "does this label resolve?".
/// [`collect_refdef_labels`] discards the destination and title of each
/// definition it finds. The `Arc` wrapper makes handing the set to each
/// inline parsing call a reference-count bump rather than a copy.
pub type RefdefMap = Arc<HashSet<String>>;
50
/// Produce the canonical matching form of a link label (CommonMark §4.7).
///
/// The result is built in three steps:
///
/// - leading and trailing whitespace is dropped;
/// - every interior run of whitespace (spaces, tabs, line endings, …)
///   becomes exactly one ASCII space;
/// - the remaining characters are case-folded.
///
/// Case folding is approximated: each character is lowercased on its own
/// (no context-sensitive rules such as Greek final sigma), and afterwards
/// every `ß` is expanded to `ss`. Full Unicode case folding would differ
/// for other multi-character folds, but the sharp S (`ẞ` → `ß` → `ss`) is
/// the only one spec.txt exercises beyond ASCII (spec example #540), and
/// this matches the test renderer's `normalize_label`.
pub fn normalize_label(label: &str) -> String {
    let mut folded = String::with_capacity(label.len());
    // `split_whitespace` both trims the ends and swallows interior runs,
    // so re-joining the pieces with one space performs the collapse.
    for (index, word) in label.split_whitespace().enumerate() {
        if index > 0 {
            folded.push(' ');
        }
        // Per-character lowercasing on purpose: `str::to_lowercase` applies
        // contextual rules that would change the output.
        folded.extend(word.chars().flat_map(char::to_lowercase));
    }
    folded.replace('ß', "ss")
}
83
84/// Walk the input string once and collect all reference definitions into
85/// a [`RefdefMap`]. Only used for `Dialect::CommonMark`; callers should
86/// pass an empty (or `None`) map for other dialects.
87///
88/// The scanner is line-based: at each line-start, it strips any
89/// blockquote markers (`> ` / `>` runs) — refdefs are valid inside a
90/// blockquote per CommonMark §4.7 (spec example #218) — and tries
91/// [`try_parse_reference_definition`] on the surviving bytes. When the
92/// parser reports a multi-line consumption the cursor advances past the
93/// whole refdef in one step.
94pub fn collect_refdef_labels(input: &str, dialect: Dialect) -> RefdefMap {
95    let mut set: HashSet<String> = HashSet::new();
96    let bytes = input.as_bytes();
97    let mut pos = 0;
98
99    while pos < bytes.len() {
100        // Cheap leading-byte gate: a refdef line starts with `[` after
101        // up to 3 spaces, or with `>` (blockquote-wrapped). Anything
102        // else can't be a refdef — skip the full
103        // `try_parse_reference_definition` scan and the blockquote
104        // strip+retry. Most lines in a typical doc fail this gate.
105        let mut gate = pos;
106        while gate < bytes.len() && gate - pos < 3 && bytes[gate] == b' ' {
107            gate += 1;
108        }
109        let gate_byte = bytes.get(gate).copied();
110        if gate_byte == Some(b'[') {
111            if let Some((consumed, label, _url, _title)) =
112                try_parse_reference_definition(&input[pos..], dialect)
113            {
114                set.insert(normalize_label(&label));
115                pos += consumed.max(1);
116                continue;
117            }
118        } else if gate_byte == Some(b'>')
119            && let Some(stripped) = strip_blockquote_line(&input[pos..])
120            && let Some((_, label, _, _)) = try_parse_reference_definition(&stripped, dialect)
121        {
122            set.insert(normalize_label(&label));
123        }
124
125        match memchr_newline(&bytes[pos..]) {
126            Some(off) => {
127                pos += off + 1;
128            }
129            None => break,
130        }
131    }
132
133    Arc::new(set)
134}
135
/// Offset of the first line feed (`\n`) in `bytes`, or `None` when the
/// slice contains none.
fn memchr_newline(bytes: &[u8]) -> Option<usize> {
    bytes
        .iter()
        .enumerate()
        .find_map(|(offset, &byte)| (byte == b'\n').then_some(offset))
}
139
/// Length in bytes of one blockquote marker at the start of `bytes`: up
/// to three spaces of indentation, a `>`, and one optional space after
/// it. Returns `None` when `bytes` does not begin with such a marker.
fn blockquote_marker_end(bytes: &[u8]) -> Option<usize> {
    let indent = bytes.iter().take(3).take_while(|&&b| b == b' ').count();
    if bytes.get(indent) != Some(&b'>') {
        return None;
    }
    let after_marker = indent + 1;
    if bytes.get(after_marker) == Some(&b' ') {
        Some(after_marker + 1)
    } else {
        Some(after_marker)
    }
}

/// `true` if the line starting at `text[0]` begins with a blockquote
/// marker (`>` after up to 3 leading spaces).
fn line_starts_with_blockquote(text: &str) -> bool {
    blockquote_marker_end(text.as_bytes()).is_some()
}

/// Strip leading blockquote markers from a single line plus its possible
/// continuation lines, producing the inner text suitable for refdef
/// parsing. Returns `None` if no blockquote prefix is present.
///
/// Every nesting level is removed from each line (`> > [foo]: /url` and
/// `>>[foo]: /url` both yield `[foo]: /url`), so a definition inside a
/// nested blockquote is visible to the refdef parser. Indentation that is
/// not followed by another `>` is content and is left untouched.
///
/// Refdefs can span multiple lines (e.g. the title can wrap), so the
/// markers are stripped from each continuation line too. Collection stops
/// at:
///
/// - the first line that does not start with a blockquote marker (this
///   includes a truly empty line), or
/// - the first quoted line whose content is blank. That line is still
///   emitted, but nothing after it, because a refdef cannot continue
///   across a blank line. Stopping here also keeps the copy proportional
///   to one paragraph rather than the whole remaining blockquote.
fn strip_blockquote_line(text: &str) -> Option<String> {
    if !line_starts_with_blockquote(text) {
        return None;
    }
    // Grow on demand: `text` is the whole rest of the document, so
    // reserving `text.len()` up front would over-allocate on every call.
    let mut out = String::new();
    for line in text.split_inclusive('\n') {
        let bytes = line.as_bytes();
        let Some(mut content_start) = blockquote_marker_end(bytes) else {
            // Not a blockquote continuation — stop.
            break;
        };
        // Peel any further nesting levels off the same line.
        while let Some(marker_len) = blockquote_marker_end(&bytes[content_start..]) {
            content_start += marker_len;
        }
        // Markers are ASCII, so `content_start` is always a char boundary.
        let content = &line[content_start..];
        out.push_str(content);
        if content.trim().is_empty() {
            break;
        }
    }
    Some(out)
}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the CommonMark refdef scan over `input`.
    fn scan(input: &str) -> RefdefMap {
        collect_refdef_labels(input, Dialect::CommonMark)
    }

    #[test]
    fn collects_simple_refdef() {
        assert!(scan("[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn collects_multiple_refdefs() {
        let labels = scan("[foo]: /a\n[bar]: /b\n[baz]: /c\n");
        for expected in ["foo", "bar", "baz"] {
            assert!(labels.contains(expected));
        }
    }

    #[test]
    fn does_not_collect_non_refdef_lines() {
        assert!(scan("Just a paragraph.\n\nAnother one.\n").is_empty());
    }

    #[test]
    fn collects_after_paragraph() {
        assert!(scan("Some paragraph.\n\n[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn case_folded_label() {
        assert!(scan("[FOO Bar]: /url\n").contains("foo bar"));
    }

    #[test]
    fn collapses_internal_whitespace() {
        let normalised = normalize_label("  foo   bar\tbaz  ");
        assert_eq!(normalised, "foo bar baz");
    }

    #[test]
    fn label_523_is_not_collected() {
        // CMark example 523 contains no refdef at all, so nothing may be
        // collected and bracket resolution leaves the bracket as literal
        // text.
        assert!(scan("*foo [bar* baz]\n").is_empty());
    }
}