Skip to main content

panache_parser/parser/inlines/
refdef_map.rs

1//! Document-level link reference definition map for CommonMark inline
2//! parsing.
3//!
4//! CommonMark §6.3 says reference links are valid only when the label
5//! matches a definition that appears anywhere in the document, including
6//! after the use site. The block-level parser already recognises
7//! `[label]: dest` lines and emits them as separate blocks, but inline
8//! parsing has historically treated every `[bracket pair]` as opaque on
9//! shape alone — without checking whether the label resolves.
10//!
11//! The fix is a single forward scan over the input *before* inline
12//! parsing runs, collecting every refdef label into a [`RefdefMap`].
13//! The IR's bracket resolution pass consults this map to decide whether
14//! a `[...]` (or `[...][...]`) opens a link or falls through to literal
15//! text.
16//!
17//! ## Scope
18//!
19//! - The set is computed once per `Parser::parse` call from the original
20//!   input string and shared (via `Arc`) with every inline parsing
21//!   invocation that needs it. Inline fragments (e.g. heading text,
22//!   paragraph text, table cell text) do not contain refdef definitions
23//!   themselves, so a *fragment-level* scan is insufficient.
24//!
25//! - Labels are normalised per CommonMark §4.7: case-folded, leading and
26//!   trailing whitespace stripped, internal whitespace runs collapsed to
27//!   a single space. The same normalisation applies on the lookup side
28//!   in the bracket resolution pass.
29//!
30//! - The scan does not attempt to detect refdefs inside code fences or
31//!   raw HTML blocks; it accepts a small over-approximation in exchange
32//!   for being a context-free linear walk. A bracket label that happens
33//!   to *spell* a defined refdef inside a fenced code block would still
34//!   resolve correctly under emission because emission walks the CST,
35//!   which already excludes the fenced region. The over-approximation
36//!   only matters if a bogus refdef-shaped line *outside* a code block
37//!   would shadow real text — that case is also wrong under CommonMark
38//!   semantics, so the approximation is fine.
39
40use crate::options::Dialect;
41use std::collections::HashSet;
42use std::sync::Arc;
43
44use crate::parser::blocks::reference_links::try_parse_reference_definition;
45
/// Shared, immutable collection of normalised refdef labels gathered
/// from a whole document.
///
/// The set lives behind an `Arc`, so handing it to each inline parsing
/// call is a reference-count bump rather than a copy of the labels.
pub type RefdefMap = Arc<HashSet<String>>;
50
/// Upper bound on how many continuation lines the blockquote refdef
/// strip is allowed to walk for a single candidate line.
///
/// The usual stop is a blank `>` line, because a refdef cannot contain
/// a blank line. This cap is the fallback for a blockquote whose every
/// line has content: it keeps the work done per `>` line bounded by a
/// constant so the document scan as a whole stays linear. A wrapped
/// refdef normally needs only a few lines (label, destination, title
/// open, title close), so 32 is a generous ceiling.
const MAX_BLOCKQUOTE_REFDEF_LINES: usize = 32;
60
/// Produce the canonical form of a refdef label (CommonMark §4.7) so
/// that definitions and uses can be compared as plain strings.
///
/// The result is built in three conceptual steps:
///
/// 1. Leading and trailing whitespace is dropped.
/// 2. Each internal run of whitespace (spaces, tabs, line endings, in
///    any mixture) becomes exactly one space.
/// 3. The text is case-folded. CommonMark asks for Unicode case folding
///    rather than lowercasing; the two disagree where the folded form is
///    longer than the lowercase form, the German sharp S being the
///    notable case (`ẞ` lowercases to `ß` but folds to `ss`). This
///    function approximates folding by lowercasing each character and
///    expanding any resulting `ß` to `ss`, which matches the test
///    renderer's `normalize_label` and covers the only multi-character
///    fold spec.txt exercises beyond ASCII (spec example #540).
pub fn normalize_label(label: &str) -> String {
    let mut folded = String::with_capacity(label.len());
    // `split_whitespace` both trims the ends and treats every whitespace
    // run as a single separator, so joining the pieces with one space
    // performs steps 1 and 2 together.
    for (idx, word) in label.split_whitespace().enumerate() {
        if idx > 0 {
            folded.push(' ');
        }
        // Lowercase character by character; a single char may lowercase
        // to several chars, hence the `flat_map`.
        for lower in word.chars().flat_map(char::to_lowercase) {
            if lower == 'ß' {
                folded.push_str("ss");
            } else {
                folded.push(lower);
            }
        }
    }
    folded
}
93
94/// Walk the input string once and collect all reference definitions into
95/// a [`RefdefMap`]. Only used for `Dialect::CommonMark`; callers should
96/// pass an empty (or `None`) map for other dialects.
97///
98/// The scanner is line-based: at each line-start, it strips any
99/// blockquote markers (`> ` / `>` runs) — refdefs are valid inside a
100/// blockquote per CommonMark §4.7 (spec example #218) — and tries
101/// [`try_parse_reference_definition`] on the surviving bytes. When the
102/// parser reports a multi-line consumption the cursor advances past the
103/// whole refdef in one step.
104pub fn collect_refdef_labels(input: &str, dialect: Dialect) -> RefdefMap {
105    let mut set: HashSet<String> = HashSet::new();
106    let bytes = input.as_bytes();
107    let mut pos = 0;
108
109    while pos < bytes.len() {
110        // Cheap leading-byte gate: a refdef line starts with `[` after
111        // up to 3 spaces, or with `>` (blockquote-wrapped). Anything
112        // else can't be a refdef — skip the full
113        // `try_parse_reference_definition` scan and the blockquote
114        // strip+retry. Most lines in a typical doc fail this gate.
115        let mut gate = pos;
116        while gate < bytes.len() && gate - pos < 3 && bytes[gate] == b' ' {
117            gate += 1;
118        }
119        let gate_byte = bytes.get(gate).copied();
120        if gate_byte == Some(b'[') {
121            if let Some((consumed, label, _url, _title)) =
122                try_parse_reference_definition(&input[pos..], dialect)
123            {
124                set.insert(normalize_label(&label));
125                pos += consumed.max(1);
126                continue;
127            }
128        } else if gate_byte == Some(b'>')
129            && let Some(stripped) =
130                strip_blockquote_line(&input[pos..], MAX_BLOCKQUOTE_REFDEF_LINES)
131            && let Some((_, label, _, _)) = try_parse_reference_definition(&stripped, dialect)
132        {
133            set.insert(normalize_label(&label));
134        }
135
136        match memchr_newline(&bytes[pos..]) {
137            Some(off) => {
138                pos += off + 1;
139            }
140            None => break,
141        }
142    }
143
144    Arc::new(set)
145}
146
/// Offset of the first `\n` byte in `bytes`, or `None` when the slice
/// contains no line feed.
fn memchr_newline(bytes: &[u8]) -> Option<usize> {
    bytes.iter().position(|byte| matches!(byte, b'\n'))
}
150
/// Reports whether the line at the start of `text` opens with a
/// blockquote marker: a `>` preceded by no more than three spaces.
fn line_starts_with_blockquote(text: &str) -> bool {
    let bytes = text.as_bytes();
    // Count leading spaces, but never look past the third byte.
    let indent = bytes.iter().take(3).take_while(|&&b| b == b' ').count();
    bytes.get(indent) == Some(&b'>')
}
161
162/// Strip leading blockquote markers from a single line plus its possible
163/// continuation lines, producing the inner text suitable for refdef
164/// parsing. Returns `None` if no blockquote prefix is present.
165///
166/// Refdefs can span multiple lines (e.g. the title can wrap), so we
167/// strip blockquote markers from each continuation line too. We stop at:
168///
169/// - a line that doesn't continue with a blockquote marker;
170/// - a continuation line that is blank after marker-stripping (a `>`
171///   separator) — a blank line terminates any refdef per CommonMark
172///   §4.7, so we must not slurp past it into the next blockquote
173///   paragraph; and
174/// - after `max_lines` continuation lines, as a hard O(K) cap so a
175///   blockquote with no blank lines can't make this O(N) per call.
176///
177/// Without these bounds the walk copies the entire remaining blockquote
178/// body on every `>`-line, which is O(N²) over a large blockquote.
179fn strip_blockquote_line(text: &str, max_lines: usize) -> Option<String> {
180    if !line_starts_with_blockquote(text) {
181        return None;
182    }
183    let mut out = String::with_capacity(text.len());
184    for (idx, line) in text.split_inclusive('\n').enumerate() {
185        if idx > max_lines {
186            break;
187        }
188        let bytes = line.as_bytes();
189        let mut i = 0;
190        while i < bytes.len() && i < 3 && bytes[i] == b' ' {
191            i += 1;
192        }
193        if bytes.get(i) != Some(&b'>') {
194            // Not a blockquote continuation — stop.
195            break;
196        }
197        i += 1;
198        // Optional single space after `>`.
199        if bytes.get(i) == Some(&b' ') {
200            i += 1;
201        }
202        let rest = &line[i..];
203        // A blank blockquote line (`>` with nothing after) terminates the
204        // refdef. Only applies to continuation lines: the first line is a
205        // refdef candidate by construction (the `>` gate matched content).
206        if idx > 0 && rest.trim().is_empty() {
207            break;
208        }
209        out.push_str(rest);
210    }
211    Some(out)
212}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the collector over `input` with the CommonMark dialect.
    fn labels_of(input: &str) -> RefdefMap {
        collect_refdef_labels(input, Dialect::CommonMark)
    }

    #[test]
    fn collects_simple_refdef() {
        assert!(labels_of("[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn collects_multiple_refdefs() {
        let found = labels_of("[foo]: /a\n[bar]: /b\n[baz]: /c\n");
        assert!(found.contains("foo"));
        assert!(found.contains("bar"));
        assert!(found.contains("baz"));
    }

    #[test]
    fn does_not_collect_non_refdef_lines() {
        assert!(labels_of("Just a paragraph.\n\nAnother one.\n").is_empty());
    }

    #[test]
    fn collects_after_paragraph() {
        assert!(labels_of("Some paragraph.\n\n[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn case_folded_label() {
        assert!(labels_of("[FOO Bar]: /url\n").contains("foo bar"));
    }

    #[test]
    fn collapses_internal_whitespace() {
        assert_eq!(normalize_label("  foo   bar\tbaz  "), "foo bar baz");
    }

    #[test]
    fn collects_blockquote_wrapped_refdef() {
        // CommonMark spec example #218: the definition sits inside a
        // blockquote.
        assert!(labels_of("> [foo]: /url\n>\n> [foo]\n").contains("foo"));
    }

    #[test]
    fn collects_blockquote_refdef_with_wrapped_title() {
        // Destination and title are on their own continuation lines,
        // each prefixed with `>`.
        assert!(labels_of("> [foo]:\n>   /url\n>   \"the title\"\n").contains("foo"));
    }

    #[test]
    fn collects_refdef_in_multiparagraph_blockquote() {
        // Definitions in the first and last paragraph of one blockquote,
        // separated by blank `>` lines. Finding both shows that stopping
        // at a blank line does not hide later definitions.
        let found = labels_of(
            "> [foo]: /url\n>\n> first paragraph\n>\n> second paragraph\n>\n> [bar]: /url2\n",
        );
        assert!(found.contains("foo"));
        assert!(found.contains("bar"));
    }

    #[test]
    fn ignores_non_refdef_blockquote_lines() {
        assert!(labels_of("> just some quoted text\n> more text\n").is_empty());
    }

    #[test]
    fn label_523_is_not_collected() {
        // CMark example 523 defines nothing, so bracket resolution must
        // leave the bracket as literal text.
        assert!(labels_of("*foo [bar* baz]\n").is_empty());
    }
}