panache_parser/parser/inlines/refdef_map.rs
1//! Document-level link reference definition map for CommonMark inline
2//! parsing.
3//!
4//! CommonMark §6.3 says reference links are valid only when the label
5//! matches a definition that appears anywhere in the document, including
6//! after the use site. The block-level parser already recognises
7//! `[label]: dest` lines and emits them as separate blocks, but inline
8//! parsing has historically treated every `[bracket pair]` as opaque on
9//! shape alone — without checking whether the label resolves.
10//!
11//! The fix is a single forward scan over the input *before* inline
12//! parsing runs, collecting every refdef label into a [`RefdefMap`].
13//! The IR's bracket resolution pass consults this map to decide whether
14//! a `[...]` (or `[...][...]`) opens a link or falls through to literal
15//! text.
16//!
17//! ## Scope
18//!
19//! - The set is computed once per `Parser::parse` call from the original
20//! input string and shared (via `Arc`) with every inline parsing
21//! invocation that needs it. Inline fragments (e.g. heading text,
22//! paragraph text, table cell text) do not contain refdef definitions
23//! themselves, so a *fragment-level* scan is insufficient.
24//!
25//! - Labels are normalised per CommonMark §4.7: case-folded, leading and
26//! trailing whitespace stripped, internal whitespace runs collapsed to
27//! a single space. The same normalisation applies on the lookup side
28//! in the bracket resolution pass.
29//!
//! - The scan does not track code fences or raw HTML blocks, so a
//!   refdef-shaped line inside one is collected like any other; it
//!   accepts this small over-approximation in exchange
32//! for being a context-free linear walk. A bracket label that happens
33//! to *spell* a defined refdef inside a fenced code block would still
34//! resolve correctly under emission because emission walks the CST,
35//! which already excludes the fenced region. The over-approximation
36//! only matters if a bogus refdef-shaped line *outside* a code block
37//! would shadow real text — that case is also wrong under CommonMark
38//! semantics, so the approximation is fine.
39
40use crate::options::Dialect;
41use std::collections::HashSet;
42use std::sync::Arc;
43
44use crate::parser::blocks::reference_links::try_parse_reference_definition;
45
/// Shared, immutable set of normalised refdef labels for one document.
///
/// The set is built once by [`collect_refdef_labels`]; the `Arc` wrapper
/// makes handing a copy to each inline parsing call a reference-count
/// bump rather than a clone of the whole set.
pub type RefdefMap = Arc<HashSet<String>>;
50
/// Upper bound on how many continuation lines `strip_blockquote_line`
/// will walk when [`collect_refdef_labels`] probes a `>`-prefixed line.
///
/// The usual stop is a blank `>` line, since a refdef cannot contain a
/// blank line. This cap is the backstop for a blockquote that has
/// content on every line and no blank separators: it keeps the work done
/// per probed line bounded by a constant, so the scan as a whole stays
/// linear in the input size. The strip examines the first line plus up
/// to this many continuation lines. A wrapped refdef needs only a few
/// lines (label, destination, title open, title close), so 32 is
/// generous.
const MAX_BLOCKQUOTE_REFDEF_LINES: usize = 32;
60
/// Normalise a refdef label per CommonMark §4.7 so that definition and
/// use sites compare equal.
///
/// The result is built in a single pass:
///
/// - Leading and trailing whitespace is dropped, and every internal run
///   of whitespace (spaces, tabs, line endings) becomes one space. Both
///   fall out of iterating the whitespace-separated words and joining
///   them with a single space.
/// - Each character is lowercased, and any `ß` that results is expanded
///   to `ss`. This approximates the Unicode case folding that CommonMark
///   asks for: plain lowercasing maps `ẞ` to `ß`, whereas folding maps
///   both to `ss`. It is the only multi-character fold the spec examples
///   exercise beyond ASCII (example #540).
pub fn normalize_label(label: &str) -> String {
    let mut normalised = String::with_capacity(label.len());
    for (word_idx, word) in label.split_whitespace().enumerate() {
        if word_idx > 0 {
            normalised.push(' ');
        }
        for folded in word.chars().flat_map(char::to_lowercase) {
            if folded == 'ß' {
                normalised.push_str("ss");
            } else {
                normalised.push(folded);
            }
        }
    }
    normalised
}
93
94/// Walk the input string once and collect all reference definitions into
95/// a [`RefdefMap`]. Only used for `Dialect::CommonMark`; callers should
96/// pass an empty (or `None`) map for other dialects.
97///
98/// The scanner is line-based: at each line-start, it strips any
99/// blockquote markers (`> ` / `>` runs) — refdefs are valid inside a
100/// blockquote per CommonMark §4.7 (spec example #218) — and tries
101/// [`try_parse_reference_definition`] on the surviving bytes. When the
102/// parser reports a multi-line consumption the cursor advances past the
103/// whole refdef in one step.
104pub fn collect_refdef_labels(input: &str, dialect: Dialect) -> RefdefMap {
105 let mut set: HashSet<String> = HashSet::new();
106 let bytes = input.as_bytes();
107 let mut pos = 0;
108
109 while pos < bytes.len() {
110 // Cheap leading-byte gate: a refdef line starts with `[` after
111 // up to 3 spaces, or with `>` (blockquote-wrapped). Anything
112 // else can't be a refdef — skip the full
113 // `try_parse_reference_definition` scan and the blockquote
114 // strip+retry. Most lines in a typical doc fail this gate.
115 let mut gate = pos;
116 while gate < bytes.len() && gate - pos < 3 && bytes[gate] == b' ' {
117 gate += 1;
118 }
119 let gate_byte = bytes.get(gate).copied();
120 if gate_byte == Some(b'[') {
121 if let Some((consumed, label, _url, _title)) =
122 try_parse_reference_definition(&input[pos..], dialect)
123 {
124 set.insert(normalize_label(&label));
125 pos += consumed.max(1);
126 continue;
127 }
128 } else if gate_byte == Some(b'>')
129 && let Some(stripped) =
130 strip_blockquote_line(&input[pos..], MAX_BLOCKQUOTE_REFDEF_LINES)
131 && let Some((_, label, _, _)) = try_parse_reference_definition(&stripped, dialect)
132 {
133 set.insert(normalize_label(&label));
134 }
135
136 match memchr_newline(&bytes[pos..]) {
137 Some(off) => {
138 pos += off + 1;
139 }
140 None => break,
141 }
142 }
143
144 Arc::new(set)
145}
146
147fn memchr_newline(bytes: &[u8]) -> Option<usize> {
148 memchr::memchr(b'\n', bytes)
149}
150
/// Byte length of the blockquote marker that opens `line`: up to three
/// leading spaces, the `>` itself, and at most one space after it.
/// Returns `None` when `line` does not open with such a marker.
///
/// Everything counted is ASCII, so the returned length is always a valid
/// `str` slice boundary for the line the bytes came from.
fn blockquote_marker_len(line: &[u8]) -> Option<usize> {
    let indent = line.iter().take(3).take_while(|&&b| b == b' ').count();
    if line.get(indent) != Some(&b'>') {
        return None;
    }
    let after_marker = indent + 1;
    // Optional single space after `>`.
    Some(after_marker + usize::from(line.get(after_marker) == Some(&b' ')))
}

/// `true` if the line starting at `text[0]` begins with a blockquote
/// marker (`>` after up to 3 leading spaces).
fn line_starts_with_blockquote(text: &str) -> bool {
    blockquote_marker_len(text.as_bytes()).is_some()
}

/// Strip the leading blockquote marker from the first line of `text` and
/// from its continuation lines, producing the inner text suitable for
/// refdef parsing. Returns `None` if `text` does not start with a
/// blockquote marker.
///
/// Only one marker level is removed per line: for a nested quote such as
/// `> > x` the inner `> x` is left in the output.
///
/// Refdefs can span multiple lines (e.g. the title can wrap), so
/// continuation lines are stripped too. The walk stops at:
///
/// - a line that does not start with a blockquote marker;
/// - a continuation line that is blank after stripping (a bare `>`
///   separator). A blank line terminates any refdef per CommonMark §4.7,
///   so the walk must not run on into the next blockquote paragraph. The
///   first line is exempt from this check; and
/// - the line budget: the first line plus at most `max_lines`
///   continuation lines are examined, a hard cap so a blockquote with no
///   blank lines cannot make one call walk the whole remaining input.
///
/// The output buffer starts empty and grows only with what is actually
/// copied. Reserving `text.len()` up front would allocate a buffer the
/// size of the entire remaining document on every call, defeating the
/// per-call bound above.
fn strip_blockquote_line(text: &str, max_lines: usize) -> Option<String> {
    if !line_starts_with_blockquote(text) {
        return None;
    }

    let mut out = String::new();
    // Index 0 is the candidate line itself; indices 1..=max_lines are its
    // continuation lines.
    let budget = max_lines.saturating_add(1);
    for (idx, line) in text.split_inclusive('\n').take(budget).enumerate() {
        let Some(marker_len) = blockquote_marker_len(line.as_bytes()) else {
            // Not a blockquote continuation — stop.
            break;
        };
        let rest = &line[marker_len..];
        // A blank `>` line ends the refdef. The first line is never
        // dropped this way, so the caller always gets its content.
        if idx > 0 && rest.trim().is_empty() {
            break;
        }
        out.push_str(rest);
    }
    Some(out)
}
213
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the collector over `input` with the CommonMark dialect.
    fn collect(input: &str) -> RefdefMap {
        collect_refdef_labels(input, Dialect::CommonMark)
    }

    /// Assert that every label in `expected` was collected from `input`.
    fn assert_collects(input: &str, expected: &[&str]) {
        let map = collect(input);
        for label in expected {
            assert!(map.contains(*label));
        }
    }

    /// Assert that nothing at all was collected from `input`.
    fn assert_collects_nothing(input: &str) {
        assert!(collect(input).is_empty());
    }

    #[test]
    fn collects_simple_refdef() {
        assert_collects("[foo]: /url\n", &["foo"]);
    }

    #[test]
    fn collects_multiple_refdefs() {
        assert_collects("[foo]: /a\n[bar]: /b\n[baz]: /c\n", &["foo", "bar", "baz"]);
    }

    #[test]
    fn does_not_collect_non_refdef_lines() {
        assert_collects_nothing("Just a paragraph.\n\nAnother one.\n");
    }

    #[test]
    fn collects_after_paragraph() {
        assert_collects("Some paragraph.\n\n[foo]: /url\n", &["foo"]);
    }

    #[test]
    fn case_folded_label() {
        assert_collects("[FOO Bar]: /url\n", &["foo bar"]);
    }

    #[test]
    fn collapses_internal_whitespace() {
        assert_eq!(normalize_label("  foo   bar\tbaz  "), "foo bar baz");
    }

    #[test]
    fn collects_blockquote_wrapped_refdef() {
        // CommonMark spec example #218: a refdef inside a blockquote.
        assert_collects("> [foo]: /url\n>\n> [foo]\n", &["foo"]);
    }

    #[test]
    fn collects_blockquote_refdef_with_wrapped_title() {
        // Destination and title sit on continuation lines, each of which
        // still carries the `>` marker.
        assert_collects("> [foo]:\n> /url\n> \"the title\"\n", &["foo"]);
    }

    #[test]
    fn collects_refdef_in_multiparagraph_blockquote() {
        // Refdefs in the first and last paragraph of one blockquote, with
        // blank `>` separators in between. Finding both shows that
        // stopping at a blank line does not hide later definitions.
        assert_collects(
            "> [foo]: /url\n>\n> first paragraph\n>\n> second paragraph\n>\n> [bar]: /url2\n",
            &["foo", "bar"],
        );
    }

    #[test]
    fn ignores_non_refdef_blockquote_lines() {
        assert_collects_nothing("> just some quoted text\n> more text\n");
    }

    #[test]
    fn label_523_is_not_collected() {
        // CMark example 523 contains no refdef, so the bracket must fall
        // through to literal text under bracket resolution.
        assert_collects_nothing("*foo [bar* baz]\n");
    }
}