panache_parser/parser/inlines/refdef_map.rs
1//! Document-level link reference definition map for CommonMark inline
2//! parsing.
3//!
4//! CommonMark §6.3 says reference links are valid only when the label
5//! matches a definition that appears anywhere in the document, including
6//! after the use site. The block-level parser already recognises
7//! `[label]: dest` lines and emits them as separate blocks, but inline
8//! parsing has historically treated every `[bracket pair]` as opaque on
9//! shape alone — without checking whether the label resolves.
10//!
11//! The fix is a single forward scan over the input *before* inline
12//! parsing runs, collecting every refdef label into a [`RefdefMap`].
13//! The IR's bracket resolution pass consults this map to decide whether
14//! a `[...]` (or `[...][...]`) opens a link or falls through to literal
15//! text.
16//!
17//! ## Scope
18//!
19//! - The set is computed once per `Parser::parse` call from the original
20//! input string and shared (via `Arc`) with every inline parsing
21//! invocation that needs it. Inline fragments (e.g. heading text,
22//! paragraph text, table cell text) do not contain refdef definitions
23//! themselves, so a *fragment-level* scan is insufficient.
24//!
25//! - Labels are normalised per CommonMark §4.7: case-folded, leading and
26//! trailing whitespace stripped, internal whitespace runs collapsed to
27//! a single space. The same normalisation applies on the lookup side
28//! in the bracket resolution pass.
29//!
//! - The scan does not track code fences or raw HTML blocks, so a
//!   refdef-shaped line inside one is collected as well; it accepts that
//!   small over-approximation in exchange for being a context-free
//!   linear walk. A bracket label that happens
33//! to *spell* a defined refdef inside a fenced code block would still
34//! resolve correctly under emission because emission walks the CST,
35//! which already excludes the fenced region. The over-approximation
36//! only matters if a bogus refdef-shaped line *outside* a code block
37//! would shadow real text — that case is also wrong under CommonMark
38//! semantics, so the approximation is fine.
39
40use crate::options::Dialect;
41use std::collections::HashSet;
42use std::sync::Arc;
43
44use crate::parser::blocks::reference_links::try_parse_reference_definition;
45
/// Set of normalised refdef labels collected from the document. Despite
/// the `Map` in the name this is a set: only the labels are stored, not
/// the destinations or titles. Wrapped in `Arc` so the (immutable) set
/// can be cheaply cloned into every inline parsing call.
pub type RefdefMap = Arc<HashSet<String>>;
50
/// Produce the canonical form of a refdef label (CommonMark §4.7) so that
/// definitions and use sites can be compared by plain string equality.
///
/// The result is built as follows:
///
/// - Whitespace at either end is dropped.
/// - Every interior run of whitespace (spaces, tabs, line endings, in
///   any mixture) becomes exactly one ASCII space.
/// - Every other character is case-folded. CommonMark asks for Unicode
///   case folding, which differs from lowercasing where the folded form
///   is longer — most visibly the German sharp S (`ẞ` lowercases to `ß`
///   but folds to `ss`). This function approximates folding by
///   lowercasing each character and expanding any resulting `ß` to
///   `ss`; that mirrors the test renderer's `normalize_label` and covers
///   the only non-ASCII multi-character fold spec.txt exercises (spec
///   example #540).
pub fn normalize_label(label: &str) -> String {
    let mut normalized = String::with_capacity(label.len());
    // `split_whitespace` both trims the ends and treats each whitespace
    // run as a single separator, so joining the words with one space
    // performs the trim + collapse steps in one pass.
    for (idx, word) in label.split_whitespace().enumerate() {
        if idx > 0 {
            normalized.push(' ');
        }
        for folded in word.chars().flat_map(char::to_lowercase) {
            if folded == 'ß' {
                normalized.push_str("ss");
            } else {
                normalized.push(folded);
            }
        }
    }
    normalized
}
83
84/// Walk the input string once and collect all reference definitions into
85/// a [`RefdefMap`]. Only used for `Dialect::CommonMark`; callers should
86/// pass an empty (or `None`) map for other dialects.
87///
88/// The scanner is line-based: at each line-start, it strips any
89/// blockquote markers (`> ` / `>` runs) — refdefs are valid inside a
90/// blockquote per CommonMark §4.7 (spec example #218) — and tries
91/// [`try_parse_reference_definition`] on the surviving bytes. When the
92/// parser reports a multi-line consumption the cursor advances past the
93/// whole refdef in one step.
94pub fn collect_refdef_labels(input: &str, dialect: Dialect) -> RefdefMap {
95 let mut set: HashSet<String> = HashSet::new();
96 let bytes = input.as_bytes();
97 let mut pos = 0;
98
99 while pos < bytes.len() {
100 // Cheap leading-byte gate: a refdef line starts with `[` after
101 // up to 3 spaces, or with `>` (blockquote-wrapped). Anything
102 // else can't be a refdef — skip the full
103 // `try_parse_reference_definition` scan and the blockquote
104 // strip+retry. Most lines in a typical doc fail this gate.
105 let mut gate = pos;
106 while gate < bytes.len() && gate - pos < 3 && bytes[gate] == b' ' {
107 gate += 1;
108 }
109 let gate_byte = bytes.get(gate).copied();
110 if gate_byte == Some(b'[') {
111 if let Some((consumed, label, _url, _title)) =
112 try_parse_reference_definition(&input[pos..], dialect)
113 {
114 set.insert(normalize_label(&label));
115 pos += consumed.max(1);
116 continue;
117 }
118 } else if gate_byte == Some(b'>')
119 && let Some(stripped) = strip_blockquote_line(&input[pos..])
120 && let Some((_, label, _, _)) = try_parse_reference_definition(&stripped, dialect)
121 {
122 set.insert(normalize_label(&label));
123 }
124
125 match memchr_newline(&bytes[pos..]) {
126 Some(off) => {
127 pos += off + 1;
128 }
129 None => break,
130 }
131 }
132
133 Arc::new(set)
134}
135
/// Byte offset of the first `\n` in `bytes`, or `None` when the slice
/// contains no line feed.
fn memchr_newline(bytes: &[u8]) -> Option<usize> {
    bytes
        .iter()
        .enumerate()
        .find_map(|(offset, &byte)| (byte == b'\n').then_some(offset))
}
139
/// Reports whether `text` opens with a blockquote marker: a `>` preceded
/// by at most three spaces. Only the very start of `text` is inspected.
fn line_starts_with_blockquote(text: &str) -> bool {
    // Count leading spaces, capped at three; a fourth space is left in
    // place and therefore makes the check below fail.
    let indent = text.bytes().take(3).take_while(|&b| b == b' ').count();
    text.as_bytes().get(indent) == Some(&b'>')
}
150
151/// Strip leading blockquote markers from a single line plus its possible
152/// continuation lines, producing the inner text suitable for refdef
153/// parsing. Returns `None` if no blockquote prefix is present.
154///
155/// Refdefs can span multiple lines (e.g. the title can wrap), so we
156/// strip blockquote markers from each continuation line too. We stop at
157/// a blank line or a line that doesn't continue with a blockquote
158/// marker.
159fn strip_blockquote_line(text: &str) -> Option<String> {
160 if !line_starts_with_blockquote(text) {
161 return None;
162 }
163 let mut out = String::with_capacity(text.len());
164 for line in text.split_inclusive('\n') {
165 let bytes = line.as_bytes();
166 let mut i = 0;
167 while i < bytes.len() && i < 3 && bytes[i] == b' ' {
168 i += 1;
169 }
170 if bytes.get(i) != Some(&b'>') {
171 // Not a blockquote continuation — stop.
172 break;
173 }
174 i += 1;
175 // Optional single space after `>`.
176 if bytes.get(i) == Some(&b' ') {
177 i += 1;
178 }
179 out.push_str(&line[i..]);
180 }
181 Some(out)
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    /// Run the collector over `input` using the CommonMark dialect.
    fn scan(input: &str) -> RefdefMap {
        collect_refdef_labels(input, Dialect::CommonMark)
    }

    #[test]
    fn collects_simple_refdef() {
        assert!(scan("[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn collects_multiple_refdefs() {
        let labels = scan("[foo]: /a\n[bar]: /b\n[baz]: /c\n");
        for expected in ["foo", "bar", "baz"] {
            assert!(labels.contains(expected));
        }
    }

    #[test]
    fn does_not_collect_non_refdef_lines() {
        assert!(scan("Just a paragraph.\n\nAnother one.\n").is_empty());
    }

    #[test]
    fn collects_after_paragraph() {
        assert!(scan("Some paragraph.\n\n[foo]: /url\n").contains("foo"));
    }

    #[test]
    fn case_folded_label() {
        assert!(scan("[FOO Bar]: /url\n").contains("foo bar"));
    }

    #[test]
    fn collapses_internal_whitespace() {
        let normalized = normalize_label("  foo   bar\tbaz  ");
        assert_eq!(normalized, "foo bar baz");
    }

    #[test]
    fn label_523_is_not_collected() {
        // CMark example 523 contains no refdef at all, so nothing may be
        // collected and the bracket falls through to literal text under
        // bracket resolution.
        assert!(scan("*foo [bar* baz]\n").is_empty());
    }
}
235}