Skip to main content

dmc_parser/
refs.rs

1//! Link- and footnote-reference tables. CM 4.7 + 6.3 + GFM footnotes
2//! require a two-pass parse: first walk the token stream to harvest all
3//! definitions, then resolve `[label]` / `[text][label]` / `[label][]`
4//! references against the table during the main parse.
5
6use std::collections::HashMap;
7
8/// Resolved link reference: destination URL plus optional title.
9pub type LinkRef = (String, Option<String>);
10
11/// Lookup table built once per parse.
12#[derive(Debug, Default, Clone)]
13pub struct RefMap {
14  links: HashMap<String, LinkRef>,
15}
16
17impl RefMap {
18  pub fn new() -> Self {
19    Self::default()
20  }
21
22  /// First definition wins (CM 4.7).
23  pub fn insert(&mut self, label: &str, url: String, title: Option<String>) {
24    let key = normalize_label(label);
25    if !key.is_empty() {
26      self.links.entry(key).or_insert((url, title));
27    }
28  }
29
30  pub fn get(&self, label: &str) -> Option<&LinkRef> {
31    self.links.get(&normalize_label(label))
32  }
33
34  pub fn is_empty(&self) -> bool {
35    self.links.is_empty()
36  }
37}
38
39/// CM 4.7: case-insensitive comparison, internal whitespace collapsed
40/// to single spaces, leading/trailing whitespace trimmed. Backslash
41/// escapes resolve before comparison so `[Foo\]]` and `Foo]` match.
42pub fn normalize_label(s: &str) -> String {
43  // CM 4.7: normalize by case-fold + ws-collapse only. Backslash
44  // escapes are NOT unescaped during label matching, so `[foo\!]` and
45  // `[foo!]` match different labels.
46  let mut out = String::with_capacity(s.len());
47  let mut prev_ws = true;
48  for c in s.chars() {
49    if c.is_whitespace() {
50      if !prev_ws {
51        out.push(' ');
52        prev_ws = true;
53      }
54    } else {
55      push_case_folded(&mut out, c);
56      prev_ws = false;
57    }
58  }
59  if out.ends_with(' ') {
60    out.pop();
61  }
62  out
63}
64
/// Append the case-folded form of `c` to `out`.
///
/// CM 4.7 / 6.3 match link labels after Unicode case folding. This
/// uses `char::to_lowercase` as a lightweight stand-in, with one
/// explicit correction: `ß` (U+00DF) is expanded to `ss`. Capital `ẞ`
/// (U+1E9E) lowercases to `ß`, so without the expansion a `[ẞ]`
/// reference would not match a `[SS]:` definition.
///
/// Full Unicode case folding would need a dedicated mapping table or
/// crate; only the sharp-s case is special-cased here.
fn push_case_folded(out: &mut String, c: char) {
  c.to_lowercase().for_each(|lower| match lower {
    '\u{00DF}' => out.push_str("ss"),
    other => out.push(other),
  });
}
83
/// Parse the raw lexeme of a `LinkRefDef` token into
/// `(label, url, title)`.
///
/// Expected shape: `[label]:` followed by optional whitespace, a
/// destination, and an optional title.
///
/// * `label` is the text between `[` and the first unescaped `]`,
///   returned verbatim (escapes are not resolved).
/// * `url` is either the contents of a `<...>` destination (which may
///   contain spaces but no line ending) or a bare run of
///   non-whitespace bytes. It is returned verbatim as well, so an
///   escaped `\>` inside `<...>` is kept as written.
/// * `title` is the remainder of the lexeme when it is wrapped in
///   `"..."`, `'...'` or `(...)`, without the delimiters. Only the
///   first and last byte are inspected; delimiters inside the title
///   are not validated here.
///
/// Returns `None` on malformed input. The lexer already validated the
/// gross structure (`[label]:` plus a non-empty destination), so
/// failures here mostly mean a missing `]` or `:`. Other rejected
/// forms: an unterminated `<...>` destination, trailing text that is
/// not a well-delimited title, and a title that follows a `<...>`
/// destination with no whitespace in between (CM 4.7 requires the
/// separation).
pub fn parse_link_ref_def(raw: &str) -> Option<(String, String, Option<String>)> {
  // Spaces, tabs and both line-ending bytes separate the parts of a
  // definition. `\r` is included so CRLF input does not leak a stray
  // carriage return into the destination.
  let is_ws = |b: u8| matches!(b, b' ' | b'\t' | b'\n' | b'\r');

  let bytes = raw.as_bytes();
  if bytes.first() != Some(&b'[') {
    return None;
  }

  // Find the unescaped closing `]`. Every byte the scans below stop on
  // is ASCII, so all slice boundaries fall on char boundaries.
  let mut i = 1usize;
  while i < bytes.len() && bytes[i] != b']' {
    i += if bytes[i] == b'\\' && i + 1 < bytes.len() { 2 } else { 1 };
  }
  if i >= bytes.len() {
    return None;
  }
  let label = raw[1..i].to_string();
  if bytes.get(i + 1) != Some(&b':') {
    return None;
  }

  // Skip whitespace after the colon.
  let mut j = i + 2;
  while j < bytes.len() && is_ws(bytes[j]) {
    j += 1;
  }

  // Destination: bracketed `<...>` form (allows spaces) or bare run
  // up to the next whitespace.
  let (url, dest_end) = if bytes.get(j) == Some(&b'<') {
    let start = j + 1;
    let mut p = start;
    while p < bytes.len() && !matches!(bytes[p], b'>' | b'\n' | b'\r') {
      // A backslash followed by ASCII punctuation is an escape; step
      // over both bytes so `\>` does not end the destination early.
      let escaped = bytes[p] == b'\\'
        && matches!(bytes.get(p + 1), Some(next) if next.is_ascii_punctuation());
      p += if escaped { 2 } else { 1 };
    }
    if bytes.get(p) != Some(&b'>') {
      return None;
    }
    (raw[start..p].to_string(), p + 1)
  } else {
    let start = j;
    let mut p = start;
    while p < bytes.len() && !is_ws(bytes[p]) {
      p += 1;
    }
    if start == p {
      return None;
    }
    (raw[start..p].to_string(), p)
  };

  // Optional title: whatever follows the destination, wrapped in
  // matched `"..."`, `'...'`, or `(...)`.
  let mut k = dest_end;
  while k < bytes.len() && is_ws(bytes[k]) {
    k += 1;
  }
  let rest = raw[k..].trim_end();
  if rest.is_empty() {
    return Some((label, url, None));
  }
  // Something follows the destination. If no whitespace was skipped
  // it is glued to a `<...>` destination, which is not a valid title
  // position (a bare destination always ends at whitespace).
  if k == dest_end {
    return None;
  }

  let bs = rest.as_bytes();
  let closer = match *bs.first()? {
    b'"' => b'"',
    b'\'' => b'\'',
    b'(' => b')',
    _ => return None,
  };
  // `len >= 2` rejects a lone quote, where the opening byte would
  // otherwise double as its own closer.
  if rest.len() >= 2 && *bs.last()? == closer {
    Some((label, url, Some(rest[1..rest.len() - 1].to_string())))
  } else {
    None
  }
}