Skip to main content

dmc_transform/builtin/
bare_url.rs

1//! Bare-URL autolinker. See `transformers/bare-url.md` for full docs.
2
3use crate::pipeline::Transformer;
4use crate::visit::{NodeAction, Visitor, walk_root};
5use dmc_diagnostic::Code;
6use dmc_diagnostic::metadata::SourceMeta;
7use dmc_parser::ast::*;
8
/// Wrap bare autolinks found in `Text` nodes with synthesised `Link` nodes.
///
/// Recognised prefixes are `http://`, `https://` and `www.` (the latter is
/// given an `http://` destination). Scans the direct children of
/// `Paragraph`, `Heading`, and the inline emphasis containers (`Bold`,
/// `Italic`, `Strikethrough`).
#[derive(Debug, Clone, Copy, Default)]
pub struct BareUrlAutolink;
13
14impl Transformer for BareUrlAutolink {
15  fn name(&self) -> &str {
16    "bare-url-autolink"
17  }
18  fn transform(
19    &self,
20    doc: &mut Document,
21    _meta: &SourceMeta,
22    _diag_engine: &mut duck_diagnostic::DiagnosticEngine<Code>,
23  ) {
24    let mut v = Apply;
25    walk_root(&mut doc.children, &mut v);
26  }
27}
28
29struct Apply;
30
31impl Apply {
32  /// Expand any `Text` whose value contains a URL into `[Text, Link, Text,
33  /// ...]` pieces. Non-Text nodes pass through.
34  fn rewrite_children(nodes: Vec<Node>) -> Vec<Node> {
35    let mut out = Vec::new();
36    for n in nodes {
37      if let Node::Text(t) = &n {
38        let pieces = Self::split_by_url(&t.value);
39        // No URL found if every piece is a Text (just the original
40        // string round-tripping). Otherwise rewrite into the
41        // text+link mix.
42        let any_url = pieces.iter().any(|p| matches!(p, Piece::Url(_) | Piece::Display(_)));
43        if !any_url {
44          out.push(n.clone());
45          continue;
46        }
47        let span = t.span.clone();
48        let mut iter = pieces.into_iter().peekable();
49        while let Some(piece) = iter.next() {
50          match piece {
51            Piece::Text(s) if !s.is_empty() => out.push(Node::Text(Text { value: s, span: span.clone() })),
52            Piece::Text(_) => {},
53            Piece::Url(href) => {
54              let display = match iter.peek() {
55                Some(Piece::Display(_)) => match iter.next() {
56                  Some(Piece::Display(d)) => d,
57                  _ => href.clone(),
58                },
59                _ => href.clone(),
60              };
61              out.push(Node::Link(Link {
62                href,
63                title: None,
64                children: vec![Node::Text(Text { value: display, span: span.clone() })],
65                span: span.clone(),
66              }));
67            },
68            Piece::Display(d) => {
69              // Stray Display without preceding Url -- emit as text.
70              if !d.is_empty() {
71                out.push(Node::Text(Text { value: d, span: span.clone() }));
72              }
73            },
74          }
75        }
76      } else {
77        out.push(n);
78      }
79    }
80    out
81  }
82
83  /// Split `s` into alternating `Text` / `Url` pieces around GFM
84  /// autolink runs: `http(s)://...` plus `www....`. URL boundary is
85  /// whitespace, `<`, or unbalanced `)`. Trailing `?!.,:*_~` is
86  /// trimmed as sentence punctuation; trailing `&entity;` is also
87  /// stripped because GFM treats the entity ref as following text.
88  fn split_by_url(s: &str) -> Vec<Piece> {
89    fn next_url_match(rest: &str) -> Option<(usize, &'static str)> {
90      // Find the earliest position where one of the GFM autolink
91      // prefixes starts at a valid boundary (start of string or
92      // preceded by a non-alphanumeric / `_`).
93      let bytes = rest.as_bytes();
94      let mut best: Option<(usize, &'static str)> = None;
95      for prefix in ["http://", "https://", "www."] {
96        if let Some(idx) = rest.find(prefix) {
97          let ok_boundary =
98            idx == 0 || matches!(bytes.get(idx - 1).copied(), Some(b) if !b.is_ascii_alphanumeric() && b != b'_');
99          if !ok_boundary {
100            continue;
101          }
102          if best.is_none_or(|(b, _)| idx < b) {
103            best = Some((idx, prefix));
104          }
105        }
106      }
107      best
108    }
109    fn url_body_end(after: &str) -> usize {
110      after.find(|c: char| c.is_whitespace() || c == '<').unwrap_or(after.len())
111    }
112    fn trim_trailing(s: &str) -> (&str, &str) {
113      let bytes = s.as_bytes();
114      let mut end = bytes.len();
115      loop {
116        if end == 0 {
117          break;
118        }
119        let last = bytes[end - 1];
120        // Strip trailing sentence punctuation.
121        if matches!(last, b'?' | b'!' | b'.' | b',' | b':' | b'*' | b'_' | b'~') {
122          end -= 1;
123          continue;
124        }
125        // Strip an unmatched `)` (more closes than opens in the
126        // current URL body).
127        if last == b')' {
128          let opens = bytes[..end].iter().filter(|&&b| b == b'(').count();
129          let closes = bytes[..end].iter().filter(|&&b| b == b')').count();
130          if closes > opens {
131            end -= 1;
132            continue;
133          }
134        }
135        // Strip a trailing `&entity;` (entity refs render as following
136        // text per GFM autolink rule).
137        if last == b';'
138          && let Some(amp) = bytes[..end - 1].iter().rposition(|&b| b == b'&')
139        {
140          let inner = &bytes[amp + 1..end - 1];
141          if !inner.is_empty() && inner.iter().all(|&b| b.is_ascii_alphanumeric()) {
142            end = amp;
143            continue;
144          }
145        }
146        break;
147      }
148      (&s[..end], &s[end..])
149    }
150
151    let mut out = Vec::new();
152    let mut rest = s;
153    while let Some((idx, prefix)) = next_url_match(rest) {
154      let before = &rest[..idx];
155      let after = &rest[idx..];
156      let url_end = url_body_end(after);
157      let raw = &after[..url_end];
158      let (url, trailing_punct) = trim_trailing(raw);
159      // GFM: `www.` autolinks require a `.` in the body after the prefix
160      // (the prefix itself ends with `.`). `trim_trailing` can shave the
161      // body down to (or below) the prefix length -- eg `www.` alone, or
162      // `www.` followed only by trailing punctuation -- so look up the
163      // body fallibly instead of slicing `url[prefix.len()..]` blindly.
164      if prefix == "www." && !url.get(prefix.len()..).is_some_and(|body| body.contains('.')) {
165        out.push(Piece::Text(format!("{}{}", before, prefix)));
166        rest = &after[prefix.len()..];
167        continue;
168      }
169      if url.is_empty() {
170        out.push(Piece::Text(before.to_string()));
171        rest = &after[1..];
172        continue;
173      }
174      if !before.is_empty() {
175        out.push(Piece::Text(before.to_string()));
176      }
177      let href = if prefix == "www." { format!("http://{}", url) } else { url.to_string() };
178      out.push(Piece::Url(href));
179      out.push(Piece::Display(url.to_string()));
180      if !trailing_punct.is_empty() {
181        out.push(Piece::Text(trailing_punct.to_string()));
182      }
183      rest = &after[url_end..];
184    }
185    if !rest.is_empty() {
186      out.push(Piece::Text(rest.to_string()));
187    }
188    if out.is_empty() {
189      out.push(Piece::Text(String::new()));
190    }
191    out
192  }
193}
194
195impl Visitor for Apply {
196  fn visit_node(&mut self, node: &mut Node) -> NodeAction {
197    match node {
198      Node::Paragraph(p) => p.children = Self::rewrite_children(std::mem::take(&mut p.children)),
199      Node::Heading(h) => h.children = Self::rewrite_children(std::mem::take(&mut h.children)),
200      Node::Bold(i) | Node::Italic(i) | Node::Strikethrough(i) => {
201        i.children = Self::rewrite_children(std::mem::take(&mut i.children));
202      },
203      _ => {},
204    }
205    NodeAction::Keep
206  }
207}
208
/// One fragment of a `Text` value after autolink detection.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Piece {
  /// Plain text that stays outside any link.
  Text(String),
  /// Resolved link destination (with `http://` prefix injected for
  /// `www.` matches). Always immediately followed by `Display`.
  Url(String),
  /// Visible text inside the synthesized `<a>` (matches the raw
  /// autolink slice in the source, eg `www.commonmark.org`).
  Display(String),
}