Skip to main content

mdwright_document/
gfm.rs

//! GitHub Flavored Markdown extension overlay.
//!
//! Pulldown-cmark owns `CommonMark` parsing, but it does not expose every
//! GFM behaviour mdwright needs as document facts. This module keeps the
//! GFM scanner and render/signature event transform together so callers
//! consume recognised facts instead of extension mechanics.
7
8use std::ops::Range;
9use std::sync::OnceLock;
10
11use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};
12use regex::Regex;
13
14use crate::{GfmAutolinkPolicy, GfmOptions};
15
/// Recognition path that produced an autolink.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum AutolinkOrigin {
    /// Reported by the `CommonMark` parser as an autolink or email link event.
    CommonMark,
    /// Bare URL or `www.` link found by the GFM text scanner.
    GfmUrl,
    /// Bare email address found by the GFM text scanner.
    GfmEmail,
}
23
24/// One autolink recognised in source.
25#[derive(Clone, Debug, PartialEq, Eq)]
26pub struct AutolinkFact {
27    raw_range: Range<usize>,
28    text: String,
29    href: String,
30    origin: AutolinkOrigin,
31}
32
33impl AutolinkFact {
34    fn new(raw_range: Range<usize>, text: String, href: String, origin: AutolinkOrigin) -> Self {
35        Self {
36            raw_range,
37            text,
38            href,
39            origin,
40        }
41    }
42
43    /// Source byte range of the autolink.
44    #[must_use]
45    pub fn raw_range(&self) -> Range<usize> {
46        self.raw_range.clone()
47    }
48
49    /// Text displayed by the autolink.
50    #[must_use]
51    pub fn text(&self) -> &str {
52        &self.text
53    }
54
55    /// Link destination after GFM / `CommonMark` href normalisation.
56    #[must_use]
57    pub fn href(&self) -> &str {
58        &self.href
59    }
60
61    /// Recognition origin.
62    #[must_use]
63    pub fn origin(&self) -> AutolinkOrigin {
64        self.origin
65    }
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69struct AutolinkMatch {
70    range: Range<usize>,
71    text: String,
72    href: String,
73    origin: AutolinkOrigin,
74}
75
76pub(crate) fn collect_autolinks(
77    source: &str,
78    events: &[(Event<'_>, Range<usize>)],
79    opts: GfmOptions,
80) -> Vec<AutolinkFact> {
81    let mut out = Vec::new();
82    let mut link_depth = 0u32;
83    let mut code_block_depth = 0u32;
84    for (event, range) in events {
85        match event {
86            Event::Start(Tag::CodeBlock(_)) => {
87                code_block_depth = code_block_depth.saturating_add(1);
88            }
89            Event::End(TagEnd::CodeBlock) => {
90                code_block_depth = code_block_depth.saturating_sub(1);
91            }
92            Event::Start(Tag::Link {
93                link_type, dest_url, ..
94            }) if code_block_depth == 0 && matches!(link_type, LinkType::Autolink | LinkType::Email) => {
95                out.push(commonmark_autolink_fact(
96                    source,
97                    range.clone(),
98                    *link_type,
99                    dest_url.as_ref(),
100                ));
101                link_depth = link_depth.saturating_add(1);
102            }
103            Event::Start(Tag::Link { .. } | Tag::Image { .. }) => {
104                link_depth = link_depth.saturating_add(1);
105            }
106            Event::End(TagEnd::Link | TagEnd::Image) => {
107                link_depth = link_depth.saturating_sub(1);
108            }
109            Event::Text(text) if link_depth == 0 && code_block_depth == 0 => {
110                out.extend(scan_gfm_autolinks_in_source(
111                    text.as_ref(),
112                    range.start,
113                    source,
114                    opts.autolinks,
115                ));
116            }
117            Event::Start(_)
118            | Event::End(_)
119            | Event::Text(_)
120            | Event::Code(_)
121            | Event::InlineMath(_)
122            | Event::DisplayMath(_)
123            | Event::Html(_)
124            | Event::InlineHtml(_)
125            | Event::FootnoteReference(_)
126            | Event::SoftBreak
127            | Event::HardBreak
128            | Event::Rule
129            | Event::TaskListMarker(_) => {}
130        }
131    }
132    out
133}
134
135pub(crate) fn apply_gfm_render_policy<'a>(
136    source: &str,
137    events: Vec<(Event<'a>, Range<usize>)>,
138    opts: GfmOptions,
139) -> Vec<Event<'a>> {
140    let mut out = Vec::with_capacity(events.len());
141    let mut link_depth = 0u32;
142    let mut code_block_depth = 0u32;
143    let mut skip_until = 0usize;
144    for (event, range) in events {
145        if range.end <= skip_until {
146            continue;
147        }
148        match event {
149            Event::Start(Tag::CodeBlock(_)) => {
150                code_block_depth = code_block_depth.saturating_add(1);
151                out.push(event);
152            }
153            Event::End(TagEnd::CodeBlock) => {
154                code_block_depth = code_block_depth.saturating_sub(1);
155                out.push(event);
156            }
157            Event::Start(Tag::Link { .. } | Tag::Image { .. }) => {
158                link_depth = link_depth.saturating_add(1);
159                out.push(event);
160            }
161            Event::End(TagEnd::Link | TagEnd::Image) => {
162                link_depth = link_depth.saturating_sub(1);
163                out.push(event);
164            }
165            Event::Text(text) if link_depth == 0 && code_block_depth == 0 => {
166                let text = text.as_ref();
167                let local_skip = skip_until.saturating_sub(range.start).min(text.len());
168                if let Some(text) = text.get(local_skip..) {
169                    skip_until = push_text_with_gfm_autolinks(
170                        text,
171                        range.start.saturating_add(local_skip),
172                        source,
173                        opts.autolinks,
174                        &mut out,
175                    )
176                    .max(skip_until);
177                }
178            }
179            Event::Html(html) if opts.tagfilter => {
180                out.push(Event::Html(CowStr::from(tagfilter_html(html.as_ref()))));
181            }
182            Event::InlineHtml(html) if opts.tagfilter => {
183                out.push(Event::InlineHtml(CowStr::from(tagfilter_html(html.as_ref()))));
184            }
185            Event::Start(_)
186            | Event::End(_)
187            | Event::Text(_)
188            | Event::Code(_)
189            | Event::InlineMath(_)
190            | Event::DisplayMath(_)
191            | Event::Html(_)
192            | Event::InlineHtml(_)
193            | Event::FootnoteReference(_)
194            | Event::SoftBreak
195            | Event::HardBreak
196            | Event::Rule
197            | Event::TaskListMarker(_) => out.push(event),
198        }
199    }
200    out
201}
202
203fn commonmark_autolink_fact(source: &str, range: Range<usize>, link_type: LinkType, href: &str) -> AutolinkFact {
204    let text = source
205        .get(range.clone())
206        .and_then(|raw| raw.strip_prefix('<').and_then(|s| s.strip_suffix('>')))
207        .unwrap_or(href)
208        .to_owned();
209    let href = match link_type {
210        LinkType::Email if href.starts_with("mailto:") => href.to_owned(),
211        LinkType::Email => format!("mailto:{href}"),
212        LinkType::Inline
213        | LinkType::Reference
214        | LinkType::ReferenceUnknown
215        | LinkType::Collapsed
216        | LinkType::CollapsedUnknown
217        | LinkType::Shortcut
218        | LinkType::ShortcutUnknown
219        | LinkType::Autolink
220        | LinkType::WikiLink { .. } => href.to_owned(),
221    };
222    AutolinkFact::new(range, text, href, AutolinkOrigin::CommonMark)
223}
224
225fn scan_gfm_autolinks_in_source(text: &str, base: usize, source: &str, policy: GfmAutolinkPolicy) -> Vec<AutolinkFact> {
226    scan_gfm_autolink_matches(text, base, source, policy)
227        .into_iter()
228        .map(|m| {
229            AutolinkFact::new(
230                base.saturating_add(m.range.start)..base.saturating_add(m.range.end),
231                m.text,
232                m.href,
233                m.origin,
234            )
235        })
236        .collect()
237}
238
239fn push_text_with_gfm_autolinks(
240    text: &str,
241    base: usize,
242    source: &str,
243    policy: GfmAutolinkPolicy,
244    out: &mut Vec<Event<'_>>,
245) -> usize {
246    let matches = scan_gfm_autolink_matches(text, base, source, policy);
247    if matches.is_empty() {
248        out.push(Event::Text(CowStr::from(text.to_owned())));
249        return base.saturating_add(text.len());
250    }
251    let mut cursor = 0usize;
252    let mut skip_until = base;
253    for m in matches {
254        if m.range.start > cursor
255            && let Some(prefix) = text.get(cursor..m.range.start)
256        {
257            out.push(Event::Text(CowStr::from(prefix.to_owned())));
258        }
259        out.push(Event::Start(Tag::Link {
260            link_type: LinkType::Autolink,
261            dest_url: CowStr::from(m.href.clone()),
262            title: CowStr::from(String::new()),
263            id: CowStr::from(String::new()),
264        }));
265        out.push(Event::Text(CowStr::from(m.text)));
266        out.push(Event::End(TagEnd::Link));
267        cursor = m.range.end;
268        skip_until = skip_until.max(base.saturating_add(m.range.end));
269    }
270    if cursor < text.len()
271        && let Some(suffix) = text.get(cursor..)
272    {
273        out.push(Event::Text(CowStr::from(suffix.to_owned())));
274    }
275    skip_until
276}
277
278fn scan_gfm_autolink_matches(text: &str, base: usize, source: &str, policy: GfmAutolinkPolicy) -> Vec<AutolinkMatch> {
279    if policy == GfmAutolinkPolicy::Disabled {
280        return Vec::new();
281    }
282    let mut matches = scan_gfm_url_matches(text, base, source);
283    if policy == GfmAutolinkPolicy::UrlsAndEmails {
284        matches.extend(scan_gfm_email_matches(text, base, source));
285    }
286    matches.sort_by_key(|m| (m.range.start, m.range.end));
287    let mut out = Vec::new();
288    let mut consumed_until = 0usize;
289    for m in matches {
290        if m.range.start < consumed_until {
291            continue;
292        }
293        consumed_until = m.range.end;
294        out.push(m);
295    }
296    out
297}
298
299fn scan_gfm_url_matches(text: &str, base: usize, source: &str) -> Vec<AutolinkMatch> {
300    let mut out = Vec::new();
301    let mut consumed_until = 0usize;
302    for caps in bare_autolink_regex().captures_iter(text) {
303        let Some(candidate) = caps.get(2) else {
304            continue;
305        };
306        if candidate.start() < consumed_until {
307            continue;
308        }
309        let Some(m) = classify_url_candidate(text, candidate.start(), candidate.end(), base, source) else {
310            continue;
311        };
312        consumed_until = m.range.end;
313        out.push(m);
314    }
315    out
316}
317
318#[allow(clippy::expect_used, reason = "static GFM autolink regex is validated by unit tests")]
319fn bare_autolink_regex() -> &'static Regex {
320    static RE: OnceLock<Regex> = OnceLock::new();
321    RE.get_or_init(|| {
322        Regex::new(r"(?i)(^|[\s*_~(])((?:https?|ftp)://[^\s<]+|www\.[^\s<]+)")
323            .expect("GFM bare autolink regex compiles")
324    })
325}
326
327fn classify_url_candidate(text: &str, start: usize, end: usize, base: usize, source: &str) -> Option<AutolinkMatch> {
328    let raw = url_source_candidate(text, start, end, base, source)?;
329    if raw.starts_with("www.") || raw.starts_with("WWW.") {
330        classify_www(raw, start)
331    } else if raw.contains("://") {
332        classify_url(raw, start)
333    } else {
334        None
335    }
336}
337
/// Returns the raw candidate string for a regex hit at `start..end` in `text`.
///
/// A hit that stops before the end of `text` is returned as-is. A hit that
/// runs to the very end of `text` is re-read from `source` (at `base + start`)
/// up to the next whitespace character or `<`, so a candidate that continues
/// past this text slice is seen in full. Returns `None` when the requested
/// slice is out of range or not on a character boundary.
fn url_source_candidate<'a>(text: &'a str, start: usize, end: usize, base: usize, source: &'a str) -> Option<&'a str> {
    if end < text.len() {
        return text.get(start..end);
    }
    let tail = source.get(base.saturating_add(start)..)?;
    let stop = tail
        .find(|ch: char| ch.is_whitespace() || ch == '<')
        .unwrap_or(tail.len());
    tail.get(..stop)
}
350
351fn classify_www(raw: &str, start: usize) -> Option<AutolinkMatch> {
352    let rest = raw.get(4..)?;
353    let host_len = valid_domain_prefix(rest)?;
354    let candidate_end = extend_path_and_trim(raw, 4usize.saturating_add(host_len));
355    let text = raw.get(..candidate_end)?.to_owned();
356    Some(AutolinkMatch {
357        range: start..start.saturating_add(candidate_end),
358        href: format!("http://{text}"),
359        text,
360        origin: AutolinkOrigin::GfmUrl,
361    })
362}
363
364fn classify_url(raw: &str, start: usize) -> Option<AutolinkMatch> {
365    let scheme_end = raw.find("://")?;
366    let scheme = raw.get(..scheme_end)?.to_ascii_lowercase();
367    if !matches!(scheme.as_str(), "http" | "https" | "ftp") {
368        return None;
369    }
370    let host_start = scheme_end.saturating_add(3);
371    let host = raw.get(host_start..)?;
372    let host_len = valid_domain_prefix(host)?;
373    let candidate_end = extend_path_and_trim(raw, host_start.saturating_add(host_len));
374    let text = raw.get(..candidate_end)?.to_owned();
375    Some(AutolinkMatch {
376        range: start..start.saturating_add(candidate_end),
377        href: text.clone(),
378        text,
379        origin: AutolinkOrigin::GfmUrl,
380    })
381}
382
383fn scan_gfm_email_matches(text: &str, base: usize, source: &str) -> Vec<AutolinkMatch> {
384    email_regex()
385        .find_iter(text)
386        .filter_map(|m| classify_email_candidate(m.as_str(), m.start(), base, source))
387        .collect()
388}
389
390#[allow(clippy::expect_used, reason = "static GFM email regex is validated by unit tests")]
391fn email_regex() -> &'static Regex {
392    static RE: OnceLock<Regex> = OnceLock::new();
393    RE.get_or_init(|| {
394        Regex::new(r"[A-Za-z0-9._+-]+@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+\.?")
395            .expect("GFM email autolink regex compiles")
396    })
397}
398
399fn classify_email_candidate(raw: &str, start: usize, base: usize, source: &str) -> Option<AutolinkMatch> {
400    let trimmed = raw.strip_suffix('.').unwrap_or(raw);
401    let domain = trimmed.split_once('@')?.1;
402    let last = domain.as_bytes().last().copied()?;
403    if matches!(last, b'-' | b'_') {
404        return None;
405    }
406    let absolute_end = base.saturating_add(start).saturating_add(trimmed.len());
407    if source
408        .as_bytes()
409        .get(absolute_end)
410        .is_some_and(|b| matches!(b, b'-' | b'_'))
411    {
412        return None;
413    }
414    let text = trimmed.to_owned();
415    Some(AutolinkMatch {
416        range: start..start.saturating_add(trimmed.len()),
417        href: format!("mailto:{text}"),
418        text,
419        origin: AutolinkOrigin::GfmEmail,
420    })
421}
422
/// Measures the valid domain at the start of `data`.
///
/// The domain is the leading run of ASCII alphanumerics, `.`, `-` and `_`,
/// with trailing dots removed. It is accepted only when it contains at least
/// one `.`, no label is empty or starts/ends with `-`, and neither of the last
/// two labels contains `_`. Returns the domain's byte length, or `None`.
fn valid_domain_prefix(data: &str) -> Option<usize> {
    let is_domain_char = |ch: char| matches!(ch, '.' | '-' | '_') || ch.is_ascii_alphanumeric();
    let run_len = data
        .find(|ch: char| !is_domain_char(ch))
        .unwrap_or(data.len());
    let domain = data.get(..run_len)?.trim_end_matches('.');

    // An empty domain has no dot either, so this also rejects "".
    if !domain.contains('.') {
        return None;
    }
    let has_bad_label = domain
        .split('.')
        .any(|label| label.is_empty() || label.starts_with('-') || label.ends_with('-'));
    if has_bad_label {
        return None;
    }
    let underscore_in_tail = domain
        .rsplit('.')
        .take(2)
        .any(|label| label.contains('_'));
    if underscore_in_tail {
        return None;
    }
    Some(domain.len())
}
459
460fn extend_path_and_trim(raw: &str, min_end: usize) -> usize {
461    let mut end = raw.len();
462    while end > min_end {
463        let Some(&b) = raw.as_bytes().get(end.saturating_sub(1)) else {
464            break;
465        };
466        if matches!(b, b'?' | b'!' | b'.' | b',' | b':' | b'*' | b'_' | b'~' | b'\'' | b'"') {
467            end = end.saturating_sub(1);
468        } else if b == b';' && looks_like_entity_suffix(raw, end) {
469            end = trim_entity_suffix(raw, end);
470        } else if b == b')' && has_unbalanced_trailing_paren(raw, end) {
471            end = end.saturating_sub(1);
472        } else {
473            break;
474        }
475    }
476    end
477}
478
479fn looks_like_entity_suffix(raw: &str, end: usize) -> bool {
480    trim_entity_suffix(raw, end) < end
481}
482
/// Computes the new end offset after removing a trailing `;` suffix.
///
/// `end` is the exclusive end of the candidate in `raw`, whose last byte is
/// expected to be `;`. When the `;` closes something shaped like an entity
/// reference — `&` followed by one or more ASCII alphanumerics — the returned
/// offset is the position of that `&`, removing the whole reference.
/// Otherwise only the final byte is removed and `end - 1` is returned.
///
/// A bare `&;` has no name between `&` and `;`, so it is not treated as an
/// entity reference: only the `;` is removed.
fn trim_entity_suffix(raw: &str, end: usize) -> usize {
    let bytes = raw.as_bytes();
    let semicolon = end.saturating_sub(1);
    // Walk left over the run of alphanumerics directly before the `;`.
    let mut name_start = semicolon;
    while name_start > 0
        && bytes
            .get(name_start.saturating_sub(1))
            .is_some_and(u8::is_ascii_alphanumeric)
    {
        name_start = name_start.saturating_sub(1);
    }
    let has_name = name_start < semicolon;
    let preceded_by_amp = name_start > 0 && bytes.get(name_start.saturating_sub(1)) == Some(&b'&');
    if has_name && preceded_by_amp {
        name_start.saturating_sub(1)
    } else {
        semicolon
    }
}
495
/// Reports whether `raw[..end]` holds more `)` than `(`.
///
/// Returns `false` when `end` is out of range or not on a character boundary.
fn has_unbalanced_trailing_paren(raw: &str, end: usize) -> bool {
    raw.get(..end).is_some_and(|prefix| {
        let (opens, closes) = prefix
            .bytes()
            .fold((0usize, 0usize), |(opens, closes), byte| match byte {
                b'(' => (opens.saturating_add(1), closes),
                b')' => (opens, closes.saturating_add(1)),
                _ => (opens, closes),
            });
        closes > opens
    })
}
504
505fn tagfilter_html(html: &str) -> String {
506    tagfilter_regex().replace_all(html, "&lt;$rest").into_owned()
507}
508
509#[allow(
510    clippy::expect_used,
511    reason = "static GFM tagfilter regex is validated by unit tests"
512)]
513fn tagfilter_regex() -> &'static Regex {
514    static RE: OnceLock<Regex> = OnceLock::new();
515    RE.get_or_init(|| {
516        Regex::new(r"(?ix)<(?P<rest>/?(?:title|textarea|style|xmp|iframe|noembed|noframes|script|plaintext)(?:\s|>|/))")
517            .expect("GFM tagfilter regex compiles")
518    })
519}
520
#[cfg(test)]
mod tests {
    use super::{AutolinkFact, AutolinkOrigin, GfmAutolinkPolicy, scan_gfm_autolinks_in_source, tagfilter_html};

    /// Scans `input` as if it were the whole source, starting at offset zero.
    fn scan_whole(input: &str, policy: GfmAutolinkPolicy) -> Vec<AutolinkFact> {
        scan_gfm_autolinks_in_source(input, 0, input, policy)
    }

    #[test]
    fn scans_gfm_www_url_and_email_autolinks() {
        let input = "www.commonmark.org http://commonmark.org ftp://foo.bar.baz foo@bar.baz";
        // A non-zero base checks that reported ranges are shifted by it.
        let found = scan_gfm_autolinks_in_source(input, 10, input, GfmAutolinkPolicy::UrlsAndEmails);
        let hrefs: Vec<&str> = found.iter().map(AutolinkFact::href).collect();
        assert_eq!(
            hrefs,
            [
                "http://www.commonmark.org",
                "http://commonmark.org",
                "ftp://foo.bar.baz",
                "mailto:foo@bar.baz",
            ]
        );
        assert_eq!(found.first().map(AutolinkFact::raw_range), Some(10..28));
    }

    #[test]
    fn trims_gfm_trailing_punctuation_and_balances_parentheses() {
        let found = scan_whole(
            "Visit www.commonmark.org/a.b. (www.google.com/q=(x)))",
            GfmAutolinkPolicy::Urls,
        );
        let texts: Vec<&str> = found.iter().map(AutolinkFact::text).collect();
        assert_eq!(texts, ["www.commonmark.org/a.b", "www.google.com/q=(x)"]);
    }

    #[test]
    fn rejects_invalid_domains_and_email_tails() {
        let rejected = [
            "foo www. foo",
            "foo http:// foo",
            "www.xxx.yyy._zzz",
            "a.b-c_d@a.b-",
            "a.b-c_d@a.b_",
        ];
        for input in rejected {
            assert!(
                scan_whole(input, GfmAutolinkPolicy::UrlsAndEmails).is_empty(),
                "unexpected autolink in {input:?}"
            );
        }
    }

    #[test]
    fn email_autolink_policy_can_be_url_only() {
        let found = scan_whole("https://example.com foo@bar.baz", GfmAutolinkPolicy::Urls);
        assert_eq!(found.len(), 1);
        assert_eq!(found.first().map(AutolinkFact::origin), Some(AutolinkOrigin::GfmUrl));
    }

    #[test]
    fn tagfilter_escapes_disallowed_tags() {
        let escaped = tagfilter_html("<script>alert(1)</script>");
        assert_eq!(escaped, "&lt;script>alert(1)&lt;/script>");

        let untouched = tagfilter_html("<custom>ok</custom>");
        assert_eq!(untouched, "<custom>ok</custom>");
    }
}