1use std::ops::Range;
9use std::sync::OnceLock;
10
11use pulldown_cmark::{CowStr, Event, LinkType, Tag, TagEnd};
12use regex::Regex;
13
14use crate::{GfmAutolinkPolicy, GfmOptions};
15
/// Identifies which autolink rule produced an [`AutolinkFact`].
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum AutolinkOrigin {
    /// Reported for parser link events typed `LinkType::Autolink` or `LinkType::Email`.
    CommonMark,
    /// Bare `www.` or `http`/`https`/`ftp` URL found by the GFM text scanner.
    GfmUrl,
    /// Bare email address found by the GFM text scanner.
    GfmEmail,
}
23
24#[derive(Clone, Debug, PartialEq, Eq)]
26pub struct AutolinkFact {
27 raw_range: Range<usize>,
28 text: String,
29 href: String,
30 origin: AutolinkOrigin,
31}
32
33impl AutolinkFact {
34 fn new(raw_range: Range<usize>, text: String, href: String, origin: AutolinkOrigin) -> Self {
35 Self {
36 raw_range,
37 text,
38 href,
39 origin,
40 }
41 }
42
43 #[must_use]
45 pub fn raw_range(&self) -> Range<usize> {
46 self.raw_range.clone()
47 }
48
49 #[must_use]
51 pub fn text(&self) -> &str {
52 &self.text
53 }
54
55 #[must_use]
57 pub fn href(&self) -> &str {
58 &self.href
59 }
60
61 #[must_use]
63 pub fn origin(&self) -> AutolinkOrigin {
64 self.origin
65 }
66}
67
68#[derive(Clone, Debug, PartialEq, Eq)]
69struct AutolinkMatch {
70 range: Range<usize>,
71 text: String,
72 href: String,
73 origin: AutolinkOrigin,
74}
75
76pub(crate) fn collect_autolinks(
77 source: &str,
78 events: &[(Event<'_>, Range<usize>)],
79 opts: GfmOptions,
80) -> Vec<AutolinkFact> {
81 let mut out = Vec::new();
82 let mut link_depth = 0u32;
83 let mut code_block_depth = 0u32;
84 for (event, range) in events {
85 match event {
86 Event::Start(Tag::CodeBlock(_)) => {
87 code_block_depth = code_block_depth.saturating_add(1);
88 }
89 Event::End(TagEnd::CodeBlock) => {
90 code_block_depth = code_block_depth.saturating_sub(1);
91 }
92 Event::Start(Tag::Link {
93 link_type, dest_url, ..
94 }) if code_block_depth == 0 && matches!(link_type, LinkType::Autolink | LinkType::Email) => {
95 out.push(commonmark_autolink_fact(
96 source,
97 range.clone(),
98 *link_type,
99 dest_url.as_ref(),
100 ));
101 link_depth = link_depth.saturating_add(1);
102 }
103 Event::Start(Tag::Link { .. } | Tag::Image { .. }) => {
104 link_depth = link_depth.saturating_add(1);
105 }
106 Event::End(TagEnd::Link | TagEnd::Image) => {
107 link_depth = link_depth.saturating_sub(1);
108 }
109 Event::Text(text) if link_depth == 0 && code_block_depth == 0 => {
110 out.extend(scan_gfm_autolinks_in_source(
111 text.as_ref(),
112 range.start,
113 source,
114 opts.autolinks,
115 ));
116 }
117 Event::Start(_)
118 | Event::End(_)
119 | Event::Text(_)
120 | Event::Code(_)
121 | Event::InlineMath(_)
122 | Event::DisplayMath(_)
123 | Event::Html(_)
124 | Event::InlineHtml(_)
125 | Event::FootnoteReference(_)
126 | Event::SoftBreak
127 | Event::HardBreak
128 | Event::Rule
129 | Event::TaskListMarker(_) => {}
130 }
131 }
132 out
133}
134
135pub(crate) fn apply_gfm_render_policy<'a>(
136 source: &str,
137 events: Vec<(Event<'a>, Range<usize>)>,
138 opts: GfmOptions,
139) -> Vec<Event<'a>> {
140 let mut out = Vec::with_capacity(events.len());
141 let mut link_depth = 0u32;
142 let mut code_block_depth = 0u32;
143 let mut skip_until = 0usize;
144 for (event, range) in events {
145 if range.end <= skip_until {
146 continue;
147 }
148 match event {
149 Event::Start(Tag::CodeBlock(_)) => {
150 code_block_depth = code_block_depth.saturating_add(1);
151 out.push(event);
152 }
153 Event::End(TagEnd::CodeBlock) => {
154 code_block_depth = code_block_depth.saturating_sub(1);
155 out.push(event);
156 }
157 Event::Start(Tag::Link { .. } | Tag::Image { .. }) => {
158 link_depth = link_depth.saturating_add(1);
159 out.push(event);
160 }
161 Event::End(TagEnd::Link | TagEnd::Image) => {
162 link_depth = link_depth.saturating_sub(1);
163 out.push(event);
164 }
165 Event::Text(text) if link_depth == 0 && code_block_depth == 0 => {
166 let text = text.as_ref();
167 let local_skip = skip_until.saturating_sub(range.start).min(text.len());
168 if let Some(text) = text.get(local_skip..) {
169 skip_until = push_text_with_gfm_autolinks(
170 text,
171 range.start.saturating_add(local_skip),
172 source,
173 opts.autolinks,
174 &mut out,
175 )
176 .max(skip_until);
177 }
178 }
179 Event::Html(html) if opts.tagfilter => {
180 out.push(Event::Html(CowStr::from(tagfilter_html(html.as_ref()))));
181 }
182 Event::InlineHtml(html) if opts.tagfilter => {
183 out.push(Event::InlineHtml(CowStr::from(tagfilter_html(html.as_ref()))));
184 }
185 Event::Start(_)
186 | Event::End(_)
187 | Event::Text(_)
188 | Event::Code(_)
189 | Event::InlineMath(_)
190 | Event::DisplayMath(_)
191 | Event::Html(_)
192 | Event::InlineHtml(_)
193 | Event::FootnoteReference(_)
194 | Event::SoftBreak
195 | Event::HardBreak
196 | Event::Rule
197 | Event::TaskListMarker(_) => out.push(event),
198 }
199 }
200 out
201}
202
203fn commonmark_autolink_fact(source: &str, range: Range<usize>, link_type: LinkType, href: &str) -> AutolinkFact {
204 let text = source
205 .get(range.clone())
206 .and_then(|raw| raw.strip_prefix('<').and_then(|s| s.strip_suffix('>')))
207 .unwrap_or(href)
208 .to_owned();
209 let href = match link_type {
210 LinkType::Email if href.starts_with("mailto:") => href.to_owned(),
211 LinkType::Email => format!("mailto:{href}"),
212 LinkType::Inline
213 | LinkType::Reference
214 | LinkType::ReferenceUnknown
215 | LinkType::Collapsed
216 | LinkType::CollapsedUnknown
217 | LinkType::Shortcut
218 | LinkType::ShortcutUnknown
219 | LinkType::Autolink
220 | LinkType::WikiLink { .. } => href.to_owned(),
221 };
222 AutolinkFact::new(range, text, href, AutolinkOrigin::CommonMark)
223}
224
225fn scan_gfm_autolinks_in_source(text: &str, base: usize, source: &str, policy: GfmAutolinkPolicy) -> Vec<AutolinkFact> {
226 scan_gfm_autolink_matches(text, base, source, policy)
227 .into_iter()
228 .map(|m| {
229 AutolinkFact::new(
230 base.saturating_add(m.range.start)..base.saturating_add(m.range.end),
231 m.text,
232 m.href,
233 m.origin,
234 )
235 })
236 .collect()
237}
238
239fn push_text_with_gfm_autolinks(
240 text: &str,
241 base: usize,
242 source: &str,
243 policy: GfmAutolinkPolicy,
244 out: &mut Vec<Event<'_>>,
245) -> usize {
246 let matches = scan_gfm_autolink_matches(text, base, source, policy);
247 if matches.is_empty() {
248 out.push(Event::Text(CowStr::from(text.to_owned())));
249 return base.saturating_add(text.len());
250 }
251 let mut cursor = 0usize;
252 let mut skip_until = base;
253 for m in matches {
254 if m.range.start > cursor
255 && let Some(prefix) = text.get(cursor..m.range.start)
256 {
257 out.push(Event::Text(CowStr::from(prefix.to_owned())));
258 }
259 out.push(Event::Start(Tag::Link {
260 link_type: LinkType::Autolink,
261 dest_url: CowStr::from(m.href.clone()),
262 title: CowStr::from(String::new()),
263 id: CowStr::from(String::new()),
264 }));
265 out.push(Event::Text(CowStr::from(m.text)));
266 out.push(Event::End(TagEnd::Link));
267 cursor = m.range.end;
268 skip_until = skip_until.max(base.saturating_add(m.range.end));
269 }
270 if cursor < text.len()
271 && let Some(suffix) = text.get(cursor..)
272 {
273 out.push(Event::Text(CowStr::from(suffix.to_owned())));
274 }
275 skip_until
276}
277
278fn scan_gfm_autolink_matches(text: &str, base: usize, source: &str, policy: GfmAutolinkPolicy) -> Vec<AutolinkMatch> {
279 if policy == GfmAutolinkPolicy::Disabled {
280 return Vec::new();
281 }
282 let mut matches = scan_gfm_url_matches(text, base, source);
283 if policy == GfmAutolinkPolicy::UrlsAndEmails {
284 matches.extend(scan_gfm_email_matches(text, base, source));
285 }
286 matches.sort_by_key(|m| (m.range.start, m.range.end));
287 let mut out = Vec::new();
288 let mut consumed_until = 0usize;
289 for m in matches {
290 if m.range.start < consumed_until {
291 continue;
292 }
293 consumed_until = m.range.end;
294 out.push(m);
295 }
296 out
297}
298
299fn scan_gfm_url_matches(text: &str, base: usize, source: &str) -> Vec<AutolinkMatch> {
300 let mut out = Vec::new();
301 let mut consumed_until = 0usize;
302 for caps in bare_autolink_regex().captures_iter(text) {
303 let Some(candidate) = caps.get(2) else {
304 continue;
305 };
306 if candidate.start() < consumed_until {
307 continue;
308 }
309 let Some(m) = classify_url_candidate(text, candidate.start(), candidate.end(), base, source) else {
310 continue;
311 };
312 consumed_until = m.range.end;
313 out.push(m);
314 }
315 out
316}
317
318#[allow(clippy::expect_used, reason = "static GFM autolink regex is validated by unit tests")]
319fn bare_autolink_regex() -> &'static Regex {
320 static RE: OnceLock<Regex> = OnceLock::new();
321 RE.get_or_init(|| {
322 Regex::new(r"(?i)(^|[\s*_~(])((?:https?|ftp)://[^\s<]+|www\.[^\s<]+)")
323 .expect("GFM bare autolink regex compiles")
324 })
325}
326
327fn classify_url_candidate(text: &str, start: usize, end: usize, base: usize, source: &str) -> Option<AutolinkMatch> {
328 let raw = url_source_candidate(text, start, end, base, source)?;
329 if raw.starts_with("www.") || raw.starts_with("WWW.") {
330 classify_www(raw, start)
331 } else if raw.contains("://") {
332 classify_url(raw, start)
333 } else {
334 None
335 }
336}
337
/// Returns the raw text of a URL candidate.
///
/// A candidate that stops before the end of `text` is taken from `text` as is. One that
/// reaches the end of `text` is instead read from `source` at offset `base + start`, up
/// to the next whitespace character or `<`, so it may continue past `text`.
/// Returns `None` when the requested offsets are out of range or not on char boundaries.
fn url_source_candidate<'a>(text: &'a str, start: usize, end: usize, base: usize, source: &'a str) -> Option<&'a str> {
    let reaches_text_end = end >= text.len();
    if !reaches_text_end {
        return text.get(start..end);
    }
    let offset = base.saturating_add(start);
    // `split` always yields a first piece: everything before the first terminator.
    source
        .get(offset..)?
        .split(|ch: char| ch == '<' || ch.is_whitespace())
        .next()
}
350
351fn classify_www(raw: &str, start: usize) -> Option<AutolinkMatch> {
352 let rest = raw.get(4..)?;
353 let host_len = valid_domain_prefix(rest)?;
354 let candidate_end = extend_path_and_trim(raw, 4usize.saturating_add(host_len));
355 let text = raw.get(..candidate_end)?.to_owned();
356 Some(AutolinkMatch {
357 range: start..start.saturating_add(candidate_end),
358 href: format!("http://{text}"),
359 text,
360 origin: AutolinkOrigin::GfmUrl,
361 })
362}
363
364fn classify_url(raw: &str, start: usize) -> Option<AutolinkMatch> {
365 let scheme_end = raw.find("://")?;
366 let scheme = raw.get(..scheme_end)?.to_ascii_lowercase();
367 if !matches!(scheme.as_str(), "http" | "https" | "ftp") {
368 return None;
369 }
370 let host_start = scheme_end.saturating_add(3);
371 let host = raw.get(host_start..)?;
372 let host_len = valid_domain_prefix(host)?;
373 let candidate_end = extend_path_and_trim(raw, host_start.saturating_add(host_len));
374 let text = raw.get(..candidate_end)?.to_owned();
375 Some(AutolinkMatch {
376 range: start..start.saturating_add(candidate_end),
377 href: text.clone(),
378 text,
379 origin: AutolinkOrigin::GfmUrl,
380 })
381}
382
383fn scan_gfm_email_matches(text: &str, base: usize, source: &str) -> Vec<AutolinkMatch> {
384 email_regex()
385 .find_iter(text)
386 .filter_map(|m| classify_email_candidate(m.as_str(), m.start(), base, source))
387 .collect()
388}
389
390#[allow(clippy::expect_used, reason = "static GFM email regex is validated by unit tests")]
391fn email_regex() -> &'static Regex {
392 static RE: OnceLock<Regex> = OnceLock::new();
393 RE.get_or_init(|| {
394 Regex::new(r"[A-Za-z0-9._+-]+@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+\.?")
395 .expect("GFM email autolink regex compiles")
396 })
397}
398
399fn classify_email_candidate(raw: &str, start: usize, base: usize, source: &str) -> Option<AutolinkMatch> {
400 let trimmed = raw.strip_suffix('.').unwrap_or(raw);
401 let domain = trimmed.split_once('@')?.1;
402 let last = domain.as_bytes().last().copied()?;
403 if matches!(last, b'-' | b'_') {
404 return None;
405 }
406 let absolute_end = base.saturating_add(start).saturating_add(trimmed.len());
407 if source
408 .as_bytes()
409 .get(absolute_end)
410 .is_some_and(|b| matches!(b, b'-' | b'_'))
411 {
412 return None;
413 }
414 let text = trimmed.to_owned();
415 Some(AutolinkMatch {
416 range: start..start.saturating_add(trimmed.len()),
417 href: format!("mailto:{text}"),
418 text,
419 origin: AutolinkOrigin::GfmEmail,
420 })
421}
422
/// Returns the byte length of the valid domain at the start of `data`, if there is one.
///
/// The domain is the leading run of ASCII alphanumerics, `.`, `-` and `_` with trailing
/// dots removed. It must contain at least one `.`, every label must be non-empty and must
/// not start or end with `-`, and neither of the last two labels may contain `_`.
fn valid_domain_prefix(data: &str) -> Option<usize> {
    let is_host_char = |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-' | '_');
    let host_end = data.find(|ch: char| !is_host_char(ch)).unwrap_or(data.len());
    let domain = data.get(..host_end)?.trim_end_matches('.');
    if !domain.contains('.') {
        return None;
    }

    let labels: Vec<&str> = domain.split('.').collect();
    let malformed = labels
        .iter()
        .any(|label| label.is_empty() || label.starts_with('-') || label.ends_with('-'));
    if malformed {
        return None;
    }
    let underscore_near_end = labels.iter().rev().take(2).any(|label| label.contains('_'));
    if underscore_near_end {
        return None;
    }
    Some(domain.len())
}
459
460fn extend_path_and_trim(raw: &str, min_end: usize) -> usize {
461 let mut end = raw.len();
462 while end > min_end {
463 let Some(&b) = raw.as_bytes().get(end.saturating_sub(1)) else {
464 break;
465 };
466 if matches!(b, b'?' | b'!' | b'.' | b',' | b':' | b'*' | b'_' | b'~' | b'\'' | b'"') {
467 end = end.saturating_sub(1);
468 } else if b == b';' && looks_like_entity_suffix(raw, end) {
469 end = trim_entity_suffix(raw, end);
470 } else if b == b')' && has_unbalanced_trailing_paren(raw, end) {
471 end = end.saturating_sub(1);
472 } else {
473 break;
474 }
475 }
476 end
477}
478
479fn looks_like_entity_suffix(raw: &str, end: usize) -> bool {
480 trim_entity_suffix(raw, end) < end
481}
482
/// Computes the new end offset after trimming the final byte of `raw[..end]`, which the
/// caller has established to be `;`.
///
/// When that `;` closes something resembling an entity reference (`&` followed by one or
/// more ASCII alphanumerics), the whole reference is removed and the offset of the `&` is
/// returned. Otherwise only the `;` is removed and `end - 1` is returned.
///
/// A bare `&;` is not an entity reference: the GFM autolink rules require at least one
/// alphanumeric between `&` and `;`, so in that case only the `;` is trimmed.
fn trim_entity_suffix(raw: &str, end: usize) -> usize {
    let bytes = raw.as_bytes();
    let semicolon = end.saturating_sub(1);
    // Walk backwards over the candidate entity name.
    let mut name_start = semicolon;
    while name_start > 0
        && bytes
            .get(name_start.saturating_sub(1))
            .is_some_and(u8::is_ascii_alphanumeric)
    {
        name_start = name_start.saturating_sub(1);
    }
    let has_name = name_start < semicolon;
    let preceded_by_ampersand = name_start > 0 && bytes.get(name_start.saturating_sub(1)) == Some(&b'&');
    if has_name && preceded_by_ampersand {
        name_start.saturating_sub(1)
    } else {
        semicolon
    }
}
495
/// Reports whether `raw[..end]` contains more `)` than `(`.
///
/// Returns `false` when `end` is out of range or not on a char boundary.
fn has_unbalanced_trailing_paren(raw: &str, end: usize) -> bool {
    raw.get(..end).is_some_and(|prefix| {
        let (opened, closed) = prefix
            .bytes()
            .fold((0usize, 0usize), |(opened, closed), byte| match byte {
                b'(' => (opened.saturating_add(1), closed),
                b')' => (opened, closed.saturating_add(1)),
                _ => (opened, closed),
            });
        closed > opened
    })
}
504
505fn tagfilter_html(html: &str) -> String {
506 tagfilter_regex().replace_all(html, "<$rest").into_owned()
507}
508
509#[allow(
510 clippy::expect_used,
511 reason = "static GFM tagfilter regex is validated by unit tests"
512)]
513fn tagfilter_regex() -> &'static Regex {
514 static RE: OnceLock<Regex> = OnceLock::new();
515 RE.get_or_init(|| {
516 Regex::new(r"(?ix)<(?P<rest>/?(?:title|textarea|style|xmp|iframe|noembed|noframes|script|plaintext)(?:\s|>|/))")
517 .expect("GFM tagfilter regex compiles")
518 })
519}
520
521#[cfg(test)]
522mod tests {
523 use super::{AutolinkFact, AutolinkOrigin, GfmAutolinkPolicy, scan_gfm_autolinks_in_source, tagfilter_html};
524
525 #[test]
526 fn scans_gfm_www_url_and_email_autolinks() {
527 let matches = scan_gfm_autolinks_in_source(
528 "www.commonmark.org http://commonmark.org ftp://foo.bar.baz foo@bar.baz",
529 10,
530 "www.commonmark.org http://commonmark.org ftp://foo.bar.baz foo@bar.baz",
531 GfmAutolinkPolicy::UrlsAndEmails,
532 );
533 let hrefs: Vec<&str> = matches.iter().map(|m| m.href()).collect();
534 assert_eq!(
535 hrefs,
536 [
537 "http://www.commonmark.org",
538 "http://commonmark.org",
539 "ftp://foo.bar.baz",
540 "mailto:foo@bar.baz",
541 ]
542 );
543 assert_eq!(matches.first().map(|m| m.raw_range()), Some(10..28));
544 }
545
546 #[test]
547 fn trims_gfm_trailing_punctuation_and_balances_parentheses() {
548 let matches = scan_gfm_autolinks_in_source(
549 "Visit www.commonmark.org/a.b. (www.google.com/q=(x)))",
550 0,
551 "Visit www.commonmark.org/a.b. (www.google.com/q=(x)))",
552 GfmAutolinkPolicy::Urls,
553 );
554 let texts: Vec<&str> = matches.iter().map(|m| m.text()).collect();
555 assert_eq!(texts, ["www.commonmark.org/a.b", "www.google.com/q=(x)"]);
556 }
557
558 #[test]
559 fn rejects_invalid_domains_and_email_tails() {
560 assert!(
561 scan_gfm_autolinks_in_source("foo www. foo", 0, "foo www. foo", GfmAutolinkPolicy::UrlsAndEmails)
562 .is_empty()
563 );
564 assert!(
565 scan_gfm_autolinks_in_source(
566 "foo http:// foo",
567 0,
568 "foo http:// foo",
569 GfmAutolinkPolicy::UrlsAndEmails
570 )
571 .is_empty()
572 );
573 assert!(
574 scan_gfm_autolinks_in_source(
575 "www.xxx.yyy._zzz",
576 0,
577 "www.xxx.yyy._zzz",
578 GfmAutolinkPolicy::UrlsAndEmails
579 )
580 .is_empty()
581 );
582 assert!(
583 scan_gfm_autolinks_in_source("a.b-c_d@a.b-", 0, "a.b-c_d@a.b-", GfmAutolinkPolicy::UrlsAndEmails)
584 .is_empty()
585 );
586 assert!(
587 scan_gfm_autolinks_in_source("a.b-c_d@a.b_", 0, "a.b-c_d@a.b_", GfmAutolinkPolicy::UrlsAndEmails)
588 .is_empty()
589 );
590 }
591
592 #[test]
593 fn email_autolink_policy_can_be_url_only() {
594 let matches = scan_gfm_autolinks_in_source(
595 "https://example.com foo@bar.baz",
596 0,
597 "https://example.com foo@bar.baz",
598 GfmAutolinkPolicy::Urls,
599 );
600 assert_eq!(matches.len(), 1);
601 assert_eq!(matches.first().map(AutolinkFact::origin), Some(AutolinkOrigin::GfmUrl));
602 }
603
604 #[test]
605 fn tagfilter_escapes_disallowed_tags() {
606 assert_eq!(
607 tagfilter_html("<script>alert(1)</script>"),
608 "<script>alert(1)</script>"
609 );
610 assert_eq!(tagfilter_html("<custom>ok</custom>"), "<custom>ok</custom>");
611 }
612}