use super::shared::{opt_span, GrammarSpan};
use crate::parser::ast::{Node, NodeKind};
use nom::bytes::complete::take;
use nom::IResult;
use nom::Parser;
const HTTP_SCHEME: &str = "http";
const HTTPS_SCHEME: &str = "https";
const SCHEME_SEPARATOR: &str = "://";
#[derive(Debug, Clone)]
struct Match {
len: usize, href: String, }
pub fn parse_gfm_autolink_literal(input: GrammarSpan) -> IResult<GrammarSpan, Node> {
let fragment = *input.fragment();
let Some(m) = match_at_start(fragment) else {
return Err(nom::Err::Error(nom::error::Error::new(
input,
nom::error::ErrorKind::Tag,
)));
};
let (rest, matched_span) = take(m.len).parse(input)?;
let label = matched_span.fragment().to_string();
let span = opt_span(matched_span);
let node = Node {
kind: NodeKind::Link {
url: m.href,
title: None,
},
span,
children: vec![Node {
kind: NodeKind::Text(label),
span: opt_span(matched_span),
children: Vec::new(),
}],
};
Ok((rest, node))
}
pub(crate) fn find_next_autolink_literal_start(text: &str) -> Option<usize> {
let mut best: Option<usize> = None;
let mut consider = |idx: usize| {
best = Some(best.map_or(idx, |b| b.min(idx)));
};
if let Some(idx) = find_first_valid_substring(text, "mailto:", |s| match_mailto(s).is_some()) {
consider(idx);
}
if let Some(idx) = find_first_valid_substring(text, "xmpp:", |s| match_xmpp(s).is_some()) {
consider(idx);
}
if let Some(idx) = find_first_valid_substring(text, "https://", |s| match_url(s).is_some()) {
if boundary_ok(text, idx) {
consider(idx);
}
}
if let Some(idx) = find_first_valid_substring(text, "http:", |s| match_url(s).is_some()) {
if boundary_ok(text, idx) {
consider(idx);
}
}
if let Some(idx) = find_first_valid_substring(text, "www.", |s| match_www(s).is_some()) {
if boundary_ok(text, idx) {
consider(idx);
}
}
if let Some(idx) = find_first_valid_email_start(text) {
consider(idx);
}
best
}
fn match_at_start(s: &str) -> Option<Match> {
if let Some(m) = match_mailto(s) {
return Some(m);
}
if let Some(m) = match_xmpp(s) {
return Some(m);
}
if let Some(m) = match_url(s) {
return Some(m);
}
if let Some(m) = match_www(s) {
return Some(m);
}
if let Some(m) = match_email(s) {
return Some(m);
}
None
}
fn match_www(s: &str) -> Option<Match> {
if !s.starts_with("www.") {
return None;
}
let after = &s[4..];
let domain_len = parse_domain(after, DomainRules::WwwOrHttp)?;
let max_end = scan_nonspace_non_lt(after, domain_len);
let candidate = &s[..(4 + max_end)];
let final_len = apply_extended_path_validation_len(candidate);
let label = &s[..final_len];
Some(Match {
len: label.len(),
href: format!("{}{}{}", HTTP_SCHEME, SCHEME_SEPARATOR, label),
})
}
fn match_url(s: &str) -> Option<Match> {
let (scheme_len, rest) = if let Some((rest, scheme_len)) = strip_scheme_prefix(s, HTTP_SCHEME) {
(scheme_len, rest)
} else if let Some((rest, scheme_len)) = strip_scheme_prefix(s, HTTPS_SCHEME) {
(scheme_len, rest)
} else {
return None;
};
let domain_len = parse_domain(rest, DomainRules::WwwOrHttp)?;
let max_end = scan_nonspace_non_lt(rest, domain_len);
let candidate = &s[..(scheme_len + max_end)];
let final_len = apply_extended_path_validation_len(candidate);
let label = &s[..final_len];
Some(Match {
len: label.len(),
href: label.to_string(),
})
}
fn strip_scheme_prefix<'a>(s: &'a str, scheme: &str) -> Option<(&'a str, usize)> {
let after_scheme = s.strip_prefix(scheme)?;
let rest = after_scheme.strip_prefix(SCHEME_SEPARATOR)?;
Some((rest, scheme.len() + SCHEME_SEPARATOR.len()))
}
fn match_email(s: &str) -> Option<Match> {
let email_len = parse_extended_email(s)?;
let label = &s[..email_len];
Some(Match {
len: label.len(),
href: format!("mailto:{}", label),
})
}
fn match_mailto(s: &str) -> Option<Match> {
let after = s.strip_prefix("mailto:")?;
let email_len = parse_extended_email(after)?;
let full_len = "mailto:".len() + email_len;
let label = &s[..full_len];
Some(Match {
len: label.len(),
href: label.to_string(),
})
}
fn match_xmpp(s: &str) -> Option<Match> {
let after = s.strip_prefix("xmpp:")?;
let email_len = parse_extended_email(after)?;
let mut full_len = "xmpp:".len() + email_len;
let remainder = &s[full_len..];
if let Some(rest) = remainder.strip_prefix('/') {
let resource_len = rest
.char_indices()
.take_while(|&(_, c)| c.is_ascii_alphanumeric() || c == '@' || c == '.')
.last()
.map(|(idx, c)| idx + c.len_utf8())
.unwrap_or(0);
if resource_len > 0 {
full_len += 1 + resource_len;
}
}
let label = &s[..full_len];
Some(Match {
len: label.len(),
href: label.to_string(),
})
}
fn boundary_ok(text: &str, idx: usize) -> bool {
if idx == 0 {
return true;
}
let prev = text[..idx].chars().next_back();
match prev {
Some(c) if c.is_whitespace() => true,
Some('*' | '_' | '~' | '(') => true,
_ => false,
}
}
fn find_first_valid_substring<F>(text: &str, needle: &str, mut validate: F) -> Option<usize>
where
F: FnMut(&str) -> bool,
{
let mut start = 0;
while start < text.len() {
let rel = text[start..].find(needle)?;
let idx = start + rel;
if validate(&text[idx..]) {
return Some(idx);
}
start = idx + needle.len();
}
None
}
fn find_first_valid_email_start(text: &str) -> Option<usize> {
let mut search_start = 0;
while search_start < text.len() {
let rel_at = text[search_start..].find('@')?;
let at_idx = search_start + rel_at;
let local_start = text[..at_idx]
.char_indices()
.rev()
.take_while(|&(_, c)| is_email_local_char(c))
.last()
.map(|(idx, _)| idx)
.unwrap_or(at_idx);
if local_start < at_idx {
if let Some(email_len) = parse_extended_email(&text[local_start..]) {
if local_start + email_len > at_idx {
return Some(local_start);
}
}
}
search_start = at_idx + 1;
}
None
}
fn is_email_local_char(c: char) -> bool {
c.is_ascii_alphanumeric() || matches!(c, '.' | '-' | '_' | '+')
}
fn is_email_domain_char(c: char) -> bool {
c.is_ascii_alphanumeric() || matches!(c, '-' | '_')
}
fn parse_extended_email(s: &str) -> Option<usize> {
let mut idx = 0;
let mut saw_local = false;
for (i, c) in s.char_indices() {
if is_email_local_char(c) {
idx = i + c.len_utf8();
saw_local = true;
continue;
}
if c == '@' {
idx = i;
break;
}
return None;
}
if !saw_local {
return None;
}
let rest = s.get(idx..)?;
let rest = rest.strip_prefix('@')?;
let domain_len = parse_email_domain(rest)?;
Some(idx + 1 + domain_len)
}
fn parse_email_domain(s: &str) -> Option<usize> {
let mut idx = 0;
let (seg_len, seg_has_underscore, seg_last_char) =
parse_domain_segment(s, DomainSegmentRules::Email)?;
idx += seg_len;
let mut segments = 1usize;
let mut last_seg_last_char = seg_last_char;
let mut _unused = seg_has_underscore;
while s[idx..].starts_with('.') {
let after_dot = &s[(idx + 1)..];
if !can_start_domain_segment(after_dot, DomainSegmentRules::Email) {
break;
}
let (seg_len, seg_has_underscore, seg_last_char) =
parse_domain_segment(after_dot, DomainSegmentRules::Email)?;
idx += 1 + seg_len;
segments += 1;
last_seg_last_char = seg_last_char;
_unused = seg_has_underscore;
}
if segments < 2 {
return None;
}
if matches!(last_seg_last_char, '-' | '_') {
return None;
}
Some(idx)
}
#[derive(Copy, Clone)]
enum DomainRules {
WwwOrHttp,
}
#[derive(Copy, Clone)]
enum DomainSegmentRules {
Email,
WwwOrHttp,
}
fn can_start_domain_segment(s: &str, rules: DomainSegmentRules) -> bool {
let Some(c) = s.chars().next() else {
return false;
};
match rules {
DomainSegmentRules::Email => is_email_domain_char(c),
DomainSegmentRules::WwwOrHttp => c.is_ascii_alphanumeric() || matches!(c, '-' | '_'),
}
}
fn parse_domain(s: &str, rules: DomainRules) -> Option<usize> {
let mut idx = 0;
let (seg_len, seg_has_underscore, _last_char) =
parse_domain_segment(s, DomainSegmentRules::WwwOrHttp)?;
idx += seg_len;
let mut segments = 1usize;
let mut prev_has_underscore: Option<bool> = None;
let mut last_has_underscore: bool = seg_has_underscore;
while s[idx..].starts_with('.') {
let after_dot = &s[(idx + 1)..];
if !can_start_domain_segment(after_dot, DomainSegmentRules::WwwOrHttp) {
break;
}
let (seg_len, seg_has_underscore, _last_char) =
parse_domain_segment(after_dot, DomainSegmentRules::WwwOrHttp)?;
idx += 1 + seg_len;
segments += 1;
prev_has_underscore = Some(last_has_underscore);
last_has_underscore = seg_has_underscore;
}
if segments < 2 {
return None;
}
match rules {
DomainRules::WwwOrHttp => {
if last_has_underscore {
return None;
}
if let Some(prev) = prev_has_underscore {
if prev {
return None;
}
}
}
}
Some(idx)
}
fn parse_domain_segment(s: &str, rules: DomainSegmentRules) -> Option<(usize, bool, char)> {
let mut len = 0usize;
let mut has_underscore = false;
let mut last_char: Option<char> = None;
for (i, c) in s.char_indices() {
let ok = match rules {
DomainSegmentRules::Email => is_email_domain_char(c),
DomainSegmentRules::WwwOrHttp => c.is_ascii_alphanumeric() || matches!(c, '-' | '_'),
};
if !ok {
break;
}
if c == '_' {
has_underscore = true;
}
len = i + c.len_utf8();
last_char = Some(c);
}
let last_char = last_char?;
Some((len, has_underscore, last_char))
}
fn scan_nonspace_non_lt(s: &str, start: usize) -> usize {
let mut end = start;
for (i, c) in s[start..].char_indices() {
if c.is_whitespace() || c == '<' {
break;
}
end = start + i + c.len_utf8();
}
end
}
fn apply_extended_path_validation_len(candidate: &str) -> usize {
let mut end = candidate.len();
while end > 0 {
let b = candidate.as_bytes()[end - 1];
let is_punct = matches!(
b,
b'?' | b'!' | b'.' | b',' | b':' | b'*' | b'_' | b'~' | b']'
);
if is_punct {
end -= 1;
} else {
break;
}
}
if end > 0 && candidate.as_bytes()[end - 1] == b')' {
let mut open = 0usize;
let mut close = 0usize;
for &b in candidate.as_bytes().iter().take(end) {
if b == b'(' {
open += 1;
} else if b == b')' {
close += 1;
}
}
while end > 0 && candidate.as_bytes()[end - 1] == b')' && close > open {
end -= 1;
close -= 1;
}
}
if end > 0 && candidate.as_bytes()[end - 1] == b';' {
let mut i = end - 1;
let mut saw_alnum = false;
while i > 0 {
let b = candidate.as_bytes()[i - 1];
if b.is_ascii_alphanumeric() {
saw_alnum = true;
i -= 1;
continue;
}
if b == b'&' {
if saw_alnum {
end = i - 1;
}
break;
}
break;
}
}
end
}
#[cfg(test)]
mod tests {
use super::*;
fn http_prefixed(url_without_scheme: &str) -> String {
format!("{}{}{}", HTTP_SCHEME, SCHEME_SEPARATOR, url_without_scheme)
}
#[test]
fn smoke_test_www_trailing_punctuation_trimmed() {
let s = "www.commonmark.org.";
let m = match_www(s).expect("should match");
assert_eq!(m.len, "www.commonmark.org".len());
assert_eq!(m.href, http_prefixed("www.commonmark.org"));
}
#[test]
fn smoke_test_email_plus_rules() {
assert!(match_email("hello+xyz@mail.example").is_some());
assert!(match_email("hello@mail+xyz.example").is_none());
}
#[test]
fn smoke_test_entity_suffix_trimmed() {
let s = "www.google.com/search?q=commonmark&hl;";
let m = match_www(s).expect("should match");
assert_eq!(m.len, "www.google.com/search?q=commonmark".len());
assert_eq!(m.href, http_prefixed("www.google.com/search?q=commonmark"));
}
#[test]
fn smoke_test_xmpp_resource_single_slash() {
let s = "xmpp:foo@bar.baz/txt/bin";
let m = match_xmpp(s).expect("should match");
assert_eq!(m.len, "xmpp:foo@bar.baz/txt".len());
}
#[test]
fn smoke_test_mailto_rejects_invalid_domain_endings() {
assert!(match_mailto("mailto:a.b-c_d@a.b-").is_none());
assert!(match_mailto("mailto:a.b-c_d@a.b_").is_none());
}
}