/// A hyperlink located inside a larger piece of text.
///
/// `find_hyperlinks` fills `start` and `end` with byte offsets into the text it
/// scanned and `url` with an owned copy of the text between them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HyperlinkSpan {
    /// Byte offset at which the link begins.
    pub start: usize,
    /// Byte offset one past the link's last byte (exclusive bound).
    pub end: usize,
    /// The link text itself.
    pub url: String,
}
/// Characters treated as trailing punctuation when trimming the end of a
/// detected URL: sentence punctuation, closing brackets and quote marks.
const URL_TERMINATORS: &[char] = &[
    '.', ',', '!', '?', ';', ':', // sentence punctuation
    ')', ']', '}', // closing brackets
    '\'', '"', // quotes
];
/// Scans `text` from left to right and returns every hyperlink found, in order
/// of appearance.
///
/// A candidate begins wherever `try_match_prefix` recognises a link prefix,
/// runs to the boundary reported by `extend_url`, and then has its trailing
/// punctuation removed by `strip_trailing_punct`. A candidate that has nothing
/// left after its prefix (a bare `http://`, or `www.` trimmed down to `www`)
/// is not a link and is skipped.
///
/// The `start`/`end` of each returned span are byte offsets into `text`, so
/// `&text[span.start..span.end]` equals `span.url`. Spans never overlap.
pub fn find_hyperlinks(text: &str) -> Vec<HyperlinkSpan> {
    let bytes = text.as_bytes();
    let mut spans = Vec::new();
    let mut i = 0usize;
    while i < bytes.len() {
        if let Some(prefix_end) = try_match_prefix(text, i) {
            let candidate = &text[i..extend_url(text, i, prefix_end)];
            let url = strip_trailing_punct(candidate);
            let url_end = i + url.len();
            // Require at least one byte beyond the prefix. Comparing against
            // `i` alone would accept "http://" and the stripped-down "www".
            if url_end > prefix_end {
                spans.push(HyperlinkSpan {
                    start: i,
                    end: url_end,
                    url: url.to_owned(),
                });
                // Resume right after the link so its interior is not rescanned.
                i = url_end;
                continue;
            }
        }
        // No link here: step over exactly one character to stay on a char boundary.
        i += char_len_at(bytes, i);
    }
    spans
}
/// Checks whether a recognised link prefix (`https://`, `http://` or `www.`)
/// begins at byte offset `pos` of `text`.
///
/// Returns the byte offset just past the prefix, or `None` when no prefix
/// starts at `pos`. `None` is also returned, instead of panicking, when `pos`
/// lies beyond the end of `text` or inside a multi-byte character.
fn try_match_prefix(text: &str, pos: usize) -> Option<usize> {
    const PREFIXES: [&str; 3] = ["https://", "http://", "www."];
    // Checked slicing: an invalid offset simply means "no prefix here".
    let rest = text.get(pos..)?;
    PREFIXES
        .iter()
        .find(|prefix| rest.starts_with(**prefix))
        .map(|prefix| pos + prefix.len())
}
/// Finds where the URL candidate beginning at byte offset `start` stops.
///
/// The candidate runs up to the first whitespace character, or to the end of
/// `text` when there is none. Whitespace is judged with `char::is_whitespace`,
/// so a no-break space or an ideographic space ends the URL just as an ASCII
/// space does.
///
/// `_prefix_end` is accepted for signature compatibility but is not used; the
/// scan starts at `start`.
///
/// Returns the end of the candidate as a byte offset into `text`.
///
/// # Panics
///
/// Panics if `start` is greater than `text.len()` or does not fall on a char
/// boundary.
fn extend_url(text: &str, start: usize, _prefix_end: usize) -> usize {
    let rest = &text[start..];
    // `find` reports an offset relative to `rest`; shift it back into `text`.
    let candidate_len = rest.find(char::is_whitespace).unwrap_or(rest.len());
    start + candidate_len
}
/// Removes trailing punctuation from a URL candidate and returns the remaining
/// prefix of `url`.
///
/// Characters listed in `URL_TERMINATORS` are dropped from the end one at a
/// time until a character that is not listed is reached. A closing bracket
/// (`)`, `]`, `}`) is kept, and trimming stops, when the URL seen so far holds
/// at least as many matching opening brackets as closing ones. This keeps
/// `https://en.wikipedia.org/wiki/Rust_(programming_language)` and
/// `http://[::1]` intact, while the unmatched `)` in `(see https://a.com)` is
/// still removed.
fn strip_trailing_punct(url: &str) -> &str {
    let mut kept = url;
    while let Some(last) = kept.chars().next_back() {
        if !URL_TERMINATORS.contains(&last) || closes_matched_bracket(kept, last) {
            break;
        }
        kept = &kept[..kept.len() - last.len_utf8()];
    }
    kept
}

/// Reports whether `closer` is a closing bracket that is balanced by an
/// opening bracket of the same kind somewhere in `url`.
fn closes_matched_bracket(url: &str, closer: char) -> bool {
    let opener = match closer {
        ')' => '(',
        ']' => '[',
        '}' => '{',
        _ => return false,
    };
    // More closers than openers means the trailing one belongs to the
    // surrounding prose rather than to the URL.
    url.matches(closer).count() <= url.matches(opener).count()
}
/// Returns the number of bytes to step over for the byte at `bytes[pos]`,
/// chosen from that byte's value alone.
///
/// Bytes below `0xC0` (ASCII and the `0x80..=0xBF` range) give 1, bytes below
/// `0xE0` give 2, bytes below `0xF0` give 3, and everything else gives 4.
///
/// # Panics
///
/// Panics if `pos` is out of bounds for `bytes`.
fn char_len_at(bytes: &[u8], pos: usize) -> usize {
    match bytes[pos] {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An `https://` link in the middle of a sentence is detected.
    #[test]
    fn hyperlink_finds_https() {
        let found = find_hyperlinks("visit https://example.com today");
        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert!(link.url.starts_with("https://"));
    }

    /// An `http://` link with a path is detected.
    #[test]
    fn hyperlink_finds_http() {
        let found = find_hyperlinks("see http://example.com/path");
        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert!(link.url.starts_with("http://"));
    }

    /// A scheme-less `www.` link is detected.
    #[test]
    fn hyperlink_finds_www() {
        let found = find_hyperlinks("see www.example.com");
        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert!(link.url.starts_with("www."));
    }

    /// Text without any link prefix produces no spans.
    #[test]
    fn hyperlink_ignores_plain_text() {
        let found = find_hyperlinks("hello world");
        assert!(found.is_empty());
    }

    /// Two links separated by other words are both reported.
    #[test]
    fn hyperlink_multiple_urls() {
        let found = find_hyperlinks("a https://a.com b http://b.com c");
        assert_eq!(found.len(), 2);
    }

    /// A sentence-ending dot is not part of the reported URL.
    #[test]
    fn hyperlink_strips_trailing_punctuation() {
        let found = find_hyperlinks("visit https://example.com.");
        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert!(
            !link.url.ends_with('.'),
            "trailing dot must be stripped"
        );
    }

    /// `start..end` slices the input to exactly the reported URL.
    #[test]
    fn hyperlink_correct_byte_offsets() {
        let text = "x https://example.com y";
        let found = find_hyperlinks(text);
        assert_eq!(found.len(), 1);
        let HyperlinkSpan { start, end, url } = &found[0];
        assert_eq!(&text[*start..*end], url.as_str());
    }

    /// Empty input yields no spans.
    #[test]
    fn hyperlink_empty_string() {
        let found = find_hyperlinks("");
        assert!(found.is_empty());
    }
}