use std::sync::LazyLock;
use regex::Regex;
/// Finds `http://` / `https://` URLs embedded in free text.
///
/// A match starts at the (case-sensitive) scheme and extends over every
/// following character that is not whitespace and not one of
/// `<`, `>`, `"`, `'`, `)`, `]`. Other punctuation, such as a trailing `.`
/// or `,`, is not excluded by the character class and is therefore part of
/// the match. The regex is compiled lazily on first use.
static URL_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#"https?://[^\s<>"')\]]+"#).expect("URL_RE is a valid regex"));
/// Case-insensitive test for an image file extension at the end of the input.
///
/// Accepts `.jpg`, `.jpeg`, `.png`, `.gif`, `.webp` and `.bmp`, optionally
/// followed by a `?` and arbitrary trailing text (a query string).
static IMAGE_EXT_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?i)\.(jpe?g|png|gif|webp|bmp)(\?.*)?$").expect("IMAGE_EXT_RE is a valid regex")
});
/// Case-insensitive, fully anchored match for the hosts `i.imgur.com` and
/// `i.ibb.co`; `classify_host_and_path` treats any URL on these hosts as a
/// direct image.
static DIRECT_HOST_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?i)^i\.(imgur\.com|ibb\.co)$").expect("DIRECT_HOST_RE is a valid regex")
});
/// Case-insensitive match for a string that starts with `imgur.com/` or
/// `www.imgur.com/`. It is applied to a `host/path` string built in
/// `classify_host_and_path`, not to the full URL.
static IMGUR_PAGE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"(?i)^(www\.)?imgur\.com/").expect("IMGUR_PAGE_RE is a valid regex")
});
/// Case-insensitive match for a string that starts with `ibb.co/` or
/// `www.ibb.co/`. It is applied to a `host/path` string built in
/// `classify_host_and_path`, not to the full URL.
static IMGBB_PAGE_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?i)^(www\.)?ibb\.co/").expect("IMGBB_PAGE_RE is a valid regex"));
/// Category assigned to a URL by [`classify_url`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlType {
/// The path carries a recognised image file extension, or the host is
/// `i.imgur.com` / `i.ibb.co`.
DirectImage,
/// A URL on `imgur.com` or `www.imgur.com` that was not classified as a
/// direct image.
ImgurPage,
/// A URL on `ibb.co` or `www.ibb.co` that was not classified as a direct
/// image.
ImgbbPage,
/// Any other `http`/`https` URL.
GenericPage,
}
/// Result of classifying a single URL; produced by [`classify_url`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct UrlClassification {
/// The URL text exactly as it was handed to [`classify_url`].
pub url: String,
/// The category the URL was sorted into.
pub url_type: UrlType,
/// Percent-decoded last non-empty path segment (query and fragment
/// removed), or `None` when the path has no such segment.
pub title: Option<String>,
}
#[must_use]
pub fn extract_urls(text: &str) -> Vec<UrlClassification> {
URL_RE
.find_iter(text)
.filter_map(|m| classify_url(m.as_str()))
.collect()
}
/// Classifies a single URL.
///
/// Returns `None` unless `url` begins with `http://` or `https://`
/// (case-sensitive). Otherwise the URL is split into host and path, the
/// pair is categorised, and a title is derived from the path.
#[must_use]
pub fn classify_url(url: &str) -> Option<UrlClassification> {
    let has_http_scheme = ["http://", "https://"]
        .iter()
        .any(|&prefix| url.starts_with(prefix));
    if !has_http_scheme {
        return None;
    }

    let (host, path) = parse_host_and_path(url);
    Some(UrlClassification {
        url: url.to_owned(),
        url_type: classify_host_and_path(&host, &path),
        title: extract_title(&path),
    })
}
/// Decides the [`UrlType`] for an already-split URL.
///
/// `host` is the host name without port; `path` is everything after the
/// host and may still carry a query string and fragment.
///
/// Rules, first match wins:
/// 1. the path component ends in a known image extension → `DirectImage`
/// 2. the host is a known direct-image host → `DirectImage`
/// 3. `host/path` is an imgur page → `ImgurPage`
/// 4. `host/path` is an imgbb page → `ImgbbPage`
/// 5. anything else → `GenericPage`
fn classify_host_and_path(host: &str, path: &str) -> UrlType {
    // Only the path component may decide the extension. Matching against the
    // raw remainder would let a query value such as `/page?file=x.png` pass
    // for an image, and would miss `/photo.png#frag` because of the fragment.
    let path_only = path.split(['?', '#']).next().unwrap_or(path);
    if IMAGE_EXT_RE.is_match(path_only) {
        return UrlType::DirectImage;
    }
    if DIRECT_HOST_RE.is_match(host) {
        return UrlType::DirectImage;
    }

    // The page regexes expect exactly one `/` between host and path.
    let host_path = format!("{host}/{}", path.trim_start_matches('/'));
    if IMGUR_PAGE_RE.is_match(&host_path) {
        return UrlType::ImgurPage;
    }
    if IMGBB_PAGE_RE.is_match(&host_path) {
        return UrlType::ImgbbPage;
    }
    UrlType::GenericPage
}
/// Splits a URL into `(host, path)`.
///
/// * A leading `https://` or `http://` is removed when present.
/// * The authority ends at the first `/`, `?` or `#`, so a URL without a
///   path (`https://example.com?x=1`) does not leak its query or fragment
///   into the host.
/// * Userinfo (`user:pass@`) and a trailing `:port` are dropped from the
///   host. A bracketed IPv6 literal is kept intact, brackets included.
/// * The host is ASCII-lowercased.
/// * The returned path is the rest of the URL (query and fragment included)
///   and always starts with `/`; it is `"/"` when nothing follows the
///   authority.
fn parse_host_and_path(url: &str) -> (String, String) {
    let without_scheme = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);

    let (authority, rest) = match without_scheme.find(['/', '?', '#']) {
        Some(idx) => without_scheme.split_at(idx),
        None => (without_scheme, ""),
    };

    // Normalise so callers can rely on a leading slash even for
    // `host?query` and `host#fragment` forms.
    let path = if rest.starts_with('/') {
        rest.to_owned()
    } else {
        format!("/{rest}")
    };

    // Userinfo ends at the last `@` inside the authority.
    let host_port = authority.rsplit_once('@').map_or(authority, |(_, host)| host);

    let host = if host_port.starts_with('[') {
        // IPv6 literal: the colons inside the brackets are not port
        // separators, so cut right after the closing bracket instead.
        match host_port.find(']') {
            Some(end) => &host_port[..=end],
            None => host_port,
        }
    } else {
        host_port.rsplit_once(':').map_or(host_port, |(name, _port)| name)
    };

    (host.to_ascii_lowercase(), path)
}
/// Derives a display title from a URL path.
///
/// Everything from the first `?` or `#` onwards is ignored. The title is the
/// last non-empty `/`-separated segment of what remains, percent-decoded.
/// Returns `None` when there is no non-empty segment or the decoded segment
/// is empty.
fn extract_title(path: &str) -> Option<String> {
    // Cut at whichever of `?` / `#` comes first.
    let cut = path.find(['?', '#']).unwrap_or(path.len());
    let path_only = &path[..cut];

    let last_segment = path_only
        .split('/')
        .rev()
        .find(|part| !part.is_empty())?;

    Some(percent_decode_simple(last_segment)).filter(|title| !title.is_empty())
}
fn percent_decode_simple(input: &str) -> String {
let bytes = input.as_bytes();
let mut decoded: Vec<u8> = Vec::with_capacity(bytes.len());
let mut i = 0;
while i < bytes.len() {
if bytes[i] == b'%'
&& i + 2 < bytes.len()
&& let Some(byte) = decode_hex_pair(bytes[i + 1], bytes[i + 2])
{
decoded.push(byte);
i += 3;
continue;
}
decoded.push(bytes[i]);
i += 1;
}
String::from_utf8(decoded).unwrap_or_else(|_| input.to_owned())
}
/// Combines two ASCII hexadecimal digits into the byte they encode.
///
/// `hi` supplies the upper four bits and `lo` the lower four. Both upper-
/// and lower-case digits are accepted. Returns `None` if either input is not
/// a hexadecimal digit.
const fn decode_hex_pair(hi: u8, lo: u8) -> Option<u8> {
    match (hex_digit_value(hi), hex_digit_value(lo)) {
        (Some(upper), Some(lower)) => Some((upper << 4) | lower),
        _ => None,
    }
}

/// Numeric value (0–15) of one ASCII hexadecimal digit, or `None`.
const fn hex_digit_value(digit: u8) -> Option<u8> {
    match digit {
        b'0'..=b'9' => Some(digit - b'0'),
        b'a'..=b'f' => Some(digit - b'a' + 10),
        b'A'..=b'F' => Some(digit - b'A' + 10),
        _ => None,
    }
}
// Unit tests for URL extraction, classification, host/path parsing and
// percent-decoding.
#[cfg(test)]
mod tests {
use super::*;
// --- DirectImage: recognised by file extension -------------------------
#[test]
fn direct_image_by_jpg_extension() {
let c = classify_url("https://example.com/photo.jpg").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_by_jpeg_extension() {
let c = classify_url("https://example.com/photo.jpeg").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_by_png_extension() {
let c = classify_url("https://example.com/photo.png").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_by_gif_extension() {
let c = classify_url("https://example.com/anim.gif").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_by_webp_extension() {
let c = classify_url("https://example.com/pic.webp").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_by_bmp_extension() {
let c = classify_url("https://example.com/old.bmp").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_extension_case_insensitive() {
let c = classify_url("https://example.com/PHOTO.JPG").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_extension_with_query_string() {
let c = classify_url("https://cdn.example.com/img.png?width=800").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
// --- DirectImage: recognised by host -----------------------------------
#[test]
fn direct_image_i_imgur_host() {
let c = classify_url("https://i.imgur.com/abc123.png").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_i_imgur_no_extension() {
let c = classify_url("https://i.imgur.com/abc123").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_i_ibb_host() {
let c = classify_url("https://i.ibb.co/xyz789.jpg").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
#[test]
fn direct_image_i_imgur_host_case_insensitive() {
let c = classify_url("https://I.IMGUR.COM/abc123").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
// --- Imgur / ImgBB page URLs --------------------------------------------
#[test]
fn imgur_page_gallery() {
let c = classify_url("https://imgur.com/gallery/abc").unwrap();
assert_eq!(c.url_type, UrlType::ImgurPage);
}
#[test]
fn imgur_page_www() {
let c = classify_url("https://www.imgur.com/gallery/abc").unwrap();
assert_eq!(c.url_type, UrlType::ImgurPage);
}
#[test]
fn imgur_page_single_image() {
let c = classify_url("https://imgur.com/abc123").unwrap();
assert_eq!(c.url_type, UrlType::ImgurPage);
}
#[test]
fn imgbb_page() {
let c = classify_url("https://ibb.co/abc123").unwrap();
assert_eq!(c.url_type, UrlType::ImgbbPage);
}
#[test]
fn imgbb_page_www() {
let c = classify_url("https://www.ibb.co/abc123").unwrap();
assert_eq!(c.url_type, UrlType::ImgbbPage);
}
// --- Generic pages -------------------------------------------------------
#[test]
fn generic_page() {
let c = classify_url("https://example.com/page").unwrap();
assert_eq!(c.url_type, UrlType::GenericPage);
}
#[test]
fn generic_page_with_path() {
let c = classify_url("https://news.ycombinator.com/item?id=12345").unwrap();
assert_eq!(c.url_type, UrlType::GenericPage);
}
// --- Scheme filtering: only http and https are classified ---------------
#[test]
fn non_http_ftp_returns_none() {
assert!(classify_url("ftp://example.com/file.jpg").is_none());
}
#[test]
fn non_http_irc_returns_none() {
assert!(classify_url("irc://irc.libera.chat/rust").is_none());
}
#[test]
fn non_http_mailto_returns_none() {
assert!(classify_url("mailto:user@example.com").is_none());
}
#[test]
fn http_scheme_accepted() {
let c = classify_url("http://example.com/photo.png").unwrap();
assert_eq!(c.url_type, UrlType::DirectImage);
}
// --- Title extraction ----------------------------------------------------
#[test]
fn title_from_filename() {
let c = classify_url("https://i.imgur.com/abc123.png").unwrap();
assert_eq!(c.title.as_deref(), Some("abc123.png"));
}
#[test]
fn title_from_deep_path() {
let c = classify_url("https://cdn.example.com/images/2024/sunset.jpg").unwrap();
assert_eq!(c.title.as_deref(), Some("sunset.jpg"));
}
#[test]
fn title_strips_query_string() {
let c = classify_url("https://cdn.example.com/img.png?width=800&quality=90").unwrap();
assert_eq!(c.title.as_deref(), Some("img.png"));
}
#[test]
fn title_strips_fragment() {
let c = classify_url("https://example.com/page#section").unwrap();
assert_eq!(c.title.as_deref(), Some("page"));
}
#[test]
fn title_from_gallery_path() {
let c = classify_url("https://imgur.com/gallery/abc").unwrap();
assert_eq!(c.title.as_deref(), Some("abc"));
}
#[test]
fn title_none_for_root_path() {
let c = classify_url("https://example.com").unwrap();
assert_eq!(c.title, None);
}
#[test]
fn title_none_for_trailing_slash() {
let c = classify_url("https://example.com/").unwrap();
assert_eq!(c.title, None);
}
#[test]
fn title_percent_decoded() {
let c = classify_url("https://example.com/my%20photo.jpg").unwrap();
assert_eq!(c.title.as_deref(), Some("my photo.jpg"));
}
// --- Extraction from free text -------------------------------------------
#[test]
fn extract_single_url_from_text() {
let results = extract_urls("check out https://example.com/pic.png please");
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://example.com/pic.png");
assert_eq!(results[0].url_type, UrlType::DirectImage);
}
#[test]
fn extract_multiple_urls() {
let text = "look at https://i.imgur.com/a.png and https://example.com/page";
let results = extract_urls(text);
assert_eq!(results.len(), 2);
assert_eq!(results[0].url_type, UrlType::DirectImage);
assert_eq!(results[1].url_type, UrlType::GenericPage);
}
#[test]
fn extract_no_urls() {
let results = extract_urls("just a normal message with no links");
assert!(results.is_empty());
}
// NOTE(review): this test pins only the number of matches; it does not
// assert the extracted URL text or its classification.
#[test]
fn extract_url_at_end_of_sentence() {
let results = extract_urls("visit https://example.com/photo.jpg.");
assert_eq!(results.len(), 1);
}
#[test]
fn extract_url_in_angle_brackets() {
let results = extract_urls("link: <https://example.com/pic.png>");
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://example.com/pic.png");
assert_eq!(results[0].url_type, UrlType::DirectImage);
}
#[test]
fn extract_url_in_parentheses() {
let results = extract_urls("(https://example.com/pic.png)");
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://example.com/pic.png");
}
#[test]
fn extract_url_surrounded_by_quotes() {
let results = extract_urls(r#""https://example.com/pic.png""#);
assert_eq!(results.len(), 1);
assert_eq!(results[0].url, "https://example.com/pic.png");
}
// --- Host / path parsing -------------------------------------------------
#[test]
fn parse_host_strips_port() {
let (host, path) = parse_host_and_path("https://example.com:8080/path");
assert_eq!(host, "example.com");
assert_eq!(path, "/path");
}
#[test]
fn parse_host_lowercases() {
let (host, _) = parse_host_and_path("https://EXAMPLE.COM/Path");
assert_eq!(host, "example.com");
}
#[test]
fn parse_host_no_path() {
let (host, path) = parse_host_and_path("https://example.com");
assert_eq!(host, "example.com");
assert_eq!(path, "/");
}
// --- Percent-decoding ----------------------------------------------------
#[test]
fn percent_decode_space() {
assert_eq!(percent_decode_simple("hello%20world"), "hello world");
}
#[test]
fn percent_decode_multiple() {
assert_eq!(percent_decode_simple("%48%65%6C%6Co"), "Hello");
}
// A `%` followed by non-hex characters is passed through untouched.
#[test]
fn percent_decode_invalid_sequence_left_as_is() {
assert_eq!(percent_decode_simple("100%zz"), "100%zz");
}
// A `%` with fewer than two characters after it is passed through untouched.
#[test]
fn percent_decode_truncated_at_end() {
assert_eq!(percent_decode_simple("test%2"), "test%2");
}
#[test]
fn decode_hex_pair_valid() {
assert_eq!(decode_hex_pair(b'4', b'1'), Some(0x41)); }
#[test]
fn decode_hex_pair_invalid() {
assert_eq!(decode_hex_pair(b'G', b'0'), None);
}
}