use crate::error::{Result, ScrapeError};
use psl::{List, Psl};
use tracing::debug;
use url::Url;
pub fn extract_etld_plus_one(url_str: &str) -> Result<String> {
let url = Url::parse(url_str)
.map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
let host = url
.host_str()
.ok_or_else(|| ScrapeError::InvalidUrl("No host in URL".to_string()))?;
match List.domain(host.as_bytes()) {
Some(domain) => {
let etld_plus_one = std::str::from_utf8(domain.as_bytes())
.map_err(|e| ScrapeError::InvalidUrl(format!("Invalid domain encoding: {}", e)))?
.to_string();
debug!("Extracted eTLD+1: {} from {}", etld_plus_one, url_str);
Ok(etld_plus_one)
}
None => {
debug!("PSL could not determine domain for {}, using host: {}", url_str, host);
Ok(host.to_string())
}
}
}
/// Returns the eTLD+1 of `url_str`, or its plain hostname if extraction fails.
///
/// [`extract_etld_plus_one`] is tried first. If it reports an error, the URL is
/// parsed again here and its host string is returned as-is.
///
/// # Errors
///
/// The fallback path repeats URL parsing and host lookup, so it returns
/// [`ScrapeError::InvalidUrl`] when `url_str` cannot be parsed or has no host.
pub fn extract_etld_plus_one_or_host(url_str: &str) -> Result<String> {
    // Fast path: the regular extraction succeeded.
    if let Ok(registrable) = extract_etld_plus_one(url_str) {
        return Ok(registrable);
    }

    // Fallback: ignore the extraction error and use the raw hostname.
    let parsed = Url::parse(url_str)
        .map_err(|e| ScrapeError::InvalidUrl(format!("Invalid URL: {}", e)))?;
    let hostname = parsed
        .host_str()
        .ok_or_else(|| ScrapeError::InvalidUrl("No host in URL".to_string()))?;
    debug!("eTLD+1 extraction failed, using hostname: {}", hostname);
    Ok(hostname.to_string())
}
/// Reports whether `domain` is, in its entirety, a public suffix.
///
/// The input is looked up in the Public Suffix List and the matched suffix is
/// compared against the whole input. Inputs for which the list yields no
/// suffix at all (such as the empty string), or whose suffix is only part of
/// the input, are not public suffixes and return `false`.
pub fn is_public_suffix(domain: &str) -> bool {
    let name = domain.as_bytes();
    // "No registrable domain" is not the same as "is a suffix": require the
    // matched suffix to span the complete input.
    matches!(List.suffix(name), Some(suffix) if suffix.as_bytes() == name)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extraction on a URL that is expected to succeed.
    fn etld_of(url: &str) -> String {
        extract_etld_plus_one(url).unwrap()
    }

    /// Ordinary `.com` hosts collapse to their registrable domain.
    #[test]
    fn test_extract_etld_basic() {
        let cases = [
            ("https://www.example.com/path", "example.com"),
            ("https://subdomain.example.com", "example.com"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(etld_of(input), expected);
        }
    }

    /// Multi-label public suffixes keep one label in front of the suffix.
    #[test]
    fn test_extract_etld_complex_tld() {
        let cases = [
            ("https://www.example.co.uk/path", "example.co.uk"),
            ("https://api.example.co.jp", "example.co.jp"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(etld_of(input), expected);
        }
    }

    /// A host directly under `github.io` is returned in full.
    #[test]
    fn test_extract_etld_public_suffix() {
        assert_eq!(etld_of("https://username.github.io"), "username.github.io");
    }

    /// IP-address hosts must not produce an error.
    #[test]
    fn test_extract_etld_ip_address() {
        assert!(extract_etld_plus_one("http://192.168.1.1/path").is_ok());
    }

    /// A single-label host is returned as-is, without the port.
    #[test]
    fn test_extract_etld_localhost() {
        let outcome = extract_etld_plus_one("http://localhost:8080");
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "localhost");
    }

    /// Registrable domains are not public suffixes.
    #[test]
    fn test_is_public_suffix() {
        for name in ["example.com", "google.co.uk"].iter() {
            assert!(!is_public_suffix(name));
        }
    }

    /// The fallback variant matches the plain extraction and tolerates odd hosts.
    #[test]
    fn test_extract_etld_or_host_fallback() {
        let regular = extract_etld_plus_one_or_host("https://www.example.com").unwrap();
        assert_eq!(regular, "example.com");

        let unusual = extract_etld_plus_one_or_host("https://weird-domain-123.local");
        assert!(unusual.is_ok());
    }
}