use std::collections::HashMap;
/// Text content resolved from a URL, plus descriptive metadata and
/// the URL it was fetched from.
#[derive(Debug, Clone)]
// NOTE(review): `dead_code` presumably silenced because some fields are
// read only by code outside this file — confirm before removing.
#[allow(dead_code)]
pub struct ResolvedContent {
    /// Extracted text body.
    pub text: String,
    /// Free-form key/value metadata describing the resolution
    /// (e.g. `source`, `content-type`, `title`).
    pub metadata: HashMap<String, String>,
    /// The URL this content was resolved from.
    pub source_url: String,
}
/// A strategy for turning a URL into [`ResolvedContent`].
///
/// Implementors advertise which URLs they handle via [`Self::can_resolve`]
/// and perform the actual fetch/extraction in [`Self::resolve`].
pub trait UrlResolver: std::fmt::Debug {
    /// Returns `true` if this resolver knows how to handle `url`.
    fn can_resolve(&self, url: &str) -> bool;
    /// Resolves `url` to content, or a human-readable error string on failure.
    fn resolve(&self, url: &str) -> Result<ResolvedContent, String>;
}
/// Resolver for `http://` and `https://` URLs, fetched with `ureq`.
#[derive(Debug, Default)]
pub struct HttpResolver;
impl HttpResolver {
#[must_use]
pub fn new() -> Self {
Self
}
}
impl UrlResolver for HttpResolver {
    /// Matches plain lowercase `http://` / `https://` URLs only
    /// (scheme matching is intentionally case-sensitive).
    fn can_resolve(&self, url: &str) -> bool {
        url.starts_with("http://") || url.starts_with("https://")
    }

    /// Fetches `url` over HTTP (60s timeout) and returns its text.
    ///
    /// HTML bodies are run through `deformat`'s readability extraction
    /// (recording extractor, title, and fallback status in `metadata`);
    /// anything else is returned verbatim as `text/plain`.
    ///
    /// # Errors
    ///
    /// Returns a human-readable error string when the request fails at
    /// the network level, the server answers with a non-2xx status, or
    /// the response body cannot be read.
    fn resolve(&self, url: &str) -> Result<ResolvedContent, String> {
        let response = ureq::get(url)
            .timeout(std::time::Duration::from_secs(60))
            .call()
            .map_err(|e| {
                format!(
                    "Network error fetching {}: {}. \
                     Check your internet connection and try again.",
                    url, e
                )
            })?;
        // Accept the whole 2xx success range rather than only 200:
        // servers may legitimately answer 201/203/206 with a usable
        // body, and the old `!= 200` check rejected those. Non-2xx
        // statuses that survive `.call()` still fall through to the
        // error below.
        if !(200..=299).contains(&response.status()) {
            return Err(format!(
                "HTTP {} fetching {}. \
                 Server returned error status. \
                 URL may be temporarily unavailable or changed.",
                response.status(),
                url
            ));
        }
        let content = response.into_string().map_err(|e| {
            format!(
                "Failed to read response from {}: {}. \
                 Response may be too large or corrupted.",
                url, e
            )
        })?;
        let mut metadata = HashMap::new();
        metadata.insert("source".to_string(), "http".to_string());
        let text = if deformat::detect::is_html(&content) {
            metadata.insert("content-type".to_string(), "text/html".to_string());
            let result = deformat::extract_readable(&content, Some(url));
            metadata.insert("extractor".to_string(), result.extractor.to_string());
            if let Some(title) = &result.title {
                metadata.insert("title".to_string(), title.clone());
            }
            if result.fallback {
                metadata.insert("fallback".to_string(), "true".to_string());
            }
            result.text
        } else {
            metadata.insert("content-type".to_string(), "text/plain".to_string());
            content
        };
        Ok(ResolvedContent {
            text,
            metadata,
            source_url: url.to_string(),
        })
    }
}
/// Dispatches URL resolution to the first registered resolver that
/// claims the URL.
pub struct CompositeResolver {
    // Consulted in registration order; first match wins.
    resolvers: Vec<Box<dyn UrlResolver>>,
}
// Hand-written `Debug`: report only how many resolvers are registered
// instead of recursing into each boxed resolver.
impl std::fmt::Debug for CompositeResolver {
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let count = self.resolvers.len();
        formatter
            .debug_struct("CompositeResolver")
            .field("resolver_count", &count)
            .finish()
    }
}
impl CompositeResolver {
    /// Builds a composite preloaded with the default resolver set
    /// (currently just [`HttpResolver`]).
    #[must_use]
    pub fn new() -> Self {
        let mut resolvers: Vec<Box<dyn UrlResolver>> = Vec::with_capacity(1);
        resolvers.push(Box::new(HttpResolver::new()));
        Self { resolvers }
    }
}
impl Default for CompositeResolver {
fn default() -> Self {
Self::new()
}
}
impl UrlResolver for CompositeResolver {
    /// True when at least one registered resolver accepts `url`.
    fn can_resolve(&self, url: &str) -> bool {
        self.resolvers
            .iter()
            .any(|resolver| resolver.can_resolve(url))
    }

    /// Delegates to the first resolver claiming `url`.
    ///
    /// # Errors
    ///
    /// Returns an error string when no registered resolver accepts `url`,
    /// or propagates the chosen resolver's own error.
    fn resolve(&self, url: &str) -> Result<ResolvedContent, String> {
        match self.resolvers.iter().find(|r| r.can_resolve(url)) {
            Some(resolver) => resolver.resolve(url),
            None => Err(format!("No resolver available for URL: {}", url)),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // `can_resolve` accepts both schemes, with paths, query strings,
    // and subdomains.
    #[test]
    fn http_resolver_can_resolve_http() {
        let resolver = HttpResolver::new();
        assert!(resolver.can_resolve("http://example.com"));
        assert!(resolver.can_resolve("https://example.com"));
        assert!(resolver.can_resolve("http://example.com/path?query=1"));
        assert!(resolver.can_resolve("https://subdomain.example.com/path"));
    }

    // Scheme matching is intentionally case-sensitive: uppercase
    // schemes are rejected.
    #[test]
    fn http_resolver_case_sensitive() {
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("HTTP://example.com"));
        assert!(!resolver.can_resolve("HTTPS://example.com"));
    }

    // Non-HTTP schemes and non-URLs are rejected.
    #[test]
    fn http_resolver_cannot_resolve_other_schemes() {
        let resolver = HttpResolver::new();
        assert!(!resolver.can_resolve("ftp://example.com"));
        assert!(!resolver.can_resolve("file:///path/to/file"));
        assert!(!resolver.can_resolve("mailto:test@example.com"));
        assert!(!resolver.can_resolve("not_a_url"));
    }

    // Plain field round-trip on the data struct.
    #[test]
    fn resolved_content_struct() {
        let content = ResolvedContent {
            text: "Hello world".to_string(),
            metadata: HashMap::new(),
            source_url: "https://example.com".to_string(),
        };
        assert_eq!(content.text, "Hello world");
        assert!(content.metadata.is_empty());
        assert_eq!(content.source_url, "https://example.com");
    }

    // A fresh composite includes the HTTP resolver by default.
    #[test]
    fn composite_resolver_creation() {
        let resolver = CompositeResolver::new();
        assert!(resolver.can_resolve("https://example.com"));
    }

    // `Default` must behave the same as `new`.
    #[test]
    fn composite_resolver_default() {
        let resolver = CompositeResolver::default();
        assert!(resolver.can_resolve("http://example.com"));
    }

    // Unknown schemes are not claimed by any default resolver.
    #[test]
    fn composite_resolver_cannot_resolve_unknown() {
        let resolver = CompositeResolver::new();
        assert!(!resolver.can_resolve("custom://unknown"));
    }

    // With an empty resolver list, `resolve` returns the
    // "No resolver available" error rather than panicking.
    #[test]
    fn composite_resolver_no_matching_resolver() {
        let resolver = CompositeResolver { resolvers: vec![] };
        let result = resolver.resolve("any://url");
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("No resolver available"));
    }

    // The manual Debug impl prints the type name and resolver count.
    #[test]
    fn composite_resolver_debug() {
        let resolver = CompositeResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("CompositeResolver"));
        assert!(debug.contains("resolver_count"));
    }

    // The derived Debug impl on the unit struct prints its name.
    #[test]
    fn http_resolver_debug() {
        let resolver = HttpResolver::new();
        let debug = format!("{:?}", resolver);
        assert!(debug.contains("HttpResolver"));
    }

    // `Clone` deep-copies text, URL, and metadata map.
    #[test]
    fn resolved_content_clone() {
        let mut metadata = HashMap::new();
        metadata.insert("key".to_string(), "value".to_string());
        let content = ResolvedContent {
            text: "test".to_string(),
            metadata,
            source_url: "http://test.com".to_string(),
        };
        let cloned = content.clone();
        assert_eq!(content.text, cloned.text);
        assert_eq!(content.source_url, cloned.source_url);
        assert_eq!(content.metadata, cloned.metadata);
    }

    // A realistic article page: readability should strip nav/footer
    // chrome but keep the article body and title.
    #[test]
    fn readability_extracts_article_text() {
        let html = r#"<!DOCTYPE html>
<html><head><title>Breaking News: Scientists Discover New Species</title></head>
<body>
<nav><a href="/">Home</a><a href="/about">About</a><a href="/contact">Contact</a></nav>
<div id="content">
<h1>Breaking News: Scientists Discover New Species</h1>
<p>A team of researchers at the University of Cambridge has announced
the discovery of a previously unknown species of beetle in the
Amazon rainforest. The discovery was published in the journal
Nature on March 15, 2026. The finding represents one of the most
significant entomological discoveries in the region in recent years,
and has drawn attention from conservation organizations worldwide.</p>
<p>Lead researcher Dr. Sarah Chen said the species, named Chrysina
amazonica, was found during an expedition in January near the city
of Manaus. The beetle has unique iridescent markings that distinguish
it from related species in the genus. Chen and her team spent three
weeks collecting specimens and documenting the habitat conditions
where the species was found along tributary streams of the Amazon River.</p>
<p>The Amazon rainforest continues to yield new discoveries despite
decades of intensive exploration by international research teams.
Conservation groups including the World Wildlife Fund and the
International Union for Conservation of Nature have called for
increased protection of the region. Brazil's Environment Ministry
said it would review the protected area boundaries in light of
the new findings. Local indigenous communities have long known
about the beetle but it had never been formally described by science.</p>
<p>The research was funded by a grant from the European Research Council
and the National Geographic Society. Additional specimens will be
housed at the Natural History Museum in London and the Smithsonian
Institution in Washington, D.C. Future expeditions are planned for
July 2026 to search for related species in neighboring regions of
Peru and Colombia.</p>
</div>
<footer><p>Copyright 2026 News Corp. All rights reserved.</p></footer>
</body></html>"#;
        let result = deformat::html::extract_with_readability(html, "https://example.com/article");
        assert!(result.is_some(), "readability should extract article text");
        let (text, title, _excerpt) = result.unwrap();
        assert!(
            text.contains("Dr. Sarah Chen"),
            "should contain person name, got: {}",
            text
        );
        assert!(
            text.contains("University of Cambridge"),
            "should contain org, got: {}",
            text
        );
        assert!(title.is_some(), "should extract title");
    }

    // Trivial pages below the extractor's minimum-content threshold
    // yield None rather than junk text.
    #[test]
    fn readability_returns_none_for_minimal_html() {
        let html = "<html><body><p>Hi</p></body></html>";
        let result = deformat::html::extract_with_readability(html, "https://example.com");
        assert!(
            result.is_none(),
            "should return None for trivial HTML (<50 chars)"
        );
    }

    // Empty input must not be treated as extractable content.
    #[test]
    fn readability_returns_none_for_empty_html() {
        let result = deformat::html::extract_with_readability("", "https://example.com");
        assert!(result.is_none(), "should return None for empty HTML");
    }

    // Pure navigation chrome with no article body yields None.
    #[test]
    fn readability_returns_none_for_nav_only_page() {
        let html = r#"<html><body>
<nav><a href="/">Home</a><a href="/about">About</a></nav>
</body></html>"#;
        let result = deformat::html::extract_with_readability(html, "https://example.com");
        assert!(result.is_none(), "should return None for nav-only page");
    }
}