use serde::{Deserialize, Serialize};
/// Record describing one crawled page, going by the type and field names.
///
/// The code that populates this struct is not in this file, so the field
/// notes below are hedged wherever the meaning is inferred from a name only.
/// The serde derives make the record serializable and deserializable.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlResult {
/// URL of the page. NOTE(review): presumably the URL that was requested — confirm against the producer.
pub url: String,
/// Status code of the response; `is_success` treats values in `200..300` as success.
pub status: u16,
/// Textual content of the page. NOTE(review): possibly the output of `strip_html_tags` — confirm.
pub text: String,
/// Page title, or `None` when there is none. NOTE(review): possibly filled from `extract_title` — confirm.
pub title: Option<String>,
/// Links associated with the page. NOTE(review): possibly filled from `extract_links` — confirm.
pub links: Vec<String>,
/// Content type, if known. NOTE(review): presumably the `Content-Type` response header — confirm.
pub content_type: Option<String>,
/// Crawl duration in milliseconds, per the field name; the measured interval is not visible here.
pub crawl_time_ms: u64,
}
impl CrawlResult {
    /// Reports whether `status` falls in the success range.
    ///
    /// Returns `true` for every status from 200 up to and including 299,
    /// and `false` for anything else.
    pub fn is_success(&self) -> bool {
        matches!(self.status, 200..=299)
    }
}
/// Removes HTML tags from `html` and normalizes whitespace.
///
/// Every `<...>` sequence is replaced by a single space, the result is
/// trimmed, and each run of whitespace is collapsed into one space.
///
/// This is a purely textual transformation: the content between tags
/// (including the bodies of `<script>` and `<style>` elements) is kept, and
/// character entities such as `&amp;` are not decoded.
///
/// # Panics
///
/// Only if one of the constant patterns failed to compile, which would be a
/// programming error.
pub fn strip_html_tags(html: &str) -> String {
    // Compile each pattern once per process instead of on every call;
    // building a regex is far more expensive than running it.
    static TAG_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
    static WHITESPACE_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();

    let tag_re = TAG_RE.get_or_init(|| regex::Regex::new(r"<[^>]+>").expect("valid regex"));
    let whitespace_re =
        WHITESPACE_RE.get_or_init(|| regex::Regex::new(r"\s+").expect("valid regex"));

    // Tags become spaces (not nothing) so that adjacent elements such as
    // `<h1>Hello</h1><p>World</p>` do not fuse into `HelloWorld`.
    let without_tags = tag_re.replace_all(html, " ");
    whitespace_re
        .replace_all(without_tags.trim(), " ")
        .into_owned()
}
pub fn extract_title(html: &str) -> Option<String> {
let re = regex::Regex::new(r"(?i)<title[^>]*>(.*?)</title>").ok()?;
re.captures(html).map(|c| c[1].trim().to_string())
}
pub fn extract_links(html: &str, base_url: &str) -> Vec<String> {
let re = regex::Regex::new(r#"href\s*=\s*["']([^"']+)["']"#).expect("valid regex");
re.captures_iter(html)
.filter_map(|cap| {
let href = cap[1].trim();
if href.starts_with("http://") || href.starts_with("https://") {
Some(href.to_string())
} else if href.starts_with('/') {
let base = base_url.trim_end_matches('/');
if let Some(idx) = base.find("://") {
let rest = &base[idx + 3..];
if let Some(slash) = rest.find('/') {
let origin = &base[..idx + 3 + slash];
Some(format!("{}{}", origin, href))
} else {
Some(format!("{}{}", base, href))
}
} else {
None
}
} else {
None
}
})
.collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Text content survives tag stripping while the markup itself is removed.
    #[test]
    fn test_strip_html_tags() {
        let stripped = strip_html_tags("<html><body><h1>Hello</h1><p>World</p></body></html>");
        let keeps_words = stripped.contains("Hello") && stripped.contains("World");
        assert!(keeps_words);
        assert!(!stripped.contains("<h1>"));
    }

    /// A document with a `<title>` element yields its contents.
    #[test]
    fn test_extract_title() {
        let page = "<html><head><title>My Page</title></head><body></body></html>";
        let title = extract_title(page);
        assert_eq!(title.as_deref(), Some("My Page"));
    }

    /// A document without a `<title>` element yields `None`.
    #[test]
    fn test_extract_title_missing() {
        let page = "<html><head></head><body></body></html>";
        assert!(extract_title(page).is_none());
    }

    /// Absolute links pass through and root-relative links are resolved
    /// against the origin of the base URL.
    #[test]
    fn test_extract_links() {
        let page = r#"<a href="https://example.com/a">A</a><a href="/b">B</a>"#;
        let found = extract_links(page, "https://example.com/page");
        assert!(found.iter().any(|link| link == "https://example.com/a"));
        assert!(found.iter().any(|link| link == "https://example.com/b"));
    }

    /// `is_success` accepts a 200 response and rejects a 404 response.
    #[test]
    fn test_crawl_result_success() {
        let ok = CrawlResult {
            url: String::from("https://example.com"),
            status: 200,
            text: String::from("Hello"),
            title: Some(String::from("Test")),
            links: Vec::new(),
            content_type: Some(String::from("text/html")),
            crawl_time_ms: 42,
        };
        assert!(ok.is_success());

        let not_found = CrawlResult { status: 404, ..ok };
        assert!(!not_found.is_success());
    }
}