#![cfg(feature = "crawler")]
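//! Feature-gated web crawler built on the [`spider`] crate, with
//! regex-based helpers for extracting titles, metadata, text, and links.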
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
pub use spider;
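/// Tunable crawl behavior; see [`CrawlerConfig::polite`] and
/// [`CrawlerConfig::aggressive`] for presets.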
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerConfig {
pub max_concurrent: usize,
pub timeout_secs: u64,
pub delay_ms: u64,
pub max_depth: usize,
pub max_pages_per_domain: usize,
pub user_agent: String,
pub respect_robots_txt: bool,
pub smart_mode: bool,
pub follow_redirects: bool,
pub max_redirects: usize,
}
impl Default for CrawlerConfig {
fn default() -> Self {
Self {
max_concurrent: 10,
timeout_secs: 30,
delay_ms: 100,
max_depth: 3,
max_pages_per_domain: 1000,
user_agent: "ReasonKit-Web/0.1 (+https://reasonkit.sh)".to_string(),
respect_robots_txt: true,
smart_mode: true,
follow_redirects: true,
max_redirects: 5,
}
}
}
impl CrawlerConfig {
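    /// Conservative preset: low concurrency and a 1s delay between requests.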
pub fn polite() -> Self {
Self {
max_concurrent: 2,
delay_ms: 1000,
..Default::default()
}
}
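    /// High-throughput preset. Note that this disables `robots.txt`
    /// handling; use it only against hosts you control.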
pub fn aggressive() -> Self {
Self {
max_concurrent: 50,
delay_ms: 10,
respect_robots_txt: false,
..Default::default()
}
}
}
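/// A single fetched page plus everything extracted from it.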
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawledPage {
pub url: String,
pub status: u16,
pub title: Option<String>,
pub description: Option<String>,
pub content: String,
pub html: String,
pub links: Vec<ExtractedLink>,
pub metadata: PageMetadata,
pub crawled_at: chrono::DateTime<chrono::Utc>,
}
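/// A hyperlink found in a page, resolved against the page URL.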
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExtractedLink {
pub url: String,
pub text: Option<String>,
pub rel: Option<String>,
pub is_internal: bool,
}
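/// Header- and meta-derived page attributes. The crawl methods currently
/// fill this with defaults; populating it is left to callers.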
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct PageMetadata {
pub content_type: Option<String>,
pub content_length: Option<usize>,
pub last_modified: Option<String>,
pub etag: Option<String>,
pub language: Option<String>,
pub keywords: Vec<String>,
pub author: Option<String>,
pub canonical_url: Option<String>,
}
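/// Aggregate counters for a crawl run; not yet populated by [`WebCrawler`].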
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CrawlStats {
pub pages_crawled: usize,
pub pages_failed: usize,
pub bytes_downloaded: usize,
pub links_found: usize,
pub duration_ms: u64,
}
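/// Crawler that wraps [`spider`] and tracks visited URLs across calls.
///
/// A minimal usage sketch (marked `ignore`: it needs network access, and the
/// surrounding async setup is assumed rather than shown):
///
/// ```ignore
/// let mut crawler = WebCrawler::new(CrawlerConfig::polite());
/// let page = crawler.crawl_url("https://example.com").await?;
/// println!("{}: {} links found", page.url, page.links.len());
/// ```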
pub struct WebCrawler {
config: CrawlerConfig,
visited: HashSet<String>,
}
impl WebCrawler {
pub fn new(config: CrawlerConfig) -> Self {
Self {
config,
visited: HashSet::new(),
}
}
pub fn config(&self) -> &CrawlerConfig {
&self.config
}
pub fn is_visited(&self, url: &str) -> bool {
self.visited.contains(url)
}
pub fn mark_visited(&mut self, url: &str) {
self.visited.insert(url.to_string());
}
pub fn visited_count(&self) -> usize {
self.visited.len()
}
pub fn clear_visited(&mut self) {
self.visited.clear();
}
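    /// Fetches a single page (`with_limit(1)`) and extracts its content.
    ///
    /// Only `respect_robots_txt` is forwarded to spider here; the timeout,
    /// delay, and redirect settings in [`CrawlerConfig`] are not yet applied.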
pub async fn crawl_url(&mut self, url: &str) -> Result<CrawledPage> {
use spider::website::Website;
let mut website = Website::new(url);
website.with_limit(1);
if !self.config.respect_robots_txt {
website.with_respect_robots_txt(false);
}
website.crawl().await;
        let pages = website.get_pages().context("spider returned no pages")?;
if let Some(page) = pages.first() {
self.mark_visited(url);
let html = page.get_html();
let content = self.extract_content(&html);
let links = self.extract_links(&html, url);
let title = self.extract_title(&html);
let description = self.extract_meta(&html, "description");
Ok(CrawledPage {
url: url.to_string(),
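                // NOTE: spider does not surface the response status here, so
                // a successful fetch (200) is assumed.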
status: 200,
title,
description,
content,
html,
links,
metadata: PageMetadata::default(),
crawled_at: chrono::Utc::now(),
})
} else {
anyhow::bail!("Failed to crawl URL: {}", url)
}
}
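    /// Crawls up to `max_pages_per_domain` pages starting from `start_url`.
    ///
    /// As with [`WebCrawler::crawl_url`], only the page limit and
    /// `respect_robots_txt` are forwarded to spider.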
pub async fn crawl_site(&mut self, start_url: &str) -> Result<Vec<CrawledPage>> {
use spider::website::Website;
let mut website = Website::new(start_url);
website.with_limit(self.config.max_pages_per_domain as u32);
if !self.config.respect_robots_txt {
website.with_respect_robots_txt(false);
}
website.crawl().await;
        let pages = website.get_pages().context("spider returned no pages")?;
let mut crawled_pages = Vec::new();
for page in pages.iter() {
let url = page.get_url();
let html = page.get_html();
self.mark_visited(&url);
let title = self.extract_title(&html);
let description = self.extract_meta(&html, "description");
let content = self.extract_content(&html);
let links = self.extract_links(&html, &url);
crawled_pages.push(CrawledPage {
url: url.to_string(),
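                // NOTE: spider does not surface the response status here, so
                // a successful fetch (200) is assumed.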
status: 200,
title,
description,
content,
html,
links,
metadata: PageMetadata::default(),
crawled_at: chrono::Utc::now(),
});
}
Ok(crawled_pages)
}
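    /// Extracts readable text, preferring the `readability` extractor when
    /// the `content-extraction` feature is enabled and falling back to naive
    /// tag stripping otherwise.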
fn extract_content(&self, html: &str) -> String {
#[cfg(feature = "content-extraction")]
{
use readability::extractor;
use url::Url;
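            // readability wants a base URL for resolving relative links; a
            // placeholder is fine since only the extracted text is kept.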
if let Ok(product) = extractor::extract(
&mut html.as_bytes(),
&Url::parse("http://example.com").unwrap(),
) {
return product.text;
}
}
self.strip_html(html)
}
    fn strip_html(&self, html: &str) -> String {
        // Drop tags, then collapse whitespace runs into single spaces.
        let re = regex::Regex::new(r"<[^>]+>").unwrap();
        let text = re.replace_all(html, " ");
        text.split_whitespace().collect::<Vec<_>>().join(" ")
    }
    fn extract_title(&self, html: &str) -> Option<String> {
        let re = regex::Regex::new(r"<title[^>]*>([^<]+)</title>").ok()?;
        re.captures(html).map(|c| c[1].trim().to_string())
    }
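    /// Extracts a `<meta name="..." content="...">` value. Only the
    /// `name`-before-`content` attribute order is matched; tags written with
    /// `content` first are missed.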
fn extract_meta(&self, html: &str, name: &str) -> Option<String> {
let pattern = format!(
r#"<meta[^>]+name=["']{}["'][^>]+content=["']([^"']+)["']"#,
name
);
let re = regex::Regex::new(&pattern).ok()?;
re.captures(html).map(|c| c[1].to_string())
}
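    /// Finds `<a href>` links with a regex, resolves them against `base_url`,
    /// and flags same-host links as internal. A regex cannot parse arbitrary
    /// HTML, so this is best-effort.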
fn extract_links(&self, html: &str, base_url: &str) -> Vec<ExtractedLink> {
let mut links = Vec::new();
let base = url::Url::parse(base_url).ok();
let re = regex::Regex::new(r#"<a[^>]+href=["']([^"']+)["'][^>]*>([^<]*)</a>"#).unwrap();
for cap in re.captures_iter(html) {
let href = &cap[1];
let text = cap.get(2).map(|m| m.as_str().to_string());
let resolved_url = if let Some(base) = &base {
base.join(href)
.map(|u| u.to_string())
.unwrap_or_else(|_| href.to_string())
} else {
href.to_string()
};
            let is_internal = match (&base, url::Url::parse(&resolved_url)) {
                (Some(base), Ok(resolved)) => resolved.host_str() == base.host_str(),
                _ => false,
            };
links.push(ExtractedLink {
url: resolved_url,
text,
rel: None,
is_internal,
});
}
links
}
}
impl Default for WebCrawler {
fn default() -> Self {
Self::new(CrawlerConfig::default())
}
}
#[cfg(feature = "compliant-crawling")]
pub mod sitemap_parser {
use super::*;
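    /// Parses sitemap XML into a flat list of [`SitemapEntry`] values.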
pub fn parse_sitemap(xml: &str) -> Result<Vec<SitemapEntry>> {
use sitemap::reader::{SiteMapEntity, SiteMapReader};
let mut entries = Vec::new();
let reader = SiteMapReader::new(xml.as_bytes());
for entry in reader {
if let SiteMapEntity::Url(url_entry) = entry {
let change_freq = match &url_entry.changefreq {
sitemap::structs::ChangeFreq::None => None,
freq => Some(format!("{:?}", freq)),
};
entries.push(SitemapEntry {
url: url_entry
.loc
.get_url()
.map(|u| u.to_string())
.unwrap_or_default(),
last_modified: url_entry.lastmod.get_time().map(|d| d.to_string()),
change_frequency: change_freq,
priority: url_entry.priority.get_priority().map(|p| p as f64),
});
}
}
Ok(entries)
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SitemapEntry {
pub url: String,
pub last_modified: Option<String>,
pub change_frequency: Option<String>,
pub priority: Option<f64>,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_config_default() {
let config = CrawlerConfig::default();
assert!(config.respect_robots_txt);
assert!(config.smart_mode);
}
#[test]
fn test_config_polite() {
let config = CrawlerConfig::polite();
assert_eq!(config.max_concurrent, 2);
assert_eq!(config.delay_ms, 1000);
}
#[test]
fn test_crawler_visited() {
let mut crawler = WebCrawler::default();
assert!(!crawler.is_visited("https://example.com"));
crawler.mark_visited("https://example.com");
assert!(crawler.is_visited("https://example.com"));
assert_eq!(crawler.visited_count(), 1);
}
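    // Sketch of a regression test for the regex-based helpers; the HTML
    // snippets are illustrative, not taken from real crawl output.
    #[test]
    fn test_extract_helpers() {
        let crawler = WebCrawler::default();
        let html =
            "<html><head><title>Hello</title></head><body><a href=\"/about\">About</a></body></html>";
        assert_eq!(crawler.extract_title(html).as_deref(), Some("Hello"));
        assert_eq!(crawler.strip_html("<p>Hi   there</p>"), "Hi there");
        let links = crawler.extract_links(html, "https://example.com");
        assert_eq!(links.len(), 1);
        assert_eq!(links[0].url, "https://example.com/about");
        assert!(links[0].is_internal);
    }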
}