mod support;
use kumo::{
CrawlRequest,
engine::CrawlEngine,
error::KumoError,
extract::Response,
fetch::MockFetcher,
spider::{Output, Spider},
};
use support::VecStore;
/// Test spider that crawls a single page.
struct SinglePageSpider {
    /// URL returned as the only entry of `start_urls`.
    start: String,
}
#[async_trait::async_trait]
impl Spider for SinglePageSpider {
    type Item = serde_json::Value;

    /// Identifier reported for this spider.
    fn name(&self) -> &str {
        "single-page"
    }

    /// The crawl is seeded with the configured `start` URL only.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.start.clone()])
    }

    /// Emits one JSON item `{ "title": ... }` holding the text of the first
    /// `h1` match, or that text type's default value when there is no match.
    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        let heading = match res.css("h1").first() {
            Some(node) => node.text(),
            None => Default::default(),
        };
        let item = serde_json::json!({ "title": heading });
        Ok(Output::new().item(item))
    }
}
/// Test spider that starts on one page and follows `a.next` links.
struct PaginatedSpider {
    /// URL of the first page; used as the sole start URL.
    page1: String,
}
/// Test spider that enqueues two follow-up requests with different priorities.
struct PrioritySpider {
    /// Start URL; the page from which both follow-up requests are issued.
    start: String,
    /// URL requested with priority `-1`.
    low: String,
    /// URL requested with priority `10`.
    high: String,
}
/// Test spider that requests the same target URL twice with `dont_filter(true)`.
struct DontFilterSpider {
    /// Start URL; the page from which the duplicate requests are issued.
    start: String,
    /// URL that is requested twice from the start page.
    target: String,
}
#[async_trait::async_trait]
impl Spider for PaginatedSpider {
    type Item = serde_json::Value;

    /// Identifier reported for this spider.
    fn name(&self) -> &str {
        "paginated"
    }

    /// The crawl is seeded with the first page only.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.page1.clone()])
    }

    /// Emits `{ "url": <page url> }` for every page and, when the page has an
    /// `a.next` element with an `href`, follows that link resolved through
    /// `urljoin`.
    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        let next_url = res
            .css("a.next")
            .first()
            .and_then(|link| link.attr("href"))
            .map(|href| res.urljoin(&href));
        let scraped = Output::new().item(serde_json::json!({ "url": res.url() }));
        Ok(match next_url {
            Some(url) => scraped.follow(url),
            None => scraped,
        })
    }
}
#[async_trait::async_trait]
impl Spider for PrioritySpider {
    type Item = serde_json::Value;

    /// Identifier reported for this spider.
    fn name(&self) -> &str {
        "priority"
    }

    /// The crawl is seeded with the configured `start` URL only.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.start.clone()])
    }

    /// On the start page, enqueues `low` (priority `-1`) and then `high`
    /// (priority `10`) without emitting an item; on any other page, emits
    /// `{ "url": <page url> }`.
    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        let output: Output<Self::Item> = if res.url() == self.start {
            let low_request = CrawlRequest::get(&self.low).priority(-1);
            let high_request = CrawlRequest::get(&self.high).priority(10);
            // The low-priority request is added first on purpose, so the
            // insertion order alone cannot explain `high` being crawled first.
            Output::new().request(low_request).request(high_request)
        } else {
            Output::new().item(serde_json::json!({ "url": res.url() }))
        };
        Ok(output)
    }
}
#[async_trait::async_trait]
impl Spider for DontFilterSpider {
    type Item = serde_json::Value;

    /// Identifier reported for this spider.
    fn name(&self) -> &str {
        "dont-filter"
    }

    /// The crawl is seeded with the configured `start` URL only.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.start.clone()])
    }

    /// On the start page, enqueues two identical requests for `target`, each
    /// marked `dont_filter(true)`; on any other page, emits
    /// `{ "url": <page url> }`.
    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        if res.url() != self.start {
            return Ok(Output::new().item(serde_json::json!({ "url": res.url() })));
        }
        // Builds a fresh request per call so the same URL can be enqueued twice.
        let revisit = || CrawlRequest::get(&self.target).dont_filter(true);
        Ok(Output::new().request(revisit()).request(revisit()))
    }
}
/// Runs `SinglePageSpider` against a local mockito server and checks that one
/// page is crawled and its `h1` text is stored as the item's `title`.
#[tokio::test]
async fn engine_scrapes_single_page() {
    let mut server = mockito::Server::new_async().await;
    // Keep the mock bound for the whole test so it stays registered.
    let _mock = server
        .mock("GET", "/")
        .with_status(200)
        .with_header("content-type", "text/html")
        .with_body("<html><body><h1>Hello Kumo</h1></body></html>")
        .create_async()
        .await;

    let store = VecStore::default();
    let spider = SinglePageSpider {
        start: server.url(),
    };
    let engine = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .store(store.clone());
    let stats = engine.run(spider).await.unwrap();

    assert_eq!(stats.pages_crawled, 1);
    assert_eq!(stats.items_scraped, 1);
    assert_eq!(stats.errors, 0);
    let items = store.collected();
    assert_eq!(items[0]["title"], "Hello Kumo");
}
/// Serves two pages from a local mockito server, the first linking to the
/// second through `a.next`, and checks that both are crawled and scraped.
#[tokio::test]
async fn engine_follows_pagination() {
    let mut server = mockito::Server::new_async().await;
    let base = server.url();

    // Page 1 carries an absolute link to page 2.
    let first_page_html = format!(
        r#"<html><body><a class="next" href="{base}/page/2">Next</a></body></html>"#
    );
    let _m1 = server
        .mock("GET", "/page/1")
        .with_status(200)
        .with_header("content-type", "text/html")
        .with_body(first_page_html)
        .create_async()
        .await;
    // Page 2 has no `a.next`, which ends the crawl.
    let _m2 = server
        .mock("GET", "/page/2")
        .with_status(200)
        .with_header("content-type", "text/html")
        .with_body("<html><body><p>Last page</p></body></html>")
        .create_async()
        .await;

    let spider = PaginatedSpider {
        page1: format!("{base}/page/1"),
    };
    let engine = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .store(VecStore::default());
    let stats = engine.run(spider).await.unwrap();

    assert_eq!(stats.pages_crawled, 2);
    assert_eq!(stats.items_scraped, 2);
    assert_eq!(stats.errors, 0);
}
/// Checks that, with a concurrency of 1, the request with the higher priority
/// value is crawled before the lower one even though `PrioritySpider`
/// enqueues the low-priority request first.
#[tokio::test]
async fn engine_schedules_higher_priority_request_first() {
    let start = "https://example.com/start";
    let low = "https://example.com/low";
    let high = "https://example.com/high";
    let store = VecStore::default();
    let mock = MockFetcher::new()
        .with_response(start, 200, "<a>start</a>")
        .with_response(low, 200, "<h1>low</h1>")
        .with_response(high, 200, "<h1>high</h1>");
    let stats = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .fetcher(mock)
        .store(store.clone())
        .run(PrioritySpider {
            start: start.to_string(),
            low: low.to_string(),
            high: high.to_string(),
        })
        .await
        .unwrap();
    assert_eq!(stats.pages_crawled, 3);
    let items = store.collected();
    // Check the length before indexing so a missing item fails with a clear
    // assertion message rather than an index-out-of-bounds panic.
    assert_eq!(
        items.len(),
        2,
        "expected exactly one item each for the high- and low-priority pages"
    );
    assert_eq!(
        items[0]["url"], high,
        "the high-priority page should be scraped first"
    );
    assert_eq!(
        items[1]["url"], low,
        "the low-priority page should be scraped second"
    );
}
/// Checks that two identical requests marked `dont_filter(true)` are both
/// crawled: the start page plus two visits to the target makes three pages,
/// and two items are stored.
#[tokio::test]
async fn engine_respects_dont_filter_requests() {
    let start = "https://example.com/start";
    let target = "https://example.com/revisit";

    let fetcher = MockFetcher::new()
        .with_response(start, 200, "<a>start</a>")
        .with_response(target, 200, "<h1>target</h1>");
    let store = VecStore::default();
    let spider = DontFilterSpider {
        start: start.to_string(),
        target: target.to_string(),
    };

    let engine = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .fetcher(fetcher)
        .store(store.clone());
    let stats = engine.run(spider).await.unwrap();

    assert_eq!(stats.pages_crawled, 3);
    let stored = store.collected().len();
    assert_eq!(stored, 2);
}
/// Runs a locally defined spider against `MockFetcher` with a canned response
/// for the start URL, and checks that the `h1` text is scraped.
#[tokio::test]
async fn mock_fetcher_runs_spider_without_network() {
    /// URL used both as the spider's start URL and as the canned response key.
    const START_URL: &str = "https://example.com";

    /// Spider that extracts the first `h1` text from `START_URL`.
    struct TitleSpider;

    #[async_trait::async_trait]
    impl Spider for TitleSpider {
        type Item = serde_json::Value;

        fn name(&self) -> &str {
            "title"
        }

        fn start_urls(&self) -> Vec<String> {
            vec![START_URL.into()]
        }

        async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
            let heading = res
                .css("h1")
                .first()
                .map(|node| node.text())
                .unwrap_or_default();
            let item = serde_json::json!({ "title": heading });
            Ok(Output::new().item(item))
        }
    }

    let store = VecStore::default();
    let fetcher = MockFetcher::new().with_response(START_URL, 200, "<h1>Test Title</h1>");
    let engine = CrawlEngine::builder()
        .store(store.clone())
        .fetcher(fetcher)
        .respect_robots_txt(false);
    let stats = engine.run(TitleSpider).await.unwrap();

    assert_eq!(stats.pages_crawled, 1);
    assert_eq!(stats.items_scraped, 1);
    let scraped = store.collected();
    assert_eq!(scraped[0]["title"], "Test Title");
}
/// Writes an HTML snippet to a temporary file and checks that
/// `Response::from_file` reports the given URL, status 200, and the file's
/// contents as text.
#[tokio::test]
async fn response_from_file_loads_html() {
    let url = "https://example.com";
    let html = "<h1>From File</h1>";

    let fixture = tempfile::NamedTempFile::new().unwrap();
    let fixture_path = fixture.path();
    std::fs::write(fixture_path, html).unwrap();

    let response = Response::from_file(url, fixture_path).unwrap();

    assert_eq!(response.url(), url);
    assert_eq!(response.status(), 200);
    assert_eq!(response.text(), Some(html));
}