kumo 0.2.7

An async web crawling framework for Rust — Scrapy for Rust
Documentation
use kumo::{
    CrawlRequest,
    engine::CrawlEngine,
    error::KumoError,
    extract::Response,
    fetch::MockFetcher,
    spider::{Output, Spider},
    stats::{CrawlReport, CrawlStats},
    store::StdoutStore,
};

/// Checks that each scheduler counter recorded on `CrawlStats` is visible on the
/// `CrawlReport` built from it, both as a top-level total and inside the
/// per-domain entry.
#[test]
fn crawl_report_exposes_scheduler_counters() {
    let domain = "example.com";

    // Record exactly one event of every kind against the same domain.
    let mut recorded = CrawlStats::default();
    recorded.record_scheduled(domain);
    recorded.record_deduped(domain);
    recorded.record_retry(domain);
    recorded.record_robots_blocked(domain);

    let report = CrawlReport::from(recorded);

    // Top-level totals.
    assert_eq!(report.scheduled, 1);
    assert_eq!(report.deduped, 1);
    assert_eq!(report.retries, 1);
    assert_eq!(report.robots_blocked, 1);

    // Per-domain breakdown; looked up only after the totals have been checked.
    let per_domain = &report.domains[domain];
    assert_eq!(per_domain.scheduled, 1);
    assert_eq!(per_domain.deduped, 1);
    assert_eq!(per_domain.retries, 1);
    assert_eq!(per_domain.robots_blocked, 1);
}

/// Test spider that, when handed the response for `start`, emits two requests
/// for the same `target` URL (see its `Spider::parse` implementation).
struct DuplicateSpider {
    /// URL returned from `start_urls` and compared against the response URL in `parse`.
    start: String,
    /// URL that is requested twice once the start page has been parsed.
    target: String,
}

/// Test spider whose `Spider::parse` implementation panics unconditionally.
struct PanicSpider {
    /// URL returned from `start_urls`.
    start: String,
    /// Value returned from `Spider::name`, so each test can pick its own spider name.
    name: &'static str,
}

// Spider implementation used by the de-duplication test: parsing the start
// page yields the same target request twice; any other page yields nothing.
#[async_trait::async_trait]
impl Spider for DuplicateSpider {
    type Item = serde_json::Value;

    fn name(&self) -> &str {
        "duplicate-stats"
    }

    // The only start URL is the configured `start` URL.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.start.clone()])
    }

    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        // Only the start page produces follow-up requests.
        if res.url() != self.start {
            return Ok(Output::new());
        }

        // Deliberately attach the very same target request twice.
        let output = Output::new()
            .request(CrawlRequest::get(&self.target))
            .request(CrawlRequest::get(&self.target));
        Ok(output)
    }
}

// Spider implementation used by the panic-accounting tests: it has a single
// start URL and its `parse` never returns normally.
#[async_trait::async_trait]
impl Spider for PanicSpider {
    type Item = serde_json::Value;

    // The name is supplied per instance rather than hard-coded.
    fn name(&self) -> &str {
        self.name
    }

    // The only start URL is the configured `start` URL.
    fn start_urls(&self) -> Vec<String> {
        Vec::from([self.start.clone()])
    }

    // Always panics; the response is ignored.
    async fn parse(&self, _res: &Response) -> Result<Output<Self::Item>, KumoError> {
        panic!("intentional panic for stats coverage");
    }
}

/// Runs `DuplicateSpider` against a mock fetcher serving a start page and a
/// target page, then checks the scheduled / deduped / completed counters at the
/// crawl level and for the `example.com` domain entry.
#[tokio::test]
async fn engine_stats_count_scheduled_completed_and_deduped_requests() {
    let start_url = "https://example.com/start";
    let target_url = "https://example.com/target";

    let fetcher = MockFetcher::new()
        .with_response(start_url, 200, "<a>start</a>")
        .with_response(target_url, 200, "<h1>target</h1>");
    let spider = DuplicateSpider {
        start: String::from(start_url),
        target: String::from(target_url),
    };

    let outcome = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .fetcher(fetcher)
        .store(StdoutStore)
        .run(spider)
        .await;
    let stats = outcome.unwrap();

    // The spider emits the target request twice; the test expects two pages
    // crawled, two requests scheduled and one recorded as deduped.
    assert_eq!(stats.pages_crawled, 2);
    assert_eq!(stats.scheduled, 2);
    assert_eq!(stats.deduped, 1);

    // Both URLs share one host, so the per-domain entry should carry the same counts.
    let domain = &stats.domains["example.com"];
    assert_eq!(domain.scheduled, 2);
    assert_eq!(domain.deduped, 1);
    assert_eq!(domain.completed, 2);
}

/// Runs a single `PanicSpider` and checks that the run still returns stats, with
/// the panic counted as one error and one failure on the spider's domain, and
/// no page counted as crawled.
#[tokio::test]
async fn engine_stats_count_task_panic_as_domain_failure() {
    let start_url = "https://panic.example.com/start";

    let fetcher = MockFetcher::new().with_response(start_url, 200, "<h1>panic</h1>");
    let spider = PanicSpider {
        start: String::from(start_url),
        name: "panic-single",
    };

    // `unwrap` here means the run itself is expected to succeed despite the panic.
    let outcome = CrawlEngine::builder()
        .concurrency(1)
        .respect_robots_txt(false)
        .fetcher(fetcher)
        .store(StdoutStore)
        .run(spider)
        .await;
    let stats = outcome.unwrap();

    assert_eq!(stats.errors, 1);
    assert_eq!(stats.pages_crawled, 0);
    assert_eq!(stats.domains["panic.example.com"].failed, 1);
}

/// Runs a panicking spider and a well-behaved spider through `run_all` and
/// checks that the panic is charged only to the panicking spider's stats.
///
/// The assertions index the result by the order in which the spiders were added.
#[tokio::test]
async fn run_all_stats_count_task_panic_for_the_right_spider() {
    // Minimal spider that visits one URL and produces an empty output.
    struct OkSpider(String);

    #[async_trait::async_trait]
    impl Spider for OkSpider {
        type Item = serde_json::Value;

        fn name(&self) -> &str {
            "ok"
        }

        fn start_urls(&self) -> Vec<String> {
            Vec::from([self.0.clone()])
        }

        async fn parse(&self, _res: &Response) -> Result<Output<Self::Item>, KumoError> {
            Ok(Output::new())
        }
    }

    let failing_url = "https://panic.example.com/start";
    let healthy_url = "https://ok.example.com/start";

    let fetcher = MockFetcher::new()
        .with_response(failing_url, 200, "<h1>panic</h1>")
        .with_response(healthy_url, 200, "<h1>ok</h1>");

    let failing_spider = PanicSpider {
        start: String::from(failing_url),
        name: "panic-multi",
    };
    let healthy_spider = OkSpider(String::from(healthy_url));

    let outcome = CrawlEngine::builder()
        .concurrency(2)
        .respect_robots_txt(false)
        .fetcher(fetcher)
        .store(StdoutStore)
        .add_spider(failing_spider)
        .add_spider(healthy_spider)
        .run_all()
        .await;
    let stats = outcome.unwrap();

    // First spider added: the panicking one.
    let failing = &stats[0];
    assert_eq!(failing.errors, 1);
    assert_eq!(failing.pages_crawled, 0);
    assert_eq!(failing.domains["panic.example.com"].failed, 1);

    // Second spider added: must be untouched by the other spider's panic.
    let healthy = &stats[1];
    assert_eq!(healthy.errors, 0);
    assert_eq!(healthy.pages_crawled, 1);
    assert_eq!(healthy.domains["ok.example.com"].completed, 1);
}