// kumo 0.2.5
//
// An async web crawling framework for Rust — Scrapy for Rust
// Documentation
use kumo::{
    CrawlRequest,
    frontier::{Frontier, MemoryFrontier},
};

/// Pushing a URL into a freshly created frontier must report `true`.
#[tokio::test]
async fn push_new_url_returns_true() {
    let queue = MemoryFrontier::new(1000);
    let accepted = queue.push("https://example.com".into(), 0).await;
    assert!(accepted);
}

/// Pushing the same URL twice: the first push is accepted, the second is
/// rejected.
#[tokio::test]
async fn push_duplicate_url_returns_false() {
    let frontier = MemoryFrontier::new(1000);
    // Check the precondition as well: if the first push were rejected, a
    // `false` from the second push would not demonstrate deduplication.
    assert!(frontier.push("https://example.com".into(), 0).await);
    assert!(!frontier.push("https://example.com".into(), 0).await);
}

/// Popping from a frontier that never received a push yields `None`.
#[tokio::test]
async fn pop_empty_returns_none() {
    let queue = MemoryFrontier::new(1000);
    let popped = queue.pop().await;
    assert!(popped.is_none());
}

/// A single pushed URL comes back from `pop` with the URL in field 0, the
/// pushed depth in field 1, and zero in field 2.
#[tokio::test]
async fn push_then_pop_returns_url_and_depth() {
    let queue = MemoryFrontier::new(1000);
    queue.push("https://example.com".into(), 3).await;

    let popped = queue.pop().await.unwrap();

    // Field 0: the URL exactly as pushed.
    assert_eq!(popped.0, "https://example.com");
    // Field 1: the depth supplied to `push`.
    assert_eq!(popped.1, 3);
    // Field 2: zero for a plain `push` (presumably the retry count — see the
    // `push_force` test; confirm against the frontier API).
    assert_eq!(popped.2, 0);
}

/// URLs pushed with plain `push` are popped in the order they were inserted.
#[tokio::test]
async fn pop_is_fifo() {
    let queue = MemoryFrontier::new(1000);
    let urls = ["https://a.com", "https://b.com", "https://c.com"];

    for url in urls {
        queue.push(url.into(), 0).await;
    }

    // Pops must mirror the insertion order exactly.
    for expected in urls {
        assert_eq!(queue.pop().await.unwrap().0, expected);
    }
}

/// Requests are popped from highest priority to lowest, regardless of the
/// order in which they were pushed.
#[tokio::test]
async fn higher_priority_pops_first() {
    let queue = MemoryFrontier::new(1000);

    // Enqueued deliberately out of priority order.
    for (url, priority) in [
        ("https://low.com", -1),
        ("https://high.com", 10),
        ("https://mid.com", 2),
    ] {
        queue
            .push_request(CrawlRequest::get(url).priority(priority), 0)
            .await;
    }

    for expected in ["https://high.com", "https://mid.com", "https://low.com"] {
        assert_eq!(queue.pop().await.unwrap().0, expected);
    }
}

/// Requests that share the same priority are popped in insertion order.
#[tokio::test]
async fn equal_priority_preserves_fifo_order() {
    let queue = MemoryFrontier::new(1000);
    let urls = ["https://a.com", "https://b.com", "https://c.com"];

    // Every request gets the identical priority, so only insertion order can
    // decide the pop order.
    for url in urls {
        queue
            .push_request(CrawlRequest::get(url).priority(5), 0)
            .await;
    }

    for expected in urls {
        assert_eq!(queue.pop().await.unwrap().0, expected);
    }
}

/// A request marked `dont_filter(true)` is accepted even though the same URL
/// was already pushed, leaving two entries in the frontier.
#[tokio::test]
async fn dont_filter_allows_duplicate_url() {
    let queue = MemoryFrontier::new(1000);

    let first = CrawlRequest::get("https://example.com");
    let first_accepted = queue.push_request(first, 0).await;
    assert!(first_accepted);

    // Same URL again, this time flagged with `dont_filter`.
    let repeat = CrawlRequest::get("https://example.com").dont_filter(true);
    let repeat_accepted = queue.push_request(repeat, 0).await;
    assert!(repeat_accepted);

    assert_eq!(queue.len().await, 2);
}

/// `len` starts at zero, grows with each push, and shrinks after a pop.
#[tokio::test]
async fn len_reflects_queue_size() {
    let queue = MemoryFrontier::new(1000);
    assert_eq!(queue.len().await, 0);

    for url in ["https://a.com", "https://b.com"] {
        queue.push(url.into(), 0).await;
    }
    assert_eq!(queue.len().await, 2);

    // The popped item itself is irrelevant here; only the size change matters.
    let _ = queue.pop().await;
    assert_eq!(queue.len().await, 1);
}

/// `is_empty` is `true` for a fresh frontier and `false` once a URL is pushed.
#[tokio::test]
async fn is_empty_true_when_empty() {
    let queue = MemoryFrontier::new(1000);

    let empty_at_start = queue.is_empty().await;
    assert!(empty_at_start);

    queue.push("https://a.com".into(), 0).await;

    let empty_after_push = queue.is_empty().await;
    assert!(!empty_after_push);
}

/// Two distinct URLs are both accepted and both counted by `len`.
#[tokio::test]
async fn different_urls_are_not_deduplicated() {
    let queue = MemoryFrontier::new(1000);

    for url in ["https://a.com", "https://b.com"] {
        assert!(queue.push(url.into(), 0).await);
    }

    assert_eq!(queue.len().await, 2);
}

/// `push_force` enqueues a URL that was already pushed, and the entry it
/// creates carries the retry count passed to it.
#[tokio::test]
async fn push_force_bypasses_dedup_and_carries_retry_count() {
    let frontier = MemoryFrontier::new(1000);
    frontier.push("https://example.com".into(), 0).await;
    frontier
        .push_force("https://example.com".into(), 0, 1)
        .await;

    // Dedup bypass: the forced push must have added a second entry rather
    // than being dropped as a duplicate.
    assert_eq!(frontier.len().await, 2);

    // The entry from the plain `push` comes out first and must still carry a
    // zero retry count, so the count of 1 below can only belong to the forced
    // entry.
    let original = frontier
        .pop()
        .await
        .expect("entry from the plain push should be queued");
    assert_eq!(original.0, "https://example.com");
    assert_eq!(original.2, 0);

    let retried = frontier
        .pop()
        .await
        .expect("entry from push_force should be queued");
    assert_eq!(retried.0, "https://example.com");
    assert_eq!(retried.2, 1);
}