//! spider-lib 3.0.4 showcase example.
//!
//! A Rust-based web scraping framework inspired by Scrapy (Python).
//! See the project documentation for details.
#[path = "showcase/support.rs"]
mod showcase;

use showcase::{ShowcaseSpider, prepare_output_dir};
use spider_lib::prelude::*;
use spider_middleware::proxy::{ProxyRotationStrategy, ProxySource};
use spider_middleware::user_agent::{
    BuiltinUserAgentList, UserAgentRotationStrategy, UserAgentSource,
};
use std::time::Duration;

/// Configures a proxy middleware that walks a fixed list of local
/// proxies in sequential order.
///
/// # Errors
/// Propagates any builder validation failure as a `SpiderError`.
fn build_proxy_middleware() -> Result<ProxyMiddleware, SpiderError> {
    // Two local test proxies; Sequential strategy cycles through them in order.
    let proxies = vec![
        "http://127.0.0.1:8080".to_string(),
        "http://127.0.0.1:8081".to_string(),
    ];
    let builder = ProxyMiddleware::builder()
        .source(ProxySource::List(proxies))
        .strategy(ProxyRotationStrategy::Sequential);
    builder.build()
}

/// Builds an HTTP cache middleware backed by an on-disk cache directory.
///
/// # Errors
/// Propagates any builder validation failure as a `SpiderError`.
fn build_cache_middleware() -> Result<HttpCacheMiddleware, SpiderError> {
    // Cached responses live under the example's output directory.
    let builder = HttpCacheMiddleware::builder().cache_dir("output/http-cache");
    builder.build()
}

/// Builds a user-agent middleware that picks a random built-in UA per
/// request, falling back to a fixed showcase UA string when needed.
///
/// # Errors
/// Propagates any builder validation failure as a `SpiderError`.
fn build_user_agent_middleware() -> Result<UserAgentMiddleware, SpiderError> {
    let fallback = String::from("spider-lib-showcase/1.0");
    let builder = UserAgentMiddleware::builder()
        .source(UserAgentSource::Builtin(BuiltinUserAgentList::Random))
        .strategy(UserAgentRotationStrategy::Random)
        .fallback_user_agent(fallback);
    builder.build()
}

#[tokio::main]
async fn main() -> Result<(), SpiderError> {
    // Make sure the output directory exists before any middleware writes to it.
    prepare_output_dir()?;

    // Inline middlewares, named up-front for readability. Registration order
    // below matches construction intent: referer, rate limit, retry,
    // throttle, cache, proxy, user-agent, robots.txt, cookies.
    let referer = RefererMiddleware::new().same_origin_only(false);
    let rate_limit = RateLimitMiddleware::builder()
        .use_token_bucket_limiter(2)
        .build();
    let retry = RetryMiddleware::new()
        .max_retries(1)
        .backoff_factor(0.25)
        .max_delay(Duration::from_secs(2));
    let throttle = AutoThrottleMiddleware::builder()
        .min_delay(Duration::from_millis(50))
        .max_delay(Duration::from_secs(2))
        .target_concurrency(1.0)
        .build();
    let robots = RobotsTxtMiddleware::new().request_timeout(Duration::from_secs(2));

    // Crawl a single page with the full middleware stack attached.
    let crawler = CrawlerBuilder::new(ShowcaseSpider)
        .limit(1)
        .log_level(log::LevelFilter::Info)
        .add_middleware(referer)
        .add_middleware(rate_limit)
        .add_middleware(retry)
        .add_middleware(throttle)
        .add_middleware(build_cache_middleware()?)
        .add_middleware(build_proxy_middleware()?)
        .add_middleware(build_user_agent_middleware()?)
        .add_middleware(robots)
        .add_middleware(CookieMiddleware::new())
        .build()
        .await?;

    // Grab a handle to shared state before the crawl consumes the crawler's run.
    let state = crawler.state_arc();
    crawler.start_crawl().await?;

    println!("showcase middleware summary: {}", state.summary());

    Ok(())
}