// kumo 0.2.7
//
// An async web crawling framework for Rust — Scrapy for Rust
// Documentation
use std::{sync::Arc, time::Duration};

use crate::{
    engine::USER_AGENT,
    error::KumoError,
    extract::Response,
    fetch::{Fetcher, http::HttpFetcher},
    robots::RobotsCache,
};

#[cfg(feature = "browser")]
use crate::fetch::{BrowserConfig, BrowserFetcher};

/// Builds the `reqwest::Client` used for plain HTTP fetching.
///
/// The client always gets a cookie store, the crate-wide [`USER_AGENT`], an idle-connection
/// pool capped at `concurrency` per host, and a 60 second TCP keep-alive.
///
/// * `timeout` — when `Some`, applied as the client's request timeout.
/// * `customize` — when `Some`, invoked with the fully configured builder; it runs last, so
///   whatever builder it returns is the one that gets built.
///
/// # Errors
///
/// Returns [`KumoError::Fetch`] wrapping the `reqwest` error if the client cannot be built.
pub(super) fn build_http_client(
    concurrency: usize,
    timeout: Option<Duration>,
    customize: Option<Box<dyn FnOnce(reqwest::ClientBuilder) -> reqwest::ClientBuilder + Send>>,
) -> Result<reqwest::Client, KumoError> {
    let defaults = reqwest::Client::builder()
        .cookie_store(true)
        .user_agent(USER_AGENT)
        .pool_max_idle_per_host(concurrency)
        .tcp_keepalive(Duration::from_secs(60));

    let with_timeout = match timeout {
        Some(limit) => defaults.timeout(limit),
        None => defaults,
    };

    // The caller's hook sees the builder after every default has been applied.
    let configured = match customize {
        Some(hook) => hook(with_timeout),
        None => with_timeout,
    };

    configured.build().map_err(KumoError::Fetch)
}

/// Creates the shared robots cache, or `None` when `respect` is `false`.
///
/// When enabled, the cache is constructed with the crate-wide [`USER_AGENT`] and the given
/// `ttl`, and wrapped in an [`Arc`].
pub(super) fn build_robots_cache(respect: bool, ttl: Duration) -> Option<Arc<RobotsCache>> {
    // The closure only runs when `respect` is true, so no cache is built otherwise.
    respect.then(|| Arc::new(RobotsCache::with_ttl(USER_AGENT, ttl)))
}

/// Optionally layers a `CachingFetcher` on top of `fetcher`.
///
/// * `cache_dir` — when `None`, `fetcher` is returned untouched; when `Some`, a
///   `CachingFetcher` is created over `fetcher` with that directory.
/// * `cache_ttl` — when `Some`, applied to the caching fetcher via its `ttl` setter. Ignored
///   if `cache_dir` is `None`.
///
/// # Errors
///
/// Propagates any error returned by `CachingFetcher::new`.
pub(super) fn wrap_with_cache(
    fetcher: Arc<dyn Fetcher>,
    cache_dir: Option<std::path::PathBuf>,
    cache_ttl: Option<Duration>,
) -> Result<Arc<dyn Fetcher>, KumoError> {
    match cache_dir {
        // Caching disabled: hand back the fetcher exactly as received.
        None => Ok(fetcher),
        Some(dir) => {
            let caching = crate::fetch::CachingFetcher::new(ArcFetcher(fetcher), dir)?;
            let caching = match cache_ttl {
                Some(ttl) => caching.ttl(ttl),
                None => caching,
            };
            Ok(Arc::new(caching))
        }
    }
}

/// Inputs consumed by `build_raw_fetcher` when choosing which fetcher to construct.
// `dead_code` is allowed because, in the code visible here, `concurrency` is only read
// under the `browser` feature, so it would warn in builds without that feature.
#[allow(dead_code)]
pub(super) struct FetcherArgs {
    /// When `Some`, `build_raw_fetcher` returns this fetcher as-is and ignores the rest.
    pub(super) fetcher_override: Option<Arc<dyn Fetcher>>,
    /// HTTP client handed to the default `HttpFetcher` when no other option applies.
    pub(super) client: reqwest::Client,
    /// Passed to `BrowserFetcher::launch` when a browser fetcher is built.
    pub(super) concurrency: usize,
    /// When `Some`, a `StealthHttpFetcher` is built from this profile (`stealth` feature only).
    #[cfg(feature = "stealth")]
    pub(super) stealth_profile: Option<crate::fetch::StealthProfile>,
    /// When `Some`, a `BrowserFetcher` is launched with this config (`browser` feature only).
    #[cfg(feature = "browser")]
    pub(super) browser: Option<BrowserConfig>,
}

/// Picks and constructs the underlying fetcher described by `args`.
///
/// The first matching option wins, in this order:
/// 1. `fetcher_override`, returned unchanged;
/// 2. a `StealthHttpFetcher` built from `stealth_profile` (`stealth` feature only);
/// 3. a `BrowserFetcher` launched from `browser` and `concurrency` (`browser` feature only);
/// 4. an `HttpFetcher` over `client` with the crate-wide [`USER_AGENT`].
///
/// # Errors
///
/// Propagates errors from `StealthHttpFetcher::new` and `BrowserFetcher::launch`.
pub(super) async fn build_raw_fetcher(args: FetcherArgs) -> Result<Arc<dyn Fetcher>, KumoError> {
    // An explicit override short-circuits every other strategy.
    if let Some(custom) = args.fetcher_override {
        return Ok(custom);
    }

    #[cfg(feature = "stealth")]
    if let Some(profile) = args.stealth_profile {
        let stealth = crate::fetch::StealthHttpFetcher::new(profile)?;
        return Ok(Arc::new(stealth));
    }

    #[cfg(feature = "browser")]
    if let Some(browser_cfg) = args.browser {
        let browser = BrowserFetcher::launch(browser_cfg, args.concurrency).await?;
        return Ok(Arc::new(browser));
    }

    // Fallback: plain HTTP fetching over the shared client.
    let plain = HttpFetcher::new(args.client, USER_AGENT);
    Ok(Arc::new(plain))
}

/// Newtype around a shared `Arc<dyn Fetcher>` that itself implements `Fetcher` by delegation.
///
/// `wrap_with_cache` uses it to pass a type-erased fetcher to `CachingFetcher::new`
/// (presumably because that constructor needs a concrete `Fetcher` value — confirm against
/// its signature).
struct ArcFetcher(Arc<dyn Fetcher>);

#[async_trait::async_trait]
impl Fetcher for ArcFetcher {
    /// Forwards `request` unchanged to the wrapped fetcher and returns its result as-is.
    async fn fetch(
        &self,
        request: &crate::middleware::FetchRequest,
    ) -> Result<Response, KumoError> {
        // Pure delegation: no caching, retrying, or error mapping happens at this layer.
        let delegated = self.0.fetch(request);
        delegated.await
    }
}