// kumo 0.2.5
//
// An async web crawling framework for Rust — Scrapy for Rust
// Documentation
use std::{collections::HashMap, time::Duration};

/// Crawl limits that apply to a single domain.
///
/// Values are configured with chained, consuming setters starting from
/// [`DomainPolicy::new`] (or [`DomainPolicy::default`]). This type only stores
/// the settings; how they are enforced is up to the code that reads them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DomainPolicy {
    /// Concurrency limit for the domain. The setter never stores a value below `1`.
    per_domain_concurrency: usize,
    /// Delay configured for the domain, or `None` when no delay has been set.
    per_domain_delay: Option<Duration>,
}

impl DomainPolicy {
    /// Creates a policy with the default settings (concurrency `8`, no delay).
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the concurrency limit and returns the updated policy.
    ///
    /// A `value` of `0` is raised to `1`, so the stored limit is always at least one.
    #[must_use]
    pub fn per_domain_concurrency(mut self, value: usize) -> Self {
        // Clamp to a minimum of 1 so a zero limit can never be stored.
        self.per_domain_concurrency = value.max(1);
        self
    }

    /// Sets the delay and returns the updated policy.
    ///
    /// The duration is stored as given; `Duration::ZERO` is kept as `Some(Duration::ZERO)`,
    /// which is distinct from the unset state (`None`).
    #[must_use]
    pub fn per_domain_delay(mut self, delay: Duration) -> Self {
        self.per_domain_delay = Some(delay);
        self
    }

    /// Returns the configured concurrency limit (always `>= 1` when set through the builder).
    pub fn concurrency(&self) -> usize {
        self.per_domain_concurrency
    }

    /// Returns the configured delay, or `None` if none was set.
    pub fn delay(&self) -> Option<Duration> {
        self.per_domain_delay
    }
}

impl Default for DomainPolicy {
    /// Returns a policy with a concurrency limit of `8` and no delay.
    fn default() -> Self {
        Self {
            per_domain_concurrency: 8,
            per_domain_delay: None,
        }
    }
}

/// Politeness settings for a crawl: a default [`DomainPolicy`], optional
/// per-domain overrides, an optional jitter duration, and a flag controlling
/// whether a robots crawl-delay should be respected.
///
/// This type only stores the configuration; how the jitter and the robots flag
/// are applied is decided by the code that reads them.
#[derive(Debug, Clone)]
pub struct PolitenessPolicy {
    /// Policy returned for any domain without an explicit override.
    default: DomainPolicy,
    /// Jitter duration, or `None` when no jitter has been configured.
    jitter: Option<Duration>,
    /// Whether a robots crawl-delay should be respected. Defaults to `true`.
    respect_robots_crawl_delay: bool,
    /// Per-domain overrides, keyed by the ASCII-lowercased domain name.
    domains: HashMap<String, DomainPolicy>,
}

impl PolitenessPolicy {
    /// Creates a policy with the default settings (see the `Default` impl).
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the concurrency limit of the default domain policy.
    ///
    /// Delegates to [`DomainPolicy::per_domain_concurrency`]; overrides registered
    /// through [`PolitenessPolicy::domain`] are not affected.
    #[must_use]
    pub fn per_domain_concurrency(mut self, value: usize) -> Self {
        self.default = self.default.per_domain_concurrency(value);
        self
    }

    /// Sets the delay of the default domain policy.
    ///
    /// Delegates to [`DomainPolicy::per_domain_delay`]; overrides registered
    /// through [`PolitenessPolicy::domain`] are not affected.
    #[must_use]
    pub fn per_domain_delay(mut self, delay: Duration) -> Self {
        self.default = self.default.per_domain_delay(delay);
        self
    }

    /// Sets the jitter duration.
    #[must_use]
    pub fn jitter(mut self, jitter: Duration) -> Self {
        self.jitter = Some(jitter);
        self
    }

    /// Sets whether a robots crawl-delay should be respected.
    #[must_use]
    pub fn respect_robots_crawl_delay(mut self, value: bool) -> Self {
        self.respect_robots_crawl_delay = value;
        self
    }

    /// Registers `policy` as the override for `domain`.
    ///
    /// The domain is ASCII-lowercased before it is stored, so lookups through
    /// [`PolitenessPolicy::policy_for`] match regardless of ASCII letter case.
    /// Registering the same domain again replaces the earlier override.
    #[must_use]
    pub fn domain(mut self, domain: impl Into<String>, policy: DomainPolicy) -> Self {
        self.domains
            .insert(domain.into().to_ascii_lowercase(), policy);
        self
    }

    /// Returns the policy for `domain`: its override if one is registered,
    /// otherwise the default policy.
    ///
    /// Matching ignores ASCII letter case.
    pub fn policy_for(&self, domain: &str) -> &DomainPolicy {
        // Lowercasing allocates a new `String`, so only do it when the input
        // actually contains an ASCII uppercase letter. Otherwise `domain` is
        // already equal to its lowercased form and can be used as the key directly.
        let found = if domain.bytes().any(|byte| byte.is_ascii_uppercase()) {
            self.domains.get(&domain.to_ascii_lowercase())
        } else {
            self.domains.get(domain)
        };
        found.unwrap_or(&self.default)
    }

    /// Returns the concurrency limit of the default domain policy.
    pub fn default_per_domain_concurrency(&self) -> usize {
        self.default.concurrency()
    }

    /// Returns the delay of the default domain policy, if one is set.
    pub fn default_per_domain_delay(&self) -> Option<Duration> {
        self.default.delay()
    }

    /// Returns the configured jitter duration, or `None` if none was set.
    pub fn jitter_range(&self) -> Option<Duration> {
        self.jitter
    }

    /// Returns whether a robots crawl-delay should be respected.
    pub fn respects_robots_crawl_delay(&self) -> bool {
        self.respect_robots_crawl_delay
    }
}

impl Default for PolitenessPolicy {
    /// Returns a policy with the default [`DomainPolicy`], no jitter, the robots
    /// crawl-delay flag enabled, and no per-domain overrides.
    fn default() -> Self {
        Self {
            default: DomainPolicy::default(),
            jitter: None,
            respect_robots_crawl_delay: true,
            domains: HashMap::new(),
        }
    }
}