mod error;
mod serde;
mod toml;
pub use self::{
error::ConfigError,
serde::{SerializableConfig, compile_config},
toml::read_config,
};
use alloc::sync::Arc;
use core::{cmp::Reverse, ops::Deref, time::Duration};
use http::{HeaderMap, StatusCode};
use regex::Regex;
use rlimit::{Resource, getrlimit};
use std::collections::{HashMap, HashSet};
use url::Url;
/// URL schemes accepted by `SchemeConfig::default()`.
pub const DEFAULT_ACCEPTED_SCHEMES: &[&str] = &["http", "https"];
/// HTTP status codes accepted by `StatusConfig::default()`.
pub const DEFAULT_ACCEPTED_STATUS_CODES: &[StatusCode] = &[StatusCode::OK];
/// Default maximum number of redirects.
///
/// NOTE(review): not referenced in this part of the file; presumably applied where
/// configurations are compiled. `SiteConfig::default()` does not use it.
pub const DEFAULT_MAX_REDIRECTS: usize = 16;
/// Default timeout of 30 seconds.
///
/// NOTE(review): not referenced in this part of the file; presumably applied where
/// configurations are compiled. `SiteConfig::default()` does not use it.
pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30);
/// Concurrency returned by `default_concurrency` when the `NOFILE` limit cannot be read.
const DEFAULT_MINIMUM_CONCURRENCY: usize = 256;
pub fn default_concurrency() -> usize {
getrlimit(Resource::NOFILE)
.map(|(count, _)| (count / 2) as _)
.unwrap_or(DEFAULT_MINIMUM_CONCURRENCY)
}
/// A top-level configuration.
#[derive(Clone, Debug)]
pub struct Config {
/// Root entries, exposed as string slices by `Config::roots`.
/// NOTE(review): presumably the URLs where checking starts — confirm against callers.
roots: Vec<String>,
/// Regular expressions for links to ignore, exposed by `Config::ignored_links`.
ignored_links: Vec<Regex>,
/// Site configuration returned by `Config::site` when no entry in `sites` matches.
default: Arc<SiteConfig>,
/// Site configurations keyed by host name. Each value is a list of
/// `(path prefix, configuration)` pairs that `Config::new` sorts in descending
/// path order.
sites: HashMap<String, Vec<(String, Arc<SiteConfig>)>>,
/// Concurrency settings.
concurrency: ConcurrencyConfig,
/// Persistent cache flag; `Config::new` initializes it to `false`.
persistent_cache: bool,
/// Rate limit settings.
rate_limit: RateLimitConfig,
}
impl Config {
    /// Creates a configuration.
    ///
    /// `sites` maps a host name to its site configurations keyed by path prefix.
    /// Ignored links, concurrency and rate limits start out as their `Default`
    /// values and the persistent cache is disabled; use the `set_*` builders to
    /// change them.
    pub fn new(
        roots: Vec<String>,
        default: Arc<SiteConfig>,
        sites: HashMap<String, HashMap<String, Arc<SiteConfig>>>,
    ) -> Self {
        let sites = sites
            .into_iter()
            .map(|(host, configs)| {
                let mut paths = configs.into_iter().collect::<Vec<_>>();
                // Descending order places a longer path before every one of its own
                // prefixes, so the first match in `get_site` is the most specific one.
                // Keys are compared by reference (no per-comparison `String` clone) and
                // are unique because they come from a map, so an unstable sort suffices.
                paths.sort_unstable_by(|(one, _), (other, _)| Reverse(one).cmp(&Reverse(other)));
                (host, paths)
            })
            .collect();

        Self {
            roots,
            ignored_links: Default::default(),
            default,
            sites,
            concurrency: Default::default(),
            persistent_cache: false,
            rate_limit: Default::default(),
        }
    }

    /// Returns the root entries as string slices.
    pub fn roots(&self) -> impl Iterator<Item = &str> {
        self.roots.iter().map(Deref::deref)
    }

    /// Returns the regular expressions of ignored links.
    pub fn ignored_links(&self) -> impl Iterator<Item = &Regex> {
        self.ignored_links.iter()
    }

    /// Returns the site configurations keyed by host name.
    ///
    /// Each value lists `(path prefix, configuration)` pairs in descending path order.
    pub const fn sites(&self) -> &HashMap<String, Vec<(String, Arc<SiteConfig>)>> {
        &self.sites
    }

    /// Returns the site configuration for a URL.
    ///
    /// Falls back to the default site configuration when the URL has no host or no
    /// configured path prefix of that host matches.
    pub fn site(&self, url: &Url) -> &SiteConfig {
        self.get_site(url).unwrap_or(&self.default)
    }

    /// Returns the concurrency configuration.
    pub const fn concurrency(&self) -> &ConcurrencyConfig {
        &self.concurrency
    }

    /// Returns whether the persistent cache flag is set.
    pub const fn persistent_cache(&self) -> bool {
        self.persistent_cache
    }

    /// Returns the rate limit configuration.
    pub const fn rate_limit(&self) -> &RateLimitConfig {
        &self.rate_limit
    }

    /// Sets the concurrency configuration and returns the updated configuration.
    pub fn set_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self {
        self.concurrency = concurrency;
        self
    }

    /// Sets the regular expressions of ignored links and returns the updated configuration.
    pub fn set_ignored_links(mut self, links: Vec<Regex>) -> Self {
        self.ignored_links = links;
        self
    }

    /// Sets the persistent cache flag and returns the updated configuration.
    pub const fn set_persistent_cache(mut self, persistent_cache: bool) -> Self {
        self.persistent_cache = persistent_cache;
        self
    }

    /// Sets the rate limit configuration and returns the updated configuration.
    pub fn set_rate_limit(mut self, rate_limit: RateLimitConfig) -> Self {
        self.rate_limit = rate_limit;
        self
    }

    /// Looks up the site configuration matching a URL's host and path.
    ///
    /// Returns `None` when the URL has no host, the host is not configured, or no
    /// configured path is a prefix of the URL path. Matching is a plain string prefix
    /// test, so a configured `/foo` also matches `/foobar`.
    fn get_site(&self, url: &Url) -> Option<&SiteConfig> {
        self.sites()
            .get(url.host_str()?)?
            .iter()
            .find_map(|(path, config)| url.path().starts_with(path).then_some(config.as_ref()))
    }
}
/// A site configuration.
///
/// `Default` is derived, so a default value has `max_redirects == 0`, no timeout and
/// `recursive == false`; the `DEFAULT_MAX_REDIRECTS` and `DEFAULT_TIMEOUT` constants
/// are not applied here.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct SiteConfig {
/// Optional identifier of the configuration.
id: Option<Arc<str>>,
/// Cache settings.
cache: CacheConfig,
/// Fragment flag returned by `fragments_ignored`.
/// NOTE(review): presumably disables checking of URL fragments — confirm against callers.
fragments_ignored: bool,
/// HTTP headers.
/// NOTE(review): presumably attached to requests for this site — confirm against callers.
headers: HeaderMap,
/// Maximum number of redirects.
max_redirects: usize,
/// Recursion flag returned by `recursive`.
recursive: bool,
/// Retry settings, shared behind an `Arc`.
retry: Arc<RetryConfig>,
/// Accepted URL schemes.
scheme: SchemeConfig,
/// Accepted HTTP status codes.
status: StatusConfig,
/// Optional timeout.
timeout: Option<Duration>,
/// Document validation settings.
validation: ValidationConfig,
}
impl SiteConfig {
/// Creates a site configuration equal to `SiteConfig::default()`.
pub fn new() -> Self {
Self::default()
}
/// Returns the identifier, if any.
pub const fn id(&self) -> Option<&Arc<str>> {
self.id.as_ref()
}
/// Returns the cache configuration.
pub const fn cache(&self) -> &CacheConfig {
&self.cache
}
/// Returns the fragment flag.
pub const fn fragments_ignored(&self) -> bool {
self.fragments_ignored
}
/// Returns the HTTP headers.
pub const fn headers(&self) -> &HeaderMap {
&self.headers
}
/// Returns the shared retry configuration.
pub const fn retry(&self) -> &Arc<RetryConfig> {
&self.retry
}
/// Returns the status code configuration.
pub const fn status(&self) -> &StatusConfig {
&self.status
}
/// Returns the scheme configuration.
pub const fn scheme(&self) -> &SchemeConfig {
&self.scheme
}
/// Returns the maximum number of redirects.
pub const fn max_redirects(&self) -> usize {
self.max_redirects
}
/// Returns the timeout, if any.
pub const fn timeout(&self) -> Option<Duration> {
self.timeout
}
/// Returns the recursion flag.
pub const fn recursive(&self) -> bool {
self.recursive
}
/// Returns the validation configuration.
pub const fn validation(&self) -> &ValidationConfig {
&self.validation
}
/// Sets the identifier and returns the updated configuration.
pub fn set_id(mut self, id: Option<Arc<str>>) -> Self {
self.id = id;
self
}
/// Sets the cache configuration and returns the updated configuration.
pub const fn set_cache(mut self, cache: CacheConfig) -> Self {
self.cache = cache;
self
}
/// Sets the fragment flag and returns the updated configuration.
pub const fn set_fragments_ignored(mut self, ignored: bool) -> Self {
self.fragments_ignored = ignored;
self
}
/// Sets the HTTP headers and returns the updated configuration.
pub fn set_headers(mut self, headers: HeaderMap) -> Self {
self.headers = headers;
self
}
/// Sets the retry configuration and returns the updated configuration.
pub fn set_retry(mut self, retry: Arc<RetryConfig>) -> Self {
self.retry = retry;
self
}
/// Sets the status code configuration and returns the updated configuration.
pub fn set_status(mut self, status: StatusConfig) -> Self {
self.status = status;
self
}
/// Sets the scheme configuration and returns the updated configuration.
pub fn set_scheme(mut self, scheme: SchemeConfig) -> Self {
self.scheme = scheme;
self
}
/// Sets the maximum number of redirects and returns the updated configuration.
pub const fn set_max_redirects(mut self, count: usize) -> Self {
self.max_redirects = count;
self
}
/// Sets the timeout and returns the updated configuration.
pub const fn set_timeout(mut self, duration: Option<Duration>) -> Self {
self.timeout = duration;
self
}
/// Sets the recursion flag and returns the updated configuration.
pub const fn set_recursive(mut self, recursive: bool) -> Self {
self.recursive = recursive;
self
}
/// Sets the validation configuration and returns the updated configuration.
pub fn set_validation(mut self, validation: ValidationConfig) -> Self {
self.validation = validation;
self
}
}
/// A configuration of accepted HTTP status codes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct StatusConfig {
    /// Status codes treated as accepted.
    accepted: HashSet<StatusCode>,
}

impl StatusConfig {
    /// Creates a configuration from the set of accepted status codes.
    pub const fn new(accepted: HashSet<StatusCode>) -> Self {
        Self { accepted }
    }

    /// Returns `true` if the given status code is in the accepted set.
    pub fn accepted(&self, status: StatusCode) -> bool {
        self.accepted.contains(&status)
    }
}

impl Default for StatusConfig {
    /// Accepts exactly the codes listed in `DEFAULT_ACCEPTED_STATUS_CODES`.
    fn default() -> Self {
        Self::new(DEFAULT_ACCEPTED_STATUS_CODES.iter().copied().collect())
    }
}
/// A configuration of accepted URL schemes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SchemeConfig {
    /// Schemes treated as accepted.
    accepted: HashSet<String>,
}

impl SchemeConfig {
    /// Creates a configuration from the set of accepted schemes.
    pub const fn new(accepted: HashSet<String>) -> Self {
        Self { accepted }
    }

    /// Returns `true` if the given scheme is in the accepted set.
    ///
    /// The comparison is an exact string match.
    pub fn accepted(&self, scheme: &str) -> bool {
        self.accepted.contains(scheme)
    }
}

impl Default for SchemeConfig {
    /// Accepts exactly the schemes listed in `DEFAULT_ACCEPTED_SCHEMES`.
    fn default() -> Self {
        Self::new(
            DEFAULT_ACCEPTED_SCHEMES
                .iter()
                .map(|&scheme| scheme.to_owned())
                .collect(),
        )
    }
}
/// A document validation configuration.
///
/// The derived `Default` has no HTML or SVG markup configuration and `css == false`.
#[derive(Clone, Debug, Eq, PartialEq, Default)]
pub struct ValidationConfig {
/// Markup configuration for HTML, if any.
html: Option<MarkupConfig>,
/// Markup configuration for SVG, if any.
svg: Option<MarkupConfig>,
/// CSS flag returned by `css`.
css: bool,
}
impl ValidationConfig {
/// Returns the HTML markup configuration, if any.
pub const fn html(&self) -> Option<&MarkupConfig> {
self.html.as_ref()
}
/// Returns the SVG markup configuration, if any.
pub const fn svg(&self) -> Option<&MarkupConfig> {
self.svg.as_ref()
}
/// Returns the CSS flag.
pub const fn css(&self) -> bool {
self.css
}
/// Sets the HTML markup configuration and returns the updated configuration.
pub fn set_html(mut self, config: Option<MarkupConfig>) -> Self {
self.html = config;
self
}
/// Sets the SVG markup configuration and returns the updated configuration.
pub fn set_svg(mut self, config: Option<MarkupConfig>) -> Self {
self.svg = config;
self
}
/// Sets the CSS flag and returns the updated configuration.
pub const fn set_css(mut self, enabled: bool) -> Self {
self.css = enabled;
self
}
}
#[derive(Clone, Debug, Default)]
pub struct MarkupConfig {
ignored_attributes: Vec<Regex>,
ignored_elements: Vec<Regex>,
}
impl MarkupConfig {
pub const fn new(ignored_attributes: Vec<Regex>, ignored_elements: Vec<Regex>) -> Self {
Self {
ignored_attributes,
ignored_elements,
}
}
pub fn ignored_attributes(&self) -> &[Regex] {
&self.ignored_attributes
}
pub fn ignored_elements(&self) -> &[Regex] {
&self.ignored_elements
}
}
impl PartialEq for MarkupConfig {
fn eq(&self, other: &Self) -> bool {
self.ignored_attributes.len() == other.ignored_attributes.len()
&& self.ignored_elements.len() == other.ignored_elements.len()
&& self
.ignored_attributes
.iter()
.zip(&other.ignored_attributes)
.chain(self.ignored_elements.iter().zip(&other.ignored_elements))
.all(|(one, other)| one.as_str() == other.as_str())
}
}
impl Eq for MarkupConfig {}
/// A cache configuration.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CacheConfig {
    /// Maximum age of a cache entry; zero by default.
    max_age: Duration,
}

impl CacheConfig {
    /// Creates a cache configuration with a zero maximum age.
    pub fn new() -> Self {
        Self {
            max_age: Duration::ZERO,
        }
    }

    /// Returns the maximum age.
    pub const fn max_age(&self) -> Duration {
        self.max_age
    }

    /// Sets the maximum age and returns the updated configuration.
    pub const fn set_max_age(mut self, age: Duration) -> Self {
        self.max_age = age;
        self
    }
}
/// A retry configuration.
///
/// `Default` is implemented by hand so that `RetryConfig::default()` and
/// `RetryConfig::new()` agree; a derived `Default` would zero the factor.
#[derive(Clone, Debug, PartialEq)]
pub struct RetryConfig {
    /// Retry count; `0` in a new configuration.
    count: usize,
    /// Retry factor; `1.0` in a new configuration.
    /// NOTE(review): presumably the multiplier applied to the interval between
    /// successive retries — confirm against the retry implementation.
    factor: f64,
    /// Retry interval settings.
    interval: RetryDurationConfig,
    /// HTTP status codes associated with retries.
    /// NOTE(review): presumably the response statuses that trigger a retry — confirm.
    statuses: HashSet<StatusCode>,
}

impl RetryConfig {
    /// Creates a retry configuration with a zero count, a factor of `1.0`, the
    /// default interval and no status codes.
    pub fn new() -> Self {
        Self {
            count: 0,
            factor: 1.0,
            interval: Default::default(),
            statuses: Default::default(),
        }
    }

    /// Returns the retry count.
    pub const fn count(&self) -> usize {
        self.count
    }

    /// Returns the retry factor.
    pub const fn factor(&self) -> f64 {
        self.factor
    }

    /// Returns the retry interval configuration.
    pub const fn interval(&self) -> &RetryDurationConfig {
        &self.interval
    }

    /// Returns the status codes associated with retries.
    pub const fn statuses(&self) -> &HashSet<StatusCode> {
        &self.statuses
    }

    /// Sets the retry count and returns the updated configuration.
    pub const fn set_count(mut self, count: usize) -> Self {
        self.count = count;
        self
    }

    /// Sets the retry factor and returns the updated configuration.
    pub const fn set_factor(mut self, factor: f64) -> Self {
        self.factor = factor;
        self
    }

    /// Sets the retry interval configuration and returns the updated configuration.
    pub const fn set_interval(mut self, duration: RetryDurationConfig) -> Self {
        self.interval = duration;
        self
    }

    /// Sets the status codes associated with retries and returns the updated configuration.
    pub fn set_statuses(mut self, statuses: HashSet<StatusCode>) -> Self {
        self.statuses = statuses;
        self
    }
}

impl Default for RetryConfig {
    /// Returns the same value as `RetryConfig::new()`.
    fn default() -> Self {
        Self::new()
    }
}
/// A retry interval configuration.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct RetryDurationConfig {
    /// Initial duration; zero by default.
    initial: Duration,
    /// Optional cap on the duration; `None` by default.
    cap: Option<Duration>,
}

impl RetryDurationConfig {
    /// Creates a configuration with a zero initial duration and no cap.
    pub fn new() -> Self {
        Self {
            initial: Duration::ZERO,
            cap: None,
        }
    }

    /// Returns the initial duration.
    pub const fn initial(&self) -> Duration {
        self.initial
    }

    /// Returns the cap, if any.
    pub const fn cap(&self) -> Option<Duration> {
        self.cap
    }

    /// Sets the initial duration and returns the updated configuration.
    pub const fn set_initial(self, duration: Duration) -> Self {
        Self {
            initial: duration,
            ..self
        }
    }

    /// Sets the cap and returns the updated configuration.
    pub const fn set_cap(self, duration: Option<Duration>) -> Self {
        Self {
            cap: duration,
            ..self
        }
    }
}
/// A concurrency configuration.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ConcurrencyConfig {
    /// Optional global concurrency value; `None` by default.
    global: Option<usize>,
    /// Concurrency values keyed by string; empty by default.
    /// NOTE(review): keys are presumably site identifiers or host names — confirm against callers.
    sites: HashMap<String, usize>,
}

impl ConcurrencyConfig {
    /// Creates a configuration with no global value and no per-site values.
    pub fn new() -> Self {
        Self {
            global: None,
            sites: HashMap::new(),
        }
    }

    /// Returns the global concurrency value, if any.
    pub const fn global(&self) -> Option<usize> {
        self.global
    }

    /// Returns the per-site concurrency values.
    pub const fn sites(&self) -> &HashMap<String, usize> {
        &self.sites
    }

    /// Sets the global concurrency value and returns the updated configuration.
    pub const fn set_global(mut self, concurrency: Option<usize>) -> Self {
        self.global = concurrency;
        self
    }

    /// Replaces the per-site concurrency values and returns the updated configuration.
    pub fn set_sites(self, sites: HashMap<String, usize>) -> Self {
        Self { sites, ..self }
    }
}
/// A rate limit configuration.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct RateLimitConfig {
    /// Optional global rate limit; `None` by default.
    global: Option<SiteRateLimitConfig>,
    /// Rate limits keyed by string; empty by default.
    /// NOTE(review): keys are presumably site identifiers or host names — confirm against callers.
    sites: HashMap<String, SiteRateLimitConfig>,
}

impl RateLimitConfig {
    /// Creates a configuration with no global limit and no per-site limits.
    pub fn new() -> Self {
        Self {
            global: None,
            sites: HashMap::new(),
        }
    }

    /// Returns the global rate limit, if any.
    pub const fn global(&self) -> Option<&SiteRateLimitConfig> {
        self.global.as_ref()
    }

    /// Returns the per-site rate limits.
    pub const fn sites(&self) -> &HashMap<String, SiteRateLimitConfig> {
        &self.sites
    }

    /// Sets the global rate limit and returns the updated configuration.
    pub const fn set_global(mut self, rate_limit: Option<SiteRateLimitConfig>) -> Self {
        self.global = rate_limit;
        self
    }

    /// Replaces the per-site rate limits and returns the updated configuration.
    pub fn set_sites(self, sites: HashMap<String, SiteRateLimitConfig>) -> Self {
        Self { sites, ..self }
    }
}
/// A rate limit described by a supply and a time window.
///
/// NOTE(review): presumably `supply` requests are allowed per `window` — confirm
/// against the rate limiter that consumes this.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct SiteRateLimitConfig {
    /// Supply value; zero by default.
    supply: u64,
    /// Time window; zero by default.
    window: Duration,
}

impl SiteRateLimitConfig {
    /// Creates a rate limit from a supply and a window.
    pub const fn new(supply: u64, window: Duration) -> Self {
        Self { supply, window }
    }

    /// Returns the supply.
    pub const fn supply(&self) -> u64 {
        self.supply
    }

    /// Returns the window.
    pub const fn window(&self) -> Duration {
        self.window
    }

    /// Sets the supply and returns the updated rate limit.
    pub const fn set_supply(self, supply: u64) -> Self {
        Self { supply, ..self }
    }

    /// Sets the window and returns the updated rate limit.
    pub const fn set_window(self, window: Duration) -> Self {
        Self { window, ..self }
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Every path-specific entry ("/foo", "/bar", "/baz", "/qux") is recursive while the
// catch-all "/" entry is not. Each specific path must therefore resolve to its own
// entry rather than to "/", whatever order the input map iterates in.
#[test]
fn site_config_path_order() {
let config = Config::new(
vec![],
Default::default(),
[(
"example.com".to_string(),
[
(
"/foo".to_string(),
SiteConfig::default()
.set_id(Some("foo".into()))
.set_recursive(true)
.into(),
),
(
"/bar".to_string(),
SiteConfig::default()
.set_id(Some("bar".into()))
.set_recursive(true)
.into(),
),
(
"/".to_string(),
SiteConfig::default()
.set_id(Some("top".into()))
.set_recursive(false)
.into(),
),
(
"/baz".to_string(),
SiteConfig::default()
.set_id(Some("baz".into()))
.set_recursive(true)
.into(),
),
(
"/qux".to_string(),
SiteConfig::default()
.set_id(Some("qux".into()))
.set_recursive(true)
.into(),
),
]
.into_iter()
.collect(),
)]
.into(),
);
assert!(
config
.site(&Url::parse("http://example.com/foo").unwrap())
.recursive()
);
assert!(
config
.site(&Url::parse("http://example.com/bar").unwrap())
.recursive()
);
assert!(
config
.site(&Url::parse("http://example.com/baz").unwrap())
.recursive()
);
assert!(
config
.site(&Url::parse("http://example.com/qux").unwrap())
.recursive()
);
// A path without a specific entry falls through to the non-recursive "/" entry.
assert!(
!config
.site(&Url::parse("http://example.com/other").unwrap())
.recursive()
);
}
// A default validation configuration has every validation disabled.
#[test]
fn default_validation_config() {
let config = ValidationConfig::default();
assert!(config.html().is_none());
assert!(config.svg().is_none());
assert!(!config.css());
}
// Each builder enables its own validation.
#[test]
fn set_validation_config_enabled() {
let config = ValidationConfig::default()
.set_html(Some(MarkupConfig::default()))
.set_svg(Some(MarkupConfig::default()))
.set_css(true);
assert!(config.html().is_some());
assert!(config.svg().is_some());
assert!(config.css());
}
// A site configuration starts without HTML validation and picks it up through
// `set_validation`.
#[test]
fn validate_site_config() {
let config = SiteConfig::default();
assert!(config.validation().html().is_none());
assert!(
config
.set_validation(ValidationConfig::default().set_html(Some(MarkupConfig::default())))
.validation()
.html()
.is_some()
);
}
// `set_statuses` stores exactly the given status codes.
#[test]
fn retry_config_statuses() {
let config = RetryConfig::new().set_statuses(HashSet::from([StatusCode::REQUEST_TIMEOUT]));
assert_eq!(
config.statuses(),
&HashSet::from([StatusCode::REQUEST_TIMEOUT])
);
}
}