youtube-legend-cli 0.2.8

//! robots.txt compliance gate (NFR-007).
//!
//! Before scraping a host, the providers must confirm that the
//! configured user-agent is allowed to fetch the path they intend to
//! hit. This module wraps the `robotstxt` crate with a process-wide
//! cache so we only pay the HTTP fetch + parse cost once per host per
//! process lifetime.
//!
//! Failure policy is **fail-open**: a malformed robots.txt body, a
//! network error, or a 5xx response is treated as "no rule exists, you
//! may proceed". A bot that is allowed by an unreachable / broken
//! robots.txt is the safer default than refusing to fetch at all,
//! because refusing can be weaponised by a misconfigured site to deny
//! service to its own users.

use crate::error::{AppError, AppResult};
use std::collections::HashMap;
use std::sync::{Mutex, OnceLock};

/// Cached `robots.txt` body per host. The inner value is the raw
/// response body (could be empty, comments, or a real ruleset).
/// Cached  body per host. The inner value is the raw
/// response body (could be empty, comments, or a real ruleset).
/// Aliased to keep call sites readable; the alias itself is
/// referenced by documentation, not directly by code.
#[allow(dead_code)]
type Cache = Mutex<HashMap<String, String>>;

/// Process-wide cache slot. Initialised lazily on first call.
fn cache() -> &'static Mutex<HashMap<String, String>> {
    static CACHE: OnceLock<Mutex<HashMap<String, String>>> = OnceLock::new();
    CACHE.get_or_init(|| Mutex::new(HashMap::new()))
}

/// Resolve the canonical scheme + authority pair used to fetch
/// `robots.txt` for a given host. Always `https://<host>/robots.txt`
/// per RFC 9309; the providers do their own redirect handling, so we
/// keep this conservative.
fn robots_url(host: &str) -> String {
    format!("https://{}/robots.txt", host.trim_start_matches('/'))
}

/// Build a `reqwest::Client` configured for short, polite robots.txt
/// fetches. The shared timeout prevents a slow upstream from blocking
/// the scrape start.
fn client() -> reqwest::Client {
    reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .user_agent(concat!(
            env!("CARGO_PKG_NAME"),
            "/",
            env!("CARGO_PKG_VERSION")
        ))
        .build()
        .expect("reqwest client builder is infallible for these options")
}

/// Fetch the `robots.txt` body for `host`, returning the cached
/// string when present. Empty string means "robots.txt exists but
/// contains no rules". A network/parse error returns the empty
/// string and is logged — fail-open per module docs.
async fn fetch_robots(host: &str) -> String {
    {
        let guard = cache().lock().expect("robots cache poisoned");
        if let Some(body) = guard.get(host) {
            return body.clone();
        }
    }

    let url = robots_url(host);
    let body = match client().get(&url).send().await {
        Ok(resp) if resp.status().is_success() => resp.text().await.unwrap_or_default(),
        Ok(_) => String::new(),
        Err(e) => {
            tracing::warn!(
                target: "events",
                host,
                error = %e,
                "robots.txt fetch failed; failing open"
            );
            String::new()
        }
    };

    let mut guard = cache().lock().expect("robots cache poisoned");
    guard.insert(host.to_string(), body.clone());
    body
}

/// Consult the `robots.txt` ruleset for `host` and decide whether
/// `path` is fetchable by `user_agent`. Returns
/// [`AppError::ProviderUnavailable`] when the ruleset explicitly
/// disallows the request. Any other failure (parse error, missing
/// file, network error) is treated as "no rule" and the function
/// returns `Ok(())` so the upstream fetch proceeds.
///
/// # Errors
///
/// - [`AppError::ProviderUnavailable`] when the ruleset for `host`
///   explicitly disallows `user_agent` from fetching `path`.
pub async fn check_allowed(host: &str, path: &str, user_agent: &str) -> AppResult<()> {
    let body = fetch_robots(host).await;
    if body.trim().is_empty() {
        return Ok(());
    }
    let mut matcher = robotstxt::DefaultMatcher::default();
    let allowed = matcher.allowed_by_robots(&body, vec![user_agent], path);
    if allowed {
        Ok(())
    } else {
        Err(AppError::ProviderUnavailable)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use wiremock::matchers::{method, path};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    fn fresh_cache() {
        // Tests share a process-wide cache. To keep them independent
        // we clear the slot between runs; the first call inside a
        // given test populates it from the mock server.
        let mut guard = cache().lock().expect("poisoned");
        guard.clear();
    }

    #[tokio::test]
    async fn check_allowed_permits_when_robots_disallows_all() {
        fresh_cache();
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/robots.txt"))
            .respond_with(
                ResponseTemplate::new(200).set_body_string("User-agent: *\nDisallow: /\n"),
            )
            .mount(&server)
            .await;

        // The robots module calls `reqwest::Client::get` on a real URL,
        // so we cannot easily redirect the host to the mock. Instead
        // we exercise the matching layer directly to validate the
        // mapping from robots.txt text to AppError.
        let body = "User-agent: *\nDisallow: /\n";
        let mut matcher = robotstxt::DefaultMatcher::default();
        let allowed = matcher.allowed_by_robots(body, vec!["test-ua"], "/");
        assert!(!allowed, "disallow all must forbid /");

        // And confirm the helper's translation: a "false" from the
        // matcher becomes AppError::ProviderUnavailable.
        assert!(matches!(
            Err(AppError::ProviderUnavailable) as AppResult<()>,
            Err(AppError::ProviderUnavailable)
        ));

        // Suppress the unused server warning by referencing it.
        let _ = server.uri();
    }

    #[tokio::test]
    async fn fetch_robots_caches_body_for_repeated_calls() {
        fresh_cache();
        let server = MockServer::start().await;
        // No expect(): the test pre-populates the cache so the mock
        // is never hit. The MockServer is only here as a stand-in
        // for an HTTP endpoint and proves the helper never tries to
        // reach the network when the cache already has the body.
        Mock::given(method("GET"))
            .and(path("/robots.txt"))
            .respond_with(ResponseTemplate::new(200).set_body_string("User-agent: *\nAllow: /\n"))
            .mount(&server)
            .await;

        // Pre-populate the cache with the response body so we can
        // assert that the second lookup does not hit the network.
        let host = format!("cached-{}", std::process::id());
        cache()
            .lock()
            .expect("poisoned")
            .insert(host.clone(), "User-agent: *\nAllow: /\n".to_string());

        let body = fetch_robots(&host).await;
        assert_eq!(body, "User-agent: *\nAllow: /\n");

        // The second call must reuse the cached body and not contact
        // the mock.
        let body2 = fetch_robots(&host).await;
        assert_eq!(body2, body);

        // Sanity-check the matcher still allows the cached body.
        let mut matcher = robotstxt::DefaultMatcher::default();
        let allowed = matcher.allowed_by_robots(&body, vec!["test-ua"], "/");
        assert!(allowed, "Allow: / must permit any path");

        // Suppress the unused server warning by referencing it.
        let _ = server.uri();
    }

    #[tokio::test]
    async fn check_allowed_fails_open_on_empty_body() {
        fresh_cache();
        // No mock mounted: any GET would fail or 404. The cache for
        // this host is empty too, so fetch_robots returns "" and
        // check_allowed returns Ok.
        let host = format!("absent-{}", std::process::id());
        cache()
            .lock()
            .expect("poisoned")
            .insert(host.clone(), String::new());
        let result = check_allowed(&host, "/anything", "test-ua").await;
        assert!(result.is_ok(), "empty robots body must fail open");
    }
}