// weave-content 0.2.13
//
// Content DSL parser, validator, and builder for OSINT case files.
// Documentation
use std::collections::HashMap;
use std::path::Path;
use std::time::{SystemTime, UNIX_EPOCH};

use serde::{Deserialize, Serialize};

use crate::verifier::CheckStatus;

/// Cache TTL: 7 days in seconds.
///
/// Despite the name, the TTL applies to every cached result whose status is
/// not `"error"`; error results are never served from the cache at all.
const OK_TTL_SECS: u64 = 7 * 24 * 60 * 60;

/// Maximum cache entries to prevent unbounded growth.
///
/// Enforced when recording results: once the cache holds this many entries,
/// results for URLs that are not already present are dropped.
const MAX_CACHE_ENTRIES: usize = 10_000;

/// A single cached URL check result.
///
/// The URL itself is not stored here; it is the key under which the entry is
/// kept in the cache map.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheEntry {
    /// Check outcome in string form, as produced by `CheckStatus::to_string()`.
    /// The value `"error"` marks an entry that is never served from the cache.
    pub status: String,
    /// When the result was recorded, in seconds since the Unix epoch.
    pub checked_at: u64,
    /// HTTP status code parsed from `detail` when it starts with `"HTTP "`;
    /// left out of the serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub http_status: Option<u16>,
    /// Detail text supplied together with the check result; left out of the
    /// serialized output when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<String>,
}

/// Verify cache backed by a JSON file.
///
/// Maps each checked URL to its most recently recorded [`CacheEntry`].
#[derive(Debug)]
pub struct VerifyCache {
    /// Location of the backing JSON file; empty for a cache created with
    /// `empty()`.
    path: String,
    /// Cached results keyed by URL.
    entries: HashMap<String, CacheEntry>,
}

impl VerifyCache {
    /// Create an empty in-memory cache (no file backing).
    pub fn empty() -> Self {
        Self {
            path: String::new(),
            entries: HashMap::new(),
        }
    }

    /// Load cache from file, or create empty if file doesn't exist or is invalid.
    ///
    /// # Errors
    ///
    /// Returns an error if the file exists but cannot be read (permissions, etc.).
    /// Invalid JSON is treated as empty cache (not an error).
    pub fn load(path: &str) -> Result<Self, String> {
        let entries = if Path::new(path).exists() {
            let content = std::fs::read_to_string(path)
                .map_err(|e| format!("failed to read cache file {path}: {e}"))?;
            serde_json::from_str(&content).unwrap_or_default()
        } else {
            HashMap::new()
        };

        Ok(Self {
            path: path.to_string(),
            entries,
        })
    }

    /// Check if a URL has a valid (non-expired) cache entry.
    /// Returns `None` if the URL should be re-checked.
    pub fn get(&self, url: &str) -> Option<&CacheEntry> {
        let entry = self.entries.get(url)?;
        let now = now_secs();

        // Always re-check error results
        if entry.status == "error" {
            return None;
        }

        // Check TTL for ok/warn results
        if now.saturating_sub(entry.checked_at) > OK_TTL_SECS {
            return None;
        }

        Some(entry)
    }

    /// Record a check result in the cache.
    pub fn put(&mut self, url: &str, status: CheckStatus, detail: Option<&str>) {
        // Enforce boundary
        if self.entries.len() >= MAX_CACHE_ENTRIES && !self.entries.contains_key(url) {
            return;
        }

        self.entries.insert(
            url.to_string(),
            CacheEntry {
                status: status.to_string(),
                checked_at: now_secs(),
                http_status: extract_http_status(detail),
                detail: detail.map(String::from),
            },
        );
    }

    /// Save cache to file.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be written.
    pub fn save(&self) -> Result<(), String> {
        // Prune expired entries before saving
        let now = now_secs();
        let pruned: HashMap<&String, &CacheEntry> = self
            .entries
            .iter()
            .filter(|(_, e)| {
                if e.status == "error" {
                    return false;
                }
                now.saturating_sub(e.checked_at) <= OK_TTL_SECS
            })
            .collect();

        let json = serde_json::to_string_pretty(&pruned)
            .map_err(|e| format!("failed to serialize cache: {e}"))?;
        std::fs::write(&self.path, json)
            .map_err(|e| format!("failed to write cache file {}: {e}", self.path))
    }
}

/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch.
fn now_secs() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}

/// Pull the numeric status code out of a detail string such as
/// "HTTP 404 Not Found".
///
/// Returns `None` when there is no detail, when it does not begin with
/// `"HTTP "`, or when the leading digits after the prefix do not form a
/// valid `u16`.
fn extract_http_status(detail: Option<&str>) -> Option<u16> {
    let tail = detail.and_then(|text| text.strip_prefix("HTTP "))?;
    // The code is the run of ASCII digits right after the prefix.
    let digits_end = tail
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..digits_end].parse().ok()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fill `cache` with `MAX_CACHE_ENTRIES` distinct, freshly recorded `Ok` results.
    fn fill_to_capacity(cache: &mut VerifyCache) {
        (0..MAX_CACHE_ENTRIES).for_each(|i| {
            cache.put(&format!("https://example.com/{i}"), CheckStatus::Ok, None);
        });
    }

    /// A freshly recorded `Ok` result is served from the cache.
    #[test]
    fn cache_put_and_get_ok() {
        let mut cache = VerifyCache::empty();
        cache.put("https://example.com", CheckStatus::Ok, None);

        let hit = cache.get("https://example.com");
        assert!(hit.is_some());
    }

    /// Error results are stored but never served.
    #[test]
    fn cache_error_always_rechecked() {
        let mut cache = VerifyCache::empty();
        cache.put("https://example.com", CheckStatus::Error, Some("HTTP 404"));

        let hit = cache.get("https://example.com");
        assert!(hit.is_none());
    }

    /// An entry older than the TTL is treated as a miss.
    #[test]
    fn cache_expired_entry_not_returned() {
        let mut cache = VerifyCache::empty();
        // checked_at of 0 is the epoch itself, i.e. far older than the TTL.
        let stale = CacheEntry {
            status: "ok".into(),
            checked_at: 0,
            http_status: None,
            detail: None,
        };
        cache.entries.insert("https://old.com".into(), stale);

        assert!(cache.get("https://old.com").is_none());
    }

    /// Warn results are cached just like ok results while within the TTL.
    #[test]
    fn cache_warn_within_ttl() {
        let mut cache = VerifyCache::empty();
        cache.put("https://example.com", CheckStatus::Warn, Some("timeout"));

        let hit = cache.get("https://example.com");
        assert!(hit.is_some());
    }

    /// Looking up a URL that was never recorded is a miss.
    #[test]
    fn cache_unknown_url_returns_none() {
        let cache = VerifyCache::empty();

        assert!(cache.get("https://unknown.com").is_none());
    }

    /// Status codes are parsed only from details with the "HTTP " prefix.
    #[test]
    fn extract_http_status_parses() {
        let cases = [
            (Some("HTTP 404 Not Found"), Some(404)),
            (Some("HTTP 200"), Some(200)),
            (Some("timeout"), None),
            (None, None),
        ];
        for (input, expected) in cases {
            assert_eq!(extract_http_status(input), expected);
        }
    }

    /// A full cache rejects results for URLs it does not already hold.
    #[test]
    fn cache_boundary_enforced() {
        let mut cache = VerifyCache::empty();
        fill_to_capacity(&mut cache);

        // One more should be rejected
        cache.put("https://overflow.com", CheckStatus::Ok, None);

        assert!(cache.get("https://overflow.com").is_none());
        assert_eq!(cache.entries.len(), MAX_CACHE_ENTRIES);
    }

    /// A full cache still accepts updates for URLs it already holds.
    #[test]
    fn cache_update_existing_within_boundary() {
        let mut cache = VerifyCache::empty();
        fill_to_capacity(&mut cache);

        // Updating existing should work even at capacity
        cache.put("https://example.com/0", CheckStatus::Error, Some("HTTP 500"));

        assert_eq!(cache.entries.len(), MAX_CACHE_ENTRIES);
    }
}