pageinfo-rs 0.2.2

CLI tool that analyzes web pages and produces structured LLM-friendly output
Documentation
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};

use crate::cache::error::CacheError;
use crate::cache::key::CacheKey;
use crate::cache::types::{CacheConfig, CachedFetch, CachedPage};

/// Version of the on-disk cache layout.
///
/// `FileCache::init` writes this value to the `VERSION` file of a fresh cache
/// root and rejects a root whose `VERSION` file holds a different value.
const CACHE_VERSION: u32 = 1;

/// Storage interface for fetched pages, addressed by a [`CacheKey`].
///
/// The notes on each method describe the intent suggested by the signatures
/// and by the `FileCache` implementation in this file; the trait itself does
/// not enforce them.
pub trait Cache {
    /// Prepares the backing store for use.
    fn init(&self) -> Result<(), CacheError>;
    /// Derives the key under which a page with the given final URL is stored.
    fn key_for_final_url(&self, final_url: &str) -> Result<CacheKey, CacheError>;
    /// Looks up the page stored under `key`; `Ok(None)` signals a miss.
    fn load(&self, key: &CacheKey) -> Result<Option<CachedPage>, CacheError>;
    /// Persists `page` and returns the key it was stored under.
    fn store(&self, page: CachedPage) -> Result<CacheKey, CacheError>;
    /// Removes the entry stored under `key`.
    // NOTE(review): the `dead_code` allowance suggests there is no non-test
    // caller yet — confirm before removing the attribute.
    #[allow(dead_code)]
    fn delete(&self, key: &CacheKey) -> Result<(), CacheError>;
}

/// [`Cache`] implementation that keeps each page in its own directory under
/// `config.root_dir`.
///
/// Layout, as built by the path helpers in `impl FileCache`:
///
/// ```text
/// <root_dir>/VERSION
/// <root_dir>/pages/<key.hash>/fetch.json
/// <root_dir>/pages/<key.hash>/headers.json
/// <root_dir>/pages/<key.hash>/page.html
/// ```
#[derive(Debug, Clone)]
pub struct FileCache {
    /// Root directory plus the `enabled` / `refresh` switches.
    config: CacheConfig,
}

impl FileCache {
    pub fn new(config: CacheConfig) -> Self {
        Self { config }
    }

    pub fn is_enabled(&self) -> bool {
        self.config.enabled
    }

    pub fn should_refresh(&self) -> bool {
        self.config.refresh
    }

    fn version_path(&self) -> PathBuf {
        self.config.root_dir.join("VERSION")
    }

    fn pages_dir(&self) -> PathBuf {
        self.config.root_dir.join("pages")
    }

    fn entry_dir(&self, key: &CacheKey) -> PathBuf {
        self.pages_dir().join(&key.hash)
    }

    fn fetch_path(&self, key: &CacheKey) -> PathBuf {
        self.entry_dir(key).join("fetch.json")
    }

    fn headers_path(&self, key: &CacheKey) -> PathBuf {
        self.entry_dir(key).join("headers.json")
    }

    fn html_path(&self, key: &CacheKey) -> PathBuf {
        self.entry_dir(key).join("page.html")
    }

    fn read_version(&self) -> Result<Option<String>, CacheError> {
        let path = self.version_path();
        if !path.exists() {
            return Ok(None);
        }
        Ok(Some(fs::read_to_string(path)?.trim().to_string()))
    }

    fn write_json<T: serde::Serialize>(
        &self,
        path: &Path,
        value: &T,
    ) -> Result<(), CacheError> {
        let bytes = serde_json::to_vec_pretty(value)?;
        fs::write(path, bytes)?;
        Ok(())
    }
}

impl Cache for FileCache {
    /// Prepares the cache root and validates its layout version.
    ///
    /// Does nothing when the cache is disabled. Otherwise creates the `pages`
    /// directory (and any missing parents) and then:
    ///
    /// * writes `CACHE_VERSION` to `VERSION` if no marker exists yet;
    /// * succeeds if the marker equals `CACHE_VERSION`;
    /// * fails with `CacheError::VersionMismatch` for any other marker.
    ///
    /// # Errors
    ///
    /// Returns a `CacheError` on version mismatch or on any I/O failure.
    fn init(&self) -> Result<(), CacheError> {
        if !self.is_enabled() {
            return Ok(());
        }

        fs::create_dir_all(self.pages_dir())?;

        match self.read_version()? {
            // The marker is compared as text, so it must match the decimal
            // rendering of `CACHE_VERSION` exactly.
            Some(found) if found != CACHE_VERSION.to_string() => {
                Err(CacheError::VersionMismatch {
                    expected: CACHE_VERSION,
                    found,
                })
            }
            Some(_) => Ok(()),
            None => {
                fs::write(self.version_path(), CACHE_VERSION.to_string())?;
                Ok(())
            }
        }
    }

    /// Builds the cache key for `final_url` by delegating to `CacheKey::new`.
    fn key_for_final_url(&self, final_url: &str) -> Result<CacheKey, CacheError> {
        CacheKey::new(final_url)
    }

    /// Loads the entry stored under `key`.
    ///
    /// Returns `Ok(None)` when the cache is disabled or when any of the three
    /// entry files (`fetch.json`, `headers.json`, `page.html`) is missing.
    ///
    /// # Errors
    ///
    /// Returns a `CacheError` if a file exists but cannot be read, or if the
    /// JSON files cannot be deserialized.
    fn load(&self, key: &CacheKey) -> Result<Option<CachedPage>, CacheError> {
        // Treats a "not found" read failure as an absent file; every other
        // failure stays an error.
        fn if_present<T>(read: std::io::Result<T>) -> Result<Option<T>, CacheError> {
            match read {
                Ok(contents) => Ok(Some(contents)),
                Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(None),
                Err(err) => Err(CacheError::from(err)),
            }
        }

        if !self.is_enabled() {
            return Ok(None);
        }

        // Read the files directly rather than probing with `exists()` first:
        // an entry that disappears between a probe and the read would
        // otherwise surface as a hard error instead of a cache miss.
        //
        // All three files are read before anything is parsed, so an incomplete
        // entry is reported as a miss rather than as a parse error.
        let fetch_bytes = match if_present(fs::read(self.fetch_path(key)))? {
            Some(bytes) => bytes,
            None => return Ok(None),
        };
        let header_bytes = match if_present(fs::read(self.headers_path(key)))? {
            Some(bytes) => bytes,
            None => return Ok(None),
        };
        let html = match if_present(fs::read_to_string(self.html_path(key)))? {
            Some(text) => text,
            None => return Ok(None),
        };

        let fetch: CachedFetch = serde_json::from_slice(&fetch_bytes)?;
        let headers: HashMap<String, String> = serde_json::from_slice(&header_bytes)?;

        Ok(Some(CachedPage {
            fetch,
            headers,
            html,
        }))
    }

    /// Writes `page` to disk under the key derived from `page.fetch.final_url`
    /// and returns that key.
    ///
    /// The entry directory is created if needed; existing files of the same
    /// entry are overwritten.
    ///
    /// This method does not consult the `enabled` flag; callers that must not
    /// touch the disk should check [`FileCache::is_enabled`] first.
    ///
    /// # Errors
    ///
    /// Returns a `CacheError` if the key cannot be built, serialization fails,
    /// or any file-system operation fails. The three files are written one
    /// after another, so a failure part-way can leave an incomplete entry.
    fn store(&self, page: CachedPage) -> Result<CacheKey, CacheError> {
        let key = self.key_for_final_url(&page.fetch.final_url)?;
        let entry_dir = self.entry_dir(&key);
        fs::create_dir_all(&entry_dir)?;
        self.write_json(&self.fetch_path(&key), &page.fetch)?;
        self.write_json(&self.headers_path(&key), &page.headers)?;
        fs::write(self.html_path(&key), &page.html)?;
        Ok(key)
    }

    /// Removes the entry directory for `key`, if it exists.
    ///
    /// Deleting an entry that is not present is not an error. Like `store`,
    /// this method does not consult the `enabled` flag.
    ///
    /// # Errors
    ///
    /// Returns a `CacheError` if the directory exists but cannot be removed.
    fn delete(&self, key: &CacheKey) -> Result<(), CacheError> {
        let entry_dir = self.entry_dir(key);
        if entry_dir.exists() {
            fs::remove_dir_all(entry_dir)?;
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    use super::*;

    /// Returns a fresh, not-yet-created directory path under the system temp
    /// directory.
    ///
    /// Tests run in parallel, so the timestamp alone is not a safe source of
    /// uniqueness on platforms with a coarse clock. The process id and a
    /// per-process counter make the name unique across threads and across
    /// concurrently running test binaries.
    fn temp_root() -> PathBuf {
        static NEXT_ID: AtomicU64 = AtomicU64::new(0);

        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let pid = std::process::id();
        // Only uniqueness of the returned value matters, not ordering relative
        // to other memory operations, so `Relaxed` is sufficient.
        let seq = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        std::env::temp_dir().join(format!("pageinfo-cache-test-{nanos}-{pid}-{seq}"))
    }

    /// Builds a `CachedPage` whose `normalized_final_url` is taken from the
    /// key derived from `final_url`, with a fixed `fetched_at` of `"0"`.
    fn make_cached_page(
        input_url: &str,
        final_url: &str,
        status: u16,
        headers: HashMap<String, String>,
        html: &str,
    ) -> CachedPage {
        let key = CacheKey::new(final_url).unwrap();
        CachedPage {
            fetch: CachedFetch {
                input_url: input_url.to_string(),
                final_url: final_url.to_string(),
                normalized_final_url: key.normalized_final_url,
                status,
                fetched_at: "0".to_string(),
            },
            headers,
            html: html.to_string(),
        }
    }

    /// A stored page can be read back with the same URL, status and body.
    #[test]
    fn round_trip_store_and_load() {
        let root_dir = temp_root();
        let cache = FileCache::new(CacheConfig {
            root_dir: root_dir.clone(),
            enabled: true,
            refresh: false,
        });

        cache.init().unwrap();

        let mut headers = HashMap::new();
        headers.insert("content-type".to_string(), "text/html".to_string());

        let page = make_cached_page(
            "https://example.com",
            "https://example.com/news",
            200,
            headers,
            "<html></html>",
        );

        let key = cache.store(page).unwrap();
        let loaded = cache.load(&key).unwrap().unwrap();

        assert_eq!(loaded.fetch.final_url, "https://example.com/news");
        assert_eq!(loaded.fetch.status, 200);
        assert_eq!(loaded.html, "<html></html>");

        fs::remove_dir_all(root_dir).unwrap();
    }

    /// After `delete`, loading the same key is a miss rather than an error.
    #[test]
    fn delete_removes_cached_entry() {
        let root_dir = temp_root();
        let cache = FileCache::new(CacheConfig {
            root_dir: root_dir.clone(),
            enabled: true,
            refresh: false,
        });

        cache.init().unwrap();

        let page = make_cached_page(
            "https://example.com",
            "https://example.com/news",
            200,
            HashMap::new(),
            "<html></html>",
        );

        let key = cache.store(page).unwrap();
        assert!(cache.load(&key).unwrap().is_some());

        cache.delete(&key).unwrap();
        assert!(cache.load(&key).unwrap().is_none());

        fs::remove_dir_all(root_dir).unwrap();
    }

    /// A pre-existing `VERSION` marker with a foreign value makes `init` fail
    /// and reports both the expected and the found version.
    #[test]
    fn init_fails_on_version_mismatch() {
        let root_dir = temp_root();
        fs::create_dir_all(root_dir.join("pages")).unwrap();
        fs::write(root_dir.join("VERSION"), "999").unwrap();

        let cache = FileCache::new(CacheConfig {
            root_dir: root_dir.clone(),
            enabled: true,
            refresh: false,
        });

        let result = cache.init();
        assert!(matches!(
            result,
            Err(CacheError::VersionMismatch {
                expected: 1,
                found
            }) if found == "999"
        ));

        fs::remove_dir_all(root_dir).unwrap();
    }

    /// A disabled cache creates nothing on `init` and always misses on `load`.
    #[test]
    fn disabled_cache_skips_init_and_load() {
        let root_dir = temp_root();
        let cache = FileCache::new(CacheConfig {
            root_dir: root_dir.clone(),
            enabled: false,
            refresh: false,
        });

        cache.init().unwrap();
        assert!(!root_dir.exists());

        let key = cache.key_for_final_url("https://example.com/news").unwrap();
        assert!(cache.load(&key).unwrap().is_none());
        // No cleanup: nothing was created under `root_dir`.
    }
}