weave-content 0.2.12

Content DSL parser, validator, and builder for OSINT case files
Documentation
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};

use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};

/// Maximum build cache file size in bytes: 50 MiB (50 * 1024 * 1024).
///
/// [`BuildCache::load`] ignores a cache file larger than this and starts
/// with an empty cache instead.
const MAX_CACHE_SIZE_BYTES: u64 = 50 * 1024 * 1024;

/// Maximum number of entries held in the build cache.
///
/// Once this many entries are stored, [`BuildCache::put`] drops paths that
/// are not cached yet and only updates paths that are already present.
const MAX_CACHE_ENTRIES: usize = 50_000;

/// Cache file name (gitignored, lives in content root).
///
/// [`BuildCache::load`] joins this name onto the content root it is given.
pub const CACHE_FILENAME: &str = ".build-cache.json";

/// A single file entry in the build cache.
///
/// Entries are stored as the values of the JSON object written to the cache
/// file, keyed by the path of the file they describe.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BuildCacheEntry {
    /// SHA-256 hex digest of the file contents.
    pub hash: String,
    /// Paths of entity files this case depends on (empty for entity files).
    ///
    /// A missing `deps` key deserializes to an empty list, and an empty list
    /// is omitted again when the entry is serialized.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub deps: Vec<String>,
}

/// Incremental build cache backed by a JSON file.
///
/// Maps a file path to the hash and dependency list recorded for it.
#[derive(Debug)]
pub struct BuildCache {
    /// Location of the backing JSON file. An empty path marks an in-memory
    /// cache, for which `save` writes nothing.
    path: PathBuf,
    /// Cached entries keyed by file path.
    entries: HashMap<String, BuildCacheEntry>,
}

impl BuildCache {
    /// Create an empty in-memory cache (no file backing).
    ///
    /// The backing path is left empty, which turns [`BuildCache::save`] into
    /// a no-op.
    pub fn empty() -> Self {
        Self {
            path: PathBuf::new(),
            entries: HashMap::new(),
        }
    }

    /// Load the cache stored at `content_root`/[`CACHE_FILENAME`].
    ///
    /// Starts with an empty cache when the file does not exist, when it is
    /// larger than [`MAX_CACHE_SIZE_BYTES`], or when its content is not a
    /// valid cache document (malformed JSON, wrong shape, or bytes that are
    /// not valid UTF-8).
    ///
    /// # Errors
    ///
    /// Returns an error if the file exists but cannot be inspected or read.
    pub fn load(content_root: &Path) -> Result<Self, String> {
        let path = content_root.join(CACHE_FILENAME);

        // A single `metadata` call answers both "does it exist?" and "how big
        // is it?", so the file cannot disappear between two separate checks,
        // and stat failures other than "not found" are reported instead of
        // being mistaken for a missing file.
        let entries = match std::fs::metadata(&path) {
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => HashMap::new(),
            Err(e) => return Err(format!("failed to stat cache file: {e}")),
            Ok(meta) if meta.len() > MAX_CACHE_SIZE_BYTES => {
                eprintln!("build cache exceeds {MAX_CACHE_SIZE_BYTES} bytes, starting fresh");
                HashMap::new()
            }
            Ok(_) => {
                // Read raw bytes and let the JSON parser judge them: a
                // corrupted, non-UTF-8 cache then falls back to an empty
                // cache exactly like any other invalid cache file, instead of
                // failing the whole load.
                let raw = std::fs::read(&path)
                    .map_err(|e| format!("failed to read cache file: {e}"))?;
                serde_json::from_slice(&raw).unwrap_or_default()
            }
        };

        Ok(Self { path, entries })
    }

    /// Check whether a file looks unchanged since the last build.
    ///
    /// Returns `true` only if `path` is cached with a hash equal to
    /// `current_hash` and every dependency recorded for it still has a cache
    /// entry (i.e. was not pruned). The dependencies' *contents* are not
    /// compared, because their current hashes are not known here; use
    /// [`BuildCache::is_unchanged_with_hashes`] when they are.
    pub fn is_unchanged(&self, path: &str, current_hash: &str) -> bool {
        let Some(entry) = self.entries.get(path) else {
            return false;
        };
        if entry.hash != current_hash {
            return false;
        }
        // A dependency without a cache entry forces a rebuild.
        entry.deps.iter().all(|dep| self.entries.contains_key(dep))
    }

    /// Check if a file and all its dependencies are unchanged, given a map
    /// of current file hashes.
    ///
    /// Returns `true` only if `path` is cached with a hash equal to
    /// `current_hash` and, for every recorded dependency, the cached hash
    /// equals the hash found in `current_hashes`. A dependency missing from
    /// either the cache or `current_hashes` counts as changed.
    pub fn is_unchanged_with_hashes(
        &self,
        path: &str,
        current_hash: &str,
        current_hashes: &HashMap<String, String>,
    ) -> bool {
        let Some(entry) = self.entries.get(path) else {
            return false;
        };
        if entry.hash != current_hash {
            return false;
        }
        entry.deps.iter().all(|dep| {
            match (self.entries.get(dep), current_hashes.get(dep)) {
                (Some(cached), Some(current)) => cached.hash == *current,
                // Either the dependency was never cached or its file no
                // longer exists.
                _ => false,
            }
        })
    }

    /// Record a file's hash and dependencies in the cache.
    ///
    /// When the cache already holds [`MAX_CACHE_ENTRIES`] entries, a path
    /// that is not cached yet is silently dropped; existing paths can still
    /// be updated.
    pub fn put(&mut self, path: &str, hash: String, deps: Vec<String>) {
        if self.entries.len() >= MAX_CACHE_ENTRIES && !self.entries.contains_key(path) {
            return;
        }
        self.entries
            .insert(path.to_string(), BuildCacheEntry { hash, deps });
    }

    /// Remove entries for files that no longer exist.
    ///
    /// Keeps only the entries whose path is contained in `existing_files`.
    pub fn prune(&mut self, existing_files: &HashSet<String>) {
        self.entries.retain(|k, _| existing_files.contains(k));
    }

    /// Save cache to file as pretty-printed JSON.
    ///
    /// Does nothing for an in-memory cache created with
    /// [`BuildCache::empty`] (empty backing path).
    ///
    /// # Errors
    ///
    /// Returns an error if the cache cannot be serialized or the file cannot
    /// be written.
    pub fn save(&self) -> Result<(), String> {
        if self.path.as_os_str().is_empty() {
            return Ok(());
        }
        let json = serde_json::to_string_pretty(&self.entries)
            .map_err(|e| format!("failed to serialize build cache: {e}"))?;
        std::fs::write(&self.path, json).map_err(|e| format!("failed to write build cache: {e}"))
    }

    /// Number of entries in the cache.
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Whether the cache is empty.
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}

/// Read the file at `path` and return the SHA-256 hex digest of its bytes.
///
/// # Errors
///
/// Returns an error message naming the path if the file cannot be read.
pub fn hash_file(path: &Path) -> Result<String, String> {
    match std::fs::read(path) {
        Ok(bytes) => Ok(hash_bytes(&bytes)),
        Err(e) => Err(format!("failed to read {}: {e}", path.display())),
    }
}

/// Hash `data` with SHA-256 and return the digest as a lowercase hex string.
pub fn hash_bytes(data: &[u8]) -> String {
    // One-shot digest: equivalent to creating a hasher, feeding it `data`
    // once, and finalizing it.
    let digest = Sha256::digest(data);
    hex_encode(&digest)
}

/// Render `bytes` as lowercase hexadecimal, two digits per byte.
fn hex_encode(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble; both indices are always < 16.
        out.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}

/// Unit tests for hashing and for the in-memory behavior of `BuildCache`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hash_bytes_deterministic() {
        let h1 = hash_bytes(b"hello world");
        let h2 = hash_bytes(b"hello world");
        assert_eq!(h1, h2);
        assert_eq!(h1.len(), 64); // SHA-256 = 32 bytes = 64 hex chars
    }

    #[test]
    fn hash_bytes_different_input() {
        let h1 = hash_bytes(b"hello");
        let h2 = hash_bytes(b"world");
        assert_ne!(h1, h2);
    }

    #[test]
    fn hash_bytes_known_vectors() {
        // Standard SHA-256 test vectors; pins both the digest and the
        // lowercase hex encoding.
        assert_eq!(
            hash_bytes(b""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            hash_bytes(b"abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    #[test]
    fn cache_put_and_check() {
        let mut cache = BuildCache::empty();
        cache.put("cases/test.md", "abc123".to_string(), vec![]);
        assert!(cache.is_unchanged("cases/test.md", "abc123"));
        assert!(!cache.is_unchanged("cases/test.md", "different"));
    }

    #[test]
    fn cache_missing_entry() {
        let cache = BuildCache::empty();
        assert!(cache.is_empty());
        assert!(!cache.is_unchanged("missing.md", "abc"));
    }

    #[test]
    fn cache_is_unchanged_requires_dep_entries() {
        let mut cache = BuildCache::empty();
        cache.put(
            "cases/test.md",
            "case_hash".to_string(),
            vec!["people/test.md".to_string()],
        );

        // Dependency has no cache entry yet: must rebuild.
        assert!(!cache.is_unchanged("cases/test.md", "case_hash"));

        // Once the dependency is cached, the case counts as unchanged.
        cache.put("people/test.md", "entity_hash".to_string(), vec![]);
        assert!(cache.is_unchanged("cases/test.md", "case_hash"));
    }

    #[test]
    fn cache_with_deps() {
        let mut cache = BuildCache::empty();
        cache.put("people/test.md", "entity_hash".to_string(), vec![]);
        cache.put(
            "cases/test.md",
            "case_hash".to_string(),
            vec!["people/test.md".to_string()],
        );

        let mut hashes = HashMap::new();
        hashes.insert("cases/test.md".to_string(), "case_hash".to_string());
        hashes.insert("people/test.md".to_string(), "entity_hash".to_string());

        assert!(cache.is_unchanged_with_hashes("cases/test.md", "case_hash", &hashes));

        // Change dep hash
        hashes.insert("people/test.md".to_string(), "changed".to_string());
        assert!(!cache.is_unchanged_with_hashes("cases/test.md", "case_hash", &hashes));

        // Dep file no longer exists
        hashes.remove("people/test.md");
        assert!(!cache.is_unchanged_with_hashes("cases/test.md", "case_hash", &hashes));
    }

    #[test]
    fn cache_prune() {
        let mut cache = BuildCache::empty();
        cache.put("keep.md", "h1".to_string(), vec![]);
        cache.put("remove.md", "h2".to_string(), vec![]);

        let existing: HashSet<String> = ["keep.md".to_string()].into();
        cache.prune(&existing);

        assert_eq!(cache.len(), 1);
        assert!(cache.is_unchanged("keep.md", "h1"));
        // The pruned entry must really be gone, not just uncounted.
        assert!(!cache.is_unchanged("remove.md", "h2"));
    }

    #[test]
    fn cache_save_in_memory_is_noop() {
        let mut cache = BuildCache::empty();
        cache.put("cases/test.md", "abc123".to_string(), vec![]);
        // No backing path: saving succeeds without touching the filesystem.
        assert!(cache.save().is_ok());
        assert_eq!(cache.len(), 1);
    }

    #[test]
    fn cache_boundary_enforced() {
        let mut cache = BuildCache::empty();
        for i in 0..MAX_CACHE_ENTRIES {
            cache.put(&format!("file{i}.md"), format!("h{i}"), vec![]);
        }
        assert_eq!(cache.len(), MAX_CACHE_ENTRIES);

        // One more should be rejected
        cache.put("overflow.md", "hx".to_string(), vec![]);
        assert_eq!(cache.len(), MAX_CACHE_ENTRIES);
        assert!(!cache.is_unchanged("overflow.md", "hx"));
    }

    #[test]
    fn cache_update_existing_within_boundary() {
        let mut cache = BuildCache::empty();
        for i in 0..MAX_CACHE_ENTRIES {
            cache.put(&format!("file{i}.md"), format!("h{i}"), vec![]);
        }

        // Updating existing should work
        cache.put("file0.md", "updated".to_string(), vec![]);
        assert!(cache.is_unchanged("file0.md", "updated"));
        assert_eq!(cache.len(), MAX_CACHE_ENTRIES);
    }
}