kreuzberg 4.3.8

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 75+ formats with async/sync APIs.
Documentation
//! Cache utilities for key generation and disk space management.

use crate::error::Result;
use ahash::AHasher;
use std::hash::{Hash, Hasher};

#[cfg(unix)]
use crate::error::KreuzbergError;
#[cfg(unix)]
use std::path::Path;

/// Cache key hash format width (32 hex digits for u64 hash)
const CACHE_KEY_HASH_WIDTH: usize = 32;

/// Generate a deterministic cache key from configuration parameters.
///
/// # Algorithm
///
/// Uses ahash (non-cryptographic 64-bit hash) for performance. Cache keys are
/// generated by:
/// 1. Sorting key-value pairs by key (for determinism)
/// 2. Concatenating as "key1=val1&key2=val2&..."
/// 3. Hashing with ahash and formatting as 32-character hex
///
/// # Collision Probability
///
/// AHash produces 64-bit hashes, leading to birthday paradox collisions:
/// - **~0.01%** probability at 1 million cache entries
/// - **~1%** probability at 100 million entries
/// - **~50%** probability at 4.3 billion (2^32) entries
///
/// For context: P(collision) ≈ n^2 / (2 * 2^64) where n = number of entries.
///
/// # Performance vs Security Trade-off
///
/// - **ahash**: ~10x faster than SHA256, sufficient for cache keys
/// - **SHA256**: Collision-resistant but overkill for caching
/// - **Practical risk**: Low for typical usage (< 1M entries)
///
/// # Impact of Collisions
///
/// If two different configurations hash to the same key:
/// - One configuration reads the other's cached data
/// - Results in incorrect data served from cache
/// - Detected via metadata validation (size/mtime checks)
///
/// # Recommendations
///
/// - **< 1M entries**: ahash is safe and fast
/// - **> 100M entries**: Monitor cache size, consider periodic clearing
/// - **Critical data**: If collision risk is unacceptable, add SHA256 option
///
/// # Example
///
/// ```rust
/// use kreuzberg::cache::generate_cache_key;
///
/// let parts = [("format", "pdf"), ("ocr", "true"), ("lang", "en")];
/// let key = generate_cache_key(&parts);
/// assert_eq!(key.len(), 32); // 64-bit hash as hex
/// ```
pub fn generate_cache_key(parts: &[(&str, &str)]) -> String {
    if parts.is_empty() {
        return "empty".to_string();
    }

    let mut sorted_parts: Vec<_> = parts.to_vec();
    sorted_parts.sort_by_key(|(k, _)| *k);

    let estimated_size = sorted_parts.iter().map(|(k, v)| k.len() + v.len() + 2).sum::<usize>();
    let mut cache_str = String::with_capacity(estimated_size);

    for (i, (key, val)) in sorted_parts.iter().enumerate() {
        if i > 0 {
            cache_str.push('&');
        }
        cache_str.push_str(&format!("{}={}", key, val));
    }

    let mut hasher = AHasher::default();
    cache_str.hash(&mut hasher);
    let hash = hasher.finish();

    format!("{:0width$x}", hash, width = CACHE_KEY_HASH_WIDTH)
}

#[allow(unsafe_code)]
pub fn get_available_disk_space(path: &str) -> Result<f64> {
    #[cfg(unix)]
    {
        let path = Path::new(path);
        let check_path = if path.exists() {
            path
        } else if let Some(parent) = path.parent() {
            parent
        } else {
            Path::new("/")
        };

        use libc::{statvfs, statvfs as statvfs_struct};
        use std::ffi::CString;

        let path_str = check_path
            .to_str()
            .ok_or_else(|| KreuzbergError::validation("Path contains invalid UTF-8".to_string()))?;
        let c_path = CString::new(path_str).map_err(|e| KreuzbergError::validation(format!("Invalid path: {}", e)))?;

        let mut stat: statvfs_struct = unsafe { std::mem::zeroed() };

        let result = unsafe { statvfs(c_path.as_ptr(), &mut stat) };

        if result == 0 {
            #[allow(clippy::unnecessary_cast)]
            let available_bytes = stat.f_bavail as u64 * stat.f_frsize as u64;
            Ok(available_bytes as f64 / (1024.0 * 1024.0))
        } else {
            tracing::debug!("Failed to get disk stats for {}: errno {}", path_str, result);
            Ok(10000.0)
        }
    }

    #[cfg(not(unix))]
    {
        let _ = path;
        Ok(10000.0)
    }
}

pub fn fast_hash(data: &[u8]) -> u64 {
    let mut hasher = AHasher::default();
    data.hash(&mut hasher);
    hasher.finish()
}

pub fn validate_cache_key(key: &str) -> bool {
    key.len() == 32 && key.chars().all(|c| c.is_ascii_hexdigit())
}

pub fn filter_old_cache_entries(cache_times: &[f64], current_time: f64, max_age_seconds: f64) -> Vec<usize> {
    cache_times
        .iter()
        .enumerate()
        .filter_map(|(idx, &time)| {
            if current_time - time > max_age_seconds {
                Some(idx)
            } else {
                None
            }
        })
        .collect()
}

pub fn sort_cache_by_access_time(mut entries: Vec<(String, f64)>) -> Vec<String> {
    entries.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
    entries.into_iter().map(|(key, _)| key).collect()
}