kreuzberg 4.5.4

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 88+ formats with async/sync APIs.
Documentation
//! Cache utilities for key generation and disk space management.

use crate::error::Result;
use std::io::Read;
use std::path::Path;

#[cfg(unix)]
use crate::error::KreuzbergError;

/// Width of a cache key in hexadecimal characters (32 hex digits = 128 bits of blake3 output).
///
/// Each digest byte encodes to two hex characters, so the hashing helpers in this
/// module keep the first `CACHE_KEY_HASH_WIDTH / 2` bytes of the blake3 digest.
const CACHE_KEY_HASH_WIDTH: usize = 32;

/// Generate a deterministic cache key from configuration parameters.
///
/// # Algorithm
///
/// Uses blake3 (cryptographic, SIMD-accelerated) for collision-resistant cache keys.
/// Cache keys are generated by:
/// 1. Sorting key-value pairs by key (for determinism)
/// 2. Concatenating as "key1=val1&key2=val2&..."
/// 3. Hashing with blake3 and formatting as 32-character hex (first 128 bits)
///
/// # Special cases
///
/// - An empty `parts` slice returns the literal string `"empty"`, which is *not*
///   a 32-character hex key (it does not pass `validate_cache_key`).
/// - The sort is stable and compares keys only, so pairs that share a key keep
///   their input order; the result is order-independent only when keys are unique.
/// - Keys and values are concatenated verbatim; `=` and `&` inside them are not escaped.
///
/// # Example
///
/// ```rust
/// use kreuzberg::cache::generate_cache_key;
///
/// let parts = [("format", "pdf"), ("ocr", "true"), ("lang", "en")];
/// let key = generate_cache_key(&parts);
/// assert_eq!(key.len(), 32);
/// ```
pub fn generate_cache_key(parts: &[(&str, &str)]) -> String {
    if parts.is_empty() {
        return "empty".to_string();
    }

    let mut sorted_parts: Vec<_> = parts.to_vec();
    // Stable sort on the key alone: callers passing the same pairs in a different
    // order get the same key (as long as keys are unique).
    sorted_parts.sort_by_key(|&(key, _)| key);

    // Each pair contributes key + '=' + value, plus at most one '&' separator.
    let estimated_size = sorted_parts.iter().map(|(k, v)| k.len() + v.len() + 2).sum::<usize>();
    let mut cache_str = String::with_capacity(estimated_size);

    for (i, (key, val)) in sorted_parts.iter().enumerate() {
        if i > 0 {
            cache_str.push('&');
        }
        // Append the pieces directly instead of going through `format!`, which
        // would allocate a temporary `String` for every pair.
        cache_str.push_str(key);
        cache_str.push('=');
        cache_str.push_str(val);
    }

    blake3_hash_to_hex(cache_str.as_bytes())
}

/// Hash arbitrary bytes with blake3, returning a 32-char hex string.
///
/// Public entry point that forwards to the private `blake3_hash_to_hex` helper,
/// which keeps only the first 128 bits of the digest.
pub fn blake3_hash_bytes(data: &[u8]) -> String {
    blake3_hash_to_hex(data)
}

/// Hash a file's content with blake3 using streaming 64 KiB reads.
///
/// Returns a 32-char hex string (128 bits of blake3 output).
pub fn blake3_hash_file(path: &Path) -> Result<String> {
    let file = std::fs::File::open(path)
        .map_err(|e| crate::error::KreuzbergError::cache(format!("Failed to open file for hashing: {e}")))?;
    let mut reader = std::io::BufReader::with_capacity(64 * 1024, file);
    let mut hasher = blake3::Hasher::new();

    let mut buf = [0u8; 64 * 1024];
    loop {
        let n = reader
            .read(&mut buf)
            .map_err(|e| crate::error::KreuzbergError::cache(format!("Failed to read file for hashing: {e}")))?;
        if n == 0 {
            break;
        }
        hasher.update(&buf[..n]);
    }

    let hash = hasher.finalize();
    Ok(hex::encode(&hash.as_bytes()[..CACHE_KEY_HASH_WIDTH / 2]))
}

/// Compute the blake3 digest of `data` and hex-encode its leading 128 bits.
///
/// The result is always `CACHE_KEY_HASH_WIDTH` (32) hex characters long.
fn blake3_hash_to_hex(data: &[u8]) -> String {
    let digest = blake3::hash(data);
    // Two hex characters per byte, so half the key width in digest bytes.
    let truncated = &digest.as_bytes()[..CACHE_KEY_HASH_WIDTH / 2];
    hex::encode(truncated)
}

#[allow(unsafe_code)]
pub fn get_available_disk_space(path: &str) -> Result<f64> {
    #[cfg(unix)]
    {
        let path = Path::new(path);
        let check_path = if path.exists() {
            path
        } else if let Some(parent) = path.parent() {
            parent
        } else {
            Path::new("/")
        };

        use libc::{statvfs, statvfs as statvfs_struct};
        use std::ffi::CString;

        let path_str = check_path
            .to_str()
            .ok_or_else(|| KreuzbergError::validation("Path contains invalid UTF-8".to_string()))?;
        let c_path = CString::new(path_str).map_err(|e| KreuzbergError::validation(format!("Invalid path: {}", e)))?;

        let mut stat: statvfs_struct = unsafe { std::mem::zeroed() };

        let result = unsafe { statvfs(c_path.as_ptr(), &mut stat) };

        if result == 0 {
            #[allow(clippy::unnecessary_cast)]
            let available_bytes = stat.f_bavail as u64 * stat.f_frsize as u64;
            Ok(available_bytes as f64 / (1024.0 * 1024.0))
        } else {
            tracing::debug!("Failed to get disk stats for {}: errno {}", path_str, result);
            Ok(10000.0)
        }
    }

    #[cfg(not(unix))]
    {
        let _ = path;
        Ok(10000.0)
    }
}

/// Derive a 64-bit hash of `data` from its blake3 digest.
///
/// The first eight digest bytes are interpreted as a little-endian `u64`.
pub fn fast_hash(data: &[u8]) -> u64 {
    let digest = blake3::hash(data);
    let mut prefix = [0u8; 8];
    prefix.copy_from_slice(&digest.as_bytes()[..8]);
    u64::from_le_bytes(prefix)
}

/// Check whether `key` has the shape of a cache key: exactly 32 ASCII hex digits.
///
/// Both lowercase and uppercase hex digits are accepted.
pub fn validate_cache_key(key: &str) -> bool {
    let bytes = key.as_bytes();
    // Any non-ASCII character shows up as bytes >= 0x80, which are never hex digits.
    bytes.len() == 32 && bytes.iter().all(u8::is_ascii_hexdigit)
}

/// Return the indices of cache entries older than `max_age_seconds`.
///
/// An entry at index `i` is selected when `current_time - cache_times[i]` is
/// strictly greater than `max_age_seconds`. Indices are returned in ascending order.
pub fn filter_old_cache_entries(cache_times: &[f64], current_time: f64, max_age_seconds: f64) -> Vec<usize> {
    cache_times
        .iter()
        .enumerate()
        .filter(|&(_, &timestamp)| current_time - timestamp > max_age_seconds)
        .map(|(position, _)| position)
        .collect()
}

/// Order cache keys by their access time, smallest time first.
///
/// Takes `(key, access_time)` pairs and returns only the keys. Times are compared
/// with `f64::total_cmp`; the sort is stable, so equal times keep their input order.
pub fn sort_cache_by_access_time(mut entries: Vec<(String, f64)>) -> Vec<String> {
    entries.sort_by(|(_, left), (_, right)| left.total_cmp(right));

    let mut ordered_keys = Vec::with_capacity(entries.len());
    for (key, _) in entries {
        ordered_keys.push(key);
    }
    ordered_keys
}

/// Validate a cache namespace string and return an owned copy if it is acceptable.
///
/// A namespace is accepted when it is 1 to 64 bytes long and consists solely of
/// ASCII letters, ASCII digits, hyphens, or underscores. The input is never
/// modified: the function returns it unchanged, or `None` if it is invalid.
pub fn sanitize_namespace(namespace: &str) -> Option<String> {
    let length_ok = (1..=64).contains(&namespace.len());
    // Non-ASCII characters encode to bytes >= 0x80, which fail this check.
    let charset_ok = namespace
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_'));

    (length_ok && charset_ok).then(|| namespace.to_string())
}