scanbridge 0.3.0

A unified, pluggable API for malware scanning with circuit breakers, policy enforcement, and audit logging
Documentation
//! Efficient file hashing with BLAKE3.
//!
//! This module provides `FileHasher` for computing file hashes.
//! BLAKE3 is used as the primary hash due to its speed (~10x faster than SHA256).
//! Optional SHA256 and MD5 hashes can be computed for external API compatibility.

use crate::core::error::ScanError;
use crate::core::input::FileInput;
use crate::core::types::FileHash;

use std::io::Read;
use std::path::Path;

/// Configuration for computing file hashes.
///
/// BLAKE3 is always computed as the primary hash.
/// SHA256 and MD5 can be enabled for compatibility with external systems.
///
/// # Examples
///
/// ```rust
/// use scanbridge::core::FileHasher;
///
/// // Default: only BLAKE3
/// let hasher = FileHasher::new();
///
/// // With SHA256 for VirusTotal compatibility
/// let hasher = FileHasher::new().with_sha256(true);
///
/// // With all hashes
/// let hasher = FileHasher::new().with_sha256(true).with_md5(true);
/// ```
#[derive(Debug, Clone, Default)]
pub struct FileHasher {
    /// Whether to compute SHA256 hash.
    compute_sha256: bool,
    /// Whether to compute MD5 hash.
    compute_md5: bool,
}

impl FileHasher {
    /// Creates a new `FileHasher` with default settings (BLAKE3 only).
    pub fn new() -> Self {
        Self::default()
    }

    /// Enables or disables SHA256 hash computation.
    pub fn with_sha256(mut self, enabled: bool) -> Self {
        self.compute_sha256 = enabled;
        self
    }

    /// Enables or disables MD5 hash computation.
    pub fn with_md5(mut self, enabled: bool) -> Self {
        self.compute_md5 = enabled;
        self
    }

    /// Returns whether SHA256 computation is enabled.
    pub fn computes_sha256(&self) -> bool {
        self.compute_sha256
    }

    /// Returns whether MD5 computation is enabled.
    pub fn computes_md5(&self) -> bool {
        self.compute_md5
    }

    /// Computes hashes from bytes.
    ///
    /// This is the most efficient method for small to medium files
    /// that are already in memory.
    pub fn hash_bytes(&self, data: &[u8]) -> FileHash {
        // BLAKE3 is always computed
        let blake3 = blake3::hash(data).to_hex().to_string();

        // Compute optional hashes
        let sha256 = if self.compute_sha256 {
            Some(compute_sha256_sync(data))
        } else {
            None
        };

        let md5 = if self.compute_md5 {
            Some(compute_md5_sync(data))
        } else {
            None
        };

        FileHash { blake3, sha256, md5 }
    }

    /// Computes hashes from a file path.
    ///
    /// For large files, this streams the file to avoid loading
    /// it entirely into memory.
    pub fn hash_file(&self, path: &Path) -> Result<FileHash, ScanError> {
        let file = std::fs::File::open(path).map_err(|e| {
            if e.kind() == std::io::ErrorKind::NotFound {
                ScanError::FileNotFound {
                    path: path.display().to_string(),
                }
            } else {
                ScanError::Io(e)
            }
        })?;

        let mut reader = std::io::BufReader::new(file);
        self.hash_reader(&mut reader)
    }

    /// Computes hashes from a synchronous reader.
    ///
    /// This streams the data to compute all hashes in a single pass.
    pub fn hash_reader<R: Read>(&self, reader: &mut R) -> Result<FileHash, ScanError> {
        // Initialize hashers
        let mut blake3_hasher = blake3::Hasher::new();
        let mut sha256_hasher: Option<Sha256State> = if self.compute_sha256 {
            Some(Sha256State::new())
        } else {
            None
        };
        let mut md5_hasher: Option<Md5State> = if self.compute_md5 {
            Some(Md5State::new())
        } else {
            None
        };

        // Stream data through all hashers
        let mut buffer = [0u8; 64 * 1024]; // 64KB buffer
        loop {
            let bytes_read = reader.read(&mut buffer)?;
            if bytes_read == 0 {
                break;
            }

            let chunk = &buffer[..bytes_read];
            blake3_hasher.update(chunk);

            if let Some(ref mut h) = sha256_hasher {
                h.update(chunk);
            }
            if let Some(ref mut h) = md5_hasher {
                h.update(chunk);
            }
        }

        // Finalize hashes
        let blake3 = blake3_hasher.finalize().to_hex().to_string();
        let sha256 = sha256_hasher.map(|h| h.finalize());
        let md5 = md5_hasher.map(|h| h.finalize());

        Ok(FileHash { blake3, sha256, md5 })
    }

    /// Computes hashes from a `FileInput`.
    ///
    /// This dispatches to the appropriate method based on the input type.
    pub fn hash_input(&self, input: &FileInput) -> Result<FileHash, ScanError> {
        match input {
            FileInput::Path(path) => self.hash_file(path),
            FileInput::Bytes { data, .. } => Ok(self.hash_bytes(data)),
            FileInput::Stream { .. } => Err(ScanError::internal(
                "Cannot hash stream synchronously; use hash_input_async",
            )),
        }
    }
}

// SHA256 state wrapper (using pure Rust implementation if feature not enabled)
struct Sha256State {
    buffer: Vec<u8>,
}

impl Sha256State {
    fn new() -> Self {
        Self {
            buffer: Vec::new(),
        }
    }

    fn update(&mut self, data: &[u8]) {
        // For now, we collect data and compute at the end
        // In production with sha2 crate, this would stream
        self.buffer.extend_from_slice(data);
    }

    fn finalize(self) -> String {
        // When sha2 crate is enabled, this uses proper SHA256
        compute_sha256_sync(&self.buffer)
    }
}

// MD5 state wrapper
struct Md5State {
    buffer: Vec<u8>,
}

impl Md5State {
    fn new() -> Self {
        Self { buffer: Vec::new() }
    }

    fn update(&mut self, data: &[u8]) {
        self.buffer.extend_from_slice(data);
    }

    fn finalize(self) -> String {
        compute_md5_sync(&self.buffer)
    }
}

/// Computes SHA256 hash of bytes (fallback implementation).
///
/// Note: For production use with the `sha2` feature, this uses
/// the sha2 crate for proper SHA256. This fallback uses a simple
/// representation for testing.
fn compute_sha256_sync(data: &[u8]) -> String {
    // Simple implementation using blake3 with a domain separator
    // In production, this would use the sha2 crate
    #[cfg(feature = "sha2")]
    {
        use sha2::{Sha256, Digest};
        let mut hasher = Sha256::new();
        hasher.update(data);
        format!("{:x}", hasher.finalize())
    }

    #[cfg(not(feature = "sha2"))]
    {
        // Fallback: return a placeholder that's clearly not real SHA256
        // This should only be used when sha2 feature is not enabled
        let b3 = blake3::hash(data);
        format!("sha256-compat-{}", &b3.to_hex()[..32])
    }
}

/// Computes MD5 hash of bytes (fallback implementation).
fn compute_md5_sync(data: &[u8]) -> String {
    #[cfg(feature = "md-5")]
    {
        use md5::{Md5, Digest};
        let mut hasher = Md5::new();
        hasher.update(data);
        format!("{:x}", hasher.finalize())
    }

    #[cfg(not(feature = "md-5"))]
    {
        // Fallback: return a placeholder
        let b3 = blake3::hash(data);
        format!("md5-compat-{}", &b3.to_hex()[..16])
    }
}

/// Async file hasher for use with async runtimes.
#[cfg(feature = "tokio-runtime")]
pub mod async_hasher {
    use super::*;

    impl FileHasher {
        /// Asynchronously computes hashes from a file path.
        pub async fn hash_file_async(&self, path: &Path) -> Result<FileHash, ScanError> {
            let data = tokio::fs::read(path).await.map_err(|e| {
                if e.kind() == std::io::ErrorKind::NotFound {
                    ScanError::FileNotFound {
                        path: path.display().to_string(),
                    }
                } else {
                    ScanError::Io(e)
                }
            })?;
            Ok(self.hash_bytes(&data))
        }

        /// Asynchronously computes hashes from a `FileInput`.
        pub async fn hash_input_async(&self, input: &FileInput) -> Result<FileHash, ScanError> {
            match input {
                FileInput::Path(path) => self.hash_file_async(path).await,
                FileInput::Bytes { data, .. } => Ok(self.hash_bytes(data)),
                FileInput::Stream { reader, .. } => {
                    // Read stream into memory for hashing
                    // For very large files, consider streaming hash updates
                    let mut guard = reader.lock().await;
                    let mut data = Vec::new();
                    let mut buf = [0u8; 64 * 1024];
                    loop {
                        let n = futures::AsyncReadExt::read(guard.as_mut(), &mut buf)
                            .await
                            .map_err(ScanError::Io)?;
                        if n == 0 {
                            break;
                        }
                        data.extend_from_slice(&buf[..n]);
                    }
                    Ok(self.hash_bytes(&data))
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hash_bytes_blake3_only() {
        let hasher = FileHasher::new();
        let hash = hasher.hash_bytes(b"hello world");

        assert!(!hash.blake3.is_empty());
        assert_eq!(hash.sha256, None);
        assert_eq!(hash.md5, None);
    }

    #[test]
    fn test_hash_bytes_with_sha256() {
        let hasher = FileHasher::new().with_sha256(true);
        let hash = hasher.hash_bytes(b"hello world");

        assert!(!hash.blake3.is_empty());
        assert!(hash.sha256.is_some());
        assert_eq!(hash.md5, None);
    }

    #[test]
    fn test_hash_bytes_all() {
        let hasher = FileHasher::new().with_sha256(true).with_md5(true);
        let hash = hasher.hash_bytes(b"hello world");

        assert!(!hash.blake3.is_empty());
        assert!(hash.sha256.is_some());
        assert!(hash.md5.is_some());
    }

    #[test]
    fn test_hash_deterministic() {
        let hasher = FileHasher::new();
        let data = b"test data for hashing";

        let hash1 = hasher.hash_bytes(data);
        let hash2 = hasher.hash_bytes(data);

        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_hash_different_data() {
        let hasher = FileHasher::new();

        let hash1 = hasher.hash_bytes(b"data1");
        let hash2 = hasher.hash_bytes(b"data2");

        assert_ne!(hash1.blake3, hash2.blake3);
    }
}