vyre-wgpu 0.1.0

// Bulk byte decoding engine.
//
// NOTE: This is a host-side workflow dispatcher, not an IR op domain. It
// accepts runtime bytes and TOML-configured decode rules, owns GPU buffers,
// dispatches format-specific kernels, and returns decoded byte regions. The
// IR-side decode operations live under `vyre::ops::decode`; those produce
// `Program` values that go through validate and lower.

/// Decode codecs.
///
/// The re-exports at the bottom of this file take the decoder entry points
/// and `GpuDecoder` from `codec::decoder`, and `DecodeFormat` from
/// `codec::format`.
pub mod codec;
/// The `dispatch` module.
// NOTE(review): presumably holds the GPU kernel dispatch mentioned in the file
// header; its contents are not visible from this file — confirm.
pub mod dispatch;

// Entropy helpers used by decode region discovery.
//
// NOTE: This is a host-side CPU helper, not part of the vyre IR. The
// IR-side entropy operation lives in `vyre::ops::hash::entropy`.

/// Largest sliding-window size, in bytes, that `entropy_map_cpu` will process.
///
/// A larger `window_size` makes `entropy_map_cpu` return an empty map rather
/// than an error.
pub const MAX_WINDOW_SIZE: usize = 256;

/// Amount added to the end of every region by `find_high_entropy_regions`.
///
/// It is forwarded as the `window_size` argument of
/// `find_high_entropy_regions_with_window`.
pub const DEFAULT_REGION_EXPANSION: usize = 256;

/// Largest input, in bytes, accepted by `entropy_map_cpu` (64 MiB).
///
/// Longer inputs are rejected with `EntropyError::InputTooLarge`.
pub const MAX_INPUT_BYTES: usize = 64 * 1024 * 1024;

/// Failure modes of the CPU entropy helpers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntropyError {
    /// The input is longer than the supported maximum.
    InputTooLarge,
}

impl core::fmt::Display for EntropyError {
    /// Writes a human-readable description that ends with a suggested fix.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Pick the static message first, then emit it in one place.
        let text = match self {
            Self::InputTooLarge => {
                "input length exceeds 64 MiB. Fix: split the input into smaller chunks."
            }
        };
        f.write_str(text)
    }
}

impl std::error::Error for EntropyError {}

/// Compute the Shannon entropy of `bytes`, in bits per byte.
///
/// Returns `0.0` for an empty slice. Each byte value's frequency is counted
/// with saturating arithmetic, then `-p * log2(p)` is summed over the values
/// that occur.
///
/// Uses f64 for the size division: f32 mantissa loses precision on files
/// > 16 MB and the entropy estimate drifts. Only the final result is
/// narrowed to `f32`.
pub fn shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values; saturating so a counter can never wrap.
    let mut histogram = [0u32; 256];
    for &byte in bytes {
        let slot = &mut histogram[usize::from(byte)];
        *slot = slot.saturating_add(1);
    }

    let len = bytes.len() as f64;
    // Accumulate in ascending byte-value order, skipping values that never occur
    // (their contribution is defined as zero and `log2(0)` must be avoided).
    let bits = histogram
        .iter()
        .filter(|&&occurrences| occurrences != 0)
        .fold(0.0_f64, |acc, &occurrences| {
            let p = f64::from(occurrences) / len;
            acc - p * p.log2()
        });
    bits as f32
}

/// Compute Shannon entropy for each sliding window on CPU.
pub fn entropy_map_cpu(
    data: &[u8],
    window_size: usize,
) -> std::result::Result<Vec<f32>, EntropyError> {
    if data.len() > MAX_INPUT_BYTES {
        return Err(EntropyError::InputTooLarge);
    }
    if data.is_empty()
        || window_size == 0
        || window_size > data.len()
        || window_size > MAX_WINDOW_SIZE
    {
        return Ok(Vec::new());
    }
    let windows: Vec<f32> = (0..=data.len() - window_size)
        .map(|start| shannon_entropy(&data[start..start + window_size]))
        .collect();
    Ok(windows)
}

/// Convert entropy values to contiguous high-entropy regions.
///
/// Delegates to `find_high_entropy_regions_with_window`, passing
/// `DEFAULT_REGION_EXPANSION` as the window size, so every region's second
/// component is extended by that amount.
pub fn find_high_entropy_regions(entropy: &[f32], threshold: f32) -> Vec<(usize, usize)> {
    find_high_entropy_regions_with_window(entropy, threshold, DEFAULT_REGION_EXPANSION)
}

/// Group consecutive entropy samples that are strictly above `threshold`.
///
/// For every maximal run of such samples one tuple is produced:
/// the first component is the index where the run starts, the second is the
/// index just past the run (or `entropy.len()` for a run reaching the end)
/// plus `window_size`, using saturating addition.
///
/// Samples that are equal to `threshold`, or that are NaN, do not count as
/// high entropy because the comparison is a plain `>`.
// NOTE(review): the second component looks like an exclusive end offset
// widened by the window — confirm against the callers.
pub fn find_high_entropy_regions_with_window(
    entropy: &[f32],
    threshold: f32,
    window_size: usize,
) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();
    let mut open_run: Option<usize> = None;

    for (index, &sample) in entropy.iter().enumerate() {
        if sample > threshold {
            // Remember only the first index of the run.
            open_run.get_or_insert(index);
        } else if let Some(begin) = open_run.take() {
            spans.push((begin, index.saturating_add(window_size)));
        }
    }

    // A run still open at the end of the input is closed at `entropy.len()`.
    if let Some(begin) = open_run {
        spans.push((begin, entropy.len().saturating_add(window_size)));
    }
    spans
}

// Host-side recursive decode frontier management.
//
// NOTE: This is NOT part of the vyre IR. It runs on the CPU around decode
// GPU dispatches, tracks decoded-region frontiers, rejects malformed region
// bounds, and deduplicates recursive decode work. It does not produce a
// Program, does not go through validate or lower, and is not registered in
// the op registry.

use std::collections::{HashSet, VecDeque};
use std::hash::{Hash, Hasher};
use vyre::{Error, Result};

/// Fixes architecture_deep_audit.md#10/#13: crate-private visibility avoids
/// restricted visibility audit blind spots.
pub(crate) fn recursive_decode<F>(
    file_bytes: &[u8],
    rules: &DecodeRules,
    mut decode_one: F,
) -> Result<Vec<DecodedRegion>>
where
    F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
{
    if rules.max_passes == 0 {
        return Err(Error::Decode {
            message: "max_passes must be at least 1. Fix: call DecodeRules::validate before dispatch or set max_passes to a positive value.".to_string(),
        });
    }
    let mut visited_hashes = HashSet::<u64>::from([stable_hash(file_bytes)]);
    let mut seen_regions = HashSet::<(usize, usize)>::new();
    let mut frontier = VecDeque::from([(0usize, file_bytes.to_vec())]);
    let mut all_regions = Vec::<DecodedRegion>::new();

    for _ in 0..rules.max_passes {
        let mut next_frontier = VecDeque::new();
        let mut progress = false;
        while let Some((base_offset, bytes)) = frontier.pop_front() {
            let mut state = FrontierState {
                seen_regions: &mut seen_regions,
                visited_hashes: &mut visited_hashes,
                next_frontier: &mut next_frontier,
                all_regions: &mut all_regions,
                progress: &mut progress,
            };
            decode_frontier(base_offset, &bytes, rules, &mut decode_one, &mut state)?;
        }
        if !progress {
            break;
        }
        frontier = next_frontier;
    }
    all_regions.sort_by(|left, right| {
        left.offset
            .cmp(&right.offset)
            .then(left.length.cmp(&right.length))
            .then(left.decoded_bytes.cmp(&right.decoded_bytes))
    });
    Ok(all_regions)
}

/// Decode one frontier buffer with every supported format.
///
/// Formats are tried in a fixed order: Base64, Hex, Url, Unicode. Every
/// region returned by `decode_one` is forwarded to `push_region` together with
/// `base_offset` and the buffer it was decoded from.
///
/// # Errors
///
/// Stops at, and returns, the first error produced by `decode_one` or
/// `push_region`.
pub fn decode_frontier<F>(
    base_offset: usize,
    bytes: &[u8],
    rules: &DecodeRules,
    decode_one: &mut F,
    state: &mut FrontierState<'_>,
) -> Result<()>
where
    F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
{
    let formats = [
        DecodeFormat::Base64,
        DecodeFormat::Hex,
        DecodeFormat::Url,
        DecodeFormat::Unicode,
    ];
    for format in formats {
        let regions = decode_one(format, bytes, rules)?;
        regions
            .into_iter()
            .try_for_each(|region| push_region(base_offset, bytes, region, state))?;
    }
    Ok(())
}

/// Mutable bookkeeping that `recursive_decode` lends to `decode_frontier` and
/// `push_region` while one frontier buffer is processed.
///
/// Every field is a mutable borrow of state owned by the driver loop, so
/// updates made through this struct remain visible to the driver afterwards.
pub struct FrontierState<'a> {
    /// `(offset, length)` pairs of regions already recorded; used by
    /// `push_region` to drop duplicates.
    seen_regions: &'a mut HashSet<(usize, usize)>,
    /// `stable_hash` values of payloads that were already queued, seeded with
    /// the hash of the original input.
    visited_hashes: &'a mut HashSet<u64>,
    /// `(base offset, decoded bytes)` work items collected for the next pass.
    next_frontier: &'a mut VecDeque<(usize, Vec<u8>)>,
    /// Every accepted region, with its offset already shifted by the base offset.
    all_regions: &'a mut Vec<DecodedRegion>,
    /// Set to `true` whenever a previously unseen region is recorded.
    progress: &'a mut bool,
}

/// Validate one decoder result and record it in `state`.
///
/// `region` is expressed relative to `bytes`; `base_offset` is added to its
/// offset before it is stored. The region is handled as follows:
///
/// 1. `offset + length` must not overflow and must lie within `bytes`.
/// 2. A region whose decoded bytes equal its own source bytes is ignored.
/// 3. A region whose normalized `(offset, length)` was already recorded is
///    ignored.
/// 4. Otherwise the region is appended to `all_regions`, `progress` is set,
///    and its decoded bytes are queued for the next pass unless a payload
///    with the same `stable_hash` was queued before.
///
/// # Errors
///
/// Returns `Error::Decode` when the region's bounds overflow, when they exceed
/// `bytes.len()`, or when `base_offset + region.offset` overflows `usize`.
pub fn push_region(
    base_offset: usize,
    bytes: &[u8],
    region: DecodedRegion,
    state: &mut FrontierState<'_>,
) -> Result<()> {
    let source_end = region
        .offset
        .checked_add(region.length)
        .ok_or_else(|| Error::Decode {
            message: "region overflow while validating source bounds. Fix: ensure the GPU decoder returns offset + length within usize bounds.".to_string(),
        })?;
    if source_end > bytes.len() {
        return Err(Error::Decode {
            message: "decoder returned a region beyond input bounds. Fix: report the decoder shader output and reject this malformed region.".to_string(),
        });
    }
    // A "decode" that reproduces its own source bytes is dropped.
    if region.decoded_bytes == bytes[region.offset..source_end] {
        return Ok(());
    }
    // Checked like the bounds above: a plain `+` would panic in debug builds
    // and silently wrap in release builds.
    let normalized_offset = base_offset
        .checked_add(region.offset)
        .ok_or_else(|| Error::Decode {
            message: "region offset overflow while applying the frontier base offset. Fix: ensure base_offset + region offset stays within usize bounds.".to_string(),
        })?;
    let normalized = DecodedRegion {
        offset: normalized_offset,
        length: region.length,
        decoded_bytes: region.decoded_bytes,
    };
    if state
        .seen_regions
        .insert((normalized.offset, normalized.length))
    {
        *state.progress = true;
        let hash = stable_hash(&normalized.decoded_bytes);
        // Only payloads not queued before are decoded again in the next pass.
        if state.visited_hashes.insert(hash) {
            state
                .next_frontier
                .push_back((normalized.offset, normalized.decoded_bytes.clone()));
        }
        state.all_regions.push(normalized);
    }
    Ok(())
}

/// Hash `bytes` with a freshly constructed standard-library `DefaultHasher`.
///
/// The slice is fed through its `Hash` implementation, so equal slices always
/// produce equal values within one build of the program.
// NOTE(review): the standard library does not promise that `DefaultHasher`
// keeps the same algorithm across Rust releases — treat the values as
// process-local and confirm nothing persists them.
pub fn stable_hash(bytes: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::new();
    Hash::hash(bytes, &mut state);
    Hasher::finish(&state)
}

/// Concatenate the decoded bytes of `regions`, in the order given.
///
/// Offsets and lengths are discarded; only `decoded_bytes` contributes to the
/// output.
///
/// Fixes architecture_deep_audit.md#10/#13: crate-private visibility avoids
/// restricted visibility audit blind spots.
pub(crate) fn flatten_regions(regions: Vec<DecodedRegion>) -> Vec<u8> {
    let mut flattened = Vec::new();
    for region in regions {
        flattened.extend(region.decoded_bytes);
    }
    flattened
}
// Decoded region metadata.

/// One region of the source that a decode pass turned into bytes.
///
/// `offset` and `length` describe the encoded span in the source, while
/// `decoded_bytes` holds the output, which may have a different length.
///
/// The struct is `#[non_exhaustive]` so that further region metadata
/// (like character encoding or confidence scores) can be added without
/// breaking consumers.
///
/// # Examples
///
/// ```
/// use vyre_wgpu::engine::decode::DecodedRegion;
///
/// let region = DecodedRegion::new(0, 4, vec![1, 2, 3]);
/// assert_eq!(region.decoded_bytes.len(), 3);
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub struct DecodedRegion {
    /// Offset in the source where the encoded region begins.
    pub offset: usize,
    /// Number of source bytes covered by the encoded region.
    pub length: usize,
    /// Bytes produced by decoding the region.
    pub decoded_bytes: Vec<u8>,
}

impl DecodedRegion {
    /// Build a region from its source span and decoded payload.
    ///
    /// No validation is performed; the values are stored as given.
    ///
    /// # Examples
    ///
    /// ```
    /// use vyre_wgpu::engine::decode::DecodedRegion;
    ///
    /// let region = DecodedRegion::new(5, 3, vec![0x20]);
    /// assert_eq!(region.offset, 5);
    /// ```
    #[must_use]
    pub fn new(offset: usize, length: usize, decoded_bytes: Vec<u8>) -> Self {
        DecodedRegion {
            decoded_bytes,
            length,
            offset,
        }
    }
}
// TOML decode rule validation.

use serde::Deserialize;

impl DecodeRules {
    /// Create decode rules with explicit values.
    ///
    /// Call [`validate`](Self::validate) to check that the values are acceptable
    /// before using the rules for decode work.
    ///
    /// # Examples
    ///
    /// ```
    /// use vyre_wgpu::engine::decode::DecodeRules;
    ///
    /// let rules = DecodeRules::with_values(12, 16, 4);
    /// assert_eq!(rules.min_base64_run, 12);
    /// ```
    #[must_use]
    pub fn with_values(min_base64_run: u32, min_hex_run: u32, max_passes: u32) -> Self {
        Self {
            min_base64_run,
            min_hex_run,
            max_passes,
        }
    }

    /// Parse decode rules from a TOML document.
    ///
    /// # Errors
    ///
    /// Returns `Error::DecodeConfig` if the TOML is unparsable or the rules fail validation.
    pub fn from_toml(toml_source: &str) -> Result<Self> {
        let rules = toml::from_str::<Self>(toml_source).map_err(|error| {
            Error::DecodeConfig {
                message: format!("failed to parse decode rules TOML: {error}. Fix: correct the TOML syntax and provide min_base64_run, min_hex_run, and max_passes values."),
            }
        })?;
        rules.validate().map_err(|error| Error::DecodeConfig {
            message: error.to_string(),
        })?;
        Ok(rules)
    }

    /// Validate thresholds before CPU or GPU work starts.
    ///
    /// # Errors
    ///
    /// Returns `Error::DecodeConfig` if any threshold is out of range.
    pub fn validate(&self) -> std::result::Result<(), DecodeError> {
        if self.min_base64_run < 4 {
            return Err(DecodeError::MinBase64RunTooSmall);
        }
        if self.min_hex_run < 2 {
            return Err(DecodeError::MinHexRunTooSmall);
        }
        if self.max_passes == 0 {
            return Err(DecodeError::MaxPassesZero);
        }
        if self.max_passes > 64 {
            return Err(DecodeError::MaxPassesOutOfRange);
        }
        Ok(())
    }
}

/// Reasons a set of decode rules can be rejected by validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// `min_base64_run` is smaller than the accepted minimum.
    MinBase64RunTooSmall,
    /// `min_hex_run` is smaller than the accepted minimum.
    MinHexRunTooSmall,
    /// `max_passes` is zero.
    MaxPassesZero,
    /// `max_passes` is larger than the accepted maximum.
    MaxPassesOutOfRange,
}

impl core::fmt::Display for DecodeError {
    /// Writes a description of the violation followed by a suggested fix.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Every variant maps to one static message, written out in one place.
        let text = match self {
            Self::MinBase64RunTooSmall => {
                "min_base64_run must be at least 4 to preserve base64 quartets. Fix: set min_base64_run to 4 or greater."
            }
            Self::MinHexRunTooSmall => {
                "min_hex_run must be at least 2 to preserve full bytes. Fix: set min_hex_run to 2 or greater."
            }
            Self::MaxPassesZero => {
                "max_passes must be greater than zero. Fix: set max_passes to at least 1."
            }
            Self::MaxPassesOutOfRange => {
                "max_passes must be at most 64. Fix: set max_passes to 64 or lower."
            }
        };
        f.write_str(text)
    }
}

impl std::error::Error for DecodeError {}

/// TOML-configurable decode thresholds and recursion limits.
///
/// Values are deserialized or set as-is; `DecodeRules::validate` is what
/// enforces the accepted ranges noted on each field.
///
/// This struct is `#[non_exhaustive]` to allow adding new configuration fields
/// (like per-format recursion caps) without breaking downstream consumers.
///
/// # Examples
///
/// ```
/// use vyre_wgpu::engine::decode::DecodeRules;
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let rules = DecodeRules::from_toml("min_base64_run = 12\nmin_hex_run = 16\nmax_passes = 4")?;
/// assert_eq!(rules.min_base64_run, 12);
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[non_exhaustive]
pub struct DecodeRules {
    /// Minimum contiguous base64 run length to attempt decoding.
    ///
    /// Validation requires at least 4.
    pub min_base64_run: u32,
    /// Minimum contiguous hex run length to attempt decoding.
    ///
    /// Validation requires at least 2.
    pub min_hex_run: u32,
    /// Maximum recursive decode passes.
    ///
    /// Validation requires a value from 1 to 64 inclusive.
    pub max_passes: u32,
}

impl Default for DecodeRules {
    /// Default rules: both minimum run lengths and the pass limit are 8.
    fn default() -> Self {
        let (min_base64_run, min_hex_run, max_passes) = (8, 8, 8);
        Self {
            min_base64_run,
            min_hex_run,
            max_passes,
        }
    }
}

pub use codec::decoder::{
    decode_base64, decode_bytes, decode_file, decode_file_with_rules, decode_hex, decode_regions,
    decode_unicode, decode_url, GpuDecoder,
};
pub use codec::format::DecodeFormat;