pub mod codec;
pub mod dispatch;
/// Largest sliding-window length accepted by `entropy_map_cpu`; a larger window yields an
/// empty entropy map.
pub const MAX_WINDOW_SIZE: usize = 256;
/// Window size that `find_high_entropy_regions` forwards to
/// `find_high_entropy_regions_with_window`.
pub const DEFAULT_REGION_EXPANSION: usize = 256;
/// Largest input length in bytes (64 MiB) that `entropy_map_cpu` accepts before returning
/// `EntropyError::InputTooLarge`.
pub const MAX_INPUT_BYTES: usize = 64 * 1024 * 1024;
/// Errors returned by the CPU entropy helpers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntropyError {
/// The input slice is longer than `MAX_INPUT_BYTES`.
InputTooLarge,
}
impl core::fmt::Display for EntropyError {
    /// Writes the user-facing description of the error together with a suggested fix.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Pick the static message first so every variant shares one write call.
        let message = match self {
            Self::InputTooLarge => {
                "input length exceeds 64 MiB. Fix: split the input into smaller chunks."
            }
        };
        f.write_str(message)
    }
}
// Relies on the default `std::error::Error` methods; no underlying source error is exposed.
impl std::error::Error for EntropyError {}
/// Computes the Shannon entropy of `bytes`, in bits per byte.
///
/// A 256-bin histogram of byte values is built and `-p * log2(p)` is summed over the
/// non-empty bins, where `p` is the bin count divided by `bytes.len()`. A slice whose bytes
/// are all equal yields `0.0`; a slice in which all 256 byte values are equally frequent
/// yields `8.0`.
///
/// Returns `0.0` for an empty slice. The sum is accumulated in `f64` and narrowed to `f32`
/// on return.
pub fn shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }

    // `usize` bins cannot overflow because no bin can exceed `bytes.len()`. Narrower
    // saturating counters would silently under-count very large inputs, so the
    // probabilities would stop summing to one.
    let mut counts = [0usize; 256];
    for &byte in bytes {
        counts[usize::from(byte)] += 1;
    }

    let total = bytes.len() as f64;
    // Bins are visited in ascending byte order so the floating-point sum is reproducible.
    let entropy = counts
        .iter()
        .filter(|&&count| count != 0)
        .fold(0.0_f64, |acc, &count| {
            let p = count as f64 / total;
            acc - p * p.log2()
        });

    entropy as f32
}
pub fn entropy_map_cpu(
data: &[u8],
window_size: usize,
) -> std::result::Result<Vec<f32>, EntropyError> {
if data.len() > MAX_INPUT_BYTES {
return Err(EntropyError::InputTooLarge);
}
if data.is_empty()
|| window_size == 0
|| window_size > data.len()
|| window_size > MAX_WINDOW_SIZE
{
return Ok(Vec::new());
}
let windows: Vec<f32> = (0..=data.len() - window_size)
.map(|start| shannon_entropy(&data[start..start + window_size]))
.collect();
Ok(windows)
}
/// Finds runs of entropy values strictly above `threshold`, delegating to
/// `find_high_entropy_regions_with_window` with `DEFAULT_REGION_EXPANSION` as the window size.
pub fn find_high_entropy_regions(entropy: &[f32], threshold: f32) -> Vec<(usize, usize)> {
find_high_entropy_regions_with_window(entropy, threshold, DEFAULT_REGION_EXPANSION)
}
/// Groups consecutive entropy values strictly above `threshold` into `(start, end)` pairs.
///
/// `start` is the index of the first value of a run. `end` is the index of the first value
/// after the run that is not above the threshold (or `entropy.len()` when the run reaches the
/// end of the slice) plus `window_size`, saturating at `usize::MAX`.
///
/// NaN values never compare above the threshold, so they terminate a run.
///
/// NOTE(review): if `entropy[i]` describes the window starting at byte `i`, the last byte
/// covered by a run is `end - 2`, i.e. `end` is one past an exclusive bound. Confirm whether
/// that extra byte is intended before relying on `end` as a slice limit.
pub fn find_high_entropy_regions_with_window(
    entropy: &[f32],
    threshold: f32,
    window_size: usize,
) -> Vec<(usize, usize)> {
    let mut found = Vec::new();
    // Index at which the currently open run began, if any.
    let mut open_run: Option<usize> = None;

    for (index, &sample) in entropy.iter().enumerate() {
        if sample > threshold {
            if open_run.is_none() {
                open_run = Some(index);
            }
        } else if let Some(begin) = open_run.take() {
            found.push((begin, index.saturating_add(window_size)));
        }
    }

    // A run that is still open extends to the end of the entropy map.
    if let Some(begin) = open_run {
        found.push((begin, entropy.len().saturating_add(window_size)));
    }

    found
}
use std::collections::{HashSet, VecDeque};
use std::hash::{Hash, Hasher};
use vyre::{Error, Result};
/// Decodes `file_bytes` and then the payloads produced by earlier decodes, for at most
/// `rules.max_passes` passes.
///
/// The first pass runs `decode_one` (through `decode_frontier`) over the whole input at base
/// offset 0. Each later pass runs over the payloads queued by the previous pass. The loop
/// stops early as soon as a pass records no new region.
///
/// Returns every recorded region, sorted by offset, then length, then decoded bytes.
///
/// # Errors
///
/// Returns `Error::Decode` when `rules.max_passes` is zero, and propagates the first error
/// reported by `decode_frontier`.
pub(crate) fn recursive_decode<F>(
    file_bytes: &[u8],
    rules: &DecodeRules,
    mut decode_one: F,
) -> Result<Vec<DecodedRegion>>
where
    F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
{
    if rules.max_passes == 0 {
        return Err(Error::Decode {
            message: "max_passes must be at least 1. Fix: call DecodeRules::validate before dispatch or set max_passes to a positive value.".to_string(),
        });
    }

    // Seed the hash set with the input itself so a payload identical to it is never re-queued.
    let mut visited_hashes: HashSet<u64> = HashSet::new();
    visited_hashes.insert(stable_hash(file_bytes));
    let mut seen_regions: HashSet<(usize, usize)> = HashSet::new();
    let mut all_regions: Vec<DecodedRegion> = Vec::new();
    let mut frontier: VecDeque<(usize, Vec<u8>)> = VecDeque::new();
    frontier.push_back((0, file_bytes.to_vec()));

    for _pass in 0..rules.max_passes {
        let mut next_frontier = VecDeque::new();
        let mut progress = false;

        for (base_offset, bytes) in frontier.drain(..) {
            let mut state = FrontierState {
                seen_regions: &mut seen_regions,
                visited_hashes: &mut visited_hashes,
                next_frontier: &mut next_frontier,
                all_regions: &mut all_regions,
                progress: &mut progress,
            };
            decode_frontier(base_offset, &bytes, rules, &mut decode_one, &mut state)?;
        }

        // No new region in this pass means nothing new was queued either.
        if !progress {
            break;
        }
        frontier = next_frontier;
    }

    // Tuples compare lexicographically: offset first, then length, then payload.
    all_regions.sort_by(|left, right| {
        let left_key = (left.offset, left.length, &left.decoded_bytes);
        let right_key = (right.offset, right.length, &right.decoded_bytes);
        left_key.cmp(&right_key)
    });
    Ok(all_regions)
}
/// Runs `decode_one` over `bytes` once per format and records every region it returns.
///
/// Formats are tried in a fixed order: Base64, Hex, Url, Unicode. Each returned region is
/// passed to `push_region` together with `base_offset`.
///
/// # Errors
///
/// Stops at, and returns, the first error produced by `decode_one` or `push_region`.
pub fn decode_frontier<F>(
    base_offset: usize,
    bytes: &[u8],
    rules: &DecodeRules,
    decode_one: &mut F,
    state: &mut FrontierState<'_>,
) -> Result<()>
where
    F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
{
    let formats = [
        DecodeFormat::Base64,
        DecodeFormat::Hex,
        DecodeFormat::Url,
        DecodeFormat::Unicode,
    ];
    for format in formats {
        let regions = decode_one(format, bytes, rules)?;
        regions
            .into_iter()
            .try_for_each(|region| push_region(base_offset, bytes, region, state))?;
    }
    Ok(())
}
/// Mutable bookkeeping borrowed by `decode_frontier` and `push_region` while decoding.
pub struct FrontierState<'a> {
/// `(offset, length)` pairs of regions already recorded; used to skip duplicates.
seen_regions: &'a mut HashSet<(usize, usize)>,
/// Hashes of byte payloads already seen; a payload whose hash is present is not queued again.
visited_hashes: &'a mut HashSet<u64>,
/// Payloads queued for the next decode pass, each paired with the rebased offset of the
/// region that produced it.
next_frontier: &'a mut VecDeque<(usize, Vec<u8>)>,
/// Every region recorded so far.
all_regions: &'a mut Vec<DecodedRegion>,
/// Set to `true` whenever a new region is recorded.
progress: &'a mut bool,
}
/// Validates one decoder-reported `region` and, when it is new, records it in `state`.
///
/// `region.offset` and `region.length` are interpreted relative to `bytes`; the recorded
/// region's offset is shifted by `base_offset`. The region is ignored when its decoded bytes
/// equal the source bytes it covers, or when its shifted `(offset, length)` pair has already
/// been seen. For a new region, `state.progress` is set, the decoded bytes are queued in
/// `state.next_frontier` unless their hash was already visited, and the region is appended to
/// `state.all_regions`.
///
/// # Errors
///
/// Returns `Error::Decode` when `offset + length` overflows `usize` or extends past the end
/// of `bytes`.
pub fn push_region(
    base_offset: usize,
    bytes: &[u8],
    region: DecodedRegion,
    state: &mut FrontierState<'_>,
) -> Result<()> {
    let source_end = match region.offset.checked_add(region.length) {
        Some(end) => end,
        None => {
            return Err(Error::Decode {
                message: "region overflow while validating source bounds. Fix: ensure the GPU decoder returns offset + length within usize bounds.".to_string(),
            });
        }
    };

    // `region.offset <= source_end` always holds here, so the lookup can only fail because
    // the region runs past the end of `bytes`.
    let source = match bytes.get(region.offset..source_end) {
        Some(span) => span,
        None => {
            return Err(Error::Decode {
                message: "decoder returned a region beyond input bounds. Fix: report the decoder shader output and reject this malformed region.".to_string(),
            });
        }
    };

    // A "decode" that reproduces its own source carries no information.
    if region.decoded_bytes.as_slice() == source {
        return Ok(());
    }

    let absolute_offset = base_offset + region.offset;
    if !state.seen_regions.insert((absolute_offset, region.length)) {
        return Ok(());
    }

    *state.progress = true;
    // Only queue payloads whose content has not been seen, so identical content is not
    // decoded twice.
    if state
        .visited_hashes
        .insert(stable_hash(&region.decoded_bytes))
    {
        state
            .next_frontier
            .push_back((absolute_offset, region.decoded_bytes.clone()));
    }
    state.all_regions.push(DecodedRegion {
        offset: absolute_offset,
        length: region.length,
        decoded_bytes: region.decoded_bytes,
    });
    Ok(())
}
/// Hashes `bytes` with a freshly constructed standard-library `DefaultHasher`.
///
/// The same bytes always hash to the same value within one build of the program. The
/// standard library does not specify the `DefaultHasher` algorithm, so the values should not
/// be persisted or compared across toolchain versions.
pub fn stable_hash(bytes: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::default();
    <[u8] as Hash>::hash(bytes, &mut state);
    state.finish()
}
/// Concatenates the decoded bytes of `regions`, in the order given, into one buffer.
pub(crate) fn flatten_regions(regions: Vec<DecodedRegion>) -> Vec<u8> {
    let mut flattened = Vec::new();
    for region in regions {
        flattened.extend(region.decoded_bytes);
    }
    flattened
}
/// A span of encoded input together with the bytes obtained by decoding it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub struct DecodedRegion {
/// Start offset of the encoded span. `push_region` treats it as relative to the buffer that
/// was decoded and stores it shifted by that buffer's base offset.
pub offset: usize,
/// Length of the encoded span in the source buffer.
pub length: usize,
/// Bytes produced by decoding the span.
pub decoded_bytes: Vec<u8>,
}
impl DecodedRegion {
/// Creates a region from its source `offset`, source `length`, and `decoded_bytes`.
///
/// No validation is performed on the values.
#[must_use]
pub fn new(offset: usize, length: usize, decoded_bytes: Vec<u8>) -> Self {
Self {
offset,
length,
decoded_bytes,
}
}
}
use serde::Deserialize;
impl DecodeRules {
#[must_use]
pub fn with_values(min_base64_run: u32, min_hex_run: u32, max_passes: u32) -> Self {
Self {
min_base64_run,
min_hex_run,
max_passes,
}
}
pub fn from_toml(toml_source: &str) -> Result<Self> {
let rules = toml::from_str::<Self>(toml_source).map_err(|error| {
Error::DecodeConfig {
message: format!("failed to parse decode rules TOML: {error}. Fix: correct the TOML syntax and provide min_base64_run, min_hex_run, and max_passes values."),
}
})?;
rules.validate().map_err(|error| Error::DecodeConfig {
message: error.to_string(),
})?;
Ok(rules)
}
pub fn validate(&self) -> std::result::Result<(), DecodeError> {
if self.min_base64_run < 4 {
return Err(DecodeError::MinBase64RunTooSmall);
}
if self.min_hex_run < 2 {
return Err(DecodeError::MinHexRunTooSmall);
}
if self.max_passes == 0 {
return Err(DecodeError::MaxPassesZero);
}
if self.max_passes > 64 {
return Err(DecodeError::MaxPassesOutOfRange);
}
Ok(())
}
}
/// Reasons `DecodeRules::validate` rejects a set of rules.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
/// `min_base64_run` is below 4.
MinBase64RunTooSmall,
/// `min_hex_run` is below 2.
MinHexRunTooSmall,
/// `max_passes` is 0.
MaxPassesZero,
/// `max_passes` is above 64.
MaxPassesOutOfRange,
}
impl core::fmt::Display for DecodeError {
    /// Writes the user-facing description of the error together with a suggested fix.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Pick the static message first so every variant shares one write call.
        let message = match self {
            Self::MinBase64RunTooSmall => {
                "min_base64_run must be at least 4 to preserve base64 quartets. Fix: set min_base64_run to 4 or greater."
            }
            Self::MinHexRunTooSmall => {
                "min_hex_run must be at least 2 to preserve full bytes. Fix: set min_hex_run to 2 or greater."
            }
            Self::MaxPassesZero => {
                "max_passes must be greater than zero. Fix: set max_passes to at least 1."
            }
            Self::MaxPassesOutOfRange => {
                "max_passes must be at most 64. Fix: set max_passes to 64 or lower."
            }
        };
        f.write_str(message)
    }
}
// Relies on the default `std::error::Error` methods; no underlying source error is exposed.
impl std::error::Error for DecodeError {}
/// Tunable limits for decoding; deserializable from TOML via `DecodeRules::from_toml`.
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[non_exhaustive]
pub struct DecodeRules {
/// Must be at least 4 to pass `validate`.
/// NOTE(review): presumably the minimum length of a base64 run worth decoding — confirm
/// against the decoder.
pub min_base64_run: u32,
/// Must be at least 2 to pass `validate`.
/// NOTE(review): presumably the minimum length of a hex run worth decoding — confirm
/// against the decoder.
pub min_hex_run: u32,
/// Upper bound on decode passes in `recursive_decode`; must be in `1..=64` to pass `validate`.
pub max_passes: u32,
}
impl Default for DecodeRules {
/// Returns rules with every field set to 8, which satisfies all checks in `validate`.
fn default() -> Self {
Self {
min_base64_run: 8,
min_hex_run: 8,
max_passes: 8,
}
}
}
pub use codec::decoder::{
decode_base64, decode_bytes, decode_file, decode_file_with_rules, decode_hex, decode_regions,
decode_unicode, decode_url, GpuDecoder,
};
pub use codec::format::DecodeFormat;