1pub mod codec;
11pub mod dispatch;
13
/// Largest sliding-window size accepted by [`entropy_map_cpu`]; larger
/// requests yield an empty map.
pub const MAX_WINDOW_SIZE: usize = 256;

/// Default end-expansion (in elements) applied by [`find_high_entropy_regions`].
pub const DEFAULT_REGION_EXPANSION: usize = 256;

/// Largest input (64 MiB) accepted by [`entropy_map_cpu`]; larger inputs are
/// rejected with [`EntropyError::InputTooLarge`].
pub const MAX_INPUT_BYTES: usize = 64 * 1024 * 1024;
27
/// Errors produced by the CPU entropy helpers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntropyError {
    /// The input exceeds [`MAX_INPUT_BYTES`] (64 MiB).
    InputTooLarge,
}
34
35impl core::fmt::Display for EntropyError {
36 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
37 match self {
38 Self::InputTooLarge => write!(
39 f,
40 "input length exceeds 64 MiB. Fix: split the input into smaller chunks."
41 ),
42 }
43 }
44}
45
// Marker impl so `EntropyError` can travel as a boxed `dyn std::error::Error`.
impl std::error::Error for EntropyError {}
47
/// Computes the Shannon entropy of `bytes` in bits per byte.
///
/// Returns `0.0` for empty input; otherwise the value lies in `[0.0, 8.0]`.
/// Accumulation happens in `f64` and is truncated to `f32` at the end.
pub fn shannon_entropy(bytes: &[u8]) -> f32 {
    if bytes.is_empty() {
        return 0.0;
    }
    // Byte-value histogram; saturating_add guards against u32 overflow on
    // pathologically large inputs.
    let mut histogram = [0u32; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] = histogram[usize::from(byte)].saturating_add(1);
    }
    let len = bytes.len() as f64;
    // entropy = -sum(p * log2(p)) over symbols that actually occur.
    let entropy = histogram
        .iter()
        .filter(|&&count| count != 0)
        .fold(0.0_f64, |acc, &count| {
            let p = f64::from(count) / len;
            acc - p * p.log2()
        });
    entropy as f32
}
68
69pub fn entropy_map_cpu(
71 data: &[u8],
72 window_size: usize,
73) -> std::result::Result<Vec<f32>, EntropyError> {
74 if data.len() > MAX_INPUT_BYTES {
75 return Err(EntropyError::InputTooLarge);
76 }
77 if data.is_empty()
78 || window_size == 0
79 || window_size > data.len()
80 || window_size > MAX_WINDOW_SIZE
81 {
82 return Ok(Vec::new());
83 }
84 let windows: Vec<f32> = (0..=data.len() - window_size)
85 .map(|start| shannon_entropy(&data[start..start + window_size]))
86 .collect();
87 Ok(windows)
88}
89
/// Finds contiguous runs of entropy values above `threshold`, expanding each
/// region end by [`DEFAULT_REGION_EXPANSION`].
///
/// Convenience wrapper around [`find_high_entropy_regions_with_window`].
pub fn find_high_entropy_regions(entropy: &[f32], threshold: f32) -> Vec<(usize, usize)> {
    find_high_entropy_regions_with_window(entropy, threshold, DEFAULT_REGION_EXPANSION)
}
94
/// Collapses per-window entropy values into `(start, end)` index pairs whose
/// values strictly exceed `threshold`.
///
/// `start` is the index of the first above-threshold value in a run; `end` is
/// one past the run's last index plus `window_size` (saturating at
/// `usize::MAX`), so the span covers the bytes of the run's final window.
pub fn find_high_entropy_regions_with_window(
    entropy: &[f32],
    threshold: f32,
    window_size: usize,
) -> Vec<(usize, usize)> {
    let mut regions = Vec::new();
    let mut open_run: Option<usize> = None;
    for (index, &value) in entropy.iter().enumerate() {
        if value > threshold {
            // Open a new run unless one is already in progress.
            open_run.get_or_insert(index);
        } else if let Some(start) = open_run.take() {
            // Run ended just before `index`; expand the end by the window size.
            regions.push((start, index.saturating_add(window_size)));
        }
    }
    // A run that reaches the end of the map is closed against its length.
    if let Some(start) = open_run {
        regions.push((start, entropy.len().saturating_add(window_size)));
    }
    regions
}
118
119use std::collections::{HashSet, VecDeque};
128use std::hash::{Hash, Hasher};
129use vyre::{Error, Result};
130
/// Breadth-first, multi-pass driver for recursive decoding.
///
/// Starting from `file_bytes`, each pass feeds every frontier buffer through
/// `decode_one` (via [`decode_frontier`]); newly decoded payloads become the
/// next pass's frontier. The loop stops early when a pass makes no progress
/// and is capped at `rules.max_passes`. Results are deduplicated by
/// `(offset, length)` and by a hash of the decoded bytes, then sorted by
/// offset, length, and decoded content for deterministic output.
///
/// # Errors
/// Returns `Error::Decode` when `rules.max_passes` is zero, and propagates
/// any error from `decode_one`.
pub(crate) fn recursive_decode<F>(
    file_bytes: &[u8],
    rules: &DecodeRules,
    mut decode_one: F,
) -> Result<Vec<DecodedRegion>>
where
    F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
{
    if rules.max_passes == 0 {
        return Err(Error::Decode {
            message: "max_passes must be at least 1. Fix: call DecodeRules::validate before dispatch or set max_passes to a positive value.".to_string(),
        });
    }
    // Seed the visited set with the original input so a decode that merely
    // reproduces the whole file is never re-queued.
    let mut visited_hashes = HashSet::<u64>::from([stable_hash(file_bytes)]);
    let mut seen_regions = HashSet::<(usize, usize)>::new();
    let mut frontier = VecDeque::from([(0usize, file_bytes.to_vec())]);
    let mut all_regions = Vec::<DecodedRegion>::new();

    for _ in 0..rules.max_passes {
        let mut next_frontier = VecDeque::new();
        let mut progress = false;
        while let Some((base_offset, bytes)) = frontier.pop_front() {
            let mut state = FrontierState {
                seen_regions: &mut seen_regions,
                visited_hashes: &mut visited_hashes,
                next_frontier: &mut next_frontier,
                all_regions: &mut all_regions,
                progress: &mut progress,
            };
            decode_frontier(base_offset, &bytes, rules, &mut decode_one, &mut state)?;
        }
        // A pass that records no new region means a fixed point was reached.
        if !progress {
            break;
        }
        frontier = next_frontier;
    }
    all_regions.sort_by(|left, right| {
        left.offset
            .cmp(&right.offset)
            .then(left.length.cmp(&right.length))
            .then(left.decoded_bytes.cmp(&right.decoded_bytes))
    });
    Ok(all_regions)
}
177
178pub fn decode_frontier<F>(
180 base_offset: usize,
181 bytes: &[u8],
182 rules: &DecodeRules,
183 decode_one: &mut F,
184 state: &mut FrontierState<'_>,
185) -> Result<()>
186where
187 F: FnMut(DecodeFormat, &[u8], &DecodeRules) -> Result<Vec<DecodedRegion>>,
188{
189 for format in [
190 DecodeFormat::Base64,
191 DecodeFormat::Hex,
192 DecodeFormat::Url,
193 DecodeFormat::Unicode,
194 ] {
195 for region in decode_one(format, bytes, rules)? {
196 push_region(base_offset, bytes, region, state)?;
197 }
198 }
199 Ok(())
200}
201
/// Mutable bookkeeping shared by all frontier buffers within one decode pass.
pub struct FrontierState<'a> {
    // Regions already recorded, keyed by (absolute offset, length).
    seen_regions: &'a mut HashSet<(usize, usize)>,
    // Hashes of every payload already queued for decoding; breaks cycles.
    visited_hashes: &'a mut HashSet<u64>,
    // (absolute offset, payload) pairs to decode in the next pass.
    next_frontier: &'a mut VecDeque<(usize, Vec<u8>)>,
    // Accumulated output regions across all passes.
    all_regions: &'a mut Vec<DecodedRegion>,
    // Set when the current pass produced at least one new region.
    progress: &'a mut bool,
}
210
211pub fn push_region(
213 base_offset: usize,
214 bytes: &[u8],
215 region: DecodedRegion,
216 state: &mut FrontierState<'_>,
217) -> Result<()> {
218 let source_end = region
219 .offset
220 .checked_add(region.length)
221 .ok_or_else(|| Error::Decode {
222 message: "region overflow while validating source bounds. Fix: ensure the GPU decoder returns offset + length within usize bounds.".to_string(),
223 })?;
224 if source_end > bytes.len() {
225 return Err(Error::Decode {
226 message: "decoder returned a region beyond input bounds. Fix: report the decoder shader output and reject this malformed region.".to_string(),
227 });
228 }
229 if region.decoded_bytes == bytes[region.offset..source_end] {
230 return Ok(());
231 }
232 let normalized = DecodedRegion {
233 offset: base_offset + region.offset,
234 length: region.length,
235 decoded_bytes: region.decoded_bytes,
236 };
237 if state
238 .seen_regions
239 .insert((normalized.offset, normalized.length))
240 {
241 *state.progress = true;
242 let hash = stable_hash(&normalized.decoded_bytes);
243 if state.visited_hashes.insert(hash) {
244 state
245 .next_frontier
246 .push_back((normalized.offset, normalized.decoded_bytes.clone()));
247 }
248 state.all_regions.push(normalized);
249 }
250 Ok(())
251}
252
/// Hashes `bytes` with the std `DefaultHasher`.
///
/// The value is deterministic within a single process run, which is all the
/// recursive decoder needs for intra-run deduplication.
pub fn stable_hash(bytes: &[u8]) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::default();
    bytes.hash(&mut state);
    state.finish()
}
259
260pub(crate) fn flatten_regions(regions: Vec<DecodedRegion>) -> Vec<u8> {
263 regions
264 .into_iter()
265 .flat_map(|region| region.decoded_bytes)
266 .collect()
267}
/// One decoded span of the input.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub struct DecodedRegion {
    /// Byte offset of the encoded span within its source buffer (made
    /// file-absolute by `push_region`).
    pub offset: usize,
    /// Length in bytes of the encoded span — not of `decoded_bytes`.
    pub length: usize,
    /// The bytes produced by decoding the span.
    pub decoded_bytes: Vec<u8>,
}
293
impl DecodedRegion {
    /// Builds a region from its raw parts. No bounds validation happens here;
    /// see `push_region` for validation against a source buffer.
    #[must_use]
    pub fn new(offset: usize, length: usize, decoded_bytes: Vec<u8>) -> Self {
        Self {
            offset,
            length,
            decoded_bytes,
        }
    }
}
314use serde::Deserialize;
317
impl DecodeRules {
    /// Builds a rule set from explicit values without validating them; call
    /// [`DecodeRules::validate`] before handing the rules to the decoder.
    #[must_use]
    pub fn with_values(min_base64_run: u32, min_hex_run: u32, max_passes: u32) -> Self {
        Self {
            min_base64_run,
            min_hex_run,
            max_passes,
        }
    }

    /// Parses decode rules from TOML source and validates them.
    ///
    /// # Errors
    /// Returns `Error::DecodeConfig` when the TOML is malformed or when the
    /// parsed values fail [`DecodeRules::validate`].
    pub fn from_toml(toml_source: &str) -> Result<Self> {
        let rules = toml::from_str::<Self>(toml_source).map_err(|error| {
            Error::DecodeConfig {
                message: format!("failed to parse decode rules TOML: {error}. Fix: correct the TOML syntax and provide min_base64_run, min_hex_run, and max_passes values."),
            }
        })?;
        rules.validate().map_err(|error| Error::DecodeConfig {
            message: error.to_string(),
        })?;
        Ok(rules)
    }

    /// Checks every rule value against its supported range.
    ///
    /// # Errors
    /// Returns the [`DecodeError`] variant describing the first violated rule.
    pub fn validate(&self) -> std::result::Result<(), DecodeError> {
        // 4 characters is the smallest run containing a full base64 quartet.
        if self.min_base64_run < 4 {
            return Err(DecodeError::MinBase64RunTooSmall);
        }
        // 2 hex digits encode one full byte.
        if self.min_hex_run < 2 {
            return Err(DecodeError::MinHexRunTooSmall);
        }
        if self.max_passes == 0 {
            return Err(DecodeError::MaxPassesZero);
        }
        // Upper bound keeps the recursive decode loop from running unbounded.
        if self.max_passes > 64 {
            return Err(DecodeError::MaxPassesOutOfRange);
        }
        Ok(())
    }
}
379
/// Validation errors for [`DecodeRules`] values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeError {
    /// `min_base64_run` is below 4, the size of one base64 quartet.
    MinBase64RunTooSmall,
    /// `min_hex_run` is below 2, the digits needed for one full byte.
    MinHexRunTooSmall,
    /// `max_passes` is zero, which would disable decoding entirely.
    MaxPassesZero,
    /// `max_passes` exceeds the supported maximum of 64.
    MaxPassesOutOfRange,
}
392
393impl core::fmt::Display for DecodeError {
394 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
395 match self {
396 Self::MinBase64RunTooSmall => write!(
397 f,
398 "min_base64_run must be at least 4 to preserve base64 quartets. Fix: set min_base64_run to 4 or greater."
399 ),
400 Self::MinHexRunTooSmall => write!(
401 f,
402 "min_hex_run must be at least 2 to preserve full bytes. Fix: set min_hex_run to 2 or greater."
403 ),
404 Self::MaxPassesZero => write!(
405 f,
406 "max_passes must be greater than zero. Fix: set max_passes to at least 1."
407 ),
408 Self::MaxPassesOutOfRange => write!(
409 f,
410 "max_passes must be at most 64. Fix: set max_passes to 64 or lower."
411 ),
412 }
413 }
414}
415
// Marker impl so `DecodeError` can travel as a boxed `dyn std::error::Error`.
impl std::error::Error for DecodeError {}
417
/// Tunable thresholds for the recursive decoder, deserializable from TOML.
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
#[non_exhaustive]
pub struct DecodeRules {
    /// Minimum run length treated as base64; `validate` requires >= 4.
    pub min_base64_run: u32,
    /// Minimum run length treated as hex; `validate` requires >= 2.
    pub min_hex_run: u32,
    /// Maximum recursive decode passes; `validate` requires 1..=64.
    pub max_passes: u32,
}
444
impl Default for DecodeRules {
    /// Conservative defaults; all values pass [`DecodeRules::validate`].
    fn default() -> Self {
        Self {
            min_base64_run: 8,
            min_hex_run: 8,
            max_passes: 8,
        }
    }
}
454
455pub use codec::decoder::{
456 decode_base64, decode_bytes, decode_file, decode_file_with_rules, decode_hex, decode_regions,
457 decode_unicode, decode_url, GpuDecoder,
458};
459pub use codec::format::DecodeFormat;