1use std::ffi::OsString;
10use std::fs::OpenOptions;
11use std::io::Write;
12use std::path::{Path, PathBuf};
13
14use regex::bytes::Regex;
15
/// Provenance tag attached to every [`Chunk`] and [`Hit`].
///
/// The variant records which part of the input the bytes were taken from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkOrigin {
    /// Bytes with no finer-grained provenance.
    Raw,
    /// NOTE(review): presumably a process command line; confirm against the
    /// format handlers that emit it.
    Cmdline,
    /// NOTE(review): presumably process environment data; confirm against the
    /// format handlers that emit it.
    Env,
    /// A string table, identified by the carried name.
    StringTable(String),
    /// A named section.
    Section(String),
    /// A member of a container format.
    ///
    /// `path` lists container members outermost-first, joined with `!`
    /// (see [`ChunkOrigin::nested_within`]); `format` names the format the
    /// member was parsed as.
    NestedMember {
        path: String,
        format: String,
    },
}
37
38impl ChunkOrigin {
39 pub fn nested_within(self, container_member: &str, inner_format: &str) -> ChunkOrigin {
41 match self {
42 ChunkOrigin::NestedMember { path, format } => ChunkOrigin::NestedMember {
43 path: format!("{container_member}!{path}"),
44 format,
45 },
46 _ => ChunkOrigin::NestedMember {
47 path: container_member.to_string(),
48 format: inner_format.to_string(),
49 },
50 }
51 }
52}
53
54#[derive(Debug, Clone)]
56pub struct Chunk<'a> {
57 pub bytes: &'a [u8],
58 pub offset: u64,
59 pub origin: ChunkOrigin,
60}
61
/// How the bytes covered by a [`Hit`] should be rewritten.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Replacement {
    /// Overwrite every byte of the hit with `0`.
    ZeroFill,
    /// Overwrite the hit with the given bytes, repeated as often as needed to
    /// cover it. [`apply_hits_in_place`] rejects an empty pattern.
    Pattern(Vec<u8>),
    /// Remove the hit instead of overwriting it. [`apply_hits_in_place`]
    /// cannot do this on a flat buffer and returns an error for it.
    Drop,
}
73
74#[derive(Debug, Clone)]
76pub struct Hit {
77 pub offset: u64,
78 pub len: usize,
79 pub rule_id: String,
80 pub verified: Option<bool>,
81 pub replacement: Replacement,
82 pub origin: ChunkOrigin,
83}
84
/// Outcome reported by [`Detector::verify`] for a candidate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VerifyResult {
    /// NOTE(review): presumably "the candidate is a working credential"; confirm.
    Live,
    /// NOTE(review): presumably "the candidate was checked and is not valid"; confirm.
    Dead,
    /// No verdict; this is what the default `Detector::verify` returns.
    Unknown,
}
92
93#[derive(Debug, thiserror::Error)]
95pub enum ScrumpError {
96 #[error("io: {0}")]
97 Io(#[from] std::io::Error),
98 #[error("unsupported format: {0}")]
99 UnsupportedFormat(String),
100 #[error("invalid file: {0}")]
101 InvalidFile(String),
102 #[error("redaction failed: {0}")]
103 RedactionFailed(String),
104 #[error("{0}")]
105 Other(String),
106}
107
108pub type Result<T> = std::result::Result<T, ScrumpError>;
109
110pub trait Format: Send {
119 fn name(&self) -> &'static str;
121
122 fn chunks<'a>(&'a self) -> Box<dyn Iterator<Item = Chunk<'a>> + 'a>;
124
125 fn apply(&mut self, hits: &[Hit]) -> Result<()>;
130
131 fn to_bytes(&self) -> Result<Vec<u8>>;
135}
136
137pub type DetectFn = fn(head: &[u8], path: &Path) -> bool;
140
141pub type OpenPathFn = fn(path: &Path) -> Result<Box<dyn Format>>;
143
144pub type OpenBytesFn = fn(bytes: Vec<u8>, hint_path: Option<&Path>) -> Result<Box<dyn Format>>;
148
149#[derive(Clone, Copy)]
151pub struct Handler {
152 pub name: &'static str,
153 pub detect: DetectFn,
154 pub open_path: OpenPathFn,
155 pub open_bytes: OpenBytesFn,
156}
157
158impl std::fmt::Debug for Handler {
159 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160 f.debug_struct("Handler")
161 .field("name", &self.name)
162 .finish_non_exhaustive()
163 }
164}
165
166#[derive(Default)]
176pub struct Dispatcher {
177 handlers: Vec<Handler>,
178 fallback: Option<Handler>,
179}
180
181impl Dispatcher {
182 pub fn new() -> Self {
183 Self::default()
184 }
185
186 pub fn register(&mut self, h: Handler) {
187 self.handlers.push(h);
188 }
189
190 pub fn set_fallback(&mut self, h: Handler) {
191 self.fallback = Some(h);
192 }
193
194 pub fn handlers(&self) -> &[Handler] {
195 &self.handlers
196 }
197
198 pub fn fallback(&self) -> Option<&Handler> {
199 self.fallback.as_ref()
200 }
201
202 pub fn find(&self, head: &[u8], path: &Path) -> Option<&Handler> {
204 for h in &self.handlers {
205 if (h.detect)(head, path) {
206 return Some(h);
207 }
208 }
209 self.fallback.as_ref()
210 }
211
212 pub fn find_by_name(&self, name: &str) -> Option<&Handler> {
214 self.handlers
215 .iter()
216 .chain(self.fallback.as_ref())
217 .find(|h| h.name == name)
218 }
219
220 pub fn open_path(&self, path: &Path) -> Result<Box<dyn Format>> {
222 let head = read_head(path)?;
223 let h = self.find(&head, path).ok_or_else(|| {
224 ScrumpError::UnsupportedFormat(format!("no handler for {}", path.display()))
225 })?;
226 (h.open_path)(path)
227 }
228
229 pub fn open_bytes(&self, bytes: Vec<u8>, hint_path: Option<&Path>) -> Result<Box<dyn Format>> {
231 let head_len = bytes.len().min(512);
232 let head = &bytes[..head_len];
233 let placeholder_path = PathBuf::from("");
234 let hint = hint_path.unwrap_or(&placeholder_path);
235 let h = self.find(head, hint).ok_or_else(|| {
236 ScrumpError::UnsupportedFormat(format!(
237 "no handler for in-memory bytes (hint = {})",
238 hint.display()
239 ))
240 })?;
241 (h.open_bytes)(bytes, hint_path)
242 }
243
244 pub fn open_path_with(&self, path: &Path, handler_name: &str) -> Result<Box<dyn Format>> {
246 let h = self
247 .find_by_name(handler_name)
248 .ok_or_else(|| ScrumpError::UnsupportedFormat(handler_name.into()))?;
249 (h.open_path)(path)
250 }
251}
252
253fn read_head(path: &Path) -> Result<Vec<u8>> {
254 use std::io::Read;
255 let mut f = std::fs::File::open(path)?;
256 let mut buf = vec![0u8; 512];
257 let n = f.read(&mut buf)?;
258 buf.truncate(n);
259 Ok(buf)
260}
261
262pub trait Detector: Send + Sync {
268 fn id(&self) -> &str;
269 fn pattern(&self) -> &Regex;
270 fn min_entropy(&self) -> Option<f64> {
271 None
272 }
273 fn replacement(&self) -> Replacement {
274 Replacement::ZeroFill
275 }
276 fn capture_index(&self) -> Option<usize> {
281 None
282 }
283 fn post_filter(&self, _candidate: &[u8]) -> bool {
290 true
291 }
292 fn verify(&self, _candidate: &[u8]) -> VerifyResult {
293 VerifyResult::Unknown
294 }
295}
296
/// Shannon entropy of `bytes`, in bits per byte.
///
/// Returns `0.0` for an empty slice. The result ranges from `0.0` (all bytes
/// identical) to `8.0` (all 256 values equally frequent).
pub fn shannon_entropy(bytes: &[u8]) -> f64 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u64; 256];
    bytes
        .iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let sample_count = bytes.len() as f64;

    // H = -sum(p * log2 p) over the byte values that actually occur, summed in
    // ascending byte-value order.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences != 0)
        .fold(0.0_f64, |entropy, &occurrences| {
            let probability = occurrences as f64 / sample_count;
            entropy - probability * probability.log2()
        })
}
317
318pub fn write_atomic(out: &Path, bytes: &[u8]) -> Result<()> {
324 let tmp = tmp_sibling(out);
325 {
326 let mut f = OpenOptions::new()
327 .create(true)
328 .write(true)
329 .truncate(true)
330 .open(&tmp)?;
331 f.write_all(bytes)?;
332 f.sync_all()?;
333 }
334 std::fs::rename(&tmp, out)?;
335 Ok(())
336}
337
/// Builds the temporary path used next to `p`: the file name of `p` (or
/// `out` when `p` has none) with `.scrump.tmp` appended, placed in the same
/// directory as `p`.
///
/// When `p` has no parent, or an empty one, the result is a bare relative
/// file name.
fn tmp_sibling(p: &Path) -> PathBuf {
    let mut tmp_name = match p.file_name() {
        Some(base) => base.to_os_string(),
        None => OsString::from("out"),
    };
    tmp_name.push(".scrump.tmp");

    // An empty parent (as for a bare file name) is treated like no parent.
    let directory = p.parent().filter(|dir| !dir.as_os_str().is_empty());
    match directory {
        Some(dir) => dir.join(tmp_name),
        None => PathBuf::from(tmp_name),
    }
}
348
349pub fn apply_hits_in_place(buf: &mut [u8], hits: &[Hit]) -> Result<()> {
361 for h in hits {
362 let start = h.offset as usize;
363 let end = start
364 .checked_add(h.len)
365 .ok_or_else(|| ScrumpError::RedactionFailed("hit offset+len overflow".into()))?;
366 if end > buf.len() {
367 return Err(ScrumpError::RedactionFailed(format!(
368 "hit out of bounds: {start}..{end} (buf len {})",
369 buf.len()
370 )));
371 }
372 match &h.replacement {
373 Replacement::ZeroFill => {
374 for b in &mut buf[start..end] {
375 *b = 0;
376 }
377 }
378 Replacement::Pattern(p) => {
379 if p.is_empty() {
380 return Err(ScrumpError::RedactionFailed(
381 "empty replacement pattern".into(),
382 ));
383 }
384 for (i, b) in buf[start..end].iter_mut().enumerate() {
385 *b = p[i % p.len()];
386 }
387 }
388 Replacement::Drop => {
389 return Err(ScrumpError::RedactionFailed(
390 "Drop replacement requires a structurally-aware format".into(),
391 ));
392 }
393 }
394 }
395 Ok(())
396}
397
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an unverified, `Raw`-origin hit so each test only spells out the
    /// span and the replacement it cares about.
    fn raw_hit(offset: u64, len: usize, replacement: Replacement) -> Hit {
        Hit {
            offset,
            len,
            rule_id: "x".into(),
            verified: None,
            replacement,
            origin: ChunkOrigin::Raw,
        }
    }

    #[test]
    fn entropy_of_empty_is_zero() {
        let nothing: &[u8] = &[];
        assert_eq!(shannon_entropy(nothing), 0.0);
    }

    #[test]
    fn entropy_of_uniform_byte_is_zero() {
        let constant = [0u8; 100];
        assert_eq!(shannon_entropy(&constant), 0.0);
    }

    #[test]
    fn entropy_of_two_balanced_bytes_is_one() {
        // 0, 1, 0, 1, ... — fifty of each value.
        let alternating: Vec<u8> = [0u8, 1u8].iter().copied().cycle().take(100).collect();
        let deviation = (shannon_entropy(&alternating) - 1.0).abs();
        assert!(deviation < 1e-9);
    }

    #[test]
    fn entropy_of_random_bytes_is_near_eight() {
        // Deterministic LCG; the top byte of each state is the sample.
        let mut lcg: u32 = 0xdead_beef;
        let noise: Vec<u8> = (0..4096)
            .map(|_| {
                lcg = lcg.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
                (lcg >> 24) as u8
            })
            .collect();
        let h = shannon_entropy(&noise);
        assert!(h > 7.5, "expected near-uniform entropy, got {h}");
    }

    #[test]
    fn apply_hits_in_place_zero_fill_preserves_length() {
        let mut data = b"abcdEFGHijkl".to_vec();
        let hits = [raw_hit(4, 4, Replacement::ZeroFill)];
        apply_hits_in_place(&mut data, &hits).unwrap();
        assert_eq!(data, b"abcd\0\0\0\0ijkl");
    }

    #[test]
    fn apply_hits_in_place_pattern_repeats() {
        let mut data = b"abcdEFGHijkl".to_vec();
        let hits = [raw_hit(4, 4, Replacement::Pattern(b"XY".to_vec()))];
        apply_hits_in_place(&mut data, &hits).unwrap();
        assert_eq!(data, b"abcdXYXYijkl");
    }

    #[test]
    fn apply_hits_in_place_oob_errors() {
        let mut data = b"short".to_vec();
        let hits = [raw_hit(0, 100, Replacement::ZeroFill)];
        let outcome = apply_hits_in_place(&mut data, &hits);
        assert!(outcome.is_err());
    }

    #[test]
    fn write_atomic_writes_and_renames() {
        let stamp = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let scratch = std::env::temp_dir().join(format!(
            "scrump-core-test-{}-{}",
            std::process::id(),
            stamp
        ));
        std::fs::create_dir_all(&scratch).unwrap();

        let target = scratch.join("file.bin");
        // The second round checks that an existing file is replaced.
        for payload in [&b"hello"[..], &b"world"[..]] {
            write_atomic(&target, payload).unwrap();
            assert_eq!(std::fs::read(&target).unwrap(), payload);
        }

        std::fs::remove_dir_all(&scratch).ok();
    }

    #[test]
    fn dispatcher_picks_first_match_then_fallback() {
        fn d_yes(_h: &[u8], _p: &Path) -> bool {
            true
        }
        fn d_no(_h: &[u8], _p: &Path) -> bool {
            false
        }
        fn op(_p: &Path) -> Result<Box<dyn Format>> {
            Err(ScrumpError::Other("not used".into()))
        }
        fn ob(_b: Vec<u8>, _p: Option<&Path>) -> Result<Box<dyn Format>> {
            Err(ScrumpError::Other("not used".into()))
        }
        let handler = |name: &'static str, detect: DetectFn| Handler {
            name,
            detect,
            open_path: op,
            open_bytes: ob,
        };

        let mut dispatcher = Dispatcher::new();
        dispatcher.register(handler("first", d_no));
        dispatcher.register(handler("second", d_yes));
        let chosen = dispatcher.find(b"", Path::new("/")).unwrap();
        assert_eq!(chosen.name, "second");

        // A fallback must not shadow a handler that matches.
        dispatcher.set_fallback(handler("fb", d_no));
        let chosen = dispatcher.find(b"", Path::new("/")).unwrap();
        assert_eq!(chosen.name, "second");
    }

    #[test]
    fn dispatcher_uses_fallback_when_nothing_matches() {
        fn d_no(_h: &[u8], _p: &Path) -> bool {
            false
        }
        fn op(_p: &Path) -> Result<Box<dyn Format>> {
            Err(ScrumpError::Other("nope".into()))
        }
        fn ob(_b: Vec<u8>, _p: Option<&Path>) -> Result<Box<dyn Format>> {
            Err(ScrumpError::Other("nope".into()))
        }
        let handler = |name: &'static str| Handler {
            name,
            detect: d_no,
            open_path: op,
            open_bytes: ob,
        };

        let mut dispatcher = Dispatcher::new();
        dispatcher.register(handler("n"));
        assert!(dispatcher.find(b"", Path::new("/")).is_none());

        dispatcher.set_fallback(handler("fb"));
        let chosen = dispatcher.find(b"", Path::new("/")).unwrap();
        assert_eq!(chosen.name, "fb");
    }
}