Skip to main content

scrump_core/
lib.rs

1//! Core traits and types for `scrump`.
2//!
3//! A capture file is opened by exactly one [`Format`] implementation, which
4//! yields scannable [`Chunk`]s for the detection engine. The engine produces
5//! [`Hit`]s, which the format then applies as in-place redactions. The final
6//! bytes can be obtained via [`Format::to_bytes`] (used by container formats
7//! to repackage their members) or written atomically with [`write_atomic`].
8
9use std::ffi::OsString;
10use std::fs::OpenOptions;
11use std::io::Write;
12use std::path::{Path, PathBuf};
13
14use regex::bytes::Regex;
15
16// -----------------------------------------------------------------------------
17// Types
18
/// Where a chunk came from inside a capture file.
///
/// The origin travels with every chunk so that format-specific scrubbers can
/// make smarter redaction decisions and the CLI can explain *where* a hit
/// was found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkOrigin {
    /// Generic bytes from an unknown / passthrough format.
    Raw,
    /// The captured process's command-line arguments.
    Cmdline,
    /// The captured process's environment block.
    Env,
    /// A named string table or string pool inside a structured format.
    StringTable(String),
    /// A named subsection of a structured format.
    Section(String),
    /// A nested member (e.g. inside a tar archive).
    NestedMember { path: String, format: String },
}

impl ChunkOrigin {
    /// Re-home this origin underneath a container member.
    ///
    /// * If `self` is already a [`ChunkOrigin::NestedMember`], its path is
    ///   prefixed with `container_member` and a `!` separator; the existing
    ///   `format` is kept and `inner_format` is ignored.
    /// * Any other origin is replaced by a `NestedMember` whose path is
    ///   `container_member` and whose format is `inner_format`.
    pub fn nested_within(self, container_member: &str, inner_format: &str) -> ChunkOrigin {
        if let ChunkOrigin::NestedMember { path, format } = self {
            // Already nested: only extend the path, outermost member first.
            return ChunkOrigin::NestedMember {
                path: [container_member, path.as_str()].join("!"),
                format,
            };
        }
        ChunkOrigin::NestedMember {
            path: container_member.to_owned(),
            format: inner_format.to_owned(),
        }
    }
}
53
/// A region inside the source file that the detection engine should scan.
///
/// The chunk borrows its bytes for `'a`; it does not own them.
#[derive(Debug, Clone)]
pub struct Chunk<'a> {
    /// The bytes to scan.
    pub bytes: &'a [u8],
    /// Position associated with this chunk.
    /// NOTE(review): presumably the offset of `bytes[0]` within the source
    /// file, so hits can be translated back to file offsets — confirm
    /// against the format implementations.
    pub offset: u64,
    /// Where in the capture file these bytes came from.
    pub origin: ChunkOrigin,
}
61
/// Strategy for redacting a [`Hit`].
///
/// [`apply_hits_in_place`] implements `ZeroFill` and `Pattern` and rejects
/// `Drop`, because a flat buffer cannot change length.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Replacement {
    /// Replace the matched bytes with NUL of equal length. Structure-preserving.
    ZeroFill,
    /// Replace with a repeating byte pattern of equal length. An empty
    /// pattern is rejected by [`apply_hits_in_place`].
    Pattern(Vec<u8>),
    /// Drop the matched region entirely. Only valid for formats that can
    /// absorb length changes (most binary formats cannot).
    Drop,
}
73
/// A confirmed sensitive region to be redacted.
#[derive(Debug, Clone)]
pub struct Hit {
    /// Byte offset of the first byte to redact. [`apply_hits_in_place`]
    /// interprets it as an index into the buffer it is given.
    pub offset: u64,
    /// Number of bytes to redact, starting at `offset`.
    pub len: usize,
    /// Identifier of the rule that produced this hit.
    /// NOTE(review): presumably the value of [`Detector::id`] — confirm
    /// against the engine.
    pub rule_id: String,
    /// Outcome of optional verification.
    /// NOTE(review): presumably `None` when no verification was attempted —
    /// confirm against the engine.
    pub verified: Option<bool>,
    /// How the matched bytes should be overwritten.
    pub replacement: Replacement,
    /// Origin of the chunk in which the hit was found.
    pub origin: ChunkOrigin,
}
84
/// Verification result from an optional live HTTP probe.
///
/// [`Detector::verify`] returns [`VerifyResult::Unknown`] unless a rule
/// overrides it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VerifyResult {
    /// NOTE(review): presumably the candidate was confirmed to still be a
    /// working credential — confirm against the verifying detectors.
    Live,
    /// NOTE(review): presumably the candidate was confirmed to no longer
    /// work — confirm against the verifying detectors.
    Dead,
    /// No verdict; this is the default result of [`Detector::verify`].
    Unknown,
}
92
/// Errors produced by format and engine code.
#[derive(Debug, thiserror::Error)]
pub enum ScrumpError {
    /// An underlying I/O failure; converted automatically from
    /// [`std::io::Error`] by the `?` operator.
    #[error("io: {0}")]
    Io(#[from] std::io::Error),
    /// No handler could be found for a file or for a requested format name
    /// (returned by the [`Dispatcher`] open methods).
    #[error("unsupported format: {0}")]
    UnsupportedFormat(String),
    /// The file's contents are not valid for the format.
    #[error("invalid file: {0}")]
    InvalidFile(String),
    /// A redaction could not be applied (returned by
    /// [`apply_hits_in_place`] for out-of-bounds hits, an empty pattern, or
    /// an unsupported `Drop`).
    #[error("redaction failed: {0}")]
    RedactionFailed(String),
    /// Any other failure, described by its message.
    #[error("{0}")]
    Other(String),
}

/// Crate-wide result alias with [`ScrumpError`] as the error type.
pub type Result<T> = std::result::Result<T, ScrumpError>;
109
110// -----------------------------------------------------------------------------
111// Format trait + Handler
112
/// A handler for one specific capture-file format.
///
/// The trait is intentionally `dyn`-compatible: no `Self: Sized` methods, no
/// associated constants. Construction is done via free `fn` pointers on a
/// [`Handler`] so the [`Dispatcher`] can route based on file head bytes.
///
/// Implementations must be `Send`, so a boxed format can be moved to
/// another thread.
pub trait Format: Send {
    /// Short human-readable name of the format (e.g. `"perf"`, `"tar"`).
    fn name(&self) -> &'static str;

    /// Iterate scannable chunks for the detection engine.
    ///
    /// The returned iterator and the chunks it yields borrow from `self`.
    fn chunks<'a>(&'a self) -> Box<dyn Iterator<Item = Chunk<'a>> + 'a>;

    /// Apply redactions in place. The implementation chooses whether each
    /// [`Hit`] can be satisfied with [`Replacement::ZeroFill`] alone or
    /// whether structural updates (offsets, checksums, child-format
    /// repackaging) are also needed.
    fn apply(&mut self, hits: &[Hit]) -> Result<()>;

    /// Serialize the (possibly-scrubbed) file to an in-memory byte vector.
    /// Used by container formats (tar, zip, nsys) to repackage child members
    /// without going through a temp file.
    fn to_bytes(&self) -> Result<Vec<u8>>;
}
136
/// Detection function: given the first ~512 bytes and the original path,
/// decide whether this handler claims the file.
///
/// The [`Dispatcher`] passes at most 512 head bytes (fewer for short
/// inputs). For in-memory input without a hint path it passes an empty
/// path.
pub type DetectFn = fn(head: &[u8], path: &Path) -> bool;

/// Open from a filesystem path.
pub type OpenPathFn = fn(path: &Path) -> Result<Box<dyn Format>>;

/// Open from an in-memory buffer. `hint_path` lets the implementation
/// preserve filename context (used for atomic-write naming and for
/// extension-based detection of inner members).
///
/// The buffer is passed by value, so the implementation takes ownership of
/// the bytes.
pub type OpenBytesFn = fn(bytes: Vec<u8>, hint_path: Option<&Path>) -> Result<Box<dyn Format>>;
148
149/// A handler entry registered with the [`Dispatcher`].
150#[derive(Clone, Copy)]
151pub struct Handler {
152    pub name: &'static str,
153    pub detect: DetectFn,
154    pub open_path: OpenPathFn,
155    pub open_bytes: OpenBytesFn,
156}
157
158impl std::fmt::Debug for Handler {
159    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160        f.debug_struct("Handler")
161            .field("name", &self.name)
162            .finish_non_exhaustive()
163    }
164}
165
166// -----------------------------------------------------------------------------
167// Dispatcher
168
169/// Routes a file (by path or bytes) to the appropriate [`Format`] handler.
170///
171/// Handlers are tried in registration order; the first one whose `detect`
172/// returns true wins. If none match and a [`fallback`](Dispatcher::set_fallback)
173/// is set, the fallback handles the file. Otherwise [`Dispatcher::open_path`] /
174/// [`Dispatcher::open_bytes`] return [`ScrumpError::UnsupportedFormat`].
175#[derive(Default)]
176pub struct Dispatcher {
177    handlers: Vec<Handler>,
178    fallback: Option<Handler>,
179}
180
181impl Dispatcher {
182    pub fn new() -> Self {
183        Self::default()
184    }
185
186    pub fn register(&mut self, h: Handler) {
187        self.handlers.push(h);
188    }
189
190    pub fn set_fallback(&mut self, h: Handler) {
191        self.fallback = Some(h);
192    }
193
194    pub fn handlers(&self) -> &[Handler] {
195        &self.handlers
196    }
197
198    pub fn fallback(&self) -> Option<&Handler> {
199        self.fallback.as_ref()
200    }
201
202    /// Pick a handler for the given head bytes + path, without opening.
203    pub fn find(&self, head: &[u8], path: &Path) -> Option<&Handler> {
204        for h in &self.handlers {
205            if (h.detect)(head, path) {
206                return Some(h);
207            }
208        }
209        self.fallback.as_ref()
210    }
211
212    /// Find a handler by name (used by `--format <name>`).
213    pub fn find_by_name(&self, name: &str) -> Option<&Handler> {
214        self.handlers
215            .iter()
216            .chain(self.fallback.as_ref())
217            .find(|h| h.name == name)
218    }
219
220    /// Open a path: read the head, find a handler, open the file.
221    pub fn open_path(&self, path: &Path) -> Result<Box<dyn Format>> {
222        let head = read_head(path)?;
223        let h = self.find(&head, path).ok_or_else(|| {
224            ScrumpError::UnsupportedFormat(format!("no handler for {}", path.display()))
225        })?;
226        (h.open_path)(path)
227    }
228
229    /// Open in-memory bytes; use `hint_path` for extension/naming context.
230    pub fn open_bytes(&self, bytes: Vec<u8>, hint_path: Option<&Path>) -> Result<Box<dyn Format>> {
231        let head_len = bytes.len().min(512);
232        let head = &bytes[..head_len];
233        let placeholder_path = PathBuf::from("");
234        let hint = hint_path.unwrap_or(&placeholder_path);
235        let h = self.find(head, hint).ok_or_else(|| {
236            ScrumpError::UnsupportedFormat(format!(
237                "no handler for in-memory bytes (hint = {})",
238                hint.display()
239            ))
240        })?;
241        (h.open_bytes)(bytes, hint_path)
242    }
243
244    /// Force a specific handler by name (CLI `--format`).
245    pub fn open_path_with(&self, path: &Path, handler_name: &str) -> Result<Box<dyn Format>> {
246        let h = self
247            .find_by_name(handler_name)
248            .ok_or_else(|| ScrumpError::UnsupportedFormat(handler_name.into()))?;
249        (h.open_path)(path)
250    }
251}
252
253fn read_head(path: &Path) -> Result<Vec<u8>> {
254    use std::io::Read;
255    let mut f = std::fs::File::open(path)?;
256    let mut buf = vec![0u8; 512];
257    let n = f.read(&mut buf)?;
258    buf.truncate(n);
259    Ok(buf)
260}
261
262// -----------------------------------------------------------------------------
263// Detector trait + engine helpers
264
/// A detection rule: a regex (+ optional entropy floor) matching candidate
/// secrets in arbitrary bytes.
///
/// Only [`id`](Detector::id) and [`pattern`](Detector::pattern) are
/// required; every other method has a permissive default.
pub trait Detector: Send + Sync {
    /// Identifier of this rule.
    /// NOTE(review): presumably what the engine stores in
    /// [`Hit::rule_id`] — confirm against the engine.
    fn id(&self) -> &str;
    /// The byte-oriented regex that finds candidates.
    fn pattern(&self) -> &Regex;
    /// Optional entropy floor for candidates; `None` (the default) means no
    /// floor. NOTE(review): presumably compared against
    /// [`shannon_entropy`] of the candidate — confirm against the engine.
    fn min_entropy(&self) -> Option<f64> {
        None
    }
    /// Redaction strategy for this rule's hits; defaults to
    /// [`Replacement::ZeroFill`].
    fn replacement(&self) -> Replacement {
        Replacement::ZeroFill
    }
    /// If `Some(n)`, the engine reports the n-th regex capture group as the
    /// hit range instead of the whole match. Enables keyword-proximity
    /// patterns like `wandb[\s\S]{0,300}([0-9a-f]{40})` that anchor on a
    /// nearby keyword but redact only the secret itself.
    fn capture_index(&self) -> Option<usize> {
        None
    }
    /// Optional post-pattern filter. Receives the candidate bytes (the
    /// regex match — or the capture group if `capture_index` is set) and
    /// must return `true` to keep the hit, `false` to drop it.
    ///
    /// Used to encode semantic constraints regex can't express — e.g. for
    /// JWT we drop HMAC-signed tokens after base64-decoding the header.
    ///
    /// The default keeps every candidate.
    fn post_filter(&self, _candidate: &[u8]) -> bool {
        true
    }
    /// Optionally verify a candidate; see [`VerifyResult`]. The default
    /// performs no check and returns [`VerifyResult::Unknown`].
    fn verify(&self, _candidate: &[u8]) -> VerifyResult {
        VerifyResult::Unknown
    }
}
296
/// Shannon entropy of `bytes`, measured in bits per byte.
///
/// The result lies in `0.0..=8.0`: `0.0` for empty input or input made of a
/// single repeated value, `8.0` when all 256 byte values are equally
/// frequent.
pub fn shannon_entropy(bytes: &[u8]) -> f64 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u64; 256];
    bytes
        .iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let sample_count = bytes.len() as f64;
    // Accumulate -sum(p * log2(p)) over the byte values that occur, in
    // ascending byte-value order.
    histogram
        .iter()
        .filter(|&&occurrences| occurrences != 0)
        .fold(0.0_f64, |entropy, &occurrences| {
            let p = occurrences as f64 / sample_count;
            entropy - p * p.log2()
        })
}
317
318// -----------------------------------------------------------------------------
319// Atomic write helper
320
321/// Write `bytes` to `out` atomically (write to a sibling tmp file, fsync,
322/// then rename over the destination).
323pub fn write_atomic(out: &Path, bytes: &[u8]) -> Result<()> {
324    let tmp = tmp_sibling(out);
325    {
326        let mut f = OpenOptions::new()
327            .create(true)
328            .write(true)
329            .truncate(true)
330            .open(&tmp)?;
331        f.write_all(bytes)?;
332        f.sync_all()?;
333    }
334    std::fs::rename(&tmp, out)?;
335    Ok(())
336}
337
338fn tmp_sibling(p: &Path) -> PathBuf {
339    let mut name: OsString = p
340        .file_name()
341        .map_or_else(|| OsString::from("out"), |s| s.to_os_string());
342    name.push(".scrump.tmp");
343    match p.parent() {
344        Some(d) if !d.as_os_str().is_empty() => d.join(name),
345        _ => PathBuf::from(name),
346    }
347}
348
349// -----------------------------------------------------------------------------
350// In-place byte editor used by every format's `apply` impl.
351//
352// Single source of truth for byte-level redaction so all formats behave
353// identically (length-preserving zero-fill, optional repeating pattern,
354// rejection of length-changing `Drop`).
355
356/// Apply a list of [`Hit`]s to a flat byte buffer in place.
357///
358/// Returns `Err(ScrumpError::RedactionFailed)` on out-of-bounds, empty
359/// `Pattern`, or unsupported `Drop`.
360pub fn apply_hits_in_place(buf: &mut [u8], hits: &[Hit]) -> Result<()> {
361    for h in hits {
362        let start = h.offset as usize;
363        let end = start
364            .checked_add(h.len)
365            .ok_or_else(|| ScrumpError::RedactionFailed("hit offset+len overflow".into()))?;
366        if end > buf.len() {
367            return Err(ScrumpError::RedactionFailed(format!(
368                "hit out of bounds: {start}..{end} (buf len {})",
369                buf.len()
370            )));
371        }
372        match &h.replacement {
373            Replacement::ZeroFill => {
374                for b in &mut buf[start..end] {
375                    *b = 0;
376                }
377            }
378            Replacement::Pattern(p) => {
379                if p.is_empty() {
380                    return Err(ScrumpError::RedactionFailed(
381                        "empty replacement pattern".into(),
382                    ));
383                }
384                for (i, b) in buf[start..end].iter_mut().enumerate() {
385                    *b = p[i % p.len()];
386                }
387            }
388            Replacement::Drop => {
389                return Err(ScrumpError::RedactionFailed(
390                    "Drop replacement requires a structurally-aware format".into(),
391                ));
392            }
393        }
394    }
395    Ok(())
396}
397
#[cfg(test)]
mod tests {
    use super::*;

    // ---- shared fixtures --------------------------------------------------

    /// Build a `Hit` on a raw chunk; only the fields that matter to the byte
    /// editor vary between tests.
    fn hit(offset: u64, len: usize, replacement: Replacement) -> Hit {
        Hit {
            offset,
            len,
            rule_id: "x".into(),
            verified: None,
            replacement,
            origin: ChunkOrigin::Raw,
        }
    }

    fn detect_always(_head: &[u8], _path: &Path) -> bool {
        true
    }

    fn detect_never(_head: &[u8], _path: &Path) -> bool {
        false
    }

    /// Opener stubs: the dispatcher tests only exercise handler selection.
    fn open_path_unused(_path: &Path) -> Result<Box<dyn Format>> {
        Err(ScrumpError::Other("not used".into()))
    }

    fn open_bytes_unused(_bytes: Vec<u8>, _hint: Option<&Path>) -> Result<Box<dyn Format>> {
        Err(ScrumpError::Other("not used".into()))
    }

    /// Handler with the given name and detect function and stub openers.
    fn handler(name: &'static str, detect: DetectFn) -> Handler {
        Handler {
            name,
            detect,
            open_path: open_path_unused,
            open_bytes: open_bytes_unused,
        }
    }

    // ---- shannon_entropy --------------------------------------------------

    #[test]
    fn entropy_of_empty_is_zero() {
        assert_eq!(shannon_entropy(&[]), 0.0);
    }

    #[test]
    fn entropy_of_uniform_byte_is_zero() {
        assert_eq!(shannon_entropy(&[0u8; 100]), 0.0);
    }

    #[test]
    fn entropy_of_two_balanced_bytes_is_one() {
        let bytes: Vec<u8> = (0..100)
            .map(|i| if i % 2 == 0 { 0u8 } else { 1u8 })
            .collect();
        assert!((shannon_entropy(&bytes) - 1.0).abs() < 1e-9);
    }

    #[test]
    fn entropy_of_random_bytes_is_near_eight() {
        // Small LCG so the test is deterministic without extra crates.
        let mut state: u32 = 0xdead_beef;
        let mut bytes = vec![0u8; 4096];
        for b in &mut bytes {
            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
            *b = (state >> 24) as u8;
        }
        let h = shannon_entropy(&bytes);
        assert!(h > 7.5, "expected near-uniform entropy, got {h}");
    }

    // ---- apply_hits_in_place ----------------------------------------------

    #[test]
    fn apply_hits_in_place_zero_fill_preserves_length() {
        let mut buf = b"abcdEFGHijkl".to_vec();
        apply_hits_in_place(&mut buf, &[hit(4, 4, Replacement::ZeroFill)]).unwrap();
        assert_eq!(buf, b"abcd\0\0\0\0ijkl");
    }

    #[test]
    fn apply_hits_in_place_pattern_repeats() {
        let mut buf = b"abcdEFGHijkl".to_vec();
        let pattern = Replacement::Pattern(b"XY".to_vec());
        apply_hits_in_place(&mut buf, &[hit(4, 4, pattern)]).unwrap();
        assert_eq!(buf, b"abcdXYXYijkl");
    }

    #[test]
    fn apply_hits_in_place_oob_errors() {
        let mut buf = b"short".to_vec();
        assert!(apply_hits_in_place(&mut buf, &[hit(0, 100, Replacement::ZeroFill)]).is_err());
    }

    #[test]
    fn apply_hits_in_place_rejects_empty_pattern_and_drop() {
        let mut buf = b"short".to_vec();
        let empty = Replacement::Pattern(Vec::new());
        assert!(apply_hits_in_place(&mut buf, &[hit(0, 2, empty)]).is_err());
        assert!(apply_hits_in_place(&mut buf, &[hit(0, 2, Replacement::Drop)]).is_err());
        assert_eq!(buf, b"short");
    }

    // ---- write_atomic -----------------------------------------------------

    #[test]
    fn write_atomic_writes_and_renames() {
        let dir = std::env::temp_dir().join(format!(
            "scrump-core-test-{}-{}",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_nanos()
        ));
        std::fs::create_dir_all(&dir).unwrap();
        let target = dir.join("file.bin");
        write_atomic(&target, b"hello").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"hello");
        // Re-write overwrites cleanly.
        write_atomic(&target, b"world").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"world");
        std::fs::remove_dir_all(&dir).ok();
    }

    #[test]
    fn tmp_sibling_stays_next_to_target() {
        assert_eq!(
            tmp_sibling(Path::new("a/b.bin")),
            Path::new("a").join("b.bin.scrump.tmp")
        );
        assert_eq!(
            tmp_sibling(Path::new("b.bin")),
            PathBuf::from("b.bin.scrump.tmp")
        );
    }

    // ---- Dispatcher -------------------------------------------------------

    #[test]
    fn dispatcher_picks_first_match_then_fallback() {
        let mut d = Dispatcher::new();
        d.register(handler("first", detect_never));
        d.register(handler("second", detect_always));
        let pick = d.find(b"", Path::new("/")).unwrap();
        assert_eq!(pick.name, "second");
        d.set_fallback(handler("fb", detect_never));
        // First positive still wins
        let pick = d.find(b"", Path::new("/")).unwrap();
        assert_eq!(pick.name, "second");
    }

    #[test]
    fn dispatcher_uses_fallback_when_nothing_matches() {
        let mut d = Dispatcher::new();
        d.register(handler("n", detect_never));
        assert!(d.find(b"", Path::new("/")).is_none());
        d.set_fallback(handler("fb", detect_never));
        let pick = d.find(b"", Path::new("/")).unwrap();
        assert_eq!(pick.name, "fb");
    }

    #[test]
    fn dispatcher_find_by_name_includes_fallback() {
        let mut d = Dispatcher::new();
        d.register(handler("n", detect_never));
        d.set_fallback(handler("fb", detect_never));
        assert_eq!(d.find_by_name("n").unwrap().name, "n");
        assert_eq!(d.find_by_name("fb").unwrap().name, "fb");
        assert!(d.find_by_name("missing").is_none());
    }
}