sanitize_engine/lib.rs
1//! # rust-sanitize
2//!
3//! Deterministic, one-way data sanitization engine.
4//!
5//! This crate provides the core infrastructure for replacing
6//! sensitive values with category-aware, deterministic substitutes.
7//! Replacements are **one-way only** — there is no key file, mapping
8//! table, or restore mode. It is the foundation layer consumed by
9//! higher-level streaming and CLI components.
10//!
11//! ## Key Components
12//!
13//! - [`category::Category`] — Classification of sensitive values (email,
14//! IP, name, etc.) that determines replacement format.
15//! - [`generator::ReplacementGenerator`] — Trait abstracting replacement
16//! strategy (HMAC-deterministic or CSPRNG-random).
17//! - [`strategy::Strategy`] — Pluggable replacement strategies that can
18//! be called **directly** without any mapping table.
19//! - [`store::MappingStore`] — Optional thread-safe per-run dedup cache
20//! ensuring the same input always maps to the same output within a run.
21//! - [`scanner::StreamScanner`] — Streaming regex scanner with chunk +
22//! overlap for bounded-memory processing.
23//!
24//! ## Concurrency Model
25//!
26//! The `MappingStore` uses `DashMap` (shard-level locking) for the forward
27//! dedup cache. All types are `Send + Sync`.
28//!
29//! ## Stability
30//!
31//! As of 0.8.0 the public API is considered stable and follows Semantic Versioning.
32//! Breaking changes require a major version bump. The core guarantees —
33//! one-way replacement, deterministic mode, and length preservation — are
34//! stable across all 1.x releases. Processor heuristics, default limit
35//! values, and report schema may change in minor releases (additive only).
36//!
37//! ## Example: Store-Level Replacement
38//!
39//! ```rust
40//! use sanitize_engine::category::Category;
41//! use sanitize_engine::generator::HmacGenerator;
42//! use sanitize_engine::store::MappingStore;
43//! use std::sync::Arc;
44//!
45//! // Create a deterministic generator with a fixed seed.
46//! let generator = Arc::new(HmacGenerator::new([42u8; 32]));
47//!
48//! // Create the replacement store (optional capacity limit).
49//! let store = MappingStore::new(generator, None);
50//!
51//! // Sanitize a value (one-way).
52//! let sanitized = store.get_or_insert(&Category::Email, "alice@corp.com").unwrap();
53//! assert!(sanitized.contains("@corp.com"));
54//! assert_eq!(sanitized.len(), "alice@corp.com".len());
55//!
56//! // Same input → same output (per-run consistency).
57//! let again = store.get_or_insert(&Category::Email, "alice@corp.com").unwrap();
58//! assert_eq!(sanitized, again);
59//! ```
60//!
61//! ## Example: Streaming Scanner
62//!
63//! ```rust
64//! use sanitize_engine::category::Category;
65//! use sanitize_engine::generator::HmacGenerator;
66//! use sanitize_engine::scanner::{ScanConfig, ScanPattern, StreamScanner};
67//! use sanitize_engine::store::MappingStore;
68//! use std::sync::Arc;
69//!
70//! // Build patterns.
71//! let patterns = vec![
72//! ScanPattern::from_regex(r"alice@corp\.com", Category::Email, "alice_email").unwrap(),
73//! ];
74//!
75//! // Store with deterministic generator.
76//! let generator = Arc::new(HmacGenerator::new([42u8; 32]));
77//! let store = Arc::new(MappingStore::new(generator, Some(1_000_000)));
78//!
79//! // Scanner with the default chunk config (1 MiB chunks, 4 KiB overlap).
80//! let config = ScanConfig::new(1_048_576, 4096);
81//! let scanner = StreamScanner::new(patterns, store, config).unwrap();
82//!
83//! // Scan bytes in-memory.
84//! let input = b"Contact alice@corp.com for details.";
85//! let (output, stats) = scanner.scan_bytes(input).unwrap();
86//!
87//! assert_eq!(stats.replacements_applied, 1);
88//! assert_eq!(output.len(), input.len());
89//! ```
90//!
91//! ## Example: Log Context Extraction
92//!
93//! After sanitizing, scan the output for error/warning keywords and capture
94//! surrounding lines for LLM-friendly triage:
95//!
96//! ```rust
97//! use sanitize_engine::log_context::{extract_context, LogContextConfig};
98//!
99//! let sanitized = "INFO request received\n\
100//! ERROR disk full on /dev/sda1\n\
101//! INFO retrying mount\n\
102//! WARN filesystem degraded\n\
103//! INFO recovery complete";
104//!
105//! let config = LogContextConfig::new().with_context_lines(1);
106//! let result = extract_context(sanitized, &config);
107//!
108//! // Two keyword hits: "error" and "warn".
109//! assert_eq!(result.match_count, 2);
110//!
111//! // First match: ERROR line with one line of context on each side.
112//! assert_eq!(result.matches[0].keyword, "error");
113//! assert_eq!(result.matches[0].before, vec!["INFO request received"]);
114//! assert_eq!(result.matches[0].after, vec!["INFO retrying mount"]);
115//! ```
116
117// Crate-level lint configuration.
118#![forbid(unsafe_code)]
119#![warn(clippy::all, clippy::pedantic)]
120// Allow specific pedantic lints that are too noisy for this crate.
121#![allow(
122 clippy::module_name_repetitions,
123 clippy::missing_panics_doc,
124 clippy::must_use_candidate, // We add #[must_use] manually on key APIs.
125 clippy::uninlined_format_args,
126 clippy::redundant_closure_for_method_calls,
127 clippy::doc_markdown,
128 clippy::similar_names
129)]
130
131pub mod allowlist;
132pub mod atomic;
133pub mod category;
134pub mod error;
135pub mod generator;
136pub mod llm;
137pub mod log_context;
138pub mod processor;
139pub mod report;
140pub mod scanner;
141pub mod secrets;
142pub mod store;
143pub mod strategy;
144pub mod strip_values;
145
146// Re-exports for convenience.
147pub use atomic::{atomic_write, atomic_write_private, AtomicFileWriter};
148pub use category::Category;
149pub use error::{Result, SanitizeError};
150pub use generator::{HmacGenerator, RandomGenerator, ReplacementGenerator};
151pub use llm::{
152 format_llm_prompt, format_llm_prompt_reference, resolve_llm_template, LlmEntry, LlmPathEntry,
153 PROMPT_PREAMBLE, TEMPLATE_REVIEW_CONFIG, TEMPLATE_REVIEW_SECURITY, TEMPLATE_TROUBLESHOOT,
154};
155pub use log_context::{
156 extract_context, extract_context_reader, LogContextConfig, LogContextMatch, LogContextResult,
157 DEFAULT_CONTEXT_LINES, DEFAULT_KEYWORDS, DEFAULT_MAX_MATCHES,
158};
159pub use processor::archive::{
160 ArchiveFilter, ArchiveFormat, ArchiveProcessor, ArchiveProgress, ArchiveStats, EntryCallback,
161};
162pub use processor::limits::DEFAULT_ARCHIVE_DEPTH;
163pub use processor::{
164 FieldNameSignal, FieldRule, FileTypeProfile, Processor, ProcessorRegistry,
165 DEFAULT_FIELD_SIGNAL_THRESHOLD,
166};
167pub use report::{FileReport, ReportBuilder, ReportMetadata, SanitizeReport};
168pub use scanner::{ScanConfig, ScanPattern, ScanProgress, ScanStats, StreamScanner};
169pub use secrets::{
170 decrypt_secrets, encrypt_secrets, load_secrets_auto, looks_encrypted, SecretEntry,
171 SecretsFormat,
172};
173pub use store::MappingStore;
174pub use strategy::{
175 EntropyMode, FakeIp, HmacHash, PreserveLength, RandomString, RandomUuid, Strategy,
176 StrategyGenerator,
177};
178pub use strip_values::strip_values_from_text;