use regex::Regex;
use std::sync::Arc;
// NOTE(review): none of the constants in this group is read within this part
// of the file. The one-line summaries are inferred from the identifiers and
// should be confirmed against the call sites.

/// Threshold (10 000) tied to the "large fallback scan" path; units not visible here.
pub const LARGE_FALLBACK_SCAN_THRESHOLD: usize = 10_000;

/// Presumably the cap on entries kept for window de-duplication.
pub const MAX_WINDOW_DEDUP_ENTRIES: usize = 100_000;

/// 1 MiB; presumably the largest chunk, in bytes, scanned in one pass.
pub const MAX_SCAN_CHUNK_BYTES: usize = 1 << 20;

/// 128 KiB; presumably the number of bytes shared between adjacent scan windows.
pub const WINDOW_OVERLAP_BYTES: usize = 128 << 10;

/// Presumably the shortest line length considered by the fallback scan.
pub const MIN_FALLBACK_LINE_LENGTH: usize = 8;

/// Presumably the capture index that denotes the whole match.
pub const FULL_MATCH_INDEX: usize = 0;

/// Presumably the index of the first explicit capture group.
pub const FIRST_CAPTURE_GROUP_INDEX: usize = 1;

/// Presumably the number assigned to the first line (1-based numbering).
pub const FIRST_LINE_NUMBER: usize = 1;

/// Presumably the distance, in lines, from a line to the one before it.
pub const PREVIOUS_LINE_DISTANCE: usize = 1;

/// Presumably the minimum number of characters required of a literal prefix.
pub const MIN_LITERAL_PREFIX_CHARS: usize = 3;
/// Limit, in bytes, reported by [`regex_dfa_limit`] while no override is set (1 MiB).
pub const REGEX_SIZE_LIMIT_BYTES: usize = 1 << 20;

/// Process-wide override for [`regex_dfa_limit`]; a stored `0` means "no override".
static REGEX_DFA_LIMIT_OVERRIDE: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);

/// Replaces the value that [`regex_dfa_limit`] reports with `bytes`.
///
/// Storing `0` clears the override, after which [`regex_dfa_limit`] falls back
/// to [`REGEX_SIZE_LIMIT_BYTES`].
pub fn set_regex_dfa_limit(bytes: usize) {
    // The value is a standalone setting, so a relaxed store is used.
    REGEX_DFA_LIMIT_OVERRIDE.store(bytes, std::sync::atomic::Ordering::Relaxed);
}

/// Returns the current limit in bytes: the override when one is set, otherwise
/// [`REGEX_SIZE_LIMIT_BYTES`].
#[must_use]
pub fn regex_dfa_limit() -> usize {
    let configured = REGEX_DFA_LIMIT_OVERRIDE.load(std::sync::atomic::Ordering::Relaxed);
    if configured == 0 {
        REGEX_SIZE_LIMIT_BYTES
    } else {
        configured
    }
}
// Tunables whose names refer to hex-string handling.
// NOTE(review): nothing in this part of the file reads them; each summary below
// is inferred from the identifier and should be confirmed at the call sites.
/// Presumably the number of characters inspected on each side of a hex candidate.
pub const HEX_CONTEXT_RADIUS_CHARS: usize = 20;
/// Presumably the minimum length of a match for it to be treated as hex.
pub const MIN_HEX_MATCH_LEN: usize = 16;
/// Presumably the minimum count of hex digits a match must contain.
pub const MIN_HEX_DIGITS_IN_MATCH: usize = 16;
/// Presumably the minimum count of hex digits required in the surrounding context.
pub const MIN_HEX_CONTEXT_DIGITS: usize = 8;
/// Presumably the maximum number of separator characters tolerated in that context.
pub const MAX_HEX_CONTEXT_SEPARATORS: usize = 4;
// Constants compiled only with the `ml` feature.
// NOTE(review): their consumers are outside this part of the file; the
// summaries are inferred from the identifiers — confirm at the call sites.

/// Presumably the maximum number of entries held in the ML cache.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_ENTRIES: usize = 1_024;

/// 256 KiB; presumably the byte budget of the ML cache.
#[cfg(feature = "ml")]
pub const MAX_ML_CACHE_BYTES: usize = 256 << 10;

/// Presumably the number of lines of context taken on each side for ML scoring.
#[cfg(feature = "ml")]
pub const ML_CONTEXT_RADIUS_LINES: usize = 5;

/// Presumably the weight given to the ML score when scores are blended.
#[cfg(feature = "ml")]
pub const ML_WEIGHT: f64 = 0.6;

/// Presumably the weight given to the heuristic score when scores are blended.
#[cfg(feature = "ml")]
pub const HEURISTIC_WEIGHT: f64 = 0.4;
/// Associates the half-open offset range `[start_offset, end_offset)` with a line number.
///
/// Only compiled when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First offset covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Offset one past the last one covered by this mapping (exclusive).
    pub end_offset: usize,
    /// Line number reported for offsets inside the range.
    pub line_number: usize,
}

/// Text paired with the offset-to-line mappings that describe it.
///
/// Only compiled when the `multiline` feature is disabled.
#[cfg(not(feature = "multiline"))]
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// The text itself.
    pub text: String,
    /// Offset ranges and their line numbers. `line_for_offset` binary-searches
    /// this list, so it must be ordered by ascending `start_offset`.
    pub mappings: Vec<LineMapping>,
}

#[cfg(not(feature = "multiline"))]
impl PreprocessedText {
    /// Returns the line number whose mapping contains `offset`.
    ///
    /// The candidate is the last mapping that starts at or before `offset`.
    /// `None` is returned when no mapping starts that early, or when `offset`
    /// lies at or beyond that mapping's `end_offset`.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Number of mappings that start at or before `offset`.
        let started = self
            .mappings
            .partition_point(|entry| entry.start_offset <= offset);
        // Zero such mappings means `offset` precedes every range.
        let candidate = self.mappings.get(started.checked_sub(1)?)?;
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wraps a single line unchanged: one mapping covers bytes `0..line.len()`
    /// and reports line number 1.
    pub fn passthrough(line: &str) -> Self {
        let whole_line = LineMapping {
            start_offset: 0,
            end_offset: line.len(),
            line_number: 1,
        };
        Self {
            text: line.to_owned(),
            mappings: vec![whole_line],
        }
    }
}
/// Feature-selected alias: with the `multiline` feature enabled this resolves
/// to `crate::multiline::PreprocessedText`.
#[cfg(feature = "multiline")]
pub type ScannerPreprocessedText = crate::multiline::PreprocessedText;
/// Feature-selected alias: with the `multiline` feature disabled this resolves
/// to the `PreprocessedText` defined in this module.
#[cfg(not(feature = "multiline"))]
pub type ScannerPreprocessedText = PreprocessedText;
/// A regex pattern whose compilation is postponed until [`LazyRegex::get`] is first called.
///
/// The source text and the compilation slot both sit behind `Arc`s, so clones
/// share a single slot: a regex compiled through one clone is seen by the others.
#[derive(Debug, Clone)]
pub struct LazyRegex {
    /// Pattern source exactly as handed to the constructor.
    src: Arc<str>,
    /// Selects the compile path used by `get`: `true` goes through
    /// `crate::compiler::compiler_compile::shared_regex`, `false` through `Regex::new`.
    case_insensitive: bool,
    /// Write-once slot for the compiled regex (or the never-matching fallback).
    cell: Arc<std::sync::OnceLock<Arc<Regex>>>,
}

impl LazyRegex {
    /// Common constructor body: stores the source and flag next to an empty slot.
    fn uncompiled(src: Arc<str>, case_insensitive: bool) -> Self {
        Self {
            src,
            case_insensitive,
            cell: Arc::new(std::sync::OnceLock::new()),
        }
    }

    /// Creates an uncompiled pattern with the `case_insensitive` flag set.
    ///
    /// NOTE(review): such patterns are compiled by `shared_regex`, which lives
    /// outside this file; its exact flags and caching are not visible here.
    pub fn detector(src: impl Into<Arc<str>>) -> Self {
        Self::uncompiled(src.into(), true)
    }

    /// Creates an uncompiled pattern that will be compiled with plain `Regex::new`.
    pub fn plain(src: impl Into<Arc<str>>) -> Self {
        Self::uncompiled(src.into(), false)
    }

    /// Returns the pattern source text.
    pub fn as_str(&self) -> &str {
        self.src.as_ref()
    }

    /// Returns the compiled regex, compiling it on the first call.
    ///
    /// Compilation is attempted once per shared slot. If it fails, the failure
    /// is logged with `tracing::error!` and a regex that never matches is stored
    /// instead, so later calls return that fallback rather than retrying.
    pub fn get(&self) -> &Regex {
        let compiled = self.cell.get_or_init(|| self.compile_or_fallback());
        compiled.as_ref()
    }

    /// Compiles `src` via the path chosen by `case_insensitive`; on error, logs
    /// it and yields the shared never-matching regex.
    fn compile_or_fallback(&self) -> Arc<Regex> {
        let attempt = if self.case_insensitive {
            crate::compiler::compiler_compile::shared_regex(&self.src)
        } else {
            Regex::new(&self.src).map(Arc::new)
        };
        attempt.unwrap_or_else(|error| {
            tracing::error!(
                pattern = %self.src,
                %error,
                "detector regex failed to compile on first use; \
                 this pattern is disabled for this run"
            );
            never_match_regex()
        })
    }
}
/// Returns a shared handle to a regex built from `[^\s\S]`, a character class
/// that excludes both whitespace and non-whitespace and so contains nothing.
///
/// The regex is built on the first call and kept in a function-local static;
/// each call hands out another `Arc` pointing at that same instance.
fn never_match_regex() -> Arc<Regex> {
    static NEVER: std::sync::OnceLock<Arc<Regex>> = std::sync::OnceLock::new();
    let shared = NEVER.get_or_init(|| {
        let empty_language = Regex::new(r"[^\s\S]").expect("empty-language regex is valid");
        Arc::new(empty_language)
    });
    Arc::clone(shared)
}
/// A pattern entry pairing a lazily compiled regex with detector metadata.
///
/// NOTE(review): the code that builds and consumes these entries is outside
/// this part of the file; field meanings marked "presumably" are inferred from
/// names and types and should be confirmed there.
#[derive(Debug, Clone)]
pub struct CompiledPattern {
/// Presumably the index of the owning detector in some detector list.
pub detector_index: usize,
/// The pattern's regex; compiled on first `get()` (see `LazyRegex`).
pub regex: LazyRegex,
/// Presumably the capture group to extract, with `None` meaning no specific group.
pub group: Option<usize>,
/// NOTE(review): meaning not visible here — confirm with the producer of this flag.
pub client_safe: bool,
}
/// A companion entry holding an already-built `Regex` plus matching parameters.
///
/// NOTE(review): the code that builds and consumes these entries is outside
/// this part of the file; field meanings marked "presumably" are inferred from
/// names and types and should be confirmed there.
pub struct CompiledCompanion {
/// Name of the companion.
pub name: String,
/// The companion's regex, stored as a ready `Regex` value (not lazily compiled).
pub regex: Regex,
/// Presumably the capture group to extract, with `None` meaning no specific group.
pub capture_group: Option<usize>,
/// Presumably how many lines away from the primary match the companion may appear.
pub within_lines: usize,
/// Presumably whether the companion must be found for the primary match to count.
pub required: bool,
}
pub use crate::scanner_config::{ScanState, ScannerConfig};
#[cfg(feature = "ml")]
pub use crate::scanner_config::MlPendingMatch;