keyhog_scanner/multiline/config.rs
1use regex::Regex;
2use std::sync::LazyLock;
3
/// Upper bound, in bytes, on the total size of a text that multiline
/// preprocessing will look at: `should_passthrough` returns `true` for any
/// text strictly longer than this (2 MiB).
const MAX_MULTILINE_PREPROCESS_BYTES: usize = 2 * 1024 * 1024;
/// Upper bound, in bytes, on any single line: `should_passthrough` returns
/// `true` as soon as one line is strictly longer than this (64 KiB).
const MAX_MULTILINE_LINE_BYTES: usize = 64 * 1024;
6
7static VAR_REF_CONCAT_RE: LazyLock<Option<Regex>> = LazyLock::new(|| {
8 Regex::new(
9 r#"(?i)^\s*[a-z0-9_\-\.]{2,64}\s*[:=]\s*[a-z0-9_\-]{2,32}(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}\s*;?\s*$"#,
10 )
11 .ok()
12});
13
14pub(crate) fn warm_runtime_regexes() {
15 let _ = VAR_REF_CONCAT_RE.as_ref();
16}
17
/// Maps a half-open byte range `[start_offset, end_offset)` of the joined
/// text back to the original line it came from.
///
/// Derives `PartialEq`/`Eq` so mappings can be compared directly (for
/// example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LineMapping {
    /// Start offset in the joined text (inclusive).
    pub start_offset: usize,
    /// End offset in the joined text (exclusive).
    pub end_offset: usize,
    /// Original line number (1-indexed).
    pub line_number: usize,
}
28
/// Result of preprocessing text for multi-line concatenation.
///
/// `text` is a [`Cow`](std::borrow::Cow) so the overwhelmingly common
/// passthrough/identity case (a chunk with no structured-config shape and no
/// multiline concatenation) can BORROW the caller's chunk bytes with zero
/// allocation instead of paying a full-body `to_string()` heap copy + memcpy
/// on every chunk. Only the paths that genuinely synthesize NEW bytes —
/// multiline-joined concatenation, structured-config key/value reassembly,
/// homoglyph normalization — own a `String` via `Cow::Owned`. Downstream
/// consumers read `text` as `&str` via `Deref`, so the borrow is internal to
/// preprocessing.
#[derive(Debug, Clone)]
pub struct PreprocessedText<'a> {
    /// Original text (borrowed for passthrough) plus, for the synthesizing
    /// paths, appended multiline-joined / structured segments (owned).
    pub text: std::borrow::Cow<'a, str>,
    /// Byte offset where appended joined segments start. `passthrough`
    /// appends nothing, so it sets this to `text.len()`.
    pub original_end: usize,
    /// Mapping from offsets in `text` to original line numbers.
    /// `line_for_offset` binary-searches this vector, so entries must stay
    /// ordered by `start_offset`.
    pub mappings: Vec<LineMapping>,
}
49
50impl<'a> PreprocessedText<'a> {
51 /// Map a byte offset in preprocessed text back to an original line number.
52 ///
53 /// Mappings are stored in `start_offset`-sorted, contiguous order
54 /// (the preprocessor appends them as it walks the input), so a
55 /// `partition_point` binary search resolves the lookup in
56 /// `O(log L)` instead of the prior `O(L)` linear scan. On a
57 /// 10 000-line file with ~100 matches that's 10 000 × 100 = 1 M
58 /// pointer compares cut to ~1 400.
59 pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
60 let idx = self.mappings.partition_point(|m| m.start_offset <= offset);
61 if idx == 0 {
62 return None;
63 }
64 let m = &self.mappings[idx - 1];
65 if offset < m.end_offset {
66 Some(m.line_number)
67 } else {
68 None
69 }
70 }
71
72 /// Build a preprocessed representation with a one-line identity mapping.
73 ///
74 /// Takes the text as a [`Cow`] so a byte-identical passthrough chunk can be
75 /// carried as `Cow::Borrowed` (zero allocation — no heap alloc or memcpy of
76 /// the chunk body) while a normalization-rewritten chunk passes its already-
77 /// owned `String` through as `Cow::Owned`. Only the per-line `mappings`
78 /// bookkeeping (size-independent of the body bytes) is allocated either way.
79 pub fn passthrough(text: impl Into<std::borrow::Cow<'a, str>>) -> Self {
80 let text: std::borrow::Cow<'a, str> = text.into();
81 let mut mappings = Vec::new();
82 let mut offset = 0;
83 for (line_idx, line) in text.split('\n').enumerate() {
84 let end = offset + line.len();
85 mappings.push(LineMapping {
86 line_number: line_idx + 1,
87 start_offset: offset,
88 end_offset: end + 1,
89 });
90 offset = end + 1;
91 }
92 if let Some(last) = mappings.last_mut() {
93 last.end_offset = text.len();
94 }
95 let original_end = text.len();
96 Self {
97 text,
98 original_end,
99 mappings,
100 }
101 }
102}
103
/// Configuration for multiline concatenation recovery.
///
/// Each boolean switches one recovery strategy on or off. The `Default`
/// implementation enables all of them and caps chains at 10 lines.
#[derive(Debug, Clone)]
pub struct MultilineConfig {
    /// Maximum number of lines to join in a single concatenation chain
    /// (default: 10).
    pub max_join_lines: usize,
    /// Whether to enable Python-style implicit concatenation
    /// (default: `true`).
    pub python_implicit: bool,
    /// Whether to enable backslash line continuation (default: `true`).
    pub backslash_continuation: bool,
    /// Whether to enable explicit concatenation with `+` (default: `true`).
    pub plus_concatenation: bool,
    /// Whether to enable JavaScript template literal concatenation
    /// (default: `true`).
    pub template_literals: bool,
}
118
119impl Default for MultilineConfig {
120 fn default() -> Self {
121 Self {
122 max_join_lines: 10,
123 python_implicit: true,
124 backslash_continuation: true,
125 plus_concatenation: true,
126 template_literals: true,
127 }
128 }
129}
130
131/// Check if text contains any concatenation indicators.
132pub(crate) fn has_concatenation_indicators(text: &str) -> bool {
133 let trimmed = text.trim_start();
134 if trimmed.starts_with('{')
135 || trimmed.starts_with('[')
136 || trimmed.starts_with("<?xml")
137 || trimmed.starts_with('<')
138 {
139 return false;
140 }
141
142 let bytes = text.as_bytes();
143
144 // For large files, only preprocess if secret-related keywords are present.
145 if bytes.len() > 4096 {
146 let has_secret_keyword = memchr::memmem::find(bytes, b"ecret").is_some()
147 || memchr::memmem::find(bytes, b"oken").is_some()
148 || memchr::memmem::find(bytes, b"assword").is_some()
149 || memchr::memmem::find(bytes, b"api_key").is_some()
150 || memchr::memmem::find(bytes, b"API_KEY").is_some()
151 || memchr::memmem::find(bytes, b"redential").is_some();
152 if !has_secret_keyword {
153 return false;
154 }
155 }
156
157 let has_explicit_concat = text.contains("\" +") || text.contains("' +");
158 let has_backslash_cont = text.contains("\" \\") || text.contains("' \\");
159 let has_template = memchr::memchr(b'`', bytes).is_some();
160 // Function-style string concatenation: R's paste()/paste0() and Rust's
161 // concat!() macro. All three splice multiple string literals into one
162 // value, so any of them is a concat indicator.
163 let has_paste =
164 text.contains("paste0(") || text.contains("paste(") || text.contains("concat!(");
165 let has_implicit = bytes.windows(3).any(|window| {
166 (window[0] == b'"' && window[1] == b' ' && window[2] == b'"')
167 || (window[0] == b'\'' && window[1] == b' ' && window[2] == b'\'')
168 || (window[0] == b'"'
169 && window[1] == b'\n'
170 && (window[2] == b'"' || window[2] == b' ' || window[2] == b'\t'))
171 || (window[0] == b'\''
172 && window[1] == b'\n'
173 && (window[2] == b'\'' || window[2] == b' ' || window[2] == b'\t'))
174 });
175 if !has_explicit_concat
176 && !has_backslash_cont
177 && !has_template
178 && !has_paste
179 && !has_implicit
180 && !has_var_ref_concatenation(text)
181 {
182 return false;
183 }
184
185 for line in text.lines() {
186 let trimmed = line.trim();
187 if trimmed.ends_with('+')
188 || trimmed.starts_with('+')
189 || trimmed.starts_with("+ ")
190 || trimmed.contains("paste0(")
191 || trimmed.contains("paste(")
192 || trimmed.contains("concat!(")
193 || trimmed.contains("\" +")
194 || trimmed.contains("' +")
195 || trimmed.contains("+ \"")
196 || trimmed.contains("+ '")
197 || (trimmed.ends_with('\\') && !trimmed.ends_with("\\\\"))
198 || trimmed.contains("\" \"")
199 || trimmed.contains("' '")
200 || has_var_ref_concat_line(trimmed)
201 || (trimmed.ends_with('`') && trimmed.matches('`').count() == 1)
202 // String literal interpolated INTO a template literal:
203 // `ghp_${"BODY"}` / `${'a'}${'b'}`. The `${"`/`${'` shape is the
204 // concat-evasion signal - a string literal spliced into an
205 // interpolation. Deliberately narrow: bare `${ident}` (normal
206 // runtime interpolation, ubiquitous in JS/TS) is NOT flagged, so
207 // this adds no preprocessing cost to ordinary template code.
208 || trimmed.contains("${\"")
209 || trimmed.contains("${'")
210 // Adjacent template interpolations `${a}${b}` - the close-brace
211 // immediately followed by `${` is the concat-via-interpolation
212 // signal. Ordinary single interpolation (`Hi ${name}!`) has
213 // literal text between/around the braces and never produces
214 // `}${`, so this stays clear of the ubiquitous JS/TS template
215 // case and adds no cost to it.
216 || trimmed.contains("}${")
217 {
218 return true;
219 }
220 }
221
222 false
223}
224
225/// Variable-reference concatenation: `token = head + tail` (no quoted
226/// literals on the RHS). The structural reassembly pass resolves these
227/// via `resolve_concat_reference`; without this indicator the multiline
228/// preprocessor passthroughs and the split credential never surfaces.
229fn has_var_ref_concatenation(text: &str) -> bool {
230 text.lines().any(has_var_ref_concat_line)
231}
232
233fn has_var_ref_concat_line(line: &str) -> bool {
234 // Cheap precheck: var-ref concatenation REQUIRES at least one `+`
235 // separator between two identifiers. Lines without one cannot
236 // possibly match - skip the regex entirely. Without this, the
237 // `(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}` repeated-group bound forces
238 // the regex crate's NFA to evaluate every starting position on
239 // identifier-dense source lines, which on Apple Silicon
240 // (regex 1.12, lazy-DFA construction stalled by the `{1,8}`-bounded
241 // alternation) burns minutes of CPU per line. Surfaced during
242 // v0.5.25 cross-platform dogfood: a 171-byte Go file with shape
243 // `var token = receiver.Flag("x", "y").Required().String()` hung
244 // for 6+ minutes on Mac arm64 portable while Linux x86_64
245 // completed it in 0.6 s. The precheck is correctness-preserving:
246 // when no `+` exists in the line, the regex *cannot* match.
247 if !line.contains('+') {
248 return false;
249 }
250 VAR_REF_CONCAT_RE
251 .as_ref()
252 .is_some_and(|re| re.is_match(line))
253}
254
255pub(crate) fn should_passthrough(text: &str) -> bool {
256 text.len() > MAX_MULTILINE_PREPROCESS_BYTES
257 || text
258 .lines()
259 .any(|line| line.len() > MAX_MULTILINE_LINE_BYTES)
260 || !has_concatenation_indicators(text)
261}