Skip to main content

keyhog_scanner/multiline/
config.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
/// Largest chunk, in bytes (2 MiB), that multiline preprocessing will look at;
/// anything longer is passed through untouched by `should_passthrough`.
const MAX_MULTILINE_PREPROCESS_BYTES: usize = 2 << 20;
/// Longest single line, in bytes (64 KiB), tolerated before the whole chunk is
/// passed through untouched by `should_passthrough`.
const MAX_MULTILINE_LINE_BYTES: usize = 64 << 10;
6
7static VAR_REF_CONCAT_RE: LazyLock<Option<Regex>> = LazyLock::new(|| {
8    Regex::new(
9        r#"(?i)^\s*[a-z0-9_\-\.]{2,64}\s*[:=]\s*[a-z0-9_\-]{2,32}(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}\s*;?\s*$"#,
10    )
11    .ok()
12});
13
14pub(crate) fn warm_runtime_regexes() {
15    let _ = VAR_REF_CONCAT_RE.as_ref();
16}
17
/// Ties a half-open byte range of the preprocessed text to the line of the
/// original input it came from.
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First byte of the range in the preprocessed text (inclusive).
    pub start_offset: usize,
    /// One past the last byte of the range in the preprocessed text (exclusive).
    pub end_offset: usize,
    /// Line in the original input, counted from 1.
    pub line_number: usize,
}

/// Output of the multi-line concatenation preprocessor.
///
/// `text` is a [`Cow`](std::borrow::Cow) so that the by-far most common case —
/// a chunk with no structured-config shape and nothing to join — can simply
/// BORROW the caller's bytes: no allocation and no full-body copy per chunk.
/// Only the paths that really produce NEW bytes (multiline-joined
/// concatenation, structured-config key/value reassembly, homoglyph
/// normalization) hold a `String` through `Cow::Owned`. Consumers read `text`
/// as `&str` through `Deref`, so the borrow stays an internal detail of
/// preprocessing.
#[derive(Debug, Clone)]
pub struct PreprocessedText<'a> {
    /// The original text (borrowed on passthrough); on the synthesizing paths
    /// it is owned and followed by the appended joined / structured segments.
    pub text: std::borrow::Cow<'a, str>,
    /// Byte offset at which the appended joined segments begin.
    pub original_end: usize,
    /// Byte ranges of `text` paired with their original line numbers.
    pub mappings: Vec<LineMapping>,
}

impl<'a> PreprocessedText<'a> {
    /// Translate a byte offset in the preprocessed text into the original
    /// line number, or `None` when no mapping covers that offset.
    ///
    /// The preprocessor appends mappings while walking the input, so they are
    /// contiguous and ordered by `start_offset`. That lets a
    /// `partition_point` binary search answer in `O(log L)` rather than the
    /// earlier `O(L)` linear scan: for a 10 000-line file with ~100 matches,
    /// roughly 1 M comparisons shrink to about 1 400.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        // Number of mappings that start at or before `offset`; the last of
        // them is the only one that can contain it.
        let starting_at_or_before = self
            .mappings
            .partition_point(|mapping| mapping.start_offset <= offset);
        let candidate = &self.mappings[starting_at_or_before.checked_sub(1)?];
        (offset < candidate.end_offset).then_some(candidate.line_number)
    }

    /// Wrap `text` unchanged, with one mapping per `\n`-separated line.
    ///
    /// Accepting anything convertible into a [`Cow`](std::borrow::Cow) means a
    /// byte-identical chunk travels as `Cow::Borrowed` (no heap allocation or
    /// copy of the body), while a chunk already rewritten by normalization
    /// hands over its `String` as `Cow::Owned`. In both cases the only
    /// allocation made here is the `mappings` vector.
    ///
    /// Every mapping's range includes the line's terminating `\n`; the final
    /// mapping is clamped to end at `text.len()`.
    pub fn passthrough(text: impl Into<std::borrow::Cow<'a, str>>) -> Self {
        let text: std::borrow::Cow<'a, str> = text.into();
        let total_len = text.len();

        let mut cursor = 0usize;
        let mut mappings: Vec<LineMapping> = text
            .split('\n')
            .zip(1usize..)
            .map(|(segment, line_number)| {
                let start_offset = cursor;
                // Step over the line body and the newline that followed it.
                cursor += segment.len() + 1;
                LineMapping {
                    start_offset,
                    end_offset: cursor,
                    line_number,
                }
            })
            .collect();

        // The last segment has no trailing newline to account for.
        if let Some(tail) = mappings.last_mut() {
            tail.end_offset = total_len;
        }

        Self {
            text,
            original_end: total_len,
            mappings,
        }
    }
}
103
/// Knobs controlling which multiline concatenation styles are recovered.
#[derive(Debug, Clone)]
pub struct MultilineConfig {
    /// Upper bound on how many lines one concatenation chain may join.
    pub max_join_lines: usize,
    /// Recover Python-style implicit (adjacent literal) concatenation.
    pub python_implicit: bool,
    /// Recover lines continued with a trailing backslash.
    pub backslash_continuation: bool,
    /// Recover explicit `+` concatenation.
    pub plus_concatenation: bool,
    /// Recover JavaScript template-literal concatenation.
    pub template_literals: bool,
}

impl Default for MultilineConfig {
    /// Every concatenation style enabled, chains capped at ten lines.
    fn default() -> Self {
        const ENABLED: bool = true;
        MultilineConfig {
            max_join_lines: 10,
            python_implicit: ENABLED,
            backslash_continuation: ENABLED,
            plus_concatenation: ENABLED,
            template_literals: ENABLED,
        }
    }
}
130
131/// Check if text contains any concatenation indicators.
132pub(crate) fn has_concatenation_indicators(text: &str) -> bool {
133    let trimmed = text.trim_start();
134    if trimmed.starts_with('{')
135        || trimmed.starts_with('[')
136        || trimmed.starts_with("<?xml")
137        || trimmed.starts_with('<')
138    {
139        return false;
140    }
141
142    let bytes = text.as_bytes();
143
144    // For large files, only preprocess if secret-related keywords are present.
145    if bytes.len() > 4096 {
146        let has_secret_keyword = memchr::memmem::find(bytes, b"ecret").is_some()
147            || memchr::memmem::find(bytes, b"oken").is_some()
148            || memchr::memmem::find(bytes, b"assword").is_some()
149            || memchr::memmem::find(bytes, b"api_key").is_some()
150            || memchr::memmem::find(bytes, b"API_KEY").is_some()
151            || memchr::memmem::find(bytes, b"redential").is_some();
152        if !has_secret_keyword {
153            return false;
154        }
155    }
156
157    let has_explicit_concat = text.contains("\" +") || text.contains("' +");
158    let has_backslash_cont = text.contains("\" \\") || text.contains("' \\");
159    let has_template = memchr::memchr(b'`', bytes).is_some();
160    // Function-style string concatenation: R's paste()/paste0() and Rust's
161    // concat!() macro. All three splice multiple string literals into one
162    // value, so any of them is a concat indicator.
163    let has_paste =
164        text.contains("paste0(") || text.contains("paste(") || text.contains("concat!(");
165    let has_implicit = bytes.windows(3).any(|window| {
166        (window[0] == b'"' && window[1] == b' ' && window[2] == b'"')
167            || (window[0] == b'\'' && window[1] == b' ' && window[2] == b'\'')
168            || (window[0] == b'"'
169                && window[1] == b'\n'
170                && (window[2] == b'"' || window[2] == b' ' || window[2] == b'\t'))
171            || (window[0] == b'\''
172                && window[1] == b'\n'
173                && (window[2] == b'\'' || window[2] == b' ' || window[2] == b'\t'))
174    });
175    if !has_explicit_concat
176        && !has_backslash_cont
177        && !has_template
178        && !has_paste
179        && !has_implicit
180        && !has_var_ref_concatenation(text)
181    {
182        return false;
183    }
184
185    for line in text.lines() {
186        let trimmed = line.trim();
187        if trimmed.ends_with('+')
188            || trimmed.starts_with('+')
189            || trimmed.starts_with("+ ")
190            || trimmed.contains("paste0(")
191            || trimmed.contains("paste(")
192            || trimmed.contains("concat!(")
193            || trimmed.contains("\" +")
194            || trimmed.contains("' +")
195            || trimmed.contains("+ \"")
196            || trimmed.contains("+ '")
197            || (trimmed.ends_with('\\') && !trimmed.ends_with("\\\\"))
198            || trimmed.contains("\" \"")
199            || trimmed.contains("' '")
200            || has_var_ref_concat_line(trimmed)
201            || (trimmed.ends_with('`') && trimmed.matches('`').count() == 1)
202            // String literal interpolated INTO a template literal:
203            // `ghp_${"BODY"}` / `${'a'}${'b'}`. The `${"`/`${'` shape is the
204            // concat-evasion signal - a string literal spliced into an
205            // interpolation. Deliberately narrow: bare `${ident}` (normal
206            // runtime interpolation, ubiquitous in JS/TS) is NOT flagged, so
207            // this adds no preprocessing cost to ordinary template code.
208            || trimmed.contains("${\"")
209            || trimmed.contains("${'")
210            // Adjacent template interpolations `${a}${b}` - the close-brace
211            // immediately followed by `${` is the concat-via-interpolation
212            // signal. Ordinary single interpolation (`Hi ${name}!`) has
213            // literal text between/around the braces and never produces
214            // `}${`, so this stays clear of the ubiquitous JS/TS template
215            // case and adds no cost to it.
216            || trimmed.contains("}${")
217        {
218            return true;
219        }
220    }
221
222    false
223}
224
225/// Variable-reference concatenation: `token = head + tail` (no quoted
226/// literals on the RHS). The structural reassembly pass resolves these
227/// via `resolve_concat_reference`; without this indicator the multiline
228/// preprocessor passthroughs and the split credential never surfaces.
229fn has_var_ref_concatenation(text: &str) -> bool {
230    text.lines().any(has_var_ref_concat_line)
231}
232
233fn has_var_ref_concat_line(line: &str) -> bool {
234    // Cheap precheck: var-ref concatenation REQUIRES at least one `+`
235    // separator between two identifiers. Lines without one cannot
236    // possibly match - skip the regex entirely. Without this, the
237    // `(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}` repeated-group bound forces
238    // the regex crate's NFA to evaluate every starting position on
239    // identifier-dense source lines, which on Apple Silicon
240    // (regex 1.12, lazy-DFA construction stalled by the `{1,8}`-bounded
241    // alternation) burns minutes of CPU per line. Surfaced during
242    // v0.5.25 cross-platform dogfood: a 171-byte Go file with shape
243    // `var token = receiver.Flag("x", "y").Required().String()` hung
244    // for 6+ minutes on Mac arm64 portable while Linux x86_64
245    // completed it in 0.6 s. The precheck is correctness-preserving:
246    // when no `+` exists in the line, the regex *cannot* match.
247    if !line.contains('+') {
248        return false;
249    }
250    VAR_REF_CONCAT_RE
251        .as_ref()
252        .is_some_and(|re| re.is_match(line))
253}
254
255pub(crate) fn should_passthrough(text: &str) -> bool {
256    text.len() > MAX_MULTILINE_PREPROCESS_BYTES
257        || text
258            .lines()
259            .any(|line| line.len() > MAX_MULTILINE_LINE_BYTES)
260        || !has_concatenation_indicators(text)
261}