Skip to main content

keyhog_scanner/multiline/
config.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
/// Upper bound, in bytes, on the total text size eligible for multiline
/// preprocessing (2 MiB). Longer inputs are passed through untouched.
const MAX_MULTILINE_PREPROCESS_BYTES: usize = 2 << 20;
/// Upper bound, in bytes, on any single line (64 KiB). Text containing a
/// longer line is passed through untouched.
const MAX_MULTILINE_LINE_BYTES: usize = 64 << 10;
6
/// Associates a half-open byte range of the joined text with the line of the
/// original input that the range came from.
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First byte of the range in the joined text (inclusive).
    pub start_offset: usize,
    /// One past the last byte of the range in the joined text (exclusive).
    pub end_offset: usize,
    /// Line number in the original input, counted from 1.
    pub line_number: usize,
}
17
18/// Result of preprocessing text for multi-line concatenation.
19#[derive(Debug, Clone)]
20pub struct PreprocessedText {
21    /// Original text plus appended multiline-joined segments.
22    pub text: String,
23    /// Byte offset where appended joined segments start.
24    pub original_end: usize,
25    /// Mapping from offsets in `text` to original line numbers.
26    pub mappings: Vec<LineMapping>,
27}
28
29impl PreprocessedText {
30    /// Map a byte offset in preprocessed text back to an original line number.
31    ///
32    /// Mappings are stored in `start_offset`-sorted, contiguous order
33    /// (the preprocessor appends them as it walks the input), so a
34    /// `partition_point` binary search resolves the lookup in
35    /// `O(log L)` instead of the prior `O(L)` linear scan. On a
36    /// 10 000-line file with ~100 matches that's 10 000 × 100 = 1 M
37    /// pointer compares cut to ~1 400.
38    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
39        let idx = self.mappings.partition_point(|m| m.start_offset <= offset);
40        if idx == 0 {
41            return None;
42        }
43        let m = &self.mappings[idx - 1];
44        if offset < m.end_offset {
45            Some(m.line_number)
46        } else {
47            None
48        }
49    }
50
51    /// Build a preprocessed representation with a one-line identity mapping.
52    pub fn passthrough(text: &str) -> Self {
53        let mut mappings = Vec::new();
54        let mut offset = 0;
55        for (line_idx, line) in text.split('\n').enumerate() {
56            let end = offset + line.len();
57            mappings.push(LineMapping {
58                line_number: line_idx + 1,
59                start_offset: offset,
60                end_offset: end + 1,
61            });
62            offset = end + 1;
63        }
64        if let Some(last) = mappings.last_mut() {
65            last.end_offset = text.len();
66        }
67        let original_end = text.len();
68        Self {
69            text: text.to_string(),
70            original_end,
71            mappings,
72        }
73    }
74}
75
/// Switches and limits that control multiline concatenation recovery.
#[derive(Debug, Clone)]
pub struct MultilineConfig {
    /// Cap on how many lines a single concatenation chain may join.
    pub max_join_lines: usize,
    /// Recover Python-style implicit (adjacent literal) concatenation.
    pub python_implicit: bool,
    /// Recover lines continued with a trailing backslash.
    pub backslash_continuation: bool,
    /// Recover explicit concatenation written with `+`.
    pub plus_concatenation: bool,
    /// Recover concatenation through JavaScript template literals.
    pub template_literals: bool,
}

impl Default for MultilineConfig {
    /// Every recovery strategy enabled, with chains capped at ten lines.
    fn default() -> Self {
        let enabled = true;
        MultilineConfig {
            max_join_lines: 10,
            python_implicit: enabled,
            backslash_continuation: enabled,
            plus_concatenation: enabled,
            template_literals: enabled,
        }
    }
}
102
103/// Check if text contains any concatenation indicators.
104pub(crate) fn has_concatenation_indicators(text: &str) -> bool {
105    let trimmed = text.trim_start();
106    if trimmed.starts_with('{')
107        || trimmed.starts_with('[')
108        || trimmed.starts_with("<?xml")
109        || trimmed.starts_with('<')
110    {
111        return false;
112    }
113
114    let bytes = text.as_bytes();
115
116    // For large files, only preprocess if secret-related keywords are present.
117    if bytes.len() > 4096 {
118        let has_secret_keyword = memchr::memmem::find(bytes, b"ecret").is_some()
119            || memchr::memmem::find(bytes, b"oken").is_some()
120            || memchr::memmem::find(bytes, b"assword").is_some()
121            || memchr::memmem::find(bytes, b"api_key").is_some()
122            || memchr::memmem::find(bytes, b"API_KEY").is_some()
123            || memchr::memmem::find(bytes, b"redential").is_some();
124        if !has_secret_keyword {
125            return false;
126        }
127    }
128
129    let has_explicit_concat = text.contains("\" +") || text.contains("' +");
130    let has_backslash_cont = text.contains("\" \\") || text.contains("' \\");
131    let has_template = memchr::memchr(b'`', bytes).is_some();
132    // Function-style string concatenation: R's paste()/paste0() and Rust's
133    // concat!() macro. All three splice multiple string literals into one
134    // value, so any of them is a concat indicator.
135    let has_paste =
136        text.contains("paste0(") || text.contains("paste(") || text.contains("concat!(");
137    let has_implicit = bytes.windows(3).any(|window| {
138        (window[0] == b'"' && window[1] == b' ' && window[2] == b'"')
139            || (window[0] == b'\'' && window[1] == b' ' && window[2] == b'\'')
140            || (window[0] == b'"'
141                && window[1] == b'\n'
142                && (window[2] == b'"' || window[2] == b' ' || window[2] == b'\t'))
143            || (window[0] == b'\''
144                && window[1] == b'\n'
145                && (window[2] == b'\'' || window[2] == b' ' || window[2] == b'\t'))
146    });
147    if !has_explicit_concat
148        && !has_backslash_cont
149        && !has_template
150        && !has_paste
151        && !has_implicit
152        && !has_var_ref_concatenation(text)
153    {
154        return false;
155    }
156
157    for line in text.lines() {
158        let trimmed = line.trim();
159        if trimmed.ends_with('+')
160            || trimmed.starts_with('+')
161            || trimmed.starts_with("+ ")
162            || trimmed.contains("paste0(")
163            || trimmed.contains("paste(")
164            || trimmed.contains("concat!(")
165            || trimmed.contains("\" +")
166            || trimmed.contains("' +")
167            || trimmed.contains("+ \"")
168            || trimmed.contains("+ '")
169            || (trimmed.ends_with('\\') && !trimmed.ends_with("\\\\"))
170            || trimmed.contains("\" \"")
171            || trimmed.contains("' '")
172            || has_var_ref_concat_line(trimmed)
173            || (trimmed.ends_with('`') && trimmed.matches('`').count() == 1)
174            // String literal interpolated INTO a template literal:
175            // `ghp_${"BODY"}` / `${'a'}${'b'}`. The `${"`/`${'` shape is the
176            // concat-evasion signal - a string literal spliced into an
177            // interpolation. Deliberately narrow: bare `${ident}` (normal
178            // runtime interpolation, ubiquitous in JS/TS) is NOT flagged, so
179            // this adds no preprocessing cost to ordinary template code.
180            || trimmed.contains("${\"")
181            || trimmed.contains("${'")
182            // Adjacent template interpolations `${a}${b}` - the close-brace
183            // immediately followed by `${` is the concat-via-interpolation
184            // signal. Ordinary single interpolation (`Hi ${name}!`) has
185            // literal text between/around the braces and never produces
186            // `}${`, so this stays clear of the ubiquitous JS/TS template
187            // case and adds no cost to it.
188            || trimmed.contains("}${")
189        {
190            return true;
191        }
192    }
193
194    false
195}
196
197/// Variable-reference concatenation: `token = head + tail` (no quoted
198/// literals on the RHS). The structural reassembly pass resolves these
199/// via `resolve_concat_reference`; without this indicator the multiline
200/// preprocessor passthroughs and the split credential never surfaces.
201fn has_var_ref_concatenation(text: &str) -> bool {
202    text.lines().any(has_var_ref_concat_line)
203}
204
205fn has_var_ref_concat_line(line: &str) -> bool {
206    // Cheap precheck: var-ref concatenation REQUIRES at least one `+`
207    // separator between two identifiers. Lines without one cannot
208    // possibly match - skip the regex entirely. Without this, the
209    // `(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}` repeated-group bound forces
210    // the regex crate's NFA to evaluate every starting position on
211    // identifier-dense source lines, which on Apple Silicon
212    // (regex 1.12, lazy-DFA construction stalled by the `{1,8}`-bounded
213    // alternation) burns minutes of CPU per line. Surfaced during
214    // v0.5.25 cross-platform dogfood: a 171-byte Go file with shape
215    // `var token = receiver.Flag("x", "y").Required().String()` hung
216    // for 6+ minutes on Mac arm64 portable while Linux x86_64
217    // completed it in 0.6 s. The precheck is correctness-preserving:
218    // when no `+` exists in the line, the regex *cannot* match.
219    if !line.contains('+') {
220        return false;
221    }
222    static VAR_REF_CONCAT_RE: LazyLock<Option<Regex>> = LazyLock::new(|| {
223        Regex::new(
224            r#"(?i)^\s*[a-z0-9_\-\.]{2,64}\s*[:=]\s*[a-z0-9_\-]{2,32}(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}\s*;?\s*$"#,
225        )
226        .ok()
227    });
228    VAR_REF_CONCAT_RE
229        .as_ref()
230        .is_some_and(|re| re.is_match(line))
231}
232
233pub(crate) fn should_passthrough(text: &str) -> bool {
234    text.len() > MAX_MULTILINE_PREPROCESS_BYTES
235        || text
236            .lines()
237            .any(|line| line.len() > MAX_MULTILINE_LINE_BYTES)
238        || !has_concatenation_indicators(text)
239}