// Source: keyhog_scanner/multiline/config.rs

/// Largest input, in bytes (2 MiB), that is still considered for multiline
/// preprocessing; `should_passthrough` returns `true` for anything longer.
const MAX_MULTILINE_PREPROCESS_BYTES: usize = 2 * 1024 * 1024;
/// Longest single line, in bytes (64 KiB), tolerated before
/// `should_passthrough` returns `true` for the whole input.
const MAX_MULTILINE_LINE_BYTES: usize = 64 * 1024;
3
/// Associates a half-open byte range of the joined text with the original
/// line it came from.
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// First byte offset in the joined text covered by this mapping (inclusive).
    pub start_offset: usize,
    /// Byte offset in the joined text just past this mapping (exclusive).
    pub end_offset: usize,
    /// Line number in the original text, counted from 1.
    pub line_number: usize,
}
14
/// Output of the multiline preprocessing step: the text to scan plus the
/// information needed to translate offsets back to original line numbers.
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// Original text, followed by any appended multiline-joined segments.
    pub text: String,
    /// Byte offset in `text` where the appended joined segments begin; equal
    /// to the text length when nothing was appended.
    pub original_end: usize,
    /// Byte ranges of `text`, each tagged with the original line number it
    /// corresponds to.
    pub mappings: Vec<LineMapping>,
}
25
26impl PreprocessedText {
27    /// Map a byte offset in preprocessed text back to an original line number.
28    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
29        self.mappings
30            .iter()
31            .find(|mapping| offset >= mapping.start_offset && offset < mapping.end_offset)
32            .map(|mapping| mapping.line_number)
33    }
34
35    /// Build a preprocessed representation with a one-line identity mapping.
36    pub fn passthrough(text: &str) -> Self {
37        let mut mappings = Vec::new();
38        let mut offset = 0;
39        for (line_idx, line) in text.split('\n').enumerate() {
40            let end = offset + line.len();
41            mappings.push(LineMapping {
42                line_number: line_idx + 1,
43                start_offset: offset,
44                end_offset: end + 1,
45            });
46            offset = end + 1;
47        }
48        if let Some(last) = mappings.last_mut() {
49            last.end_offset = text.len();
50        }
51        let original_end = text.len();
52        Self {
53            text: text.to_string(),
54            original_end,
55            mappings,
56        }
57    }
58}
59
/// Settings that control which multiline concatenation styles are recovered
/// and how long a joined chain may grow.
#[derive(Debug, Clone)]
pub struct MultilineConfig {
    /// Maximum number of lines to join in a single concatenation chain.
    pub max_join_lines: usize,
    /// Whether Python-style implicit concatenation is enabled.
    pub python_implicit: bool,
    /// Whether backslash line continuation is enabled.
    pub backslash_continuation: bool,
    /// Whether explicit concatenation with `+` is enabled.
    pub plus_concatenation: bool,
    /// Whether JavaScript template literal concatenation is enabled.
    pub template_literals: bool,
}
74
75impl Default for MultilineConfig {
76    fn default() -> Self {
77        Self {
78            max_join_lines: 10,
79            python_implicit: true,
80            backslash_continuation: true,
81            plus_concatenation: true,
82            template_literals: true,
83        }
84    }
85}
86
87/// Check if text contains any concatenation indicators.
88pub(crate) fn has_concatenation_indicators(text: &str) -> bool {
89    let trimmed = text.trim_start();
90    if trimmed.starts_with('{')
91        || trimmed.starts_with('[')
92        || trimmed.starts_with("<?xml")
93        || trimmed.starts_with('<')
94    {
95        return false;
96    }
97
98    let bytes = text.as_bytes();
99
100    // For large files, only preprocess if secret-related keywords are present.
101    if bytes.len() > 4096 {
102        let has_secret_keyword = memchr::memmem::find(bytes, b"ecret").is_some()
103            || memchr::memmem::find(bytes, b"oken").is_some()
104            || memchr::memmem::find(bytes, b"assword").is_some()
105            || memchr::memmem::find(bytes, b"api_key").is_some()
106            || memchr::memmem::find(bytes, b"API_KEY").is_some()
107            || memchr::memmem::find(bytes, b"redential").is_some();
108        if !has_secret_keyword {
109            return false;
110        }
111    }
112
113    let has_explicit_concat = text.contains("\" +") || text.contains("' +");
114    let has_backslash_cont = text.contains("\" \\") || text.contains("' \\");
115    let has_template = memchr::memchr(b'`', bytes).is_some();
116    let has_paste = text.contains("paste0(");
117    let has_implicit = bytes.windows(3).any(|window| {
118        (window[0] == b'"' && window[1] == b' ' && window[2] == b'"')
119            || (window[0] == b'\'' && window[1] == b' ' && window[2] == b'\'')
120            || (window[0] == b'"'
121                && window[1] == b'\n'
122                && (window[2] == b'"' || window[2] == b' ' || window[2] == b'\t'))
123            || (window[0] == b'\''
124                && window[1] == b'\n'
125                && (window[2] == b'\'' || window[2] == b' ' || window[2] == b'\t'))
126    });
127    if !has_explicit_concat && !has_backslash_cont && !has_template && !has_paste && !has_implicit {
128        return false;
129    }
130
131    for line in text.lines() {
132        let trimmed = line.trim();
133        if trimmed.ends_with('+')
134            || trimmed.starts_with('+')
135            || trimmed.starts_with("+ ")
136            || trimmed.contains("paste0(")
137            || trimmed.contains("paste(")
138            || trimmed.contains("\" +")
139            || trimmed.contains("' +")
140            || trimmed.contains("+ \"")
141            || trimmed.contains("+ '")
142            || (trimmed.ends_with('\\') && !trimmed.ends_with("\\\\"))
143            || trimmed.contains("\" \"")
144            || trimmed.contains("' '")
145            || (trimmed.ends_with('`') && trimmed.matches('`').count() == 1)
146        {
147            return true;
148        }
149    }
150
151    false
152}
153
154pub(crate) fn should_passthrough(text: &str) -> bool {
155    text.len() > MAX_MULTILINE_PREPROCESS_BYTES
156        || text
157            .lines()
158            .any(|line| line.len() > MAX_MULTILINE_LINE_BYTES)
159        || !has_concatenation_indicators(text)
160}