keyhog_scanner/context/
inference.rs

1use super::{documentation::documentation_line_flags, CodeContext};
2
/// Byte length of the `test_` filename prefix that `is_test_file` compares against.
const TEST_PREFIX_LEN: usize = 5;
/// Number of lines before a match that are searched for an encrypted-block marker.
const ENCRYPTED_BLOCK_LOOKBACK_LINES: usize = 10;
/// Number of lines before a match that are searched for an enclosing test function.
// 100 lines covers large Go/Java test functions with extensive setup.
// The previous 30-line limit caused test fixtures to be reported as findings.
const TEST_FUNCTION_LOOKBACK_LINES: usize = 100;
8
9/// Infer the structural context of a match at a given line.
10pub fn infer_context(lines: &[&str], line_idx: usize, file_path: Option<&str>) -> CodeContext {
11    let documentation_lines = documentation_line_flags(lines);
12    infer_context_with_documentation(lines, line_idx, file_path, &documentation_lines)
13}
14
15/// Detect example/placeholder credentials using ONLY algorithmic heuristics.
16/// No hardcoded credential lists - every suppression is based on a structural
17/// property that generalizes to all credentials of that shape.
18pub fn is_known_example_credential(credential: &str) -> bool {
19    let upper = credential.to_uppercase();
20
21    // EXAMPLE/EXAMPLEKEY is a universal documentation convention.
22    if upper.ends_with("EXAMPLE") || upper.ends_with("EXAMPLEKEY") {
23        return true;
24    }
25
26    // x/X-dominated values are masking filler.
27    let body = credential.as_bytes();
28    let x_count = body.iter().filter(|&&b| b == b'x' || b == b'X').count();
29    if body.len() >= 16 && x_count > body.len() * 3 / 4 {
30        return true;
31    }
32
33    // Ascending hex pairs are documentation placeholders.
34    if is_hex_sequential_placeholder(credential) {
35        return true;
36    }
37
38    // These appear in integrity checks, not as secrets.
39    if is_empty_input_hash(credential) {
40        return true;
41    }
42
43    // Monotonic or repetitive bodies remain placeholders after stripping prefixes.
44    is_sequential_placeholder(credential)
45}
46
/// Report whether `credential` is the MD5, SHA-1 or SHA-256 digest of empty
/// input, written in hex with any letter case.
///
/// Only whole-string matches count, so a digest embedded in a longer value is
/// not suppressed.
fn is_empty_input_hash(credential: &str) -> bool {
    const EMPTY_INPUT_DIGESTS: [&str; 3] = [
        // MD5("")
        "d41d8cd98f00b204e9800998ecf8427e",
        // SHA1("")
        "da39a3ee5e6b4b0d3255bfef95601890afd80709",
        // SHA256("")
        "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
    ];

    EMPTY_INPUT_DIGESTS
        .iter()
        .any(|digest| credential.eq_ignore_ascii_case(*digest))
}
59
60pub fn is_sequential_placeholder(credential: &str) -> bool {
61    // Strip ALL known service prefixes before checking for sequential/placeholder patterns.
62    // Single source of truth: crate::confidence::KNOWN_PREFIXES.
63    // Missing a prefix here = false positive (placeholder not suppressed).
64    let body = crate::confidence::KNOWN_PREFIXES
65        .iter()
66        .find_map(|prefix| credential.strip_prefix(prefix))
67        .unwrap_or(credential);
68    if body.len() < 8 {
69        return false;
70    }
71
72    let bytes = body.as_bytes();
73    if bytes.iter().all(|&byte| byte == bytes[0]) {
74        return true;
75    }
76    if bytes.len() >= 8 {
77        let pair = &bytes[..2];
78        if bytes
79            .chunks(2)
80            .all(|chunk| chunk == pair || (chunk.len() < 2 && chunk[0] == pair[0]))
81        {
82            return true;
83        }
84    }
85    false
86}
87
88fn is_hex_sequential_placeholder(credential: &str) -> bool {
89    // Same canonical prefix list as is_sequential_placeholder. Strip the
90    // prefix before the hex-sequence check so e.g. `ghp_0123456789abcdef`
91    // still trips the "monotonic hex" suppression on the BODY.
92    let body = crate::confidence::KNOWN_PREFIXES
93        .iter()
94        .find_map(|prefix| credential.strip_prefix(prefix))
95        .unwrap_or(credential);
96
97    if body.len() < 16 || !body.bytes().all(|b| b.is_ascii_hexdigit()) {
98        return false;
99    }
100
101    let bytes: Vec<u8> = body.bytes().collect();
102
103    // Single-byte monotonic sequences such as 0123456789abcdef or fedcba9876543210.
104    if bytes.len() >= 16 {
105        let ascending = bytes
106            .windows(2)
107            .filter(|w| {
108                w[1] == w[0] + 1 || (w[0] == b'9' && w[1] == b'a') || (w[0] == b'f' && w[1] == b'0')
109            })
110            .count();
111        let descending = bytes
112            .windows(2)
113            .filter(|w| {
114                w[1] + 1 == w[0] || (w[0] == b'a' && w[1] == b'9') || (w[0] == b'0' && w[1] == b'f')
115            })
116            .count();
117        let threshold = (bytes.len() - 1) * 9 / 10;
118        if ascending > threshold || descending > threshold {
119            return true;
120        }
121    }
122
123    let pairs: Vec<&[u8]> = bytes.chunks(2).filter(|chunk| chunk.len() == 2).collect();
124    if pairs.len() < 8 {
125        return false;
126    }
127
128    let first_chars: Vec<u8> = pairs
129        .iter()
130        .map(|pair| pair[0].to_ascii_lowercase())
131        .collect();
132    let ascending = first_chars
133        .windows(2)
134        .filter(|window| {
135            window[1] == window[0] + 1
136                || (window[0] == b'f' && window[1] == b'a')
137                || (window[0] == b'9' && window[1] == b'a')
138                || (window[0] == b'9' && window[1] == b'0')
139        })
140        .count();
141
142    let second_chars: Vec<u8> = pairs
143        .iter()
144        .map(|pair| pair[1].to_ascii_lowercase())
145        .collect();
146    let ascending2 = second_chars
147        .windows(2)
148        .filter(|window| {
149            window[1] == window[0] + 1
150                || (window[0] == b'f' && window[1] == b'0')
151                || (window[0] == b'9' && window[1] == b'0')
152                || (window[0] == b'9' && window[1] == b'a')
153        })
154        .count();
155
156    let threshold = pairs.len() * 9 / 10;
157    ascending > threshold && ascending2 > threshold
158}
159
160/// Per-line region membership precomputed once per chunk.
161///
162/// `is_in_encrypted_block` and `is_in_test_function` both depend only on the
163/// surrounding lines, so when many matches land in the same function the
164/// per-match 10/100-line backward walks recompute identical answers. Building
165/// these flags in a single forward pass turns the per-match work into an O(1)
166/// lookup, converting O(matches * 100) line scans into O(lines) once per chunk.
167pub struct ContextRegions {
168    encrypted: Vec<bool>,
169    test_function: Vec<bool>,
170}
171
172impl ContextRegions {
173    /// Build the region table in a single forward pass over `lines`.
174    pub fn new(lines: &[&str]) -> Self {
175        ContextRegions {
176            encrypted: encrypted_block_flags(lines),
177            test_function: test_function_flags(lines),
178        }
179    }
180
181    fn is_encrypted(&self, line_idx: usize) -> bool {
182        self.encrypted.get(line_idx).copied().unwrap_or(false)
183    }
184
185    fn is_test_function(&self, line_idx: usize) -> bool {
186        self.test_function.get(line_idx).copied().unwrap_or(false)
187    }
188}
189
190/// Forward-pass equivalent of `is_in_encrypted_block` for every line.
191///
192/// A line is "in" an encrypted block if any of the preceding
193/// `ENCRYPTED_BLOCK_LOOKBACK_LINES` lines (or itself) starts an encrypted
194/// marker. Tracking the distance since the last marker line reproduces the
195/// per-line backward window in one sweep.
196fn encrypted_block_flags(lines: &[&str]) -> Vec<bool> {
197    let mut flags = vec![false; lines.len()];
198    let mut lines_since_marker = ENCRYPTED_BLOCK_LOOKBACK_LINES + 1;
199    for (idx, line) in lines.iter().enumerate() {
200        if is_encrypted_marker_line(line.trim()) {
201            lines_since_marker = 0;
202        }
203        flags[idx] = lines_since_marker <= ENCRYPTED_BLOCK_LOOKBACK_LINES;
204        lines_since_marker = lines_since_marker.saturating_add(1);
205    }
206    flags
207}
208
209fn is_encrypted_marker_line(trimmed: &str) -> bool {
210    trimmed.starts_with("$ANSIBLE_VAULT")
211        || trimmed.starts_with("ENC[")
212        || memchr::memmem::find(trimmed.as_bytes(), b"sops:").is_some()
213        || memchr::memmem::find(trimmed.as_bytes(), b"sealed-secrets").is_some()
214        || trimmed.starts_with("-----BEGIN PGP MESSAGE-----")
215        || trimmed.starts_with("-----BEGIN AGE ENCRYPTED")
216}
217
/// Classification of a single line for the test-function backward scan.
///
/// Each line is classified independently; the scan stops at the nearest line
/// whose mark is not [`TestScanMark::Neither`].
///
/// `Debug` and `Eq` are derived alongside `PartialEq` so marks can be used
/// with `assert_eq!` and printed in diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TestScanMark {
    /// A test-function/test-attribute start: a match here means "in a test".
    TestStart,
    /// A non-test function/class boundary: a match here means "not in a test".
    Boundary,
    /// Neither - keep walking back.
    Neither,
}
228
229/// Forward-pass equivalent of `is_in_test_function` for every line.
230///
231/// `is_in_test_function` walks back up to `TEST_FUNCTION_LOOKBACK_LINES`,
232/// returning on the first `TestStart`/`Boundary` it sees. Classifying each
233/// line once and then tracking the index of the most recent non-`Neither`
234/// line reproduces that "nearest interesting line within the window decides"
235/// behaviour in a single O(lines) sweep.
236fn test_function_flags(lines: &[&str]) -> Vec<bool> {
237    let marks: Vec<TestScanMark> = (0..lines.len())
238        .map(|idx| classify_test_scan_line(lines, idx))
239        .collect();
240
241    let mut flags = vec![false; lines.len()];
242    // Index of the most recent line whose mark is not `Neither`, if any.
243    let mut last_interesting: Option<usize> = None;
244    for line_idx in 0..lines.len() {
245        if let Some(prev_idx) = last_interesting {
246            if line_idx - prev_idx <= TEST_FUNCTION_LOOKBACK_LINES {
247                flags[line_idx] = marks[prev_idx] == TestScanMark::TestStart;
248            }
249        }
250        // The next line scans back through this one, so update after deciding
251        // the current line (the original scans the range `start..line_idx`,
252        // i.e. it never inspects its own line).
253        if marks[line_idx] != TestScanMark::Neither {
254            last_interesting = Some(line_idx);
255        }
256    }
257    flags
258}
259
260/// Classify a single line for the test-function backward scan, mirroring the
261/// per-line decisions in `is_in_test_function` (including the 3-line attribute
262/// lookback for bare Rust `fn` declarations).
263fn classify_test_scan_line(lines: &[&str], candidate_line_idx: usize) -> TestScanMark {
264    let trimmed = lines[candidate_line_idx].trim();
265
266    if trimmed.starts_with("def test_")
267        || trimmed.starts_with("class Test")
268        || trimmed.starts_with("it(")
269        || trimmed.starts_with("describe(")
270        || trimmed.starts_with("test(")
271        || trimmed == "#[test]"
272        || trimmed == concat!("#[cfg(", "test)]")
273        || trimmed.starts_with("#[tokio::test")
274        || trimmed.starts_with("func Test")
275        || trimmed == "@Test"
276    {
277        return TestScanMark::TestStart;
278    }
279
280    // Stop looking back when we hit a non-test class or function boundary.
281    if trimmed.starts_with("class ") {
282        return TestScanMark::Boundary;
283    }
284
285    if (trimmed.starts_with("def ") || trimmed.starts_with("async def "))
286        && !trimmed.contains("def test_")
287    {
288        return TestScanMark::Boundary;
289    }
290
291    if trimmed.starts_with("func ") && !trimmed.contains("func Test") {
292        return TestScanMark::Boundary;
293    }
294
295    if (trimmed.starts_with("fn ")
296        || trimmed.starts_with("pub fn ")
297        || trimmed.starts_with("async fn ")
298        || trimmed.starts_with("pub async fn "))
299        && !trimmed.contains("fn test_")
300    {
301        let pre_start = candidate_line_idx.saturating_sub(3);
302        for pre_line in &lines[pre_start..candidate_line_idx] {
303            let pre_trimmed = pre_line.trim();
304            if pre_trimmed == "#[test]"
305                || pre_trimmed == concat!("#[cfg(", "test)]")
306                || pre_trimmed.starts_with("#[tokio::test")
307                || pre_trimmed.starts_with("#[test")
308                || pre_trimmed == "@Test"
309            {
310                return TestScanMark::TestStart;
311            }
312        }
313        return TestScanMark::Boundary;
314    }
315
316    if trimmed.starts_with("function ") && !trimmed.contains("function test") {
317        return TestScanMark::Boundary;
318    }
319
320    TestScanMark::Neither
321}
322
323/// Infer context when documentation-line flags have already been computed.
324pub fn infer_context_with_documentation(
325    lines: &[&str],
326    line_idx: usize,
327    file_path: Option<&str>,
328    documentation_lines: &[bool],
329) -> CodeContext {
330    if line_idx >= lines.len() {
331        return CodeContext::Unknown;
332    }
333
334    let line = lines[line_idx];
335    let trimmed = line.trim();
336
337    if file_path.is_some_and(is_test_file) {
338        return CodeContext::TestCode;
339    }
340    if is_in_encrypted_block(lines, line_idx) {
341        return CodeContext::Encrypted;
342    }
343    if is_commented_assignment_line(trimmed) {
344        return CodeContext::Assignment;
345    }
346    if is_comment_line(trimmed) {
347        return CodeContext::Comment;
348    }
349    if documentation_lines.get(line_idx).copied().unwrap_or(false) {
350        return CodeContext::Documentation;
351    }
352    if is_in_test_function(lines, line_idx) {
353        return CodeContext::TestCode;
354    }
355    if is_assignment_line(trimmed) {
356        return CodeContext::Assignment;
357    }
358    infer_default_context(trimmed)
359}
360
361/// Like `infer_context_with_documentation`, but reads encrypted-block and
362/// test-function membership from a `ContextRegions` table precomputed once per
363/// chunk instead of re-walking up to 100 lines backward per match.
364pub fn infer_context_with_regions(
365    lines: &[&str],
366    line_idx: usize,
367    file_path: Option<&str>,
368    documentation_lines: &[bool],
369    regions: &ContextRegions,
370) -> CodeContext {
371    if line_idx >= lines.len() {
372        return CodeContext::Unknown;
373    }
374
375    let trimmed = lines[line_idx].trim();
376
377    if file_path.is_some_and(is_test_file) {
378        return CodeContext::TestCode;
379    }
380    if regions.is_encrypted(line_idx) {
381        return CodeContext::Encrypted;
382    }
383    if is_commented_assignment_line(trimmed) {
384        return CodeContext::Assignment;
385    }
386    if is_comment_line(trimmed) {
387        return CodeContext::Comment;
388    }
389    if documentation_lines.get(line_idx).copied().unwrap_or(false) {
390        return CodeContext::Documentation;
391    }
392    if regions.is_test_function(line_idx) {
393        return CodeContext::TestCode;
394    }
395    if is_assignment_line(trimmed) {
396        return CodeContext::Assignment;
397    }
398    infer_default_context(trimmed)
399}
400
401fn is_test_file(path: &str) -> bool {
402    // Split on both / and \ for cross-platform compatibility.
403    let filename = path.rsplit(['/', '\\']).next().unwrap_or(path);
404    let stem = filename.split('.').next().unwrap_or(filename);
405
406    stem.len() > TEST_PREFIX_LEN
407        && stem
408            .as_bytes()
409            .get(..TEST_PREFIX_LEN)
410            .is_some_and(|bytes| bytes.eq_ignore_ascii_case(b"test_"))
411        || filename.ends_with("_test.go")
412        || filename.ends_with("_test.rs")
413        || filename.ends_with("_test.py")
414        || filename.ends_with("_test.rb")
415        || filename.ends_with("_test.java")
416        || filename.ends_with("Test.java")
417        || filename.ends_with("Tests.java")
418        || filename.ends_with(".test.js")
419        || filename.ends_with(".test.ts")
420        || filename.ends_with(".spec.js")
421        || filename.ends_with(".spec.ts")
422        || path.split(['/', '\\']).any(|component| {
423            component.eq_ignore_ascii_case("test")
424                || component.eq_ignore_ascii_case("tests")
425                || component.eq_ignore_ascii_case("__tests__")
426                || component.eq_ignore_ascii_case("fixtures")
427                || component.eq_ignore_ascii_case("testdata")
428                || component.eq_ignore_ascii_case("spec")
429        })
430}
431
432fn infer_default_context(trimmed: &str) -> CodeContext {
433    if memchr::memchr(b'"', trimmed.as_bytes()).is_some()
434        || memchr::memchr(b'\'', trimmed.as_bytes()).is_some()
435    {
436        CodeContext::StringLiteral
437    } else {
438        CodeContext::Unknown
439    }
440}
441
/// Report whether an already-trimmed line starts with a comment marker.
///
/// `--` counts as a comment opener, but three or more dashes do not.
fn is_comment_line(trimmed: &str) -> bool {
    const COMMENT_OPENERS: [&str; 9] = [
        "//", "#", "/*", "<!--", "<#", "* ", "*/", "rem ", "REM ",
    ];

    // No other opener begins with a dash, so a `--` line is decided here.
    if trimmed.starts_with("--") {
        return !trimmed.starts_with("---");
    }

    COMMENT_OPENERS
        .iter()
        .any(|opener| trimmed.starts_with(*opener))
}
454
455fn is_commented_assignment_line(trimmed: &str) -> bool {
456    let Some(comment_body) = strip_comment_prefix(trimmed) else {
457        return false;
458    };
459    let body = comment_body
460        .trim_start()
461        .trim_end_matches("*/")
462        .trim_end_matches("-->")
463        .trim();
464    has_assignment_operator(body) || has_yaml_mapping(body)
465}
466
/// Remove a leading comment opener from an already-trimmed line.
///
/// Returns the text after the opener, or `None` when the line does not start
/// with one. A line starting with three or more dashes is not a comment.
fn strip_comment_prefix(trimmed: &str) -> Option<&str> {
    const COMMENT_OPENERS: [&str; 9] = [
        "//", "#", "--", "/*", "<!--", "<#", "* ", "rem ", "REM ",
    ];

    // `---` would otherwise match the `--` opener; no other opener starts
    // with a dash, so such a line has no comment prefix at all.
    if trimmed.starts_with("---") {
        return None;
    }

    COMMENT_OPENERS
        .iter()
        .find_map(|opener| trimmed.strip_prefix(*opener))
}
488
489fn is_assignment_line(trimmed: &str) -> bool {
490    has_assignment_operator(trimmed) || has_yaml_mapping(trimmed)
491}
492
493pub(crate) fn has_assignment_operator(trimmed: &str) -> bool {
494    for operator in [":=", "->", "="] {
495        if let Some(pos) = trimmed.find(operator) {
496            if !is_comparison_operator(trimmed, pos, operator) {
497                return true;
498            }
499        }
500    }
501    false
502}
503
504fn has_yaml_mapping(trimmed: &str) -> bool {
505    memchr::memmem::find(trimmed.as_bytes(), b": ").is_some() && !trimmed.starts_with("- ")
506}
507
/// Report whether the `operator` found at byte `pos` of `trimmed` is really
/// part of a comparison rather than an assignment.
///
/// Only `=` can be a comparison: it is one when the character before it is
/// `=`, `!`, `>` or `<`, or when the character after it is `=`. `pos` must be
/// the byte offset of an occurrence of `operator` in `trimmed`.
fn is_comparison_operator(trimmed: &str, pos: usize, operator: &str) -> bool {
    if operator != "=" {
        return false;
    }

    let preceded_by_comparison = trimmed[..pos].ends_with(['=', '!', '>', '<']);
    let followed_by_equals = trimmed[pos + operator.len()..].starts_with('=');
    preceded_by_comparison || followed_by_equals
}
517
518fn is_in_encrypted_block(lines: &[&str], line_idx: usize) -> bool {
519    let start = line_idx.saturating_sub(ENCRYPTED_BLOCK_LOOKBACK_LINES);
520    lines
521        .iter()
522        .take(line_idx + 1)
523        .skip(start)
524        .any(|line| is_encrypted_marker_line(line.trim()))
525}
526
527fn is_in_test_function(lines: &[&str], line_idx: usize) -> bool {
528    let start = line_idx.saturating_sub(TEST_FUNCTION_LOOKBACK_LINES);
529    for candidate_line_idx in (start..line_idx).rev() {
530        let trimmed = lines[candidate_line_idx].trim();
531
532        if trimmed.starts_with("def test_")
533            || trimmed.starts_with("class Test")
534            || trimmed.starts_with("it(")
535            || trimmed.starts_with("describe(")
536            || trimmed.starts_with("test(")
537            || trimmed == "#[test]"
538            || trimmed == concat!("#[cfg(", "test)]")
539            || trimmed.starts_with("#[tokio::test")
540            || trimmed.starts_with("func Test")
541            || trimmed == "@Test"
542        {
543            return true;
544        }
545
546        // Stop looking back when we hit a non-test class or function boundary.
547        if trimmed.starts_with("class ") {
548            return false;
549        }
550
551        if (trimmed.starts_with("def ") || trimmed.starts_with("async def "))
552            && !trimmed.contains("def test_")
553        {
554            return false;
555        }
556
557        if trimmed.starts_with("func ") && !trimmed.contains("func Test") {
558            return false;
559        }
560
561        if (trimmed.starts_with("fn ")
562            || trimmed.starts_with("pub fn ")
563            || trimmed.starts_with("async fn ")
564            || trimmed.starts_with("pub async fn "))
565            && !trimmed.contains("fn test_")
566        {
567            let pre_start = candidate_line_idx.saturating_sub(3);
568            let mut is_test_attr = false;
569            for pre_line in &lines[pre_start..candidate_line_idx] {
570                let pre_trimmed = pre_line.trim();
571                if pre_trimmed == "#[test]"
572                    || pre_trimmed == concat!("#[cfg(", "test)]")
573                    || pre_trimmed.starts_with("#[tokio::test")
574                    || pre_trimmed.starts_with("#[test")
575                    || pre_trimmed == "@Test"
576                {
577                    is_test_attr = true;
578                    break;
579                }
580            }
581            if is_test_attr {
582                return true;
583            }
584            return false;
585        }
586
587        if trimmed.starts_with("function ") && !trimmed.contains("function test") {
588            return false;
589        }
590    }
591    false
592}
593
/// Return the slice of `text` around byte `offset`: the line containing the
/// offset plus up to `radius` lines on either side.
///
/// `offset` is clamped to the end of `text`. The walk in each direction stops
/// at the line limit or after `MAX_WINDOW_BYTES` bytes, whichever comes
/// first, and the result is snapped inward to UTF-8 character boundaries.
/// Empty input yields `""`.
pub(crate) fn surrounding_line_window(text: &str, offset: usize, radius: usize) -> &str {
    // Per-direction byte budget. Normal source lines are far shorter, so the
    // walk ends at a newline and the cap is never felt. It matters for
    // pathological input with no `\n` for kilobytes (minified bundles, one
    // giant run of credential-shaped tokens): an uncapped O(line_len) walk
    // per match made whole-file scans quadratic (a 164 KiB single-line file
    // with 8 K matches took ~18 s; a 656 KiB one timed out). The FP heuristics
    // fed by this window look for HTTP cache / CORS / integrity-hash /
    // renovate-digest *line* context, so 2 KiB each side covers any real
    // header line while keeping per-match substring scans cheap.
    const MAX_WINDOW_BYTES: usize = 2 * 1024;

    let raw = text.as_bytes();
    if raw.is_empty() {
        return "";
    }
    let anchor = offset.min(raw.len());

    // Walk left until `radius + 1` newlines have been crossed, the byte
    // budget is spent, or the start of the text is reached.
    let mut start = anchor;
    let mut newlines = 0usize;
    for &byte in raw[..anchor].iter().rev().take(MAX_WINDOW_BYTES) {
        start -= 1;
        if byte == b'\n' {
            newlines += 1;
            if newlines > radius {
                break;
            }
        }
    }
    // Step past the byte the walk stopped on (normally the newline that ends
    // the previous line), unless it stopped on a non-newline first byte.
    if start > 0 || raw[0] == b'\n' {
        start += 1;
    }

    // Walk right under the same limits; the newline that ends the window is
    // kept.
    let mut end = anchor;
    let mut newlines = 0usize;
    for &byte in raw[anchor..].iter().take(MAX_WINDOW_BYTES) {
        end += 1;
        if byte == b'\n' {
            newlines += 1;
            if newlines > radius {
                break;
            }
        }
    }

    // A byte-budget cut can land inside a multi-byte character; shrink the
    // window until both edges sit on character boundaries.
    while start < text.len() && !text.is_char_boundary(start) {
        start += 1;
    }
    while end > start && !text.is_char_boundary(end) {
        end -= 1;
    }
    &text[start..end]
}
keyhog_scanner/context/inference.rs

keyhog_scanner/context/
inference.rs