keyhog_scanner/
context.rs

1//! Structural context analysis: understand WHERE in code a potential secret appears.
2//!
3//! Instead of treating code as flat text, we infer the structural context of
4//! each match (assignment, comment, test code, encrypted block, documentation)
5//! and adjust confidence accordingly. Not an AST parser — just fast,
6//! language-agnostic structural inference.
7
// Confidence multipliers, one per `CodeContext` variant; returned by
// `CodeContext::confidence_multiplier`.
const ASSIGNMENT_CONFIDENCE_MULTIPLIER: f64 = 1.0;
const STRING_LITERAL_CONFIDENCE_MULTIPLIER: f64 = 0.9;
const UNKNOWN_CONFIDENCE_MULTIPLIER: f64 = 0.8;
const DOCUMENTATION_CONFIDENCE_MULTIPLIER: f64 = 0.3;
const COMMENT_CONFIDENCE_MULTIPLIER: f64 = 0.4;
const TEST_CODE_CONFIDENCE_MULTIPLIER: f64 = 0.3;
const ENCRYPTED_CONFIDENCE_MULTIPLIER: f64 = 0.05;
// Byte length of the `test_` file-stem prefix compared in `is_test_file`.
const TEST_PREFIX_LEN: usize = 5;

// How many preceding lines `is_in_encrypted_block` scans for an encryption marker.
const ENCRYPTED_BLOCK_LOOKBACK_LINES: usize = 10;
// How many preceding lines `is_in_test_function` scans for a test definition.
const TEST_FUNCTION_LOOKBACK_LINES: usize = 30;
// `documentation_line_flags` toggles docstring state when a line's triple-quote
// count modulo REMAINDER equals MATCH, i.e. when the count is odd.
const DOCSTRING_TOGGLE_REMAINDER: usize = 2;
const DOCSTRING_TOGGLE_MATCH: usize = 1;
21
/// The structural context of a code location.
///
/// The enum is a plain field-less tag, so it is `Copy` and supports full
/// equality and hashing (usable as a `HashMap`/`HashSet` key).
///
/// # Examples
///
/// ```rust
/// use keyhog_scanner::context::CodeContext;
///
/// assert!(matches!(CodeContext::Assignment, CodeContext::Assignment));
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CodeContext {
    /// Direct assignment: key = value, key: value, KEY=value
    Assignment,
    /// Inside a comment (// # /* -- etc.)
    Comment,
    /// Inside a test function or test file
    TestCode,
    /// Inside an encrypted/sealed block
    Encrypted,
    /// Inside documentation (docstring, markdown code fence)
    Documentation,
    /// Inside a string literal (normal code)
    StringLiteral,
    /// Unknown / unstructured context
    Unknown,
}
48
49impl CodeContext {
50    /// Confidence multiplier for this context.
51    /// Assignment = boost. Test/comment/encrypted = reduce.
52    ///
53    /// # Examples
54    ///
55    /// ```rust
56    /// use keyhog_scanner::context::CodeContext;
57    ///
58    /// assert!(CodeContext::Documentation.confidence_multiplier() < 1.0);
59    /// ```
60    pub fn confidence_multiplier(&self) -> f64 {
61        match self {
62            Self::Assignment => ASSIGNMENT_CONFIDENCE_MULTIPLIER,
63            Self::StringLiteral => STRING_LITERAL_CONFIDENCE_MULTIPLIER,
64            Self::Unknown => UNKNOWN_CONFIDENCE_MULTIPLIER,
65            Self::Documentation => DOCUMENTATION_CONFIDENCE_MULTIPLIER,
66            Self::Comment => COMMENT_CONFIDENCE_MULTIPLIER,
67            Self::TestCode => TEST_CODE_CONFIDENCE_MULTIPLIER,
68            Self::Encrypted => ENCRYPTED_CONFIDENCE_MULTIPLIER,
69        }
70    }
71}
72
73/// Infer the structural context of a match at a given line.
74///
75/// # Examples
76///
77/// ```rust
78/// use keyhog_scanner::context::{CodeContext, infer_context};
79///
80/// let lines = vec!["API_KEY=demo_ABC12345"];
81/// assert!(matches!(infer_context(&lines, 0, Some(".env")), CodeContext::Assignment));
82/// ```
83pub fn infer_context(lines: &[&str], line_idx: usize, file_path: Option<&str>) -> CodeContext {
84    let documentation_lines = documentation_line_flags(lines);
85    infer_context_with_documentation(lines, line_idx, file_path, &documentation_lines)
86}
87
88/// Returns true if the match is in a context that indicates a false positive (lockfile, regex def, etc).
89///
90/// # Examples
91///
92/// ```rust
93/// use keyhog_scanner::context::is_false_positive_match_context;
94///
95/// assert!(is_false_positive_match_context("sha256-integrity abcdef", 0, Some("package-lock.json")));
96/// ```
97pub fn is_false_positive_match_context(
98    text: &str,
99    match_start: usize,
100    file_path: Option<&str>,
101) -> bool {
102    let window = surrounding_line_window(text, match_start, 1);
103    let lower = window.to_ascii_lowercase();
104    let path_lower = file_path.map(str::to_ascii_lowercase);
105
106    is_go_sum_checksum(&lower, path_lower.as_deref())
107        || is_integrity_hash(&lower)
108        || is_configmap_binary_data(&lower)
109        || is_git_lfs_pointer_context(&lower)
110        || is_renovate_digest_context(&lower)
111        || is_cors_header(&lower)
112        || is_http_cache_header(&lower)
113}
114
115/// Known example/documentation credentials that are intentionally public and
116/// should never be flagged. These are published in official vendor docs, SDKs,
117/// and test suites. Every major scanner (TruffleHog, Gitleaks) suppresses them.
118///
119/// Matching is exact and case-sensitive for prefixed keys (AWS, GitHub, Stripe)
120/// and case-insensitive for hex hashes. This is a credential-value check, not a
121/// context check — it runs before expensive context analysis.
122///
123/// # Examples
124///
125/// ```rust
126/// use keyhog_scanner::context::is_known_example_credential;
127///
128/// assert!(is_known_example_credential("AKIAIOSFODNN7EXAMPLE"));
129/// ```
130pub fn is_known_example_credential(credential: &str) -> bool {
131    // AWS official example keys from documentation
132    if credential == "AKIAIOSFODNN7EXAMPLE"
133        || credential == "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
134    {
135        return true;
136    }
137
138    // Stripe test-mode keys from docs (always start with sk_test_ or pk_test_
139    // followed by a known example suffix)
140    if credential == "sk_test_FAKE"
141        || credential == "pk_test_FAKE"
142        || credential == "sk_test_FAKE_2"
143        || credential == "sk_test_FAKE_1"
144    {
145        return true;
146    }
147
148    // GitHub official example PATs from docs
149    if credential == "ghp_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdef01"
150        || credential == "ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
151        || credential == "ghp_1234567890abcdefghij1234567890abcdef"
152        || credential == "ghp_1234567890abcdefghij1234567890abcdefgh"
153        || credential
154            == "github_pat_11AAAAAA0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
155    {
156        return true;
157    }
158
159    // Suffix-based: keys ending in EXAMPLE, example, _test, _sample, _demo
160    if credential.ends_with("EXAMPLE")
161        || credential.ends_with("EXAMPLEKEY")
162        || credential.ends_with("example")
163    {
164        return true;
165    }
166
167    // All-x placeholders (any prefix)
168    {
169        let body = credential.as_bytes();
170        let x_count = body.iter().filter(|&&b| b == b'x' || b == b'X').count();
171        if body.len() >= 16 && x_count > body.len() * 3 / 4 {
172            return true;
173        }
174    }
175
176    // Hex-sequential placeholders: a1b2c3d4e5f6..., 8f3a9b2c1d4e...
177    // These are commonly used in test fixtures and documentation.
178    // Detect by checking if the credential body (minus prefix) is entirely hex
179    // and consists of incrementing hex nibble pairs.
180    if is_hex_sequential_placeholder(credential) {
181        return true;
182    }
183
184    // Well-known hashes that appear everywhere (case-insensitive)
185    let lower = credential.to_ascii_lowercase();
186    // MD5 of empty string
187    if lower == "d41d8cd98f00b204e9800998ecf8427e" {
188        return true;
189    }
190    // SHA-256 of "password"
191    if lower == "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8" {
192        return true;
193    }
194    // SHA-1 of empty string
195    if lower == "da39a3ee5e6b4b0d3255bfef95601890afd80709" {
196        return true;
197    }
198    // SHA-256 of empty string
199    if lower == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" {
200        return true;
201    }
202
203    // Well-known JWT from jwt.io documentation
204    if credential.starts_with("eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiw") {
205        return true;
206    }
207    if credential.starts_with("eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0") {
208        return true;
209    }
210
211    // Sequential/placeholder pattern: strip prefix, check if remainder is sequential hex/alphanum
212    if is_sequential_placeholder(credential) {
213        return true;
214    }
215
216    false
217}
218
/// Detect placeholder credentials whose secret part is pure repetition.
///
/// The first matching well-known key prefix (`ghp_`, `sk-proj-`, `AKIA`,
/// `0x`, ...) is stripped; the remainder must be at least 16 bytes long and
/// consist either of a single repeated byte (`xxxxxxxx...`) or of its first
/// two bytes repeated (`5E5E5E...`, a dangling final byte is ignored).
fn is_sequential_placeholder(credential: &str) -> bool {
    // Tried in order; the first prefix that matches is the one removed, so
    // longer prefixes must precede their shorter forms (`sk-proj-` before `sk-`).
    const KNOWN_PREFIXES: [&str; 15] = [
        "ghp_",
        "gho_",
        "ghs_",
        "ghu_",
        "github_pat_",
        "sk-proj-",
        "sk-",
        "sk_test_",
        "sk_live_",
        "pk_test_",
        "pk_live_",
        "AKIA",
        "xoxb-",
        "xoxp-",
        "0x",
    ];
    const MIN_SECRET_LEN: usize = 16;

    let secret = KNOWN_PREFIXES
        .iter()
        .find_map(|&prefix| credential.strip_prefix(prefix))
        .unwrap_or(credential)
        .as_bytes();

    if secret.len() < MIN_SECRET_LEN {
        return false;
    }

    // One byte repeated throughout.
    let first = secret[0];
    let single_byte_run = !secret.iter().any(|&byte| byte != first);

    // The leading two bytes repeated; `chunks_exact` skips an odd trailing byte.
    let unit = &secret[..2];
    let two_byte_period = secret.chunks_exact(2).all(|chunk| chunk == unit);

    single_byte_run || two_byte_period
}
264
/// Detect hex filler such as `a1b2c3d4e5f6a7b8...` used in fixtures and docs.
///
/// After stripping the first matching prefix (`sk-proj-`, `sk-`, `ghp_`,
/// `0x`), the body must be at least 16 ASCII hex digits. The body is read as
/// two-digit pairs (a dangling final digit is ignored) and the function
/// returns true when, for either the first or the second digit of each pair,
/// more than three quarters (relative to the pair count) of consecutive pairs
/// step upward by one, allowing a few wrap-arounds.
fn is_hex_sequential_placeholder(credential: &str) -> bool {
    const STRIPPABLE_PREFIXES: [&str; 4] = ["sk-proj-", "sk-", "ghp_", "0x"];
    const MIN_BODY_LEN: usize = 16;
    const MIN_PAIRS: usize = 8;

    let digits = STRIPPABLE_PREFIXES
        .iter()
        .find_map(|&prefix| credential.strip_prefix(prefix))
        .unwrap_or(credential)
        .as_bytes();

    if digits.len() < MIN_BODY_LEN || !digits.iter().all(|byte| byte.is_ascii_hexdigit()) {
        return false;
    }

    let pair_count = digits.len() / 2;
    if pair_count < MIN_PAIRS {
        return false;
    }

    // Count upward steps between consecutive pairs, looking at digit `slot`
    // (0 = first, 1 = second) of every pair. `after_f` is the digit accepted
    // as the wrap-around successor of `f` for that slot.
    let count_steps = |slot: usize, after_f: u8| -> usize {
        let column: Vec<u8> = digits
            .chunks_exact(2)
            .map(|pair| pair[slot].to_ascii_lowercase())
            .collect();
        column
            .windows(2)
            .filter(|step| {
                let (prev, next) = (step[0], step[1]);
                next == prev + 1
                    || (prev == b'f' && next == after_f)
                    || (prev == b'9' && (next == b'a' || next == b'0'))
            })
            .count()
    };

    // Strictly more than 3/4 so that ordinary hex hashes are not suppressed.
    let threshold = pair_count * 3 / 4;
    count_steps(0, b'a') > threshold || count_steps(1, b'0') > threshold
}
321
322/// Returns true if the match at the given line should be suppressed as a false positive.
323/// Check whether a line-level match sits in known false-positive context.
324///
325/// # Examples
326///
327/// ```rust
328/// use keyhog_scanner::context::is_false_positive_context;
329///
330/// let lines = vec!["version https://git-lfs.github.com/spec/v1"];
331/// assert!(is_false_positive_context(&lines, 0, Some(".gitattributes")));
332/// ```
333pub fn is_false_positive_context(lines: &[&str], line_idx: usize, file_path: Option<&str>) -> bool {
334    let path_lower = file_path.map(str::to_ascii_lowercase);
335    is_false_positive_context_with_path(lines, line_idx, path_lower.as_deref())
336}
337
338/// Same as `is_false_positive_context` but accepts a pre-lowered path to avoid
339/// re-allocating for every match in the same chunk.
340/// Variant of [`is_false_positive_context`] that accepts an owned path hint.
341///
342/// # Examples
343///
344/// ```rust
345/// use keyhog_scanner::context::is_false_positive_context_with_path;
346///
347/// let lines = vec!["version https://git-lfs.github.com/spec/v1"];
348/// assert!(is_false_positive_context_with_path(&lines, 0, Some(".gitattributes")));
349/// ```
350pub fn is_false_positive_context_with_path(
351    lines: &[&str],
352    line_idx: usize,
353    path_lower: Option<&str>,
354) -> bool {
355    if line_idx >= lines.len() {
356        return false;
357    }
358
359    let line = lines[line_idx];
360    let lower = line.to_ascii_lowercase();
361
362    is_go_sum_checksum(&lower, path_lower)
363        || is_integrity_hash_context(lines, line_idx, &lower)
364        || is_configmap_binary_data_context(lines, line_idx, &lower)
365        || is_git_lfs_pointer_context_with_lines(lines, line_idx, &lower)
366        || is_renovate_digest_context_with_lines(lines, line_idx, &lower)
367        || is_cors_header(&lower)
368        || is_http_cache_header_context(lines, line_idx, &lower)
369}
370
371/// Infer the structural context of a match, considering documentation blocks.
372/// Infer context when documentation-line flags have already been computed.
373///
374/// # Examples
375///
376/// ```rust
377/// use keyhog_scanner::context::{CodeContext, documentation_line_flags, infer_context_with_documentation};
378///
379/// let lines = vec!["API_KEY=demo_ABC12345"];
380/// let docs = documentation_line_flags(&lines);
381/// assert!(matches!(infer_context_with_documentation(&lines, 0, Some(".env"), &docs), CodeContext::Assignment));
382/// ```
383pub fn infer_context_with_documentation(
384    lines: &[&str],
385    line_idx: usize,
386    file_path: Option<&str>,
387    documentation_lines: &[bool],
388) -> CodeContext {
389    if line_idx >= lines.len() {
390        return CodeContext::Unknown;
391    }
392
393    let line = lines[line_idx];
394    let trimmed = line.trim();
395
396    if file_path.is_some_and(is_test_file) {
397        return CodeContext::TestCode;
398    }
399
400    if is_in_encrypted_block(lines, line_idx) {
401        return CodeContext::Encrypted;
402    }
403
404    if is_comment_line(trimmed) {
405        return CodeContext::Comment;
406    }
407
408    if documentation_lines.get(line_idx).copied().unwrap_or(false) {
409        return CodeContext::Documentation;
410    }
411
412    if is_in_test_function(lines, line_idx) {
413        return CodeContext::TestCode;
414    }
415
416    if is_assignment_line(trimmed) {
417        return CodeContext::Assignment;
418    }
419
420    infer_default_context(trimmed)
421}
422
423/// Pre-compute which lines are inside documentation blocks (markdown fences, docstrings).
424/// Mark lines that appear to be documentation or docstrings.
425///
426/// # Examples
427///
428/// ```rust
429/// use keyhog_scanner::context::documentation_line_flags;
430///
431/// let flags = documentation_line_flags(&["# Example", "token = value"]);
432/// assert_eq!(flags.len(), 2);
433/// ```
434pub fn documentation_line_flags(lines: &[&str]) -> Vec<bool> {
435    let mut flags = vec![false; lines.len()];
436    let mut in_markdown_code_block = false;
437    let mut in_docstring = false;
438
439    for (idx, line) in lines.iter().enumerate() {
440        let trimmed = line.trim();
441        let is_fence = trimmed.starts_with("```");
442        let triple_count = trimmed.matches("\"\"\"").count() + trimmed.matches("'''").count();
443        let toggles_docstring = triple_count % DOCSTRING_TOGGLE_REMAINDER == DOCSTRING_TOGGLE_MATCH;
444
445        if is_fence || in_markdown_code_block || in_docstring {
446            flags[idx] = true;
447        }
448
449        if is_fence {
450            in_markdown_code_block = !in_markdown_code_block;
451        }
452        if toggles_docstring {
453            in_docstring = !in_docstring;
454        }
455    }
456
457    flags
458}
459
460fn is_test_file(path: &str) -> bool {
461    // Extract filename without allocation
462    let filename = path.rsplit('/').next().unwrap_or(path);
463    let stem = filename.split('.').next().unwrap_or(filename);
464
465    // Safe prefix/suffix checks — `starts_with` and `ends_with` never panic
466    // on multi-byte UTF-8, unlike byte-index slicing.
467    stem.eq_ignore_ascii_case("test")
468        || stem.len() > TEST_PREFIX_LEN
469            && stem
470                .as_bytes()
471                .get(..TEST_PREFIX_LEN)
472                .is_some_and(|b| b.eq_ignore_ascii_case(b"test_"))
473        || filename.ends_with("_test.go")
474        || filename.ends_with("_test.rs")
475        || filename.ends_with("_test.py")
476        || filename.ends_with("_test.rb")
477        || filename.ends_with(".test.js")
478        || filename.ends_with(".test.ts")
479        || filename.ends_with(".spec.js")
480        || filename.ends_with(".spec.ts")
481        || path.split('/').any(|component| {
482            component.eq_ignore_ascii_case("test")
483                || component.eq_ignore_ascii_case("tests")
484                || component.eq_ignore_ascii_case("__tests__")
485                || component.eq_ignore_ascii_case("fixtures")
486                || component.eq_ignore_ascii_case("testdata")
487                || component.eq_ignore_ascii_case("spec")
488        })
489}
490
491fn infer_default_context(trimmed: &str) -> CodeContext {
492    if memchr::memchr(b'"', trimmed.as_bytes()).is_some()
493        || memchr::memchr(b'\'', trimmed.as_bytes()).is_some()
494    {
495        CodeContext::StringLiteral
496    } else {
497        CodeContext::Unknown
498    }
499}
500
501fn is_go_sum_checksum(lower: &str, path_lower: Option<&str>) -> bool {
502    memchr::memmem::find(lower.as_bytes(), b"h1:").is_some()
503        || path_lower.is_some_and(|path| path.ends_with(".sum"))
504}
505
506fn is_integrity_hash_context(lines: &[&str], line_idx: usize, lower: &str) -> bool {
507    is_integrity_hash(lower)
508        || surrounding_lines_contain(lines, line_idx, 2, |candidate| {
509            is_integrity_hash(&candidate.to_ascii_lowercase())
510        })
511}
512
513fn is_integrity_hash(lower: &str) -> bool {
514    memchr::memmem::find(lower.as_bytes(), b"integrity").is_some()
515        && (memchr::memmem::find(lower.as_bytes(), b"sha256-").is_some()
516            || memchr::memmem::find(lower.as_bytes(), b"sha512-").is_some())
517}
518
519fn is_configmap_binary_data_context(lines: &[&str], line_idx: usize, lower: &str) -> bool {
520    is_configmap_binary_data(lower)
521        || nearby_lines_contain(lines, line_idx, 8, |candidate| {
522            let candidate = candidate.trim().to_ascii_lowercase();
523            is_configmap_binary_data(&candidate)
524        })
525}
526
527fn is_configmap_binary_data(lower: &str) -> bool {
528    memchr::memmem::find(lower.as_bytes(), b"binarydata:").is_some()
529}
530
531fn is_git_lfs_pointer_context_with_lines(lines: &[&str], line_idx: usize, lower: &str) -> bool {
532    is_git_lfs_pointer_context(lower)
533        || nearby_lines_contain(lines, line_idx, 3, |candidate| {
534            is_git_lfs_pointer_context(&candidate.to_ascii_lowercase())
535        })
536}
537
538fn is_git_lfs_pointer_context(lower: &str) -> bool {
539    memchr::memmem::find(lower.as_bytes(), b"oid sha256:").is_some()
540        || memchr::memmem::find(lower.as_bytes(), b"git-lfs").is_some()
541}
542
543fn is_renovate_digest_context_with_lines(lines: &[&str], line_idx: usize, lower: &str) -> bool {
544    is_renovate_digest_context(lower)
545        || surrounding_lines_contain(lines, line_idx, 2, |candidate| {
546            is_renovate_digest_context(&candidate.to_ascii_lowercase())
547        })
548}
549
550fn is_renovate_digest_context(lower: &str) -> bool {
551    memchr::memmem::find(lower.as_bytes(), b"renovate/").is_some() && contains_hex_sequence(lower)
552}
553
554fn is_cors_header(lower: &str) -> bool {
555    memchr::memmem::find(lower.as_bytes(), b"access-control-").is_some()
556}
557
558fn is_http_cache_header_context(lines: &[&str], line_idx: usize, lower: &str) -> bool {
559    is_http_cache_header(lower)
560        || surrounding_lines_contain(lines, line_idx, 1, |candidate| {
561            is_http_cache_header(&candidate.to_ascii_lowercase())
562        })
563}
564
565fn is_http_cache_header(lower: &str) -> bool {
566    memchr::memmem::find(lower.as_bytes(), b"etag:").is_some()
567        || lower.trim_start().starts_with("etag")
568        || memchr::memmem::find(lower.as_bytes(), b" etag").is_some()
569        || memchr::memmem::find(lower.as_bytes(), b"\"etag\"").is_some()
570}
571
/// True when the text contains at least eight consecutive ASCII hex digits.
fn contains_hex_sequence(lower: &str) -> bool {
    const MIN_RUN: usize = 8;

    // Split on every non-hex byte; bytes of multi-byte characters are never
    // ASCII hex digits, so they break runs exactly like any other separator.
    lower
        .as_bytes()
        .split(|byte| !byte.is_ascii_hexdigit())
        .any(|run| run.len() >= MIN_RUN)
}
586
/// Test `predicate` against the line at `line_idx` and up to `lookback_lines`
/// lines before it; lines after `line_idx` are never examined.
///
/// Returns true as soon as one line satisfies the predicate. Indices beyond
/// the end of `lines` simply contribute nothing.
fn nearby_lines_contain(
    lines: &[&str],
    line_idx: usize,
    lookback_lines: usize,
    predicate: impl Fn(&str) -> bool,
) -> bool {
    let end = lines.len().min(line_idx + 1);
    let first = line_idx.saturating_sub(lookback_lines).min(end);

    for &line in &lines[first..end] {
        if predicate(line) {
            return true;
        }
    }
    false
}
601
/// Test `predicate` against the line at `line_idx` and up to `radius` lines
/// on either side of it.
///
/// Returns true as soon as one line in the window satisfies the predicate.
/// The window is clamped to `lines`: a `line_idx` past the end (or an empty
/// `lines`) yields an empty window and therefore `false`, instead of
/// panicking on an inverted slice range, and very large `line_idx`/`radius`
/// values saturate rather than overflow.
fn surrounding_lines_contain(
    lines: &[&str],
    line_idx: usize,
    radius: usize,
    predicate: impl Fn(&str) -> bool,
) -> bool {
    let end = line_idx
        .saturating_add(radius)
        .saturating_add(1)
        .min(lines.len());
    // Clamp to `end` so that start <= end always holds for the slice below.
    let start = line_idx.saturating_sub(radius).min(end);
    lines[start..end].iter().copied().any(predicate)
}
612
613fn surrounding_line_window(text: &str, offset: usize, radius: usize) -> String {
614    let safe_offset = offset.min(text.len());
615    let line_idx = memchr::memchr_iter(b'\n', &text.as_bytes()[..safe_offset]).count();
616    let lines: Vec<&str> = text.lines().collect();
617    if lines.is_empty() {
618        return String::new();
619    }
620
621    let start = line_idx.saturating_sub(radius);
622    let end = (line_idx + radius + 1).min(lines.len());
623    lines[start..end].join("\n")
624}
625
/// True when the trimmed line begins with a recognised comment marker.
///
/// Recognised openers: `//`, `#`, `/*`, `<!--`, `<#`, `rem `/`REM `, the
/// block-comment continuations `* ` and `*/`, and `--` unless the line starts
/// with `---`.
fn is_comment_line(trimmed: &str) -> bool {
    // A bare "*" is deliberately absent: it would also match Markdown list
    // items, shell globs, and Makefile rules.
    const COMMENT_OPENERS: [&str; 9] = [
        "//", "#", "/*", "<!--", "<#", "* ", "*/", "rem ", "REM ",
    ];

    if COMMENT_OPENERS
        .iter()
        .any(|&opener| trimmed.starts_with(opener))
    {
        return true;
    }

    // Double dash counts, triple dash does not.
    trimmed.starts_with("--") && !trimmed.starts_with("---")
}
639
640fn is_assignment_line(trimmed: &str) -> bool {
641    has_assignment_operator(trimmed) || has_yaml_mapping(trimmed)
642}
643
644fn has_assignment_operator(trimmed: &str) -> bool {
645    for operator in [":=", "->", "="] {
646        if let Some(pos) = trimmed.find(operator)
647            && !is_comparison_operator(trimmed, pos, operator)
648        {
649            return true;
650        }
651    }
652    false
653}
654
655fn has_yaml_mapping(trimmed: &str) -> bool {
656    memchr::memmem::find(trimmed.as_bytes(), b": ").is_some() && !trimmed.starts_with("- ")
657}
658
/// Decide whether the `=` found at byte `pos` belongs to a comparison
/// (`==`, `!=`, `>=`, `<=`) rather than an assignment.
///
/// Any operator other than a lone `=` is never a comparison. `pos` must be
/// the byte index at which `operator` starts inside `trimmed`.
fn is_comparison_operator(trimmed: &str, pos: usize, operator: &str) -> bool {
    if operator != "=" {
        return false;
    }

    let (left, rest) = trimmed.split_at(pos);
    let right = &rest[operator.len()..];

    let preceded_by_operator_char =
        matches!(left.chars().next_back(), Some('=' | '!' | '>' | '<'));
    let followed_by_equals = right.starts_with('=');

    preceded_by_operator_char || followed_by_equals
}
668
669fn is_in_encrypted_block(lines: &[&str], line_idx: usize) -> bool {
670    // Look back up to 10 lines for encryption markers.
671    let start = line_idx.saturating_sub(ENCRYPTED_BLOCK_LOOKBACK_LINES);
672    for line in lines.iter().take(line_idx + 1).skip(start) {
673        let trimmed = line.trim();
674        if trimmed.starts_with("$ANSIBLE_VAULT")
675            || trimmed.starts_with("ENC[")
676            || memchr::memmem::find(trimmed.as_bytes(), b"sops:").is_some()
677            || memchr::memmem::find(trimmed.as_bytes(), b"sealed-secrets").is_some()
678            || trimmed.starts_with("-----BEGIN PGP MESSAGE-----")
679            || trimmed.starts_with("-----BEGIN AGE ENCRYPTED")
680        {
681            return true;
682        }
683    }
684    false
685}
686
687fn is_in_test_function(lines: &[&str], line_idx: usize) -> bool {
688    // Look back for test function definition.
689    let start = line_idx.saturating_sub(TEST_FUNCTION_LOOKBACK_LINES);
690    for candidate_line_idx in (start..line_idx).rev() {
691        let trimmed = lines[candidate_line_idx].trim();
692
693        // Python: def test_*, class Test*
694        if trimmed.starts_with("def test_") || trimmed.starts_with("class Test") {
695            return true;
696        }
697        // JavaScript: it(', describe(', test('
698        if trimmed.starts_with("it(")
699            || trimmed.starts_with("describe(")
700            || trimmed.starts_with("test(")
701        {
702            return true;
703        }
704        // Rust: #[test], #[cfg(test)]
705        if trimmed == "#[test]" || trimmed == "#[cfg(test)]" {
706            return true;
707        }
708        // Go: func Test*
709        if trimmed.starts_with("func Test") {
710            return true;
711        }
712        // Java: @Test
713        if trimmed == "@Test" {
714            return true;
715        }
716        // If we hit a non-test function definition, stop looking.
717        if (trimmed.starts_with("def ")
718            || trimmed.starts_with("func ")
719            || trimmed.starts_with("fn ")
720            || trimmed.starts_with("function "))
721            && memchr::memmem::find(trimmed.as_bytes(), b"test").is_none()
722        {
723            return false;
724        }
725    }
726    false
727}
728
// Unit tests: one case per `CodeContext` classification rule, an ordering
// check on the multipliers, and one case per false-positive predicate.
#[cfg(test)]
mod tests {
    use super::*;

    // A plain `KEY = value` line with no path hint is an assignment.
    #[test]
    fn assignment_context() {
        let lines = vec!["API_KEY = sk-proj-abc123"];
        assert_eq!(infer_context(&lines, 0, None), CodeContext::Assignment);
    }

    // A leading `#` wins over the `: ` mapping that follows it.
    #[test]
    fn comment_context() {
        let lines = vec!["# old key: sk-proj-abc123"];
        assert_eq!(infer_context(&lines, 0, None), CodeContext::Comment);
    }

    // The path check runs first, so an assignment in a test file is TestCode.
    #[test]
    fn test_file_context() {
        let lines = vec!["key = sk-proj-abc123"];
        assert_eq!(
            infer_context(&lines, 0, Some("tests/test_auth.py")),
            CodeContext::TestCode
        );
    }

    // The vault header on the previous line is within the lookback window.
    #[test]
    fn encrypted_block_context() {
        let lines = vec!["$ANSIBLE_VAULT;1.1;AES256", "6162636465666768"];
        assert_eq!(infer_context(&lines, 1, None), CodeContext::Encrypted);
    }

    // A line between two markdown fences is documentation.
    #[test]
    fn documentation_context() {
        let lines = vec![
            "```bash",
            "curl -H 'Authorization: Bearer sk-proj-abc'",
            "```",
        ];
        assert_eq!(infer_context(&lines, 1, None), CodeContext::Documentation);
    }

    // `def test_` on the preceding line marks the body as test code.
    #[test]
    fn test_function_context() {
        let lines = vec![
            "def test_api_call():",
            "    key = 'sk-proj-abc123'",
            "    assert call(key)",
        ];
        assert_eq!(infer_context(&lines, 1, None), CodeContext::TestCode);
    }

    // Relative ordering only; exact multiplier values are not pinned here.
    #[test]
    fn confidence_multipliers() {
        assert!(
            CodeContext::Assignment.confidence_multiplier()
                > CodeContext::Comment.confidence_multiplier()
        );
        assert!(
            CodeContext::Comment.confidence_multiplier()
                > CodeContext::Encrypted.confidence_multiplier()
        );
        assert!(
            CodeContext::TestCode.confidence_multiplier()
                < CodeContext::Assignment.confidence_multiplier()
        );
    }

    // Both the `h1:` marker and the `.sum` path apply to this input.
    #[test]
    fn false_positive_context_detects_go_sum() {
        let lines = vec!["github.com/example/module v1.0.0 h1:AKIAIOSFODNN7EXAMPLEabc"];
        assert!(is_false_positive_context(&lines, 0, Some("deps/go.sum")));
    }

    // `binaryData:` sits one line above the match, inside the lookback window.
    #[test]
    fn false_positive_context_detects_configmap_binary_data_block() {
        let lines = vec![
            "kind: ConfigMap",
            "binaryData:",
            "  cert-fingerprint-sha256: Z2hwX2FiYw==",
        ];
        assert!(is_false_positive_context(&lines, 2, None));
    }

    // The matched line itself carries the `oid sha256:` marker.
    #[test]
    fn false_positive_context_detects_git_lfs_pointer() {
        let lines = vec![
            "version https://git-lfs.github.com/spec/v1",
            "oid sha256:sk-proj-abcdefghijklmnopqrstuvwxyz123456",
        ];
        assert!(is_false_positive_context(&lines, 1, None));
    }

    // `integrity` plus a `sha512-` prefix on the same line.
    #[test]
    fn false_positive_context_detects_integrity_hash() {
        let lines = vec!["integrity sha512-sk-proj-abcdefghijklmnopqrstuvwxyz123456"];
        assert!(is_false_positive_context(&lines, 0, None));
    }

    // No `h1:` marker here; only the `.sum` path extension triggers the match.
    #[test]
    fn false_positive_context_detects_sum_file_path() {
        let lines = vec!["github.com/example/module v1.0.0 checksum"];
        assert!(is_false_positive_context(
            &lines,
            0,
            Some("deps/vendor.sum")
        ));
    }

    // `renovate/` together with a run of at least eight hex digits.
    #[test]
    fn false_positive_context_detects_renovate_digest() {
        let lines = vec![r#""branchName": "renovate/node-8f3a9b2c1d4e5f60""#];
        assert!(is_false_positive_context(&lines, 0, None));
    }

    // Header names are matched after lower-casing the line.
    #[test]
    fn false_positive_context_detects_cors_header() {
        let lines = vec!["Access-Control-Allow-Headers: Authorization, X-API-Key"];
        assert!(is_false_positive_context(&lines, 0, None));
    }

    // The line starts with `ETag:` once lower-cased.
    #[test]
    fn false_positive_context_detects_http_cache_header() {
        let lines = vec![r#"ETag: W/"xoxb-8f3a9b2c1d4e5f60718293a4b5c6d7e8f9a0b""#];
        assert!(is_false_positive_context(&lines, 0, None));
    }
}
keyhog_scanner/context.rs

keyhog_scanner/
context.rs