//! keyhog-scanner 0.5.37
//!
//! keyhog-scanner: high-performance SIMD-accelerated secret detection engine.
use regex::Regex;
use std::sync::LazyLock;

/// Largest input, in bytes (2 MiB), that multiline preprocessing will accept.
/// `should_passthrough` returns `true` for any text longer than this.
const MAX_MULTILINE_PREPROCESS_BYTES: usize = 2 * 1024 * 1024;
/// Longest single line, in bytes (64 KiB), that multiline preprocessing will
/// accept. `should_passthrough` returns `true` if any line exceeds this.
const MAX_MULTILINE_LINE_BYTES: usize = 64 * 1024;

/// A mapping from an offset in the joined text back to the original line number.
///
/// Each mapping describes the half-open byte range
/// `[start_offset, end_offset)` of the preprocessed text and the source line
/// that range is attributed to.
#[derive(Debug, Clone)]
pub struct LineMapping {
    /// Start offset in the joined text (inclusive), in bytes.
    pub start_offset: usize,
    /// End offset in the joined text (exclusive), in bytes.
    pub end_offset: usize,
    /// Original line number (1-indexed).
    pub line_number: usize,
}

/// Result of preprocessing text for multi-line concatenation.
///
/// Use [`PreprocessedText::line_for_offset`] to translate a byte offset in
/// `text` back to a line number of the original input.
#[derive(Debug, Clone)]
pub struct PreprocessedText {
    /// Original text plus appended multiline-joined segments.
    pub text: String,
    /// Byte offset where appended joined segments start.
    /// For a passthrough result this equals `text.len()` (nothing appended).
    pub original_end: usize,
    /// Mapping from offsets in `text` to original line numbers.
    /// `line_for_offset` binary-searches this vector, so it must be kept
    /// sorted by `start_offset`.
    pub mappings: Vec<LineMapping>,
}

impl PreprocessedText {
    /// Map a byte offset in preprocessed text back to an original line number.
    ///
    /// Mappings are stored in `start_offset`-sorted, contiguous order
    /// (the preprocessor appends them as it walks the input), so a
    /// `partition_point` binary search resolves the lookup in
    /// `O(log L)` instead of the prior `O(L)` linear scan. On a
    /// 10 000-line file with ~100 matches that's 10 000 × 100 = 1 M
    /// pointer compares cut to ~1 400.
    pub fn line_for_offset(&self, offset: usize) -> Option<usize> {
        let idx = self.mappings.partition_point(|m| m.start_offset <= offset);
        if idx == 0 {
            return None;
        }
        let m = &self.mappings[idx - 1];
        if offset < m.end_offset {
            Some(m.line_number)
        } else {
            None
        }
    }

    /// Build a preprocessed representation with a one-line identity mapping.
    pub fn passthrough(text: &str) -> Self {
        let mut mappings = Vec::new();
        let mut offset = 0;
        for (line_idx, line) in text.split('\n').enumerate() {
            let end = offset + line.len();
            mappings.push(LineMapping {
                line_number: line_idx + 1,
                start_offset: offset,
                end_offset: end + 1,
            });
            offset = end + 1;
        }
        if let Some(last) = mappings.last_mut() {
            last.end_offset = text.len();
        }
        let original_end = text.len();
        Self {
            text: text.to_string(),
            original_end,
            mappings,
        }
    }
}

/// Configuration for multiline concatenation recovery.
///
/// The `Default` implementation enables every concatenation style and caps
/// chains at 10 lines.
#[derive(Debug, Clone)]
pub struct MultilineConfig {
    /// Maximum number of lines to join in a single concatenation chain.
    /// Defaults to 10.
    pub max_join_lines: usize,
    /// Whether to enable Python-style implicit concatenation.
    /// Defaults to `true`.
    pub python_implicit: bool,
    /// Whether to enable backslash line continuation.
    /// Defaults to `true`.
    pub backslash_continuation: bool,
    /// Whether to enable explicit concatenation with `+`.
    /// Defaults to `true`.
    pub plus_concatenation: bool,
    /// Whether to enable JavaScript template literal concatenation.
    /// Defaults to `true`.
    pub template_literals: bool,
}

impl Default for MultilineConfig {
    fn default() -> Self {
        Self {
            max_join_lines: 10,
            python_implicit: true,
            backslash_continuation: true,
            plus_concatenation: true,
            template_literals: true,
        }
    }
}

/// Decide whether `text` shows any sign of string concatenation that the
/// multiline preprocessor could reassemble.
///
/// The check runs in four stages, cheapest first:
///
/// 1. Inputs whose first non-whitespace character is `{`, `[` or `<`
///    (including `<?xml`) are rejected outright.
/// 2. Inputs larger than 4096 bytes are rejected unless they contain one of a
///    few secret-related substrings.
/// 3. A whole-text gate looks for any concatenation shape at all (explicit
///    `+`, backslash continuation, backtick, `paste`/`paste0`/`concat!`,
///    adjacent string literals, or a variable-reference concatenation line).
/// 4. Only then is each line inspected individually; the first line carrying
///    an indicator yields `true`.
///
/// NOTE(review): the stage-2 keyword search is case-sensitive, so large inputs
/// that only contain e.g. `SECRET` or `TOKEN` in upper case are rejected
/// unless `API_KEY` is also present - confirm this is intended.
pub(crate) fn has_concatenation_indicators(text: &str) -> bool {
    /// Leading markers of structured documents that are never preprocessed.
    const STRUCTURED_PREFIXES: [&str; 4] = ["{", "[", "<?xml", "<"];
    /// Above this size the secret-keyword gate applies.
    const KEYWORD_GATE_BYTES: usize = 4096;
    /// Substrings that mark a large input as worth preprocessing.
    const SECRET_KEYWORDS: [&[u8]; 6] = [
        b"ecret",
        b"oken",
        b"assword",
        b"api_key",
        b"API_KEY",
        b"redential",
    ];

    /// Per-line indicator test, applied to the whitespace-trimmed line.
    fn line_signals_concat(line: &str) -> bool {
        let stripped = line.trim();

        // Explicit `+` concatenation, at either edge of the line or next to a
        // quote.
        if stripped.ends_with('+')
            || stripped.starts_with('+')
            || stripped.starts_with("+ ")
            || stripped.contains("\" +")
            || stripped.contains("' +")
            || stripped.contains("+ \"")
            || stripped.contains("+ '")
        {
            return true;
        }

        // Function-style splicing: R's paste()/paste0() and Rust's concat!().
        if stripped.contains("paste0(")
            || stripped.contains("paste(")
            || stripped.contains("concat!(")
        {
            return true;
        }

        // A single trailing backslash continues the line; a doubled one is an
        // escaped backslash and does not.
        if stripped.ends_with('\\') && !stripped.ends_with("\\\\") {
            return true;
        }

        // Adjacent string literals separated by one space.
        if stripped.contains("\" \"") || stripped.contains("' '") {
            return true;
        }

        // A line ending in the only backtick it contains.
        if stripped.ends_with('`') && stripped.matches('`').count() == 1 {
            return true;
        }

        // Template-literal evasion shapes. A quoted literal spliced into an
        // interpolation (`${"..."}` / `${'...'}`) or two interpolations
        // butted together (`}${`) are flagged; a lone `${ident}` is ordinary
        // JS/TS templating and deliberately is not, so everyday template code
        // pays no preprocessing cost.
        if stripped.contains("${\"") || stripped.contains("${'") || stripped.contains("}${") {
            return true;
        }

        has_var_ref_concat_line(stripped)
    }

    // Stage 1: structured documents (JSON objects/arrays, XML/HTML).
    let head = text.trim_start();
    if STRUCTURED_PREFIXES
        .iter()
        .any(|prefix| head.starts_with(*prefix))
    {
        return false;
    }

    let bytes = text.as_bytes();

    // Stage 2: large inputs must mention something secret-like.
    if bytes.len() > KEYWORD_GATE_BYTES
        && !SECRET_KEYWORDS
            .iter()
            .copied()
            .any(|keyword| memchr::memmem::find(bytes, keyword).is_some())
    {
        return false;
    }

    // Stage 3: whole-text gate. Two string literals separated by a single
    // space, or a closing quote followed by a newline and then a quote or
    // indentation, count as implicit concatenation.
    let implicit_adjacency = |window: &[u8]| {
        matches!(
            window,
            [b'"', b' ', b'"']
                | [b'\'', b' ', b'\'']
                | [b'"', b'\n', b'"' | b' ' | b'\t']
                | [b'\'', b'\n', b'\'' | b' ' | b'\t']
        )
    };
    let any_shape_present = text.contains("\" +")
        || text.contains("' +")
        || text.contains("\" \\")
        || text.contains("' \\")
        || memchr::memchr(b'`', bytes).is_some()
        || text.contains("paste0(")
        || text.contains("paste(")
        || text.contains("concat!(")
        || bytes.windows(3).any(implicit_adjacency)
        || has_var_ref_concatenation(text);
    if !any_shape_present {
        return false;
    }

    // Stage 4: confirm on an individual line.
    text.lines().any(line_signals_concat)
}

/// Variable-reference concatenation: `token = head + tail` (no quoted
/// literals on the RHS). The structural reassembly pass resolves these
/// via `resolve_concat_reference`; without this indicator the multiline
/// preprocessor passthroughs and the split credential never surfaces.
fn has_var_ref_concatenation(text: &str) -> bool {
    text.lines().any(has_var_ref_concat_line)
}

/// Test whether a single line has the shape `name = ident + ident [+ ...]`
/// (also accepting `:` as the separator and an optional trailing `;`).
///
/// Returns `false` if the pattern fails to compile.
fn has_var_ref_concat_line(line: &str) -> bool {
    static VAR_REF_CONCAT_RE: LazyLock<Option<Regex>> = LazyLock::new(|| {
        let pattern = r#"(?i)^\s*[a-z0-9_\-\.]{2,64}\s*[:=]\s*[a-z0-9_\-]{2,32}(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}\s*;?\s*$"#;
        Regex::new(pattern).ok()
    });

    // Fast reject. The pattern demands at least one `+` between identifiers,
    // so a line without `+` can never match and skipping the regex changes no
    // result. This is more than a micro-optimisation: the bounded
    // `(?:\s*\+\s*[a-z0-9_\-]{2,32}){1,8}` group was reported to make the
    // regex crate (1.12) stall on identifier-dense lines on Apple Silicon.
    // During v0.5.25 cross-platform dogfooding a 171-byte Go file containing
    // `var token = receiver.Flag("x", "y").Required().String()` took 6+
    // minutes on Mac arm64 while Linux x86_64 finished in 0.6 s.
    if !line.contains('+') {
        return false;
    }

    match &*VAR_REF_CONCAT_RE {
        Some(pattern) => pattern.is_match(line),
        None => false,
    }
}

/// Decide whether `text` should skip multiline preprocessing entirely.
///
/// Returns `true` when the whole text is larger than
/// `MAX_MULTILINE_PREPROCESS_BYTES`, when any single line is longer than
/// `MAX_MULTILINE_LINE_BYTES`, or when the text carries no concatenation
/// indicators. The size checks run first so oversized inputs never reach the
/// indicator scan.
pub(crate) fn should_passthrough(text: &str) -> bool {
    if text.len() > MAX_MULTILINE_PREPROCESS_BYTES {
        return true;
    }

    let has_oversized_line = text
        .lines()
        .map(str::len)
        .any(|line_bytes| line_bytes > MAX_MULTILINE_LINE_BYTES);
    if has_oversized_line {
        return true;
    }

    !has_concatenation_indicators(text)
}