repotoire 0.9.0

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! Known credential formats + Shannon entropy + non-credential-context guard for SecretDetector's
//! blocking tier. See spec §2 Group B. A SecretDetector finding is Blocking iff it matches a known
//! format (with checksum where applicable) OR is a high-entropy generic string, AND is not in a
//! redaction-list / test-fixture / doc-example / placeholder context.
//!
//! The public API here is consumed by `SecretDetector` (Task 10). Until that integration lands,
//! clippy would flag everything as dead code — suppress that at the module level.
#![allow(dead_code)]

use regex::Regex;
use std::sync::LazyLock;

/// A match against a known credential format table.
///
/// Produced by [`match_known_format`]. Derives `Debug`, `Clone`, `PartialEq` and `Eq` so that
/// callers and tests can log, copy and compare results directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FormatMatch {
    /// The name of the matched format (e.g. `"aws_access_key_id"`).
    pub format: &'static str,
    /// Whether the format-specific checksum (if any) was valid.
    ///
    /// * `None` — the format has no checksum.
    /// * `Some(true)` — the checksum passed.
    /// * `Some(false)` — the checksum failed. [`match_known_format`] rejects such candidates
    ///   instead of returning them, so its results never carry this value.
    pub checksum_valid: Option<bool>,
}

// One LazyLock<Regex> per format, following the existing security-detector pattern.
// Each pattern is compiled on first use; `expect` is acceptable because the patterns are
// compile-time literals, so a failure here is a programming error rather than bad input.
// Most patterns are wrapped in `\b` so the token must start and end at a regex word boundary.

// `AKIA` or `ASIA` followed by exactly 16 uppercase letters / digits.
static RE_AWS_ACCESS_KEY_ID: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b(AKIA|ASIA)[A-Z0-9]{16}\b").expect("valid regex"));
// `gh` + one of `p`, `o`, `s`, `r`, `u` + `_`, then 36–255 ASCII alphanumerics.
static RE_GITHUB_PAT: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bgh[posru]_[A-Za-z0-9]{36,255}\b").expect("valid regex"));
// `sk_live_` or `rk_live_`, then 24–99 ASCII alphanumerics (only `live` keys are matched).
static RE_STRIPE_SECRET_KEY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\b(sk|rk)_live_[A-Za-z0-9]{24,99}\b").expect("valid regex"));
// `xox` + one of `b`, `a`, `p`, `r`, `s` + `-`, then at least 10 alphanumerics or hyphens.
static RE_SLACK_TOKEN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bxox[baprs]-[A-Za-z0-9\-]{10,}\b").expect("valid regex"));
// `AIza` followed by exactly 35 characters from `[0-9A-Za-z-_]`.
static RE_GOOGLE_API_KEY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bAIza[0-9A-Za-z\-_]{35}\b").expect("valid regex"));
// `sk-` followed by at least 20 ASCII alphanumerics.
static RE_OPENAI_API_KEY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bsk-[A-Za-z0-9]{20,}\b").expect("valid regex"));
// `SK` followed by exactly 32 hexadecimal digits (either case).
static RE_TWILIO_API_KEY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bSK[0-9a-fA-F]{32}\b").expect("valid regex"));
// `SG.` + 22 characters + `.` + 43 characters, each from `[A-Za-z0-9_-]`.
static RE_SENDGRID_API_KEY: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\bSG\.[A-Za-z0-9_\-]{22}\.[A-Za-z0-9_\-]{43}\b").expect("valid regex")
});
// `npm_` followed by exactly 36 ASCII alphanumerics.
static RE_NPM_TOKEN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\bnpm_[A-Za-z0-9]{36}\b").expect("valid regex"));
// PEM private-key header line, with an optional `RSA`/`EC`/`DSA`/`OPENSSH`/`PGP` qualifier.
// Only the BEGIN marker is matched; the key body is not inspected. No `\b` anchors here.
static RE_PRIVATE_KEY_PEM: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"-----BEGIN (RSA |EC |DSA |OPENSSH |PGP )?PRIVATE KEY-----").expect("valid regex")
});
// Three dot-separated segments of `[A-Za-z0-9_-]`; the first two must start with `eyJ`.
static RE_JWT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\beyJ[A-Za-z0-9_\-]+\.eyJ[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\b").expect("valid regex")
});

/// Table entry: regex reference + format name + optional checksum.
///
/// One entry describes one known credential format in `FORMAT_TABLE`.
struct FormatEntry {
    /// Format identifier reported back to callers as `FormatMatch::format`.
    name: &'static str,
    /// Lazily compiled pattern that recognises the format.
    re: &'static LazyLock<Regex>,
    /// Optional validator. `match_known_format` calls it with the input string it was given;
    /// a `false` result rejects the match. `None` means the format has no checksum.
    checksum: Option<fn(&str) -> bool>,
}

// Known credential formats. `match_known_format` walks this slice front to back and returns
// the first entry whose regex matches, so the order of entries decides which name is reported
// when an input satisfies more than one pattern. No entry currently supplies a checksum.
static FORMAT_TABLE: &[FormatEntry] = &[
    FormatEntry {
        name: "aws_access_key_id",
        re: &RE_AWS_ACCESS_KEY_ID,
        checksum: None,
    },
    FormatEntry {
        name: "github_pat",
        re: &RE_GITHUB_PAT,
        checksum: None,
    },
    FormatEntry {
        name: "stripe_secret_key",
        re: &RE_STRIPE_SECRET_KEY,
        checksum: None,
    },
    FormatEntry {
        name: "slack_token",
        re: &RE_SLACK_TOKEN,
        checksum: None,
    },
    FormatEntry {
        name: "google_api_key",
        re: &RE_GOOGLE_API_KEY,
        checksum: None,
    },
    FormatEntry {
        name: "openai_api_key",
        re: &RE_OPENAI_API_KEY,
        checksum: None,
    },
    FormatEntry {
        name: "twilio_api_key",
        re: &RE_TWILIO_API_KEY,
        checksum: None,
    },
    FormatEntry {
        name: "sendgrid_api_key",
        re: &RE_SENDGRID_API_KEY,
        checksum: None,
    },
    FormatEntry {
        name: "npm_token",
        re: &RE_NPM_TOKEN,
        checksum: None,
    },
    FormatEntry {
        name: "private_key_pem",
        re: &RE_PRIVATE_KEY_PEM,
        checksum: None,
    },
    FormatEntry {
        name: "jwt",
        re: &RE_JWT,
        checksum: None,
    },
];

/// Look `s` up in the known credential format table.
///
/// Entries are tried in table order and the first acceptable one is returned. An entry is
/// acceptable when its regex matches somewhere in `s` and, if it declares a checksum function,
/// that function returns `true` for `s`. Entries whose checksum fails are skipped and the scan
/// continues with the next entry.
///
/// Returns `None` when no entry is acceptable.
pub fn match_known_format(s: &str) -> Option<FormatMatch> {
    FORMAT_TABLE
        .iter()
        .filter(|entry| entry.re.is_match(s))
        .find_map(|entry| {
            // `None` = format has no checksum; `Some(ok)` = checksum verdict for `s`.
            match entry.checksum.map(|verify| verify(s)) {
                // A failing checksum disqualifies this entry; keep scanning.
                Some(false) => None,
                verdict => Some(FormatMatch {
                    format: entry.name,
                    checksum_valid: verdict,
                }),
            }
        })
}

/// Compute Shannon entropy in bits per character for `s`.
///
/// "Character" means a Unicode scalar value (`char`), not a byte. The result is
/// `-Σ p·log2(p)` over the distinct characters of `s`, where `p` is each character's
/// relative frequency.
///
/// Returns `0.0` for an empty string and for a string made of a single repeated character
/// (always positive zero, never `-0.0`). The maximum for a uniform distribution over
/// 64 distinct symbols is 6.0 bits/char.
///
/// The result is deterministic: identical input always yields a bit-identical value.
pub fn shannon_entropy_bits_per_char(s: &str) -> f32 {
    if s.is_empty() {
        return 0.0;
    }
    // An ordered map gives a fixed iteration order. With a `HashMap` the per-process random
    // hash seed would change the floating-point summation order, and therefore potentially
    // the low bits of the result, from run to run.
    let mut counts: std::collections::BTreeMap<char, u32> = std::collections::BTreeMap::new();
    for c in s.chars() {
        *counts.entry(c).or_default() += 1;
    }
    // Integer-valued f64 additions are exact here, so `n` is the exact character count.
    let n: f64 = counts.values().map(|&count| f64::from(count)).sum();
    // Accumulate in f64 and subtract each (non-positive) term, rather than negating the
    // finished sum: `0.0 - 0.0` is `+0.0`, whereas `-(0.0)` would produce `-0.0`.
    let entropy = counts.values().fold(0.0_f64, |acc, &count| {
        let p = f64::from(count) / n;
        acc - p * p.log2()
    });
    // Narrowing to the public f32 return type; rounding to nearest is intended.
    entropy as f32
}

/// Entropy threshold (bits/char) for a *generic* (non-format-matched) high-entropy string to be
/// considered a likely secret.
///
/// Expressed in the same unit as the return value of `shannon_entropy_bits_per_char`.
/// NOTE(review): not referenced in this file; whether the comparison is `>` or `>=` is decided
/// by the caller — confirm there.
pub const GENERIC_ENTROPY_FLOOR: f32 = 4.0;

/// Minimum length (chars) for the entropy gate to apply to generic strings.
///
/// NOTE(review): not referenced in this file; presumably shorter strings are never flagged on
/// entropy alone — confirm against the caller.
pub const GENERIC_MIN_LEN: usize = 20;

/// Reports whether `needle` occurs in `haystack` as a stand-alone word.
///
/// An occurrence counts only when the byte directly before it and the byte directly after it
/// are each either absent (string edge) or not an ASCII letter/digit. Every candidate position
/// is examined, including overlapping ones. Matching is byte-wise and case-sensitive, so callers
/// are expected to pass both arguments already lowercased. An empty `needle` never matches.
///
/// This keeps "example" from matching inside "AKIAIOSFODNN7EXAMPLE" (once lowercased), because
/// the preceding `7` is alphanumeric.
pub fn contains_as_word(haystack: &str, needle: &str) -> bool {
    if needle.is_empty() {
        return false;
    }
    let text = haystack.as_bytes();
    let pattern = needle.as_bytes();

    // A neighbour is a valid boundary when it is missing or is not an ASCII alphanumeric.
    let is_boundary =
        |neighbour: Option<&u8>| neighbour.map_or(true, |byte| !byte.is_ascii_alphanumeric());

    text.windows(pattern.len())
        .enumerate()
        .filter(|(_, window)| *window == pattern)
        .any(|(start, _)| {
            let left = start.checked_sub(1).and_then(|idx| text.get(idx));
            let right = text.get(start + pattern.len());
            is_boundary(left) && is_boundary(right)
        })
}

/// Returns `true` when the context strongly suggests the literal is NOT a real credential.
///
/// Any one of the following is sufficient:
/// 1. the file looks like documentation or an example (`.md`, `.rst`, `.txt`, `.sample`,
///    a `docs`/`examples` directory, or "example" anywhere in the path);
/// 2. the file looks like a test or fixture (`tests`/`test`/`fixtures`/`__fixtures__`
///    directory, `conftest.py`, or a `_test.` / `.test.` / `.spec.` file name);
/// 3. the literal sits in a redaction / sensitive-field list, as signalled by
///    `in_identifier_list` or by the surrounding name;
/// 4. the literal looks like a placeholder or template (`<`, `${`, `...`, or a placeholder
///    word such as "your" / "example" / "changeme" at a word boundary);
/// 5. the literal is a single byte repeated (e.g. `"aaaaaaa"`, `"0000000"`).
///
/// Path matching is case-insensitive, treats `\` as `/`, and anchors the path with a leading
/// `/`, so directory rules apply equally to `tests/a.py`, `/repo/tests/a.py` and
/// `src\tests\a.py`.
///
/// # Arguments
/// * `file` — source file path (relative or absolute)
/// * `surrounding_name` — nearest enclosing const/var/field name, or `""` if unknown
/// * `literal` — the string value being evaluated
/// * `in_identifier_list` — `true` when the literal is an element of a list/array of
///   short lowercase identifier-like strings (the SENSITIVE_FIELD_PATTERNS / redaction-list case)
pub fn is_non_credential_context(
    file: &str,
    surrounding_name: &str,
    literal: &str,
    in_identifier_list: bool,
) -> bool {
    // Documentation / example files.
    const DOC_SUFFIXES: &[&str] = &[".md", ".rst", ".txt", ".sample"];
    // NOTE: "example" already subsumes "/examples/"; both are kept to mirror the rule list.
    const DOC_FRAGMENTS: &[&str] = &["/examples/", "example", "/docs/"];
    // Test fixtures.
    const TEST_FRAGMENTS: &[&str] = &[
        "/tests/",
        "/test/",
        "/__fixtures__/",
        "/fixtures/",
        "conftest.py",
        "_test.",
        ".test.",
        ".spec.",
    ];
    // Substrings of a surrounding identifier that mark a redaction / sensitive-field list.
    const REDACTION_NAME_FRAGMENTS: &[&str] = &[
        "sensitive",
        "redact",
        "filter",
        "blocklist",
        "blacklist",
        "scrub",
        "sanitiz",
    ];
    // Structural template markers; checked as plain substrings.
    const TEMPLATE_MARKERS: &[&str] = &["<", "${", "..."];
    // Alphabetic placeholder terms; checked at word boundaries only, so that random keys
    // that happen to end with a common word (e.g. "...7EXAMPLE") don't falsely trigger.
    const PLACEHOLDER_WORDS: &[&str] = &[
        "your",
        "placeholder",
        "example",
        "changeme",
        "xxxx",
        "dummy",
        "fake",
        "test",
        "sample",
    ];

    // Normalise the path: lowercase, forward slashes, and a leading `/` so that a relative
    // path whose first component is e.g. `tests` still contains "/tests/".
    let path = format!("/{}", file.to_lowercase().replace('\\', "/"));

    if DOC_SUFFIXES.iter().any(|&suffix| path.ends_with(suffix))
        || DOC_FRAGMENTS.iter().any(|&fragment| path.contains(fragment))
    {
        return true;
    }

    if TEST_FRAGMENTS.iter().any(|&fragment| path.contains(fragment)) {
        return true;
    }

    // Redaction / sensitive-field lists — matched by the caller's flag or the surrounding name.
    let name = surrounding_name.to_lowercase();
    if in_identifier_list
        || REDACTION_NAME_FRAGMENTS
            .iter()
            .any(|&fragment| name.contains(fragment))
    {
        return true;
    }

    // Obvious placeholders / template strings.
    let lowered = literal.to_lowercase();
    if TEMPLATE_MARKERS.iter().any(|&marker| lowered.contains(marker))
        || PLACEHOLDER_WORDS
            .iter()
            .any(|&word| contains_as_word(&lowered, word))
    {
        return true;
    }

    // Repeated single byte (e.g. "aaaaaaa", "0000000"); an empty literal does not qualify.
    literal
        .as_bytes()
        .split_first()
        .is_some_and(|(first, rest)| rest.iter().all(|byte| byte == first))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Name of the format that `s` matches, if any.
    fn format_of(s: &str) -> Option<&'static str> {
        match_known_format(s).map(|m| m.format)
    }

    #[test]
    fn recognizes_known_formats() {
        assert_eq!(format_of("AKIAIOSFODNN7EXAMPLE"), Some("aws_access_key_id"));
        assert_eq!(
            format_of("ghp_1234567890abcdefghijklmnopqrstuvwxyz"),
            Some("github_pat")
        );
        // `assert_eq!` (rather than `assert!(a == b)`) so a failure prints both sides.
        assert_eq!(
            format_of("-----BEGIN RSA PRIVATE KEY-----"),
            Some("private_key_pem")
        );
        assert_eq!(format_of("hello world"), None);
    }

    #[test]
    fn entropy_gate() {
        assert!(shannon_entropy_bits_per_char("aaaaaaaaaaaaaaaaaaaa") < 1.0);
        assert!(shannon_entropy_bits_per_char("xR7$kP2!mZ9@qW4#vL8&") > 3.5);
    }

    #[test]
    fn entropy_known_values() {
        assert_eq!(shannon_entropy_bits_per_char(""), 0.0);
        // Two equally likely symbols carry one bit per character.
        assert!((shannon_entropy_bits_per_char("abab") - 1.0).abs() < 1e-6);
        // Four equally likely symbols carry two bits per character.
        assert!((shannon_entropy_bits_per_char("abcd") - 2.0).abs() < 1e-6);
    }

    #[test]
    fn contains_as_word_respects_boundaries() {
        assert!(contains_as_word("your-api-key-here", "your"));
        assert!(contains_as_word("test_key", "test"));
        assert!(contains_as_word("test", "test"));
        // Embedded in a longer alphanumeric run -> not a word.
        assert!(!contains_as_word("akiaiosfodnn7example", "example"));
        assert!(!contains_as_word("attest", "test"));
        // Degenerate inputs.
        assert!(!contains_as_word("anything", ""));
        assert!(!contains_as_word("", "test"));
    }

    #[test]
    fn context_guard_rejects_redaction_lists_and_fixtures() {
        // a string literal that's an element of a list of short lowercase identifiers
        assert!(is_non_credential_context(
            "filters.py",
            "SENSITIVE_FIELD_PATTERNS",
            "password",
            true
        ));
        // a test fixture path
        assert!(is_non_credential_context(
            "tests/fixtures/auth.py",
            "",
            "AKIAIOSFODNN7EXAMPLE",
            false
        ));
        // a real-looking key in app code -> not a non-credential context
        assert!(!is_non_credential_context(
            "src/aws.py",
            "",
            "AKIAIOSFODNN7EXAMPLE",
            false
        ));
        // a doc example
        assert!(is_non_credential_context("README.md", "", "AKIA...", false));
        // an obvious placeholder
        assert!(is_non_credential_context(
            "src/x.py",
            "",
            "your-api-key-here",
            false
        ));
    }
}