// guardy 0.2.4 - Docs.rs

// Entropy analysis for secret detection
// Extracted and adapted from ripsecrets: https://github.com/sirwart/ripsecrets
// Original implementation by sirwart, adapted for Guardy

use std::{
    collections::{hash_map::HashMap, hash_set::HashSet},
    sync::{Arc, LazyLock},
};

use memoize::memoize;
use regex::bytes::Regex;

// Static regexes for entropy analysis
/// Matches inputs that consist entirely of 16 or more hexadecimal digits (either case).
///
/// Compiled on first use; the pattern is a literal, so the `unwrap` only fires on a programming
/// error in the pattern itself.
static HEX_STRING_REGEX: LazyLock<Arc<Regex>> = LazyLock::new(|| {
    let hex_only = Regex::new("^[0-9a-fA-F]{16,}$").unwrap();
    Arc::new(hex_only)
});

/// Matches inputs that consist entirely of 16 or more ASCII digits and uppercase letters.
///
/// Compiled on first use; the pattern is a literal, so the `unwrap` only fires on a programming
/// error in the pattern itself.
static CAP_AND_NUMBERS_REGEX: LazyLock<Arc<Regex>> = LazyLock::new(|| {
    let digits_and_caps = Regex::new("^[0-9A-Z]{16,}$").unwrap();
    Arc::new(digits_and_caps)
});

// Static bigrams set - same 488 bigrams as before, just optimized storage
/// Lookup set of two-byte sequences used by the bigram frequency check.
///
/// The entries are stored as one comma-separated byte string and split into borrowed
/// `&'static [u8]` slices the first time the set is accessed, so no entry is copied.
static BIGRAMS_SET: LazyLock<Arc<HashSet<&'static [u8]>>> = LazyLock::new(|| {
    const COMMA_SEPARATED_BIGRAMS: &[u8] = b"er,te,an,en,ma,ke,10,at,/m,on,09,ti,al,io,.h,./,..,ra,ht,es,or,tm,pe,ml,re,in,3/,n3,0F,ok,ey,00,80,08,ss,07,15,81,F3,st,52,KE,To,01,it,2B,2C,/E,P_,EY,B7,se,73,de,VP,EV,to,od,B0,0E,nt,et,_P,A0,60,90,0A,ri,30,ar,C0,op,03,ec,ns,as,FF,F7,po,PK,la,.p,AE,62,me,F4,71,8E,yp,pa,50,qu,D7,7D,rs,ea,Y_,t_,ha,3B,c/,D2,ls,DE,pr,am,E0,oc,06,li,do,id,05,51,40,ED,_p,70,ed,04,02,t.,rd,mp,20,d_,co,ro,ex,11,ua,nd,0C,0D,D0,Eq,le,EF,wo,e_,e.,ct,0B,_c,Li,45,rT,pt,14,61,Th,56,sT,E6,DF,nT,16,85,em,BF,9E,ne,_s,25,91,78,57,BE,ta,ng,cl,_t,E1,1F,y_,xp,cr,4F,si,s_,E5,pl,AB,ge,7E,F8,35,E2,s.,CF,58,32,2F,E7,1B,ve,B1,3D,nc,Gr,EB,C6,77,64,sl,8A,6A,_k,79,C8,88,ce,Ex,5C,28,EA,A6,2A,Ke,A7,th,CA,ry,F0,B6,7/,D9,6B,4D,DA,3C,ue,n7,9C,.c,7B,72,ac,98,22,/o,va,2D,n.,_m,B8,A3,8D,n_,12,nE,ca,3A,is,AD,rt,r_,l-,_C,n1,_v,y.,yw,1/,ov,_n,_d,ut,no,ul,sa,CT,_K,SS,_e,F1,ty,ou,nG,tr,s/,il,na,iv,L_,AA,da,Ty,EC,ur,TX,xt,lu,No,r.,SL,Re,sw,_1,om,e/,Pa,xc,_g,_a,X_,/e,vi,ds,ai,==,ts,ni,mg,ic,o/,mt,gm,pk,d.,ch,/p,tu,sp,17,/c,ym,ot,ki,Te,FE,ub,nL,eL,.k,if,he,34,e-,23,ze,rE,iz,St,EE,-p,be,In,ER,67,13,yn,ig,ib,_f,.o,el,55,Un,21,fi,54,mo,mb,gi,_r,Qu,FD,-o,ie,fo,As,7F,48,41,/i,eS,ab,FB,1E,h_,ef,rr,rc,di,b.,ol,im,eg,ap,_l,Se,19,oS,ew,bs,Su,F5,Co,BC,ud,C1,r-,ia,_o,65,.r,sk,o_,ck,CD,Am,9F,un,fa,F6,5F,nk,lo,ev,/f,.t,sE,nO,a_,EN,E4,Di,AC,95,74,1_,1A,us,ly,ll,_b,SA,FC,69,5E,43,um,tT,OS,CE,87,7A,59,44,t-,bl,ad,Or,D5,A_,31,24,t/,ph,mm,f.,ag,RS,Of,It,FA,De,1D,/d,-k,lf,hr,gu,fy,D6,89,6F,4E,/k,w_,cu,br,TE,ST,R_,E8,/O";

    let lookup: HashSet<&'static [u8]> = COMMA_SEPARATED_BIGRAMS
        .split(|&byte| byte == b',')
        .collect();
    Arc::new(lookup)
});

/// Decides whether `data` looks random enough to be treated as a secret.
///
/// The decision is based on [`calculate_randomness_probability`], compared against
/// `min_threshold`:
///
/// - `min_threshold == f32::MAX` is a sentinel that skips the analysis and returns `true`.
/// - A probability below `min_threshold` is rejected.
/// - Input that contains no ASCII digit must additionally reach `10 * min_threshold`.
///
/// The trace messages emitted by this function include the length of `data`, not its contents.
///
/// Returns `true` when the input appears to be randomly generated (likely a secret).
pub fn is_likely_secret(data: &[u8], min_threshold: f32) -> bool {
    // Sentinel value: the caller asked for the entropy check to be bypassed.
    if min_threshold == f32::MAX {
        return true;
    }

    let threshold = f64::from(min_threshold);
    let probability = calculate_randomness_probability(data);

    tracing::trace!(
        "Testing <REDACTED-{}-chars> - prob: {:.2e}, threshold: {:.2e}",
        data.len(),
        probability,
        min_threshold
    );

    if probability < threshold {
        tracing::trace!("Failed basic threshold check");
        return false;
    }

    // Digit-free input is held to a threshold ten times stricter.
    let has_digit = data.iter().any(u8::is_ascii_digit);
    let digit_free_threshold = threshold * 10.0;
    if !has_digit && probability < digit_free_threshold {
        tracing::trace!(
            "Failed no-numbers threshold check (needs {:.2e})",
            digit_free_threshold
        );
        return false;
    }

    tracing::trace!("Passed all checks - returning true");
    true
}

/// Estimates how plausible it is that `s` was produced by a uniform random source.
///
/// The input is assumed to be random over an alphabet whose size is picked from its shape:
/// 16 when it is a run of 16+ hex digits, 36 when it is a run of 16+ digits/uppercase letters,
/// and 64 otherwise. Under that assumption the probabilities of up to three observations are
/// multiplied together:
///
/// 1. The number of distinct byte values. Non-random text tends to reuse far fewer values than
///    random text.
/// 2. The share of bytes that fall into a character class (digits, uppercase, lowercase), using
///    the least likely class.
/// 3. The number of bigrams that are common in source code. This factor is only applied for the
///    64-symbol alphabet, which is what the bigram sample is calibrated for.
///
/// The result is a heuristic score rather than an exact probability; it should be judged by how
/// well it separates random from non-random text.
pub fn calculate_randomness_probability(s: &[u8]) -> f64 {
    let base: f64 = match () {
        _ if HEX_STRING_REGEX.is_match(s) => 16.0,
        _ if CAP_AND_NUMBERS_REGEX.is_match(s) => 36.0,
        _ => 64.0,
    };

    // Diagnostic only: report inputs that use more distinct bytes than the chosen alphabet has.
    let distinct_count = count_distinct_values(s);
    if distinct_count > base as usize {
        tracing::trace!(
            "Entropy base detection mismatch: string has {} distinct chars but detected base is {} - string preview: {:?}",
            distinct_count,
            base,
            // At most the first 50 bytes are shown.
            String::from_utf8_lossy(s.get(..50).unwrap_or(s))
        );
    }

    let distinct_values_factor = probability_random_distinct_values(s, base);
    let char_class_factor = probability_random_char_class(s, base);
    let combined = distinct_values_factor * char_class_factor;

    if base == 64.0 {
        // Bigrams are only calibrated for base64
        combined * probability_random_bigrams(s)
    } else {
        combined
    }
}

/// Probability of seeing this many known bigrams in `s` if `s` were random base64-like data.
///
/// Every adjacent byte pair of `s` is looked up in `BIGRAMS_SET`. The number of hits is then
/// scored with `binomial_probability`, using `s.len()` as the number of trials and the share of
/// the `64 * 64` possible pairs that are present in the set as the per-trial hit probability.
///
/// Inputs shorter than two bytes contain no pairs and are scored with zero hits.
fn probability_random_bigrams(s: &[u8]) -> f64 {
    let bigrams_set = &*BIGRAMS_SET;

    // `windows(2)` yields nothing for inputs shorter than two bytes. The previous
    // `0..s.len() - 1` index loop underflowed (and panicked) on empty input.
    let num_bigrams = s
        .windows(2)
        .filter(|pair| bigrams_set.contains(*pair))
        .count();

    binomial_probability(
        s.len(),
        num_bigrams,
        (bigrams_set.len() as f64) / (64.0 * 64.0),
    )
}

/// Scores how plausible the character-class make-up of `s` is for a random string.
///
/// For the hex alphabet (`base == 16.0`) only the digit class is scored. Otherwise the digit and
/// uppercase classes are scored, plus the lowercase class unless `base == 36.0`, and the
/// smallest of those probabilities is returned.
fn probability_random_char_class(s: &[u8], base: f64) -> f64 {
    // Inclusive byte ranges: digits, uppercase, lowercase.
    const CHAR_CLASSES: [(u8, u8); 3] = [(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')];

    if base == 16.0 {
        let (first, last) = CHAR_CLASSES[0];
        return probability_random_char_class_aux(s, first, last, 16.0);
    }

    // The 36-symbol alphabet has no lowercase letters, so that class is left out.
    let class_count = if base == 36.0 { 2 } else { 3 };

    CHAR_CLASSES[..class_count]
        .iter()
        .map(|&(first, last)| probability_random_char_class_aux(s, first, last, base))
        .fold(f64::INFINITY, |least, candidate| {
            if candidate < least {
                candidate
            } else {
                least
            }
        })
}

/// Scores the number of bytes of `s` that fall inside the inclusive range `min..=max`.
///
/// The count is passed to `binomial_probability` with `s.len()` trials and a per-byte hit
/// probability of `(size of the range) / base`.
fn probability_random_char_class_aux(s: &[u8], min: u8, max: u8, base: f64) -> f64 {
    let class = min..=max;
    let members = s.iter().filter(|&&byte| class.contains(&byte)).count();
    let class_size = f64::from(max - min + 1);
    binomial_probability(s.len(), members, class_size / base)
}

/// One-sided tail probability of a binomial distribution with `n` trials and success
/// probability `p`.
///
/// When `x` lies below the expected count `n * p`, the probability of observing `0..=x`
/// successes is returned; otherwise the probability of observing `x..=n` successes.
///
/// For `n <= 170` each term is evaluated directly from factorials. Beyond that `n!` is no longer
/// a finite `f64` (the direct formula would evaluate `inf / inf` and return NaN), so the terms
/// are evaluated in log space instead.
fn binomial_probability(n: usize, x: usize, p: f64) -> f64 {
    // 170! is the largest factorial that is still finite as an f64; 171! overflows to infinity.
    const MAX_FINITE_FACTORIAL_ARG: usize = 170;

    let left_tail = (x as f64) < n as f64 * p;
    let (first, last) = if left_tail { (0, x) } else { (x, n) };

    let mut total_probability = 0.0;

    if n <= MAX_FINITE_FACTORIAL_ARG {
        let n_factorial = factorial(n);
        for i in first..=last {
            total_probability += n_factorial / (factorial(n - i) * factorial(i))
                * p.powi(i as i32)
                * (1.0 - p).powi((n - i) as i32);
        }
        return total_probability;
    }

    // ln(k!) for every k in 0..=n, accumulated once so each term below is O(1).
    let mut ln_factorials = Vec::with_capacity(n + 1);
    let mut running_sum = 0.0_f64;
    ln_factorials.push(running_sum);
    for k in 1..=n {
        running_sum += (k as f64).ln();
        ln_factorials.push(running_sum);
    }

    let ln_p = p.ln();
    let ln_q = (1.0 - p).ln();
    for i in first..=last {
        let ln_coefficient = ln_factorials[n] - ln_factorials[i] - ln_factorials[n - i];
        // A zero exponent contributes a factor of exactly 1, even when the base is 0.
        // Multiplying `0 * ln(0)` would produce NaN instead.
        let ln_successes = if i == 0 { 0.0 } else { i as f64 * ln_p };
        let ln_failures = if i == n { 0.0 } else { (n - i) as f64 * ln_q };
        total_probability += (ln_coefficient + ln_successes + ln_failures).exp();
    }
    total_probability
}

/// Returns `n!` as an `f64`; `0!` and `1!` are both `1.0`.
///
/// The result overflows to infinity for `n > 170`.
fn factorial(n: usize) -> f64 {
    (2..=n).fold(1.0, |product, k| product * k as f64)
}

/// Probability that a random string of the same length over a `base`-symbol alphabet has at
/// most as many distinct byte values as `s`.
///
/// Sums `num_possible_outcomes` for every distinct-value count from 1 up to the count observed
/// in `s`, then divides by the total number of strings, `base ^ s.len()`.
fn probability_random_distinct_values(s: &[u8], base: f64) -> f64 {
    let length = s.len();
    let alphabet_size = base as usize;
    let observed_distinct = count_distinct_values(s);

    let outcomes_up_to_observed = (1..=observed_distinct).fold(0.0_f64, |sum, distinct| {
        sum + num_possible_outcomes(length, distinct, alphabet_size)
    });

    outcomes_up_to_observed / base.powi(length as i32)
}

/// Returns how many different byte values occur in `s`.
fn count_distinct_values(s: &[u8]) -> usize {
    // Only the number of distinct bytes matters, so a set is sufficient.
    let seen: HashSet<u8> = s.iter().copied().collect();
    seen.len()
}

/// Number of strings of length `num_values` over a `base`-symbol alphabet that use exactly
/// `num_distinct_values` different symbols.
///
/// Computed as `base * (base - 1) * ...` (one factor per distinct symbol) multiplied by
/// `num_distinct_configurations`. Returns `0.0` when more distinct symbols are requested than
/// the alphabet contains.
fn num_possible_outcomes(num_values: usize, num_distinct_values: usize, base: usize) -> f64 {
    if num_distinct_values <= base {
        // Falling product base * (base - 1) * ... with `num_distinct_values` factors.
        let symbol_choices = (1..num_distinct_values)
            .fold(base as f64, |product, used| product * (base - used) as f64);
        return symbol_choices * num_distinct_configurations(num_values, num_distinct_values);
    }

    // More distinct values than the alphabet allows: `base - used` would underflow above.
    tracing::trace!(
        "Entropy edge case: num_distinct_values ({}) > base ({}), num_values: {} - returning 0.0 to avoid underflow",
        num_distinct_values,
        base,
        num_values
    );
    0.0
}

/// Number of ways `num_values` positions can be arranged around `num_distinct_values` symbols.
///
/// Exactly one arrangement exists when there is a single distinct symbol or when every position
/// holds a different symbol; every other case is delegated to the memoized recursive helper.
fn num_distinct_configurations(num_values: usize, num_distinct_values: usize) -> f64 {
    match num_distinct_values {
        1 => 1.0,
        distinct if distinct == num_values => 1.0,
        distinct => num_distinct_configurations_aux(distinct, 0, num_values - distinct),
    }
}

/// Recursive helper behind `num_distinct_configurations`.
///
/// With no values left to place there is exactly one configuration. Otherwise the result is the
/// sum of two cases: moving on to the next position (only while one exists), and placing one
/// more value at the current position, which is weighted by `position + 1`.
///
/// The function is wrapped by the `#[memoize]` attribute from the `memoize` crate.
// NOTE(review): for the cases pinned in this file's tests the results equal Stirling numbers
// of the second kind - confirm before relying on that in general.
#[memoize]
fn num_distinct_configurations_aux(
    num_positions: usize,
    position: usize,
    remaining_values: usize,
) -> f64 {
    if remaining_values == 0 {
        return 1.0;
    }

    let next_position = position + 1;

    let via_next_position = if next_position < num_positions {
        num_distinct_configurations_aux(num_positions, next_position, remaining_values)
    } else {
        0.0
    };

    let via_current_position = next_position as f64
        * num_distinct_configurations_aux(num_positions, position, remaining_values - 1);

    via_next_position + via_current_position
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Loads the two test keys from the fixture file: line 1 is the `pk` key, line 2 the `sk`
    /// key.
    ///
    /// Panics with a descriptive message when the file cannot be read as text or has fewer than
    /// two lines, rather than failing with an index-out-of-bounds error.
    fn load_fixture_keys() -> (Vec<u8>, Vec<u8>) {
        // Read test data from encrypted fixtures
        let fixtures_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures/entropy_test_data.txt");
        let test_data = std::fs::read_to_string(fixtures_path)
            .expect("Failed to read test fixtures - ensure git-crypt is unlocked");

        let mut lines = test_data.lines();
        let pk_test_key = lines
            .next()
            .expect("entropy fixture is missing the pk test key (line 1)");
        let sk_test_key = lines
            .next()
            .expect("entropy fixture is missing the sk test key (line 2)");

        (pk_test_key.as_bytes().to_vec(), sk_test_key.as_bytes().to_vec())
    }

    #[test]
    fn test_distinct_values() {
        assert_eq!(count_distinct_values(b"abca"), 3);
    }

    #[test]
    fn test_configurations() {
        assert_eq!(num_distinct_configurations(3, 2), 3.0);
        assert_eq!(num_distinct_configurations(4, 3), 6.0);
        assert_eq!(num_distinct_configurations(4, 2), 7.0);
        assert_eq!(num_distinct_configurations(6, 4), 65.0);
        assert_eq!(num_possible_outcomes(32, 1, 64), 64.0);
    }

    #[test]
    fn test_distinct_values_probability() {
        assert!(probability_random_distinct_values(b"aaaaaaaaa", 64.0) < 1.0 / 1e6);
        assert!(probability_random_distinct_values(b"abcdefghi", 64.0) > 1.0 / 1e6);
    }

    #[test]
    fn test_binomial() {
        assert_eq!(binomial_probability(2, 0, 0.5), 0.25);
        assert_eq!(binomial_probability(2, 1, 0.5), 0.75);
        assert!(probability_random_bigrams(b"hello_world") < 1.0 / 1e4);
    }

    #[test]
    fn test_overall_randomness() {
        let (pk_test_key, sk_test_key) = load_fixture_keys();

        assert!(calculate_randomness_probability(b"hello_world") < 1.0 / 1e6);
        assert!(calculate_randomness_probability(&pk_test_key) > 1.0 / 1e4);
        assert!(calculate_randomness_probability(&sk_test_key) > 1.0 / 1e4);
        assert!(calculate_randomness_probability(b"PROJECT_NAME_ALIAS") < 1.0 / 1e4);
    }

    #[test]
    fn test_is_likely_secret() {
        let (pk_test_key, sk_test_key) = load_fixture_keys();

        // Should detect real secrets
        assert!(is_likely_secret(&sk_test_key, 1.0 / 1e5));
        assert!(is_likely_secret(&pk_test_key, 1.0 / 1e5));

        // Should ignore common variable names
        assert!(!is_likely_secret(b"API_KEY_CONSTANT", 1.0 / 1e5));
        assert!(!is_likely_secret(b"hello_world", 1.0 / 1e5));
        assert!(!is_likely_secret(b"PROJECT_NAME_ALIAS", 1.0 / 1e5));
    }
}