vsec 0.0.1

Detect secrets and in Rust codebases
Documentation
//! SIMD-accelerated fuzzy string matching using Myers' bit-parallel algorithm.
//!
//! This module provides fast approximate string matching to catch typos and
//! variations in variable names (e.g., "passwd" vs "password", "passwrd").
//!
//! # Algorithm
//!
//! Uses Myers' bit-parallel algorithm for Levenshtein distance calculation,
//! which computes edit distance in O(n*m/w) where w is the word size (64 bits).
//! For short patterns (≤64 chars), this is effectively O(n).

use std::sync::LazyLock;

/// Maximum pattern length for bit-parallel algorithm (64 bits).
const MAX_PATTERN_LEN: usize = 64;

/// Pre-computed pattern masks for a dictionary word.
#[derive(Clone)]
struct PatternMasks {
    /// The original pattern
    pattern: String,
    /// Bitmask for each character (256 possible bytes)
    /// Bit i is set if pattern[i] == character
    char_masks: [u64; 256],
    /// Pattern length
    len: usize,
}

impl PatternMasks {
    fn new(pattern: &str) -> Self {
        let bytes = pattern.as_bytes();
        let len = bytes.len().min(MAX_PATTERN_LEN);
        let mut char_masks = [0u64; 256];

        for (i, &byte) in bytes.iter().take(len).enumerate() {
            char_masks[byte as usize] |= 1u64 << i;
            // Also set for case-insensitive matching
            if byte.is_ascii_lowercase() {
                char_masks[byte.to_ascii_uppercase() as usize] |= 1u64 << i;
            } else if byte.is_ascii_uppercase() {
                char_masks[byte.to_ascii_lowercase() as usize] |= 1u64 << i;
            }
        }

        Self {
            pattern: pattern.to_string(),
            char_masks,
            len,
        }
    }

    /// Calculate Levenshtein distance using Myers' bit-parallel algorithm.
    ///
    /// Returns the edit distance between the pattern and text.
    #[inline]
    fn levenshtein(&self, text: &str) -> usize {
        if self.len == 0 {
            return text.len();
        }
        if text.is_empty() {
            return self.len;
        }

        let text_bytes = text.as_bytes();
        let m = self.len;

        // Initialize bit vectors
        let mut vp: u64 = u64::MAX; // Vertical positive delta
        let mut vn: u64 = 0; // Vertical negative delta
        let mut score = m;

        // Mask for pattern length
        let pattern_mask = if m >= 64 {
            u64::MAX
        } else {
            (1u64 << m) - 1
        };
        let last_bit = 1u64 << (m - 1);

        for &byte in text_bytes {
            let pm = self.char_masks[byte as usize];

            // Myers' algorithm core
            let d0 = ((pm & vp).wrapping_add(vp)) ^ vp | pm | vn;
            let mut hp = vn | !(d0 | vp);
            let hn = d0 & vp;

            // Update score
            if hp & last_bit != 0 {
                score += 1;
            } else if hn & last_bit != 0 {
                score = score.saturating_sub(1);
            }

            // Shift for next iteration
            hp = (hp << 1) | 1;
            let hn_shifted = hn << 1;

            vp = hn_shifted | !(d0 | hp);
            vn = d0 & hp;
        }

        score
    }
}

/// A dictionary of suspicious terms for fuzzy matching.
pub struct FuzzyMatcher {
    patterns: Vec<PatternMasks>,
    /// Maximum edit distance to consider a match
    max_distance: usize,
}

impl FuzzyMatcher {
    /// Create a new fuzzy matcher from a list of patterns.
    pub fn new(patterns: &[&str], max_distance: usize) -> Self {
        Self {
            patterns: patterns.iter().map(|&p| PatternMasks::new(p)).collect(),
            max_distance,
        }
    }

    /// Check if the text fuzzy-matches any pattern within the max distance.
    ///
    /// Returns the best matching pattern and its distance, if any.
    #[inline]
    pub fn find_match(&self, text: &str) -> Option<(&str, usize)> {
        let text_lower = text.to_ascii_lowercase();
        let mut best_match: Option<(&str, usize)> = None;

        for pattern in &self.patterns {
            // Quick length check: edit distance is at least |len1 - len2|
            let len_diff = (pattern.len as isize - text_lower.len() as isize).unsigned_abs();
            if len_diff > self.max_distance {
                continue;
            }

            let distance = pattern.levenshtein(&text_lower);

            if distance <= self.max_distance {
                match best_match {
                    None => best_match = Some((&pattern.pattern, distance)),
                    Some((_, best_dist)) if distance < best_dist => {
                        best_match = Some((&pattern.pattern, distance));
                    }
                    _ => {}
                }

                // Early exit on exact match
                if distance == 0 {
                    return best_match;
                }
            }
        }

        best_match
    }

    /// Check if the text fuzzy-matches any pattern.
    #[inline]
    pub fn is_fuzzy_match(&self, text: &str) -> bool {
        self.find_match(text).is_some()
    }

    /// Get all matches within the max distance.
    pub fn find_all_matches(&self, text: &str) -> Vec<(&str, usize)> {
        let text_lower = text.to_ascii_lowercase();
        let mut matches = Vec::new();

        for pattern in &self.patterns {
            let len_diff = (pattern.len as isize - text_lower.len() as isize).unsigned_abs();
            if len_diff > self.max_distance {
                continue;
            }

            let distance = pattern.levenshtein(&text_lower);

            if distance <= self.max_distance {
                matches.push((&pattern.pattern as &str, distance));
            }
        }

        matches.sort_by_key(|&(_, d)| d);
        matches
    }
}

/// Pre-built fuzzy matcher for authentication-related terms.
/// Catches common typos like "passwd", "passwrd", "passowrd", "pasword".
pub static AUTH_TERMS: LazyLock<FuzzyMatcher> = LazyLock::new(|| {
    FuzzyMatcher::new(
        &[
            "password",
            "secret",
            "credential",
            "token",
            "apikey",
            "api_key",
            "private_key",
            "secret_key",
            "access_token",
            "auth_token",
            "bearer",
            "jwt",
            "session",
        ],
        2, // Allow up to 2 edits
    )
});

/// Pre-built fuzzy matcher for sensitive field names.
pub static SENSITIVE_FIELDS: LazyLock<FuzzyMatcher> = LazyLock::new(|| {
    FuzzyMatcher::new(
        &[
            "password",
            "passwd",
            "secret",
            "private",
            "credential",
            "auth",
            "authorization",
            "authenticate",
            "token",
            "key",
            "apikey",
            "certificate",
            "signature",
        ],
        2,
    )
});

/// Check if a variable name fuzzy-matches any auth-related term.
#[inline]
pub fn is_fuzzy_auth_like(name: &str) -> Option<(&'static str, usize)> {
    AUTH_TERMS.find_match(name)
}

/// Check if a field name fuzzy-matches any sensitive term.
#[inline]
pub fn is_fuzzy_sensitive(name: &str) -> Option<(&'static str, usize)> {
    SENSITIVE_FIELDS.find_match(name)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_exact_match() {
        let matcher = FuzzyMatcher::new(&["password", "secret"], 2);
        assert_eq!(matcher.find_match("password"), Some(("password", 0)));
        assert_eq!(matcher.find_match("secret"), Some(("secret", 0)));
    }

    #[test]
    fn test_typo_matches() {
        let matcher = FuzzyMatcher::new(&["password"], 2);

        // Common typos
        assert!(matcher.find_match("passwd").is_some()); // deletion
        assert!(matcher.find_match("passwrd").is_some()); // deletion
        assert!(matcher.find_match("passowrd").is_some()); // transposition
        assert!(matcher.find_match("pasword").is_some()); // deletion
        assert!(matcher.find_match("passw0rd").is_some()); // substitution
    }

    #[test]
    fn test_case_insensitive() {
        let matcher = FuzzyMatcher::new(&["password"], 0);
        assert_eq!(matcher.find_match("PASSWORD"), Some(("password", 0)));
        assert_eq!(matcher.find_match("Password"), Some(("password", 0)));
    }

    #[test]
    fn test_no_match() {
        let matcher = FuzzyMatcher::new(&["password"], 2);
        assert!(matcher.find_match("username").is_none()); // Too different
        assert!(matcher.find_match("config").is_none());
    }

    #[test]
    fn test_distance_threshold() {
        let matcher = FuzzyMatcher::new(&["password"], 1);
        assert!(matcher.find_match("passwd").is_none()); // Distance 2, threshold 1
        assert!(matcher.find_match("passwort").is_some()); // Distance 1
    }

    #[test]
    fn test_prebuilt_auth_terms() {
        assert!(is_fuzzy_auth_like("password").is_some());
        assert!(is_fuzzy_auth_like("passwd").is_some());
        assert!(is_fuzzy_auth_like("passowrd").is_some());
        assert!(is_fuzzy_auth_like("api_key").is_some());
        assert!(is_fuzzy_auth_like("apikey").is_some());
        assert!(is_fuzzy_auth_like("username").is_none());
    }

    #[test]
    fn test_find_all_matches() {
        let matcher = FuzzyMatcher::new(&["password", "passwd", "passkey"], 2);
        let matches = matcher.find_all_matches("passwd");

        // "passwd" should match itself exactly (distance 0)
        assert!(matches.iter().any(|(p, d)| *p == "passwd" && *d == 0));
        // "password" should match with small distance
        assert!(matches.iter().any(|(p, _)| *p == "password"));
    }

    #[test]
    fn test_myers_algorithm() {
        let pattern = PatternMasks::new("kitten");

        // Classic edit distance examples
        assert_eq!(pattern.levenshtein("kitten"), 0);
        assert_eq!(pattern.levenshtein("sitten"), 1); // k->s
        assert_eq!(pattern.levenshtein("sittin"), 2); // k->s, e->i
        assert_eq!(pattern.levenshtein("sitting"), 3); // k->s, e->i, +g
    }

    #[test]
    fn test_empty_strings() {
        let pattern = PatternMasks::new("test");
        assert_eq!(pattern.levenshtein(""), 4);

        let empty = PatternMasks::new("");
        assert_eq!(empty.levenshtein("test"), 4);
        assert_eq!(empty.levenshtein(""), 0);
    }
}