threatflux_string_analysis/
categorizer.rs

1//! String categorization functionality
2
3use crate::types::AnalysisResult;
4use once_cell::sync::Lazy;
5use regex::Regex;
6use serde::{Deserialize, Serialize};
7
// Type aliases to reduce complexity
/// Boxed predicate that decides whether a candidate string matches a rule.
///
/// The `Send + Sync` bounds keep any type that stores these predicates
/// compatible with the `Send + Sync` supertraits of `Categorizer`.
type MatcherFn = Box<dyn Fn(&str) -> bool + Send + Sync>;
10
11// Pre-compiled regex patterns for performance
12static IPV4_REGEX: Lazy<Regex> =
13    Lazy::new(|| Regex::new(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").unwrap());
14
15static IPV6_REGEX: Lazy<Regex> =
16    Lazy::new(|| Regex::new(r"^([0-9a-fA-F]{1,4}:){1,7}[0-9a-fA-F]{1,4}$|^::1$|^::$").unwrap());
17
18static EMAIL_REGEX: Lazy<Regex> =
19    Lazy::new(|| Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap());
20
/// Represents a category that strings can belong to
///
/// Categories form a shallow hierarchy through `parent` (for example the
/// built-in `url` category has the parent `network`).
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct StringCategory {
    /// Name of the category (e.g. `"url"`, `"path"`, `"generic"`)
    pub name: String,
    /// Parent category (for hierarchical categorization); `None` for a
    /// top-level category such as the `generic` fallback
    pub parent: Option<String>,
    /// Description of what this category represents
    pub description: String,
}
31
/// Rule for categorizing strings
///
/// A rule pairs a predicate with the category to report when the predicate
/// returns `true` for a string.
pub struct CategoryRule {
    /// Name of the rule; `DefaultCategorizer::remove_rule` removes rules by
    /// this name
    pub name: String,
    /// Function that determines if a string matches this rule
    pub matcher: MatcherFn,
    /// Category to assign if the rule matches
    pub category: StringCategory,
    /// Priority (higher priority rules are evaluated first)
    pub priority: i32,
}
43
/// Trait for categorizing strings
///
/// Implementors must be `Send + Sync`.
pub trait Categorizer: Send + Sync {
    /// Categorize a string, returning the categories assigned to `value`
    fn categorize(&self, value: &str) -> Vec<StringCategory>;

    /// Add a categorization rule
    ///
    /// Returns an `AnalysisResult` so that implementations can report a
    /// failure to register the rule.
    fn add_rule(&mut self, rule: CategoryRule) -> AnalysisResult<()>;

    /// Remove a rule by name
    ///
    /// Returns an `AnalysisResult` so that implementations can report a
    /// failure to remove the rule.
    fn remove_rule(&mut self, name: &str) -> AnalysisResult<()>;

    /// Get all categories
    fn get_categories(&self) -> Vec<StringCategory>;
}
58
/// Default categorizer implementation
///
/// Holds a list of `CategoryRule`s and reports the category of every rule
/// whose matcher accepts the input string.
pub struct DefaultCategorizer {
    // Re-sorted by descending priority whenever rules are added.
    rules: Vec<CategoryRule>,
}
63
64impl DefaultCategorizer {
65    /// Create a new categorizer with default rules
66    pub fn new() -> Self {
67        let mut categorizer = Self { rules: Vec::new() };
68
69        // Add default rules
70        categorizer.add_default_rules();
71
72        categorizer
73    }
74
75    /// Create an empty categorizer
76    #[allow(dead_code)]
77    pub fn empty() -> Self {
78        Self { rules: Vec::new() }
79    }
80
81    fn add_default_rules(&mut self) {
82        // URL categorization
83        self.rules.push(CategoryRule {
84            name: "url_rule".to_string(),
85            matcher: Box::new(|s| {
86                s.starts_with("http://") || s.starts_with("https://") || s.starts_with("ftp://")
87            }),
88            category: StringCategory {
89                name: "url".to_string(),
90                parent: Some("network".to_string()),
91                description: "URL or web address".to_string(),
92            },
93            priority: 100,
94        });
95
96        // File path categorization
97        self.rules.push(CategoryRule {
98            name: "path_rule".to_string(),
99            matcher: Box::new(|s| {
100                (s.contains('/') || s.contains('\\'))
101                    && (s.starts_with("/") || s.starts_with("\\") || s.contains(":\\"))
102            }),
103            category: StringCategory {
104                name: "path".to_string(),
105                parent: Some("filesystem".to_string()),
106                description: "File system path".to_string(),
107            },
108            priority: 90,
109        });
110
111        // Registry key categorization
112        self.rules.push(CategoryRule {
113            name: "registry_rule".to_string(),
114            matcher: Box::new(|s| s.starts_with("HKEY_") || s.contains("\\SOFTWARE\\")),
115            category: StringCategory {
116                name: "registry".to_string(),
117                parent: Some("windows".to_string()),
118                description: "Windows registry key".to_string(),
119            },
120            priority: 95,
121        });
122
123        // Library/DLL categorization
124        self.rules.push(CategoryRule {
125            name: "library_rule".to_string(),
126            matcher: Box::new(|s| {
127                s.ends_with(".dll") || s.ends_with(".so") || s.ends_with(".dylib") ||
128                s.contains(".so.") || // versioned shared libraries like libc.so.6
129                (s.ends_with(".dll") || s.contains("kernel32") || s.contains("ntdll"))
130            }),
131            category: StringCategory {
132                name: "library".to_string(),
133                parent: Some("binary".to_string()),
134                description: "Shared library or DLL".to_string(),
135            },
136            priority: 85,
137        });
138
139        // Command categorization
140        self.rules.push(CategoryRule {
141            name: "command_rule".to_string(),
142            matcher: Box::new(|s| {
143                s.contains("cmd")
144                    || s.contains("powershell")
145                    || s.contains("bash")
146                    || s.contains("/bin/")
147            }),
148            category: StringCategory {
149                name: "command".to_string(),
150                parent: Some("execution".to_string()),
151                description: "Command or shell-related string".to_string(),
152            },
153            priority: 80,
154        });
155
156        // IP address categorization (IPv4 and IPv6)
157        self.rules.push(CategoryRule {
158            name: "ip_rule".to_string(),
159            matcher: Box::new(|s| IPV4_REGEX.is_match(s) || IPV6_REGEX.is_match(s)),
160            category: StringCategory {
161                name: "ip_address".to_string(),
162                parent: Some("network".to_string()),
163                description: "IP address (IPv4 or IPv6)".to_string(),
164            },
165            priority: 95,
166        });
167
168        // Email categorization
169        self.rules.push(CategoryRule {
170            name: "email_rule".to_string(),
171            matcher: Box::new(|s| s.contains('@') && s.contains('.') && EMAIL_REGEX.is_match(s)),
172            category: StringCategory {
173                name: "email".to_string(),
174                parent: Some("contact".to_string()),
175                description: "Email address".to_string(),
176            },
177            priority: 85,
178        });
179
180        // API call categorization
181        self.rules.push(CategoryRule {
182            name: "api_call_rule".to_string(),
183            matcher: Box::new(|s| {
184                // Common Windows API calls
185                s.contains("CreateProcess") || s.contains("VirtualAlloc") || s.contains("WriteProcessMemory") ||
186                s.contains("GetProcAddress") || s.contains("LoadLibrary") || s.contains("OpenProcess") ||
187                // Unix/Linux API calls
188                s == "malloc" || s == "calloc" || s == "realloc" || s == "free" ||
189                s == "fork" || s == "exec" || s == "open" || s == "read" || s == "write" ||
190                // Common API patterns
191                s.ends_with("A") && s.len() > 5 && s.chars().any(|c| c.is_uppercase()) // Windows API naming pattern
192            }),
193            category: StringCategory {
194                name: "api_call".to_string(),
195                parent: Some("system".to_string()),
196                description: "System API call".to_string(),
197            },
198            priority: 90,
199        });
200
201        // Sort rules by priority (descending)
202        self.rules.sort_by(|a, b| b.priority.cmp(&a.priority));
203    }
204}
205
206impl Categorizer for DefaultCategorizer {
207    fn categorize(&self, value: &str) -> Vec<StringCategory> {
208        let mut categories = Vec::new();
209
210        for rule in &self.rules {
211            if (rule.matcher)(value) {
212                categories.push(rule.category.clone());
213            }
214        }
215
216        // If no specific category matched, return generic
217        if categories.is_empty() {
218            categories.push(StringCategory {
219                name: "generic".to_string(),
220                parent: None,
221                description: "Generic string".to_string(),
222            });
223        }
224
225        categories
226    }
227
228    fn add_rule(&mut self, rule: CategoryRule) -> AnalysisResult<()> {
229        self.rules.push(rule);
230        self.rules.sort_by(|a, b| b.priority.cmp(&a.priority));
231        Ok(())
232    }
233
234    fn remove_rule(&mut self, name: &str) -> AnalysisResult<()> {
235        self.rules.retain(|r| r.name != name);
236        Ok(())
237    }
238
239    fn get_categories(&self) -> Vec<StringCategory> {
240        let mut categories = Vec::new();
241        let mut seen = std::collections::HashSet::new();
242
243        for rule in &self.rules {
244            if seen.insert(rule.category.name.clone()) {
245                categories.push(rule.category.clone());
246            }
247        }
248
249        categories
250    }
251}
252
253impl Default for DefaultCategorizer {
254    fn default() -> Self {
255        Self::new()
256    }
257}