use crate::analyzer::{DefaultStringAnalyzer, StringAnalyzer};
use crate::categorizer::{Categorizer, DefaultCategorizer};
use crate::patterns::{DefaultPatternProvider, PatternProvider};
use anyhow::Result;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};

/// `(string, occurrence count)` pairs for ranked listings.
type StringCountVec = Vec<(String, usize)>;
/// `(string, score)` pairs for entropy and similarity rankings.
type StringScoreVec = Vec<(String, f64)>;
/// Inclusive `(start, end)` bounds for date filtering.
type DateTimeRange = (DateTime<Utc>, DateTime<Utc>);
/// Shared, mutex-guarded map from string value to its tracked entry.
type StringEntryMap = Arc<Mutex<HashMap<String, StringEntry>>>;
type BoxedAnalyzer = Arc<Box<dyn StringAnalyzer>>;
type BoxedCategorizer = Arc<Box<dyn Categorizer>>;

/// Where a tracked string was observed inside a file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum StringContext {
    /// Raw string extracted from file content, with an optional byte offset.
    FileString {
        offset: Option<usize>,
    },
    /// Imported symbol; `library` names the providing library.
    Import {
        library: String,
    },
    /// Exported symbol.
    Export {
        symbol: String,
    },
    /// Embedded resource of the given type.
    Resource {
        resource_type: String,
    },
    /// Binary section name.
    Section {
        section_name: String,
    },
    /// Metadata field value.
    Metadata {
        field: String,
    },
    /// Filesystem path, classified as e.g. "system", "temp", or "general".
    Path {
        path_type: String,
    },
    /// URL, with the scheme captured when present.
    Url {
        protocol: Option<String>,
    },
    /// Windows registry key, with the hive captured when present.
    Registry {
        hive: Option<String>,
    },
    /// Command line or shell invocation.
    Command {
        command_type: String,
    },
    /// Anything else, tagged with a free-form category.
    Other {
        category: String,
    },
}
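
// Construction sketch for contexts (all values illustrative):
//
//     let url = StringContext::Url { protocol: Some("https".to_string()) };
//     let reg = StringContext::Registry { hive: Some("HKEY_LOCAL_MACHINE".to_string()) };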

/// A single observation of a string: which file and tool produced it, when,
/// and in what context.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StringOccurrence {
    pub file_path: String,
    pub file_hash: String,
    pub tool_name: String,
    pub timestamp: DateTime<Utc>,
    pub context: StringContext,
}

/// Aggregated record for one unique string value across all analyzed files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StringEntry {
    pub value: String,
    pub first_seen: DateTime<Utc>,
    pub last_seen: DateTime<Utc>,
    pub total_occurrences: usize,
    /// Paths of the files this string was seen in.
    pub unique_files: HashSet<String>,
    /// Recent occurrences, capped at `max_occurrences_per_string`.
    pub occurrences: Vec<StringOccurrence>,
    pub categories: HashSet<String>,
    pub is_suspicious: bool,
    pub entropy: f64,
}
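
// Entries derive Serialize/Deserialize, so they can be exported directly,
// e.g. as JSON (sketch; assumes serde_json is available as a dependency):
//
//     let json = serde_json::to_string_pretty(&entry)?;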

/// Summary statistics over all tracked strings matching a filter.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StringStatistics {
    pub total_unique_strings: usize,
    pub total_occurrences: usize,
    pub total_files_analyzed: usize,
    /// Top strings by occurrence count (at most 100).
    pub most_common: StringCountVec,
    /// Strings flagged as suspicious (at most 50).
    pub suspicious_strings: Vec<String>,
    /// Strings with entropy above 4.0, highest first (at most 50).
    pub high_entropy_strings: StringScoreVec,
    pub category_distribution: HashMap<String, usize>,
    /// Counts bucketed by string length, e.g. "0-10", "11-20".
    pub length_distribution: HashMap<String, usize>,
}

/// Criteria for narrowing queries; `None` fields are ignored.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StringFilter {
    pub min_occurrences: Option<usize>,
    pub max_occurrences: Option<usize>,
    pub min_length: Option<usize>,
    pub max_length: Option<usize>,
    pub categories: Option<Vec<String>>,
    pub file_paths: Option<Vec<String>>,
    pub file_hashes: Option<Vec<String>>,
    pub suspicious_only: Option<bool>,
    pub regex_pattern: Option<String>,
    pub min_entropy: Option<f64>,
    pub max_entropy: Option<f64>,
    pub date_range: Option<DateTimeRange>,
}
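
// Filters are typically built from `Default` with a few overrides
// (field values here are illustrative):
//
//     let filter = StringFilter {
//         min_occurrences: Some(5),
//         suspicious_only: Some(true),
//         ..Default::default()
//     };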

/// Thread-safe tracker that aggregates strings reported by analysis tools.
/// Cloning is cheap: clones share the same underlying entry map.
#[derive(Clone)]
pub struct StringTracker {
    entries: StringEntryMap,
    analyzer: BoxedAnalyzer,
    categorizer: BoxedCategorizer,
    max_occurrences_per_string: usize,
}

impl Default for StringTracker {
    fn default() -> Self {
        Self::new()
    }
}

impl StringTracker {
    /// Create a tracker with the default analyzer, patterns, and categorizer.
    pub fn new() -> Self {
        let pattern_provider = DefaultPatternProvider::default();
        let analyzer = DefaultStringAnalyzer::new().with_patterns(pattern_provider.get_patterns());

        Self {
            entries: Arc::new(Mutex::new(HashMap::new())),
            analyzer: Arc::new(Box::new(analyzer)),
            categorizer: Arc::new(Box::new(DefaultCategorizer::new())),
            max_occurrences_per_string: 1000,
        }
    }

    /// Create a tracker with custom analyzer and categorizer implementations.
    pub fn with_components(
        analyzer: Box<dyn StringAnalyzer>,
        categorizer: Box<dyn Categorizer>,
    ) -> Self {
        Self {
            entries: Arc::new(Mutex::new(HashMap::new())),
            analyzer: Arc::new(analyzer),
            categorizer: Arc::new(categorizer),
            max_occurrences_per_string: 1000,
        }
    }

    /// Cap the number of occurrence records retained per string.
    pub fn with_max_occurrences(mut self, max: usize) -> Self {
        self.max_occurrences_per_string = max;
        self
    }
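
    // Construction sketch: defaults, then a tighter occurrence cap
    // (the cap value is illustrative):
    //
    //     let tracker = StringTracker::new().with_max_occurrences(500);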

    /// Record one occurrence of `value` observed in `file_path` by `tool_name`.
    pub fn track_string(
        &self,
        value: &str,
        file_path: &str,
        file_hash: &str,
        tool_name: &str,
        context: StringContext,
    ) -> Result<()> {
        let mut entries = self.entries.lock().unwrap();

        let occurrence = StringOccurrence {
            file_path: file_path.to_string(),
            file_hash: file_hash.to_string(),
            tool_name: tool_name.to_string(),
            timestamp: Utc::now(),
            context: context.clone(),
        };

        // Map each context variant to a coarse category label.
        let context_category = match &context {
            StringContext::FileString { .. } => "file_string",
            StringContext::Import { .. } => "import",
            StringContext::Export { .. } => "export",
            StringContext::Resource { .. } => "resource",
            StringContext::Section { .. } => "section",
            StringContext::Metadata { .. } => "metadata",
            StringContext::Path { .. } => "path",
            StringContext::Url { .. } => "url",
            StringContext::Registry { .. } => "registry",
            StringContext::Command { .. } => "command",
            StringContext::Other { category } => category,
        };

        let entry = entries.entry(value.to_string()).or_insert_with(|| {
            // First sighting: analyze and categorize once, then cache the result.
            let analysis = self.analyzer.analyze(value);
            let categories = self.categorizer.categorize(value);

            let mut category_set =
                HashSet::with_capacity(categories.len() + analysis.categories.len() + 1);
            category_set.insert(context_category.to_string());
            for cat in categories {
                category_set.insert(cat.name);
            }
            category_set.extend(analysis.categories);

            let now = Utc::now();
            StringEntry {
                value: value.to_string(),
                first_seen: now,
                last_seen: now,
                total_occurrences: 0,
                unique_files: HashSet::new(),
                occurrences: Vec::new(),
                categories: category_set,
                is_suspicious: analysis.is_suspicious,
                entropy: analysis.entropy,
            }
        });

        // Record the context category on every sighting, not just the first;
        // later occurrences of the same string may arrive in a different context.
        entry.categories.insert(context_category.to_string());

        entry.last_seen = Utc::now();
        entry.total_occurrences += 1;
        entry.unique_files.insert(file_path.to_string());
        entry.occurrences.push(occurrence);

        // Keep only the most recent `max_occurrences_per_string` records.
        if entry.occurrences.len() > self.max_occurrences_per_string {
            entry.occurrences.remove(0);
        }

        Ok(())
    }
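
    // Usage sketch (string value, path, hash, and tool name are illustrative):
    //
    //     tracker.track_string(
    //         "https://example.com/beacon",
    //         "/samples/a.exe",
    //         "abc123",
    //         "strings",
    //         StringContext::Url { protocol: Some("https".to_string()) },
    //     )?;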

    /// Track a batch of extracted strings, inferring a context for each from
    /// its categories.
    pub fn track_strings_from_results(
        &self,
        strings: &[String],
        file_path: &str,
        file_hash: &str,
        tool_name: &str,
    ) -> Result<()> {
        for string in strings {
            let categories = self.categorizer.categorize(string);

            let context = if categories.iter().any(|c| c.name == "url") {
                // Capture the scheme only when a "://" separator is present.
                let protocol = string.split_once("://").map(|(scheme, _)| scheme.to_string());
                StringContext::Url { protocol }
            } else if categories.iter().any(|c| c.name == "path") {
                let path_type = if string.contains("\\Windows") || string.contains("/usr") {
                    "system"
                } else if string.contains("\\Temp") || string.contains("/tmp") {
                    "temp"
                } else {
                    "general"
                };
                StringContext::Path {
                    path_type: path_type.to_string(),
                }
            } else if categories.iter().any(|c| c.name == "registry") {
                // The hive is the first path component, e.g. "HKEY_LOCAL_MACHINE".
                let hive = string.split('\\').next().map(|h| h.to_string());
                StringContext::Registry { hive }
            } else if categories.iter().any(|c| c.name == "library") {
                StringContext::Import {
                    library: string.to_string(),
                }
            } else if categories.iter().any(|c| c.name == "command") {
                StringContext::Command {
                    command_type: "shell".to_string(),
                }
            } else {
                StringContext::FileString { offset: None }
            };

            self.track_string(string, file_path, file_hash, tool_name, context)?;
        }
        Ok(())
    }
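
    // Batch sketch (inputs illustrative): each string gets a context inferred
    // from its categories, then flows through `track_string`:
    //
    //     let strings = vec!["https://example.com".to_string(), "cmd.exe /c dir".to_string()];
    //     tracker.track_strings_from_results(&strings, "/samples/a.exe", "abc123", "strings")?;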

    /// Compute summary statistics over entries matching `filter`
    /// (or over all entries when `filter` is `None`).
    pub fn get_statistics(&self, filter: Option<&StringFilter>) -> StringStatistics {
        let entries = self.entries.lock().unwrap();

        let filtered_entries: Vec<_> = entries
            .values()
            .filter(|entry| self.matches_filter(entry, filter))
            .collect();

        let total_unique_strings = filtered_entries.len();
        let total_occurrences: usize = filtered_entries.iter().map(|e| e.total_occurrences).sum();

        let total_files_analyzed: HashSet<_> = filtered_entries
            .iter()
            .flat_map(|e| e.unique_files.iter())
            .collect();

        // Top 100 strings by occurrence count.
        let mut most_common: Vec<_> = filtered_entries
            .iter()
            .map(|e| (e.value.clone(), e.total_occurrences))
            .collect();
        most_common.sort_by(|a, b| b.1.cmp(&a.1));
        most_common.truncate(100);

        // Up to 50 suspicious strings, in arbitrary map order.
        let suspicious_strings: Vec<_> = filtered_entries
            .iter()
            .filter(|e| e.is_suspicious)
            .map(|e| e.value.clone())
            .take(50)
            .collect();

        // Top 50 strings with entropy above 4.0, highest first.
        let mut high_entropy_strings: Vec<_> = filtered_entries
            .iter()
            .filter(|e| e.entropy > 4.0)
            .map(|e| (e.value.clone(), e.entropy))
            .collect();
        high_entropy_strings.sort_by(|a, b| b.1.total_cmp(&a.1));
        high_entropy_strings.truncate(50);

        let mut category_distribution = HashMap::new();
        for entry in &filtered_entries {
            for category in &entry.categories {
                *category_distribution.entry(category.clone()).or_insert(0) += 1;
            }
        }

        let mut length_distribution = HashMap::new();
        for entry in &filtered_entries {
            let len_bucket = match entry.value.len() {
                0..=10 => "0-10",
                11..=20 => "11-20",
                21..=50 => "21-50",
                51..=100 => "51-100",
                101..=200 => "101-200",
                _ => "200+",
            };
            *length_distribution
                .entry(len_bucket.to_string())
                .or_insert(0) += 1;
        }

        StringStatistics {
            total_unique_strings,
            total_occurrences,
            total_files_analyzed: total_files_analyzed.len(),
            most_common,
            suspicious_strings,
            high_entropy_strings,
            category_distribution,
            length_distribution,
        }
    }
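
    // Statistics sketch (thresholds illustrative): restrict to suspicious
    // strings seen at least twice, then inspect the distributions:
    //
    //     let filter = StringFilter {
    //         min_occurrences: Some(2),
    //         suspicious_only: Some(true),
    //         ..Default::default()
    //     };
    //     let stats = tracker.get_statistics(Some(&filter));
    //     println!("{} unique strings", stats.total_unique_strings);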

    /// Return true when `entry` satisfies every populated field of `filter`.
    fn matches_filter(&self, entry: &StringEntry, filter: Option<&StringFilter>) -> bool {
        let Some(f) = filter else {
            return true;
        };

        if let Some(min) = f.min_occurrences {
            if entry.total_occurrences < min {
                return false;
            }
        }

        if let Some(max) = f.max_occurrences {
            if entry.total_occurrences > max {
                return false;
            }
        }

        if let Some(min) = f.min_length {
            if entry.value.len() < min {
                return false;
            }
        }

        if let Some(max) = f.max_length {
            if entry.value.len() > max {
                return false;
            }
        }

        if let Some(ref categories) = f.categories {
            if !categories.iter().any(|c| entry.categories.contains(c)) {
                return false;
            }
        }

        if let Some(ref file_paths) = f.file_paths {
            if !file_paths.iter().any(|p| entry.unique_files.contains(p)) {
                return false;
            }
        }

        if let Some(ref file_hashes) = f.file_hashes {
            // `unique_files` holds paths, so hash filters must consult the
            // recorded occurrences, which carry the file hash.
            if !entry
                .occurrences
                .iter()
                .any(|o| file_hashes.contains(&o.file_hash))
            {
                return false;
            }
        }

        if let Some(suspicious_only) = f.suspicious_only {
            if suspicious_only && !entry.is_suspicious {
                return false;
            }
        }

        // Invalid regex patterns are ignored rather than treated as errors.
        if let Some(ref pattern) = f.regex_pattern {
            if let Ok(re) = regex::Regex::new(pattern) {
                if !re.is_match(&entry.value) {
                    return false;
                }
            }
        }

        if let Some(min_entropy) = f.min_entropy {
            if entry.entropy < min_entropy {
                return false;
            }
        }

        if let Some(max_entropy) = f.max_entropy {
            if entry.entropy > max_entropy {
                return false;
            }
        }

        // Keep entries whose observation window overlaps the requested range.
        if let Some((start, end)) = f.date_range {
            if entry.last_seen < start || entry.first_seen > end {
                return false;
            }
        }

        true
    }

    /// Look up the full entry for an exact string value.
    pub fn get_string_details(&self, value: &str) -> Option<StringEntry> {
        let entries = self.entries.lock().unwrap();
        entries.get(value).cloned()
    }

    /// Case-insensitive substring search, ranked by occurrence count.
    pub fn search_strings(&self, query: &str, limit: usize) -> Vec<StringEntry> {
        if query.trim().is_empty() {
            return Vec::new();
        }

        let entries = self.entries.lock().unwrap();
        let query_lower = query.to_lowercase();

        let mut results: Vec<_> = entries
            .values()
            .filter(|e| e.value.to_lowercase().contains(&query_lower))
            .cloned()
            .collect();

        results.sort_by(|a, b| b.total_occurrences.cmp(&a.total_occurrences));
        results.truncate(limit);
        results
    }
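
    // Search sketch (query and limit illustrative):
    //
    //     for entry in tracker.search_strings("http", 10) {
    //         println!("{} ({} occurrences)", entry.value, entry.total_occurrences);
    //     }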

    /// Find strings similar to `value`, scored in `[0, 1]` and sorted
    /// highest first; only scores above 0.3 are returned.
    pub fn get_related_strings(&self, value: &str, limit: usize) -> StringScoreVec {
        let entries = self.entries.lock().unwrap();

        let Some(target_entry) = entries.get(value) else {
            return vec![];
        };

        let mut similarities: Vec<_> = entries
            .iter()
            .filter(|(k, _)| *k != value)
            .map(|(k, v)| {
                let similarity = self.calculate_similarity(target_entry, v);
                (k.clone(), similarity)
            })
            .filter(|(_, sim)| *sim > 0.3)
            .collect();

        similarities.sort_by(|a, b| b.1.total_cmp(&a.1));
        similarities.truncate(limit);
        similarities
    }
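
    // Related-strings sketch (value and limit illustrative):
    //
    //     for (other, score) in tracker.get_related_strings("https://example.com", 5) {
    //         println!("{other} (similarity {score:.2})");
    //     }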

    /// Average up to four similarity factors, each in `[0, 1]`: shared files,
    /// shared categories, entropy proximity, and length ratio.
    fn calculate_similarity(&self, a: &StringEntry, b: &StringEntry) -> f64 {
        let mut score = 0.0;
        let mut factors = 0.0;

        // Fraction of the smaller file set that both strings share.
        let shared_files: HashSet<_> = a.unique_files.intersection(&b.unique_files).collect();
        if !shared_files.is_empty() {
            score +=
                shared_files.len() as f64 / a.unique_files.len().min(b.unique_files.len()) as f64;
            factors += 1.0;
        }

        // Fraction of the smaller category set that both strings share.
        let shared_categories: HashSet<_> = a.categories.intersection(&b.categories).collect();
        if !shared_categories.is_empty() {
            score +=
                shared_categories.len() as f64 / a.categories.len().min(b.categories.len()) as f64;
            factors += 1.0;
        }

        // Entropy proximity only counts when the gap is under 0.5.
        let entropy_diff = (a.entropy - b.entropy).abs();
        if entropy_diff < 0.5 {
            score += 1.0 - (entropy_diff / 0.5);
            factors += 1.0;
        }

        // Length ratio; guard against NaN when both strings are empty.
        let len_a = a.value.len() as f64;
        let len_b = b.value.len() as f64;
        let len_ratio = if len_a.max(len_b) > 0.0 {
            len_a.min(len_b) / len_a.max(len_b)
        } else {
            1.0
        };
        score += len_ratio;
        factors += 1.0;

        if factors > 0.0 { score / factors } else { 0.0 }
    }
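
    // Worked example (numbers illustrative): sharing 2 files with
    // min(|A|, |B|) = 2 gives 2/2 = 1.0; identical categories give 1.0;
    // an entropy gap of 0.25 gives 1.0 - 0.25/0.5 = 0.5; lengths 10 and 20
    // give 10/20 = 0.5. Average over 4 factors: (1.0 + 1.0 + 0.5 + 0.5) / 4 = 0.75.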

    #[allow(dead_code)]
    pub fn clear(&self) {
        let mut entries = self.entries.lock().unwrap();
        entries.clear();
    }
}