threatflux_string_analysis/
tracker.rs

1//! String tracking and analysis functionality
2
3use crate::analyzer::{DefaultStringAnalyzer, StringAnalyzer};
4use crate::categorizer::{Categorizer, DefaultCategorizer};
5use crate::patterns::{DefaultPatternProvider, PatternProvider};
6use anyhow::Result;
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::{HashMap, HashSet};
10use std::sync::{Arc, Mutex};
11
12// Type aliases to reduce complexity
13type StringCountVec = Vec<(String, usize)>;
14type StringScoreVec = Vec<(String, f64)>;
15type DateTimeRange = (DateTime<Utc>, DateTime<Utc>);
16type StringEntryMap = Arc<Mutex<HashMap<String, StringEntry>>>;
17type BoxedAnalyzer = Arc<Box<dyn StringAnalyzer>>;
18type BoxedCategorizer = Arc<Box<dyn Categorizer>>;
19
20/// Context in which a string was found
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub enum StringContext {
23    /// String found in file content
24    FileString {
25        /// Byte offset within the file where the string was found
26        offset: Option<usize>,
27    },
28    /// String found in import tables or dependencies
29    Import {
30        /// Name of the imported library or module
31        library: String,
32    },
33    /// String found in export tables or exported symbols
34    Export {
35        /// Name of the exported symbol or function
36        symbol: String,
37    },
38    /// String found in embedded resources
39    Resource {
40        /// Type of resource (icon, string table, etc.)
41        resource_type: String,
42    },
43    /// String found in file sections
44    Section {
45        /// Name of the section where the string was found
46        section_name: String,
47    },
48    /// String found in file metadata
49    Metadata {
50        /// Metadata field name where the string was found
51        field: String,
52    },
53    /// String representing a file system path
54    Path {
55        /// Type of path (absolute, relative, UNC, etc.)
56        path_type: String,
57    },
58    /// String representing a URL
59    Url {
60        /// URL protocol (http, https, ftp, etc.)
61        protocol: Option<String>,
62    },
63    /// String found in Windows registry context
64    Registry {
65        /// Registry hive name (HKLM, HKCU, etc.)
66        hive: Option<String>,
67    },
68    /// String found in command or script context
69    Command {
70        /// Type of command (shell, powershell, batch, etc.)
71        command_type: String,
72    },
73    /// String found in other contexts
74    Other {
75        /// Category description for the context
76        category: String,
77    },
78}
79
80/// Record of a single string occurrence
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct StringOccurrence {
83    /// Path to the file where the string was found
84    pub file_path: String,
85    /// Hash of the file where the string was found
86    pub file_hash: String,
87    /// Name of the tool that discovered this string
88    pub tool_name: String,
89    /// Timestamp when the string was discovered
90    pub timestamp: DateTime<Utc>,
91    /// Context in which the string was found
92    pub context: StringContext,
93}
94
95/// Complete information about a tracked string
96#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct StringEntry {
98    /// The actual string value
99    pub value: String,
100    /// Timestamp when this string was first discovered
101    pub first_seen: DateTime<Utc>,
102    /// Timestamp when this string was last seen
103    pub last_seen: DateTime<Utc>,
104    /// Total number of times this string has been found
105    pub total_occurrences: usize,
106    /// Set of unique file paths where this string was found
107    pub unique_files: HashSet<String>,
108    /// Detailed records of each occurrence
109    pub occurrences: Vec<StringOccurrence>,
110    /// Set of categories this string belongs to
111    pub categories: HashSet<String>,
112    /// Whether this string is flagged as suspicious
113    pub is_suspicious: bool,
114    /// Shannon entropy score of the string
115    pub entropy: f64,
116}
117
118/// Statistics about tracked strings
119#[derive(Debug, Clone, Serialize, Deserialize)]
120pub struct StringStatistics {
121    /// Total number of unique strings tracked
122    pub total_unique_strings: usize,
123    /// Total number of string occurrences across all files
124    pub total_occurrences: usize,
125    /// Total number of files that have been analyzed
126    pub total_files_analyzed: usize,
127    /// Most frequently occurring strings with their occurrence counts
128    pub most_common: StringCountVec,
129    /// List of strings flagged as suspicious
130    pub suspicious_strings: Vec<String>,
131    /// Strings with high entropy scores and their entropy values
132    pub high_entropy_strings: StringScoreVec,
133    /// Distribution of strings across different categories
134    pub category_distribution: HashMap<String, usize>,
135    /// Distribution of strings by length ranges
136    pub length_distribution: HashMap<String, usize>,
137}
138
139/// Filter criteria for string queries
140#[derive(Debug, Clone, Default, Serialize, Deserialize)]
141pub struct StringFilter {
142    /// Minimum number of occurrences a string must have
143    pub min_occurrences: Option<usize>,
144    /// Maximum number of occurrences a string can have
145    pub max_occurrences: Option<usize>,
146    /// Minimum length of strings to include
147    pub min_length: Option<usize>,
148    /// Maximum length of strings to include
149    pub max_length: Option<usize>,
150    /// Filter by specific categories
151    pub categories: Option<Vec<String>>,
152    /// Filter by specific file paths
153    pub file_paths: Option<Vec<String>>,
154    /// Filter by specific file hashes
155    pub file_hashes: Option<Vec<String>>,
156    /// If true, only return suspicious strings
157    pub suspicious_only: Option<bool>,
158    /// Regular expression pattern to match string values
159    pub regex_pattern: Option<String>,
160    /// Minimum entropy score for strings
161    pub min_entropy: Option<f64>,
162    /// Maximum entropy score for strings
163    pub max_entropy: Option<f64>,
164    /// Date range filter for when strings were discovered
165    pub date_range: Option<DateTimeRange>,
166}
167
168/// Main string tracking system
169#[derive(Clone)]
170pub struct StringTracker {
171    entries: StringEntryMap,
172    analyzer: BoxedAnalyzer,
173    categorizer: BoxedCategorizer,
174    max_occurrences_per_string: usize,
175}
176
177impl Default for StringTracker {
178    fn default() -> Self {
179        Self::new()
180    }
181}
182
183impl StringTracker {
184    /// Create a new StringTracker with default analyzer and categorizer
185    pub fn new() -> Self {
186        let pattern_provider = DefaultPatternProvider::default();
187        let analyzer = DefaultStringAnalyzer::new().with_patterns(pattern_provider.get_patterns());
188
189        Self {
190            entries: Arc::new(Mutex::new(HashMap::new())),
191            analyzer: Arc::new(Box::new(analyzer)),
192            categorizer: Arc::new(Box::new(DefaultCategorizer::new())),
193            max_occurrences_per_string: 1000,
194        }
195    }
196
197    /// Create a StringTracker with custom analyzer and categorizer
198    pub fn with_components(
199        analyzer: Box<dyn StringAnalyzer>,
200        categorizer: Box<dyn Categorizer>,
201    ) -> Self {
202        Self {
203            entries: Arc::new(Mutex::new(HashMap::new())),
204            analyzer: Arc::new(analyzer),
205            categorizer: Arc::new(categorizer),
206            max_occurrences_per_string: 1000,
207        }
208    }
209
210    /// Set the maximum number of occurrences to track per string
211    pub fn with_max_occurrences(mut self, max: usize) -> Self {
212        self.max_occurrences_per_string = max;
213        self
214    }
215
216    /// Track a string occurrence
217    pub fn track_string(
218        &self,
219        value: &str,
220        file_path: &str,
221        file_hash: &str,
222        tool_name: &str,
223        context: StringContext,
224    ) -> Result<()> {
225        let mut entries = self.entries.lock().unwrap();
226
227        let occurrence = StringOccurrence {
228            file_path: file_path.to_string(),
229            file_hash: file_hash.to_string(),
230            tool_name: tool_name.to_string(),
231            timestamp: Utc::now(),
232            context: context.clone(),
233        };
234
235        // Get category from context
236        let context_category = match &context {
237            StringContext::FileString { .. } => "file_string",
238            StringContext::Import { .. } => "import",
239            StringContext::Export { .. } => "export",
240            StringContext::Resource { .. } => "resource",
241            StringContext::Section { .. } => "section",
242            StringContext::Metadata { .. } => "metadata",
243            StringContext::Path { .. } => "path",
244            StringContext::Url { .. } => "url",
245            StringContext::Registry { .. } => "registry",
246            StringContext::Command { .. } => "command",
247            StringContext::Other { category } => category,
248        };
249
250        let entry = entries.entry(value.to_string()).or_insert_with(|| {
251            let analysis = self.analyzer.analyze(value);
252            let categories = self.categorizer.categorize(value);
253
254            let mut category_set =
255                HashSet::with_capacity(categories.len() + analysis.categories.len() + 1);
256            category_set.insert(context_category.to_string());
257            for cat in categories {
258                category_set.insert(cat.name);
259            }
260            category_set.extend(analysis.categories);
261
262            let now = Utc::now();
263            StringEntry {
264                value: value.to_string(),
265                first_seen: now,
266                last_seen: now,
267                total_occurrences: 0,
268                unique_files: HashSet::new(),
269                occurrences: Vec::new(),
270                categories: category_set,
271                is_suspicious: analysis.is_suspicious,
272                entropy: analysis.entropy,
273            }
274        });
275
276        entry.last_seen = Utc::now();
277        entry.total_occurrences += 1;
278        entry.unique_files.insert(file_path.to_string());
279        entry.occurrences.push(occurrence);
280
281        // Limit occurrences per string to prevent memory explosion
282        if entry.occurrences.len() > self.max_occurrences_per_string {
283            entry.occurrences.remove(0);
284        }
285
286        Ok(())
287    }
288
289    /// Track multiple strings from results
290    pub fn track_strings_from_results(
291        &self,
292        strings: &[String],
293        file_path: &str,
294        file_hash: &str,
295        tool_name: &str,
296    ) -> Result<()> {
297        for string in strings {
298            // Categorize the string using the categorizer
299            let categories = self.categorizer.categorize(string);
300
301            // Determine context based on categories
302            let context = if categories.iter().any(|c| c.name == "url") {
303                let protocol = string.split("://").next().map(|p| p.to_string());
304                StringContext::Url { protocol }
305            } else if categories.iter().any(|c| c.name == "path") {
306                let path_type = if string.contains("\\Windows") || string.contains("/usr") {
307                    "system"
308                } else if string.contains("\\Temp") || string.contains("/tmp") {
309                    "temp"
310                } else {
311                    "general"
312                };
313                StringContext::Path {
314                    path_type: path_type.to_string(),
315                }
316            } else if categories.iter().any(|c| c.name == "registry") {
317                let hive = string.split('\\').next().map(|h| h.to_string());
318                StringContext::Registry { hive }
319            } else if categories.iter().any(|c| c.name == "library") {
320                StringContext::Import {
321                    library: string.to_string(),
322                }
323            } else if categories.iter().any(|c| c.name == "command") {
324                StringContext::Command {
325                    command_type: "shell".to_string(),
326                }
327            } else {
328                StringContext::FileString { offset: None }
329            };
330
331            self.track_string(string, file_path, file_hash, tool_name, context)?;
332        }
333        Ok(())
334    }
335
336    /// Get statistics about tracked strings
337    pub fn get_statistics(&self, filter: Option<&StringFilter>) -> StringStatistics {
338        let entries = self.entries.lock().unwrap();
339
340        let filtered_entries: Vec<_> = entries
341            .values()
342            .filter(|entry| self.matches_filter(entry, filter))
343            .collect();
344
345        let total_unique_strings = filtered_entries.len();
346        let total_occurrences: usize = filtered_entries.iter().map(|e| e.total_occurrences).sum();
347
348        let total_files_analyzed: HashSet<_> = filtered_entries
349            .iter()
350            .flat_map(|e| e.unique_files.iter())
351            .collect();
352
353        // Most common strings
354        let mut most_common: Vec<_> = filtered_entries
355            .iter()
356            .map(|e| (e.value.clone(), e.total_occurrences))
357            .collect();
358        most_common.sort_by(|a, b| b.1.cmp(&a.1));
359        most_common.truncate(100);
360
361        // Suspicious strings
362        let suspicious_strings: Vec<_> = filtered_entries
363            .iter()
364            .filter(|e| e.is_suspicious)
365            .map(|e| e.value.clone())
366            .take(50)
367            .collect();
368
369        // High entropy strings
370        let mut high_entropy_strings: Vec<_> = filtered_entries
371            .iter()
372            .filter(|e| e.entropy > 4.0)
373            .map(|e| (e.value.clone(), e.entropy))
374            .collect();
375        high_entropy_strings.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
376        high_entropy_strings.truncate(50);
377
378        // Category distribution
379        let mut category_distribution = HashMap::new();
380        for entry in &filtered_entries {
381            for category in &entry.categories {
382                *category_distribution.entry(category.clone()).or_insert(0) += 1;
383            }
384        }
385
386        // Length distribution
387        let mut length_distribution = HashMap::new();
388        for entry in &filtered_entries {
389            let len_bucket = match entry.value.len() {
390                0..=10 => "0-10",
391                11..=20 => "11-20",
392                21..=50 => "21-50",
393                51..=100 => "51-100",
394                101..=200 => "101-200",
395                _ => "200+",
396            };
397            *length_distribution
398                .entry(len_bucket.to_string())
399                .or_insert(0) += 1;
400        }
401
402        StringStatistics {
403            total_unique_strings,
404            total_occurrences,
405            total_files_analyzed: total_files_analyzed.len(),
406            most_common,
407            suspicious_strings,
408            high_entropy_strings,
409            category_distribution,
410            length_distribution,
411        }
412    }
413
414    fn matches_filter(&self, entry: &StringEntry, filter: Option<&StringFilter>) -> bool {
415        let Some(f) = filter else {
416            return true;
417        };
418
419        if let Some(min) = f.min_occurrences {
420            if entry.total_occurrences < min {
421                return false;
422            }
423        }
424
425        if let Some(max) = f.max_occurrences {
426            if entry.total_occurrences > max {
427                return false;
428            }
429        }
430
431        if let Some(min) = f.min_length {
432            if entry.value.len() < min {
433                return false;
434            }
435        }
436
437        if let Some(max) = f.max_length {
438            if entry.value.len() > max {
439                return false;
440            }
441        }
442
443        if let Some(ref categories) = f.categories {
444            if !categories.iter().any(|c| entry.categories.contains(c)) {
445                return false;
446            }
447        }
448
449        if let Some(ref file_hashes) = f.file_hashes {
450            if !file_hashes.iter().any(|h| entry.unique_files.contains(h)) {
451                return false;
452            }
453        }
454
455        if let Some(suspicious_only) = f.suspicious_only {
456            if suspicious_only && !entry.is_suspicious {
457                return false;
458            }
459        }
460
461        if let Some(ref pattern) = f.regex_pattern {
462            if let Ok(re) = regex::Regex::new(pattern) {
463                if !re.is_match(&entry.value) {
464                    return false;
465                }
466            }
467        }
468
469        if let Some(min_entropy) = f.min_entropy {
470            if entry.entropy < min_entropy {
471                return false;
472            }
473        }
474
475        if let Some(max_entropy) = f.max_entropy {
476            if entry.entropy > max_entropy {
477                return false;
478            }
479        }
480
481        true
482    }
483
484    /// Get detailed information about a specific string
485    pub fn get_string_details(&self, value: &str) -> Option<StringEntry> {
486        let entries = self.entries.lock().unwrap();
487        entries.get(value).cloned()
488    }
489
490    /// Search for strings matching a query
491    pub fn search_strings(&self, query: &str, limit: usize) -> Vec<StringEntry> {
492        // Return empty results for empty queries
493        if query.trim().is_empty() {
494            return Vec::new();
495        }
496
497        let entries = self.entries.lock().unwrap();
498        let query_lower = query.to_lowercase();
499
500        let mut results: Vec<_> = entries
501            .values()
502            .filter(|e| e.value.to_lowercase().contains(&query_lower))
503            .cloned()
504            .collect();
505
506        results.sort_by(|a, b| b.total_occurrences.cmp(&a.total_occurrences));
507        results.truncate(limit);
508        results
509    }
510
511    /// Get strings related to a given string
512    pub fn get_related_strings(&self, value: &str, limit: usize) -> StringScoreVec {
513        let entries = self.entries.lock().unwrap();
514
515        let Some(target_entry) = entries.get(value) else {
516            return vec![];
517        };
518
519        let mut similarities: Vec<_> = entries
520            .iter()
521            .filter(|(k, _)| *k != value)
522            .map(|(k, v)| {
523                let similarity = self.calculate_similarity(target_entry, v);
524                (k.clone(), similarity)
525            })
526            .filter(|(_, sim)| *sim > 0.3)
527            .collect();
528
529        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
530        similarities.truncate(limit);
531        similarities
532    }
533
534    fn calculate_similarity(&self, a: &StringEntry, b: &StringEntry) -> f64 {
535        let mut score = 0.0;
536        let mut factors = 0.0;
537
538        // Shared files
539        let shared_files: HashSet<_> = a.unique_files.intersection(&b.unique_files).collect();
540        if !shared_files.is_empty() {
541            score +=
542                shared_files.len() as f64 / a.unique_files.len().min(b.unique_files.len()) as f64;
543            factors += 1.0;
544        }
545
546        // Shared categories
547        let shared_categories: HashSet<_> = a.categories.intersection(&b.categories).collect();
548        if !shared_categories.is_empty() {
549            score +=
550                shared_categories.len() as f64 / a.categories.len().min(b.categories.len()) as f64;
551            factors += 1.0;
552        }
553
554        // Similar entropy
555        let entropy_diff = (a.entropy - b.entropy).abs();
556        if entropy_diff < 0.5 {
557            score += 1.0 - (entropy_diff / 0.5);
558            factors += 1.0;
559        }
560
561        // Similar length
562        let len_a = a.value.len() as f64;
563        let len_b = b.value.len() as f64;
564        let len_ratio = len_a.min(len_b) / len_a.max(len_b);
565        score += len_ratio;
566        factors += 1.0;
567
568        if factors > 0.0 { score / factors } else { 0.0 }
569    }
570
571    /// Clear all tracked strings
572    #[allow(dead_code)]
573    pub fn clear(&self) {
574        let mut entries = self.entries.lock().unwrap();
575        entries.clear();
576    }
577}