Skip to main content

mir_extractor/rules/
advanced_utils.rs

1//! Shared utilities for advanced dataflow-based rules.
2//!
3//! These utilities are used by the migrated rules from mir-advanced-rules
4//! for MIR text parsing and taint tracking.
5
6use once_cell::sync::Lazy;
7use regex::Regex;
8use std::collections::HashSet;
9
10/// Detect variable assignment in MIR line (e.g., `_1 = ...`)
11pub fn detect_assignment(line: &str) -> Option<String> {
12    static RE_ASSIGN: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(_\d+)\s*=").expect("assign regex"));
13
14    if let Some(caps) = RE_ASSIGN.captures(line) {
15        return Some(caps[1].to_string());
16    }
17
18    if line.starts_with("(*_") {
19        if let Some(end) = line.find(')') {
20            return Some(line[2..end].to_string());
21        }
22    }
23
24    None
25}
26
27/// Extract call arguments (move/copy _N) from a MIR line
28pub fn extract_call_args(line: &str) -> Vec<String> {
29    static RE_ARG: Lazy<Regex> =
30        Lazy::new(|| Regex::new(r"(?:copy|move)\s+(_\d+)").expect("arg regex"));
31
32    RE_ARG
33        .captures_iter(line)
34        .map(|caps| caps[1].to_string())
35        .collect()
36}
37
38/// Detect length call pattern (e.g., `_1 = slice::len(move _2)`)
39pub fn detect_len_call(line: &str) -> Option<(String, String)> {
40    static RE_LEN: Lazy<Regex> =
41        Lazy::new(|| Regex::new(r"^(_\d+)\s*=.*::len\((?:move|copy)\s+(_\d+)").expect("len regex"));
42
43    RE_LEN
44        .captures(line)
45        .map(|caps| (caps[1].to_string(), caps[2].to_string()))
46}
47
48/// Detect length comparison pattern (e.g., `Gt(move _1, const 100)`)
49pub fn detect_len_comparison(line: &str) -> Option<String> {
50    static RE_LEN_CMP: Lazy<Regex> = Lazy::new(|| {
51        Regex::new(r"(?:Gt|Lt|Ge|Le)\((?:move|copy)\s+(_\d+),\s*const").expect("len cmp regex")
52    });
53
54    RE_LEN_CMP.captures(line).map(|caps| caps[1].to_string())
55}
56
/// Check if text contains a reference to the variable `var`.
///
/// An occurrence only counts when it stands on its own as a token: where `var` begins or ends
/// with an identifier character (ASCII letter, digit or `_`), the neighbouring character in
/// `text` on that side must not be an identifier character. This keeps `_1` from matching
/// inside `_10` or `_21`, while every usual MIR spelling of a local still matches:
/// `move _1`, `copy _1`, `_1.0`, `&_1`, `(*_1)`, `drop(_1)`, `_1 = ...`.
///
/// An empty `var` is reported as contained in any text.
pub fn contains_var(text: &str, var: &str) -> bool {
    fn is_ident_char(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '_'
    }

    if var.is_empty() {
        // Plain substring semantics: the empty string occurs in every text.
        return true;
    }

    // A boundary is only enforced on a side where the name itself ends in an identifier
    // character; a name such as `(*_1)` carries its own delimiters.
    let guard_before = var.starts_with(is_ident_char);
    let guard_after = var.ends_with(is_ident_char);

    text.match_indices(var).any(|(start, matched)| {
        let end = start + matched.len();
        let clean_before = !guard_before || !text[..start].ends_with(is_ident_char);
        let clean_after = !guard_after || !text[end..].starts_with(is_ident_char);
        clean_before && clean_after
    })
}
71
72/// Detect constant string assignment (e.g., `_1 = const "pattern"`)
73pub fn detect_const_string_assignment(line: &str) -> Option<(String, String)> {
74    static RE_CONST_STR: Lazy<Regex> = Lazy::new(|| {
75        Regex::new(r#"^(_\d+)\s*=\s*const\s*\"((?:\\.|[^\"])*)\""#).expect("const string regex")
76    });
77
78    RE_CONST_STR.captures(line).map(|caps| {
79        let var = caps[1].to_string();
80        let literal = caps[2].to_string();
81        (var, literal)
82    })
83}
84
85/// Detect variable alias assignment (e.g., `_1 = copy _2`)
86pub fn detect_var_alias(line: &str) -> Option<(String, String)> {
87    static RE_ALIAS: Lazy<Regex> =
88        Lazy::new(|| Regex::new(r"^(_\d+)\s*=\s*(?:copy|move)\s+(_\d+)").expect("alias regex"));
89
90    RE_ALIAS
91        .captures(line)
92        .map(|caps| (caps[1].to_string(), caps[2].to_string()))
93}
94
95/// Detect drop calls
96#[allow(dead_code)]
97pub fn detect_drop_calls(line: &str) -> Vec<String> {
98    static RE_DROP: Lazy<Regex> =
99        Lazy::new(|| Regex::new(r"drop\(\s*(?:move\s+)?(_\d+)\s*\)").expect("drop call regex"));
100
101    RE_DROP
102        .captures_iter(line)
103        .map(|caps| caps[1].to_string())
104        .collect()
105}
106
107/// Detect StorageDead statements
108#[allow(dead_code)]
109pub fn detect_storage_dead_vars(line: &str) -> Vec<String> {
110    static RE_DEAD: Lazy<Regex> =
111        Lazy::new(|| Regex::new(r"StorageDead\(\s*(_\d+)\s*\)").expect("storage dead regex"));
112
113    RE_DEAD
114        .captures_iter(line)
115        .map(|caps| caps[1].to_string())
116        .collect()
117}
118
119/// Extract constant string literals from a line
120pub fn extract_const_literals(line: &str) -> Vec<String> {
121    static RE_LITERAL: Lazy<Regex> =
122        Lazy::new(|| Regex::new(r#"const\s*\"((?:\\.|[^\"])*)\""#).expect("literal regex"));
123
124    RE_LITERAL
125        .captures_iter(line)
126        .map(|caps| caps[1].to_string())
127        .collect()
128}
129
/// Unescape Rust string literal escape sequences.
///
/// Decodes `\n`, `\r`, `\t`, `\0`, `\\`, `\"`, `\'`, two-digit `\xNN` escapes (values up to
/// `0x7F`) and `\u{...}` escapes with one to six hex digits.
///
/// Input that is not a well-formed escape is handled leniently rather than rejected:
/// * an unrecognised escape such as `\d` yields the character after the backslash (`d`);
/// * a malformed `\x` or `\u` escape yields the marker letter and leaves the rest untouched;
/// * a lone backslash at the very end of the input is kept as a backslash.
pub fn unescape_rust_literal(raw: &str) -> String {
    /// Decodes the digits of a `\xNN` escape at the start of `rest`.
    /// Returns the character and the number of bytes consumed.
    fn decode_hex_escape(rest: &str) -> Option<(char, usize)> {
        let digits = rest.get(..2)?;
        // Checked explicitly because `from_str_radix` would also accept a leading sign.
        if !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
            return None;
        }
        let value = u8::from_str_radix(digits, 16).ok()?;
        if value > 0x7F {
            // String literals only allow 7-bit values in `\x` escapes.
            return None;
        }
        Some((char::from(value), 2))
    }

    /// Decodes the `{...}` part of a `\u{...}` escape at the start of `rest`.
    /// Returns the character and the number of bytes consumed.
    fn decode_unicode_escape(rest: &str) -> Option<(char, usize)> {
        let body = rest.strip_prefix('{')?;
        let close = body.find('}')?;
        let digits = &body[..close];
        let well_formed = !digits.is_empty()
            && digits.len() <= 6
            && digits.bytes().all(|b| b.is_ascii_hexdigit());
        if !well_formed {
            return None;
        }
        let code = u32::from_str_radix(digits, 16).ok()?;
        let decoded = char::from_u32(code)?;
        // Opening brace + digits + closing brace.
        Some((decoded, close + 2))
    }

    let mut result = String::with_capacity(raw.len());
    let mut chars = raw.chars();
    while let Some(ch) = chars.next() {
        if ch != '\\' {
            result.push(ch);
            continue;
        }

        match chars.next() {
            Some('n') => result.push('\n'),
            Some('r') => result.push('\r'),
            Some('t') => result.push('\t'),
            Some('0') => result.push('\0'),
            Some(marker) if marker == 'x' || marker == 'u' => {
                let rest = chars.as_str();
                let decoded = if marker == 'x' {
                    decode_hex_escape(rest)
                } else {
                    decode_unicode_escape(rest)
                };
                match decoded {
                    Some((value, consumed)) => {
                        result.push(value);
                        // Resume scanning right after the digits that were just decoded.
                        chars = rest[consumed..].chars();
                    }
                    None => result.push(marker),
                }
            }
            // Covers `\\`, `\"`, `\'` and any unrecognised escape.
            Some(other) => result.push(other),
            // Trailing lone backslash: keep it instead of dropping input.
            None => result.push('\\'),
        }
    }
    result
}
154
155/// Check if a regex pattern is high-risk for catastrophic backtracking
156pub fn pattern_is_high_risk(pattern: &str) -> bool {
157    static RE_NESTED_QUANTIFIERS: Lazy<Regex> = Lazy::new(|| {
158        Regex::new(r"\((?:[^()]|\\.)*[+*](?:[^()]|\\.)*\)[+*{]").expect("nested quantifier regex")
159    });
160
161    static RE_DOT_STAR_LOOP: Lazy<Regex> =
162        Lazy::new(|| Regex::new(r"\(\?:?\.\*(?:[^()]|\\.)*\)[+*{]").expect("dot-star loop regex"));
163
164    let simplified = pattern.replace(' ', "");
165    RE_NESTED_QUANTIFIERS.is_match(&simplified) || RE_DOT_STAR_LOOP.is_match(&simplified)
166}
167
/// Substrings whose presence in a MIR line marks it as reading untrusted input.
///
/// Matching is by plain substring, so the shorter entries also cover longer paths that
/// contain them (for example `env::var` also matches `std::env::var`).
pub const UNTRUSTED_PATTERNS: &[&str] = &[
    // Environment variables and command-line arguments.
    "env::var",
    "env::var_os",
    "env::args",
    "std::env::var",
    "std::env::args",
    // Standard input and network streams.
    "stdin",
    "TcpStream",
    // Reads from readers and files.
    "read_to_string",
    "read_to_end",
    "fs::read",
    "File::open",
];

/// Returns `true` when the MIR line contains any entry of [`UNTRUSTED_PATTERNS`].
pub fn is_untrusted_source(line: &str) -> bool {
    for pattern in UNTRUSTED_PATTERNS {
        if line.contains(*pattern) {
            return true;
        }
    }
    false
}
189
190/// Detect derive macro generated functions by name pattern
191pub fn is_derive_macro_function(func_name: &str) -> bool {
192    static RE_DERIVE: Lazy<Regex> = Lazy::new(|| {
193        Regex::new(r"<impl at [^>]+:\d+:\d+:\s*\d+:\d+>::").expect("derive macro regex")
194    });
195    RE_DERIVE.is_match(func_name)
196}
197
/// Detects common trait methods that are treated as safe by name alone.
///
/// Returns `true` when the final `::`-separated segment of `func_name` is one of `eq`, `ne`,
/// `partial_cmp`, `cmp`, `hash`, `fmt`, `clone`, `default` or `drop`. A name without any `::`
/// never matches. The signature argument is accepted for interface compatibility but is not
/// inspected.
pub fn is_safe_trait_method(func_name: &str, _func_signature: &str) -> bool {
    match func_name.rsplit_once("::") {
        Some((_, method)) => matches!(
            method,
            "eq" | "ne" | "partial_cmp" | "cmp" | "hash" | "fmt" | "clone" | "default" | "drop"
        ),
        None => false,
    }
}
213
214/// Simple taint tracker for dataflow analysis
215#[derive(Default)]
216pub struct TaintTracker {
217    pub tainted: HashSet<String>,
218    pub taint_roots: std::collections::HashMap<String, String>,
219    pub sanitized_roots: HashSet<String>,
220    pub sources: std::collections::HashMap<String, String>,
221}
222
223impl TaintTracker {
224    pub fn mark_source(&mut self, var: &str, origin: &str) {
225        let var = var.to_string();
226        self.tainted.insert(var.clone());
227        self.taint_roots.insert(var.clone(), var.clone());
228        self.sources
229            .entry(var)
230            .or_insert_with(|| origin.trim().to_string());
231    }
232
233    pub fn mark_alias(&mut self, dest: &str, source: &str) {
234        if !self.tainted.contains(source) {
235            return;
236        }
237
238        if let Some(root) = self.taint_roots.get(source).cloned() {
239            self.tainted.insert(dest.to_string());
240            self.taint_roots.insert(dest.to_string(), root);
241        }
242    }
243
244    pub fn find_tainted_in_line(&self, line: &str) -> Option<String> {
245        self.tainted
246            .iter()
247            .find(|var| contains_var(line, var))
248            .cloned()
249    }
250
251    pub fn is_sanitized(&self, var: &str) -> bool {
252        if let Some(root) = self.taint_roots.get(var) {
253            self.sanitized_roots.contains(root)
254        } else {
255            false
256        }
257    }
258
259    pub fn sanitize_root(&mut self, root: &str) {
260        self.sanitized_roots.insert(root.to_string());
261    }
262}