Skip to main content

zeph_tools/shell/
deobfuscate.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Shell command deobfuscation for pre-blocklist normalization.
5//!
6//! Transforms common obfuscation techniques (hex/octal escapes, subshell expansion,
7//! variable references, quote-based concatenation) into readable equivalents before
8//! the command is evaluated by the blocklist and permission policy.
9//!
10//! # Limitations
11//!
12//! Single-pass: subshell content (`$(...)`, `` `...` ``) is replaced with a
13//! `[subshell: ...]` placeholder but NOT re-scanned. The blocklist independently
14//! rejects `$(` and `` ` `` metacharacters, so nested constructs are caught at
15//! that layer rather than here.
16
17use std::sync::LazyLock;
18
19use regex::Regex;
20use tracing;
21
/// Upper bound, in bytes, on how much of a command the deobfuscator inspects (8 KiB).
///
/// Input beyond this limit is dropped without any error or warning before the
/// normalization steps run.
const MAX_INPUT_BYTES: usize = 8 * 1024;
26
27static RE_HEX: LazyLock<Regex> =
28    LazyLock::new(|| Regex::new(r"\\x([0-9a-fA-F]{2})").expect("RE_HEX"));
29static RE_OCT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\\([0-7]{1,3})").expect("RE_OCT"));
30static RE_UNI: LazyLock<Regex> =
31    LazyLock::new(|| Regex::new(r"\\u([0-9a-fA-F]{4})").expect("RE_UNI"));
32static RE_VAR_BRACE: LazyLock<Regex> =
33    LazyLock::new(|| Regex::new(r"\$\{([A-Za-z_][A-Za-z0-9_]*)\}").expect("RE_VAR_BRACE"));
34static RE_VAR_PLAIN: LazyLock<Regex> =
35    LazyLock::new(|| Regex::new(r"\$([A-Za-z_][A-Za-z0-9_]*)").expect("RE_VAR_PLAIN"));
36static RE_BACKTICK: LazyLock<Regex> =
37    LazyLock::new(|| Regex::new(r"`([^`]*)`").expect("RE_BACKTICK"));
38static RE_SPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+").expect("RE_SPACE"));
39
40/// Normalize an obfuscated shell command string for blocklist and policy evaluation.
41///
42/// Applies transformations in order:
43/// 1. Truncate to 8 KiB.
44/// 2. Decode `\xNN` hex escapes.
45/// 3. Decode `\NNN` octal escapes.
46/// 4. Decode `\uNNNN` Unicode escapes.
47/// 5. Collapse backslash line-continuations (`\↵`).
48/// 6. Expand `${VAR}` / `$VAR` to `[var:VAR]`.
49/// 7. Replace backtick subshells `` `cmd` `` with `[subshell: cmd]`.
50/// 8. Replace `$(cmd)` with `[subshell: cmd]`.
51/// 9. Strip unescaped quotes used for string concatenation.
52/// 10. Normalize runs of whitespace to a single space and trim.
53///
54/// # Examples
55///
56/// ```
57/// use zeph_tools::shell::deobfuscate::deobfuscate;
58///
59/// assert_eq!(deobfuscate(r"\x63url"), "curl");
60/// assert_eq!(deobfuscate(r"\143at"), "cat");
61/// assert_eq!(deobfuscate("$(whoami)"), "[subshell: whoami]");
62/// assert_eq!(deobfuscate("${HOME}/file"), "[var:HOME]/file");
63/// ```
64#[must_use]
65pub fn deobfuscate(command: &str) -> String {
66    let _span = tracing::info_span!("tools.deobfuscate.normalize").entered();
67    // Step 1: truncate at a valid UTF-8 boundary.
68    let input = if command.len() > MAX_INPUT_BYTES {
69        let boundary = command.floor_char_boundary(MAX_INPUT_BYTES);
70        &command[..boundary]
71    } else {
72        command
73    };
74
75    // Step 2: hex escapes \xNN.
76    // After decoding, any newly-introduced backslash (e.g. from \x5c → `\`) is replaced with
77    // a safe placeholder to prevent it from becoming an octet or unicode escape prefix in step 3/4.
78    // This is a single-pass design: multi-stage decoding (encode a backslash as \x5c to enable
79    // a subsequent octal decode) is NOT supported and would be a bypass vector if not handled here.
80    let s = RE_HEX.replace_all(input, |caps: &regex::Captures<'_>| {
81        u8::from_str_radix(&caps[1], 16)
82            .ok()
83            .filter(u8::is_ascii)
84            .map_or_else(
85                || caps[0].to_owned(),
86                |b| {
87                    // Replace decoded backslash with a safe token to prevent cascading decode.
88                    if b == b'\\' {
89                        "[bs]".to_owned()
90                    } else {
91                        (b as char).to_string()
92                    }
93                },
94            )
95    });
96
97    // Step 3: octal escapes \NNN — only decode printable ASCII (0x20–0x7E).
98    // NUL and control characters (0x00–0x1F) are left unchanged to prevent NUL-truncation
99    // attacks where the blocklist sees a different string than the OS executes.
100    let s = RE_OCT.replace_all(&s, |caps: &regex::Captures<'_>| {
101        u8::from_str_radix(&caps[1], 8)
102            .ok()
103            .filter(|&b| (0x20u8..=0x7E).contains(&b))
104            .map_or_else(|| caps[0].to_owned(), |b| (b as char).to_string())
105    });
106
107    // Step 4: unicode escapes \uNNNN.
108    let s = RE_UNI.replace_all(&s, |caps: &regex::Captures<'_>| {
109        u32::from_str_radix(&caps[1], 16)
110            .ok()
111            .and_then(char::from_u32)
112            .map_or_else(|| caps[0].to_owned(), |c| c.to_string())
113    });
114
115    // Step 5: backslash line continuations — join lines without inserting space.
116    let s = s.replace("\\\n", "");
117
118    // Step 6: variable expansion — ${VAR} before $VAR to avoid partial match.
119    let s = RE_VAR_BRACE.replace_all(&s, "[var:$1]");
120    let s = RE_VAR_PLAIN.replace_all(&s, "[var:$1]");
121
122    // Step 7: backtick subshells.
123    let s = RE_BACKTICK.replace_all(&s, "[subshell: $1]");
124
125    // Step 8: $(...) subshells — depth-aware to handle nesting.
126    let s = replace_dollar_subshells(&s);
127
128    // Step 9: strip quotes used for concatenation only.
129    let s = strip_concatenation_quotes(&s);
130
131    // Step 10: normalize whitespace.
132    RE_SPACE.replace_all(&s, " ").trim().to_owned()
133}
134
/// Replace `$(...)` patterns with `[subshell: ...]`.
///
/// Uses a manual depth-tracking scan rather than regex to handle balanced
/// parentheses in nested constructs like `$(echo $(whoami))`. The content of a
/// subshell is trimmed and copied verbatim; it is not re-scanned, so a nested
/// `$(...)` appears unchanged inside the placeholder.
///
/// An unterminated `$(` (no matching `)` before the end of input) is treated as
/// extending to the end of the string: all remaining text becomes the subshell
/// content. This never panics, including for a trailing `$(` or when the
/// unterminated content ends in a multi-byte character.
fn replace_dollar_subshells(s: &str) -> String {
    let bytes = s.as_bytes();
    let mut out = String::with_capacity(s.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i..].starts_with(b"$(") {
            let start = i + 2;

            // Find the byte index of the `)` that balances the opening `$(`.
            // `(` and `)` are ASCII, so they never occur inside a multi-byte UTF-8
            // sequence and every index recorded here is a char boundary.
            let mut depth = 1usize;
            let mut close = None;
            for (offset, &b) in bytes[start..].iter().enumerate() {
                match b {
                    b'(' => depth += 1,
                    b')' => {
                        depth -= 1;
                        if depth == 0 {
                            close = Some(start + offset);
                            break;
                        }
                    }
                    _ => {}
                }
            }

            // Balanced: content stops before `)` and scanning resumes after it.
            // Unterminated: content runs to the end of input and scanning stops.
            let (end, next) = close.map_or((bytes.len(), bytes.len()), |c| (c, c + 1));

            out.push_str("[subshell: ");
            out.push_str(s[start..end].trim());
            out.push(']');
            i = next;
        } else {
            // Advance by one full UTF-8 char to avoid splitting multi-byte sequences.
            let Some(ch) = s[i..].chars().next() else {
                break;
            };
            out.push(ch);
            i += ch.len_utf8();
        }
    }
    out
}
171
/// Drop single- and double-quote delimiters while keeping the text they enclose.
///
/// Example: `'cu'"rl"` → `curl`.
///
/// Behaviour, as a small state machine over the input characters:
/// - Outside quotes, `'` and `"` open a quoted region and are not emitted; every other
///   character (including a backslash) is copied as-is. A backslash outside quotes does
///   not protect a following quote character.
/// - Inside single quotes, everything up to the closing `'` is copied verbatim.
/// - Inside double quotes, a backslash is dropped and the character after it is copied
///   unconditionally; a closing `"` ends the region.
/// - An unterminated quoted region simply runs to the end of the input.
fn strip_concatenation_quotes(s: &str) -> String {
    #[derive(Clone, Copy)]
    enum Mode {
        /// Not inside any quotes.
        Bare,
        /// Inside `'...'`.
        Single,
        /// Inside `"..."`.
        Double,
        /// Inside `"..."`, immediately after a backslash.
        DoubleEscaped,
    }

    let mut result = String::with_capacity(s.len());
    let mut mode = Mode::Bare;
    for c in s.chars() {
        mode = match (mode, c) {
            // Opening delimiters are swallowed.
            (Mode::Bare, '\'') => Mode::Single,
            (Mode::Bare, '"') => Mode::Double,
            // Closing delimiters are swallowed too.
            (Mode::Single, '\'') | (Mode::Double, '"') => Mode::Bare,
            // The escaping backslash itself is not emitted.
            (Mode::Double, '\\') => Mode::DoubleEscaped,
            // Whatever follows the backslash is emitted, even `"` or `\`.
            (Mode::DoubleEscaped, _) => {
                result.push(c);
                Mode::Double
            }
            // Ordinary character: copy it and stay in the current mode.
            (current, _) => {
                result.push(c);
                current
            }
        };
    }
    result
}
208
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn hex_escape_decoded() {
        assert_eq!(deobfuscate(r"\x63url"), "curl");
        assert_eq!(deobfuscate(r"\x41\x42\x43"), "ABC");
    }

    #[test]
    fn hex_decoded_backslash_becomes_placeholder() {
        // \x5c is a backslash; it must not combine with "143" into an octal escape.
        assert_eq!(deobfuscate(r"\x5c143at"), "[bs]143at");
    }

    #[test]
    fn octal_escape_decoded() {
        assert_eq!(deobfuscate(r"\143at"), "cat");
        assert_eq!(deobfuscate(r"\101"), "A");
    }

    #[test]
    fn unicode_escape_decoded() {
        // \u0063 = 'c'. The escape must be spelled out in the input; asserting on a
        // plain "curl" would not exercise the decoder at all.
        assert_eq!(deobfuscate(r"\u0063url"), "curl");
    }

    #[test]
    fn variable_expansion_brace() {
        assert_eq!(deobfuscate("${HOME}/file"), "[var:HOME]/file");
    }

    #[test]
    fn variable_expansion_plain() {
        // After brace expansion, bare $VAR is replaced.
        assert_eq!(deobfuscate("echo $PATH"), "echo [var:PATH]");
    }

    #[test]
    fn backtick_subshell() {
        assert_eq!(deobfuscate("`whoami`"), "[subshell: whoami]");
    }

    #[test]
    fn dollar_subshell_simple() {
        assert_eq!(deobfuscate("$(whoami)"), "[subshell: whoami]");
    }

    #[test]
    fn dollar_subshell_nested() {
        // The outer subshell is replaced; its content is copied without re-scanning.
        assert_eq!(
            deobfuscate("$(echo $(whoami))"),
            "[subshell: echo $(whoami)]"
        );
    }

    #[test]
    fn quote_concatenation_collapse() {
        assert_eq!(deobfuscate("'cu'\"rl\""), "curl");
        assert_eq!(deobfuscate("'ab'\"cd\"'ef'"), "abcdef");
    }

    #[test]
    fn line_continuation() {
        assert_eq!(deobfuscate("cu\\\nrl"), "curl");
    }

    #[test]
    fn whitespace_normalized() {
        assert_eq!(deobfuscate("echo   hello"), "echo hello");
        assert_eq!(deobfuscate("  ls  "), "ls");
    }

    #[test]
    fn long_input_truncated() {
        let long = "a".repeat(MAX_INPUT_BYTES + 100);
        let result = deobfuscate(&long);
        assert!(result.len() <= MAX_INPUT_BYTES);
    }

    #[test]
    fn long_input_truncated_on_char_boundary() {
        // One ASCII byte followed by two-byte chars puts MAX_INPUT_BYTES in the middle
        // of a character, so truncation must back off by one byte instead of panicking.
        let long = format!("a{}", "é".repeat(MAX_INPUT_BYTES));
        let result = deobfuscate(&long);
        assert_eq!(result.len(), MAX_INPUT_BYTES - 1);
        assert!(result.chars().skip(1).all(|c| c == 'é'));
    }
}