use clap::Parser;
use std::path::PathBuf;
// Command-line arguments for `urx`, parsed through clap's derive API.
//
// NOTE: comments in this struct are intentionally plain `//` comments.
// clap's derive macro turns `///` doc comments into about/help text, so
// using them here would change the generated `--help` output.
//
// Field order is significant: it determines the order in which clap
// registers the arguments.
#[derive(Parser, Debug, Clone)]
#[clap(name = "urx", version)]
pub struct Args {
    // Positional values; may be omitted entirely or given several times.
    #[clap(name = "DOMAINS")]
    pub domains: Vec<String>,

    // `-c` / `--config`: optional path.
    #[clap(short, long, value_parser)]
    pub config: Option<PathBuf>,

    // ---------------------------------------------------------------
    // Input Options
    // ---------------------------------------------------------------

    // `--files` takes one or more paths per occurrence and may be repeated;
    // all values are appended into one list.
    #[clap(
        help_heading = "Input Options",
        long,
        action = clap::ArgAction::Append,
        num_args = 1..,
        value_parser
    )]
    pub files: Vec<PathBuf>,

    // ---------------------------------------------------------------
    // Output Options
    // ---------------------------------------------------------------

    // `-o` / `--output`: optional path.
    #[clap(help_heading = "Output Options", short, long, value_parser)]
    pub output: Option<PathBuf>,

    // `-f` / `--format`: free-form string, "plain" when not given.
    #[clap(help_heading = "Output Options", short, long, default_value = "plain")]
    pub format: String,

    #[clap(help_heading = "Output Options", long)]
    pub merge_endpoint: bool,

    #[clap(help_heading = "Output Options", long)]
    pub normalize_url: bool,

    // ---------------------------------------------------------------
    // Provider Options
    // ---------------------------------------------------------------

    // Comma-separated list; defaults to the three entries below.
    #[clap(
        help_heading = "Provider Options",
        long,
        value_delimiter = ',',
        default_value = "wayback,cc,otx"
    )]
    pub providers: Vec<String>,

    #[clap(help_heading = "Provider Options", long)]
    pub subs: bool,

    #[clap(help_heading = "Provider Options", long, default_value = "CC-MAIN-2025-13")]
    pub cc_index: String,

    // Repeatable; every occurrence is appended.
    #[clap(help_heading = "Provider Options", long, action = clap::ArgAction::Append)]
    pub vt_api_key: Vec<String>,

    // Repeatable; every occurrence is appended.
    #[clap(help_heading = "Provider Options", long, action = clap::ArgAction::Append)]
    pub urlscan_api_key: Vec<String>,

    // ---------------------------------------------------------------
    // Discovery Options
    //
    // The `include_*` switches are hidden from help and default to true;
    // the visible `exclude_*` switches are combined with them in
    // `should_use_robots` / `should_use_sitemap`.
    // ---------------------------------------------------------------
    #[clap(long, default_value = "true", hide = true)]
    pub include_robots: bool,

    #[clap(help_heading = "Discovery Options", long)]
    pub exclude_robots: bool,

    #[clap(long, default_value = "true", hide = true)]
    pub include_sitemap: bool,

    #[clap(help_heading = "Discovery Options", long)]
    pub exclude_sitemap: bool,

    // ---------------------------------------------------------------
    // Display Options
    // ---------------------------------------------------------------
    #[clap(help_heading = "Display Options", short, long)]
    pub verbose: bool,

    #[clap(help_heading = "Display Options", long)]
    pub silent: bool,

    #[clap(help_heading = "Display Options", long)]
    pub no_progress: bool,

    // ---------------------------------------------------------------
    // Filter Options
    //
    // The list-valued filters all accept comma-separated values.
    // ---------------------------------------------------------------
    #[clap(help_heading = "Filter Options", short, long, value_delimiter = ',')]
    pub preset: Vec<String>,

    #[clap(help_heading = "Filter Options", short, long, value_delimiter = ',')]
    pub extensions: Vec<String>,

    #[clap(help_heading = "Filter Options", long, value_delimiter = ',')]
    pub exclude_extensions: Vec<String>,

    #[clap(help_heading = "Filter Options", long, value_delimiter = ',')]
    pub patterns: Vec<String>,

    #[clap(help_heading = "Filter Options", long, value_delimiter = ',')]
    pub exclude_patterns: Vec<String>,

    #[clap(help_heading = "Filter Options", long)]
    pub show_only_host: bool,

    #[clap(help_heading = "Filter Options", long)]
    pub show_only_path: bool,

    #[clap(help_heading = "Filter Options", long)]
    pub show_only_param: bool,

    #[clap(help_heading = "Filter Options", long = "min-length")]
    pub min_length: Option<usize>,

    #[clap(help_heading = "Filter Options", long = "max-length")]
    pub max_length: Option<usize>,

    // NOTE(review): defaults to true and is a plain switch, so it looks like
    // the command line cannot turn it off - confirm this is intended.
    #[clap(help_heading = "Filter Options", long, default_value = "true")]
    pub strict: bool,

    // ---------------------------------------------------------------
    // Network Options
    // ---------------------------------------------------------------

    // Checked by `validate_network_scope`; "all" when not given.
    #[clap(
        help_heading = "Network Options",
        long,
        default_value = "all",
        value_parser = validate_network_scope
    )]
    pub network_scope: String,

    #[clap(help_heading = "Network Options", long)]
    pub proxy: Option<String>,

    #[clap(help_heading = "Network Options", long)]
    pub proxy_auth: Option<String>,

    #[clap(help_heading = "Network Options", long)]
    pub insecure: bool,

    #[clap(help_heading = "Network Options", long)]
    pub random_agent: bool,

    // Defaults to 120 (unit not visible here; presumably seconds - confirm).
    #[clap(help_heading = "Network Options", long, default_value = "120")]
    pub timeout: u64,

    #[clap(help_heading = "Network Options", long, default_value = "2")]
    pub retries: u32,

    // Declared optional but carries a default of 5.
    #[clap(help_heading = "Network Options", long, default_value = "5")]
    pub parallel: Option<u32>,

    #[clap(help_heading = "Network Options", long)]
    pub rate_limit: Option<f32>,

    // ---------------------------------------------------------------
    // Testing Options
    // ---------------------------------------------------------------
    #[clap(help_heading = "Testing Options", long, alias = "cs", visible_alias = "--cs")]
    pub check_status: bool,

    #[clap(help_heading = "Testing Options", long, alias = "is", visible_alias = "--is")]
    pub include_status: Vec<String>,

    #[clap(help_heading = "Testing Options", long, alias = "es", visible_alias = "--es")]
    pub exclude_status: Vec<String>,

    #[clap(help_heading = "Testing Options", long)]
    pub extract_links: bool,

    // ---------------------------------------------------------------
    // Cache Options
    // ---------------------------------------------------------------
    #[clap(help_heading = "Cache Options", long)]
    pub incremental: bool,

    #[clap(help_heading = "Cache Options", long, default_value = "sqlite")]
    pub cache_type: String,

    #[clap(help_heading = "Cache Options", long)]
    pub cache_path: Option<PathBuf>,

    #[clap(help_heading = "Cache Options", long)]
    pub redis_url: Option<String>,

    // Defaults to 86400 (unit not visible here; presumably seconds - confirm).
    #[clap(help_heading = "Cache Options", long, default_value = "86400")]
    pub cache_ttl: u64,

    #[clap(help_heading = "Cache Options", long)]
    pub no_cache: bool,
}
/// Collects domains from standard input, one per line.
///
/// Every line is trimmed of surrounding whitespace; lines that are empty
/// after trimming are dropped. Reading continues until stdin is exhausted.
///
/// # Errors
///
/// Returns an error (with the context "Failed to read line from stdin") as
/// soon as a line cannot be read.
pub fn read_domains_from_stdin() -> anyhow::Result<Vec<String>> {
    use anyhow::Context;
    use std::io::{self, BufRead};

    let input = io::stdin();
    let mut collected = Vec::new();

    for entry in input.lock().lines() {
        let raw = entry.context("Failed to read line from stdin")?;
        let name = raw.trim();
        if name.is_empty() {
            // Blank / whitespace-only lines carry no domain.
            continue;
        }
        collected.push(name.to_string());
    }

    Ok(collected)
}
impl Args {
    /// Reports whether robots handling is enabled.
    ///
    /// `exclude_robots` takes precedence: when it is set the result is
    /// `false` regardless of `include_robots`. Otherwise the value of
    /// `include_robots` is returned.
    pub fn should_use_robots(&self) -> bool {
        if self.exclude_robots {
            return false;
        }
        self.include_robots
    }

    /// Reports whether sitemap handling is enabled.
    ///
    /// `exclude_sitemap` takes precedence: when it is set the result is
    /// `false` regardless of `include_sitemap`. Otherwise the value of
    /// `include_sitemap` is returned.
    pub fn should_use_sitemap(&self) -> bool {
        if self.exclude_sitemap {
            return false;
        }
        self.include_sitemap
    }
}
/// Value parser for the `network_scope` argument.
///
/// Accepts exactly one of `all`, `providers`, `testers`,
/// `providers,testers` or `testers,providers` (case-sensitive, no
/// surrounding whitespace) and returns it unchanged as an owned `String`.
///
/// # Errors
///
/// Any other input yields a human-readable message naming the rejected value.
fn validate_network_scope(s: &str) -> Result<String, String> {
    // Complete set of accepted spellings, including both orderings of the pair.
    const ACCEPTED: [&str; 5] = [
        "all",
        "providers",
        "testers",
        "providers,testers",
        "testers,providers",
    ];

    if ACCEPTED.contains(&s) {
        return Ok(s.to_string());
    }

    Err(format!(
        "Invalid network scope: {s}. Allowed values are all, providers, testers, or providers,testers"
    ))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the clap parser over `argv`; `argv[0]` stands in for the binary name.
    fn parse(argv: &[&str]) -> Args {
        Args::parse_from(argv.iter().copied())
    }

    /// With only a domain given, every declared default is in effect.
    #[test]
    fn test_args_default_values() {
        let parsed = parse(&["urx", "example.com"]);

        assert_eq!(parsed.domains, vec!["example.com"]);
        assert_eq!(parsed.format, "plain");
        assert_eq!(parsed.providers, vec!["wayback", "cc", "otx"]);
        assert_eq!(parsed.cc_index, "CC-MAIN-2025-13");
        assert_eq!(parsed.timeout, 120);
        assert_eq!(parsed.retries, 2);

        assert!(parsed.include_robots);
        assert!(parsed.include_sitemap);
        assert!(!parsed.exclude_robots);
        assert!(!parsed.exclude_sitemap);
        assert!(parsed.should_use_robots());
        assert!(parsed.should_use_sitemap());
    }

    /// Several positional values are collected in order.
    #[test]
    fn test_args_multiple_domains() {
        let parsed = parse(&["urx", "example.com", "example.org"]);

        assert_eq!(parsed.domains, vec!["example.com", "example.org"]);
    }

    /// `-o` and `-f` populate the output path and format.
    #[test]
    fn test_args_output_options() {
        let parsed = parse(&["urx", "example.com", "-o", "output.txt", "-f", "json"]);

        assert_eq!(parsed.domains, vec!["example.com"]);
        // Covers both "an output path was set" and "it is the expected one".
        let output_path = parsed.output.as_deref().map(|path| path.to_str().unwrap());
        assert_eq!(output_path, Some("output.txt"));
        assert_eq!(parsed.format, "json");
    }

    /// `--providers` splits its value on commas.
    #[test]
    fn test_args_providers() {
        let parsed = parse(&["urx", "example.com", "--providers", "wayback,vt"]);

        assert_eq!(parsed.providers, vec!["wayback", "vt"]);
    }

    /// `--proxy` and `--timeout` are parsed into their typed fields.
    #[test]
    fn test_network_options() {
        let parsed = parse(&[
            "urx",
            "example.com",
            "--proxy",
            "http://proxy:8080",
            "--timeout",
            "60",
        ]);

        assert_eq!(parsed.proxy.as_deref(), Some("http://proxy:8080"));
        assert_eq!(parsed.timeout, 60);
    }

    /// Extension filters accept comma-separated lists.
    #[test]
    fn test_filter_options() {
        let parsed = parse(&[
            "urx",
            "example.com",
            "-e",
            "js,php",
            "--exclude-extensions",
            "html,css",
        ]);

        assert_eq!(parsed.extensions, vec!["js", "php"]);
        assert_eq!(parsed.exclude_extensions, vec!["html", "css"]);
    }

    /// Robots/sitemap are on by default and switched off by the exclude flags.
    #[test]
    fn test_robots_sitemap_flags() {
        let defaults = parse(&["urx", "example.com"]);
        assert!(defaults.include_robots);
        assert!(defaults.include_sitemap);
        assert!(!defaults.exclude_robots);
        assert!(!defaults.exclude_sitemap);
        assert!(defaults.should_use_robots());
        assert!(defaults.should_use_sitemap());

        let excluded = parse(&[
            "urx",
            "example.com",
            "--exclude-robots",
            "--exclude-sitemap",
        ]);
        assert!(excluded.exclude_robots);
        assert!(excluded.exclude_sitemap);
        assert!(!excluded.should_use_robots());
        assert!(!excluded.should_use_sitemap());
    }

    /// The helper methods combine the include/exclude switches.
    #[test]
    fn test_robots_sitemap_helper_methods() {
        let defaults = parse(&["urx", "example.com"]);
        assert!(defaults.should_use_robots());
        assert!(defaults.should_use_sitemap());

        let robots_off = parse(&["urx", "example.com", "--exclude-robots"]);
        assert!(!robots_off.should_use_robots());
        assert!(robots_off.should_use_sitemap());

        // When both switches are passed, exclusion wins.
        let both = parse(&["urx", "example.com", "--include-robots", "--exclude-robots"]);
        assert!(both.exclude_robots);
        assert!(both.include_robots);
        assert!(!both.should_use_robots());
    }

    /// Known scope strings pass validation.
    #[test]
    fn test_validate_network_scope_valid() {
        for scope in ["all", "providers", "testers", "providers,testers"].iter() {
            assert!(validate_network_scope(scope).is_ok());
        }
    }

    /// An unknown scope string is rejected.
    #[test]
    fn test_validate_network_scope_invalid() {
        assert!(validate_network_scope("invalid").is_err());
    }

    /// One `--files` occurrence may carry several paths, ending at the next flag.
    #[test]
    fn test_files_flag() {
        let parsed = parse(&["urx", "--files", "file1.txt", "file2.warc", "--verbose"]);

        let names: Vec<&str> = parsed
            .files
            .iter()
            .map(|path| path.to_str().unwrap())
            .collect();
        assert_eq!(names, vec!["file1.txt", "file2.warc"]);
        assert!(parsed.verbose);
    }

    /// Repeating `--files` appends to the same list.
    #[test]
    fn test_multiple_files_flags() {
        let parsed = parse(&["urx", "--files", "file1.txt", "--files", "file2.warc"]);

        let names: Vec<&str> = parsed
            .files
            .iter()
            .map(|path| path.to_str().unwrap())
            .collect();
        assert_eq!(names, vec!["file1.txt", "file2.warc"]);
    }

    /// Mirrors the trim-and-skip-blank filtering of `read_domains_from_stdin`
    /// on an in-memory reader; the function itself is not called here.
    #[test]
    fn test_read_domains_from_stdin() {
        use std::io::{BufRead, BufReader, Cursor};

        let reader = BufReader::new(Cursor::new("example.com\nexample.org\n\n"));
        let domains: Vec<String> = reader
            .lines()
            .map(|line| line.unwrap())
            .map(|line| line.trim().to_string())
            .filter(|line| !line.is_empty())
            .collect();

        assert_eq!(domains, vec!["example.com", "example.org"]);
    }
}