use anyhow::Result;
use clap::Parser;
mod cache;
mod cli;
mod config;
mod filters;
mod network;
mod output;
mod progress;
mod providers;
mod readers;
mod runner;
mod tester_manager;
mod testers;
mod url_utils;
mod utils;
use cache::{CacheEntry, CacheFilters, CacheKey, CacheManager};
use cli::{read_domains_from_stdin, Args};
use config::Config;
use filters::{HostValidator, UrlFilter};
use network::NetworkSettings;
use output::create_outputter;
use progress::ProgressManager;
use providers::{
CommonCrawlProvider, OTXProvider, Provider, RobotsProvider, SitemapProvider, UrlscanProvider,
VirusTotalProvider, WaybackMachineProvider,
};
use readers::read_urls_from_file;
use runner::{add_provider, process_domains};
use tester_manager::{apply_network_settings_to_tester, process_urls_with_testers};
use testers::{LinkExtractor, StatusChecker, Tester};
use url_utils::UrlTransformer;
use utils::verbose_print;
/// Merge API keys supplied on the command line with keys from the
/// environment variable `env_var_name` (a comma-separated list).
///
/// CLI keys come first, so they take precedence; env-var entries are
/// trimmed and empty entries are skipped. Duplicates are removed while
/// preserving first-occurrence order.
pub fn parse_api_keys(cli_keys: Vec<String>, env_var_name: &str) -> Vec<String> {
    let mut all_keys = cli_keys;
    if let Ok(env_keys) = std::env::var(env_var_name) {
        all_keys.extend(
            env_keys
                .split(',')
                .map(str::trim)
                .filter(|s| !s.is_empty())
                .map(str::to_string),
        );
    }
    // Order-preserving O(n) dedup: `insert` returns false for repeats.
    let mut seen = std::collections::HashSet::new();
    all_keys.retain(|key| seen.insert(key.clone()));
    all_keys
}
/// Enable `provider_name` in `providers_list` when at least one API key
/// is available and the provider is not already listed.
///
/// Prints a notice when `verbose` is set and `silent` is not.
pub fn auto_enable_provider(
    providers_list: &mut Vec<String>,
    api_keys: &[String],
    provider_name: &str,
    verbose: bool,
    silent: bool,
) {
    // Without credentials there is nothing to enable.
    if api_keys.is_empty() {
        return;
    }
    // Already present — keep the list as-is.
    if providers_list.iter().any(|p| p == provider_name) {
        return;
    }
    providers_list.push(provider_name.to_string());
    if verbose && !silent {
        println!("Auto-enabling {provider_name} provider because API key is provided");
    }
}
/// Build the cache backend selected by `--cache-type`.
///
/// Returns `Ok(None)` when caching is disabled via `--no-cache`.
/// Errors on an unknown cache type, on `redis` without `--redis-url`,
/// or when Redis support was not compiled in (`redis-cache` feature).
async fn create_cache_manager(args: &Args) -> Result<Option<CacheManager>> {
    if args.no_cache {
        return Ok(None);
    }
    match args.cache_type.as_str() {
        "sqlite" => {
            // Default location: $HOME/.urx/cache.db; falls back to the
            // current directory if HOME is unset.
            let cache_path = args.cache_path.clone().unwrap_or_else(|| {
                let home = std::env::var("HOME").unwrap_or_else(|_| ".".to_string());
                std::path::PathBuf::from(home).join(".urx").join("cache.db")
            });
            verbose_print(
                args,
                format!("Using SQLite cache at: {}", cache_path.display()),
            );
            let manager = CacheManager::new_sqlite(cache_path).await?;
            Ok(Some(manager))
        }
        // Redis backend is only available when compiled with the
        // `redis-cache` feature.
        #[cfg(feature = "redis-cache")]
        "redis" => {
            if let Some(redis_url) = &args.redis_url {
                verbose_print(args, format!("Using Redis cache at: {}", redis_url));
                let manager = CacheManager::new_redis(redis_url).await?;
                Ok(Some(manager))
            } else {
                if !args.silent {
                    eprintln!("Error: Redis cache type selected but no --redis-url provided");
                }
                Err(anyhow::anyhow!("Redis URL required for Redis cache type"))
            }
        }
        // Same arm without the feature: report that support is missing.
        #[cfg(not(feature = "redis-cache"))]
        "redis" => {
            if !args.silent {
                eprintln!("Error: Redis cache support not compiled in. Use 'sqlite' or compile with --features redis-cache");
            }
            Err(anyhow::anyhow!("Redis cache not supported"))
        }
        _ => {
            if !args.silent {
                eprintln!(
                    "Error: Unknown cache type '{}'. Use 'sqlite' or 'redis'",
                    args.cache_type
                );
            }
            Err(anyhow::anyhow!("Invalid cache type"))
        }
    }
}
/// Build the cache key for `domain` from the providers and every
/// filter/transform option that affects the result set, so that a change
/// in any of them invalidates the cached entry.
fn create_cache_key(domain: &str, args: &Args) -> CacheKey {
    let filters = CacheFilters {
        subs: args.subs,
        extensions: args.extensions.clone(),
        exclude_extensions: args.exclude_extensions.clone(),
        patterns: args.patterns.clone(),
        exclude_patterns: args.exclude_patterns.clone(),
        presets: args.preset.clone(),
        min_length: args.min_length,
        max_length: args.max_length,
        strict: args.strict,
        normalize_url: args.normalize_url,
        merge_endpoint: args.merge_endpoint,
    };
    CacheKey::new(domain, &args.providers, &filters)
}
/// Resolve URLs for `domains`, consulting the cache first when available.
///
/// Domains with a valid cache entry are served from the cache; in
/// `--incremental` mode they are re-fetched so that only URLs absent from
/// the cached set are reported. Remaining domains are fetched from the
/// providers and written back to the cache. Cache read/write failures are
/// non-fatal and degrade to fresh fetches.
async fn process_domains_with_cache(
    domains: Vec<String>,
    args: &Args,
    progress_manager: &ProgressManager,
    providers: &[Box<dyn Provider>],
    provider_names: &[String],
    cache_manager: Option<&CacheManager>,
) -> std::collections::HashSet<String> {
    use std::collections::HashSet;
    let mut final_urls = HashSet::new();
    // No cache configured: plain provider fetch.
    let Some(cache) = cache_manager else {
        return process_domains(domains, args, progress_manager, providers, provider_names).await;
    };
    let mut domains_to_process = Vec::new();
    let mut cached_urls = HashSet::new();
    for domain in &domains {
        let cache_key = create_cache_key(domain, args);
        if cache
            .is_valid(&cache_key, args.cache_ttl)
            .await
            .unwrap_or(false)
        {
            if let Ok(Some(cached_entry)) = cache.get_cached_urls(&cache_key).await {
                verbose_print(args, format!("Using cached results for domain: {}", domain));
                if args.incremental {
                    // Incremental mode re-fetches so new URLs can be
                    // diffed against the cached set below.
                    domains_to_process.push(domain.clone());
                } else {
                    cached_urls.extend(cached_entry.urls);
                }
                // Bug fix: the incremental branch used to fall through to
                // the push below as well, queueing the domain twice and
                // fetching it twice.
                continue;
            }
        }
        // Cache miss, expired entry, or cache read failure: fetch fresh.
        domains_to_process.push(domain.clone());
    }
    final_urls.extend(cached_urls);
    if !domains_to_process.is_empty() {
        verbose_print(
            args,
            format!(
                "Processing {} domains (cache miss/expired)",
                domains_to_process.len()
            ),
        );
        let fresh_urls = process_domains(
            domains_to_process.clone(),
            args,
            progress_manager,
            providers,
            provider_names,
        )
        .await;
        if args.incremental {
            for domain in &domains_to_process {
                let cache_key = create_cache_key(domain, args);
                // Attribute fetched URLs to this domain by substring match.
                let domain_fresh_urls: HashSet<String> = fresh_urls
                    .iter()
                    .filter(|url| url.contains(domain))
                    .cloned()
                    .collect();
                // On cache error treat everything as new; the fallback
                // clone is now lazy (only on the error path).
                let new_urls = cache
                    .get_new_urls(&cache_key, &domain_fresh_urls)
                    .await
                    .unwrap_or_else(|_| domain_fresh_urls.clone());
                if !new_urls.is_empty() {
                    verbose_print(
                        args,
                        format!("Found {} new URLs for domain: {}", new_urls.len(), domain),
                    );
                    final_urls.extend(new_urls);
                }
                // Store the full fresh set so the next run diffs against it.
                let entry = CacheEntry::new(domain_fresh_urls.into_iter().collect());
                let _ = cache.store_urls(&cache_key, &entry).await;
            }
        } else {
            final_urls.extend(fresh_urls.clone());
            for domain in &domains_to_process {
                let cache_key = create_cache_key(domain, args);
                let domain_urls: Vec<String> = fresh_urls
                    .iter()
                    .filter(|url| url.contains(domain))
                    .cloned()
                    .collect();
                if !domain_urls.is_empty() {
                    let entry = CacheEntry::new(domain_urls);
                    let _ = cache.store_urls(&cache_key, &entry).await;
                }
            }
        }
    }
    // Best-effort cleanup of entries older than twice the TTL.
    let _ = cache.cleanup_expired(args.cache_ttl * 2).await;
    final_urls
}
/// Entry point: collect URLs (from files or providers, with optional
/// caching), then filter, validate, transform, optionally probe them over
/// HTTP, and emit the results in the requested output format.
#[tokio::main]
async fn main() -> Result<()> {
    // Parse CLI args, then overlay values from the optional config file.
    let mut args = Args::parse();
    let config = Config::load(&args);
    config.apply_to_args(&mut args);
    // Phase 1a: when input files are given, URLs come from them instead
    // of the online providers.
    let urls_from_file = if !args.files.is_empty() {
        let mut all_file_urls = Vec::new();
        for file_path in &args.files {
            match read_urls_from_file(file_path) {
                Ok(urls) => {
                    if args.verbose && !args.silent {
                        println!(
                            "Read {} URLs from file: {}",
                            urls.len(),
                            file_path.display()
                        );
                    }
                    all_file_urls.extend(urls);
                }
                Err(e) => {
                    // A single unreadable file aborts the whole run.
                    if !args.silent {
                        eprintln!("Error reading file {}: {}", file_path.display(), e);
                    }
                    return Err(e);
                }
            }
        }
        Some(all_file_urls)
    } else {
        None
    };
    let all_urls = if let Some(urls) = urls_from_file {
        if args.verbose && !args.silent {
            println!(
                "Read {} URLs total from {} file(s)",
                urls.len(),
                args.files.len()
            );
        }
        urls.into_iter().collect()
    } else {
        // Phase 1b: no files — gather domains from CLI args or stdin.
        let domains = if args.domains.is_empty() {
            read_domains_from_stdin()?
        } else {
            args.domains.clone()
        };
        if domains.is_empty() {
            if !args.silent {
                eprintln!(
                    "No domains provided. Please specify domains or pipe them through stdin."
                );
            }
            return Ok(());
        }
        let network_settings = NetworkSettings::from_args(&args);
        let mut providers: Vec<Box<dyn Provider>> = Vec::new();
        let mut provider_names: Vec<String> = Vec::new();
        // API keys: CLI flags first, then env-var entries appended.
        let vt_api_keys = parse_api_keys(args.vt_api_key.clone(), "URX_VT_API_KEY");
        let urlscan_api_keys = parse_api_keys(args.urlscan_api_key.clone(), "URX_URLSCAN_API_KEY");
        let mut providers_list = args.providers.clone();
        // Key-bearing providers are enabled even if not explicitly requested.
        auto_enable_provider(
            &mut providers_list,
            &vt_api_keys,
            "vt",
            args.verbose,
            args.silent,
        );
        auto_enable_provider(
            &mut providers_list,
            &urlscan_api_keys,
            "urlscan",
            args.verbose,
            args.silent,
        );
        // Instantiate each requested provider.
        if providers_list.iter().any(|p| p == "wayback") {
            add_provider(
                &args,
                &network_settings,
                &mut providers,
                &mut provider_names,
                "Wayback Machine".to_string(),
                WaybackMachineProvider::new,
            );
        }
        if providers_list.iter().any(|p| p == "cc") {
            add_provider(
                &args,
                &network_settings,
                &mut providers,
                &mut provider_names,
                args.cc_index.to_string(),
                || CommonCrawlProvider::with_index(args.cc_index.clone()),
            );
        }
        if args.should_use_robots() {
            add_provider(
                &args,
                &network_settings,
                &mut providers,
                &mut provider_names,
                "Robots.txt".to_string(),
                RobotsProvider::new,
            );
        }
        if args.should_use_sitemap() {
            add_provider(
                &args,
                &network_settings,
                &mut providers,
                &mut provider_names,
                "Sitemap".to_string(),
                SitemapProvider::new,
            );
        }
        if providers_list.iter().any(|p| p == "otx") {
            add_provider(
                &args,
                &network_settings,
                &mut providers,
                &mut provider_names,
                "OTX".to_string(),
                OTXProvider::new,
            );
        }
        // VirusTotal and Urlscan require an API key; warn instead of adding
        // them when none is available.
        if providers_list.iter().any(|p| p == "vt") {
            if !vt_api_keys.is_empty() {
                add_provider(
                    &args,
                    &network_settings,
                    &mut providers,
                    &mut provider_names,
                    "VirusTotal".to_string(),
                    || VirusTotalProvider::new_with_keys(vt_api_keys.clone()),
                );
            } else if !args.silent {
                eprintln!("Error: The VirusTotal provider (vt) requires an API key. Please use --vt-api-key or set the URX_VT_API_KEY environment variable.");
            }
        }
        if providers_list.iter().any(|p| p == "urlscan") {
            if !urlscan_api_keys.is_empty() {
                add_provider(
                    &args,
                    &network_settings,
                    &mut providers,
                    &mut provider_names,
                    "Urlscan".to_string(),
                    || UrlscanProvider::new_with_keys(urlscan_api_keys.clone()),
                );
            } else if !args.silent {
                eprintln!("Error: The Urlscan provider (urlscan) requires an API key. Please use --urlscan-api-key or set the URX_URLSCAN_API_KEY environment variable.");
            }
        }
        if providers.is_empty() {
            if !args.silent {
                eprintln!("Error: No valid providers specified. Please use --providers with valid provider names (wayback, cc, otx, vt, urlscan)");
            }
            return Ok(());
        }
        let progress_check = args.no_progress || args.silent;
        let progress_manager = ProgressManager::new(progress_check);
        // Cache setup errors are swallowed (ok().flatten()) so a broken
        // cache degrades to uncached operation instead of aborting.
        let cache_manager = create_cache_manager(&args).await.ok().flatten();
        process_domains_with_cache(
            domains.clone(),
            &args,
            &progress_manager,
            &providers,
            &provider_names,
            cache_manager.as_ref(),
        )
        .await
    };
    // Phase 2: filter, validate and transform the collected URLs.
    let network_settings = NetworkSettings::from_args(&args);
    let progress_check = args.no_progress || args.silent;
    let progress_manager = ProgressManager::new(progress_check);
    // Progress bar only when at least one filter option is active.
    let filter_bar = if !args.extensions.is_empty()
        || !args.patterns.is_empty()
        || !args.exclude_extensions.is_empty()
        || !args.exclude_patterns.is_empty()
        || args.min_length.is_some()
        || args.max_length.is_some()
    {
        let bar = progress_manager.create_filter_bar();
        bar.set_message("Applying filters to URLs...");
        Some(bar)
    } else {
        None
    };
    let mut url_filter = UrlFilter::new();
    if !args.preset.is_empty() {
        url_filter.apply_presets(&args.preset);
    }
    url_filter
        .with_extensions(args.extensions.clone())
        .with_exclude_extensions(args.exclude_extensions.clone())
        .with_patterns(args.patterns.clone())
        .with_exclude_patterns(args.exclude_patterns.clone())
        .with_min_length(args.min_length)
        .with_max_length(args.max_length);
    let mut sorted_urls = url_filter.apply_filters(&all_urls);
    // Strict host validation only applies to provider-sourced URLs;
    // file input skips it.
    if args.strict && args.files.is_empty() {
        if args.verbose && !args.silent {
            println!("Enforcing strict host validation...");
        }
        // NOTE(review): when domains came from stdin this re-reads stdin,
        // which is already exhausted by now — confirm intended.
        let domains = if args.domains.is_empty() {
            read_domains_from_stdin().unwrap_or_default()
        } else {
            args.domains.clone()
        };
        if !domains.is_empty() {
            let host_validator = HostValidator::new(&domains, args.subs);
            sorted_urls.retain(|url| host_validator.is_valid_host(url));
            if args.verbose && !args.silent {
                println!(
                    "Number of valid URLs after host validation: {}",
                    sorted_urls.len()
                );
            }
        }
    }
    if let Some(bar) = filter_bar {
        bar.finish_with_message(format!("Filtered to {} URLs", sorted_urls.len()));
    }
    if args.verbose && !args.silent {
        println!("Total unique URLs after filtering: {}", sorted_urls.len());
    }
    let transform_bar = if args.merge_endpoint
        || args.show_only_host
        || args.show_only_path
        || args.show_only_param
    {
        let bar = progress_manager.create_transform_bar();
        bar.set_message("Applying URL transformations...");
        Some(bar)
    } else {
        None
    };
    let mut url_transformer = UrlTransformer::new();
    url_transformer
        .with_normalize_url(args.normalize_url)
        .with_merge_endpoint(args.merge_endpoint)
        .with_show_only_host(args.show_only_host)
        .with_show_only_path(args.show_only_path)
        .with_show_only_param(args.show_only_param);
    let transformed_urls = url_transformer.transform(sorted_urls);
    if let Some(bar) = transform_bar {
        bar.finish_with_message(format!("Transformed to {} URLs", transformed_urls.len()));
    }
    // Phase 3: optional HTTP probing (status checks / link extraction).
    let outputter = create_outputter(&args.format);
    // Status checking is implied by any include/exclude status filter.
    let should_check_status =
        args.check_status || !args.include_status.is_empty() || !args.exclude_status.is_empty();
    let final_urls = if should_check_status || args.extract_links {
        let mut testers: Vec<Box<dyn Tester>> = Vec::new();
        if should_check_status {
            verbose_print(&args, "Checking HTTP status codes for URLs");
            let mut status_checker = StatusChecker::new();
            apply_network_settings_to_tester(&mut status_checker, &network_settings);
            if !args.include_status.is_empty() {
                status_checker.with_include_status(Some(args.include_status.clone()));
                verbose_print(
                    &args,
                    format!(
                        "Including only status codes that match: {}",
                        args.include_status.join(", ")
                    ),
                );
            }
            if !args.exclude_status.is_empty() {
                status_checker.with_exclude_status(Some(args.exclude_status.clone()));
                verbose_print(
                    &args,
                    format!(
                        "Excluding status codes that match: {}",
                        args.exclude_status.join(", ")
                    ),
                );
            }
            testers.push(Box::new(status_checker));
        }
        if args.extract_links {
            if args.verbose && !args.silent {
                println!("Extracting links from HTML content");
            }
            let mut link_extractor = LinkExtractor::new();
            apply_network_settings_to_tester(&mut link_extractor, &network_settings);
            testers.push(Box::new(link_extractor));
        }
        process_urls_with_testers(
            transformed_urls,
            &args,
            &network_settings,
            &progress_manager,
            testers,
            should_check_status,
        )
        .await
    } else {
        // No probing: wrap the plain URLs in output records.
        transformed_urls
            .iter()
            .map(|url| output::UrlData::new(url.clone()))
            .collect()
    };
    // Phase 4: emit results; output errors are reported but non-fatal.
    match outputter.output(&final_urls, args.output.clone(), args.silent) {
        Ok(_) => {
            if args.verbose && !args.silent {
                if let Some(path) = &args.output {
                    println!("Results written to: {}", path.display());
                }
            }
        }
        Err(e) => {
            if !args.silent {
                eprintln!("Error writing output: {e}");
            }
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use anyhow::Result;
use std::collections::HashSet;
use std::env;
use std::future::Future;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
/// Process-wide mutex used to serialize tests that read or mutate
/// environment variables, so they cannot race each other.
fn env_mutex() -> &'static std::sync::Mutex<()> {
    static LOCK: std::sync::OnceLock<std::sync::Mutex<()>> = std::sync::OnceLock::new();
    LOCK.get_or_init(std::sync::Mutex::default)
}
/// `auto_enable_provider` adds the provider once when keys exist, is
/// idempotent on repeat calls, and does nothing without keys.
#[test]
fn test_auto_enable_provider() {
    let mut providers_list = vec!["wayback".to_string(), "cc".to_string()];
    let api_keys = vec!["test_api_key".to_string()];
    // A key is present: "vt" gets appended.
    auto_enable_provider(&mut providers_list, &api_keys, "vt", false, false);
    assert!(providers_list.contains(&"vt".to_string()));
    assert_eq!(providers_list.len(), 3);
    // Calling again must not duplicate the entry.
    auto_enable_provider(&mut providers_list, &api_keys, "vt", false, false);
    assert_eq!(providers_list.len(), 3);
    // No keys: nothing is enabled.
    let no_keys: Vec<String> = Vec::new();
    auto_enable_provider(&mut providers_list, &no_keys, "urlscan", false, false);
    assert!(!providers_list.contains(&"urlscan".to_string()));
    assert_eq!(providers_list.len(), 3);
}
/// Providers are auto-enabled when API keys come from environment
/// variables only (no CLI flags).
#[test]
fn test_auto_enable_providers_with_env_vars() {
    // Serialize with other tests that touch the process environment.
    let _env_lock = env_mutex().lock().unwrap();
    // Save current values so the environment can be restored afterwards.
    let old_vt_key = env::var("URX_VT_API_KEY").ok();
    let old_urlscan_key = env::var("URX_URLSCAN_API_KEY").ok();
    env::set_var("URX_VT_API_KEY", "test_vt_key");
    env::set_var("URX_URLSCAN_API_KEY", "test_urlscan_key");
    // No --vt-api-key/--urlscan-api-key flags: keys come from env only.
    let args = Args::parse_from(["urx", "example.com"]);
    let mut providers_list = Vec::new();
    let vt_api_keys = parse_api_keys(args.vt_api_key.clone(), "URX_VT_API_KEY");
    let urlscan_api_keys = parse_api_keys(args.urlscan_api_key.clone(), "URX_URLSCAN_API_KEY");
    auto_enable_provider(&mut providers_list, &vt_api_keys, "vt", false, false);
    auto_enable_provider(
        &mut providers_list,
        &urlscan_api_keys,
        "urlscan",
        false,
        false,
    );
    assert!(providers_list.contains(&"vt".to_string()));
    assert!(providers_list.contains(&"urlscan".to_string()));
    assert_eq!(providers_list.len(), 2);
    // Restore the original environment.
    match old_vt_key {
        Some(val) => env::set_var("URX_VT_API_KEY", val),
        None => env::remove_var("URX_VT_API_KEY"),
    }
    match old_urlscan_key {
        Some(val) => env::set_var("URX_URLSCAN_API_KEY", val),
        None => env::remove_var("URX_URLSCAN_API_KEY"),
    }
}
/// Covers CLI-only keys, env-only keys (with trimming), CLI+env merging
/// order, de-duplication, and skipping of empty entries.
#[test]
fn test_parse_api_keys() {
    // CLI keys only; the env var does not exist.
    let cli_keys = vec!["key1".to_string(), "key2".to_string()];
    let result = parse_api_keys(cli_keys, "NONEXISTENT_ENV_VAR");
    assert_eq!(result, vec!["key1", "key2"]);
    // Serialize env mutation with other tests from here on.
    let _env_lock = env_mutex().lock().unwrap();
    // Env keys only; whitespace around entries is trimmed.
    env::set_var("TEST_API_KEYS", "env_key1,env_key2, env_key3 ");
    let result = parse_api_keys(vec![], "TEST_API_KEYS");
    assert_eq!(result, vec!["env_key1", "env_key2", "env_key3"]);
    env::remove_var("TEST_API_KEYS");
    // CLI keys come first, env keys are appended after them.
    env::set_var("TEST_API_KEYS", "env_key1,env_key2");
    let cli_keys = vec!["cli_key1".to_string()];
    let result = parse_api_keys(cli_keys, "TEST_API_KEYS");
    assert_eq!(result, vec!["cli_key1", "env_key1", "env_key2"]);
    env::remove_var("TEST_API_KEYS");
    // Duplicates across CLI and env are removed, first occurrence wins.
    env::set_var("TEST_API_KEYS", "key1,key2");
    let cli_keys = vec!["key1".to_string(), "key3".to_string()];
    let result = parse_api_keys(cli_keys, "TEST_API_KEYS");
    assert_eq!(result, vec!["key1", "key3", "key2"]);
    env::remove_var("TEST_API_KEYS");
    // Empty and whitespace-only entries are skipped.
    env::set_var("TEST_API_KEYS", "key1,,key2, ,key3");
    let result = parse_api_keys(vec![], "TEST_API_KEYS");
    assert_eq!(result, vec!["key1", "key2", "key3"]);
    env::remove_var("TEST_API_KEYS");
}
/// Repeated --vt-api-key/--urlscan-api-key flags accumulate, and
/// `parse_api_keys` passes them through unchanged when no env keys exist.
#[test]
fn test_multiple_api_keys_integration() {
    // Serialize with other tests that touch the process environment.
    let _env_lock = env_mutex().lock().unwrap();
    // Save and clear env keys so only CLI flags are in play.
    let old_vt_key = env::var("URX_VT_API_KEY").ok();
    let old_urlscan_key = env::var("URX_URLSCAN_API_KEY").ok();
    env::remove_var("URX_VT_API_KEY");
    env::remove_var("URX_URLSCAN_API_KEY");
    let args = Args::parse_from([
        "urx",
        "example.com",
        "--vt-api-key",
        "vt_key1",
        "--vt-api-key",
        "vt_key2",
        "--urlscan-api-key",
        "url_key1",
    ]);
    assert_eq!(args.vt_api_key, vec!["vt_key1", "vt_key2"]);
    assert_eq!(args.urlscan_api_key, vec!["url_key1"]);
    let vt_keys = parse_api_keys(args.vt_api_key, "URX_VT_API_KEY");
    let url_keys = parse_api_keys(args.urlscan_api_key, "URX_URLSCAN_API_KEY");
    assert_eq!(vt_keys, vec!["vt_key1", "vt_key2"]);
    assert_eq!(url_keys, vec!["url_key1"]);
    // Restore the original environment.
    match old_vt_key {
        Some(val) => env::set_var("URX_VT_API_KEY", val),
        None => env::remove_var("URX_VT_API_KEY"),
    }
    match old_urlscan_key {
        Some(val) => env::set_var("URX_URLSCAN_API_KEY", val),
        None => env::remove_var("URX_URLSCAN_API_KEY"),
    }
}
/// CLI-provided keys are ordered before env-provided keys; env keys are
/// still used when no CLI flag is given.
#[test]
fn test_api_key_precedence() {
    // Serialize with other tests that touch the process environment.
    let _env_lock = env_mutex().lock().unwrap();
    let old_vt_key = env::var("URX_VT_API_KEY").ok();
    env::set_var("URX_VT_API_KEY", "env_vt_key");
    // Both CLI flag and env var present: CLI key comes first.
    let args = Args::parse_from(["urx", "example.com", "--vt-api-key", "arg_vt_key"]);
    let vt_api_keys = parse_api_keys(args.vt_api_key.clone(), "URX_VT_API_KEY");
    assert_eq!(vt_api_keys, vec!["arg_vt_key", "env_vt_key"]);
    assert_eq!(vt_api_keys[0], "arg_vt_key");
    // Env var only: it is the sole key.
    let args = Args::parse_from(["urx", "example.com"]);
    let vt_api_keys = parse_api_keys(args.vt_api_key.clone(), "URX_VT_API_KEY");
    assert_eq!(vt_api_keys, vec!["env_vt_key"]);
    // Restore the original environment.
    match old_vt_key {
        Some(val) => env::set_var("URX_VT_API_KEY", val),
        None => env::remove_var("URX_VT_API_KEY"),
    }
}
/// Test double for a URL provider: returns a fixed URL list (or a fixed
/// failure) and records every domain it was queried with.
#[derive(Clone)]
struct MockProvider {
    urls: Vec<String>,
    should_fail: bool,
    calls: Arc<Mutex<Vec<String>>>,
}
impl MockProvider {
    /// Create a mock that yields `urls`, or errors when `should_fail`.
    fn new(urls: Vec<String>, should_fail: bool) -> Self {
        Self {
            urls,
            should_fail,
            calls: Arc::new(Mutex::new(Vec::new())),
        }
    }
}
impl Provider for MockProvider {
    fn clone_box(&self) -> Box<dyn Provider> {
        Box::new(self.clone())
    }
    /// Records the queried domain, then resolves with the canned URL
    /// list or a canned error depending on `should_fail`.
    fn fetch_urls<'a>(
        &'a self,
        domain: &'a str,
    ) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
        // Clone state up-front so the async block owns everything it uses.
        let urls = self.urls.clone();
        let should_fail = self.should_fail;
        let calls = self.calls.clone();
        Box::pin(async move {
            calls.lock().unwrap().push(domain.to_string());
            if should_fail {
                Err(anyhow::anyhow!("Mock provider failure"))
            } else {
                Ok(urls)
            }
        })
    }
    // Configuration hooks are irrelevant to the mock: all no-ops.
    fn with_subdomains(&mut self, _include: bool) {}
    fn with_proxy(&mut self, _proxy: Option<String>) {}
    fn with_proxy_auth(&mut self, _auth: Option<String>) {}
    fn with_timeout(&mut self, _seconds: u64) {}
    fn with_retries(&mut self, _count: u32) {}
    fn with_random_agent(&mut self, _enabled: bool) {}
    fn with_insecure(&mut self, _enabled: bool) {}
    fn with_parallel(&mut self, _parallel: u32) {}
    fn with_rate_limit(&mut self, _rate_limit: Option<f32>) {}
}
/// Test double for a URL tester: always resolves with a fixed result set.
#[derive(Clone)]
struct MockStatusChecker {
    results: Vec<String>,
}
impl MockStatusChecker {
    /// Create a mock that yields `results` for every tested URL.
    fn new(results: Vec<String>) -> Self {
        Self { results }
    }
}
impl Tester for MockStatusChecker {
    fn clone_box(&self) -> Box<dyn Tester> {
        Box::new(self.clone())
    }
    /// Ignores the URL and resolves with the canned result list.
    fn test_url<'a>(
        &'a self,
        _url: &'a str,
    ) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'a>> {
        let results = self.results.clone();
        Box::pin(async move { Ok(results) })
    }
    // Network-configuration hooks are no-ops for the mock.
    fn with_timeout(&mut self, _seconds: u64) {}
    fn with_retries(&mut self, _count: u32) {}
    fn with_random_agent(&mut self, _enabled: bool) {}
    fn with_insecure(&mut self, _enabled: bool) {}
    fn with_proxy(&mut self, _proxy: Option<String>) {}
    fn with_proxy_auth(&mut self, _auth: Option<String>) {}
}
#[tokio::test]
async fn test_process_domains() {
let mock_urls = vec![
"https://example.com/page1".to_string(),
"https://example.com/page2".to_string(),
];
let provider = MockProvider::new(mock_urls.clone(), false);
let calls = provider.calls.clone();
let providers: Vec<Box<dyn Provider>> = vec![Box::new(provider)];
let provider_names = vec!["MockProvider".to_string()];
let args = Args {
domains: vec!["example.com".to_string()],
config: None,
files: vec![],
output: None,
format: "plain".to_string(),
merge_endpoint: false,
normalize_url: false,
providers: vec!["mock".to_string()],
subs: false,
cc_index: "CC-MAIN-2025-13".to_string(),
vt_api_key: vec![],
urlscan_api_key: vec![],
verbose: false,
silent: true, no_progress: true, preset: vec![],
extensions: vec![],
exclude_extensions: vec![],
patterns: vec![],
exclude_patterns: vec![],
show_only_host: false,
show_only_path: false,
show_only_param: false,
min_length: None,
max_length: None,
strict: true, network_scope: "all".to_string(),
proxy: None,
proxy_auth: None,
insecure: false,
random_agent: false,
timeout: 30,
retries: 3,
parallel: Some(5),
rate_limit: None,
check_status: false,
include_status: vec![],
exclude_status: vec![],
extract_links: false,
include_robots: true,
include_sitemap: true,
exclude_robots: false,
exclude_sitemap: false,
incremental: false,
cache_type: "sqlite".to_string(),
cache_path: None,
redis_url: None,
cache_ttl: 86400,
no_cache: false,
};
let progress_manager = ProgressManager::new(true);
let urls = process_domains(
vec!["example.com".to_string()],
&args,
&progress_manager,
&providers,
&provider_names,
)
.await;
let calls = calls.lock().unwrap();
assert_eq!(calls.len(), 1);
assert_eq!(calls[0], "example.com");
assert_eq!(urls.len(), 2);
assert!(urls.contains("https://example.com/page1"));
assert!(urls.contains("https://example.com/page2"));
}
/// `process_urls_with_testers` passes every input URL through the tester
/// pipeline and yields one `UrlData` per input URL.
#[tokio::test]
async fn test_process_urls_with_testers() {
    let mock_results = vec![
        "https://example.com/result1".to_string(),
        "https://example.com/result2".to_string(),
    ];
    let mock_tester = MockStatusChecker::new(mock_results.clone());
    let testers: Vec<Box<dyn Tester>> = vec![Box::new(mock_tester)];
    let input_urls = vec![
        "https://example.com/page1".to_string(),
        "https://example.com/page2".to_string(),
    ];
    // Fully specified Args: silent + no_progress keep the test quiet.
    let args = Args {
        domains: vec![],
        config: None,
        files: vec![],
        output: None,
        format: "plain".to_string(),
        merge_endpoint: false,
        normalize_url: false,
        providers: vec![],
        subs: false,
        cc_index: "CC-MAIN-2025-13".to_string(),
        vt_api_key: vec![],
        urlscan_api_key: vec![],
        verbose: false,
        silent: true,
        no_progress: true,
        preset: vec![],
        extensions: vec![],
        exclude_extensions: vec![],
        patterns: vec![],
        exclude_patterns: vec![],
        show_only_host: false,
        show_only_path: false,
        show_only_param: false,
        min_length: None,
        max_length: None,
        strict: true,
        network_scope: "all".to_string(),
        proxy: None,
        proxy_auth: None,
        insecure: false,
        random_agent: false,
        timeout: 30,
        retries: 3,
        parallel: Some(5),
        rate_limit: None,
        check_status: false,
        include_status: vec![],
        exclude_status: vec![],
        extract_links: false,
        include_robots: true,
        include_sitemap: true,
        exclude_robots: false,
        exclude_sitemap: false,
        incremental: false,
        cache_type: "sqlite".to_string(),
        cache_path: None,
        redis_url: None,
        cache_ttl: 86400,
        no_cache: false,
    };
    let network_settings = NetworkSettings::new();
    let progress_manager = ProgressManager::new(true);
    // check_status = false: URLs pass through without status filtering.
    let result_data = process_urls_with_testers(
        input_urls,
        &args,
        &network_settings,
        &progress_manager,
        testers,
        false,
    )
    .await;
    // The original input URLs are preserved in the output records.
    let result_urls: Vec<String> = result_data.iter().map(|data| data.url.clone()).collect();
    assert_eq!(result_urls.len(), 2);
    assert!(result_urls.contains(&"https://example.com/page1".to_string()));
    assert!(result_urls.contains(&"https://example.com/page2".to_string()));
}
/// Extension whitelist filtering keeps only URLs whose extension is in
/// the allowed list.
#[test]
fn test_url_filtering() {
    let urls = HashSet::from([
        "https://example.com/page1.html".to_string(),
        "https://example.com/image.jpg".to_string(),
        "https://example.com/script.js".to_string(),
        "https://example.com/styles.css".to_string(),
    ]);
    let mut filter = UrlFilter::new();
    filter.with_extensions(vec!["html".to_string(), "js".to_string()]);
    let filtered = filter.apply_filters(&urls);
    // Only .html and .js survive.
    assert_eq!(filtered.len(), 2);
    assert!(filtered.contains(&"https://example.com/page1.html".to_string()));
    assert!(filtered.contains(&"https://example.com/script.js".to_string()));
    assert!(!filtered.contains(&"https://example.com/image.jpg".to_string()));
    assert!(!filtered.contains(&"https://example.com/styles.css".to_string()));
}
/// Verifies host-only, path-only and param-only URL transformations.
///
/// Fix: the query strings previously contained `¶m1`/`¶m2` — mojibake of
/// the HTML entity `&para;` — where the literal `&param1`/`&param2`
/// separators were intended.
#[test]
fn test_url_transformation() {
    let urls = vec![
        "https://example.com/path/to/page?param1=value1&param2=value2".to_string(),
        "https://subdomain.example.com/another/path?id=123".to_string(),
    ];
    // Host-only extraction.
    let mut transformer = UrlTransformer::new();
    transformer.with_show_only_host(true);
    let host_only = transformer.transform(urls.clone());
    assert_eq!(host_only.len(), 2);
    assert!(host_only.contains(&"example.com".to_string()));
    assert!(host_only.contains(&"subdomain.example.com".to_string()));
    // Path-only extraction.
    let mut transformer = UrlTransformer::new();
    transformer.with_show_only_path(true);
    let path_only = transformer.transform(urls.clone());
    assert_eq!(path_only.len(), 2);
    assert!(path_only.contains(&"/path/to/page".to_string()));
    assert!(path_only.contains(&"/another/path".to_string()));
    // Query-string extraction; parameter order is not guaranteed.
    let mut transformer = UrlTransformer::new();
    transformer.with_show_only_param(true);
    let param_only = transformer.transform(urls);
    assert_eq!(param_only.len(), 2);
    assert!(
        param_only.contains(&"param1=value1&param2=value2".to_string())
            || param_only.contains(&"param2=value2&param1=value1".to_string())
    );
    assert!(param_only.contains(&"id=123".to_string()));
}
}