urx 0.8.0

Extracts URLs from OSINT Archives for Security Insights.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
use clap::Parser;
use std::path::PathBuf;

#[derive(Parser, Debug, Clone)]
#[clap(name = "urx", version)]
pub struct Args {
    /// Domains to fetch URLs for
    #[clap(name = "DOMAINS")]
    pub domains: Vec<String>,

    /// Config file to load
    #[clap(short, long, value_parser)]
    pub config: Option<PathBuf>,

    #[clap(help_heading = "Input Options")]
    /// Read URLs directly from files (supports WARC, URLTeam compressed, and text files). Use multiple --files flags or space-separate multiple files.
    #[clap(long, action = clap::ArgAction::Append, num_args = 1.., value_parser)]
    pub files: Vec<PathBuf>,

    #[clap(help_heading = "Output Options")]
    /// Output file to write results
    #[clap(short, long, value_parser)]
    pub output: Option<PathBuf>,

    /// Output format (e.g., "plain", "json", "csv")
    #[clap(help_heading = "Output Options")]
    #[clap(short, long, default_value = "plain")]
    pub format: String,

    /// Merge endpoints with the same path and merge URL parameters
    #[clap(help_heading = "Output Options")]
    #[clap(long)]
    pub merge_endpoint: bool,

    /// Normalize URLs for better deduplication (sorts query parameters, removes trailing slashes)
    #[clap(help_heading = "Output Options")]
    #[clap(long)]
    pub normalize_url: bool,

    /// Providers to use (comma-separated, e.g., "wayback,cc,otx,vt,urlscan")
    #[clap(help_heading = "Provider Options")]
    #[clap(long, value_delimiter = ',', default_value = "wayback,cc,otx")]
    pub providers: Vec<String>,

    /// Include subdomains when searching
    #[clap(help_heading = "Provider Options")]
    #[clap(long)]
    pub subs: bool,

    #[clap(help_heading = "Provider Options")]
    /// Common Crawl index to use (e.g., CC-MAIN-2025-13)
    #[clap(long, default_value = "CC-MAIN-2025-13")]
    pub cc_index: String,

    #[clap(help_heading = "Provider Options")]
    /// API key for VirusTotal (can be used multiple times for rotation, can also use URX_VT_API_KEY environment variable with comma-separated keys)
    #[clap(long, action = clap::ArgAction::Append)]
    pub vt_api_key: Vec<String>,

    #[clap(help_heading = "Provider Options")]
    /// API key for Urlscan (can be used multiple times for rotation, can also use URX_URLSCAN_API_KEY environment variable with comma-separated keys)
    #[clap(long, action = clap::ArgAction::Append)]
    pub urlscan_api_key: Vec<String>,

    /// Include robots.txt discovery (default: true)
    // Hidden flag kept for backward compatibility; the user-facing switch is --exclude-robots.
    #[clap(long, default_value = "true", hide = true)]
    pub include_robots: bool,

    /// Exclude robots.txt discovery
    #[clap(long, help_heading = "Discovery Options")]
    pub exclude_robots: bool,

    /// Include sitemap.xml discovery (default: true)
    // Hidden flag kept for backward compatibility; the user-facing switch is --exclude-sitemap.
    #[clap(long, default_value = "true", hide = true)]
    pub include_sitemap: bool,

    /// Exclude sitemap.xml discovery
    #[clap(long, help_heading = "Discovery Options")]
    pub exclude_sitemap: bool,

    #[clap(help_heading = "Display Options")]
    /// Show verbose output
    #[clap(short, long)]
    pub verbose: bool,

    #[clap(help_heading = "Display Options")]
    /// Silent mode (no output)
    #[clap(long)]
    pub silent: bool,

    #[clap(help_heading = "Display Options")]
    /// No progress bar
    #[clap(long)]
    pub no_progress: bool,

    /// Filter Presets (e.g., "no-resources,no-images,only-js,only-style")
    #[clap(help_heading = "Filter Options")]
    #[clap(short, long, value_delimiter = ',')]
    pub preset: Vec<String>,

    /// Filter URLs to only include those with specific extensions (comma-separated, e.g., "js,php,aspx")
    #[clap(help_heading = "Filter Options")]
    #[clap(short, long, value_delimiter = ',')]
    pub extensions: Vec<String>,

    /// Filter URLs to exclude those with specific extensions (comma-separated, e.g., "html,txt")
    #[clap(help_heading = "Filter Options")]
    #[clap(long, value_delimiter = ',')]
    pub exclude_extensions: Vec<String>,

    /// Filter URLs to only include those containing specific patterns (comma-separated)
    #[clap(help_heading = "Filter Options")]
    #[clap(long, value_delimiter = ',')]
    pub patterns: Vec<String>,

    /// Filter URLs to exclude those containing specific patterns (comma-separated)
    #[clap(help_heading = "Filter Options")]
    #[clap(long, value_delimiter = ',')]
    pub exclude_patterns: Vec<String>,

    /// Only show the host part of the URLs
    #[clap(help_heading = "Filter Options")]
    #[clap(long)]
    pub show_only_host: bool,

    /// Only show the path part of the URLs
    #[clap(help_heading = "Filter Options")]
    #[clap(long)]
    pub show_only_path: bool,

    /// Only show the parameters part of the URLs
    #[clap(help_heading = "Filter Options")]
    #[clap(long)]
    pub show_only_param: bool,

    /// Minimum URL length to include
    #[clap(help_heading = "Filter Options")]
    #[clap(long = "min-length")]
    pub min_length: Option<usize>,

    /// Maximum URL length to include
    #[clap(help_heading = "Filter Options")]
    #[clap(long = "max-length")]
    pub max_length: Option<usize>,

    /// Enforce exact host validation (default)
    // NOTE(review): a bool flag with default_value = "true" is always true — passing
    // --strict is a no-op and there is no way to turn it off from the CLI. If an
    // opt-out is intended, this needs ArgAction::Set or a --no-strict counterpart.
    #[clap(help_heading = "Filter Options")]
    #[clap(long, default_value = "true")]
    pub strict: bool,

    /// Control which components network settings apply to (all, providers, testers, or providers,testers)
    #[clap(help_heading = "Network Options")]
    #[clap(long, default_value = "all", value_parser = validate_network_scope)]
    pub network_scope: String,

    #[clap(help_heading = "Network Options")]
    /// Use proxy for HTTP requests (format: <http://proxy.example.com:8080>)
    #[clap(long)]
    pub proxy: Option<String>,

    /// Proxy authentication credentials (format: username:password)
    #[clap(help_heading = "Network Options")]
    #[clap(long)]
    pub proxy_auth: Option<String>,

    /// Skip SSL certificate verification (accept self-signed certs)
    #[clap(help_heading = "Network Options")]
    #[clap(long)]
    pub insecure: bool,

    /// Use a random User-Agent for HTTP requests
    #[clap(help_heading = "Network Options")]
    #[clap(long)]
    pub random_agent: bool,

    /// Request timeout in seconds
    #[clap(help_heading = "Network Options")]
    #[clap(long, default_value = "120")]
    pub timeout: u64,

    /// Number of retries for failed requests
    #[clap(help_heading = "Network Options")]
    #[clap(long, default_value = "2")]
    pub retries: u32,

    /// Maximum number of parallel requests per provider and maximum concurrent domain processing
    // NOTE(review): Option combined with default_value means this parses as Some(5)
    // when the flag is absent — the None case is unreachable from the CLI. Confirm
    // whether Option is needed (e.g. config-file merging) or could be plain u32.
    #[clap(help_heading = "Network Options")]
    #[clap(long, default_value = "5")]
    pub parallel: Option<u32>,

    /// Rate limit (requests per second)
    #[clap(help_heading = "Network Options")]
    #[clap(long)]
    pub rate_limit: Option<f32>,

    /// Check HTTP status code of collected URLs
    // visible_alias must not embed the "--" prefix; clap adds it when rendering/matching.
    // A visible alias is also functional, so the separate hidden alias is redundant.
    #[clap(help_heading = "Testing Options")]
    #[clap(long, visible_alias = "cs")]
    pub check_status: bool,

    /// Include URLs with specific HTTP status codes or patterns (e.g., --is=200,30x)
    #[clap(help_heading = "Testing Options")]
    #[clap(long, visible_alias = "is")]
    pub include_status: Vec<String>,

    /// Exclude URLs with specific HTTP status codes or patterns (e.g., --es=404,50x,5xx)
    #[clap(help_heading = "Testing Options")]
    #[clap(long, visible_alias = "es")]
    pub exclude_status: Vec<String>,

    /// Extract additional links from collected URLs (requires HTTP requests)
    #[clap(help_heading = "Testing Options")]
    #[clap(long)]
    pub extract_links: bool,

    /// Enable incremental scanning mode (only return new URLs compared to previous scans)
    #[clap(help_heading = "Cache Options")]
    #[clap(long)]
    pub incremental: bool,

    /// Cache backend type (sqlite or redis)
    #[clap(help_heading = "Cache Options")]
    #[clap(long, default_value = "sqlite")]
    pub cache_type: String,

    /// Path for SQLite cache database
    #[clap(help_heading = "Cache Options")]
    #[clap(long)]
    pub cache_path: Option<std::path::PathBuf>,

    /// Redis connection URL for remote caching
    #[clap(help_heading = "Cache Options")]
    #[clap(long)]
    pub redis_url: Option<String>,

    /// Cache time-to-live in seconds (default: 24 hours)
    #[clap(help_heading = "Cache Options")]
    #[clap(long, default_value = "86400")]
    pub cache_ttl: u64,

    /// Disable caching entirely
    #[clap(help_heading = "Cache Options")]
    #[clap(long)]
    pub no_cache: bool,
}

/// Read newline-separated domains from stdin, trimming whitespace and
/// skipping blank lines. Stops at the first I/O error, wrapping it with
/// context for the caller.
pub fn read_domains_from_stdin() -> anyhow::Result<Vec<String>> {
    use anyhow::Context;
    use std::io::{self, BufRead};

    io::stdin()
        .lock()
        .lines()
        .map(|line| line.context("Failed to read line from stdin"))
        .filter_map(|read| match read {
            // Keep only non-empty trimmed lines; drop blanks silently.
            Ok(raw) => {
                let trimmed = raw.trim();
                (!trimmed.is_empty()).then(|| Ok(trimmed.to_string()))
            }
            // Propagate the error; collect() short-circuits on the first Err.
            Err(e) => Some(Err(e)),
        })
        .collect()
}

impl Args {
    /// Whether robots.txt discovery should run: enabled by the hidden
    /// include flag (true by default) unless explicitly excluded.
    pub fn should_use_robots(&self) -> bool {
        self.include_robots && !self.exclude_robots
    }

    /// Whether sitemap.xml discovery should run: enabled by the hidden
    /// include flag (true by default) unless explicitly excluded.
    pub fn should_use_sitemap(&self) -> bool {
        self.include_sitemap && !self.exclude_sitemap
    }
}

/// clap value parser for --network-scope: accepts only the documented
/// scope combinations and echoes the value back on success.
fn validate_network_scope(s: &str) -> Result<String, String> {
    const ALLOWED: [&str; 5] = [
        "all",
        "providers",
        "testers",
        "providers,testers",
        "testers,providers",
    ];
    if ALLOWED.contains(&s) {
        Ok(s.to_string())
    } else {
        Err(format!("Invalid network scope: {s}. Allowed values are all, providers, testers, or providers,testers"))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Verifies every clap default declared on Args, plus the derived
    // robots/sitemap helper results in the default state.
    #[test]
    fn test_args_default_values() {
        let args = Args::parse_from(["urx", "example.com"]);
        assert_eq!(args.domains, vec!["example.com"]);
        assert_eq!(args.format, "plain");
        assert_eq!(args.providers, vec!["wayback", "cc", "otx"]);
        assert_eq!(args.cc_index, "CC-MAIN-2025-13");
        assert_eq!(args.timeout, 120);
        assert_eq!(args.retries, 2);
        assert!(args.include_robots);
        assert!(args.include_sitemap);
        assert!(!args.exclude_robots);
        assert!(!args.exclude_sitemap);
        assert!(args.should_use_robots());
        assert!(args.should_use_sitemap());
    }

    // Positional DOMAINS arg accepts more than one value.
    #[test]
    fn test_args_multiple_domains() {
        let args = Args::parse_from(["urx", "example.com", "example.org"]);
        assert_eq!(args.domains, vec!["example.com", "example.org"]);
    }

    // Short flags -o and -f map to output path and format.
    #[test]
    fn test_args_output_options() {
        let args = Args::parse_from(["urx", "example.com", "-o", "output.txt", "-f", "json"]);
        assert_eq!(args.domains, vec!["example.com"]);
        assert!(args.output.is_some());
        assert_eq!(args.output.unwrap().to_str().unwrap(), "output.txt");
        assert_eq!(args.format, "json");
    }

    // --providers splits its comma-delimited value into a Vec.
    #[test]
    fn test_args_providers() {
        let args = Args::parse_from(["urx", "example.com", "--providers", "wayback,vt"]);
        assert_eq!(args.providers, vec!["wayback", "vt"]);
    }

    // Network flags: proxy string is stored verbatim, timeout parses as u64.
    #[test]
    fn test_network_options() {
        let args = Args::parse_from([
            "urx",
            "example.com",
            "--proxy",
            "http://proxy:8080",
            "--timeout",
            "60",
        ]);
        assert_eq!(args.proxy.unwrap(), "http://proxy:8080");
        assert_eq!(args.timeout, 60);
    }

    // Extension include/exclude lists are comma-delimited.
    #[test]
    fn test_filter_options() {
        let args = Args::parse_from([
            "urx",
            "example.com",
            "-e",
            "js,php",
            "--exclude-extensions",
            "html,css",
        ]);
        assert_eq!(args.extensions, vec!["js", "php"]);
        assert_eq!(args.exclude_extensions, vec!["html", "css"]);
    }

    #[test]
    fn test_robots_sitemap_flags() {
        // Test default values are true for include flags and false for exclude flags
        let args = Args::parse_from(["urx", "example.com"]);
        assert!(args.include_robots);
        assert!(args.include_sitemap);
        assert!(!args.exclude_robots);
        assert!(!args.exclude_sitemap);
        assert!(args.should_use_robots());
        assert!(args.should_use_sitemap());

        // Test they can be disabled via exclude flags (visible in help)
        let args = Args::parse_from([
            "urx",
            "example.com",
            "--exclude-robots",
            "--exclude-sitemap",
        ]);
        assert!(args.exclude_robots);
        assert!(args.exclude_sitemap);
        assert!(!args.should_use_robots());
        assert!(!args.should_use_sitemap());
    }

    #[test]
    fn test_robots_sitemap_helper_methods() {
        // Default is to use both
        let args = Args::parse_from(["urx", "example.com"]);
        assert!(args.should_use_robots());
        assert!(args.should_use_sitemap());

        // Exclude flags take precedence over include flags
        let args = Args::parse_from(["urx", "example.com", "--exclude-robots"]);
        assert!(!args.should_use_robots());
        assert!(args.should_use_sitemap());

        // Explicit exclude always wins over include setting
        let args = Args::parse_from(["urx", "example.com", "--include-robots", "--exclude-robots"]);
        assert!(args.exclude_robots);
        assert!(args.include_robots); // Both flags retain their values
        assert!(!args.should_use_robots()); // But should_use_robots uses the logic
    }

    // validate_network_scope: all documented combinations are accepted.
    // Note: "testers,providers" is also valid but not covered here.
    #[test]
    fn test_validate_network_scope_valid() {
        assert!(validate_network_scope("all").is_ok());
        assert!(validate_network_scope("providers").is_ok());
        assert!(validate_network_scope("testers").is_ok());
        assert!(validate_network_scope("providers,testers").is_ok());
    }

    #[test]
    fn test_validate_network_scope_invalid() {
        assert!(validate_network_scope("invalid").is_err());
    }

    #[test]
    fn test_files_flag() {
        // Test that the new --files flag accepts multiple files
        let args = Args::parse_from(["urx", "--files", "file1.txt", "file2.warc", "--verbose"]);
        assert_eq!(args.files.len(), 2);
        assert_eq!(args.files[0].to_str().unwrap(), "file1.txt");
        assert_eq!(args.files[1].to_str().unwrap(), "file2.warc");
        assert!(args.verbose);
    }

    #[test]
    fn test_multiple_files_flags() {
        // Test that repeated --files flags work
        let args = Args::parse_from(["urx", "--files", "file1.txt", "--files", "file2.warc"]);
        assert_eq!(args.files.len(), 2);
        assert_eq!(args.files[0].to_str().unwrap(), "file1.txt");
        assert_eq!(args.files[1].to_str().unwrap(), "file2.warc");
    }

    // NOTE(review): this test re-implements the trim/skip-blank logic over a
    // Cursor instead of exercising read_domains_from_stdin itself (which reads
    // the real stdin and cannot be fed from a test). It guards the intended
    // behavior, not the function — consider refactoring the function to accept
    // a generic BufRead so it can be tested directly.
    #[test]
    fn test_read_domains_from_stdin() {
        use std::io::{self, BufRead, Cursor};

        // Create a cursor with test input data
        let input = "example.com\nexample.org\n\n";
        let cursor = Cursor::new(input);

        // Extract lines from the cursor
        let buffer = io::BufReader::new(cursor);
        let mut domains = Vec::new();
        for line in buffer.lines() {
            let domain = line.unwrap();
            if !domain.trim().is_empty() {
                domains.push(domain.trim().to_string());
            }
        }

        assert_eq!(domains, vec!["example.com", "example.org"]);
    }
}