1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
use clap::{Parser, ValueEnum};
// Command-line arguments for the `urlsieve` command, parsed with clap's derive API.
//
// NOTE: clap's derive turns the `///` doc comments on the fields below into the
// per-option `--help` text, so treat them as user-facing strings when editing.
// Maintainer-only notes in this struct use plain `//` comments for that reason.
// Field order is also the order in which the options are declared to clap.
#[derive(Parser, Debug)]
#[command(name = "urlsieve")]
#[command(about = "Intelligent URL deduplication tool for bug bounty workflows")]
#[command(version)]
// The bool fields are independent on/off command-line switches, so the pedantic
// "too many bools in a struct" lint is deliberately silenced here.
#[allow(clippy::struct_excessive_bools)]
pub struct Cli {
// Exposed as `-i` / `--input`; `value_name` only sets the placeholder shown in help.
/// Input file (reads from stdin if omitted)
#[arg(value_name = "INPUT", short = 'i', long)]
pub input: Option<String>,
// Exposed as `-o` / `--output`.
/// Output file (writes to stdout if omitted)
#[arg(short, long)]
pub output: Option<String>,
// Exposed as `-f` / `--format`; the string default "rep" is parsed into `OutputFormat`.
/// Output format
#[arg(short, long, default_value = "rep")]
pub format: OutputFormat,
// Exposed as `-c` / `--config`; only the path string is stored here.
/// Config file (TOML)
#[arg(short, long)]
pub config: Option<String>,
/// Show deduplication statistics
#[arg(long)]
pub stats: bool,
// Stored as the raw string; splitting on commas is not done in this struct.
/// Patterns to enable (comma-separated)
#[arg(long)]
pub patterns: Option<String>,
// `None` when the flag is absent; any fallback value is chosen by the consumer
// (not visible in this block).
/// Minimum segment length for entropy check
#[arg(long)]
pub min_segment_len: Option<usize>,
// `None` when the flag is absent; no range validation is attached to this argument.
/// Shannon entropy threshold for dynamic detection
#[arg(long)]
pub entropy_threshold: Option<f64>,
// Stored as the raw string; splitting on commas is not done in this struct.
/// Parameter keys whose values are always normalized (comma-separated)
#[arg(long)]
pub normalize_param_keys: Option<String>,
// Stored as the raw string; splitting on commas is not done in this struct.
/// Parameter keys whose values are never normalized (comma-separated)
#[arg(long)]
pub keep_param_keys: Option<String>,
/// Remove query params entirely from fingerprint
#[arg(long)]
pub strip_query: bool,
// Always populated: falls back to "https" when the flag is not given.
/// Prepend scheme to scheme-less URLs
#[arg(long, default_value = "https")]
pub assume_scheme: String,
/// Write invalid/malformed URLs to file
#[arg(long)]
pub invalid_output: Option<String>,
/// Analyze cardinality and print report
#[arg(long)]
pub learn: bool,
// `requires = "learn"`: clap rejects `--apply` unless `--learn` is also present.
/// Apply learned config during analysis
#[arg(long, requires = "learn")]
pub apply: bool,
// `requires = "learn"`: clap rejects `--save-config` unless `--learn` is also present.
/// Save learned config to TOML file
#[arg(long, requires = "learn")]
pub save_config: Option<String>,
// Holds the baseline file path as given on the command line.
/// Compare input against baseline file (fingerprint match)
#[arg(long)]
pub diff: Option<String>,
// `requires = "diff"`: clap rejects `--diff-strict` unless `--diff` is also present.
/// Use exact URL matching in diff mode
#[arg(long, requires = "diff")]
pub diff_strict: bool,
/// Sort output by fingerprint (deterministic but slower for large inputs)
#[arg(long)]
pub sort: bool,
}
// Output formats selectable on the command line; used as the type of `Cli::format`.
//
// `ValueEnum` lets clap parse the option value straight into a variant and list
// the variants as the accepted values (the "rep" default on `Cli::format`
// corresponds to `Rep`). The `///` comments on the variants are picked up by
// clap as the help text for each possible value, so they are user-facing strings.
// `Copy`/`Eq` allow the value to be passed around and compared cheaply.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum OutputFormat {
/// One representative URL per group
Rep,
/// URL with duplicate count as comment
Counted,
/// Structured JSON output (single JSON object)
Json,
/// JSON Lines (one JSON object per line, stream-friendly)
Jsonl,
}