use clap::{Args, Parser, Subcommand};
// Top-level command-line definition for the `crawlex` binary.
//
// Plain `//` comments are used deliberately in this item: clap's derive macros turn
// `///` doc comments into help text, which would alter the `--help` output.
#[derive(Debug, Parser)]
#[command(
    name = "crawlex",
    version,
    about = "Stealth crawler with Chrome-perfect fingerprint"
)]
pub struct Cli {
    // First-level subcommand selected on the command line.
    #[command(subcommand)]
    pub command: Command,
}
// First-level subcommands of the CLI.
//
// Every variant is itself marked `#[command(subcommand)]`, so each one wraps a
// second-level enum and the command line has the shape `<noun> <verb> [args]`.
// clap derives the subcommand names from the variant names in kebab-case.
#[derive(Debug, Subcommand)]
pub enum Command {
    // `pages <verb>`
    #[command(subcommand)]
    Pages(PagesVerb),

    // `crawl <verb>`
    #[command(subcommand)]
    Crawl(CrawlVerb),

    // `fingerprint <verb>`
    #[command(subcommand)]
    Fingerprint(FingerprintVerb),

    // `graph <verb>`
    #[command(subcommand)]
    Graph(GraphVerb),

    // `queue <verb>`
    #[command(subcommand)]
    Queue(QueueVerb),

    // `sessions <verb>` (plural; distinct from `session` below)
    #[command(subcommand)]
    Sessions(SessionsVerb),

    // `session <verb>` (singular)
    #[command(subcommand)]
    Session(SessionVerb),

    // `telemetry <verb>`
    #[command(subcommand)]
    Telemetry(TelemetryVerb),

    // `stealth <verb>`
    #[command(subcommand)]
    Stealth(StealthVerb),
}
// Verbs accepted after the `pages` subcommand.
#[derive(Debug, Subcommand)]
pub enum PagesVerb {
    // `pages run`, configured by the options in `CrawlArgs`.
    Run(CrawlArgs),
}
// Verbs accepted after the `crawl` subcommand.
#[derive(Debug, Subcommand)]
pub enum CrawlVerb {
    // `crawl resume`, configured by the options in `ResumeArgs`.
    Resume(ResumeArgs),
}
// Verbs accepted after the `fingerprint` subcommand.
//
// The argument structs carry an `Intel` prefix rather than `Fingerprint`.
#[derive(Debug, Subcommand)]
pub enum FingerprintVerb {
    // `fingerprint run <target>`
    Run(IntelArgs),
    // `fingerprint show <target>`
    Show(IntelShowArgs),
    // `fingerprint export <target>`
    Export(IntelExportArgs),
}
// Verbs accepted after the `graph` subcommand.
#[derive(Debug, Subcommand)]
pub enum GraphVerb {
    // `graph export`, configured by the options in `ExportGraphArgs`.
    Export(ExportGraphArgs),
}
// Verbs accepted after the `queue` subcommand.
#[derive(Debug, Subcommand)]
pub enum QueueVerb {
    // `queue stats`
    Stats(QueueStatsArgs),
    // `queue purge`
    Purge(QueuePurgeArgs),
    // `queue export`
    Export(QueueExportArgs),
}
// Verbs accepted after the plural `sessions` subcommand.
#[derive(Debug, Subcommand)]
pub enum SessionsVerb {
    // `sessions list`
    List(SessionsListArgs),
}
// Verbs accepted after the singular `session` subcommand.
#[derive(Debug, Subcommand)]
pub enum SessionVerb {
    // `session drop`
    Drop(SessionDropArgs),
}
// Verbs accepted after the `telemetry` subcommand.
#[derive(Debug, Subcommand)]
pub enum TelemetryVerb {
    // `telemetry show`
    Show(TelemetryShowArgs),
}
// Verbs accepted after the `stealth` subcommand.
#[derive(Debug, Subcommand)]
pub enum StealthVerb {
    // `stealth test` — unit variant, so it takes no arguments of its own.
    Test,

    // `stealth inspect <url>`
    Inspect(InspectArgs),

    // `stealth catalog <verb>` — a further nested level of subcommands.
    #[command(subcommand)]
    Catalog(CatalogVerb),
}
// Verbs accepted after `stealth catalog`.
#[derive(Debug, Subcommand)]
pub enum CatalogVerb {
    // `stealth catalog list`
    List(CatalogListArgs),
    // `stealth catalog show <profile>`
    Show(CatalogShowArgs),
}
// Arguments for `stealth catalog list`.
#[derive(Debug, Args)]
pub struct CatalogListArgs {
    // Optional `--filter <FILTER>`; how the string is matched is decided by the consumer.
    #[arg(long)]
    pub filter: Option<String>,

    // `--json` switch, off unless given.
    #[arg(long, default_value_t = false)]
    pub json: bool,
}
// Arguments for `stealth catalog show`.
#[derive(Debug, Args)]
pub struct CatalogShowArgs {
    // Required positional argument (no `#[arg]` attribute, so clap treats it as positional).
    pub profile: String,

    // `--json` switch, off unless given.
    #[arg(long, default_value_t = false)]
    pub json: bool,
}
// Arguments for `telemetry show`.
#[derive(Debug, Args)]
pub struct TelemetryShowArgs {
    // Required `--db <DB>` option.
    #[arg(long)]
    pub db: String,

    // `--top <TOP>`, defaulting to 20 when omitted.
    #[arg(long, default_value_t = 20)]
    pub top: usize,
}
// Arguments for `queue stats`.
#[derive(Debug, Args)]
pub struct QueueStatsArgs {
    // Required `--queue-path <QUEUE_PATH>` option.
    #[arg(long)]
    pub queue_path: String,
}
// Arguments for `queue purge`.
#[derive(Debug, Args)]
pub struct QueuePurgeArgs {
    // Required `--queue-path <QUEUE_PATH>` option.
    #[arg(long)]
    pub queue_path: String,
}
// Arguments for `queue export`.
#[derive(Debug, Args)]
pub struct QueueExportArgs {
    // Required `--queue-path <QUEUE_PATH>` option.
    #[arg(long)]
    pub queue_path: String,

    // Required `--out <OUT>` option.
    #[arg(long)]
    pub out: String,
}
// Arguments for `sessions list`.
#[derive(Debug, Args)]
pub struct SessionsListArgs {
    // Required `--storage-path <STORAGE_PATH>` option.
    #[arg(long)]
    pub storage_path: String,

    // Optional `--state <STATE>`; the accepted values are not constrained here.
    #[arg(long)]
    pub state: Option<String>,
}
// Arguments for `session drop`.
#[derive(Debug, Args)]
pub struct SessionDropArgs {
    // Required `--storage-path <STORAGE_PATH>` option.
    #[arg(long)]
    pub storage_path: String,

    // Required `--id <ID>` option.
    #[arg(long)]
    pub id: String,
}
// Arguments for `fingerprint export`.
#[derive(Debug, Clone, Args)]
pub struct IntelExportArgs {
    // Required positional argument.
    pub target: String,

    // `--db <DB>`, defaulting to `./crawlex.db`.
    #[arg(long, default_value = "./crawlex.db")]
    pub db: String,

    // Optional `--out <OUT>`.
    #[arg(long)]
    pub out: Option<String>,

    // Optional `--html <HTML>`.
    #[arg(long)]
    pub html: Option<String>,

    // `--pretty` switch, off unless given.
    #[arg(long, default_value_t = false)]
    pub pretty: bool,
}
// Arguments for `fingerprint show`.
#[derive(Debug, Clone, Args)]
pub struct IntelShowArgs {
    // Required positional argument.
    pub target: String,

    // `--db <DB>`, defaulting to `./crawlex.db`.
    #[arg(long, default_value = "./crawlex.db")]
    pub db: String,

    // `--limit <LIMIT>`, defaulting to 30.
    #[arg(long, default_value_t = 30)]
    pub limit: usize,
}
// Arguments for `fingerprint run`.
#[derive(Debug, Clone, Args)]
pub struct IntelArgs {
    // Required positional argument.
    pub target: String,

    // `--db <DB>`, defaulting to `./crawlex.db`.
    #[arg(long, default_value = "./crawlex.db")]
    pub db: String,

    // The remaining fields are boolean switches: each is `false` unless its flag is
    // present. The `no_*` names suggest opt-outs — confirm against the consumer.
    #[arg(long)]
    pub no_subdomains: bool,

    #[arg(long)]
    pub no_dns: bool,

    #[arg(long)]
    pub no_whois: bool,

    #[arg(long)]
    pub no_cert: bool,

    // `--network-probe` switch (named as an opt-in, unlike the `no_*` flags above).
    #[arg(long)]
    pub network_probe: bool,
}
#[derive(Args, Debug, Clone)]
pub struct CrawlArgs {
#[arg(long, action = clap::ArgAction::Append)]
pub seed: Vec<String>,
#[arg(long)]
pub seeds_file: Option<String>,
#[arg(long, default_value = "spoof")]
pub method: String,
#[arg(long)]
pub max_concurrent_render: Option<usize>,
#[arg(long)]
pub max_concurrent_http: Option<usize>,
#[arg(long)]
pub max_depth: Option<u32>,
#[arg(long, default_value_t = false)]
pub same_host_only: bool,
#[arg(long, default_value_t = true)]
pub include_subdomains: bool,
#[arg(long)]
pub respect_robots_txt: Option<bool>,
#[arg(long)]
pub wait_strategy: Option<String>,
#[arg(long)]
pub wait_idle_ms: Option<u64>,
#[arg(long = "render-request-timeout-ms")]
pub render_request_timeout_ms: Option<u64>,
#[arg(long = "navigation-lifecycle")]
pub navigation_lifecycle: Option<String>,
#[arg(long)]
pub profile: Option<String>,
#[arg(long)]
pub chrome_path: Option<String>,
#[arg(long = "chrome-flag", action = clap::ArgAction::Append)]
pub chrome_flag: Vec<String>,
#[arg(long)]
pub block_resource: Option<String>,
#[arg(long)]
pub queue: Option<String>,
#[arg(long)]
pub queue_path: Option<String>,
#[arg(long)]
pub queue_redis_url: Option<String>,
#[arg(long)]
pub storage: Option<String>,
#[arg(long)]
pub storage_path: Option<String>,
#[arg(long)]
pub output_html_dir: Option<String>,
#[arg(long)]
pub output_graph: Option<String>,
#[arg(long)]
pub output_metadata: Option<String>,
#[arg(long, default_value_t = false)]
pub screenshot: bool,
#[arg(long)]
pub screenshot_dir: Option<String>,
#[arg(long)]
pub screenshot_mode: Option<String>,
#[arg(long, default_value = "off")]
pub doh: String,
#[arg(long = "proxy", action = clap::ArgAction::Append)]
pub proxy: Vec<String>,
#[arg(long)]
pub proxy_file: Option<String>,
#[arg(long)]
pub proxy_strategy: Option<String>,
#[arg(long, default_value_t = false)]
pub proxy_sticky_per_host: bool,
#[arg(long)]
pub proxy_health_check_interval_secs: Option<u64>,
#[arg(long, default_value_t = false)]
pub raffel_proxy: bool,
#[arg(long, default_value = "/home/cyber/Work/tetis/libs/raffel")]
pub raffel_proxy_path: String,
#[arg(long, default_value = "127.0.0.1")]
pub raffel_proxy_host: String,
#[arg(long, default_value_t = 8899)]
pub raffel_proxy_port: u16,
#[arg(long = "hook-script", action = clap::ArgAction::Append)]
pub hook_script: Vec<String>,
#[arg(long = "hook-bridge")]
pub hook_bridge: Option<String>,
#[arg(long)]
pub on_discovery_filter_regex: Option<String>,
#[arg(long)]
pub identity_preset: Option<u8>,
#[arg(long, conflicts_with = "identity_preset")]
pub persona: Option<String>,
#[arg(long, default_value_t = false)]
pub follow_all_assets: bool,
#[arg(long, default_value_t = false)]
pub crtsh: bool,
#[arg(long, default_value_t = false)]
pub no_robots_paths: bool,
#[arg(long, default_value_t = false)]
pub no_well_known: bool,
#[arg(long, default_value_t = false)]
pub no_pwa: bool,
#[arg(long, default_value_t = false)]
pub no_favicon: bool,
#[arg(long, default_value_t = false)]
pub wayback: bool,
#[arg(long, default_value_t = false)]
pub dns: bool,
#[arg(long, default_value_t = false)]
pub metrics: bool,
#[arg(long, default_value_t = false)]
pub metrics_net: bool,
#[arg(long, default_value_t = false)]
pub metrics_vitals: bool,
#[arg(long, default_value_t = false)]
pub peer_cert: bool,
#[arg(long, default_value_t = false)]
pub rdap: bool,
#[arg(long, default_value_t = false)]
pub no_cookies: bool,
#[arg(long)]
pub render_session_scope: Option<String>,
#[arg(long, default_value_t = false)]
pub no_follow_redirects: bool,
#[arg(long, default_value_t = false)]
pub no_fetch_chromium: bool,
#[arg(long)]
pub max_redirects: Option<u8>,
#[arg(long)]
pub actions_file: Option<String>,
#[arg(long, value_name = "PATH", conflicts_with = "actions_file")]
pub script_spec: Option<String>,
#[arg(long)]
pub rate_per_host_rps: Option<f64>,
#[arg(long)]
pub retry_max: Option<u32>,
#[arg(long)]
pub retry_backoff_ms: Option<u64>,
#[arg(long)]
pub user_agent_override: Option<String>,
#[arg(long)]
pub timezone: Option<String>,
#[arg(long)]
pub locale: Option<String>,
#[arg(long)]
pub metrics_prometheus_port: Option<u16>,
#[arg(long, default_value = "info")]
pub log_level: String,
#[arg(long, default_value = "text")]
pub log_format: String,
#[arg(long, default_value = "none")]
pub emit: String,
#[arg(long = "policy", default_value = "balanced")]
pub policy: String,
#[arg(long)]
pub config: Option<String>,
#[arg(long, default_value_t = false)]
pub explain: bool,
#[arg(long, default_value_t = false)]
pub no_spa_observer: bool,
#[arg(long, default_value_t = false)]
pub collect_indexeddb: bool,
#[arg(long, default_value_t = false)]
pub collect_cache_storage: bool,
#[arg(long, default_value_t = false)]
pub collect_spa_state: bool,
#[arg(long)]
pub action_policy: Option<String>,
#[arg(long)]
pub challenge_mode: Option<String>,
#[arg(long, value_name = "LEVEL")]
pub antibot_bypass: Option<String>,
#[arg(long)]
pub max_browsers: Option<usize>,
#[arg(long)]
pub max_pages_per_context: Option<usize>,
#[arg(long)]
pub max_per_host_inflight: Option<usize>,
#[arg(long)]
pub max_per_origin_inflight: Option<usize>,
#[arg(long)]
pub max_per_proxy_inflight: Option<usize>,
#[arg(long)]
pub max_per_session_inflight: Option<usize>,
#[arg(long)]
pub session_ttl_secs: Option<u64>,
#[arg(long, default_value_t = false)]
pub session_scope_auto: bool,
#[arg(long, default_value_t = false)]
pub keep_blocked_sessions: bool,
#[arg(long)]
pub motion_profile: Option<String>,
#[arg(long, default_value_t = false)]
pub reading_dwell: bool,
#[arg(long, default_value_t = 250)]
pub reading_dwell_wpm: u32,
#[arg(long, default_value_t = 40)]
pub reading_dwell_jitter_ms: u64,
#[arg(long, default_value = "none")]
pub residential_provider: String,
#[arg(long, default_value = "none")]
pub captcha_solver: String,
#[arg(long)]
pub mobile_profile: Option<String>,
}
// Arguments for `crawl resume`.
#[derive(Debug, Args)]
pub struct ResumeArgs {
    // Required `--queue-path <QUEUE_PATH>` option.
    #[arg(long)]
    pub queue_path: String,
}
// Arguments for `stealth inspect`.
#[derive(Debug, Args)]
pub struct InspectArgs {
    // Required positional argument.
    pub url: String,

    // Optional `--profile <PROFILE>`.
    #[arg(long)]
    pub profile: Option<String>,
}
// Queue subcommands written with inline struct variants.
//
// NOTE(review): this has the same shape as `QueueVerb` plus its `Queue*Args` structs,
// and `Command` does not reference it in the visible code — confirm whether it is still
// used elsewhere.
#[derive(Debug, Subcommand)]
pub enum QueueCmd {
    // `stats --queue-path <QUEUE_PATH>`
    Stats {
        #[arg(long)]
        queue_path: String,
    },

    // `purge --queue-path <QUEUE_PATH>`
    Purge {
        #[arg(long)]
        queue_path: String,
    },

    // `export --queue-path <QUEUE_PATH> --out <OUT>`
    Export {
        #[arg(long)]
        queue_path: String,
        #[arg(long)]
        out: String,
    },
}
// Session subcommands written with inline struct variants.
//
// NOTE(review): the fields match `SessionsListArgs` and `SessionDropArgs`, and `Command`
// does not reference this enum in the visible code — confirm whether it is still used
// elsewhere.
#[derive(Debug, Subcommand)]
pub enum SessionsCmd {
    // `list --storage-path <STORAGE_PATH> [--state <STATE>]`
    List {
        #[arg(long)]
        storage_path: String,
        #[arg(long)]
        state: Option<String>,
    },

    // `drop --storage-path <STORAGE_PATH> --id <ID>`
    Drop {
        #[arg(long)]
        storage_path: String,
        #[arg(long)]
        id: String,
    },
}
// Telemetry subcommands written with inline struct variants.
//
// NOTE(review): the fields match `TelemetryShowArgs`, and `Command` does not reference
// this enum in the visible code — confirm whether it is still used elsewhere.
#[derive(Debug, Subcommand)]
pub enum TelemetryCmd {
    // `show --db <DB> [--top <TOP>]`, with `--top` defaulting to 20.
    Show {
        #[arg(long)]
        db: String,
        #[arg(long, default_value_t = 20)]
        top: usize,
    },
}
// Arguments for `graph export`.
#[derive(Debug, Args)]
pub struct ExportGraphArgs {
    // Required `--storage-path <STORAGE_PATH>` option.
    #[arg(long)]
    pub storage_path: String,

    // Required `--out <OUT>` option.
    #[arg(long)]
    pub out: String,
}