Skip to main content

api_scanner/
cli.rs

1// src/cli.rs
2//
3// CLI argument definitions and helpers shared by main and tests.
4
5use std::{
6    fs,
7    io::{self, BufRead},
8    path::PathBuf,
9};
10
11use anyhow::{Context, Result};
12use clap::{ArgGroup, Parser, ValueEnum};
13use url::Url;
14
15use crate::reports::{ReportFormat, Severity};
16
17// ── CLI definition ────────────────────────────────────────────────────────────
18
/// A fast, async web security scanner.
///
/// Reads a list of URLs from a file or stdin, runs the enabled checks
/// concurrently, and writes findings in JSON or NDJSON format.
// NOTE(review): the summary above mentions "JSON or NDJSON" and "file or stdin",
// while `CliFormat` offers pretty / ndjson / sarif and `--har` is a third input
// source. The `///` text is left untouched because clap derive can surface doc
// comments in generated help output.
//
// All `///` comments on the fields below double as the per-flag help text, so
// wording changes there are user-visible. Field declaration order is also the
// order in which flags are listed in `--help`.
#[derive(Debug, Parser)]
#[command(
    author,
    version,
    about,
    long_about = None,
    // Require exactly one of --urls, --stdin, or --har
    group(
        ArgGroup::new("input")
            .required(true)
            .args(["urls", "stdin", "har"])
    )
)]
pub struct Cli {
    // ── Input ────────────────────────────────────────────────────────────────
    // The three input selectors below are members of the "input" group declared
    // on the struct; each also repeats `group = "input"` on its own attribute.
    /// Path to a newline-delimited file of URLs to scan.
    #[arg(short = 'u', long, value_name = "FILE", group = "input")]
    pub urls: Option<PathBuf>,

    /// Read newline-delimited URLs from stdin instead of a file.
    #[arg(long, group = "input")]
    pub stdin: bool,

    /// Path to a HAR file; imports `log.entries[].request.url` as scan seeds.
    #[arg(long, value_name = "FILE", group = "input")]
    pub har: Option<PathBuf>,

    /// Skip pre-filtering of inaccessible URLs (enabled by default).
    #[arg(long)]
    pub no_filter: bool,

    /// Timeout for accessibility pre-check (seconds).
    #[arg(long, default_value_t = 3, value_name = "SECS")]
    pub filter_timeout: u64,

    /// Skip endpoint discovery and scan only the provided seed URLs.
    #[arg(long)]
    pub no_discovery: bool,

    // ── Output ───────────────────────────────────────────────────────────────
    /// Write findings to this file path (default: stdout).
    #[arg(short = 'o', long, value_name = "FILE")]
    pub output: Option<PathBuf>,

    /// Output format.
    #[arg(short = 'f', long, default_value = "pretty", value_name = "FORMAT")]
    pub format: CliFormat,

    // NOTE(review): the "NDJSON only" restriction is not expressed in the
    // attribute; presumably it is validated by the consumer — confirm.
    /// Emit NDJSON findings as they arrive (NDJSON only).
    #[arg(long)]
    pub stream: bool,

    /// Baseline NDJSON file; suppress findings already present in baseline.
    #[arg(long, value_name = "FILE")]
    pub baseline: Option<PathBuf>,

    /// Suppress all stdout output except findings (no summary box).
    #[arg(short = 'q', long)]
    pub quiet: bool,

    /// Print the summary box even in quiet mode.
    #[arg(long)]
    pub summary: bool,

    /// Disable automatic local report persistence under ~/Documents/ApiHunterReports.
    #[arg(long)]
    pub no_auto_report: bool,

    // ── Concurrency & limits ─────────────────────────────────────────────────
    /// Maximum number of concurrent in-flight requests.
    #[arg(short = 'c', long, default_value_t = 20, value_name = "N")]
    pub concurrency: usize,

    /// Maximum number of endpoints to scan per site (0 = unlimited).
    #[arg(short = 'n', long, default_value_t = 50, value_name = "N")]
    pub max_endpoints: usize,

    // ── Politeness ───────────────────────────────────────────────────────────
    /// Per-domain minimum delay between requests (milliseconds).
    #[arg(long, default_value_t = 150, value_name = "MS")]
    pub delay_ms: u64,

    /// Maximum number of retry attempts on transient errors.
    #[arg(long, default_value_t = 1, value_name = "N")]
    pub retries: u32,

    /// Per-request timeout (seconds).
    #[arg(long, default_value_t = 8, value_name = "SECS")]
    pub timeout_secs: u64,

    // ── WAF evasion ──────────────────────────────────────────────────────────
    /// Enable WAF-evasion heuristics (randomised UA, header shuffling, jitter).
    #[arg(long)]
    pub waf_evasion: bool,

    // NOTE(review): "Implies --waf-evasion" is not encoded in the attribute;
    // presumably the consumer treats a non-empty list as enabling it — confirm.
    /// Rotate through these User-Agent strings (comma-separated).
    /// Implies --waf-evasion.
    #[arg(long, value_name = "UA,...", value_delimiter = ',')]
    pub user_agents: Vec<String>,

    // ── Request headers & cookies ────────────────────────────────────────────
    // Both lists are split on ',' by clap, so a single value that itself
    // contains a comma is broken into several entries.
    /// Extra request headers applied to every request (e.g. "Authorization: Bearer xxx").
    #[arg(long, value_name = "NAME:VALUE", value_delimiter = ',')]
    pub headers: Vec<String>,

    /// Cookies applied to every request (e.g. "session=abc123,theme=dark").
    #[arg(long, value_name = "NAME=VALUE", value_delimiter = ',')]
    pub cookies: Vec<String>,

    // ── Proxy / TLS ──────────────────────────────────────────────────────────
    // Kept as a raw string here; no URL validation happens at parse time.
    /// HTTP/HTTPS proxy URL (e.g. http://127.0.0.1:8080).
    #[arg(long, value_name = "URL")]
    pub proxy: Option<String>,

    /// Accept invalid / self-signed TLS certificates (dangerous).
    #[arg(long)]
    pub danger_accept_invalid_certs: bool,

    // ── Active checks & client behaviour ─────────────────────────────────────
    /// Enable active (potentially invasive) checks.
    #[arg(long)]
    pub active_checks: bool,

    /// Dry-run active checks: do not send mutation probes, emit informational "would test" findings.
    #[arg(long)]
    pub dry_run: bool,

    /// Use per-host HTTP client pools.
    #[arg(long)]
    pub per_host_clients: bool,

    /// Enable adaptive concurrency (AIMD).
    #[arg(long)]
    pub adaptive_concurrency: bool,

    // ── Authentication ───────────────────────────────────────────────────────
    /// Convenience: add `Authorization: Bearer <token>`.
    #[arg(long, value_name = "TOKEN")]
    pub auth_bearer: Option<String>,

    /// Convenience: add `Authorization: Basic <base64(user:pass)>`.
    #[arg(long, value_name = "USER:PASS")]
    pub auth_basic: Option<String>,

    /// Path to a JSON auth flow descriptor for pre-scan login.
    /// See docs/auth-flow.md for the format.
    #[arg(long, value_name = "FILE")]
    pub auth_flow: Option<PathBuf>,

    // NOTE(review): the "--active-checks required" condition is not declared
    // via a clap `requires`; presumably enforced by the consumer — confirm.
    /// Second auth flow for cross-user IDOR checks (--active-checks required).
    #[arg(long, value_name = "FILE")]
    pub auth_flow_b: Option<PathBuf>,

    // Unlike the other list flags this one is `Option<Vec<_>>`, which lets the
    // consumer tell "flag absent" apart from "flag given".
    /// Extra auth-like headers to strip for unauthenticated probes (comma-separated).
    #[arg(long, value_name = "NAME", value_delimiter = ',')]
    pub unauth_strip_headers: Option<Vec<String>>,

    /// Load/save cookies from a JSON session file.
    #[arg(long, value_name = "FILE")]
    pub session_file: Option<PathBuf>,

    // ── Scanner toggles ──────────────────────────────────────────────────────
    // Every toggle is an opt-out switch: the flag is `false` unless passed.
    /// Disable the CORS scanner.
    #[arg(long)]
    pub no_cors: bool,

    /// Disable the CSP scanner.
    #[arg(long)]
    pub no_csp: bool,

    /// Disable the GraphQL scanner.
    #[arg(long)]
    pub no_graphql: bool,

    /// Disable the API-security scanner.
    #[arg(long)]
    pub no_api_security: bool,

    /// Disable the JWT scanner.
    #[arg(long)]
    pub no_jwt: bool,

    /// Disable the OpenAPI scanner.
    #[arg(long)]
    pub no_openapi: bool,

    /// Disable the Mass Assignment scanner (active checks).
    #[arg(long)]
    pub no_mass_assignment: bool,

    /// Disable the OAuth/OIDC scanner (active checks).
    #[arg(long)]
    pub no_oauth_oidc: bool,

    /// Disable the Rate Limit scanner (active checks).
    #[arg(long)]
    pub no_rate_limit: bool,

    /// Disable the CVE Template scanner (active checks).
    #[arg(long)]
    pub no_cve_templates: bool,

    /// Disable the WebSocket scanner (active checks).
    #[arg(long)]
    pub no_websocket: bool,

    // ── Reporting threshold ───────────────────────────────────────────────────
    // No default here: `None` means the flag was not supplied.
    /// Minimum severity to include in findings output.
    #[arg(long, value_name = "LEVEL")]
    pub min_severity: Option<CliSeverity>,

    /// Exit with code 1 when findings at or above this severity are found.
    #[arg(long, default_value = "medium", value_name = "LEVEL")]
    pub fail_on: CliSeverity,
}
235
236// ── Clap value enums ──────────────────────────────────────────────────────────
237
// Output formats selectable with `--format`; converted into `ReportFormat`
// by the `From` impl in this file.
// Plain `//` comments are used on purpose so that nothing is added to the
// help text that the `ValueEnum` derive may generate from doc comments.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum CliFormat {
    // Default value of `--format` (see `default_value = "pretty"` on `Cli::format`).
    Pretty,
    Ndjson,
    Sarif,
}
244
// Severity levels accepted by `--min-severity` and `--fail-on`; converted into
// the report-side `Severity` by the `From` impl in this file.
// Variants are listed from most to least severe. No ordering trait is derived
// here, so threshold comparisons presumably happen on `Severity` — confirm.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum CliSeverity {
    Critical,
    High,
    Medium,
    Low,
    Info,
}
253
254impl From<CliSeverity> for Severity {
255    fn from(c: CliSeverity) -> Self {
256        match c {
257            CliSeverity::Critical => Severity::Critical,
258            CliSeverity::High => Severity::High,
259            CliSeverity::Medium => Severity::Medium,
260            CliSeverity::Low => Severity::Low,
261            CliSeverity::Info => Severity::Info,
262        }
263    }
264}
265
266impl From<CliFormat> for ReportFormat {
267    fn from(c: CliFormat) -> Self {
268        match c {
269            CliFormat::Pretty => ReportFormat::Pretty,
270            CliFormat::Ndjson => ReportFormat::Ndjson,
271            CliFormat::Sarif => ReportFormat::Sarif,
272        }
273    }
274}
275
276// ── URL loader ────────────────────────────────────────────────────────────────
277
/// Root object of a HAR document; only the `log` member is deserialized.
#[derive(Debug, serde::Deserialize)]
struct HarFile {
    log: HarLog,
}
282
/// The HAR `log` object; only its `entries` array is deserialized.
#[derive(Debug, serde::Deserialize)]
struct HarLog {
    entries: Vec<HarEntry>,
}
287
/// One element of `log.entries`; only the `request` member is deserialized.
#[derive(Debug, serde::Deserialize)]
struct HarEntry {
    request: HarRequest,
}
292
/// The request half of a HAR entry, reduced to the fields the loader inspects.
#[derive(Debug, serde::Deserialize)]
struct HarRequest {
    /// Request URL; required, so an entry without it fails deserialization.
    url: String,
    /// HTTP method; falls back to an empty string when the member is absent.
    #[serde(default)]
    method: String,
}
299
300/// Read URLs from a file, stdin, or HAR input.
301/// Blank lines and lines starting with `#` are ignored.
302pub fn load_urls(cli: &Cli) -> Result<Vec<String>> {
303    let lines: Vec<String> = if let Some(ref path) = cli.urls {
304        let content = fs::read_to_string(path)
305            .with_context(|| format!("Cannot read URL file: {}", path.display()))?;
306        content.lines().map(str::to_owned).collect()
307    } else if let Some(ref path) = cli.har {
308        load_urls_from_har(path)?
309    } else {
310        // --stdin
311        let stdin = io::stdin();
312        stdin
313            .lock()
314            .lines()
315            .collect::<Result<_, _>>()
316            .context("Failed to read URLs from stdin")?
317    };
318
319    let urls = lines
320        .into_iter()
321        .map(|l| l.trim().to_owned())
322        .filter(|l| !l.is_empty() && !l.starts_with('#'))
323        .collect();
324
325    Ok(urls)
326}
327
328fn load_urls_from_har(path: &PathBuf) -> Result<Vec<String>> {
329    let content = fs::read_to_string(path)
330        .with_context(|| format!("Cannot read HAR file: {}", path.display()))?;
331    let har: HarFile = serde_json::from_str(&content)
332        .with_context(|| format!("Cannot parse HAR file: {}", path.display()))?;
333
334    Ok(har
335        .log
336        .entries
337        .into_iter()
338        .filter_map(|entry| {
339            let url = entry.request.url.trim().to_string();
340            if !(url.starts_with("http://") || url.starts_with("https://")) {
341                return None;
342            }
343            if !is_likely_api_url(&url, &entry.request.method) {
344                return None;
345            }
346            Some(url)
347        })
348        .collect())
349}
350
351fn is_likely_api_url(raw_url: &str, method: &str) -> bool {
352    let parsed = match Url::parse(raw_url) {
353        Ok(u) => u,
354        Err(_) => return false,
355    };
356
357    let host = parsed.host_str().unwrap_or("").to_ascii_lowercase();
358    let path = parsed.path().to_ascii_lowercase();
359    let query = parsed.query().unwrap_or("").to_ascii_lowercase();
360    let method = method.to_ascii_uppercase();
361
362    if is_likely_static_host(&host) || is_static_asset_path(&path) {
363        return false;
364    }
365
366    // Non-read methods in HAR are usually API/business operations.
367    if !matches!(method.as_str(), "" | "GET" | "HEAD" | "OPTIONS") {
368        return true;
369    }
370
371    if host.starts_with("api.") || host.contains(".api.") {
372        return true;
373    }
374
375    let needle_haystack = format!("{path}?{query}");
376    const KEYWORDS: &[&str] = &[
377        "/api", "graphql", "openapi", "swagger", "oauth", "oidc", "auth", "token", "session",
378        "login", "logout", "signin", "identity", "/v1", "/v2", "/v3", "/rpc",
379    ];
380
381    KEYWORDS.iter().any(|k| needle_haystack.contains(k))
382}
383
/// Report whether `host` looks like a CDN, asset, font or analytics host.
///
/// The comparison is case-sensitive, so callers are expected to pass an
/// already lower-cased host name.
fn is_likely_static_host(host: &str) -> bool {
    // Hosts ending in one of these belong to well-known static/CDN services.
    const STATIC_SUFFIXES: [&str; 2] = ["awsstatic.com", "cloudfront.net"];
    // Fragments that mark font or analytics hosts wherever they appear.
    const STATIC_FRAGMENTS: [&str; 2] = ["fonts.", "analytics"];
    // (leading form, embedded form) of DNS labels typical for asset hosts.
    const STATIC_LABELS: [(&str, &str); 3] = [
        ("cdn.", ".cdn."),
        ("static.", ".static."),
        ("assets.", ".assets."),
    ];

    let known_suffix = STATIC_SUFFIXES.iter().any(|&suffix| host.ends_with(suffix));
    let known_fragment = STATIC_FRAGMENTS.iter().any(|&fragment| host.contains(fragment));
    if known_suffix || known_fragment {
        return true;
    }

    STATIC_LABELS
        .iter()
        .any(|&(leading, embedded)| host.starts_with(leading) || host.contains(embedded))
}
400
/// Report whether `path` ends in a file extension typical of static assets
/// (scripts, styles, images, fonts, media, archives).
///
/// The comparison is case-sensitive, so callers are expected to pass an
/// already lower-cased path.
fn is_static_asset_path(path: &str) -> bool {
    // Everything after the final '.' is the candidate extension. None of the
    // listed extensions contains a dot, so this is the same as a suffix test.
    matches!(
        path.rsplit_once('.'),
        Some((
            _,
            "js" | "css"
                | "map"
                | "png"
                | "jpg"
                | "jpeg"
                | "gif"
                | "svg"
                | "ico"
                | "woff"
                | "woff2"
                | "ttf"
                | "eot"
                | "webp"
                | "avif"
                | "mp4"
                | "webm"
                | "mp3"
                | "wav"
                | "pdf"
                | "zip"
        ))
    )
}
408
409// ── Default user-agents ───────────────────────────────────────────────────────
410
/// Returns the User-Agent list provided by `WafEvasion::user_agent_pool`.
///
/// This is a thin forwarding wrapper; the contents of the pool are defined in
/// `crate::waf`, not here.
pub fn default_user_agents() -> Vec<String> {
    crate::waf::WafEvasion::user_agent_pool()
}