crawlex 1.0.3

Stealth crawler with Chrome-perfect TLS/H2 fingerprint, render pool, hooks, persistent queue
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
use clap::{Args, Parser, Subcommand};

// Root of the argument parser. The binary accepts exactly one resource
// subcommand, which is dispatched through `Command`.
#[derive(Debug, Parser)]
#[command(name = "crawlex", version, about = "Stealth crawler with Chrome-perfect fingerprint")]
pub struct Cli {
    // The `<resource> <verb>` pair selected on the command line.
    #[command(subcommand)]
    pub command: Command,
}

/// Top-level command tree — resource-first, verb last.
///
/// Grammar: `crawlex <resource> <verb> [<name>] [flags]`.
///
/// Reading-order: noun first, then the action you want to do on it.
/// An operator spells out which thing they're touching before the
/// action, which lets `--help` on the noun list all applicable
/// verbs in one screen.
///
/// Examples:
///   crawlex pages       run       --seed https://example.com/
///   crawlex crawl       resume    --queue-path queue.sqlite
///   crawlex fingerprint run       stone.com.br     --network-probe
///   crawlex fingerprint show      stone.com.br     --limit 30
///   crawlex fingerprint export    stone.com.br     --out stone.json --pretty
///   crawlex graph       export    --storage-path crawlex.db --out graph.json
///   crawlex queue       stats     --queue-path queue.sqlite
///   crawlex queue       purge     --queue-path queue.sqlite
///   crawlex queue       export    --queue-path queue.sqlite --out q.json
///   crawlex sessions    list      --storage-path crawlex.db
///   crawlex session     drop      --id abcd --storage-path crawlex.db
///   crawlex telemetry   show      --db crawlex.db
///   crawlex stealth     test
///   crawlex stealth     inspect   https://example.com/
///   crawlex stealth     catalog   list --filter chrome
///   crawlex stealth     catalog   show chrome-149-linux
// Every variant wraps a nested verb enum, so each resource is itself a
// subcommand group rather than a leaf command.
#[derive(Subcommand, Debug)]
pub enum Command {
    #[command(subcommand)]
    Pages(PagesVerb),
    #[command(subcommand)]
    Crawl(CrawlVerb),
    #[command(subcommand)]
    Fingerprint(FingerprintVerb),
    #[command(subcommand)]
    Graph(GraphVerb),
    #[command(subcommand)]
    Queue(QueueVerb),
    #[command(subcommand)]
    Sessions(SessionsVerb),
    #[command(subcommand)]
    Session(SessionVerb),
    #[command(subcommand)]
    Telemetry(TelemetryVerb),
    #[command(subcommand)]
    Stealth(StealthVerb),
}

#[derive(Subcommand, Debug)]
pub enum PagesVerb {
    /// Start a new page crawl from the given seeds.
    Run(CrawlArgs),
}

#[derive(Subcommand, Debug)]
pub enum CrawlVerb {
    /// Resume a paused / interrupted crawl from its persisted queue.
    Resume(ResumeArgs),
}

#[derive(Subcommand, Debug)]
pub enum FingerprintVerb {
    /// Run the passive+active recon stages against a target.
    Run(IntelArgs),
    /// Read-only summary of persisted intel for a target.
    Show(IntelShowArgs),
    /// Dump every intel table for a target as JSON.
    Export(IntelExportArgs),
}

#[derive(Subcommand, Debug)]
pub enum GraphVerb {
    /// Export the discovery edges (JSON / DOT, picked by extension).
    Export(ExportGraphArgs),
}

#[derive(Subcommand, Debug)]
pub enum QueueVerb {
    /// Size + by-state counters.
    Stats(QueueStatsArgs),
    /// Delete every row.
    Purge(QueuePurgeArgs),
    /// Dump rows as JSON.
    Export(QueueExportArgs),
}

#[derive(Subcommand, Debug)]
pub enum SessionsVerb {
    /// Enumerate archived sessions.
    List(SessionsListArgs),
}

#[derive(Subcommand, Debug)]
pub enum SessionVerb {
    /// Evict a session by id.
    Drop(SessionDropArgs),
}

#[derive(Subcommand, Debug)]
pub enum TelemetryVerb {
    /// Antibot challenge-rate aggregation tables (SQLite views).
    Show(TelemetryShowArgs),
}

// Verbs under `crawlex stealth`. `Test` takes no arguments, `Inspect` takes
// `InspectArgs`, and `Catalog` opens a further nested verb group.
#[derive(Debug, Subcommand)]
pub enum StealthVerb {
    /// Verify ALPN/cipher/JA4 against the built-in expectations.
    #[command(name = "test")]
    Test,
    /// Print the active IdentityBundle fingerprint summary.
    #[command(name = "inspect")]
    Inspect(InspectArgs),
    /// Browse the TLS fingerprint catalog (vendored + captured + mined).
    #[command(subcommand, name = "catalog")]
    Catalog(CatalogVerb),
}

#[derive(Subcommand, Debug)]
pub enum CatalogVerb {
    /// List every fingerprint registered in the catalog.
    /// Filter by browser via `--filter chrome` / `firefox` / `chromium` / `edge` / `safari`.
    List(CatalogListArgs),
    /// Show the full fingerprint for a single profile by curl-impersonate
    /// name (e.g. `chrome_116.0.5845.180_win10`) or by `<browser>-<major>-<os>`
    /// (e.g. `chrome-149-linux`).
    Show(CatalogShowArgs),
}

// Arguments for `stealth catalog list`.
#[derive(Debug, Args)]
pub struct CatalogListArgs {
    /// Restrict to one browser family (`chrome`, `chromium`, `firefox`,
    /// `edge`, `safari`). Omit to list all.
    #[arg(long = "filter")]
    pub filter: Option<String>,
    /// Output as JSON (compact one-line per profile) instead of the
    /// default human-readable table.
    // A `bool` field is a presence flag that is `false` unless passed, so no
    // explicit default is needed.
    #[arg(long = "json")]
    pub json: bool,
}

// Arguments for `stealth catalog show`: one positional profile plus an
// output-format switch.
#[derive(Debug, Args)]
pub struct CatalogShowArgs {
    /// Profile identifier — either the catalog name
    /// (`chrome_116.0.5845.180_win10`) or a `<browser>-<major>-<os>`
    /// spec that resolves via era fallback (`chrome-149-linux`).
    pub profile: String,
    /// Output as JSON instead of the default human-readable layout.
    // Presence flag; absent means `false`.
    #[arg(long = "json")]
    pub json: bool,
}

// Arguments for `telemetry show`. The per-flag help text mirrors the wording
// already used by `TelemetryCmd::Show` in this file, so `--help` is no
// longer blank for these flags.
#[derive(Args, Debug)]
pub struct TelemetryShowArgs {
    /// Path to the crawlex SQLite storage (same as `--storage-path`).
    #[arg(long)]
    pub db: String,
    /// Cap rows for the session view (operator-first: keep terminals
    /// readable). Defaults to 20.
    #[arg(long, default_value_t = 20)]
    pub top: usize,
}

// Arguments for `queue stats`: the only input is the required queue path.
#[derive(Debug, Args)]
pub struct QueueStatsArgs {
    // Required `--queue-path <VALUE>`; the long name is spelled out
    // explicitly and equals the derived one.
    #[arg(long = "queue-path")]
    pub queue_path: String,
}

// Arguments for `queue purge`: the only input is the required queue path.
#[derive(Debug, Args)]
pub struct QueuePurgeArgs {
    // Required `--queue-path <VALUE>`.
    #[arg(long = "queue-path")]
    pub queue_path: String,
}

// Arguments for `queue export`: where to read the queue from and where to
// write the dump. Both flags are required.
#[derive(Debug, Args)]
pub struct QueueExportArgs {
    // Required `--queue-path <VALUE>`.
    #[arg(long = "queue-path")]
    pub queue_path: String,
    // Required `--out <VALUE>`.
    #[arg(long = "out")]
    pub out: String,
}

// Arguments for `sessions list`. The `--state` help text is carried over
// from `SessionsCmd::List` in this file so the accepted values show up in
// `--help`.
#[derive(Args, Debug)]
pub struct SessionsListArgs {
    /// Path to the SQLite storage holding the `sessions_archive` table.
    #[arg(long)]
    pub storage_path: String,
    /// Optional state filter: clean|warm|contaminated|blocked.
    #[arg(long)]
    pub state: Option<String>,
}

// Arguments for `session drop`. Both flags are required.
#[derive(Debug, Args)]
pub struct SessionDropArgs {
    // Required `--storage-path <VALUE>`.
    #[arg(long = "storage-path")]
    pub storage_path: String,
    // Required `--id <VALUE>`: the session to evict.
    #[arg(long = "id")]
    pub id: String,
}

// Arguments for `fingerprint export`. `--db` was the only flag without
// help text; it now uses the same wording as the sibling `IntelShowArgs`.
#[derive(Args, Debug, Clone)]
pub struct IntelExportArgs {
    /// Registrable domain whose intel was previously collected.
    pub target: String,
    /// SQLite database path.
    #[arg(long, default_value = "./crawlex.db")]
    pub db: String,
    /// Write JSON to this file. Empty/omitted ⇒ stdout.
    #[arg(long)]
    pub out: Option<String>,
    /// Write a single-file HTML dashboard to this path instead of JSON.
    /// Takes precedence over `--out` when both are supplied.
    #[arg(long)]
    pub html: Option<String>,
    /// Pretty-print (2-space indent). Off ⇒ compact one-line JSON.
    #[arg(long, default_value_t = false)]
    pub pretty: bool,
}

// Arguments for `fingerprint show`: a positional target, the database to
// read from, and a per-section row cap.
#[derive(Clone, Debug, Args)]
pub struct IntelShowArgs {
    /// Registrable domain whose intel was previously collected.
    pub target: String,
    /// SQLite database path.
    #[arg(long = "db", default_value = "./crawlex.db")]
    pub db: String,
    /// Cap on how many rows each list-section prints. Zero disables
    /// truncation for that section.
    #[arg(long = "limit", default_value_t = 30)]
    pub limit: usize,
}

// Arguments for `fingerprint run`. Every stage switch is a presence flag
// that is `false` unless passed: the four `--no-*` flags skip a stage, and
// `--network-probe` opts in to one.
#[derive(Clone, Debug, Args)]
pub struct IntelArgs {
    /// Registrable domain to investigate (e.g. `stone.com.br`).
    pub target: String,
    /// SQLite database path — re-uses the same schema the crawl
    /// subcommand writes into, so an intel run + crawl run populate
    /// one coherent store.
    #[arg(long = "db", default_value = "./crawlex.db")]
    pub db: String,
    /// Skip the subdomain enumeration step.
    #[arg(long = "no-subdomains")]
    pub no_subdomains: bool,
    /// Skip DNS record collection.
    #[arg(long = "no-dns")]
    pub no_dns: bool,
    /// Skip WHOIS/RDAP.
    #[arg(long = "no-whois")]
    pub no_whois: bool,
    /// Skip the TLS handshake / certificate grab.
    #[arg(long = "no-cert")]
    pub no_cert: bool,
    /// Opt-in to active TCP-connect port probes (top ~20 ports) +
    /// reverse DNS + cloud/CDN IP-range tagging per unique IP. Even
    /// though it runs without CAP_NET_RAW, the 3-way handshake still
    /// shows up in the target's access logs — default OFF.
    #[arg(long = "network-probe")]
    pub network_probe: bool,
}

// Arguments for `pages run`. Most knobs are `Option<_>` with no clap
// default, so "not passed" stays distinguishable from an explicit value;
// where the effective default for those is applied is not visible in this
// file. Repeatable flags use `ArgAction::Append` and collect into a `Vec`.
#[derive(Args, Debug, Clone)]
pub struct CrawlArgs {
    #[arg(long, action = clap::ArgAction::Append)]
    pub seed: Vec<String>,
    #[arg(long)]
    pub seeds_file: Option<String>,

    /// Default fetch method: "spoof" (HTTP), "render" (Chrome), "auto"
    #[arg(long, default_value = "spoof")]
    pub method: String,

    #[arg(long)]
    pub max_concurrent_render: Option<usize>,
    #[arg(long)]
    pub max_concurrent_http: Option<usize>,
    #[arg(long)]
    pub max_depth: Option<u32>,
    #[arg(long, default_value_t = false)]
    pub same_host_only: bool,
    /// Include subdomains (default: true). Pass
    /// `--include-subdomains=false` to disable; the bare flag keeps
    /// meaning `true`.
    // A `bool` with `default_value_t = true` and the implicit `SetTrue`
    // action can never become `false` from the command line. Accepting an
    // optional `=true|false` value makes the switch usable while the bare
    // flag and the omitted flag both still yield `true`. `require_equals`
    // keeps the flag from consuming a following token.
    #[arg(
        long,
        default_value_t = true,
        action = clap::ArgAction::Set,
        num_args = 0..=1,
        require_equals = true,
        default_missing_value = "true"
    )]
    pub include_subdomains: bool,
    #[arg(long)]
    pub respect_robots_txt: Option<bool>,

    #[arg(long)]
    pub wait_strategy: Option<String>,
    #[arg(long)]
    pub wait_idle_ms: Option<u64>,

    /// Per-CDP-command timeout in milliseconds (default 30000). Bumps the
    /// deadline applied to every CDP request, including `Page.navigate` —
    /// heavy real-world targets (Cloudflare-fronted SPAs with WordPress +
    /// ad scripts) regularly exceed 30s before lifecycle settles. Mirrors
    /// the `CRAWLEX_REQUEST_TIMEOUT_MS` env var; flag wins when both set.
    #[arg(long = "render-request-timeout-ms")]
    pub render_request_timeout_ms: Option<u64>,

    /// Lifecycle event the navigation watcher waits for. `load` (default)
    /// blocks until window onload fires; `domcontentloaded` returns as
    /// soon as the parser is done — much faster on heavy pages whose
    /// `load` never settles within the request timeout. Mirrors the
    /// `CRAWLEX_NAVIGATION_LIFECYCLE` env var; flag wins when both set.
    #[arg(long = "navigation-lifecycle")]
    pub navigation_lifecycle: Option<String>,

    #[arg(long)]
    pub profile: Option<String>,
    #[arg(long)]
    pub chrome_path: Option<String>,
    #[arg(long = "chrome-flag", action = clap::ArgAction::Append)]
    pub chrome_flag: Vec<String>,
    #[arg(long)]
    pub block_resource: Option<String>,

    #[arg(long)]
    pub queue: Option<String>,
    #[arg(long)]
    pub queue_path: Option<String>,
    #[arg(long)]
    pub queue_redis_url: Option<String>,

    #[arg(long)]
    pub storage: Option<String>,
    #[arg(long)]
    pub storage_path: Option<String>,

    #[arg(long)]
    pub output_html_dir: Option<String>,
    #[arg(long)]
    pub output_graph: Option<String>,
    #[arg(long)]
    pub output_metadata: Option<String>,
    #[arg(long, default_value_t = false)]
    pub screenshot: bool,
    #[arg(long)]
    pub screenshot_dir: Option<String>,
    /// Screenshot capture mode: `viewport`, `fullpage` (default), or
    /// `element:<css>`. The capture runs *after* wait-strategy + actions +
    /// Lua hooks have mutated the DOM, so SPA post-click views are the
    /// surface being grabbed. Element mode falls back to None (no file) if
    /// the selector can't be resolved.
    #[arg(long)]
    pub screenshot_mode: Option<String>,

    /// DNS-over-HTTPS provider. One of `off` (default; use system
    /// resolver), `cloudflare`, `google`, `quad9`, or a custom
    /// `https://…/dns-query` URL. Default OFF so existing behaviour is
    /// preserved; operators opt in when they want the crawl's DNS
    /// queries off the ISP resolver. See `src/impersonate/doh.rs` for
    /// the current wiring status (config-only today).
    #[arg(long, default_value = "off")]
    pub doh: String,

    #[arg(long = "proxy", action = clap::ArgAction::Append)]
    pub proxy: Vec<String>,
    #[arg(long)]
    pub proxy_file: Option<String>,
    #[arg(long)]
    pub proxy_strategy: Option<String>,
    #[arg(long, default_value_t = false)]
    pub proxy_sticky_per_host: bool,
    #[arg(long)]
    pub proxy_health_check_interval_secs: Option<u64>,
    /// Launch a local explicit HTTP proxy backed by Raffel and use it as the
    /// crawler's sole proxy.
    #[arg(long, default_value_t = false)]
    pub raffel_proxy: bool,
    /// Path to the local Raffel checkout/build. Defaults to the workspace used
    /// during development on this machine.
    // NOTE(review): the default is an absolute path on one developer's
    // machine; other hosts must pass `--raffel-proxy-path` explicitly.
    // Left unchanged here because callers may rely on the current value.
    #[arg(long, default_value = "/home/cyber/Work/tetis/libs/raffel")]
    pub raffel_proxy_path: String,
    /// Host/interface for the local Raffel proxy listener.
    #[arg(long, default_value = "127.0.0.1")]
    pub raffel_proxy_host: String,
    /// Listen port for the local Raffel proxy.
    #[arg(long, default_value_t = 8899)]
    pub raffel_proxy_port: u16,

    #[arg(long = "hook-script", action = clap::ArgAction::Append)]
    pub hook_script: Vec<String>,

    /// Drive lifecycle hooks via the JS bridge protocol. Format:
    /// `stdio` (multiplex on stdin+stdout — bridge envelopes
    /// disambiguated from NDJSON events by their `kind` field) or
    /// `fd:N` for an explicit file-descriptor pair (`N` reads, `N+1`
    /// writes). Spawn convention is owned by the SDK — see
    /// `sdk/crawlex-sdk.js::crawl({hooks})`. Off by default.
    #[arg(long = "hook-bridge")]
    pub hook_bridge: Option<String>,

    #[arg(long)]
    pub on_discovery_filter_regex: Option<String>,

    /// Pick a persona from `identity::profiles::catalog()` (0-indexed).
    /// When set, overrides the historical Linux/Intel default and wires
    /// `IdentityBundle::from_persona(catalog()[N], …)` into the render
    /// pool. Prefer `--persona <name>` for legibility — this remains for
    /// existing scripts that pass numeric indices.
    #[arg(long)]
    pub identity_preset: Option<u8>,

    /// Pick a persona by codename (`tux`, `office`, `gamer`, `atlas`,
    /// `pixel`). Resolves to the same row as `--identity-preset N` but
    /// tracks the row even if catalog ordering shifts.
    /// `tux` = Linux Intel desktop, `office` = Win10 Intel laptop,
    /// `gamer` = Win10 NVIDIA desktop, `atlas` = macOS Apple M1,
    /// `pixel` = Android mobile (Adreno). Run `crawlex stealth catalog list`
    /// to see all rows. Mutually exclusive with `--identity-preset`.
    #[arg(long, conflicts_with = "identity_preset")]
    pub persona: Option<String>,

    /// Only follow URLs classified as page/document/api; other assets are
    /// stored but not enqueued. Set --follow-all-assets to disable.
    #[arg(long, default_value_t = false)]
    pub follow_all_assets: bool,

    /// Enable certificate-transparency subdomain seeding (crt.sh).
    #[arg(long, default_value_t = false)]
    pub crtsh: bool,

    /// Skip robots.txt Disallow/Allow path expansion (enabled by default).
    #[arg(long, default_value_t = false)]
    pub no_robots_paths: bool,

    /// Skip .well-known/* probes (enabled by default).
    #[arg(long, default_value_t = false)]
    pub no_well_known: bool,

    /// Skip PWA manifest / service worker probes (enabled by default).
    #[arg(long, default_value_t = false)]
    pub no_pwa: bool,

    /// Skip favicon mmh3 fingerprinting (enabled by default).
    #[arg(long, default_value_t = false)]
    pub no_favicon: bool,

    /// Enable Wayback Machine (CDX) URL seeding.
    #[arg(long, default_value_t = false)]
    pub wayback: bool,

    /// Enable DNS record enumeration and seed related hosts.
    #[arg(long, default_value_t = false)]
    pub dns: bool,

    /// Opt-in: collect both network timings and Web Vitals (overrides the
    /// granular flags below). OFF by default — speed first.
    #[arg(long, default_value_t = false)]
    pub metrics: bool,

    /// Opt-in: measure DNS/TCP/TLS/TTFB/download on HTTP path and store.
    #[arg(long, default_value_t = false)]
    pub metrics_net: bool,

    /// Opt-in: execute Web Vitals JS after render and store.
    #[arg(long, default_value_t = false)]
    pub metrics_vitals: bool,

    /// Opt-in: extract peer TLS cert (CN, SANs, fingerprint) and seed SANs.
    #[arg(long, default_value_t = false)]
    pub peer_cert: bool,

    /// Opt-in: RDAP lookup per registrable domain (registrar, expires, NS).
    #[arg(long, default_value_t = false)]
    pub rdap: bool,

    /// Disable cookie persistence across requests (default: enabled).
    #[arg(long, default_value_t = false)]
    pub no_cookies: bool,

    /// Browser session reuse boundary for render jobs:
    /// registrable_domain|host|origin|url.
    #[arg(long)]
    pub render_session_scope: Option<String>,

    /// Disable 3xx redirect following (default: enabled).
    #[arg(long, default_value_t = false)]
    pub no_follow_redirects: bool,

    /// Disable auto-download of a pinned Chromium-for-Testing when no system
    /// Chrome is found (default: enabled). Only meaningful with the
    /// `chromium-fetcher` feature compiled in.
    #[arg(long, default_value_t = false)]
    pub no_fetch_chromium: bool,

    /// Max redirects to follow (default 10).
    #[arg(long)]
    pub max_redirects: Option<u8>,

    /// Path to a JSON file with an Actions script executed on every rendered
    /// page (see src/render/actions.rs for schema). Enables form fill, click,
    /// scroll, type with human-like timing.
    #[arg(long)]
    pub actions_file: Option<String>,

    /// Path to a ScriptSpec v1 JSON file (see `crate::script::spec`).
    /// When set, each rendered page runs the declarative script instead
    /// of the legacy `--actions-file` recipe — mutually exclusive with
    /// `--actions-file`. ScriptSpec is the recommended replacement for
    /// multi-step interactive crawls (click, type, wait_for, screenshot,
    /// snapshot, extract, assert, export).
    #[arg(long, value_name = "PATH", conflicts_with = "actions_file")]
    pub script_spec: Option<String>,

    #[arg(long)]
    pub rate_per_host_rps: Option<f64>,
    #[arg(long)]
    pub retry_max: Option<u32>,
    #[arg(long)]
    pub retry_backoff_ms: Option<u64>,

    #[arg(long)]
    pub user_agent_override: Option<String>,
    #[arg(long)]
    pub timezone: Option<String>,
    #[arg(long)]
    pub locale: Option<String>,

    #[arg(long)]
    pub metrics_prometheus_port: Option<u16>,
    #[arg(long, default_value = "info")]
    pub log_level: String,
    #[arg(long, default_value = "text")]
    pub log_format: String,

    // ----- v0.2 contract flags ---------------------------------------
    /// Emit lifecycle events on stdout. `ndjson` writes one JSON object per
    /// line, `none` keeps stdout silent (default).
    #[arg(long, default_value = "none")]
    pub emit: String,

    /// Policy preset that shapes every decide-here-or-there call.
    /// `fast` minimises render escalation; `balanced` is the default;
    /// `deep` prefers render when uncertain; `forensics` collects full
    /// artifacts on every job.
    #[arg(long = "policy", default_value = "balanced")]
    pub policy: String,

    /// Load a `Config` JSON from `<path>` or stdin (`-`). When set,
    /// individual CLI flags still override fields the config sets.
    /// Schema mirrors `crawlex::config::Config`.
    #[arg(long)]
    pub config: Option<String>,

    /// Mirror every `decision.made` event to stderr in human-readable
    /// form. The NDJSON stream on stdout is unaffected.
    #[arg(long, default_value_t = false)]
    pub explain: bool,

    /// Disable the SPA JS observer (History API + fetch + XHR wrappers).
    /// Default: observer is active and runtime_routes/network_endpoints
    /// artifacts are emitted + pushed to the frontier.
    #[arg(long, default_value_t = false)]
    pub no_spa_observer: bool,

    /// Enable the IndexedDB inventory collector (opt-in — heavy on
    /// data-rich apps). Emits `snapshot.indexeddb` artifacts.
    #[arg(long, default_value_t = false)]
    pub collect_indexeddb: bool,

    /// Enable the Cache Storage inventory collector (opt-in — heavy
    /// on SW-backed apps). Emits `snapshot.cache_storage` artifacts.
    #[arg(long, default_value_t = false)]
    pub collect_cache_storage: bool,

    /// Turn on ALL SPA/PWA state collectors at once: runtime routes,
    /// network endpoints, IndexedDB, Cache Storage, manifest JSON and
    /// service workers. Convenient umbrella for `--policy forensics`
    /// style crawls. Individual `--collect-*` / `--no-spa-observer`
    /// flags still apply when set explicitly.
    #[arg(long, default_value_t = false)]
    pub collect_spa_state: bool,

    /// Per-verb policy applied to every action step (click/type/eval/...).
    /// Accepts `permissive` (default, all allowed), `strict` (deny all),
    /// `default` (conservative: eval=deny, download=confirm, rest=allow),
    /// or a path to a JSON policy file. Use when running a ScriptSpec
    /// from an untrusted source (LLM-generated, shared fixture).
    #[arg(long)]
    pub action_policy: Option<String>,

    /// Challenge handling mode: `avoidance` keeps captcha handling strictly
    /// prevention-only; `solver-ready` records extra widget metadata so a
    /// solver can be integrated later without changing the capture contract.
    #[arg(long)]
    pub challenge_mode: Option<String>,

    /// Vendor-specific bypass tier. `none` (default) disables every
    /// trick; `replay` enables conservative cookie pinning for cookies
    /// the crawler's own sessions earned (Akamai `_abck`, DataDome,
    /// PerimeterX `_px*`); `aggressive` additionally allows best-effort
    /// Turnstile invisible-widget dummy attempts. Opt-in only.
    #[arg(long, value_name = "LEVEL")]
    pub antibot_bypass: Option<String>,

    // ----- Phase 5: throughput / budgets ---------------------------
    /// Max Chrome instances kept alive simultaneously. Each proxy key
    /// gets its own Chrome; LRU eviction fires when the cap is hit.
    #[arg(long)]
    pub max_browsers: Option<usize>,

    /// Max idle + in-flight pages per BrowserContext. Higher = more
    /// parallel tabs per session; lower = better memory reuse.
    #[arg(long)]
    pub max_pages_per_context: Option<usize>,

    /// Max concurrent render jobs targeting a single host.
    #[arg(long)]
    pub max_per_host_inflight: Option<usize>,

    /// Max concurrent render jobs targeting a single origin.
    #[arg(long)]
    pub max_per_origin_inflight: Option<usize>,

    /// Max concurrent render jobs routed through a single proxy.
    #[arg(long)]
    pub max_per_proxy_inflight: Option<usize>,

    /// Max concurrent render jobs per stateful session. Default 1 so
    /// per-session cookies / SPA state don't interleave across tabs.
    #[arg(long)]
    pub max_per_session_inflight: Option<usize>,

    // ----- Phase 6: session isolation -------------------------------
    /// Render session time-to-live (seconds). Sessions not touched for
    /// this long are torn down (BrowserContext disposed, cookies
    /// dropped). Default 3600 (1h).
    #[arg(long)]
    pub session_ttl_secs: Option<u64>,

    /// When set, policy may automatically demote the
    /// `--render-session-scope` on login pages and hard antibot
    /// walls. Default off (scope stays what the operator declared).
    #[arg(long, default_value_t = false)]
    pub session_scope_auto: bool,

    /// Keep contaminated/blocked sessions around even when policy would
    /// otherwise drop them on first hit. Default: drop on block.
    #[arg(long, default_value_t = false)]
    pub keep_blocked_sessions: bool,

    /// Human motion engine preset:
    /// `fast` — linear path + minimal delay (throughput first);
    /// `balanced` (default) — WindMouse + Fitts + OU jitter;
    /// `human` — realistic cadence, overshoots, ~2–4s/click;
    /// `paranoid` — aggressive realism, 5–10s/click.
    #[arg(long)]
    pub motion_profile: Option<String>,

    /// Enable the post-settle "reading" dwell: after the wait strategy
    /// fires, sleep proportional to the rendered body's word count
    /// before we serialise the DOM. Trades throughput for stealth —
    /// reCAPTCHA v3 / DataDome score instant extraction as bot-like.
    #[arg(long, default_value_t = false)]
    pub reading_dwell: bool,

    /// Words-per-minute the "reader" simulates. 250 ≈ typical adult
    /// prose speed. Only consulted when `--reading-dwell` is set.
    #[arg(long, default_value_t = 250)]
    pub reading_dwell_wpm: u32,

    /// Gaussian jitter σ (ms) applied to the computed dwell, so
    /// successive requests aren't exactly identical. Only consulted
    /// when `--reading-dwell` is set.
    #[arg(long, default_value_t = 40)]
    pub reading_dwell_jitter_ms: u64,

    // ----- Wave 2 infra-scaffold wire-ups --------------------------
    /// Residential-proxy provider adapter (stub). One of
    /// `none` (default), `brightdata`, `oxylabs`, `iproyal`. All
    /// adapters are scaffold-only in this build — they return
    /// `AdapterNotConfigured` until real API credentials are wired.
    /// Provided here so operator config files + shell scripts can
    /// settle on the final flag name ahead of adapter rollout.
    #[arg(long, default_value = "none")]
    pub residential_provider: String,

    /// Captcha-solver adapter (stub). One of `none` (default),
    /// `2captcha`, `anticaptcha`, `vlm`. Crawlex policy stays
    /// prevention-first: every adapter refuses to answer unless the
    /// operator wires an API key via env vars documented in
    /// `docs/infra-tier-operator.md`.
    #[arg(long, default_value = "none")]
    pub captcha_solver: String,

    /// Mobile device profile for the Chromium backend. Accepts the
    /// aliases documented in `src/render/android_profile.rs` (e.g.
    /// `pixel-7-pro`, `pixel8`, `s23`, `android`). Default: desktop
    /// profile (no mobile emulation).
    #[arg(long)]
    pub mobile_profile: Option<String>,
}

// Arguments for `crawl resume`: the queue path is mandatory.
#[derive(Debug, Args)]
pub struct ResumeArgs {
    // Required `--queue-path <VALUE>`.
    #[arg(long = "queue-path")]
    pub queue_path: String,
}

// Arguments for `stealth inspect`: a required positional URL and an
// optional `--profile` value.
#[derive(Debug, Args)]
pub struct InspectArgs {
    // Positional (no `long`/`short`), therefore required.
    pub url: String,
    // Optional `--profile <VALUE>`.
    #[arg(long = "profile")]
    pub profile: Option<String>,
}

#[derive(Subcommand, Debug)]
pub enum QueueCmd {
    Stats {
        #[arg(long)]
        queue_path: String,
    },
    Purge {
        #[arg(long)]
        queue_path: String,
    },
    Export {
        #[arg(long)]
        queue_path: String,
        #[arg(long)]
        out: String,
    },
}

#[derive(Subcommand, Debug)]
pub enum SessionsCmd {
    /// List sessions persisted in the `sessions_archive` table.
    List {
        #[arg(long)]
        storage_path: String,
        /// Optional state filter: clean|warm|contaminated|blocked.
        #[arg(long)]
        state: Option<String>,
    },
    /// Archive (evict) a session by id. Requires the SQLite storage
    /// backend — the registry itself is in-process and can only be
    /// mutated by the running crawler.
    Drop {
        #[arg(long)]
        storage_path: String,
        #[arg(long)]
        id: String,
    },
}

#[derive(Subcommand, Debug)]
pub enum TelemetryCmd {
    /// Print aggregated challenge-rate dashboards (by vendor / proxy /
    /// session). Reads the `v_challenge_rate_*` views created by the
    /// storage layer on startup. Verb under the `challenge` resource
    /// so the full invocation stays `telemetry challenge show`.
    Show {
        /// Path to the crawlex SQLite storage (same as `--storage-path`).
        #[arg(long)]
        db: String,
        /// Cap rows for the session view (operator-first: keep terminals
        /// readable). Defaults to 20.
        #[arg(long, default_value_t = 20)]
        top: usize,
    },
}

// Arguments for `graph export`: the storage to read and the file to write.
// Both flags are required.
#[derive(Debug, Args)]
pub struct ExportGraphArgs {
    // Required `--storage-path <VALUE>`.
    #[arg(long = "storage-path")]
    pub storage_path: String,
    // Required `--out <VALUE>`.
    #[arg(long = "out")]
    pub out: String,
}