crawlex 1.0.4

Stealth crawler with Chrome-perfect TLS/H2 fingerprint, render pool, hooks, persistent queue
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
//! Human-like interaction primitives driven through the Chrome DevTools
//! Protocol Input domain.
//!
//! Bot detectors (reCAPTCHA v3, Cloudflare Turnstile, PerimeterX, DataDome)
//! score on **behavioural** signals, not just fingerprints. This module
//! produces the signal shapes those detectors look for:
//!
//! * **Mouse trajectories** are generated by the WindMouse engine
//!   ([`crate::render::motion`]) with Fitts-derived total movement time,
//!   Ornstein-Uhlenbeck jitter, and probabilistic overshoot.
//! * **Event sequence integrity**: every click emits
//!   `mousemove…mouseover+mouseenter → mousedown → mouseup → click`. A
//!   click without a preceding move is a dead giveaway for modern ML
//!   classifiers — this primitive never dispatches one.
//! * **Keystrokes** are scheduled by [`crate::render::keyboard`] using
//!   log-normal hold times, log-logistic inter-key flights, Pareto
//!   thinking pauses, and probabilistic typos with backspace correction.
//! * **Scroll** uses wheel events (not `Element.scrollIntoView`) so
//!   Chrome dispatches real `wheel` handlers.
//!
//! The active [`MotionProfile`](crate::render::motion::MotionProfile) is
//! read from the process-wide ambient slot (`MotionProfile::active()`),
//! which the CLI / pool sets at startup from `Config::motion_profile`.

use crate::render::chrome::page::Page;
use crate::render::chrome_protocol::cdp::browser_protocol::input::{
    DispatchKeyEventParams, DispatchKeyEventType, DispatchMouseEventParams, DispatchMouseEventType,
    InsertTextParams, MouseButton,
};
use crate::render::chrome_protocol::cdp::js_protocol::runtime::EvaluateParams;
use crate::render::keyboard::{KeyEvent, TypingEngine};
use crate::render::motion::lifecycle::{HIDE_SNIPPET, SHOW_SNIPPET};
use crate::render::motion::scroll::{schedule_for_active_profile as scroll_schedule, ScrollParams};
use crate::render::motion::{
    fatigue, idle::IdleState, scroll as scroll_mod, MotionEngine, MotionProfile, Point, TimedPoint,
};
use rand::rngs::SmallRng;
use rand::RngExt;
use serde::Deserialize;
use std::sync::Arc;
use std::time::Duration;

use crate::{Error, Result};

/// Rectangle in viewport coords returned by our DOM probe.
///
/// `click_selector` picks its click point as `x + w * f` / `y + h * f` with
/// `f` drawn from `[0.25, 0.75)`, so `x`/`y` act as the box origin and
/// `w`/`h` as its extent along each axis.
// NOTE(review): derives `Deserialize`, presumably so the probe's result can
// be decoded straight into this struct — confirm in `selector::resolve_rect`.
#[derive(Debug, Clone, Copy, Deserialize)]
pub struct Rect {
    /// Horizontal origin of the box, in viewport coordinates.
    pub x: f64,
    /// Vertical origin of the box, in viewport coordinates.
    pub y: f64,
    /// Extent of the box along the x axis.
    pub w: f64,
    /// Extent of the box along the y axis.
    pub h: f64,
}

/// Look up the bounding rect for `selector`, which may use the full
/// selector DSL (css/text/role/label/testid/xpath chains).
///
/// Yields `Ok(None)` when the element is not found or not visible; the
/// actual resolution is delegated to `crate::render::selector::resolve_rect`.
async fn element_rect(page: &Page, selector: &str) -> Result<Option<Rect>> {
    let probe = crate::render::selector::resolve_rect(page, selector);
    probe.await
}

/// Last known location of the virtual cursor inside the page.
///
/// The script runner / action executor passes this from one click or
/// scroll to the next, so each movement starts where the previous one
/// ended instead of every action starting at (0,0). The `Default` value
/// is the origin `(0.0, 0.0)`.
#[derive(Debug, Default, Clone, Copy)]
pub struct MousePos {
    /// Horizontal cursor coordinate.
    pub x: f64,
    /// Vertical cursor coordinate.
    pub y: f64,
}

/// Dispatch a single `mouseMoved` CDP event at `(x, y)`.
async fn dispatch_move(page: &Page, x: f64, y: f64) -> Result<()> {
    let params = DispatchMouseEventParams::builder()
        .r#type(DispatchMouseEventType::MouseMoved)
        .x(x)
        .y(y)
        .build()
        .map_err(|e| Error::Render(format!("mouse params: {e}")))?;
    page.execute(params)
        .await
        .map_err(|e| Error::Render(format!("mouse move: {e}")))?;
    Ok(())
}

/// Replay an already-computed trajectory: one CDP `mouseMoved` event per
/// point, each followed by that point's own `delay_ms` pause (skipped when
/// the delay is zero).
///
/// Stops at the first dispatch error and returns it.
async fn walk_trajectory(page: &Page, pts: &[TimedPoint]) -> Result<()> {
    for step in pts.iter() {
        dispatch_move(page, step.x, step.y).await?;

        let wait_ms = step.delay_ms;
        if wait_ms == 0 {
            continue;
        }
        tokio::time::sleep(Duration::from_millis(wait_ms)).await;
    }
    Ok(())
}

/// Glide the virtual cursor from `from` to `(x, y)` along a trajectory
/// produced by the motion engine for the active `MotionProfile`.
///
/// Returns the destination as the new cursor position so the caller can
/// chain the next movement from it.
pub async fn mouse_move_to(page: &Page, from: MousePos, x: f64, y: f64) -> Result<MousePos> {
    // The real target width is unknown here, so assume ~40px (a typical
    // button). Fitts movement time still scales with distance; the width
    // only nudges it a little.
    const ASSUMED_TARGET_WIDTH: f64 = 40.0;

    let start = Point {
        x: from.x,
        y: from.y,
    };
    let end = Point { x, y };

    let mut engine = MotionEngine::new(MotionProfile::active());
    let path = engine.trajectory(start, end, ASSUMED_TARGET_WIDTH);
    walk_trajectory(page, &path).await?;

    Ok(MousePos { x, y })
}

/// Click a CSS selector with full event-sequence integrity:
/// WindMouse trajectory → mouseover/mouseenter → post-move pause →
/// mouseDown → hold → mouseUp. The cursor state advances to the click
/// point; callers chain subsequent moves from there.
pub async fn click_selector(page: &Page, css: &str, from: MousePos) -> Result<MousePos> {
    let rect = element_rect(page, css)
        .await?
        .ok_or_else(|| Error::Render(format!("element not found: {css}")))?;
    let mut rng = rand::make_rng::<SmallRng>();
    // Jitter inside the bbox, avoiding the exact center — ML classifiers
    // flag clicks that always land on (x+w/2, y+h/2).
    let tx = rect.x + rect.w * rng.random_range(0.25..0.75);
    let ty = rect.y + rect.h * rng.random_range(0.25..0.75);
    click_point(page, from, tx, ty, rect.w.min(rect.h).max(10.0)).await
}

/// Shared click primitive: WindMouse move, mouseover/enter burst,
/// post-move pause, mouseDown, hold, mouseUp. Exposed at crate scope so
/// `ref_resolver` can share the exact sequence without duplicating CDP
/// wire code.
pub async fn click_point(
    page: &Page,
    from: MousePos,
    x: f64,
    y: f64,
    target_width: f64,
) -> Result<MousePos> {
    let profile = MotionProfile::active();
    let params = profile.params();
    let mut engine = MotionEngine::new(profile);

    let pts = engine.trajectory(
        Point {
            x: from.x,
            y: from.y,
        },
        Point { x, y },
        target_width,
    );
    walk_trajectory(page, &pts).await?;

    // Event-sequence integrity: detectors flag mousedown-without-mouseover.
    // Real browsers fire mouseover + mouseenter the instant the cursor
    // enters the target's bounding box — replay that here by re-emitting a
    // `mouseMoved` at the final point so Chrome's hit-testing machinery
    // dispatches the over/enter events itself.
    if params.emit_mouseover {
        dispatch_move(page, x, y).await?;
    }

    let mut rng = rand::make_rng::<SmallRng>();
    let pause = u64_range(
        &mut rng,
        params.post_move_pause_ms_min,
        params.post_move_pause_ms_max,
    );
    if pause > 0 {
        tokio::time::sleep(Duration::from_millis(pause)).await;
    }

    let press = DispatchMouseEventParams::builder()
        .r#type(DispatchMouseEventType::MousePressed)
        .x(x)
        .y(y)
        .button(MouseButton::Left)
        .click_count(1)
        .build()
        .map_err(|e| Error::Render(format!("click params: {e}")))?;
    page.execute(press)
        .await
        .map_err(|e| Error::Render(format!("click: {e}")))?;

    let hold = u64_range(
        &mut rng,
        params.mouse_down_pause_ms_min,
        params.mouse_down_pause_ms_max,
    );
    if hold > 0 {
        tokio::time::sleep(Duration::from_millis(hold)).await;
    }

    let release = DispatchMouseEventParams::builder()
        .r#type(DispatchMouseEventType::MouseReleased)
        .x(x)
        .y(y)
        .button(MouseButton::Left)
        .click_count(1)
        .build()
        .map_err(|e| Error::Render(format!("click params: {e}")))?;
    page.execute(release)
        .await
        .map_err(|e| Error::Render(format!("click: {e}")))?;

    Ok(MousePos { x, y })
}

/// Draw a value from the half-open interval `[lo, hi)`.
///
/// Empty or inverted bounds (`hi <= lo`) collapse to `lo` without touching
/// the generator, so callers may pass `min == max` safely.
fn u64_range(rng: &mut SmallRng, lo: u64, hi: u64) -> u64 {
    if lo < hi {
        rng.random_range(lo..hi)
    } else {
        lo
    }
}

pub async fn type_text(page: &Page, selector: &str, text: &str) -> Result<()> {
    // Use the selector engine so callers get the full DSL here too.
    if !crate::render::selector::focus(page, selector).await? {
        return Err(Error::Render(format!("focus failed: {selector}")));
    }
    dispatch_typing(page, text).await
}

/// Dispatch `text` as a scheduled keystroke timeline on the currently
/// focused element. Public so `ref_resolver` can reuse the exact same
/// distribution family when typing into an AX-snapshot-resolved node.
pub async fn dispatch_typing(page: &Page, text: &str) -> Result<()> {
    let profile = MotionProfile::active();
    let mut engine = TypingEngine::new(profile);
    let events = engine.schedule(text);

    for ev in events {
        match ev {
            KeyEvent::Pause { ms } => {
                if ms > 0 {
                    tokio::time::sleep(Duration::from_millis(ms)).await;
                }
            }
            KeyEvent::Char { ch, hold_ms } => {
                dispatch_char(page, ch, hold_ms).await?;
            }
            KeyEvent::Typo { wrong, hold_ms } => {
                dispatch_char(page, wrong, hold_ms).await?;
            }
            KeyEvent::Backspace { hold_ms } => {
                let down = DispatchKeyEventParams::builder()
                    .r#type(DispatchKeyEventType::KeyDown)
                    .key("Backspace".to_string())
                    .code("Backspace".to_string())
                    .build()
                    .map_err(|e| Error::Render(format!("backspace params: {e}")))?;
                page.execute(down)
                    .await
                    .map_err(|e| Error::Render(format!("backspace: {e}")))?;
                if hold_ms > 0 {
                    tokio::time::sleep(Duration::from_millis(hold_ms)).await;
                }
                let up = DispatchKeyEventParams::builder()
                    .r#type(DispatchKeyEventType::KeyUp)
                    .key("Backspace".to_string())
                    .code("Backspace".to_string())
                    .build()
                    .map_err(|e| Error::Render(format!("backspace params: {e}")))?;
                page.execute(up)
                    .await
                    .map_err(|e| Error::Render(format!("backspace: {e}")))?;
            }
        }
    }
    Ok(())
}

/// Type a single character, keeping it "held" for `hold_ms` milliseconds.
///
/// Printable ASCII (space through `~`) is sent as a `KeyDown` / `KeyUp`
/// pair carrying the character as `text`, with the hold between the two.
/// Everything else (non-ASCII and control characters) is injected with
/// `Input.insertText`, followed by the same hold.
async fn dispatch_char(page: &Page, ch: char, hold_ms: u64) -> Result<()> {
    // Same set as `ch.is_ascii() && !ch.is_control()`: U+0020 ..= U+007E.
    let printable_ascii = matches!(ch, ' '..='~');

    if !printable_ascii {
        let insert = InsertTextParams::builder()
            .text(ch.to_string())
            .build()
            .map_err(|e| Error::Render(format!("insert params: {e}")))?;
        page.execute(insert)
            .await
            .map_err(|e| Error::Render(format!("insert: {e}")))?;
        if hold_ms > 0 {
            tokio::time::sleep(Duration::from_millis(hold_ms)).await;
        }
        return Ok(());
    }

    let glyph = ch.to_string();

    // Key goes down...
    let key_down = DispatchKeyEventParams::builder()
        .r#type(DispatchKeyEventType::KeyDown)
        .text(glyph.clone())
        .build()
        .map_err(|e| Error::Render(format!("key params: {e}")))?;
    page.execute(key_down)
        .await
        .map_err(|e| Error::Render(format!("key: {e}")))?;

    // ...stays down for the scheduled hold...
    if hold_ms > 0 {
        tokio::time::sleep(Duration::from_millis(hold_ms)).await;
    }

    // ...and comes back up.
    let key_up = DispatchKeyEventParams::builder()
        .r#type(DispatchKeyEventType::KeyUp)
        .text(glyph)
        .build()
        .map_err(|e| Error::Render(format!("key params: {e}")))?;
    page.execute(key_up)
        .await
        .map_err(|e| Error::Render(format!("key: {e}")))?;

    Ok(())
}

/// Scroll vertically by `dy` using CDP `mouseWheel` events dispatched at
/// the cursor position `from` (each coordinate floored at 10.0).
///
/// The tick plan comes from the scroll scheduler for the active profile:
/// bursts with bell-curve velocity interleaved with Pareto-distributed
/// reading dwells (B.1); the Fast profile collapses to the legacy flat
/// 120px tick flow. Ticks whose `|delta_y|` is below 0.5 send no wheel
/// event but their delay is still honoured.
pub async fn scroll_by(page: &Page, dy: f64, from: MousePos) -> Result<()> {
    let anchor_x = from.x.max(10.0);
    let anchor_y = from.y.max(10.0);

    for tick in scroll_schedule(dy) {
        let worth_sending = tick.delta_y.abs() >= 0.5;
        if worth_sending {
            let wheel = DispatchMouseEventParams::builder()
                .r#type(DispatchMouseEventType::MouseWheel)
                .x(anchor_x)
                .y(anchor_y)
                .delta_x(0.0)
                .delta_y(tick.delta_y)
                .build()
                .map_err(|e| Error::Render(format!("wheel params: {e}")))?;
            page.execute(wheel)
                .await
                .map_err(|e| Error::Render(format!("wheel: {e}")))?;
        }

        let wait_ms = tick.delay_ms;
        if wait_ms > 0 {
            tokio::time::sleep(Duration::from_millis(wait_ms)).await;
        }
    }
    Ok(())
}

/// Emit the "page hidden" lifecycle transition by evaluating
/// `HIDE_SNIPPET` in the page (fires `visibilitychange` + window `blur`
/// and flips `document.hidden=true`).
///
/// Used by background drivers to inject realistic focus/tab-switch events
/// — see `motion::lifecycle::schedule` for the timing plan. The snippet's
/// return value is discarded; only evaluation errors are reported.
pub async fn emit_page_hidden(page: &Page) -> Result<()> {
    eval_js(page, HIDE_SNIPPET).await.map(|_| ())
}

/// Emit the "page visible" lifecycle transition by evaluating
/// `SHOW_SNIPPET` in the page — the counterpart of [`emit_page_hidden`].
///
/// The snippet's return value is discarded; only evaluation errors are
/// reported.
pub async fn emit_page_visible(page: &Page) -> Result<()> {
    eval_js(page, SHOW_SNIPPET).await.map(|_| ())
}

/// Spawn a detached tokio task that dispatches ambient idle-drift
/// `mouseMoved` events around `origin` while no action is active.
///
/// Behaviour of the task:
/// * while `state.is_action_active()` is true it sends nothing and just
///   re-checks every 200 ms; drifting resumes once the flag clears;
/// * it exits when a move fails to dispatch (e.g. the page closed);
/// * it exits after 21,600 loop iterations regardless — a safety net
///   against leaking the task. Polling iterations count against this
///   budget too, so the wall-clock bound depends on how long actions run.
///
/// The Fast profile is a no-op — throughput is intentionally preserved
/// there and no task is spawned.
pub fn spawn_idle_drift(page: std::sync::Arc<Page>, origin: MousePos, state: Arc<IdleState>) {
    /// Hard cap on loop iterations (sized as 6h at ~1s cadence).
    const MAX_ITERATIONS: u64 = 21_600;
    /// How often to re-check the flag while an action is running.
    const ACTIVE_POLL: Duration = Duration::from_millis(200);

    let profile = MotionProfile::active();
    if let MotionProfile::Fast = profile {
        return;
    }

    let seed: u64 = rand::make_rng::<SmallRng>().random();
    tokio::spawn(async move {
        let mut drift = crate::render::motion::IdleDrift::for_profile(profile, seed);
        let mut remaining = MAX_ITERATIONS;

        while remaining > 0 {
            remaining -= 1;

            if state.is_action_active() {
                tokio::time::sleep(ACTIVE_POLL).await;
                continue;
            }

            let (dx, dy) = drift.next_offset();
            let target_x = (origin.x + dx).max(1.0);
            let target_y = (origin.y + dy).max(1.0);

            // Best-effort: a failed move means the page is gone, so stop.
            if dispatch_move(&page, target_x, target_y).await.is_err() {
                break;
            }

            let pause_ms = drift.next_delay_ms();
            tokio::time::sleep(Duration::from_millis(pause_ms)).await;
        }
    });
}

/// Build the scroll parameters for the active profile with the peak
/// per-tick distance scaled by the session's current fatigue velocity
/// factor.
///
/// Handy for callers that build their own scroll schedules and want the
/// same fatigue decay applied.
pub fn scroll_params_with_fatigue() -> ScrollParams {
    let profile = MotionProfile::active();
    let mut params = ScrollParams::for_profile(profile);
    params.peak_tick_px *= fatigue::current_velocity_factor();
    params
}

// Never called: this exists only so the `scroll_mod` import stays
// referenced even when callers use nothing but the re-exported schedule
// helper. `dead_code` is allowed because the function is intentionally
// unused.
#[allow(dead_code)]
fn _assert_scroll_mod_linked() -> usize {
    use std::mem::size_of;
    size_of::<scroll_mod::ScrollTick>()
}

/// Poll every 50 ms until `css` resolves to a bounding rect, or fail once
/// `timeout_ms` milliseconds have elapsed.
///
/// The selector is always probed at least once, even with a zero timeout.
///
/// # Errors
///
/// `Error::Render("wait_for_selector timeout: …")` when the deadline
/// passes without a match; probe errors are propagated immediately.
pub async fn wait_for_selector(page: &Page, css: &str, timeout_ms: u64) -> Result<()> {
    use std::time::Instant;

    const POLL_INTERVAL: Duration = Duration::from_millis(50);

    let deadline = Instant::now() + Duration::from_millis(timeout_ms);
    while element_rect(page, css).await?.is_none() {
        if Instant::now() >= deadline {
            return Err(Error::Render(format!("wait_for_selector timeout: {css}")));
        }
        tokio::time::sleep(POLL_INTERVAL).await;
    }
    Ok(())
}

/// Evaluate `script` in the page and return its result as JSON.
///
/// The evaluation is requested with `return_by_value` and `await_promise`
/// enabled. When the evaluation yields no value, `serde_json::Value::Null`
/// is returned instead.
///
/// # Errors
///
/// `Error::Render` when the parameters cannot be built or the evaluation
/// itself fails.
pub async fn eval_js(page: &Page, script: &str) -> Result<serde_json::Value> {
    let request = EvaluateParams::builder()
        .expression(script.to_owned())
        .return_by_value(true)
        .await_promise(true)
        .build()
        .map_err(|e| Error::Render(format!("eval params: {e}")))?;

    let outcome = page
        .evaluate_expression(request)
        .await
        .map_err(|e| Error::Render(format!("eval: {e}")))?;

    let value = match outcome.value() {
        Some(found) => found.clone(),
        None => serde_json::Value::Null,
    };
    Ok(value)
}